<a href="https://colab.research.google.com/github/oakeshott/lsm-intern-2022/blob/master/jupyter/gnn-gc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!git clone https://github.com/oakeshott/lsm-intern-2022
!pip install torch==1.9.1 torchvision==0.10.1 torchaudio==0.9.1
!pip install torch-scatter==2.0.7 torch-sparse==0.6.10 torch-cluster==1.5.9 torch-spline-conv==1.2.1 torch-geometric==2.0.1 -f https://data.pyg.org/whl/torch-1.9.1+cpu.html
!pip install networkx pandas numpy sklearn joblib tqdm

Cloning into 'lsm-intern-2022'...
remote: Enumerating objects: 50660, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 50660 (delta 23), reused 36 (delta 15), pack-reused 50616[K
Receiving objects: 100% (50660/50660), 50.89 MiB | 27.35 MiB/s, done.
Resolving deltas: 100% (50574/50574), done.
Checking out files: 100% (22641/22641), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.9.1
  Downloading torch-1.9.1-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 6.1 kB/s 
[?25hCollecting torchvision==0.10.1
  Downloading torchvision-0.10.1-cp37-cp37m-manylinux1_x86_64.whl (22.1 MB)
[K     |████████████████████████████████| 22.1 MB 49.0 MB/s 
[?25hCollecting torchaudio==0.9.1
  Downloading torchaudio-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import random_split, Subset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import random
import numpy as np
import os
import joblib
from torch_geometric.data import Dataset, InMemoryDataset
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx, train_test_split_edges
from torch_geometric.nn import global_add_pool, GCNConv
import networkx as nx

In [3]:
class NetworkMetricsWithTopologyDataset(Dataset):
    """PyG dataset that turns pickled networkx graphs into one ``Data`` file per graph.

    Every file in ``raw_dir`` is read as a networkx graph whose nodes carry a
    ``'label'`` attribute. The label is stripped from the nodes and stored as the
    graph-level target ``data.y``; the converted sample is saved as
    ``data_{idx}.pt`` in ``processed_dir`` and loaded lazily by :meth:`get`.

    Args:
        root: Dataset root directory handed to ``Dataset``.
        transform: Optional transform forwarded to ``Dataset``.
        pre_transform: Optional transform applied to each sample before saving.
        pre_filter: Optional predicate; samples for which it is falsy are skipped.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """Sorted names of all entries found in ``raw_dir``."""
        return sorted(os.listdir(self.raw_dir))

    @property
    def processed_file_names(self):
        """Sorted names of processed sample files (entries containing ``'data'``).

        Returns an empty list when ``processed_dir`` does not exist yet, instead
        of letting ``os.listdir`` raise before anything has been processed.
        """
        if not os.path.isdir(self.processed_dir):
            return []
        return [name for name in sorted(os.listdir(self.processed_dir)) if 'data' in name]

    def process(self):
        """Convert every raw graph to a ``Data`` object and save it to disk.

        Raises:
            ValueError: If a raw graph has no nodes and therefore no label.
            KeyError: If a node lacks the ``'label'`` attribute.
        """
        idx = 0
        for raw_path in self.raw_paths:
            # NOTE(security): read_gpickle unpickles the file; only use trusted data.
            g = nx.read_gpickle(raw_path)

            # Strip the label from every node; the value of the last node visited
            # becomes the graph label (all nodes are presumably labelled alike -
            # TODO confirm against the dataset generator).
            label = None
            for n in g.nodes():
                label = g.nodes[n].pop('label')
            if label is None:
                # Without this guard an empty graph would reuse the previous
                # graph's label (or hit a NameError on the first file).
                raise ValueError(f"graph {raw_path!r} has no labelled nodes")

            data = from_networkx(g)
            data.y = torch.tensor(label)
            data.num_nodes = g.number_of_nodes()
            # edge_attr is replaced by an empty list, as in the original code.
            # NOTE(review): presumably meant to discard edge attributes - confirm.
            data.edge_attr = []

            if self.pre_filter is not None and not self.pre_filter(data):
                continue

            if self.pre_transform is not None:
                data = self.pre_transform(data)

            # idx only advances for samples that are kept, so file names stay contiguous.
            torch.save(data, os.path.join(self.processed_dir, f'data_{idx}.pt'))
            idx += 1

    def len(self):
        """Number of processed samples currently on disk."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the processed sample stored as ``data_{idx}.pt``."""
        return torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))

In [4]:
class GCNClassifier(nn.Module):
    """Graph classifier: two GCN layers, sum pooling, then a two-layer MLP head.

    Args:
        input_dim: Number of input features per node.
        output_dim: Number of output classes (size of the returned logits).
        hidden_dim: Width of the GCN layers and of the MLP hidden layer.
        dropout: Dropout probability used after each GCN layer and inside the head.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=128, dropout=0.5):
        super(GCNClassifier, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        # Attribute names and construction order (head first, then the GCN
        # layers) are kept as-is so existing checkpoints and seeded
        # initialisation are not affected.
        self.linear = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, self.output_dim),
        )

        self.gcn1 = GCNConv(self.input_dim, hidden_dim)
        self.gcn2 = GCNConv(hidden_dim, hidden_dim)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, edge_index, batch, edge_attr=None):
        """Return per-graph class logits.

        Args:
            x: Node feature matrix.
            edge_index: Edge index tensor of the (batched) graph.
            batch: Graph assignment of each node, used by ``global_add_pool``.
            edge_attr: Optional value forwarded as the third positional argument
                of both ``GCNConv`` layers; ``None`` leaves edges unweighted.

        Returns:
            Tensor with one row of ``output_dim`` scores per graph in the batch.
        """
        x = self.dropout(self.relu(self.gcn1(x, edge_index, edge_attr)))
        x = self.dropout(self.relu(self.gcn2(x, edge_index, edge_attr)))
        # Sum node embeddings per graph before classifying.
        return self.linear(global_add_pool(x, batch))

In [5]:
# --- Run configuration ---
seed, batchsize, max_epoch = 1, 16, 100
device = 'cpu'

# Training data location and checkpoint output directory.
path = "/content/lsm-intern-2022/dataset/train/network"
model_dir = "models/gcn"

# Metric names (not referenced by the other cells visible in this notebook).
metrics = [
    "cpu-util",
    "tx-pps",
    "rx-pps",
    "network-incoming-packets-rate",
    "network-outgoing-packets-rate",
    "prefix-activity-received-current-prefixes",
]

# Event name -> class index, numbered in declaration order starting at 0.
events = {
    event_name: class_id
    for class_id, event_name in enumerate((
        'normal',
        'ixnetwork-bgp-hijacking-start',
        'ixnetwork-bgp-injection-start',
        'node-down',
        'interface-down',
        'packet-loss-delay',
    ))
}

# Seed every RNG in use before any data handling or model construction.
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)

os.makedirs(model_dir, exist_ok=True)

dataset = NetworkMetricsWithTopologyDataset(path)

In [6]:
# Per-sample targets, used to keep class proportions equal across both splits.
labels = [dataset[idx].y for idx in range(len(dataset))]

# 80/20 stratified split over sample indices, reproducible through `seed`.
train_indices, val_indices = train_test_split(
    [*range(len(dataset))], test_size=0.2, random_state=seed, stratify=labels
)

In [7]:
# Materialise both subsets from the index lists, then record their sizes.
train_dataset, val_dataset = dataset[train_indices], dataset[val_indices]
train_size, val_size = len(train_dataset), len(val_dataset)

In [8]:
# Mini-batches for training; the validation loader yields the whole split in one batch.
# NOTE(review): the training loader does not shuffle, so every epoch sees the
# same batch order - confirm this is intended.
train_dataloader = DataLoader(train_dataset, batch_size=batchsize)
val_dataloader = DataLoader(val_dataset, batch_size=val_size)

# Model dimensions come from the data (node feature width) and the event table.
input_dim = train_dataset[0].x.shape[-1]
output_dim = len(events)
model = GCNClassifier(input_dim, output_dim).to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Pull the single validation batch once and keep its tensors on `device`
# so each epoch can be validated without re-collating.
# `next(iter(...))` replaces the Python-2 style `.next()` method call.
val_graphs = next(iter(val_dataloader))
val_batch = val_graphs.batch.to(device)
val_edge_index = val_graphs.edge_index.to(device)
val_edge_attr = None
val_labels = val_graphs.y.long().to(device).view(-1)
# `val_data` holds the node feature matrix that the training loop feeds to the model.
val_data = val_graphs.x.float().to(device)

In [None]:
# Train for `max_epoch` epochs; after each epoch evaluate on the cached
# validation batch and checkpoint the weights every 10 epochs.
for epoch in range(1, max_epoch + 1):
    running_loss = 0
    correct = 0
    total = 0

    # Training
    model.train()
    for train_data in train_dataloader:
        x = train_data.x.float().to(device)
        edge_index = train_data.edge_index.to(device)
        batch = train_data.batch.to(device)
        edge_attr = None
        train_labels = train_data.y.long().to(device).view(-1)

        optimizer.zero_grad()
        train_scores = model(x, edge_index, batch, edge_attr)
        loss = loss_function(train_scores, train_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the highest score; detached from the graph.
        predict = train_scores.detach().argmax(dim=1)
        correct += (predict == train_labels).sum().item()
        total += train_labels.size(0)

    # Loss is averaged over batches, accuracy over individual samples.
    train_loss = running_loss / len(train_dataloader)
    train_acc = correct / total

    # Check model validation
    model.eval()
    with torch.no_grad():
        val_scores = model(val_data, val_edge_index, val_batch, val_edge_attr)
        val_loss = loss_function(val_scores, val_labels).item()

        # Move to host memory before converting to numpy so this also works
        # when `device` is not the CPU.
        bi_scores = torch.argmax(val_scores, dim=1).cpu().numpy()
        y_val_scores = val_labels.cpu().numpy()
        val_acc = accuracy_score(y_val_scores, bi_scores)

    # All four values use the same 4-decimal format (val acc previously used `:4f`).
    print(f'EPOCH: [{epoch}/{max_epoch}] train loss: {train_loss:.4f} train acc: {train_acc:.4f} val loss: {val_loss:.4f} val acc: {val_acc:.4f}')

    # Export model
    if epoch % 10 == 0:
        torch.save(model.state_dict(), os.path.join(model_dir, f"gcn_{epoch}.mdl"))

EPOCH: [1/100] train loss: 0.8444 train acc: 0.7401 val loss: 0.6900 val acc: 0.782245
EPOCH: [2/100] train loss: 0.6983 train acc: 0.7964 val loss: 0.6073 val acc: 0.824121
EPOCH: [3/100] train loss: 0.6508 train acc: 0.8184 val loss: 0.5724 val acc: 0.829983
EPOCH: [4/100] train loss: 0.6243 train acc: 0.8220 val loss: 0.5319 val acc: 0.835846
EPOCH: [5/100] train loss: 0.5766 train acc: 0.8318 val loss: 0.4940 val acc: 0.848409
EPOCH: [6/100] train loss: 0.5445 train acc: 0.8477 val loss: 0.4564 val acc: 0.862647
EPOCH: [7/100] train loss: 0.5126 train acc: 0.8501 val loss: 0.4386 val acc: 0.867672
EPOCH: [8/100] train loss: 0.4947 train acc: 0.8580 val loss: 0.4289 val acc: 0.865159
EPOCH: [9/100] train loss: 0.4882 train acc: 0.8595 val loss: 0.4268 val acc: 0.871022
EPOCH: [10/100] train loss: 0.4812 train acc: 0.8595 val loss: 0.4545 val acc: 0.857621
EPOCH: [11/100] train loss: 0.4723 train acc: 0.8616 val loss: 0.4216 val acc: 0.871022
EPOCH: [12/100] train loss: 0.4683 train 

In [None]:
# Evaluate the checkpoint written at epoch 100 on the held-out test graphs.
# NOTE(review): `path`, `dataset`, `model` and `loss` are rebound here, so the
# training cells must be re-run from the config cell before training again.
model_path = os.path.join(model_dir, "gcn_100.mdl")
path = '/content/lsm-intern-2022/dataset/test/network'
dataset = NetworkMetricsWithTopologyDataset(path)

input_dim = dataset[0].x.shape[-1]
output_dim = len(events)

# One batch containing the complete test set.
test_dataloader = DataLoader(dataset, batch_size=len(dataset))
test_data = next(iter(test_dataloader))
x = test_data.x.float().to(device)
edge_index = test_data.edge_index.to(device)
batch = test_data.batch.to(device)
edge_attr = None
test_label = test_data.y.long().to(device).view(-1)

model = GCNClassifier(input_dim, output_dim).to(device)
# map_location places the stored tensors on the configured device regardless
# of where the checkpoint was saved.
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.eval()
loss_function = nn.CrossEntropyLoss()
with torch.no_grad():
    test_scores = model(x, edge_index, batch, edge_attr)
    loss = loss_function(test_scores, test_label)
    bi_scores = torch.argmax(test_scores, dim=1).to('cpu').numpy()
    y_test_scores = test_label.to('cpu').numpy()
print(accuracy_score(y_test_scores, bi_scores))
# Passing `labels` keeps each name aligned with its class index even when a
# class does not occur in the test labels or predictions.
print(classification_report(
    y_test_scores,
    bi_scores,
    labels=list(events.values()),
    target_names=list(events.keys()),
))