In [1]:
import pandas as pd
import numpy as np
import dgl
from sklearn.preprocessing import LabelEncoder, StandardScaler
import h5py
from EGraphSAGE import Preprocessing
import networkx as nx 
from torch_geometric.utils.convert import from_dgl


  from .autonotebook import tqdm as notebook_tqdm


### Setup

In [2]:
def to_graph(data: pd.DataFrame):
    """Build a DGL flow graph from a frame of preprocessed flows.

    Every row becomes one edge from ``IPV4_SRC_ADDR`` to ``IPV4_DST_ADDR`` in a
    networkx ``MultiGraph``; the graph is then passed through ``to_directed()``
    and converted with ``dgl.from_networkx``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain the columns ``IPV4_SRC_ADDR``, ``IPV4_DST_ADDR``, ``h``
        (per-flow feature sequence, converted to a float32 array) and
        ``Attack`` (cast to int64). The frame is left unmodified.

    Returns
    -------
    The DGL graph carrying the edge attributes ``x`` (from ``h``) and ``Attack``.
    """
    # Derive a new frame instead of assigning into `data`: the caller's frame
    # stays intact, so re-running the cell on the same frame is idempotent and
    # no SettingWithCopyWarning is triggered when `data` is a slice.
    edges = data.assign(
        h=data['h'].apply(lambda feats: np.array(feats, dtype=np.float32)),
        Attack=data['Attack'].astype(np.int64),
    ).rename(columns={'h': 'x'})

    G = nx.from_pandas_edgelist(
        edges,
        source='IPV4_SRC_ADDR',
        target='IPV4_DST_ADDR',
        edge_attr=['x', 'Attack'],
        create_using=nx.MultiGraph(),  # multigraph: parallel flows between a host pair are kept
    )
    G = G.to_directed()
    return dgl.from_networkx(G, edge_attrs=['x', 'Attack'])

In [11]:
from tqdm import tqdm

# generate chunked dataset (normal flowgraphs first): one HDF5 group "G_<i>"
# per 10,000-row chunk of the raw CSV. Opening with 'w' overwrites any
# existing file at this path.
with h5py.File('interm/NF-IoT_linegraphs.h5', 'w') as f:
    for i, chunk in tqdm(
        enumerate(pd.read_csv('raw/NF-ToN-IoT-v3.csv', chunksize=10_000))
    ):
        chunk = Preprocessing._prepare_flows(chunk)
        G = to_graph(chunk)
        # Grab the labels before conversion; they are stored as their own dataset.
        labels = G.edata['Attack']
        G = from_dgl(G)

        grp = f.create_group(f"G_{i}")
        grp.create_dataset("G.x", data=G.x, dtype=np.float32)
        # Pin the integer width explicitly: plain `int` is platform-dependent,
        # while the loader reads these datasets back as 64-bit integers.
        grp.create_dataset("G.edge_index", data=G.edge_index, dtype=np.int64)
        grp.create_dataset("labels", data=labels, dtype=np.int64)

2753it [36:58,  1.24it/s]


In [3]:
from sklearn.utils import class_weight 
from torch import nn
import torch as th

# Load only the label column, as a categorical, to keep the memory footprint small.
attack_col = pd.read_csv(
    'raw/NF-ToN-IoT-v3.csv', usecols=['Attack'], dtype='category'
)['Attack']
unique_classes = np.array(attack_col.unique())

# Weighted cross-entropy loss: one 'balanced' weight per entry of
# `unique_classes`, in that array's order (order of first appearance in the CSV).
# NOTE(review): this presumes the integer labels used at training time index
# the classes in the same order -- confirm against Preprocessing._prepare_flows.
class_weights = th.FloatTensor(
    class_weight.compute_class_weight(
        class_weight='balanced',
        classes=unique_classes,
        y=attack_col,
    )
)
criterion = nn.CrossEntropyLoss(weight=class_weights)

del attack_col  # memory risk
unique_classes

array(['Benign', 'scanning', 'dos', 'injection', 'ddos', 'password',
       'xss', 'ransomware', 'Backdoor', 'mitm'], dtype=object)

In [None]:
f = 'raw/NF-ToN-IoT-v3.csv'  # NOTE(review): not read by any visible cell -- confirm before removing


def load_dgl_line_graph(group_name, ds='interm/NF-IoT_linegraphs.h5'):
    """Load one stored graph from HDF5 and return its DGL line graph.

    Parameters
    ----------
    group_name : str
        Name of the HDF5 group to read (e.g. ``'G_0'``); it must hold the
        datasets ``G.x``, ``G.edge_index`` and ``labels``.
    ds : str
        Path of the HDF5 file.

    Returns
    -------
    The result of applying ``dgl.LineGraph(backtracking=False)`` to the graph
    rebuilt from the stored edge list, with ``x`` and ``Attack`` set as edge data.
    """
    with h5py.File(ds, 'r') as h5_file:
        stored = h5_file[group_name]

        # `[:]` reads each dataset fully into memory, so the tensors remain
        # valid after the file is closed.
        feats = th.tensor(stored["G.x"][:], dtype=th.float32)
        endpoints = th.tensor(stored["G.edge_index"][:], dtype=th.int64)
        attack = th.tensor(stored["labels"][:], dtype=th.int64)

    src = endpoints[0]
    dst = endpoints[1]
    # Node ids are taken to be 0..max id, so the node count is max id + 1.
    num_nodes = max(src.max(), dst.max()).item() + 1

    flow_graph = dgl.graph((src, dst), num_nodes=num_nodes)
    flow_graph.edata['x'] = feats
    flow_graph.edata['Attack'] = attack

    to_line_graph = dgl.LineGraph(backtracking=False)
    return to_line_graph(flow_graph)


g0 = load_dgl_line_graph('G_0')
g0.number_of_nodes(), g0.number_of_edges()

(20000, 1167694)

### Training

In [5]:
from torch.utils.data import Dataset
from dgl.dataloading import GraphDataLoader
import os

class GraphDataset(Dataset):
    """Map-style dataset over the graphs stored as groups ``G_<idx>`` in an HDF5 file.

    Parameters
    ----------
    h5_ds : str
        Path of the HDF5 file. It is only stored here; the file is opened on
        each ``__len__`` / ``__getitem__`` call.
    """

    def __init__(self, h5_ds):
        self.h5_ds = h5_ds

    def __len__(self):
        """Return the number of top-level entries in the HDF5 file."""
        with h5py.File(self.h5_ds, 'r') as ds:
            return len(ds)

    def __getitem__(self, idx):
        """Load the graph stored under ``G_<idx>`` from this dataset's file."""
        # Forward the configured path: without `ds=` the loader falls back to
        # its own default file and silently ignores `self.h5_ds`.
        return load_dgl_line_graph(f'G_{idx}', ds=self.h5_ds)
 

In [None]:
import torch as th
from tqdm import tqdm
from EGraphSAGE import Model
import torch.nn.functional as F
from torch_geometric.nn import GraphSAGE

         
def train_one_epoch(model, loader, criterion, train=0.8, optimizer=None):
    """Run one optimisation pass over ``loader`` and collect per-batch losses.

    For every (batched) graph the first ``int(size * train)`` nodes form the
    training subset and the remaining nodes the held-out subset.

    Parameters
    ----------
    model : callable ``model(x, edge_index)`` returning per-node class scores.
    loader : iterable of DGL graphs whose ``ndata`` holds ``'Attack'`` labels.
    criterion : loss taking ``(scores, labels)``.
    train : float
        Fraction of nodes used for training; expected to lie in (0, 1).
    optimizer : torch optimizer, optional
        Pass one in to keep optimizer state across epochs. When omitted a new
        ``Adam`` over ``model.parameters()`` is created for this call.

    Returns
    -------
    (losses, test_losses) : two lists of detached scalar tensors, one entry
    per batch.
    """
    if optimizer is None:
        optimizer = th.optim.Adam(model.parameters())
    model.train()
    losses, test_losses = [], []

    for G in tqdm(loader):
        labels = G.ndata['Attack']

        # Boolean masks: indexing with a 0/1 float array does not select rows.
        size = G.number_of_nodes()
        train_mask = th.zeros(size, dtype=th.bool, device=labels.device)
        train_mask[:int(size * train)] = True
        test_mask = ~train_mask

        G = from_dgl(G)

        optimizer.zero_grad()
        pred = model(G.x, G.edge_index)
        loss = criterion(pred[train_mask, :], labels[train_mask])
        # Without these two calls the parameters are never updated.
        loss.backward()
        optimizer.step()

        # Held-out loss reuses the same forward pass (so dropout is still
        # active); no gradient tracking is needed for it.
        with th.no_grad():
            test_loss = criterion(pred[test_mask, :], labels[test_mask])

        # Detach so the stored values do not keep each batch's autograd graph alive.
        losses.append(loss.detach())
        test_losses.append(test_loss)
        del G

    return losses, test_losses

N_EPOCHS = 2

metrics = []
dataset = GraphDataset('interm/NF-IoT_linegraphs.h5')
loader = GraphDataLoader(dataset, batch_size=100, shuffle=False)

# Build the model once, outside the epoch loop: constructing it inside the
# loop would start every epoch from freshly initialised weights.
model = GraphSAGE(
    in_channels=g0.ndata['x'].shape[1],
    hidden_channels=64,  # 128 in original EGraphSAGE paper
    num_layers=2,
    out_channels=len(unique_classes),  # assumes label ids index unique_classes in order -- TODO confirm
    dropout=0.2,
)

for epoch in range(N_EPOCHS):
    print(f'epoch {epoch+1}/{N_EPOCHS}')
    m = train_one_epoch(
        model=model,
        loader=loader,
        criterion=criterion,  # was `criterion6`, which no visible cell defines
    )
    print(m)
    metrics.append(m)

# TODO: what should be the chunked sizes??
    

epoch 1/2


  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/28 [00:49<?, ?it/s]


KeyboardInterrupt: 

In [49]:
# NOTE(review): `G0` is not defined in any visible cell (only lowercase `g0` is),
# so this presumably relies on leftover kernel state and would raise NameError
# after Restart & Run All -- confirm which graph was meant.
G0.number_of_edges()

1190494