In [None]:
from EGraphSAGE import Model, compute_accuracy, train
import pickle
import dgl
import torch as th
import os
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd

In [17]:
features_df = pd.read_csv('raw/NetFlow_v3_Features.csv')
features = list(features_df.Feature)
features

['IPV4_SRC_ADDR',
 'IPV4_DST_ADDR',
 'L4_SRC_PORT',
 'L4_DST_PORT',
 'PROTOCOL',
 'L7_PROTO',
 'IN_BYTES',
 'OUT_BYTES',
 'IN_PKTS',
 'OUT_PKTS',
 'FLOW_DURATION_MILLISECONDS',
 'TCP_FLAGS',
 'CLIENT_TCP_FLAGS',
 'SERVER_TCP_FLAGS',
 'DURATION_IN',
 'DURATION_OUT',
 'MIN_TTL',
 'MAX_TTL',
 'LONGEST_FLOW_PKT',
 'SHORTEST_FLOW_PKT',
 'MIN_IP_PKT_LEN',
 'MAX_IP_PKT_LEN',
 'SRC_TO_DST_SECOND_BYTES',
 'DST_TO_SRC_SECOND_BYTES',
 'RETRANSMITTED_IN_BYTES',
 'RETRANSMITTED_IN_PKTS',
 'RETRANSMITTED_OUT_BYTES',
 'RETRANSMITTED_OUT_PKTS',
 'SRC_TO_DST_AVG_THROUGHPUT',
 'DST_TO_SRC_AVG_THROUGHPUT',
 'NUM_PKTS_UP_TO_128_BYTES',
 'NUM_PKTS_128_TO_256_BYTES',
 'NUM_PKTS_256_TO_512_BYTES',
 'NUM_PKTS_512_TO_1024_BYTES',
 'NUM_PKTS_1024_TO_1514_BYTES',
 'TCP_WIN_MAX_IN',
 'TCP_WIN_MAX_OUT',
 'ICMP_TYPE',
 'ICMP_IPV4_TYPE',
 'DNS_QUERY_ID',
 'DNS_QUERY_TYPE',
 'DNS_TTL_ANSWER',
 'FTP_COMMAND_RET_CODE',
 'FLOW_START_MILLISECONDS',
 'FLOW_END_MILLISECONDS',
 'SRC_TO_DST_IAT_MIN                ',
 'SRC_TO

In [18]:
classes = pd.read_csv('raw/NF-ToN-IoT-v3.csv', dtype='category', usecols=['Attack'])

In [30]:
c = classes['Attack']
np.array(c.unique())

array(['Benign', 'scanning', 'dos', 'injection', 'ddos', 'password',
       'xss', 'ransomware', 'Backdoor', 'mitm'], dtype=object)

In [None]:
from sklearn.utils import class_weight 
from torch import nn

# weighted cross entropy loss
class_weights = class_weight.compute_class_weight(
           class_weight= 'balanced',
            classes=np.array(c.unique()),
            y=c)

class_weights = th.FloatTensor(class_weights)
criterion = nn.CrossEntropyLoss(weight = class_weights)

In [49]:
def get_edge_masks(l, train_split=0.8, valid_test_split=0.5):
    tr = int(l * train_split)
    o = l - tr
    edge_train_mask = np.concatenate((np.ones(tr), np.zeros(o)))
    valid = int(o*valid_test_split)
    edge_valid_mask = np.concatenate((np.zeros(tr), np.ones(valid), np.zeros(o - valid)))
    edge_test_mask = ~np.array(edge_train_mask + edge_valid_mask, dtype=bool)
    return edge_train_mask, edge_valid_mask, edge_test_mask

In [59]:
import torch
from torch.utils.data import Dataset
import pickle
from dgl.dataloading import GraphDataLoader

class GraphDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths

    def __len__(self):
        # return len(self.file_paths)
        return 2 # !

    def __getitem__(self, idx):
        with open(self.file_paths[idx], 'rb') as f:
            data = pickle.load(f)
        return data
    
p =  './interm/NF-IoT flowgraphs/'
dataset = GraphDataset([p + f for f in os.listdir(p)])  
loader = GraphDataLoader(dataset, batch_size=1, shuffle=True)    

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch.nn.functional as F
from tqdm import tqdm

def train_one_epoch(model, loader, criterion):
    optimizer = th.optim.Adam(model.parameters())
    
    model.train()
    
    valid_masks = []
    test_masks = []
    all_preds = []
    all_labels = []

    for G in tqdm(loader):
        # batch = batch.to(device)
        
        train_mask, valid_mask, test_mask = get_edge_masks(len(G.edata['h']))
        
        G.ndata['h'] = th.ones(G.num_nodes(), G.edata['h'].shape[1])
        G.ndata['h'] = th.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
        G.edata['h'] = th.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))
        node_features = G.ndata['h']
        edge_features = G.edata['h']
        edge_label = G.edata['Attack'] # ! label encoded not 1he

        optimizer.zero_grad()
        pred = model(G, node_features, edge_features)
        loss = criterion(pred[train_mask], edge_label[train_mask])
        loss.backward()
        optimizer.step()

        test_masks.append(test_mask)
        valid_masks.append(valid_mask)
        all_preds.append(pred.cpu())
        all_labels.append(edge_label.cpu())


    # Metrics
    test_masks = np.concatenate(test_masks)
    valid_masks = np.concatenate(valid_masks)
    all_preds = th.cat(all_preds)
    all_labels = th.cat(all_labels)
    
    validation_m, test_m = {}, {}
    targets = list(classes['Attack'].unique())
    
    for metrics, mask in zip(
        (validation_m, test_m), (valid_mask, test_mask)
    ):
        for i, c in enumerate(targets):
            print(all_labels)
            assert all_labels.shape[0] < all_labels.shape[1], 'wrong shape'
            y_true = all_labels[i, :][mask]
            y_pred = all_preds[i, :][mask]
            
            acc = accuracy_score(y_true, y_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(
                y_true, y_pred, 
                # labels=range(10), 
                zero_division=0)
            
            metrics[c] = (acc, precision, recall, f1)
        
    return validation_m, test_m


with open('interm/NF-IoT flowgraphs/0.pkl', 'rb') as f:
    G = pickle.load(f)      
    
G.ndata['h'] = th.ones(G.num_nodes(), G.edata['h'].shape[1])
G.ndata['h'] = th.reshape(G.ndata['h'], (G.ndata['h'].shape[0], 1,G.ndata['h'].shape[1]))
G.edata['h'] = th.reshape(G.edata['h'], (G.edata['h'].shape[0], 1,G.edata['h'].shape[1]))
            
for epoch in range(2):
    print(f'epoch {epoch+1}/2')
    valid_metrics, test_metrics = train_one_epoch(
        model=Model(
            ndim_in=G.ndata['h'].shape[2], 
            ndim_out=128, 
            edim=G.ndata['h'].shape[2], 
            activation=F.relu, 
            dropout=0.2
        ),
        loader=loader,
        criterion=criterion
        
    )
    
    print(valid_metrics, test_metrics)
    
    
# TODO
# stop what your doing, expected perfomance on NF-ToN is ~0.99. Why HPO
# need GCP GPU 

  return torch.load(io.BytesIO(b))


epoch 1/2


100%|██████████| 2/2 [00:07<00:00,  3.58s/it]


tensor([0, 0, 1,  ..., 1, 1, 1])


IndexError: tuple index out of range