In [16]:
# --- Standard library ---
import itertools
import json
import os
import pickle
import time
import timeit

# DGL chooses its tensor backend from this environment variable when it is
# imported, so the variable must be set before `dgl` (or any project module
# that may import it) is loaded below.
os.environ["DGLBACKEND"] = "pytorch"

# --- Third-party ---
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from dgl import from_networkx
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight

# --- Project-local ---
from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures
from src.models import EGRAPHSAGE, EGAT, EGCN

# Seed the random number generators so weight initialisation and dropout are
# repeatable between "Restart & Run All" executions.
SEED = 42
np.random.seed(SEED)
th.manual_seed(SEED)

# Training hyper-parameters.
num_epochs = 100
batch_size = 128
learning_rate = 0.001
LAMBD_1 = 0.0001
LAMBD_2 = 0.001

# Switches selecting which model variants are trained and evaluated.
# The "_res" variants are built with residual=True.
e_gcn = False
e_gcn_res = False
e_graph_sage = True
e_graph_sage_res = True
e_gat = False
e_gat_res = False

In [17]:
# Key of the dataset this run works on. To switch datasets, comment this line
# out and uncomment exactly one of the alternatives below.
name = "cic_ids_2017_5_percent"
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017"
# name = "nf_bot_iot"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"

# Descriptor of the selected dataset; later cells read its `label_col`.
dataset = datasets[name]

In [18]:
# Everything this run reports is collected here and written to JSON at the
# end of the notebook. Only the epoch count and batch size are recorded as
# configuration.
results_final = {
    "name": name,
    "configuration": {
        "num_epochs": num_epochs,
        "batch_size": batch_size,
    },
    "accuracy": {},
    "time_elapsed": {},
}

# Add one empty result slot per enabled model variant, in a fixed order.
results_final.update({
    model_key: {}
    for model_key, enabled in (
        ("e_gcn", e_gcn),
        ("e_gcn_res", e_gcn_res),
        ("e_graph_sage", e_graph_sage),
        ("e_graph_sage_res", e_graph_sage_res),
        ("e_gat", e_gat),
        ("e_gat_res", e_gat_res),
    )
    if enabled
})

results_final

{'name': 'cic_ids_2017_5_percent',
 'configuration': {'num_epochs': 100, 'batch_size': 128},
 'accuracy': {},
 'time_elapsed': {},
 'e_graph_sage': {},
 'e_graph_sage_res': {}}

In [19]:
# Timestamp of this run, formatted YYYYmmdd-HHMMSS; used below as the name of
# the run's results folder.
dtime = time.strftime("%Y%m%d-%H%M%S")
dtime

'20241110-231618'

In [20]:
# Output layout:
#   results/<dataset name>/session_graphs/<run timestamp>/confusion_matrices
results_folder_path = "results"
results_folder_path1 = os.path.join(results_folder_path, name)
results_folder_path2 = os.path.join(results_folder_path1, "session_graphs")
folder_path = os.path.join(results_folder_path2, dtime)
confusion_matrices_path = os.path.join(folder_path, "confusion_matrices")
# Creating the deepest folder also creates every parent; exist_ok makes the
# cell safe to re-run.
os.makedirs(confusion_matrices_path, exist_ok=True)

In [21]:
# Folder from which the pickled session graphs of the selected dataset are read.
dataset_folder_path = os.path.join("datasets", name, "session_graphs", "graphs")

In [22]:
# Read every pickled session graph, convert it to a DGL graph and split the
# collection into training and testing graphs.
#
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only point `dataset_folder_path` at graph files you produced yourself
# or otherwise trust.
graphs = []
# os.listdir returns entries in arbitrary order; sorting gives the list a
# stable order so the seeded train/test split below is reproducible.
for file in sorted(os.listdir(dataset_folder_path)):
    with open(os.path.join(dataset_folder_path, file), "rb") as f:
        G = pickle.load(f)

    G = from_networkx(
        G, edge_attrs=['h', dataset.label_col, "session_id", "index"])

    # Width of the per-edge feature vectors (edge features are 2-D here).
    num_features = G.edata['h'].shape[1]

    # Nodes carry no information of their own: give each an all-ones vector
    # with the same width as the edge features, already in the
    # (count, 1, features) layout.
    G.ndata['h'] = th.ones(G.num_nodes(), 1, num_features)
    G.edata['h'] = th.reshape(
        G.edata['h'], (G.edata['h'].shape[0], 1, num_features))

    # Every edge takes part in training.
    G.edata['train_mask'] = th.ones(len(G.edata['h']), dtype=th.bool)
    graphs.append(G)

training_graphs, testing_graphs = train_test_split(
    graphs, test_size=0.2, random_state=42)
len(training_graphs)

94816

In [23]:
def compute_accuracy(pred, labels):
    """Return the fraction of predictions that match ``labels``.

    Args:
        pred: Either per-class scores with one more dimension than ``labels``
            (e.g. ``(N, C)`` logits), in which case the predicted class is the
            arg-max over the last dimension, or values with the same shape as
            ``labels`` (e.g. probabilities), which are rounded to the nearest
            integer before comparison.
        labels: Tensor of true class indices.

    Returns:
        Accuracy as a Python float (``nan`` when the inputs are empty).
    """
    if pred.dim() == labels.dim() + 1:
        # Class scores: rounding them and comparing against the labels would
        # broadcast (N, C) against (N,) and give a meaningless result.
        predicted = pred.argmax(dim=-1)
    else:
        predicted = pred.round()
    return (predicted == labels).float().mean().item()

In [24]:
# Edge features are stored as (num_edges, 1, num_features) after loading, so
# the feature count is the size of the last axis (index 1 is always 1).
features_number = training_graphs[0].edata['h'].shape[-1]

In [25]:
def train_gnn(model_class, graphs, residual, **model_kwargs):
    """Train one edge-classification model on a collection of graphs.

    A single model and a single Adam optimizer are created up front and
    updated once per graph, for ``num_epochs`` passes over ``graphs``.

    Args:
        model_class: Model constructor. It is called as
            ``model_class(in_dim, in_dim, 128, F.relu, dropout=0.2,
            residual=residual, **model_kwargs)`` where ``in_dim`` is the size
            of the last axis of the edge features.
        graphs: Non-empty sequence of graphs providing ``ndata['h']``,
            ``edata['h']``, ``edata['train_mask']`` and
            ``edata[dataset.label_col]``.
        residual: Forwarded to the model constructor.
        **model_kwargs: Extra keyword arguments for the model constructor,
            for models whose constructor needs more than the arguments above
            (e.g. ``num_neighbors=...``).

    Returns:
        The trained model.

    Raises:
        ValueError: If ``graphs`` is empty.
    """
    if not graphs:
        raise ValueError("train_gnn needs at least one training graph")

    label_col = dataset.label_col
    # Works for both (E, F) and (E, 1, F) edge-feature layouts.
    feature_dim = graphs[0].edata['h'].shape[-1]

    # Balanced class weights are computed once over the whole training set.
    # Computing them per graph yields a weight vector whose length and meaning
    # change with whichever classes happen to occur in that graph.
    # NOTE(review): assumes the labels are class indices 0..C-1 and that every
    # class the model outputs occurs in the training set; confirm.
    all_labels = th.cat([G.edata[label_col] for G in graphs]).cpu().numpy()
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(all_labels), y=all_labels)
    criterion = nn.CrossEntropyLoss(weight=th.FloatTensor(class_weights))

    # Model and optimizer must outlive the loops, otherwise every graph would
    # start from freshly initialised weights and nothing would be learned.
    model = model_class(feature_dim, feature_dim, 128, F.relu,
                        dropout=0.2, residual=residual, **model_kwargs)
    opt = th.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(1, num_epochs + 1):
        model.train()
        correct = 0
        total = 0
        for G in graphs:
            train_mask = G.edata['train_mask']
            edge_label = G.edata[label_col][train_mask]

            pred = model(G, G.ndata['h'], G.edata['h']).squeeze(1)[train_mask]
            loss = criterion(pred, edge_label)

            opt.zero_grad()
            loss.backward()
            opt.step()

            # Only running counts are kept; storing the predictions themselves
            # would keep every autograd graph of the epoch alive.
            correct += (pred.detach().argmax(dim=1) == edge_label).sum().item()
            total += edge_label.numel()

        print('Epoch:', epoch, ' Training acc:',
              correct / total if total else float('nan'))
    return model

### Training EGCN

In [26]:
if e_gcn:
    # Train EGCN with residual=False on the training graphs.
    model_gcn = train_gnn(EGCN, training_graphs, False)

### Training EGCN residual

In [27]:
if e_gcn_res:
    # Train EGCN with residual=True on the training graphs.
    model_gcn_res = train_gnn(EGCN, training_graphs, True)

### Training EGraphSage 

In [28]:
if e_graph_sage:
    # Train EGRAPHSAGE with residual=False on the training graphs.
    # NOTE(review): the recorded output of this cell is a TypeError -
    # EGRAPHSAGE's constructor requires a `num_neighbors` argument that
    # train_gnn does not pass to it.
    model_sage = train_gnn(EGRAPHSAGE, training_graphs, False)

TypeError: EGRAPHSAGE.__init__() missing 1 required positional argument: 'num_neighbors'

### Training EGraphSage residual

In [None]:
if e_graph_sage_res:
    # Train EGRAPHSAGE with residual=True on the training graphs.
    # NOTE(review): presumably fails the same way as the non-residual
    # EGRAPHSAGE cell, whose recorded output is a TypeError about a missing
    # `num_neighbors` constructor argument - confirm.
    model_sage_res = train_gnn(EGRAPHSAGE, training_graphs, True)

### Training EGAT

In [None]:
if e_gat:
    # Train EGAT with residual=False on the training graphs.
    model_gat = train_gnn(EGAT, training_graphs, False)

### Training EGAT residual

In [None]:
if e_gat_res:
    # Train EGAT with residual=True on the training graphs.
    model_gat_res = train_gnn(EGAT, training_graphs, True)

# Testing

In [None]:
def test_gnn(model, graphs):
    """Run ``model`` on every graph and collect true and predicted edge labels.

    The graphs are not modified, so the same list can be evaluated with
    several models in turn.

    Args:
        model: Trained model, called as ``model(graph, node_feats, edge_feats)``.
        graphs: Graphs providing ``edata['h']`` and ``edata[dataset.label_col]``.

    Returns:
        Tuple ``(labels, predictions, elapsed)``: a 1-D tensor of true labels,
        a 1-D NumPy array with one prediction per edge, and the evaluation
        time in seconds.
    """
    predictions = []
    labels = []

    # Disable dropout for evaluation.
    model.eval()
    start_time = timeit.default_timer()
    with th.no_grad():
        for G in graphs:
            # Read the labels without removing them: popping them would make
            # any later evaluation of the same graphs fail with a KeyError.
            actual_label = G.edata[dataset.label_col]

            # Edge features are expected as (E, 1, F); only add the middle
            # axis when it is missing. Reshaping an already 3-D tensor again
            # would corrupt its shape.
            edge_features_test = G.edata['h']
            if edge_features_test.dim() == 2:
                edge_features_test = edge_features_test.unsqueeze(1)

            # All-ones node features with the same width as the edge features.
            node_features_test = th.ones(
                G.num_nodes(), 1, edge_features_test.shape[-1])

            test_pred = model(
                G, node_features_test, edge_features_test).squeeze(1)
            if test_pred.dim() > 1:
                # Per-class scores -> predicted class index.
                test_pred = test_pred.argmax(dim=-1)
            else:
                test_pred = test_pred.round()

            predictions.append(test_pred)
            labels.append(actual_label)
    elapsed = timeit.default_timer() - start_time
    print(str(elapsed) + ' seconds')

    predictions_tensor = th.cat(predictions, dim=0).cpu().numpy()
    labels_tensor = th.cat(labels, dim=0)

    return (labels_tensor, predictions_tensor, elapsed)

In [None]:

def calculate_FPR_FNR(cm):
    """Return the false-positive and false-negative rate of a 2x2 confusion matrix.

    Args:
        cm: 2x2 matrix (nested sequence or NumPy array) read as
            ``cm[0][0]`` = TN, ``cm[0][1]`` = FP, ``cm[1][0]`` = FN and
            ``cm[1][1]`` = TP, i.e. index 0 is the negative class.

    Returns:
        Tuple ``(FPR, FNR)`` of Python floats, where ``FPR = FP / (FP + TN)``
        and ``FNR = FN / (TP + FN)``. A rate whose denominator is zero is
        returned as ``nan`` instead of raising.
    """
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]

    def _rate(numerator, denominator):
        # Undefined rates (no samples of the relevant true class) become nan.
        if not denominator:
            return float("nan")
        return float(numerator) / float(denominator)

    # Fall out or false positive rate
    FPR = _rate(FP, FP + TN)
    # False negative rate
    FNR = _rate(FN, TP + FN)

    return FPR, FNR

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          file_path=None):
    """Draw a confusion matrix as an annotated heat map and optionally save it.

    Args:
        cm: Confusion matrix as a 2-D NumPy array; rows are labelled
            "True label", columns "Predicted label".
        target_names: Tick labels for both axes, or None to keep default ticks.
        title: Title of the figure.
        cmap: Matplotlib colormap; the 'Blues' map is used when None.
        normalize: When true, the numbers written into the cells are
            row-normalised fractions instead of the raw values.
        file_path: When truthy, the figure is also saved to this path.
    """
    # Share of samples on the diagonal, shown in the x-axis caption.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    colormap = plt.get_cmap('Blues') if cmap is None else cmap

    plt.figure(figsize=(12, 12))
    # The colours are drawn from the matrix as passed in, before any
    # normalisation of the cell annotations.
    plt.imshow(cm, interpolation='nearest', cmap=colormap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        positions = np.arange(len(target_names))
        plt.xticks(positions, target_names, rotation=45)
        plt.yticks(positions, target_names)

    # Pick the annotation format and the light/dark text threshold once.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cell_format = "{:0.4f}"
        threshold = cm.max() / 1.5
    else:
        cell_format = "{:,}"
        threshold = cm.max() / 2

    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            # White text on dark cells, black text on light ones.
            plt.text(col, row, cell_format.format(value),
                     horizontalalignment="center",
                     color="white" if value > threshold else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(
        accuracy, misclass))
    if file_path:
        plt.savefig(file_path)
    plt.show()

### Testing EGCN

In [None]:
if e_gcn:
    # Evaluate the EGCN model on the testing graphs; test_gnn returns the
    # true labels, the predictions and the elapsed time.
    actual, test_pred, elapsed = test_gnn(model_gcn, testing_graphs)

    # Value 0 is reported as "Normal", every other value as "Attack".
    actual = ["Normal" if i == 0 else "Attack" for i in actual]
    test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

In [None]:
if e_gcn:
    # The explicit label order puts "Normal" at index 0 and "Attack" at
    # index 1 of the matrix, the positions calculate_FPR_FNR reads from.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels=labels)

In [None]:
if e_gcn:
    # Plot raw counts (normalize=False) and save the figure into this run's
    # confusion-matrix folder.
    plot_confusion_matrix(cm=cm,
                          normalize=False,
                          target_names=labels,
                          title="Confusion Matrix",
                          file_path=os.path.join(confusion_matrices_path, 'e_gcn.png'))

In [None]:
if e_gcn:
    # False-positive / false-negative rate, with "Attack" (matrix index 1)
    # as the positive class.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if e_gcn:
    # The dict form of the report is what gets stored in results_final.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["e_gcn"].update({
        "elapsed": elapsed,
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    # Cross-model summaries keyed by model name.
    results_final["accuracy"]["e_gcn"] = cr["accuracy"]
    results_final["time_elapsed"]["e_gcn"] = elapsed

    # Text form of the same report for reading in the notebook.
    print(classification_report(actual, test_pred, digits=4))

### Testing EGCN residual

In [None]:
if e_gcn_res:
    # Evaluate the residual EGCN model on the testing graphs; test_gnn returns
    # the true labels, the predictions and the elapsed time.
    actual, test_pred, elapsed = test_gnn(model_gcn_res, testing_graphs)

    # Value 0 is reported as "Normal", every other value as "Attack".
    actual = ["Normal" if i == 0 else "Attack" for i in actual]
    test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

In [None]:
if e_gcn_res:
    # The explicit label order puts "Normal" at index 0 and "Attack" at
    # index 1 of the matrix, the positions calculate_FPR_FNR reads from.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels=labels)

In [None]:
if e_gcn_res:
    # Plot raw counts (normalize=False) and save the figure into this run's
    # confusion-matrix folder.
    plot_confusion_matrix(cm=cm,
                          normalize=False,
                          target_names=labels,
                          title="Confusion Matrix",
                          file_path=os.path.join(confusion_matrices_path, 'e_gcn_res.png'))

In [None]:
if e_gcn_res:
    # False-positive / false-negative rate, with "Attack" (matrix index 1)
    # as the positive class.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if e_gcn_res:
    # The dict form of the report is what gets stored in results_final.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["e_gcn_res"].update({
        "elapsed": elapsed,
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    # Cross-model summaries keyed by model name.
    results_final["accuracy"]["e_gcn_res"] = cr["accuracy"]
    results_final["time_elapsed"]["e_gcn_res"] = elapsed

    # Text form of the same report for reading in the notebook.
    print(classification_report(actual, test_pred, digits=4))

### Testing EGraphSage

In [None]:
if e_graph_sage:
    # Evaluate the EGraphSage model on the testing graphs; test_gnn returns
    # the true labels, the predictions and the elapsed time.
    actual, test_pred, elapsed = test_gnn(model_sage, testing_graphs)

    # Value 0 is reported as "Normal", every other value as "Attack".
    actual = ["Normal" if i == 0 else "Attack" for i in actual]
    test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

In [None]:
if e_graph_sage:
    # The explicit label order puts "Normal" at index 0 and "Attack" at
    # index 1 of the matrix, the positions calculate_FPR_FNR reads from.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels=labels)

In [None]:
if e_graph_sage:
    # Plot raw counts (normalize=False) and save the figure into this run's
    # confusion-matrix folder.
    plot_confusion_matrix(cm=cm,
                          normalize=False,
                          target_names=labels,
                          title="Confusion Matrix",
                          file_path=os.path.join(confusion_matrices_path, 'e_graph_sage.png'))

In [None]:
if e_graph_sage:
    # False-positive / false-negative rate, with "Attack" (matrix index 1)
    # as the positive class.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if e_graph_sage:
    # The dict form of the report is what gets stored in results_final.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["e_graph_sage"].update({
        "elapsed": elapsed,
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    # Cross-model summaries keyed by model name.
    results_final["accuracy"]["e_graph_sage"] = cr["accuracy"]
    results_final["time_elapsed"]["e_graph_sage"] = elapsed

    # Text form of the same report for reading in the notebook.
    print(classification_report(actual, test_pred, digits=4))

### Testing EGraphSage residual

In [None]:
if e_graph_sage_res:
    # Evaluate the residual EGraphSage model on the testing graphs; test_gnn
    # returns the true labels, the predictions and the elapsed time.
    actual, test_pred, elapsed = test_gnn(model_sage_res, testing_graphs)

    # Value 0 is reported as "Normal", every other value as "Attack".
    actual = ["Normal" if i == 0 else "Attack" for i in actual]
    test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

In [None]:
if e_graph_sage_res:
    # The explicit label order puts "Normal" at index 0 and "Attack" at
    # index 1 of the matrix, the positions calculate_FPR_FNR reads from.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels=labels)

In [None]:
if e_graph_sage_res:
    # Plot raw counts (normalize=False) and save the figure into this run's
    # confusion-matrix folder.
    plot_confusion_matrix(cm=cm,
                          normalize=False,
                          target_names=labels,
                          title="Confusion Matrix",
                          file_path=os.path.join(confusion_matrices_path, 'e_graph_sage_res.png'))

In [None]:
if e_graph_sage_res:
    # False-positive / false-negative rate, with "Attack" (matrix index 1)
    # as the positive class.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if e_graph_sage_res:
    # The dict form of the report is what gets stored in results_final.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["e_graph_sage_res"].update({
        "elapsed": elapsed,
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    # Cross-model summaries keyed by model name.
    results_final["accuracy"]["e_graph_sage_res"] = cr["accuracy"]
    results_final["time_elapsed"]["e_graph_sage_res"] = elapsed

    # Text form of the same report for reading in the notebook.
    print(classification_report(actual, test_pred, digits=4))

### Testing EGAT

In [None]:
if e_gat:
    # Evaluate the EGAT model on the testing graphs; test_gnn returns the
    # true labels, the predictions and the elapsed time.
    actual, test_pred, elapsed = test_gnn(model_gat, testing_graphs)

    # Value 0 is reported as "Normal", every other value as "Attack".
    actual = ["Normal" if i == 0 else "Attack" for i in actual]
    test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

In [None]:
if e_gat:
    # The explicit label order puts "Normal" at index 0 and "Attack" at
    # index 1 of the matrix, the positions calculate_FPR_FNR reads from.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels=labels)

In [None]:
if e_gat:
    # Plot raw counts (normalize=False) and save the figure into this run's
    # confusion-matrix folder.
    plot_confusion_matrix(cm=cm,
                          normalize=False,
                          target_names=labels,
                          title="Confusion Matrix",
                          file_path=os.path.join(confusion_matrices_path, 'e_gat.png'))

In [None]:
if e_gat:
    # False-positive / false-negative rate, with "Attack" (matrix index 1)
    # as the positive class.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if e_gat:
    # The dict form of the report is what gets stored in results_final.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["e_gat"].update({
        "elapsed": elapsed,
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    # Cross-model summaries keyed by model name.
    results_final["accuracy"]["e_gat"] = cr["accuracy"]
    results_final["time_elapsed"]["e_gat"] = elapsed

    # Text form of the same report for reading in the notebook.
    print(classification_report(actual, test_pred, digits=4))

### Testing EGAT residual

In [None]:
if e_gat_res:
    # Evaluate the residual EGAT model on the testing graphs; test_gnn returns
    # the true labels, the predictions and the elapsed time.
    actual, test_pred, elapsed = test_gnn(model_gat_res, testing_graphs)

    # Value 0 is reported as "Normal", every other value as "Attack".
    actual = ["Normal" if i == 0 else "Attack" for i in actual]
    test_pred = ["Normal" if i == 0 else "Attack" for i in test_pred]

In [None]:
if e_gat_res:
    # The explicit label order puts "Normal" at index 0 and "Attack" at
    # index 1 of the matrix, the positions calculate_FPR_FNR reads from.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels=labels)

In [None]:
if e_gat_res:
    # Plot raw counts (normalize=False) and save the figure into this run's
    # confusion-matrix folder.
    plot_confusion_matrix(cm=cm,
                          normalize=False,
                          target_names=labels,
                          title="Confusion Matrix",
                          file_path=os.path.join(confusion_matrices_path, 'e_gat_res.png'))

In [None]:
if e_gat_res:
    # False-positive / false-negative rate, with "Attack" (matrix index 1)
    # as the positive class.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if e_gat_res:
    # The dict form of the report is what gets stored in results_final.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["e_gat_res"].update({
        "elapsed": elapsed,
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    # Cross-model summaries keyed by model name.
    results_final["accuracy"]["e_gat_res"] = cr["accuracy"]
    results_final["time_elapsed"]["e_gat_res"] = elapsed

    # Text form of the same report for reading in the notebook.
    print(classification_report(actual, test_pred, digits=4))

### Saving results

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars and arrays."""

    def default(self, obj):
        """Convert NumPy values to plain Python; defer everything else.

        NumPy integers become ``int``, NumPy floats become ``float`` and
        arrays become (nested) lists. Any other type is handed to the base
        class, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)


# Write the collected results as JSON into this run's results folder.
# The context manager closes the file even if serialisation fails.
filename = os.path.join(folder_path, 'results.json')
with open(filename, 'w') as outfile:
    json.dump(results_final, outfile, cls=NumpyEncoder)