In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import timeit
import json

os.environ["DGLBACKEND"] = "pytorch"

from dgl import from_networkx
import networkx as nx


import torch as th
import torch.nn as nn
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.utils import class_weight

import matplotlib.pyplot as plt
import itertools

from src.graph_neural_networks import GRAPHSAGE, GAT, GCN
from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures

# datasets = {dataset.name: dataset for dataset in datasets_list}

# Training hyper-parameters shared by every model in this notebook.
num_epochs, batch_size = 100, 128
learning_rate = 0.001
# Regularisation coefficients (not referenced by the cells visible here).
LAMBD_1, LAMBD_2 = 0.0001, 0.001


# Model toggles: only architectures set to True are trained and evaluated.
gcn, graph_sage, gat = False, True, False


In [None]:
# Dataset to run on: must be a key of `datasets`. Keep exactly one line uncommented.
name = "cic_ids_2017_5_percent"
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017"
# name = "nf_bot_iot"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"

# Dataset descriptor looked up by name; later cells read its `label_col` attribute.
dataset = datasets[name]

In [None]:
# Container for everything this run reports; written to JSON at the end.
# Only the epoch count and batch size are recorded as configuration for now
# (early stopping, PCA, centralities, learning rate, lambdas and cfg are not).
results_final = {
    "name": name,
    "configuration": {
        "num_epochs": num_epochs,
        "batch_size": batch_size,
    },
    "accuracy": {},
}

# One (initially empty) result section per enabled model.
for enabled, section in ((gcn, "gcn"), (graph_sage, "graph_sage"), (gat, "gat")):
    if enabled:
        results_final[section] = {}

results_final

In [None]:
# Timestamp (YYYYmmdd-HHMMSS) used to name this run's results folder.
dtime = time.strftime("%Y%m%d-%H%M%S")
dtime

In [None]:
# Output layout: results/<dataset>/line_graph_unsorted/<timestamp>/confusion_matrices
# NOTE(review): results are filed under "line_graph_unsorted" while the graphs
# are loaded from "line_graph_sorted" in the next cell — confirm which is intended.
results_folder_path = "results"
results_folder_path1, results_folder_path2 = (
    os.path.join(results_folder_path, name),
    os.path.join(results_folder_path, name, "line_graph_unsorted"),
)
folder_path = os.path.join(results_folder_path2, dtime)
confusion_matrices_path = os.path.join(folder_path, "confusion_matrices")
# Creating the deepest folder also creates every parent above it.
os.makedirs(confusion_matrices_path, exist_ok=True)

In [None]:
# Input locations. Two mutually exclusive modes are supported by the cells below:
#   * `dataset_folder_path`: one folder of graphs, split into train/test in the notebook;
#   * `dataset_folder_path_train` / `dataset_folder_path_test`: pre-split folders.
# A path left as None disables the corresponding loading cell.
dataset_folder_path = None
dataset_folder_path_train = None
dataset_folder_path_test = None

dataset_folder_path = os.path.join("datasets", name, "line_graph_sorted", "graphs")
# dataset_folder_path = os.path.join("datasets", name, "line_graph_unsorted", "graphs")

# dataset_folder_path_train = os.path.join("datasets", name, "line_graph_sorted_train_test", "training")
# dataset_folder_path_test = os.path.join("datasets", name, "line_graph_sorted_train_test", "testing")

# dataset_folder_path_train = os.path.join("datasets", name, "line_graph_unsorted_train_test", "training")
# dataset_folder_path_test = os.path.join("datasets", name, "line_graph_unsorted_train_test", "testing")

In [None]:
# read training and testing graphs
# Single-folder mode: load every pickled graph, convert it to DGL and split 80/20.
if dataset_folder_path:
    graphs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the seeded
    # split below identical from one machine/run to the next.
    for file in sorted(os.listdir(dataset_folder_path)):
        with open(os.path.join(dataset_folder_path, file), "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code — only
            # load graph files produced by a trusted pipeline.
            G = pickle.load(f)
        # Keep the node features 'h', the label column and the "index" attribute.
        G = from_networkx(G, node_attrs=['h', dataset.label_col, "index"])
        graphs.append(G)

    training_graphs, testing_graphs = train_test_split(graphs, test_size=0.2, random_state=42)
    # A bare expression inside an `if` is not displayed by Jupyter, so print explicitly.
    print(f"training graphs: {len(training_graphs)}, testing graphs: {len(testing_graphs)}")

In [None]:
# Pre-split mode: (re)build `training_graphs` from the dedicated training folder.
if dataset_folder_path_train:
    training_graphs = []
    # os.listdir returns entries in arbitrary order; sorting fixes the order in
    # which graphs are presented to the optimizer so runs are repeatable.
    for file in sorted(os.listdir(dataset_folder_path_train)):
        with open(os.path.join(dataset_folder_path_train, file), "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code — only
            # load graph files produced by a trusted pipeline.
            G = pickle.load(f)
        # Keep the node features 'h', the label column and the "index" attribute.
        G = from_networkx(G, node_attrs=['h', dataset.label_col, "index"])
        training_graphs.append(G)

    # A bare expression inside an `if` is not displayed by Jupyter, so print explicitly.
    print(f"training graphs: {len(training_graphs)}")

In [None]:
def compute_accuracy(pred, labels):
    """Return the fraction of rounded predictions that equal ``labels``.

    ``pred`` is rounded to the nearest integer before the element-wise
    comparison; the result is a plain Python float.
    """
    hits = pred.round().eq(labels)
    return hits.float().mean().item()

In [None]:
# Input width of the models: number of columns of the node-feature matrix 'h'.
# Read it from the first training graph rather than from a loop variable left
# over by an earlier cell, so the value does not depend on hidden kernel state.
G0 = training_graphs[0]
features_number = G0.ndata['h'].shape[1]

In [None]:
def train_gnn(model, graphs, epochs=None, lr=None):
    """Train ``model`` for binary node classification, one optimizer step per graph.

    Args:
        model: module called as ``model(G, node_features)``. Its output is
            squeezed on dim 1 and passed to ``nn.BCELoss``, so it must already
            be a probability in [0, 1].
        graphs: sequence of graphs exposing ``ndata['h']`` (features) and
            ``ndata[dataset.label_col]`` (0/1 labels).
        epochs: number of passes over ``graphs``; defaults to ``num_epochs``.
        lr: Adam learning rate; defaults to ``learning_rate``.

    Returns:
        The same ``model`` instance, updated in place.
    """
    epochs = num_epochs if epochs is None else epochs
    lr = learning_rate if lr is None else lr

    opt = th.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    model.train()
    # Inclusive upper bound so exactly `epochs` passes run (numbered 1..epochs).
    for epoch in range(1, epochs + 1):
        predictions = []
        labels = []
        for G in graphs:
            node_features = G.ndata['h']
            node_label = G.ndata[dataset.label_col].float()

            pred = model(G, node_features).squeeze(1)
            loss = criterion(pred, node_label)
            opt.zero_grad()
            loss.backward()
            opt.step()

            # Store a detached copy: the accuracy report needs values only, and
            # keeping `pred` itself would hold every graph's autograd state
            # alive until the end of the epoch.
            predictions.append(pred.detach())
            labels.append(node_label)

        predictions_tensor = th.cat(predictions, dim=0)
        labels_tensor = th.cat(labels, dim=0)

        print('Epoch:', epoch ,' Training acc:', compute_accuracy(predictions_tensor, labels_tensor))
    return model


### Training EGCN

In [None]:
if gcn:
    # Build a GCN sized to the node-feature width and train it on the training graphs.
    model = GCN(features_number)
    model_gcn = train_gnn(model, training_graphs)

### Training EGraphSage 

In [None]:
if graph_sage:
    # aggregator_type choices noted by the author: ``mean``, ``gcn``, ``pool``, ``lstm``
    # (presumably forwarded to DGL's SAGEConv — TODO confirm in src.graph_neural_networks).
    model = GRAPHSAGE(features_number, aggregator_type = "gcn")
    # model = GRAPHSAGE(features_number, aggregator_type = "lstm")
    model_sage = train_gnn(model, training_graphs)

### Training EGAT

In [None]:
if gat:
    # Build a GAT sized to the node-feature width and train it on the training graphs.
    model = GAT(features_number)
    model_gat = train_gnn(model, training_graphs)

# Testing

In [None]:
# Pre-split mode: (re)build `testing_graphs` from the dedicated testing folder.
if dataset_folder_path_test:
    testing_graphs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the evaluated graphs (and of the concatenated predictions) repeatable.
    for file in sorted(os.listdir(dataset_folder_path_test)):
        with open(os.path.join(dataset_folder_path_test, file), "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code — only
            # load graph files produced by a trusted pipeline.
            G = pickle.load(f)
        # Keep the node features 'h', the label column and the "index" attribute.
        G = from_networkx(G, node_attrs=['h', dataset.label_col, "index"])
        testing_graphs.append(G)

    # A bare expression inside an `if` is not displayed by Jupyter, so print explicitly.
    print(f"testing graphs: {len(testing_graphs)}")

In [None]:
def test_gnn(model, graphs):
    """Run ``model`` over ``graphs`` and collect labels and rounded predictions.

    Args:
        model: trained module called as ``model(G, node_features)``; its
            output is squeezed on dim 1.
        graphs: sequence of graphs exposing ``ndata['h']`` and
            ``ndata[dataset.label_col]``.

    Returns:
        ``(labels_tensor, predictions_tensor)``: the concatenated float labels
        as a torch tensor and the concatenated, rounded predictions as a NumPy
        array, both in the order the graphs were given.
    """
    predictions = []
    labels = []

    # Inference mode: disable training-only layers (e.g. dropout) and skip
    # gradient tracking. The previous mode is restored afterwards so calling
    # this function does not change how a later training run behaves.
    was_training = model.training
    model.eval()
    start_time = timeit.default_timer()
    try:
        with th.no_grad():
            for G in graphs:
                node_features_test = G.ndata['h']
                node_label_test = G.ndata[dataset.label_col].float()
                test_pred = model(G, node_features_test).squeeze(1)
                predictions.append(test_pred)
                labels.append(node_label_test)
    finally:
        model.train(was_training)

    elapsed = timeit.default_timer() - start_time
    print(str(elapsed) + ' seconds')

    # .cpu() makes the NumPy conversion work regardless of the model's device.
    predictions_tensor = th.cat(predictions, dim=0).round().cpu().numpy()
    print(f"==>> predictions_tensor: {predictions_tensor}")
    labels_tensor = th.cat(labels, dim=0)
    print(f"==>> labels_tensor: {labels_tensor}")

    return (labels_tensor, predictions_tensor)


In [None]:

def calculate_FPR_FNR(cm):
    """Return ``(FPR, FNR)`` for a 2x2 confusion matrix.

    Args:
        cm: 2x2 matrix (nested list or array) laid out as
            ``[[TN, FP], [FN, TP]]``, i.e. row = true class,
            column = predicted class, negative class first.

    Returns:
        Tuple of Python floats:
        FPR = FP / (FP + TN), the false positive rate (fall-out);
        FNR = FN / (TP + FN), the false negative rate.
        A rate whose denominator is zero (no samples of that true class)
        is returned as ``nan``.
    """
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]

    def _rate(numerator, denominator):
        # Only the two returned rates are computed, so an empty predicted
        # class can no longer trigger a division by zero in an unused metric.
        if not denominator:
            return float("nan")
        return float(numerator) / float(denominator)

    FPR = _rate(FP, FP + TN)
    FNR = _rate(FN, TP + FN)

    return FPR, FNR

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          file_path = None):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        cm: square confusion matrix (array or nested list), rows = true
            labels, columns = predicted labels.
        target_names: tick labels for both axes, or None for no tick labels.
        title: figure title.
        cmap: matplotlib colormap; defaults to 'Blues'.
        normalize: if True, show each row as fractions of its row total
            instead of raw counts.
        file_path: if given, the figure is saved there before being shown.

    The overall accuracy and misclassification rate are always computed from
    the raw counts and shown under the x axis.
    """
    cm = np.asarray(cm)

    total = float(np.sum(cm))
    accuracy = np.trace(cm) / total if total else float("nan")
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalise before drawing so cell colours, cell text and the text-colour
    # threshold all refer to the same values. Rows with no samples stay at 0.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(12, 12))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text for contrast.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out after the axis labels exist so they are included in the fit.
    plt.tight_layout()
    if file_path:
        plt.savefig(file_path)
    plt.show()

### Testing EGCN

In [None]:
if gcn:
    # Evaluate the trained GCN on the test graphs, then map the numeric
    # classes to names: 0 -> "Normal", anything else -> "Attack".
    actual, test_pred = test_gnn(model_gcn, testing_graphs)

    actual = ["Attack" if value != 0 else "Normal" for value in actual]
    test_pred = ["Attack" if value != 0 else "Normal" for value in test_pred]

In [None]:
if gcn:
    # Fixing the label order puts "Normal" in row/column 0 and "Attack" in
    # row/column 1 — the layout calculate_FPR_FNR indexes into.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels= labels)

In [None]:
if gcn:
    # Raw counts (normalize=False); the figure is also saved with this run's results.
    # The path is built with os.path.join, like the other result paths.
    plot_confusion_matrix(cm = cm,
                      normalize    = False,
                      target_names = labels,
                      title        = "Confusion Matrix",
                      file_path = os.path.join(confusion_matrices_path, 'gcn.png'))

In [None]:
if gcn:
    # False positive / false negative rates from the 2x2 matrix built above.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if gcn:
    # Dict form of the report is stored with the results; the text form is printed.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["gcn"].update({
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    results_final["accuracy"]["gcn"] = cr["accuracy"]

    print(classification_report(actual, test_pred, digits=4))

### Testing EGraphSage

In [None]:
if graph_sage:
    # Evaluate the trained GraphSAGE model on the test graphs, then map the
    # numeric classes to names: 0 -> "Normal", anything else -> "Attack".
    actual, test_pred = test_gnn(model_sage, testing_graphs)

    actual = ["Attack" if value != 0 else "Normal" for value in actual]
    test_pred = ["Attack" if value != 0 else "Normal" for value in test_pred]

In [None]:
if graph_sage:
    # Fixing the label order puts "Normal" in row/column 0 and "Attack" in
    # row/column 1 — the layout calculate_FPR_FNR indexes into.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels= labels)

In [None]:
if graph_sage:
    # Raw counts (normalize=False); the figure is also saved with this run's results.
    # The path is built with os.path.join, like the other result paths.
    plot_confusion_matrix(cm = cm,
                      normalize    = False,
                      target_names = labels,
                      title        = "Confusion Matrix",
                      file_path = os.path.join(confusion_matrices_path, 'e_graph_sage.png'))

In [None]:
if graph_sage:
    # False positive / false negative rates from the 2x2 matrix built above.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if graph_sage:
    # Dict form of the report is stored with the results; the text form is printed.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["graph_sage"].update({
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    results_final["accuracy"]["graph_sage"] = cr["accuracy"]

    print(classification_report(actual, test_pred, digits=4))

### Testing EGAT

In [None]:
if gat:
    # Evaluate the trained GAT on the test graphs, then map the numeric
    # classes to names: 0 -> "Normal", anything else -> "Attack".
    actual, test_pred = test_gnn(model_gat, testing_graphs)

    actual = ["Attack" if value != 0 else "Normal" for value in actual]
    test_pred = ["Attack" if value != 0 else "Normal" for value in test_pred]

In [None]:
if gat:
    # Fixing the label order puts "Normal" in row/column 0 and "Attack" in
    # row/column 1 — the layout calculate_FPR_FNR indexes into.
    labels = ["Normal", "Attack"]
    cm = confusion_matrix(actual, test_pred, labels= labels)

In [None]:
if gat:
    # Raw counts (normalize=False); the figure is also saved with this run's results.
    # The path is built with os.path.join, like the other result paths.
    plot_confusion_matrix(cm = cm,
                      normalize    = False,
                      target_names = labels,
                      title        = "Confusion Matrix",
                      file_path = os.path.join(confusion_matrices_path, 'e_gat.png'))

In [None]:
if gat:
    # False positive / false negative rates from the 2x2 matrix built above.
    FPR, FNR = calculate_FPR_FNR(cm)

In [None]:
if gat:
    # Dict form of the report is stored with the results; the text form is printed.
    cr = classification_report(actual, test_pred, digits=4, output_dict=True)
    results_final["gat"].update({
        "classification_report": cr,
        "FPR": FPR,
        "FNR": FNR,
    })
    results_final["accuracy"]["gat"] = cr["accuracy"]

    print(classification_report(actual, test_pred, digits=4))

### Saving results

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalars and arrays into built-in types."""

    def default(self, obj):
        """Convert NumPy integers, floats and arrays; defer everything else."""
        # Checked in order: integer scalar, floating scalar, then array.
        conversions = (
            (np.integer, int),
            (np.floating, float),
            (np.ndarray, lambda array: array.tolist()),
        )
        for numpy_type, convert in conversions:
            if isinstance(obj, numpy_type):
                return convert(obj)
        # Unknown types fall through to the base class, which raises TypeError.
        return super().default(obj)

# Write the collected metrics as JSON into this run's results folder.
# The `with` block closes the file even if serialisation fails.
filename = os.path.join(folder_path, 'results.json')
with open(filename, 'w') as outfile:
    json.dump(results_final, outfile, cls=NumpyEncoder)