In [2]:
#Imports
import io
import os
import gc
import copy
import math
import ctypes
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from sklearn import metrics

import tensorflow as tf
from tensorflow.keras import Input, Model, layers, losses, optimizers, callbacks

#Notebook settings
default_max_columns = None
default_max_rows = 10
pd.set_option('display.max_columns', default_max_columns)
pd.set_option('display.max_rows', default_max_rows)

model_name = "NEURAL_NETWORK"
label_column = "Label"
batch_size = 512
num_datasets = 6
cwd = os.getcwd()

In [3]:
#Utility functions
def split_xy(df, colname):
    y_cols = [c for c in df.columns if c.startswith(colname)]
    x_cols = np.setdiff1d(df.columns, y_cols)
    X = df[x_cols]
    Y = df[y_cols]
    return X, Y

In [4]:
%%time
#Training and evaluating loop designed to work with memory constraints 

df_names = ['NF-UNSW-NB15', 'NF-CSE-CIC-IDS2018', 'NF-BoT-IoT', 'NF-ToN-IoT', 'X-IIoTiD', 'WUSTL-IIOT']
result_df = pd.DataFrame(columns=("TRAIN_DATASET", "EVAL_DATASET", "METRIC", "VAlUE", "LABEL", "MODEL"))

for i in range(num_datasets):

    #Model definiton
    #Deep Neural Network Binary Classifier with sigmoid activiation and 4 layers
    def make_nn_model(input_shape=[32], n_out=2):
        inx = Input(input_shape)
        x = layers.Dense(64, activation=tf.keras.activations.sigmoid)(inx)
        x = layers.Dense(32, activation=tf.keras.activations.sigmoid)(x)
        x = layers.Dense(n_out,activation=tf.keras.activations.sigmoid)(x)
        return Model(inx, x)
    
    loss = losses.BinaryCrossentropy()
    opt = optimizers.Adam(learning_rate=5e-3)
    nn_modell = make_nn_model()
    nn_modell.compile(loss=loss, optimizer=opt, metrics=['accuracy'])

    #Load training and validation data
    print("Loading "+df_names[i]+" trainingset")
    df_train = pd.read_csv(cwd+"/normalized_datasets/"+df_names[i]+"_n_Trainset.csv", dtype=float)
    print(df_train.shape)
    xtrain, ytrain = split_xy(df_train, label_column) 
    ytrain = pd.get_dummies(ytrain, columns=[label_column], dtype = "float") #Split Labels per Class
    del df_train
    trainset = tf.data.Dataset.from_tensor_slices((xtrain, ytrain)).batch(batch_size)
    del xtrain, ytrain
    
    df_val = pd.read_csv(cwd+"/normalized_datasets/"+df_names[i]+"_n_Valset.csv", dtype=float)
    xval, yval = split_xy(df_val, label_column) 
    yval = pd.get_dummies(yval, columns=[label_column], dtype = "float") #Split Labels per Class
    del df_val
    valset = tf.data.Dataset.from_tensor_slices((xval, yval)).batch(batch_size)
    del xval, yval
    
    gc.collect()
    print("Loading complete")
    
    #Train
    print("Training on "+df_names[i]+" trainingset")
    hist = nn_modell.fit(trainset,validation_data = valset, epochs = 50, verbose = 1)
    del trainset, valset
    gc.collect
    print("Training complete")
    
    #Evaluation
    for j in range(num_datasets):

        print("Loading "+df_names[j]+" testset "+str(j))
        #Load test data
        df_test = pd.read_csv(cwd+"/normalized_datasets/"+df_names[j]+"_n_Testset.csv", dtype=float)
        print(df_test.shape)
        xtest, ytest = split_xy(df_test, label_column)
        del df_test
        print("Loading complete")

        #Predict
        print("Evaluating model trained on "+df_names[i]+" trainset on "+df_names[j]+" testset")
        ypred = pd.DataFrame(nn_modell.predict(xtest), columns=["Label_0","Label_1"])
        del xtest
        ypred["Label"] = ypred[["Label_0", "Label_1"]].idxmax(axis=1)
        ypred["Label"] = ypred["Label"].apply(lambda x: 0 if x == "Label_0" else 1).astype("float32")
        ypred = ypred["Label"].to_numpy() 
      
        print("Evaluation complete")
        #Results
        print("Results for:...")
        print("Model trained on "+df_names[i]+" and evaluated on "+df_names[j])
        print(confusion_matrix(ytest, ypred))
        print(classification_report(ytest, ypred))
        precision, recall, f1_score, support = precision_recall_fscore_support(ytest,ypred, zero_division = 0.0)
        accuracy = accuracy_score(ytest, ypred)
        balanced_accuracy = balanced_accuracy_score(ytest, ypred)
        mcc = matthews_corrcoef(ytest, ypred)
        print("---")

        del ypred,ytest
        gc.collect

        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"ACCURACY", "VAlUE":accuracy, "LABEL":"BOTH", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"BALANCED_ACCURACY", "VAlUE":balanced_accuracy, "LABEL":"BOTH", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"MCC", "VAlUE":mcc, "LABEL":"BOTH", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"PRECISION", "VAlUE":precision[0], "LABEL":"0", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"PRECISION", "VAlUE":precision[1], "LABEL":"1", "MODEL":model_name}, index = [0])], ignore_index=True)   
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"RECALL", "VAlUE":recall[0], "LABEL":"0", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"RECALL", "VAlUE":recall[1], "LABEL":"1", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"F1_SCORE", "VAlUE":f1_score[0], "LABEL":"0", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"F1_SCORE", "VAlUE":f1_score[1], "LABEL":"1", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"SUPPORT", "VAlUE":support[0], "LABEL":"0", "MODEL":model_name}, index = [0])], ignore_index=True)
        result_df = pd.concat([result_df,pd.DataFrame({"TRAIN_DATASET":df_names[i], "EVAL_DATASET":df_names[j], "METRIC":"SUPPORT", "VAlUE":support[1], "LABEL":"1", "MODEL":model_name}, index = [0])], ignore_index=True)
        
    del nn_modell
    gc.collect
    libc = ctypes.CDLL("libc.so.6") # clearing cache 
    libc.malloc_trim(0)

    print("Cleanup complete")
    print("---+++---")
        
        
        

Loading NF-UNSW-NB15 trainingset
(1168644, 33)
Loading complete
Training on NF-UNSW-NB15 trainingset
Epoch 1/50
[1m2283/2283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9626 - loss: 0.0917 - val_accuracy: 0.9673 - val_loss: 0.0446
Epoch 2/50
[1m2283/2283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9755 - loss: 0.0412 - val_accuracy: 0.9648 - val_loss: 0.0454
Epoch 3/50
[1m2283/2283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9759 - loss: 0.0407 - val_accuracy: 0.9663 - val_loss: 0.0450
Epoch 4/50
[1m2283/2283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9762 - loss: 0.0404 - val_accuracy: 0.9674 - val_loss: 0.0445
Epoch 5/50
[1m2283/2283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9765 - loss: 0.0401 - val_accuracy: 0.9679 - val_loss: 0.0442
Epoch 6/50
[1m2283/2283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 



(1678480, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-CSE-CIC-IDS2018 testset
[1m52453/52453[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 609us/step
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-CSE-CIC-IDS2018
[[1449607   24932]
 [ 203941       0]]
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93   1474539
         1.0       0.00      0.00      0.00    203941

    accuracy                           0.86   1678480
   macro avg       0.44      0.49      0.46   1678480
weighted avg       0.77      0.86      0.81   1678480

---
Loading NF-BoT-IoT testset 2
(120020, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-BoT-IoT testset
[1m3751/3751[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 704us/step
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-BoT-IoT
[[  2716     62]
 [117197     4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96    220915
         1.0       0.00      0.00      0.00     17250

    accuracy                           0.93    238165
   macro avg       0.46      0.50      0.48    238165
weighted avg       0.86      0.93      0.89    238165

---
Cleanup complete
---+++---
Loading NF-CSE-CIC-IDS2018 trainingset
(6042527, 33)


2025-01-11 17:00:55.803830: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1546886912 exceeds 10% of free system memory.


Loading complete
Training on NF-CSE-CIC-IDS2018 trainingset
Epoch 1/50


2025-01-11 17:00:58.509408: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1546886912 exceeds 10% of free system memory.


[1m11802/11802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step - accuracy: 0.9857 - loss: 0.0590 - val_accuracy: 0.9920 - val_loss: 0.0345
Epoch 2/50
[1m11802/11802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3ms/step - accuracy: 0.9920 - loss: 0.0333 - val_accuracy: 0.9924 - val_loss: 0.0315
Epoch 3/50
[1m11802/11802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - accuracy: 0.9924 - loss: 0.0305 - val_accuracy: 0.9929 - val_loss: 0.0293
Epoch 4/50
[1m11802/11802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - accuracy: 0.9926 - loss: 0.0277 - val_accuracy: 0.9927 - val_loss: 0.0246
Epoch 5/50
[1m11802/11802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - accuracy: 0.9928 - loss: 0.0237 - val_accuracy: 0.9927 - val_loss: 0.0227
Epoch 6/50
[1m11802/11802[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3ms/step - accuracy: 0.9928 - loss: 0.0213 - val_accuracy: 0.9928 - val_loss: 0.0209
Epoch 7/5

In [5]:
result_df.to_csv(cwd+"/results/"+model_name+"_results.csv",index=False)