In [1]:
#Imports
import io
import os
import gc
import copy
import math
import ctypes
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from sklearn import metrics
from sklearn.svm import LinearSVC

#Notebook settings
# pandas display limits applied to every frame rendered in this notebook.
default_max_columns = None
default_max_rows = 10
pd.options.display.max_columns = default_max_columns
pd.options.display.max_rows = default_max_rows

# Run-wide constants.
# model_name is written into every result row and into the results file name;
# label_column is the column-name prefix that split_xy treats as the target;
# num_datasets bounds the train/evaluate loops; cwd is the base for all paths.
model_name, label_column = "SUPPORT_VECTOR_MACHINE", "Label"
batch_size = 512
num_datasets = 6
cwd = os.getcwd()

In [2]:
#Utility functions
def split_xy(df, colname):
    """Split a frame into feature columns and target columns.

    Every column whose name starts with ``colname`` is treated as a target;
    all remaining columns are features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding features and target column(s); column names must be strings.
    colname : str
        Prefix identifying the target column(s).

    Returns
    -------
    (X, Y) : tuple of pandas.DataFrame
        ``X`` holds the feature columns in alphabetical order (not file order),
        ``Y`` holds the target columns in their original order.
    """
    target_names = [name for name in df.columns if name.startswith(colname)]
    # Sorted set difference: unique, alphabetically ordered feature names.
    feature_names = sorted(set(df.columns).difference(target_names))
    return df[feature_names], df[target_names]

In [5]:
%%time
#Training and evaluation loop designed to work within memory constraints

# Cross-dataset experiment: a fresh linear SVM is trained on each dataset's
# training split and then evaluated on the test split of every dataset.
# Only one training or test set is kept in memory at a time.
df_names = ['NF-UNSW-NB15', 'NF-CSE-CIC-IDS2018', 'NF-BoT-IoT', 'NF-ToN-IoT', 'X-IIoTiD', 'WUSTL-IIOT']

# NOTE(review): "VAlUE" is misspelled, but it is the header written to the
# results CSV, so it is kept unchanged to avoid altering that file's schema.
result_columns = ["TRAIN_DATASET", "EVAL_DATASET", "METRIC", "VAlUE", "LABEL", "MODEL"]

# Pin the reported classes so the per-class metric arrays always have two
# entries, even if a class is missing from a test set and its predictions.
# Assumes binary labels encoded as 0/1 (read as floats via dtype=float) -- TODO confirm.
class_labels = [0.0, 1.0]

# Result rows are gathered in a list and converted to a DataFrame once per
# training dataset instead of re-copying the frame with pd.concat per metric.
result_records = []
result_df = pd.DataFrame(columns=result_columns)

# malloc_trim is glibc-specific: resolve it once and treat it as best effort,
# so the experiment still runs where libc.so.6 or the symbol is unavailable.
try:
    malloc_trim = ctypes.CDLL("libc.so.6").malloc_trim
except (OSError, AttributeError):
    malloc_trim = None


def _load_xy(dataset_name, split_suffix):
    """Read one normalized CSV, print its shape, and return (features, labels)."""
    csv_path = os.path.join(cwd, "normalized_datasets", dataset_name + "_n_" + split_suffix + ".csv")
    df = pd.read_csv(csv_path, dtype=float)
    print(df.shape)
    return split_xy(df, label_column)


for train_name in df_names[:num_datasets]:

    #Model definition
    #Support Vector Machine with linear kernel
    svm_model = LinearSVC(C=1.0, verbose=0)

    #Load training data
    print("Loading "+train_name+" trainingset")
    xtrain, ytrain = _load_xy(train_name, "Trainset")
    gc.collect()
    print("Loading complete")

    #Train
    print("Training on "+train_name+" trainingset")
    svm_model.fit(xtrain, ytrain.values.ravel())
    del xtrain, ytrain
    gc.collect()
    print("Training complete")

    #Evaluation
    for eval_name in df_names[:num_datasets]:

        #Load test data
        print("Loading "+eval_name+" testset")
        xtest, ytest = _load_xy(eval_name, "Testset")
        print("Loading complete")

        #Predict
        print("Evaluating model trained on "+train_name+" trainset on "+eval_name+" testset")
        ypred = svm_model.predict(xtest)
        del xtest
        print("Evaluation complete")

        #Results
        print("Results for:...")
        print("Model trained on "+train_name+" and evaluated on "+eval_name)
        print(confusion_matrix(ytest, ypred))
        # zero_division matches the call below and avoids the undefined-metric warnings.
        print(classification_report(ytest, ypred, zero_division=0.0))
        precision, recall, f1_score, support = precision_recall_fscore_support(
            ytest, ypred, labels=class_labels, zero_division=0.0
        )
        accuracy = accuracy_score(ytest, ypred)
        balanced_accuracy = balanced_accuracy_score(ytest, ypred)
        mcc = matthews_corrcoef(ytest, ypred)
        print("bACC:",balanced_accuracy,"MCC:",mcc)
        print("---")

        del ypred, ytest
        gc.collect()

        # (metric, value, label) triples: three overall metrics, then each
        # per-class metric for class 0 followed by class 1.
        metric_rows = [
            ("ACCURACY", accuracy, "BOTH"),
            ("BALANCED_ACCURACY", balanced_accuracy, "BOTH"),
            ("MCC", mcc, "BOTH"),
        ]
        for metric_name, per_class in (
            ("PRECISION", precision),
            ("RECALL", recall),
            ("F1_SCORE", f1_score),
            ("SUPPORT", support),
        ):
            for class_idx in (0, 1):
                metric_rows.append((metric_name, per_class[class_idx], str(class_idx)))

        result_records.extend(
            {
                "TRAIN_DATASET": train_name,
                "EVAL_DATASET": eval_name,
                "METRIC": metric_name,
                "VAlUE": metric_value,
                "LABEL": metric_label,
                "MODEL": model_name,
            }
            for metric_name, metric_value, metric_label in metric_rows
        )

    # Refresh result_df after every training dataset so partial results stay
    # available if the long-running loop is interrupted.
    result_df = pd.DataFrame(result_records, columns=result_columns)

    del svm_model
    gc.collect()
    if malloc_trim is not None:
        malloc_trim(0)  # ask glibc to hand freed heap memory back to the OS

    print("Cleanup complete")
    print("---+++---")

Loading NF-UNSW-NB15 trainingset
(1168644, 33)
(1168644, 33)
Loading complete
Training on NF-UNSW-NB15 trainingset
Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-UNSW-NB15
[[302557   7655]
 [   110  14302]]
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99    310212
         1.0       0.65      0.99      0.79     14412

    accuracy                           0.98    324624
   macro avg       0.83      0.98      0.89    324624
weighted avg       0.98      0.98      0.98    324624

bACC: 0.9838453994122233 MCC: 0.7937047844225776
---
Loading NF-CSE-CIC-IDS2018 testset




(1678480, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-CSE-CIC-IDS2018
[[1412258   62281]
 [ 198697    5244]]
              precision    recall  f1-score   support

         0.0       0.88      0.96      0.92   1474539
         1.0       0.08      0.03      0.04    203941

    accuracy                           0.84   1678480
   macro avg       0.48      0.49      0.48   1678480
weighted avg       0.78      0.84      0.81   1678480

bACC: 0.4917378556131608 MCC: -0.027474460871427613
---
Loading NF-BoT-IoT testset
(120020, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-BoT-IoT testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-BoT-IoT
[[  2678    100]
 [117242      0]]
              precision    recall  f1-score   support

         0.0       0.02      0.96      0.04      277

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.19      1.00      0.33     53654
         1.0       0.00      0.00      0.00    222201

    accuracy                           0.19    275855
   macro avg       0.10      0.50      0.16    275855
weighted avg       0.04      0.19      0.06    275855

bACC: 0.5 MCC: 0.0
---
Loading X-IIoTiD testset
(164018, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on X-IIoTiD testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on X-IIoTiD
[[84355     8]
 [79645    10]]
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.68     84363
         1.0       0.56      0.00      0.00     79655

    accuracy                           0.51    164018
   macro avg       0.53      0.50      0.34    164018
weighted avg       0.53      0.51      0.35    164018

bACC: 0.5000153565478846 MCC: 0.0014653721924935412
---
Loading WUSTL-IIOT 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.02      1.00      0.05      2778
         1.0       0.00      0.00      0.00    117242

    accuracy                           0.02    120020
   macro avg       0.01      0.50      0.02    120020
weighted avg       0.00      0.02      0.00    120020

bACC: 0.5 MCC: 0.0
---
Loading NF-ToN-IoT testset
(275855, 33)
Loading complete
Evaluating model trained on NF-CSE-CIC-IDS2018 trainset on NF-ToN-IoT testset
Evaluation complete
Results for:...
Model trained on NF-CSE-CIC-IDS2018 and evaluated on NF-ToN-IoT
[[ 53564     90]
 [219188   3013]]
              precision    recall  f1-score   support

         0.0       0.20      1.00      0.33     53654
         1.0       0.97      0.01      0.03    222201

    accuracy                           0.21    275855
   macro avg       0.58      0.51      0.18    275855
weighted avg       0.82      0.21      0.09    275855

bACC: 0.5059411902076855 MCC: 0.04459669239456159
---

In [6]:
# Persist the collected metrics as <cwd>/results/<model_name>_results.csv.
# The output directory is created first: to_csv fails on a missing directory,
# which would otherwise abort the export after the long training run.
results_dir = os.path.join(cwd, "results")
os.makedirs(results_dir, exist_ok=True)
result_df.to_csv(os.path.join(results_dir, model_name+"_results.csv"), index=False)