In [1]:
import pandas as pd
import numpy as np
import numpy as np
import pandas as pd 
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
pd.options.mode.chained_assignment = None
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

#### Functions to be used for model and feature selection

In [2]:
### Function to save final model for later use
def save_model(RF, filename):
    """Serialize a model to ``filename`` with pickle.

    Parameters
    ----------
    RF : object
        Any picklable object; in this notebook a fitted scikit-learn estimator.
    filename : str or path-like
        Destination path. An existing file is overwritten.
    """
    # The context manager closes the file handle even if pickling raises,
    # so the data is flushed to disk and no descriptor is leaked.
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

# Function to fit a model with the given hyper-parameters and use it to rank all features by importance.
def get_feature_importance(n_tree, max_leaf, X_train, y_train, model_type):
    """Fit a classifier on all columns of ``X_train`` and rank the features.

    Parameters
    ----------
    n_tree : int
        Number of estimators; only used when ``model_type == 'RF'``.
    max_leaf : int
        Value passed as ``max_leaf_nodes`` to the classifier.
    X_train : pandas.DataFrame
        Training features; its column names become the index of the result.
    y_train : array-like
        Training labels.
    model_type : str
        ``'RF'`` for a random forest or ``'DT'`` for a single decision tree.

    Returns
    -------
    pandas.DataFrame
        One column (labelled ``0``) holding ``feature_importances_``, indexed
        by feature name and sorted from most to least important.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'RF'`` nor ``'DT'``.
    """
    if model_type == 'RF':
        rf_opt = RandomForestClassifier(n_estimators = n_tree, max_leaf_nodes=max_leaf, random_state=42, bootstrap=False)
    elif model_type == 'DT':
        rf_opt = DecisionTreeClassifier(max_leaf_nodes=max_leaf, random_state=42)
    else:
        # Previously an unknown type fell through and failed later with an
        # UnboundLocalError on `rf_opt`; fail early with a clear message instead.
        raise ValueError("model_type must be 'RF' or 'DT', got %r" % (model_type,))

    rf_opt.fit(X_train, y_train)
    feature_importance = pd.DataFrame(rf_opt.feature_importances_, index=X_train.columns)
    feature_importance = feature_importance.sort_values(by=list(feature_importance.columns), axis=0, ascending=False)

    return feature_importance


# Build the nested candidate feature sets (top-1, top-2, ..., all features)
# from an importance ranking; the caller scores each one to find the smallest adequate set.
def get_fewest_features(importance):
    """Return every leading slice of the ranked feature names.

    ``importance`` is expected to be indexed by feature name in ranked order.
    The result is a list whose k-th element (1-based) is the slice of
    ``importance.index`` holding the first k names; it is empty when the
    index is empty.
    """
    ranked_names = importance.index
    return [ranked_names[:k] for k in range(1, len(ranked_names) + 1)]

def get_result_scores(classes, cl_report):
    """Pull the per-class metrics out of a classification-report dictionary.

    ``cl_report`` maps each class name to a dict with the keys
    ``'precision'``, ``'recall'``, ``'f1-score'`` and ``'support'``.
    Returns four lists (precision, recall, f1_score, supports), each ordered
    like ``classes``. A missing class or metric key raises ``KeyError``.
    """
    metric_keys = ('precision', 'recall', 'f1-score', 'support')
    # One list per metric, built in the same order as `metric_keys`
    precision, recall, f1_score, supports = (
        [cl_report[a_class][key] for a_class in classes] for key in metric_keys
    )
    return precision, recall, f1_score, supports


def get_scores(classes, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test,model_type):
    """Train one model on a feature subset and score it on the test data.

    Parameters
    ----------
    classes : list of str
        Display names passed to ``classification_report`` as ``target_names``.
    n_tree : int
        Number of estimators; only used when ``model_type == 'RF'``.
    feats : list-like of str
        Columns of ``X_train`` / ``X_test`` to train and predict on.
    max_leaf : int
        Value passed as ``max_leaf_nodes`` to the classifier.
    X_train, y_train, X_test, y_test
        Train / test features (DataFrames) and labels.
    model_type : str
        ``'RF'`` for a random forest or ``'DT'`` for a single decision tree.

    Returns
    -------
    tuple
        ``(model, class_report, macro_score, weighted_score, y_pred)`` where
        ``class_report`` is the dict form of the classification report and the
        two scores are its macro-average and weighted-average F1.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'RF'`` nor ``'DT'``.
    """
    if model_type == 'RF':
        model = RandomForestClassifier(n_estimators = n_tree, max_leaf_nodes=max_leaf, random_state=42, bootstrap=False)
    elif model_type == 'DT':
        model = DecisionTreeClassifier(max_leaf_nodes=max_leaf, random_state=42)
    else:
        # Previously an unknown type fell through and failed with an
        # UnboundLocalError on `model`; fail early with a clear message instead.
        raise ValueError("model_type must be 'RF' or 'DT', got %r" % (model_type,))

    model.fit(X_train[feats], y_train)
    y_pred = model.predict(X_test[feats])

    class_report = classification_report(y_test, y_pred, target_names=classes, output_dict = True)

    macro_score = class_report['macro avg']['f1-score']
    weighted_score = class_report['weighted avg']['f1-score']

    return model, class_report, macro_score, weighted_score, y_pred


def get_x_y(Dataset, classes, feats):
    """Split a labeled frame into a feature matrix and integer class ids.

    Rows whose ``'label'`` is not in ``classes`` are dropped. The remaining
    labels are encoded as their position in ``classes``.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Must contain a ``'label'`` column and every column named in ``feats``.
    classes : list of str
        Class names to keep; list position defines the integer id.
    feats : list of str
        Feature columns to return.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` with the selected columns and ``y`` with the integer ids,
        both keeping the index of the retained rows.
    """
    Dataset = Dataset[Dataset["label"].isin(classes)]
    X = Dataset[feats]
    # An explicit name -> id mapping yields an integer Series directly.
    # Series.replace(list, range) on string labels only produced integers via
    # pandas' silent object downcasting, which is deprecated.
    class_to_id = {name: idx for idx, name in enumerate(classes)}
    y = Dataset['label'].map(class_to_id)
    return X, y

def analyze_models(classes, model_type, n_trees, X_train, y_train, X_test, y_test, max_leaf, filename_out):
    """Score every (tree count, top-k feature subset) combination and log it.

    For each tree count, the features are ranked by importance, then one
    model is trained per leading subset of that ranking. Each result is
    written to ``filename_out`` as a ``;``-separated row:
    ``tree;n_feat;macro;weighted;feats``.

    ``model_type`` ``'RF'`` iterates over ``n_trees``. ``'DT'`` ignores
    ``n_trees`` and records a single sweep with a tree count of ``0``. Any
    other value writes only the header line. Always returns an empty list.
    """
    if model_type == 'RF':
        tree_counts = n_trees
    elif model_type == 'DT':
        # A single tree has no forest size; 0 is the placeholder written to the file
        tree_counts = [0]
    else:
        tree_counts = []

    with open(filename_out, "w") as res_file:
        print('tree;n_feat;macro;weighted;feats', file=res_file)
        for n_tree in tree_counts:
            # Feature order to use for this tree count
            importance = get_feature_importance(n_tree, max_leaf, X_train, y_train, model_type)
            for feats in get_fewest_features(importance):
                # Only the two F1 aggregates of the scored model are recorded
                _, _, macro_f1, weight_f1, _ = get_scores(
                    classes, n_tree, feats, max_leaf, X_train, y_train, X_test, y_test, model_type
                )
                row = (n_tree, len(feats), macro_f1, weight_f1, list(feats))
                print(';'.join(str(cell) for cell in row), file=res_file)
    print("Analysis Complete. Check output file.")
    return []

#### Load cleaned data

In [3]:
# Load the labeled train and test splits from the pre_processing directory.
# (The number of rows per label is printed by the cells that follow, not here.)
df_train = pd.read_csv("../pre_processing/NIDD_5G_Train_Full.csv")
df_test = pd.read_csv("../pre_processing/NIDD_5G_Test_Full.csv")

In [4]:
print(df_train['label'].value_counts())

label
Benign            2958689
HTTPFlood          675106
UDPFlood           550471
SlowrateDoS        463246
SYNFlood            19820
TCPConnectScan      15297
SYNScan             15184
UDPScan             12037
ICMPFlood            1703
Name: count, dtype: int64


In [5]:
print(df_test['label'].value_counts())

label
Benign            986820
HTTPFlood         225279
UDPFlood          183289
SlowrateDoS       153989
SYNFlood            6638
TCPConnectScan      5076
SYNScan             4968
UDPScan             3912
ICMPFlood            606
Name: count, dtype: int64


In [6]:
print(df_train['label'].unique())   

['Benign' 'SlowrateDoS' 'SYNFlood' 'UDPScan' 'ICMPFlood' 'TCPConnectScan'
 'HTTPFlood' 'SYNScan' 'UDPFlood']


In [7]:
# Remove rows with srcport or dstport = 0 and label = Benign.
# Written as the rows to keep: anything that is not Benign, plus Benign rows
# whose source and destination ports are both non-zero.
df_train = df_train[(df_train['label']!='Benign') | ((df_train["srcport"]!=0) & (df_train["dstport"]!=0))]
df_test = df_test[(df_test['label']!='Benign') | ((df_test["srcport"]!=0) & (df_test["dstport"]!=0))]

In [8]:
print(df_train['label'].value_counts())

label
Benign            2497427
HTTPFlood          675106
UDPFlood           550471
SlowrateDoS        463246
SYNFlood            19820
TCPConnectScan      15297
SYNScan             15184
UDPScan             12037
ICMPFlood            1703
Name: count, dtype: int64


In [9]:
print(df_test['label'].value_counts())

label
Benign            833295
HTTPFlood         225279
UDPFlood          183289
SlowrateDoS       153989
SYNFlood            6638
TCPConnectScan      5076
SYNScan             4968
UDPScan             3912
ICMPFlood            606
Name: count, dtype: int64


In [10]:
# We use all 9 classes for our analysis.
# The position of each name in this list is the integer id that the label is
# replaced with when the labels are numerized in the next cell.
classes = ['Benign','SlowrateDoS','SYNFlood','UDPScan','ICMPFlood',
           'TCPConnectScan','HTTPFlood','SYNScan','UDPFlood']

# select data with known labels
df_train = df_train[df_train['label'].isin(classes)]
df_test  = df_test[df_test['label'].isin(classes)]

# Normalize the tcp.hdr_len as it is a 4 bit number in the switch
# (presumably converts bytes to 32-bit words, the unit of the TCP data-offset field; TODO confirm).
# NOTE: the column is rescaled in place on frames that are rebound from themselves,
# so re-running this cell without reloading the CSVs divides by 4 again.
df_train['tcp.hdr_len'] = df_train['tcp.hdr_len']/4
df_test['tcp.hdr_len'] = df_test['tcp.hdr_len']/4

# features for feature and model selection process
features = ['ip.len', 'tcp.flags.syn', 'tcp.flags.ack', 'tcp.flags.push',
       'tcp.flags.fin', 'tcp.flags.reset', 'ip.proto', 'ip.ttl', 'srcport',
       'dstport', 'tcp.window_size_value', 'tcp.hdr_len', 'udp.length']

In [11]:
# We separate the data into features and labels and then numerize the labels
X_train, X_test, y_train, y_test = df_train[features], df_test[features], df_train['label'], df_test['label']

# Each class name becomes its position in `classes`. An explicit mapping yields
# an integer Series directly; Series.replace(list, range) on string labels only
# got there through pandas' silent object downcasting, which is deprecated.
# Every label is guaranteed to be in `classes` by the isin() filter applied earlier.
class_to_id = {name: idx for idx, name in enumerate(classes)}
y_train = y_train.map(class_to_id)
y_test = y_test.map(class_to_id)

In [12]:
y_train.value_counts()

label
0    2497427
6     675106
8     550471
1     463246
2      19820
5      15297
7      15184
3      12037
4       1703
Name: count, dtype: int64

In [13]:
y_test.value_counts()

label
0    833295
6    225279
8    183289
1    153989
2      6638
5      5076
7      4968
3      3912
4       606
Name: count, dtype: int64

#### Analyze the features and model combinations for both DT and RF

In [68]:
# Decision-tree sweep: one result row per top-k feature subset, written to the CSV named below.
# The n_trees argument (range(1,5,1)) is not used by the "DT" branch of analyze_models.
# Missing feature values are replaced with 0 and the tree is capped at 500 leaf nodes.
analyze_models(classes, "DT", range(1,5,1), X_train.fillna(0), y_train, X_test.fillna(0), y_test, 500, "PKT_models_5G_NIDD_DT_Full.csv")

Analysis Complete. Check output file.


[]

In [69]:
# Random-forest sweep over 1, 3, 5 and 7 trees (range(1,8,2)), scoring every top-k
# feature subset for each forest size and writing the rows to the CSV named below.
# Missing feature values are replaced with 0 and each tree is capped at 500 leaf nodes.
analyze_models(classes, "RF", range(1,8,2), X_train.fillna(0), y_train, X_test.fillna(0), y_test, 500, "PKT_models_5G_NIDD_RF_Full.csv")

Analysis Complete. Check output file.


[]

In [70]:
# Load both sweep outputs and rank the configurations: best macro F1 first,
# ties broken by weighted F1.
results_dt, results_rf = (
    pd.read_csv(results_path, delimiter=";").sort_values(by=['macro','weighted'],ascending=False)
    for results_path in ("PKT_models_5G_NIDD_DT_Full.csv", "PKT_models_5G_NIDD_RF_Full.csv")
)

In [71]:
results_dt.head(10)

Unnamed: 0,tree,n_feat,macro,weighted,feats
9,0,10,0.980802,0.979794,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
10,0,11,0.980802,0.979794,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
11,0,12,0.980802,0.979794,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
12,0,13,0.980802,0.979794,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
8,0,9,0.97916,0.97782,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
7,0,8,0.978888,0.977818,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
6,0,7,0.977391,0.9763,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
5,0,6,0.933999,0.975161,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
4,0,5,0.933955,0.975159,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."
3,0,4,0.933839,0.975024,"['ip.ttl', 'dstport', 'tcp.window_size_value',..."


In [72]:
results_rf.head(10)

Unnamed: 0,tree,n_feat,macro,weighted,feats
12,1,13,0.980581,0.979649,"['tcp.hdr_len', 'srcport', 'dstport', 'ip.len'..."
10,1,11,0.980467,0.979639,"['tcp.hdr_len', 'srcport', 'dstport', 'ip.len'..."
9,1,10,0.979795,0.979098,"['tcp.hdr_len', 'srcport', 'dstport', 'ip.len'..."
50,7,12,0.979792,0.979079,"['ip.ttl', 'ip.proto', 'ip.len', 'tcp.window_s..."
25,3,13,0.9797,0.979471,"['ip.ttl', 'ip.proto', 'udp.length', 'srcport'..."
51,7,13,0.979527,0.978787,"['ip.ttl', 'ip.proto', 'ip.len', 'tcp.window_s..."
37,5,12,0.979198,0.978614,"['ip.ttl', 'ip.proto', 'udp.length', 'dstport'..."
38,5,13,0.979137,0.978508,"['ip.ttl', 'ip.proto', 'udp.length', 'dstport'..."
49,7,11,0.978738,0.977566,"['ip.ttl', 'ip.proto', 'ip.len', 'tcp.window_s..."
24,3,12,0.978494,0.978048,"['ip.ttl', 'ip.proto', 'udp.length', 'srcport'..."


#### Train and save final selected RF model

In [14]:
# From analysis, selected model is best RF with 3 trees, 9 features
# NOTE(review): the sweep in get_scores builds its forests with bootstrap=False, while
# this constructor leaves `bootstrap` unset (scikit-learn's default is presumably True; confirm),
# so the saved model may not be configured identically to the one that was scored.
rf = RandomForestClassifier(random_state=42, n_estimators=3, max_leaf_nodes=500)

# The nine features the final forest is trained on
selected_features = ['tcp.hdr_len', 'ip.ttl', 'tcp.window_size_value', 'ip.len', 'dstport', 
                     'srcport', 'tcp.flags.push', 'tcp.flags.fin', 'tcp.flags.reset']
# Train the model (missing feature values are replaced with 0, as in the sweep)
rf.fit(X_train[selected_features].fillna(0), y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test[selected_features].fillna(0))

# Evaluate the model: per-class precision / recall / F1 / support as a nested dict
report = classification_report(y_test, y_pred,target_names=classes, output_dict=True)

print(report)

{'Benign': {'precision': 1.0, 'recall': 0.9998151914988089, 'f1-score': 0.9999075872100699, 'support': 833295.0}, 'SlowrateDoS': {'precision': 0.9371333012102832, 'recall': 0.8588405665339732, 'f1-score': 0.896280407843775, 'support': 153989.0}, 'SYNFlood': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6638.0}, 'UDPScan': {'precision': 0.9987199180747568, 'recall': 0.9971881390593047, 'f1-score': 0.9979534407776925, 'support': 3912.0}, 'ICMPFlood': {'precision': 0.9821717990275527, 'recall': 1.0, 'f1-score': 0.9910057236304171, 'support': 606.0}, 'TCPConnectScan': {'precision': 0.9992047713717693, 'recall': 0.9901497241922774, 'f1-score': 0.9946566396200277, 'support': 5076.0}, 'HTTPFlood': {'precision': 0.9082776677520057, 'recall': 0.9608884982621549, 'f1-score': 0.9338426727810409, 'support': 225279.0}, 'SYNScan': {'precision': 0.9957848253713368, 'recall': 0.9985909822866345, 'f1-score': 0.9971859296482412, 'support': 4968.0}, 'UDPFlood': {'precision': 1.0, 'recall'

In [16]:
# we check the number of bits that the code words will have in P4
# node_count - n_leaves is the number of internal (split) nodes of each tree in the forest
for tree in rf.estimators_:
    nbitsp4 = tree.tree_.node_count - tree.tree_.n_leaves
    print("Number of bits in P4: ", nbitsp4)

Number of bits in P4:  499
Number of bits in P4:  499
Number of bits in P4:  499


In [17]:
# save the model for use in p4
save_model(rf, "rf_model_NIDD_Full.pkl")

In [18]:
for tree in rf.estimators_:
    depth = tree.tree_.max_depth
    print("Depth of the tree:", depth)

Depth of the tree: 35
Depth of the tree: 32
Depth of the tree: 36


In [19]:
def calculate_scores_fpr_tpr(y_true, y_pred, classes,labels):
    """Compute one-vs-rest F1, TPR, FPR, TNR and FNR for every class.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed to ``confusion_matrix``.
    classes : sequence
        Only its length is used: the expected number of classes.
    labels : sequence of str
        Row names for the per-class rows of the returned table, in the same
        order as the rows of the confusion matrix (scikit-learn orders them
        by sorted label value when no explicit label list is given).

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        The confusion matrix, and a table with one row per class followed by
        ``'Macro Avg'`` and ``'Weighted Avg'`` (weighted by each class's
        share of the true samples). Values are fractions, not percentages.

    Raises
    ------
    ValueError
        If the confusion matrix size, ``classes`` and ``labels`` disagree,
        e.g. because a class occurs in neither ``y_true`` nor ``y_pred``.
    """
    cm = confusion_matrix(y_true, y_pred)

    # list() lets callers pass a tuple, Index or array without breaking the
    # concatenation with the two summary row names below.
    row_names = list(labels)
    n_classes = len(classes)
    if cm.shape[0] != n_classes or len(row_names) != n_classes:
        raise ValueError(
            "confusion matrix has %d classes, but %d classes and %d labels were given"
            % (cm.shape[0], n_classes, len(row_names))
        )

    def _safe_ratio(numerator, denominator):
        # Element-wise numerator / denominator, 0.0 wherever the denominator is 0
        result = np.zeros(len(denominator), dtype=float)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    # One-vs-rest counts for all classes at once
    TP = np.diag(cm)
    FN = cm.sum(axis=1) - TP
    FP = cm.sum(axis=0) - TP
    TN = cm.sum() - TP - FP - FN

    # F1 is now guarded like the four rates; it used to be the only ratio
    # that divided without checking for a zero denominator.
    f1_score = _safe_ratio(2 * TP, 2 * TP + FP + FN)
    TPR = _safe_ratio(TP, TP + FN)
    FPR = _safe_ratio(FP, FP + TN)
    TNR = _safe_ratio(TN, TN + FP)
    FNR = _safe_ratio(FN, FN + TP)

    metrics_table = np.column_stack([f1_score, TPR, FPR, TNR, FNR])
    macro_avg = np.mean(metrics_table, axis=0)

    # Weight every class by its share of the true samples (row sums of cm)
    weighted_avg = np.average(metrics_table, axis=0, weights=np.sum(cm, axis=1)/np.sum(cm))

    final_met_table = pd.DataFrame(np.vstack((metrics_table, macro_avg, weighted_avg)), columns=['F1 Score', 'TPR', 'FPR', 'TNR', 'FNR'], index=row_names + ['Macro Avg', 'Weighted Avg'])

    return cm, final_met_table

In [20]:
# Row labels for the metrics tables. Both lists hold the same names, in the
# same order, as `classes`.
# NOTE(review): `classes_short` appears unused in this notebook; confirm before removing.
classes_long = ['Benign','SlowrateDoS','SYNFlood','UDPScan','ICMPFlood',
                'TCPConnectScan','HTTPFlood','SYNScan','UDPFlood']
classes_short = ['Benign','SlowrateDoS','SYNFlood','UDPScan','ICMPFlood',
                 'TCPConnectScan','HTTPFlood','SYNScan','UDPFlood']

In [21]:
cm, final_met_table = calculate_scores_fpr_tpr(y_test, y_pred, classes,classes_long)

In [22]:
final_met_table*100

Unnamed: 0,F1 Score,TPR,FPR,TNR,FNR
Benign,99.990759,99.981519,0.0,100.0,0.018481
SlowrateDoS,89.628041,85.884057,0.702419,99.297581,14.115943
SYNFlood,100.0,100.0,0.0,100.0,0.0
UDPScan,99.795344,99.718814,0.000354,99.999646,0.281186
ICMPFlood,99.100572,100.0,0.000777,99.999223,0.0
TCPConnectScan,99.465664,99.014972,0.000283,99.999717,0.985028
HTTPFlood,93.384267,96.08885,1.834242,98.165758,3.91115
SYNScan,99.718593,99.859098,0.001487,99.998513,0.140902
UDPFlood,99.999182,99.998363,0.0,100.0,0.001637
Macro Avg,97.898047,97.838408,0.282174,99.717826,2.161592


In [None]:
final_met_table.to_csv("final_met_table_rf.csv")

#### Train and save final selected DT model

In [23]:
# From analysis, selected model is best DT with 10 features
dt = DecisionTreeClassifier(random_state=42, max_leaf_nodes=500)

# NOTE: `selected_features`, `y_pred` and `report` are rebound in this cell and
# from here on refer to the decision tree, not the random forest trained above.
selected_features = ['ip.ttl', 'dstport', 'tcp.window_size_value', 
                     'ip.len', 'tcp.flags.push', 'tcp.hdr_len', 
                     'srcport', 'tcp.flags.reset', 'udp.length', 'tcp.flags.fin']
# Train the model (missing feature values are replaced with 0, as in the sweep)
dt.fit(X_train[selected_features].fillna(0), y_train)

# Make predictions on the test set
y_pred = dt.predict(X_test[selected_features].fillna(0))

# Evaluate the model: per-class precision / recall / F1 / support as a nested dict
report = classification_report(y_test, y_pred,target_names=classes, output_dict=True)

print(report)

{'Benign': {'precision': 0.9999963992942617, 'recall': 0.999846392934075, 'f1-score': 0.999921390488249, 'support': 833295.0}, 'SlowrateDoS': {'precision': 0.9412194128960706, 'recall': 0.8707634960938768, 'f1-score': 0.9046216744082497, 'support': 153989.0}, 'SYNFlood': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6638.0}, 'UDPScan': {'precision': 0.9987199180747568, 'recall': 0.9971881390593047, 'f1-score': 0.9979534407776925, 'support': 3912.0}, 'ICMPFlood': {'precision': 0.9885807504078303, 'recall': 1.0, 'f1-score': 0.9942575881870386, 'support': 606.0}, 'TCPConnectScan': {'precision': 0.9994033412887828, 'recall': 0.9899527186761229, 'f1-score': 0.9946555819477434, 'support': 5076.0}, 'HTTPFlood': {'precision': 0.9154836259284267, 'recall': 0.9629526054359261, 'f1-score': 0.938618333805672, 'support': 225279.0}, 'SYNScan': {'precision': 0.9953870838347373, 'recall': 0.9989935587761675, 'f1-score': 0.9971870604781997, 'support': 4968.0}, 'UDPFlood': {'precision': 

In [24]:
# we check the number of bits that the code words will have in P4
nbitsp4 = dt.tree_.node_count - dt.tree_.n_leaves
print("Number of bits in P4: ", nbitsp4)

Number of bits in P4:  499


In [25]:
# save the model for use in p4
save_model(dt, "DT_model_NIDD_9_classes.sav")

In [26]:
from joblib import dump

dump(dt, 'DT_model_NIDD_9_classes.joblib')

['DT_model_NIDD_9_classes.joblib']

In [27]:
# Read the pickled decision tree back from disk (presumably a sanity check of the saved file).
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# NOTE(review): `clf` is not referenced by any later cell shown in this notebook.
with open('DT_model_NIDD_9_classes.sav', 'rb') as f:
    clf = pickle.load(f)

In [29]:
depth = dt.tree_.max_depth
print("Depth of the tree:", depth)

Depth of the tree: 37


In [30]:
# Per-class metrics for the decision tree (`y_pred` holds the DT predictions at this point).
# NOTE(review): the confusion matrix is stored as `cm_rf` although it belongs to the DT;
# consider renaming it to `cm_dt` once no other cell depends on the current name.
cm_rf, final_met_table_dt = calculate_scores_fpr_tpr(y_test, y_pred, classes,classes_long)

In [31]:
final_met_table_dt*100

Unnamed: 0,F1 Score,TPR,FPR,TNR,FNR
Benign,99.992139,99.984639,0.000514,99.999486,0.015361
SlowrateDoS,90.462167,87.07635,0.662991,99.337009,12.92365
SYNFlood,100.0,100.0,0.0,100.0,0.0
UDPScan,99.795344,99.718814,0.000354,99.999646,0.281186
ICMPFlood,99.425759,100.0,0.000494,99.999506,0.0
TCPConnectScan,99.465558,98.995272,0.000212,99.999788,1.004728
HTTPFlood,93.861833,96.295261,1.680437,98.319563,3.704739
SYNScan,99.718706,99.899356,0.001629,99.998371,0.100644
UDPFlood,100.0,100.0,0.0,100.0,0.0
Macro Avg,98.080167,97.996632,0.260737,99.739263,2.003368


In [32]:
final_met_table_dt.to_csv("final_met_table_dt_full.csv")