In [1]:
import requests

import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [None]:
from sklearn.model_selection import learning_curve

In [2]:
# load dataset
data = pd.read_csv(r'/home/patrick/Github/Mosaic_DDOS_Datasets/train_mosaic.csv')
x_cols = [c for c in data.columns if c != 'Label']
# set input matrix and target column
X = data[x_cols]
y = data['Label']
# show first rows of data
data.head()

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0,739228.5,743103.4661,1264682,213775,49700000.0,41400000.0,79000000,20500000,DoS
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,443,11932077,12,16,5030,15703,1525,0,419.166667,644.896586,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Data Preprocessing/Cleaning

In [3]:
data.shape

(809361, 78)

In [4]:
original_features=list(data.columns)
len(original_features)

78

In [5]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

0

In [6]:
len(data)

809361

# Encoding the dataset.

In [7]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['Label']

In [8]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['Bwd_Packets_Sec',
 'Avg_Fwd_Segment_Size',
 'Bwd_Packet_Length_Mean',
 'SYN_Flag_Count',
 'Fwd_Header_Length',
 'Active_Min',
 'Subflow_Fwd_Bytes',
 'Fwd_IAT_Std',
 'Fwd_PSH_Flags',
 'Fwd_Avg_Packets_Bulk',
 'Bwd_Header_Length',
 'ACK_Flag_Count',
 'Idle_Mean',
 'Flow_Bytes_Sec',
 'Fwd_IAT_Total',
 'Fwd_IAT_Mean',
 'FIN_Flag_Count',
 'Bwd_IAT_Mean',
 'Fwd_Avg_Bulk_Rate',
 'Total_Length_of_Bwd_Packets',
 'Active_Max',
 'Active_Std',
 'Flow_IAT_Std',
 'Max_Packet_Length',
 'Fwd_Packet_Length_Std',
 'Subflow_Bwd_Bytes',
 'Flow_IAT_Min',
 'Fwd_Packet_Length_Mean',
 'Fwd_IAT_Max',
 'Avg_Bwd_Segment_Size',
 'Packet_Length_Std',
 'ECE_Flag_Count',
 'Fwd_URG_Flags',
 'Idle_Std',
 'Bwd_Avg_Bulk_Rate',
 'Idle_Max',
 'Bwd_IAT_Std',
 'Fwd_Packet_Length_Max',
 'Min_Packet_Length',
 'Destination_Port',
 'Idle_Min',
 'Total_Fwd_Packets',
 'Bwd_Avg_Packets_Bulk',
 'Average_Packet_Size',
 'Bwd_IAT_Min',
 'Fwd_IAT_Min',
 'Bwd_PSH_Flags',
 'CWE_Flag_Count',
 'Init_Win_bytes_forward',
 'Bwd_Packet_Lengt

In [9]:
encoders = {}
for column in ['Label']:
    categorical_convert = LabelEncoder()

In [10]:
nominal=['Label']
ordinal=list(set(categorical_features)-set(nominal))

In [11]:
df_nominal=pd.get_dummies(data[nominal])

In [12]:
target=['Label']

In [13]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [14]:
data[numerical]

Unnamed: 0,Bwd_Packets_Sec,Avg_Fwd_Segment_Size,Bwd_Packet_Length_Mean,SYN_Flag_Count,Fwd_Header_Length,Active_Min,Subflow_Fwd_Bytes,Fwd_IAT_Std,Fwd_PSH_Flags,Fwd_Avg_Packets_Bulk,...,Bwd_URG_Flags,Total_Backward_Packets,min_seg_size_forward,Init_Win_bytes_backward,URG_Flag_Count,Flow_Duration,act_data_pkt_fwd,Bwd_IAT_Max,Total_Length_of_Fwd_Packets,RST_Flag_Count
0,0.009884,48.450000,0.0000,1,728,213775,969,1.840000e+07,0,0,...,0,1,0,29200,0,101168794,3,0,969,0
1,17241.379310,0.000000,0.0000,0,32,0,0,0.000000e+00,0,0,...,0,1,32,33304,1,58,0,0,0,0
2,64.213703,37.000000,122.0000,0,80,0,148,1.743482e+04,0,0,...,0,2,20,-1,0,31146,3,1,148,0
3,15.704504,143.000000,97.2500,0,72,0,429,8.303272e+04,0,0,...,0,4,20,237,0,254704,2,130438,429,0
4,1.340923,419.166667,981.4375,0,252,0,5030,6.855032e+04,0,0,...,0,16,20,100,0,11932077,11,11049530,5030,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809356,0.014672,23.300000,0.0000,1,344,3026,233,1.137151e+07,0,0,...,0,1,0,29200,0,68156881,6,0,233,0
809357,5.760103,49.142857,0.0000,1,284,0,344,5.380612e+04,0,0,...,0,1,0,29200,0,173608,1,0,344,0
809358,0.016149,30.000000,0.0000,1,224,1423,210,1.575375e+07,0,0,...,0,1,0,29200,0,61923754,4,0,210,0
809359,2688.172043,44.000000,100.0000,0,40,0,88,0.000000e+00,0,0,...,0,2,20,-1,0,744,1,4,88,0


In [15]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)
new_data.shape



(809361, 79)

# Standardize the dataset.

In [16]:
# set input matrix and target column
x_cols = [c for c in data.columns if c != 'Label']
x_cols2= [c for c in new_data.columns]
X = data[x_cols]
print(X)
y = data['Label']

        Destination_Port  Flow_Duration  Total_Fwd_Packets  \
0                     80      101168794                 20   
1                  60711             58                  1   
2                     53          31146                  4   
3                     80         254704                  3   
4                    443       11932077                 12   
...                  ...            ...                ...   
809356                80       68156881                 10   
809357                80         173608                  7   
809358                80       61923754                  7   
809359                53            744                  2   
809360                80         196947                 10   

        Total_Backward_Packets  Total_Length_of_Fwd_Packets  \
0                            1                          969   
1                            1                            0   
2                            2                          148   
3  

In [17]:
X=new_data.to_numpy()

In [18]:
X[0]

array([ 0.00000000e+00,  1.00000000e+00,  9.88447100e-03,  4.84500000e+01,
        0.00000000e+00,  1.00000000e+00,  7.28000000e+02,  2.13775000e+05,
        9.69000000e+02,  1.84000000e+07,  0.00000000e+00,  0.00000000e+00,
        4.00000000e+01,  0.00000000e+00,  4.97000000e+07,  9.57805230e+00,
        1.01000000e+08,  5.32369611e+06,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.26468200e+06,  7.43103466e+05,
        1.80000000e+07,  3.53000000e+02,  1.19083551e+02,  0.00000000e+00,
        2.00000000e+00,  4.84500000e+01,  7.90000000e+07,  0.00000000e+00,
        1.14164659e+02,  0.00000000e+00,  0.00000000e+00,  4.14000000e+07,
        0.00000000e+00,  7.90000000e+07,  0.00000000e+00,  3.53000000e+02,
        0.00000000e+00,  8.00000000e+01,  2.05000000e+07,  2.00000000e+01,
        0.00000000e+00,  4.61428571e+01,  0.00000000e+00,  2.00000000e+00,
        0.00000000e+00,  0.00000000e+00, -1.00000000e+00,  0.00000000e+00,
        7.90000000e+07,  

In [19]:
X=StandardScaler().fit_transform(X)

In [20]:
X[0]

array([-9.19101676e-01,  9.19101676e-01, -1.12373622e-01, -6.09452103e-03,
       -3.70547215e-01,  1.08238486e+00,  3.11894084e-03,  2.68840421e-02,
        1.32981740e-01,  1.59825624e+00, -1.73935450e-01,  0.00000000e+00,
        3.73063922e-03, -4.25006181e-01,  1.47778855e+00, -6.48067346e-01,
        2.20626751e+00,  2.69704704e-01, -1.20779767e-01, -1.25612557e-01,
        0.00000000e+00, -4.33692669e-03,  6.12413133e-01,  9.19315893e-01,
        1.42710518e+00,  1.01609589e-02,  3.44684549e-01, -4.33692669e-03,
       -5.84993856e-02, -6.09452103e-03,  2.48921670e+00, -3.70547215e-01,
        1.49630887e-02, -8.09246851e-03,  0.00000000e+00,  5.54507241e+00,
        0.00000000e+00,  2.40334688e+00, -1.50363405e-01,  3.92497130e-01,
       -4.59960676e-01, -3.13659533e-01,  3.68190006e-01,  2.01311408e-02,
        0.00000000e+00, -2.49174756e-01, -8.90209854e-02, -1.03343892e-01,
       -4.64564156e-02,  0.00000000e+00, -3.17178725e-01, -3.27908244e-01,
        2.39292378e+00, -

In [21]:
y=data[target]

In [22]:
y=y.to_numpy()

In [23]:
y

array([['DoS'],
       ['BENIGN'],
       ['BENIGN'],
       ...,
       ['DoS'],
       ['BENIGN'],
       ['DoS']], dtype=object)

In [24]:
y=categorical_convert.fit_transform(y)
encoders[column] = categorical_convert

  return f(*args, **kwargs)


In [25]:
y

array([1, 0, 0, ..., 1, 0, 1])

# Feature engineering

In [26]:
pca=PCA(n_components=2)

In [27]:
p_components=pca.fit_transform(X)

In [28]:
p_components

array([[-6.29910143,  1.87856478],
       [ 2.15924041, -3.30069401],
       [ 2.48911828, -1.63410267],
       ...,
       [-4.46395951,  0.38074179],
       [ 3.20123472, -1.67298795],
       [-0.0879794 , -1.49656169]])

In [29]:
pca.explained_variance_ratio_

array([0.17348344, 0.15744644])

In [30]:
# Split the dataset into a training set and a validation set.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [32]:
for i in range(100):
    input_data = dict(pd.DataFrame(X_test).iloc[i])
    target = pd.DataFrame(y_test).iloc[i]
    r = requests.post("http://127.0.0.1:8000/api/v1/ddos_classifier/predict?status=ab_testing", input_data)
    response = r.json()
    print("Response")
    print(response)
    # provide feedback
    requests.put("http://127.0.0.1:8000/api/v1/mlrequests/{}".format(response["request_id"]), {"feedback": target})

Response
{'probability': 0.91, 'label': 'DDOS', 'status': 'OK', 'request_id': 104}
Response
{'probability': 0.07, 'label': 'BENIGN', 'status': 'OK', 'request_id': 105}
Response
{'probability': 0.66, 'label': 'DDOS', 'status': 'OK', 'request_id': 106}
Response
{'probability': 0.72, 'label': 'DDOS', 'status': 'OK', 'request_id': 107}
Response
{'probability': 0.06, 'label': 'BENIGN', 'status': 'OK', 'request_id': 108}
Response
{'probability': 0.9, 'label': 'DDOS', 'status': 'OK', 'request_id': 109}
Response
{'probability': 0.51, 'label': 'DDOS', 'status': 'OK', 'request_id': 110}
Response
{'probability': 0.52, 'label': 'DDOS', 'status': 'OK', 'request_id': 111}
Response
{'probability': 0.08, 'label': 'BENIGN', 'status': 'OK', 'request_id': 112}
Response
{'probability': 0.16, 'label': 'BENIGN', 'status': 'OK', 'request_id': 113}
Response
{'probability': 0.07, 'label': 'BENIGN', 'status': 'OK', 'request_id': 114}
Response
{'probability': 0.66, 'label': 'DDOS', 'status': 'OK', 'request_id': 