In [2]:
import requests

import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import learning_curve

In [3]:
# load dataset
data = pd.read_csv(r'/home/patrick/Github/Mosaic_DDOS_Datasets/train_mosaic.csv')
x_cols = [c for c in data.columns if c != 'Label']
# set input matrix and target column
X = data[x_cols]
y = data['Label']
# show first rows of data
data.head()

Unnamed: 0,Destination_Port,Flow_Duration,Total_Fwd_Packets,Total_Backward_Packets,Total_Length_of_Fwd_Packets,Total_Length_of_Bwd_Packets,Fwd_Packet_Length_Max,Fwd_Packet_Length_Min,Fwd_Packet_Length_Mean,Fwd_Packet_Length_Std,...,min_seg_size_forward,Active_Mean,Active_Std,Active_Max,Active_Min,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Label
0,80,101168794,20,1,969,0,353,0,48.45,119.083551,...,0,739228.5,743103.4661,1264682,213775,49700000.0,41400000.0,79000000,20500000,DoS
1,60711,58,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,53,31146,4,2,148,244,37,37,37.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,80,254704,3,4,429,389,423,0,143.0,242.50567,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,443,11932077,12,16,5030,15703,1525,0,419.166667,644.896586,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Data Preprocessing/Cleaning

In [4]:
data.shape

(809361, 78)

In [5]:
original_features=list(data.columns)
len(original_features)

78

In [6]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

0

In [7]:
len(data)

809361

# Encoding the dataset.

In [8]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['Label']

In [9]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['Fwd_Packet_Length_Max',
 'Subflow_Fwd_Bytes',
 'Fwd_Avg_Bytes_Bulk',
 'Flow_IAT_Min',
 'URG_Flag_Count',
 'min_seg_size_forward',
 'Flow_IAT_Mean',
 'Fwd_Avg_Packets_Bulk',
 'Idle_Min',
 'SYN_Flag_Count',
 'Subflow_Fwd_Packets',
 'Max_Packet_Length',
 'Total_Length_of_Bwd_Packets',
 'Fwd_IAT_Total',
 'Active_Max',
 'Fwd_Packets_Sec',
 'ECE_Flag_Count',
 'PSH_Flag_Count',
 'Bwd_Packet_Length_Std',
 'Total_Length_of_Fwd_Packets',
 'Init_Win_bytes_backward',
 'Fwd_IAT_Min',
 'Destination_Port',
 'Min_Packet_Length',
 'Bwd_Avg_Bytes_Bulk',
 'Bwd_Header_Length',
 'Active_Mean',
 'Idle_Max',
 'Bwd_Packet_Length_Mean',
 'Flow_IAT_Max',
 'Down_Up_Ratio',
 'Active_Min',
 'Bwd_Packets_Sec',
 'Avg_Bwd_Segment_Size',
 'act_data_pkt_fwd',
 'Bwd_Packet_Length_Max',
 'Subflow_Bwd_Bytes',
 'Idle_Std',
 'Bwd_IAT_Min',
 'Bwd_Avg_Packets_Bulk',
 'Flow_Bytes_Sec',
 'Fwd_PSH_Flags',
 'Packet_Length_Std',
 'Fwd_Header_Length',
 'Bwd_IAT_Mean',
 'Fwd_URG_Flags',
 'Fwd_IAT_Std',
 'Average_Packet_Size',
 'Fl

In [10]:
encoders = {}
for column in ['Label']:
    categorical_convert = LabelEncoder()

In [11]:
nominal=['Label']
ordinal=list(set(categorical_features)-set(nominal))

In [12]:
df_nominal=pd.get_dummies(data[nominal])

In [13]:
target=['Label']

In [14]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [15]:
data[numerical]

Unnamed: 0,Fwd_Packet_Length_Max,Subflow_Fwd_Bytes,Fwd_Avg_Bytes_Bulk,Flow_IAT_Min,URG_Flag_Count,min_seg_size_forward,Flow_IAT_Mean,Fwd_Avg_Packets_Bulk,Idle_Min,SYN_Flag_Count,...,CWE_Flag_Count,Avg_Fwd_Segment_Size,Fwd_Packet_Length_Min,Bwd_URG_Flags,Bwd_Avg_Bulk_Rate,Total_Fwd_Packets,ACK_Flag_Count,Bwd_IAT_Max,Packet_Length_Mean,Fwd_IAT_Mean
0,353,969,0,2,0,0,5.058440e+06,0,20500000,1,...,0,48.450000,0,0,0,20,0,0,44.045455,5.323696e+06
1,0,0,0,58,1,32,5.800000e+01,0,0,0,...,0,0.000000,0,0,0,1,1,0,0.000000,0.000000e+00
2,37,148,0,1,0,20,6.229200e+03,0,0,0,...,0,37.000000,37,0,0,4,0,1,61.285714,1.006800e+04
3,423,429,0,4,0,20,4.245067e+04,0,0,0,...,0,143.000000,0,0,0,3,0,130438,102.250000,6.524300e+04
4,1525,5030,0,2,0,20,4.419288e+05,0,0,0,...,0,419.166667,0,0,0,12,0,11049530,714.931035,8.025336e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809356,168,233,0,4,0,0,6.815688e+06,0,19048836,1,...,0,23.300000,0,0,0,10,0,0,19.416667,7.572829e+06
809357,344,344,0,1,0,0,2.480114e+04,0,0,1,...,0,49.142857,0,0,0,7,0,0,38.222222,2.830983e+04
809358,168,210,0,3,0,0,8.846251e+06,0,22243737,1,...,0,30.000000,0,0,0,7,0,0,23.333333,1.032050e+07
809359,44,88,0,4,0,20,2.480000e+02,0,0,0,...,0,44.000000,44,0,0,2,0,4,66.400000,4.800000e+01


In [32]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)
new_data.shape



(809361, 79)

# Standardize the dataset.

In [33]:
# set input matrix and target column
x_cols = [c for c in data.columns if c != 'Label']
x_cols2= [c for c in new_data.columns]
X = data[x_cols]
print(X)
y = data['Label']

        Destination_Port  Flow_Duration  Total_Fwd_Packets  \
0                     80      101168794                 20   
1                  60711             58                  1   
2                     53          31146                  4   
3                     80         254704                  3   
4                    443       11932077                 12   
...                  ...            ...                ...   
809356                80       68156881                 10   
809357                80         173608                  7   
809358                80       61923754                  7   
809359                53            744                  2   
809360                80         196947                 10   

        Total_Backward_Packets  Total_Length_of_Fwd_Packets  \
0                            1                          969   
1                            1                            0   
2                            2                          148   
3  

In [18]:
X=new_data.to_numpy()

In [19]:
X[0]

array([ 0.00000000e+00,  1.00000000e+00,  3.53000000e+02,  9.69000000e+02,
        0.00000000e+00,  2.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        5.05843970e+06,  0.00000000e+00,  2.05000000e+07,  1.00000000e+00,
        2.00000000e+01,  3.53000000e+02,  0.00000000e+00,  1.01000000e+08,
        1.26468200e+06,  1.97689418e-01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  9.69000000e+02,  2.92000000e+04,  2.00000000e+00,
        8.00000000e+01,  0.00000000e+00,  0.00000000e+00,  4.00000000e+01,
        7.39228500e+05,  7.90000000e+07,  0.00000000e+00,  7.90000000e+07,
        0.00000000e+00,  2.13775000e+05,  9.88447100e-03,  0.00000000e+00,
        3.00000000e+00,  0.00000000e+00,  0.00000000e+00,  4.14000000e+07,
        0.00000000e+00,  0.00000000e+00,  9.57805230e+00,  0.00000000e+00,
        1.14164659e+02,  7.28000000e+02,  0.00000000e+00,  0.00000000e+00,
        1.84000000e+07,  4.61428571e+01,  1.01168794e+08,  4.84500000e+01,
        1.80000000e+07,  

In [20]:
X=StandardScaler().fit_transform(X)

In [21]:
X[0]

array([-9.19101676e-01,  9.19101676e-01,  3.92497130e-01,  1.32981740e-01,
        0.00000000e+00, -5.84993856e-02, -2.40910824e-01,  4.55176286e-03,
        4.89100470e-01,  0.00000000e+00,  3.68190006e-01,  1.08238486e+00,
        2.01311408e-02,  1.01609589e-02, -4.33692669e-03,  2.20626751e+00,
        6.12413133e-01, -1.79938277e-01, -8.09246851e-03, -3.61911677e-01,
       -3.03892094e-01,  1.32981740e-01,  1.01857280e+00, -1.03343892e-01,
       -3.13659533e-01, -4.59960676e-01,  0.00000000e+00,  3.73063922e-03,
        4.66782295e-01,  2.40334688e+00, -3.70547215e-01,  2.39292378e+00,
       -6.39163643e-01,  2.68840421e-02, -1.12373622e-01, -3.70547215e-01,
       -2.54114316e-03, -3.27908244e-01, -4.33692669e-03,  5.54507241e+00,
       -8.90209854e-02,  0.00000000e+00, -6.48067346e-01, -1.73935450e-01,
        1.49630887e-02,  3.11894084e-03, -1.25612557e-01,  0.00000000e+00,
        1.59825624e+00, -2.49174756e-01,  2.14160569e+00, -6.09452103e-03,
        1.42710518e+00,  

In [22]:
y=data[target]

In [23]:
y=y.to_numpy()

In [24]:
y

array([['DoS'],
       ['BENIGN'],
       ['BENIGN'],
       ...,
       ['DoS'],
       ['BENIGN'],
       ['DoS']], dtype=object)

In [25]:
y=categorical_convert.fit_transform(y)
encoders[column] = categorical_convert

  return f(*args, **kwargs)


In [26]:
y

array([1, 0, 0, ..., 1, 0, 1])

# Feature engineering

In [27]:
pca=PCA(n_components=2)

In [28]:
p_components=pca.fit_transform(X)

In [29]:
p_components

array([[-6.2991014 ,  1.87856418],
       [ 2.15924083, -3.300693  ],
       [ 2.48911825, -1.63410258],
       ...,
       [-4.46395958,  0.38074132],
       [ 3.20123471, -1.67298788],
       [-0.08797932, -1.49656159]])

In [30]:
pca.explained_variance_ratio_

array([0.17348344, 0.15744644])

In [31]:
# Split the dataset into a training set and a validation set.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234)

In [39]:
for i in range(100):
    input_data = dict(pd.DataFrame(X_test).iloc[i])
    target = pd.DataFrame(y_test).iloc[i]
    r = requests.post("http://127.0.0.1:8000/api/v1/ddos_classifier/predict?status=ab_testing", input_data)
    response = r.json()
    print("Response")
    print(response)
    # provide feedback
    requests.put("http://127.0.0.1:8000/api/v1/mlrequests/{}".format(response["request_id"]), {"feedback": target})

Response
{'probability': 0.71, 'label': 'DDOS', 'status': 'OK', 'request_id': 1}
Response
{'probability': 0.01, 'label': 'BENIGN', 'status': 'OK', 'request_id': 2}
Response
{'probability': 0.87, 'label': 'DDOS', 'status': 'OK', 'request_id': 3}
Response
{'probability': 0.68, 'label': 'DDOS', 'status': 'OK', 'request_id': 4}
Response
{'probability': 0.06, 'label': 'BENIGN', 'status': 'OK', 'request_id': 5}
Response
{'probability': 0.9, 'label': 'DDOS', 'status': 'OK', 'request_id': 6}
Response
{'probability': 0.88, 'label': 'DDOS', 'status': 'OK', 'request_id': 7}
Response
{'probability': 0.74, 'label': 'DDOS', 'status': 'OK', 'request_id': 8}
Response
{'probability': 0.0, 'label': 'BENIGN', 'status': 'OK', 'request_id': 9}
Response
{'probability': 0.0, 'label': 'BENIGN', 'status': 'OK', 'request_id': 10}
Response
{'probability': 0.0, 'label': 'BENIGN', 'status': 'OK', 'request_id': 11}
Response
{'probability': 0.86, 'label': 'DDOS', 'status': 'OK', 'request_id': 12}
Response
{'probabil