In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
from sklearn import svm
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
dtypes = {
    'Src IP': 'category',
    'Src Port': 'uint16',
    'Dst IP': 'category',
    'Dst Port': 'uint16',
    'Protocol': 'category',
    'Flow Duration': 'uint32',
    'Tot Fwd Pkts': 'uint32',
    'Tot Bwd Pkts': 'uint32',
    'TotLen Fwd Pkts': 'float32',
    'TotLen Bwd Pkts': 'float32',
    'Fwd Pkt Len Max': 'float32',
    'Fwd Pkt Len Min': 'float32',
    'Fwd Pkt Len Mean': 'float32',
    'Fwd Pkt Len Std': 'float32',
    'Bwd Pkt Len Max': 'float32',
    'Bwd Pkt Len Min': 'float32',
    'Bwd Pkt Len Mean': 'float32',
    'Bwd Pkt Len Std': 'float32',
    'Flow Byts/s': 'float32',
    'Flow Pkts/s': 'float32',
    'Flow IAT Mean': 'float32',
    'Flow IAT Std': 'float32',
    'Flow IAT Max': 'float32',
    'Flow IAT Min': 'float32',
    'Fwd IAT Tot': 'float32',
    'Fwd IAT Mean': 'float32',
    'Fwd IAT Std': 'float32',
    'Fwd IAT Max': 'float32',
    'Fwd IAT Min': 'float32',
    'Bwd IAT Tot': 'float32',
    'Bwd IAT Mean': 'float32',
    'Bwd IAT Std': 'float32',
    'Bwd IAT Max': 'float32',
    'Bwd IAT Min': 'float32',
    'Fwd PSH Flags': 'category',
    'Bwd PSH Flags': 'category',
    'Fwd URG Flags': 'category',
    'Bwd URG Flags': 'category',
    'Fwd Header Len': 'uint32',
    'Bwd Header Len': 'uint32',
    'Fwd Pkts/s': 'float32',
    'Bwd Pkts/s': 'float32',
    'Pkt Len Min': 'float32',
    'Pkt Len Max': 'float32',
    'Pkt Len Mean': 'float32',
    'Pkt Len Std': 'float32',
    'Pkt Len Var': 'float32',
    'FIN Flag Cnt': 'category',
    'SYN Flag Cnt': 'category',
    'RST Flag Cnt': 'category',
    'PSH Flag Cnt': 'category',
    'ACK Flag Cnt': 'category',
    'URG Flag Cnt': 'category',
    'CWE Flag Count': 'category',
    'ECE Flag Cnt': 'category',
    'Down/Up Ratio': 'float32',
    'Pkt Size Avg': 'float32',
    'Fwd Seg Size Avg': 'float32',
    'Bwd Seg Size Avg': 'float32',
    'Fwd Byts/b Avg': 'uint32',
    'Fwd Pkts/b Avg': 'uint32',
    'Fwd Blk Rate Avg': 'uint32',
    'Bwd Byts/b Avg': 'uint32',
    'Bwd Pkts/b Avg': 'uint32',
    'Bwd Blk Rate Avg': 'uint32',
    'Subflow Fwd Pkts': 'uint32',
    'Subflow Fwd Byts': 'uint32',
    'Subflow Bwd Pkts': 'uint32',
    'Subflow Bwd Byts': 'uint32',
    'Init Fwd Win Byts': 'uint32',
    'Init Bwd Win Byts': 'uint32',
    'Fwd Act Data Pkts': 'uint32',
    'Fwd Seg Size Min': 'uint32',
    'Active Mean': 'float32',
    'Active Std': 'float32',
    'Active Max': 'float32',
    'Active Min': 'float32',
    'Idle Mean': 'float32',
    'Idle Std': 'float32',
    'Idle Max': 'float32',
    'Idle Min': 'float32',
    'Label': 'category'
}

In [4]:
df = pd.read_csv(
    '../dataset/final_dataset.csv',
    dtype=dtypes,
    parse_dates=['Timestamp'],
    usecols=[*dtypes.keys(), 'Timestamp'],
    engine='c',
    low_memory=True
)
del dtypes

In [5]:
df.head()

Unnamed: 0,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.4.118,4504,203.73.24.75,80,6,2010-12-06 08:34:32,3974862,29,44,86.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
1,192.168.4.118,4504,203.73.24.75,80,6,2010-12-06 08:34:36,63,1,1,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
2,192.168.4.118,4505,203.73.24.75,80,6,2010-12-06 08:34:36,476078,2,6,86.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
3,192.168.4.118,4505,203.73.24.75,80,6,2010-12-06 08:34:37,151,2,1,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos
4,192.168.4.118,4506,203.73.24.75,80,6,2010-12-06 08:34:37,472507,2,5,73.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ddos


In [6]:
df.shape

(12794627, 83)

In [7]:
MB = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(MB))

Memory usage of dataframe is 3541.11 MB


In [8]:
# Dropping columns that have just one value
colsToDrop = np.array(['Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg'])

In [9]:
# Drop categorical columns that have one category predominance
rowbuilder = lambda col: {'col': col, 'unique_values': df[col].nunique(), 'most_frequent_value': df[col].value_counts().index[0],'frequency': df[col].value_counts(normalize=True).values[0]}
frequency = [rowbuilder(col) for col in df.select_dtypes(include=['category']).columns]
skewed = pd.DataFrame(frequency)
skewed = skewed[skewed['frequency'] >= 0.95]
colsToDrop = np.union1d(colsToDrop, skewed['col'].values)
colsToDrop
del skewed
del rowbuilder
del frequency

In [10]:
# Drop columns where missing values are more than 50% Drop rows where a column missing values are no more than 5%
missing = df.isna().sum()
missing = pd.DataFrame({'count': missing, '% of total': missing/len(df)*100}, index=df.columns)
colsToDrop = np.union1d(colsToDrop, missing[missing['% of total'] >= 50].index.values)
dropnaCols = missing[(missing['% of total'] > 0) & (missing['% of total'] <= 5)].index.values

In [11]:
# Handling faulty data
df['Flow Byts/s'].replace(np.inf, np.nan, inplace=True)
df['Flow Pkts/s'].replace(np.inf, np.nan, inplace=True)
dropnaCols = np.union1d(dropnaCols, ['Flow Byts/s', 'Flow Pkts/s'])

In [12]:
print(colsToDrop)
print(dropnaCols)

['Bwd Blk Rate Avg' 'Bwd Byts/b Avg' 'Bwd PSH Flags' 'Bwd Pkts/b Avg'
 'Bwd URG Flags' 'FIN Flag Cnt' 'Fwd Blk Rate Avg' 'Fwd Byts/b Avg'
 'Fwd PSH Flags' 'Fwd Pkts/b Avg' 'Fwd URG Flags' 'URG Flag Cnt']
['Flow Byts/s' 'Flow Pkts/s']


In [13]:
df.drop(columns=colsToDrop, inplace=True)
df.dropna(subset=dropnaCols, inplace=True)

In [14]:
df.describe()

Unnamed: 0,Src Port,Dst Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,...,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746850.0,12746847.0,12746847.0
mean,37204.6,14506.39,8254783.0,27.29087,4.992832,1122.071,3246.234,291.1146,7.117381,78.52734,...,24.23191,8.073743,137426.5,69585.37,205710.5,90352.34,3136952.0,109640.9,3228279.0,3037019.5
std,25168.41,22994.85,25189320.0,1723.798,251.3901,54884.19,539742.7,386.2819,19.5127,95.78453,...,1714.791,9.126309,2302855.0,1401670.0,3053002.0,1936304.0,11993940.0,1413072.0,12211288.0,11882638.0
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,443.0,80.0,1280.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50622.0,80.0,32774.0,2.0,1.0,42.0,115.0,40.0,0.0,36.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,56246.0,37994.0,4167330.0,4.0,4.0,935.0,358.0,677.0,0.0,142.6667,...,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,65535.0,65535.0,4294967000.0,309628.0,291923.0,9908096.0,655452700.0,23360.0,1472.0,4660.441,...,309628.0,48.0,113269100.0,75232420.0,113269100.0,113269100.0,119999700.0,76393950.0,119999736.0,119999736.0


In [15]:
import socket
import struct

def ip2int(addr):
    return struct.unpack("!I", socket.inet_aton(addr))[0]

In [16]:
df.loc[:,'Src IP'] = df['Src IP'].apply(lambda x:ip2int(x)) 
df.loc[:,'Dst IP'] = df['Dst IP'].apply(lambda x:ip2int(x))
df.loc[:,'Timestamp'] = df['Timestamp'].values.astype(float)
df.loc[:,'SYN Flag Cnt'] = df['SYN Flag Cnt'].apply(lambda x:int(x))
df.loc[:,'RST Flag Cnt'] = df['RST Flag Cnt'].apply(lambda x:int(x))
df.loc[:,'PSH Flag Cnt'] = df['PSH Flag Cnt'].apply(lambda x:int(x))
df.loc[:,'ACK Flag Cnt'] = df['ACK Flag Cnt'].apply(lambda x:int(x))
df.loc[:,'CWE Flag Count'] = df['CWE Flag Count'].apply(lambda x:int(x))
df.loc[:,'ECE Flag Cnt'] = df['ECE Flag Cnt'].apply(lambda x:int(x))
df.loc[:,'Protocol'] = df['Protocol'].apply(lambda x:int(x))

In [17]:
y_data = df.pop('Label')
x_data = df

In [18]:
print(x_data.shape)
print(y_data.shape)

(12746847, 70)
(12746847,)


In [19]:
train_x, test_x, train_y, test_y = train_test_split(x_data,y_data,test_size=0.20,random_state=0)

In [20]:
train_x.reset_index(drop=True,inplace=True)
test_x.reset_index(drop=True,inplace=True)
train_y.reset_index(drop=True,inplace=True)
test_y.reset_index(drop=True,inplace=True)

In [21]:
print(train_x.shape,train_y.shape)
print(test_x.shape,test_y.shape)

(10197477, 70) (10197477,)
(2549370, 70) (2549370,)


In [22]:
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [23]:
def printReport(test_y,predict_y):
  accuracy = accuracy_score(test_y, predict_y)
  print('Accuracy: %.7f' % accuracy)

  # T up-down  F left-right
  print('Confusion Matrix')                      #  TP FP
  print(confusion_matrix(test_y,predict_y))      #  FN TN 

  print(classification_report(test_y,predict_y,digits=7))

In [24]:
n=5
classifier = KNeighborsClassifier(n_neighbors=n,metric='minkowski',algorithm='kd_tree',n_jobs=-1)
classifier.fit(train_x,train_y)

predict_y = classifier.predict(test_x)

print('K-Nearest Neighbour Classifier')
printReport(test_y,predict_y)

K-Nearest Neighbour Classifier
Accuracy: 0.9999871
Confusion Matrix
[[1254753      23]
 [     10 1294584]]
              precision    recall  f1-score   support

      Benign  0.9999920 0.9999817 0.9999869   1254776
        ddos  0.9999822 0.9999923 0.9999873   1294594

    accuracy                      0.9999871   2549370
   macro avg  0.9999871 0.9999870 0.9999871   2549370
weighted avg  0.9999871 0.9999871 0.9999871   2549370



In [24]:
classifier = LogisticRegression(random_state=0, max_iter=1000, solver='sag', n_jobs=-1)
classifier.fit(train_x,train_y)

predict_y = classifier.predict(test_x)

print('Logistic Regression')
printReport(test_y,predict_y)



Logistic Regression
Accuracy: 0.9947646
Confusion Matrix
[[1246551    8225]
 [   5122 1289472]]
              precision    recall  f1-score   support

      Benign  0.9959079 0.9934450 0.9946749   1254776
        ddos  0.9936618 0.9960435 0.9948513   1294594

    accuracy                      0.9947646   2549370
   macro avg  0.9947849 0.9947443 0.9947631   2549370
weighted avg  0.9947673 0.9947646 0.9947645   2549370



In [25]:
model = RandomForestClassifier(random_state=0,n_jobs=-1)
model.fit(train_x, train_y)

predict_y = model.predict(test_x)

print('Random Forest Classifier')
printReport(test_y,predict_y)

Random Forest Classifier
Accuracy: 1.0000000
Confusion Matrix
[[1254776       0]
 [      0 1294594]]
              precision    recall  f1-score   support

      Benign  1.0000000 1.0000000 1.0000000   1254776
        ddos  1.0000000 1.0000000 1.0000000   1294594

    accuracy                      1.0000000   2549370
   macro avg  1.0000000 1.0000000 1.0000000   2549370
weighted avg  1.0000000 1.0000000 1.0000000   2549370



In [None]:
classifier = svm.SVC(random_state=0, kernel='rbf')
classifier.fit(train_x,train_y)

predict_y = classifier.predict(test_x)

print('Support Vector Machine')
printReport(test_y,predict_y)