#### IMPORTING REQUIRED MODULES

In [1]:
# importing modules

import numpy as np 
import pandas as pd 

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw dataset from the CSV file that sits next to this notebook
df = pd.read_csv(filepath_or_buffer='./dataset.csv')

#### PERFORMING EDA

In [3]:
# Display the column names of the dataset exactly as they were read from the CSV
df.columns

Index(['Unnamed: 0', 'Flow ID', 'Source IP', 'Source Port', 'Destination IP',
       'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Le

In [4]:
# Remove the columns that are not needed for the rest of the analysis
columns_to_drop = [
    "Unnamed: 0",
    "Flow ID",
    "Source IP",
    "Source Port",
    "Destination IP",
    "Destination Port",
    "Protocol",
    "Timestamp",
]
df = df.drop(columns=columns_to_drop)

# Show the remaining column names
df.columns

Index(['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PS

In [5]:
# Checking for null values: count them once per column and report only the
# columns that actually contain some (same message format as before).
null_counts = df.isnull().sum()
for col, n_null in null_counts[null_counts > 0].items():
    print(str(col) + " has " + str(n_null) + " null values ")

# Checking total rows in df
total_rows = len(df)
print(total_rows)

# Checking the percentage of rows that contain at least one null value.
# This is computed from the data instead of being typed in by hand, so the
# figure stays correct if the dataset changes. These are exactly the rows
# that dropna() removes below. The guard avoids a division by zero on an
# empty frame.
rows_with_null = int(df.isnull().any(axis=1).sum())
print((rows_with_null / total_rows) * 100 if total_rows else 0.0)

# Removing the rows with nulls: they are a small share of the data (about 2%
# in the recorded run), so dropping them should not affect the result much.
df.dropna(inplace=True)

Flow Bytes/s has 21087 null values 
985290
2.140182078372865


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, inspected to see if there are any unnecessary columns
df.describe()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
count,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,...,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0,964203.0
mean,3077556.0,10.891449,0.204105,4455.553073,71.88417,590.898235,573.25948,584.28197,6.827721,6.422334,...,-40465130.0,2809.354,2224.001,6064.781,1601.227,841718.3,210984.7,1109876.0,646774.7,0.982651
std,15910760.0,278.222502,3.94541,12987.980119,11325.32,538.979083,545.697192,540.024761,25.6144,147.738412,...,203385700.0,99477.08,80333.31,171036.8,81721.82,4167550.0,1328895.0,5494114.0,3432059.0,0.130568
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1062719000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.0,0.0,458.0,0.0,229.0,229.0,229.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,2.0,0.0,1180.0,0.0,401.0,383.0,401.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,420.0,4.0,0.0,2944.0,0.0,1226.0,1225.0,1225.0,0.0,0.0,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,120000000.0,86176.0,1856.0,176000.0,6068687.0,3621.0,2019.0,2666.086957,1315.418679,35040.0,...,1480.0,38446140.0,48680470.0,72868430.0,13101140.0,114745500.0,54244360.0,114745500.0,114745500.0,1.0


In [7]:
# Print each column's dtype and non-null count in order to spot input
# features that are stored as strings rather than numbers
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 964203 entries, 0 to 985289
Data columns (total 80 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Flow Duration                964203 non-null  int64  
 1   Total Fwd Packets            964203 non-null  int64  
 2   Total Backward Packets       964203 non-null  int64  
 3   Total Length of Fwd Packets  964203 non-null  float64
 4   Total Length of Bwd Packets  964203 non-null  float64
 5   Fwd Packet Length Max        964203 non-null  float64
 6   Fwd Packet Length Min        964203 non-null  float64
 7   Fwd Packet Length Mean       964203 non-null  float64
 8   Fwd Packet Length Std        964203 non-null  float64
 9   Bwd Packet Length Max        964203 non-null  float64
 10  Bwd Packet Length Min        964203 non-null  float64
 11  Bwd Packet Length Mean       964203 non-null  float64
 12  Bwd Packet Length Std        964203 non-null  float64
 13 

In [8]:
# Inspect the distinct values stored in the 'SimillarHTTP' feature
similar_http_values = np.unique(df['SimillarHTTP'])
print(similar_http_values)

# The feature turned out not to be useful, so it is removed from the frame
df.drop(columns='SimillarHTTP', inplace=True)

['0' '0.gravatar.com/avatar/?s=40&d=mm&r=g'
 '0.gravatar.com/avatar/?s=48&d=mm&r=g'
 '1.gravatar.com/avatar/?s=40&d=mm&r=g'
 '1.gravatar.com/avatar/?s=48&d=mm&r=g'
 '11.tlu.dl.delivery.mp.microsoft.com/filestreamingservice/files/20482332-875d-405b-8a17-f256e205f983?P1=1543633297&P2=402&P3=2&P4=bEJ2NeZzPx%2bDwlzU3UuU0f8gECv78aHQI5ytEYcJzuG6hT8g%2buj60fnS1U7zJUf2u2ZBxrHZnII2l5gHg0hjQQ%3d%3d'
 '1b.tlu.dl.delivery.mp.microsoft.com/filestreamingservice/files/0474fd6b-8c76-407a-8b61-079dbf032ec9?P1=1543633281&P2=402&P3=2&P4=oRPLLcWdg3fRgY7e%2bgipNa0u%2bROwr%2fbuVK1vgM%2fGOGu%2blTQyVUXJXNQ%2bEqNkEKpqc4JxU%2f6eajEGvxFhYGIvjQ%3d%3d'
 '1b.tlu.dl.delivery.mp.microsoft.com/filestreamingservice/files/407eb178-7954-4c17-99c4-574244931c5d?P1=1543671169&P2=402&P3=2&P4=d8j%2fPViy6%2fEO5G1%2fb60jkSIPmyIRDwXRRcjHt0UEAJJWznZmjtn%2b3Godl0kXWGHvJ3JkCl1523cTy5LpI3zelA%3d%3d'
 '1b.tlu.dl.delivery.mp.microsoft.com/filestreamingservice/files/89c41fd5-5d40-4d5c-bd81-c99f68e7aa5b?P1=1543672193&P2=402&P3=2&P4=I%2f

In [9]:
# Replace the class labels in 'Label' (the output column) with integer codes.
# The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(df['Label'])
df['Label'] = label_encoder.transform(df['Label'])

In [11]:
# Checking for inf values.
# Count the infinite entries per column. The previous version called .sum()
# on the single boolean returned by .any(), which always printed 1 no matter
# how many infinite values a column really had.
inf_counts = np.isinf(df).sum()
for col, n_inf in inf_counts[inf_counts > 0].items():
    print(col, " has ", n_inf, " infinite values ")

# Drop every row that contains an infinite value: turn +/-inf into NaN first
# so that dropna() removes those rows.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

# Sanity check: total number of infinite values left in the frame (expected 0)
np.isinf(df).sum().sum()


Flow Bytes/s  has  1  infinite values 
Flow Packets/s  has  1  infinite values 


0

In [10]:
# Final dataset after EDA: preview the first rows of the cleaned frame.
# NOTE(review): this cell's execution count (10) is lower than that of the
# inf-check cell shown before it (11), so the saved output may predate the
# removal of infinite values; re-run the notebook top to bottom to confirm.
df.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound,Label
0,1,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,8
1,46,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,8
2,7147,134,0,58960.0,0.0,440.0,440.0,440.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,3
3,1,2,0,2560.0,0.0,1280.0,1280.0,1280.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2
4,38747799,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,0.0,50.0,50.0,38747748.0,0.0,38747748.0,38747748.0,1,8


#### BUILDING MODEL

In [12]:
# Separate the output label from the input features
y = df['Label']
X = df.drop(columns='Label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Min-Max scaling: the scaler is fitted on the training features only and the
# same fitted scaler is then applied to both the training and the test features
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

##### XGBOOST

In [19]:
import xgboost as xgb

# Build the XGBoost classifier: 100 boosting rounds, fixed seed
xgb_classifier = xgb.XGBClassifier(
    random_state=42,
    n_estimators=100,
)

# Train it on the scaled training features and the encoded training labels
xgb_classifier.fit(X=X_train_scaled, y=y_train)

In [22]:
# Predict the labels of the held-out test set with the fitted classifier.
# The predictions are computed in this cell so that it does not depend on a
# variable that no visible cell of the notebook defines (which would raise a
# NameError on a fresh "Restart & Run All").
y_pred_xgb = xgb_classifier.predict(X_test_scaled)

# Calculating the metrics. Precision, recall and F1 use the 'weighted'
# average, i.e. the per-class scores are averaged with weights proportional
# to the number of true samples of each class.
accuracy = accuracy_score(y_test, y_pred_xgb)
precision = precision_score(y_test, y_pred_xgb, average='weighted')
recall = recall_score(y_test, y_pred_xgb, average='weighted')
f1 = f1_score(y_test, y_pred_xgb, average='weighted')
# The confusion matrix is computed and kept in `conf_matrix`; it is not printed here
conf_matrix = confusion_matrix(y_test, y_pred_xgb)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred_xgb)

# Display the metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall (Sensitivity): {recall}')
print(f'F1-Score: {f1}')
print(f'Balanced Accuracy: {balanced_accuracy}')

Accuracy: 0.8126145387904704
Precision: 0.8301846168671998
Recall (Sensitivity): 0.8126145387904704
F1-Score: 0.7984704549976329
Balanced Accuracy: 0.7263384912545817
