In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset files stored there can
# be addressed by ordinary filesystem paths in the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# --- Dataset locations (Google Drive, mounted under /content/drive) ---
DATASETS_PATH = '/content/drive/My Drive/Datasets/ML Project/data/recommended'
TRAINING_PATH = f'{DATASETS_PATH}/training/training.csv'
TESTING_PATH = f'{DATASETS_PATH}/test/test.csv'

# Column used as the DataFrame index when the CSVs are read.
INDEX_COL = 'pkSeqID'

# Columns kept after loading, in the order they should appear in the frame.
BEST_FEATURE_ORDER = [
  # numeric flow features
  'seq',
  'min',
  'max',
  'mean',
  'stddev',
  'N_IN_Conn_P_SrcIP',
  'N_IN_Conn_P_DstIP',
  'drate',
  'srate',
  # addressing / protocol fields
  'saddr',
  'sport',
  'daddr',
  'dport',
  'proto',
  'state_number',
  # label columns
  'attack',
  'category',
  'subcategory',
]

In [69]:
# Read the training split indexed by INDEX_COL, then keep only the columns
# listed in BEST_FEATURE_ORDER (which also fixes their left-to-right order).
training = pd.read_csv(TRAINING_PATH, index_col=INDEX_COL).loc[:, BEST_FEATURE_ORDER]
training.head(n=5)

Unnamed: 0_level_0,seq,min,max,mean,stddev,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,drate,srate,saddr,sport,daddr,dport,proto,state_number,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3142762,251984,0.0,4.031619,2.687519,1.900363,100,100,0.0,0.494549,192.168.100.150,6551,192.168.100.3,80,udp,4,1,DDoS,UDP
2432264,256724,3.85693,4.012924,3.934927,0.078003,38,100,0.0,0.256493,192.168.100.150,5532,192.168.100.3,80,tcp,3,1,DDoS,TCP
1976315,62921,2.9741,3.609205,3.341429,0.268666,100,100,0.0,0.29488,192.168.100.147,27165,192.168.100.3,80,tcp,3,1,DDoS,TCP
1240757,99168,0.0,4.942302,3.222832,1.823185,63,63,0.0,0.461435,192.168.100.150,48719,192.168.100.3,80,udp,4,1,DoS,UDP
3257991,105063,2.979995,4.994452,3.983222,0.822418,100,100,0.0,1.002999,192.168.100.147,22461,192.168.100.3,80,udp,4,1,DDoS,UDP


In [70]:
# Read the test split the same way as the training split: indexed by
# INDEX_COL and restricted to the BEST_FEATURE_ORDER columns.
testing = pd.read_csv(TESTING_PATH, index_col=INDEX_COL).loc[:, BEST_FEATURE_ORDER]

In [28]:
# Report (rows, columns) of both splits, one per line.
print(
    f'Training Shape: {training.shape}',
    f'Testing Shape: {testing.shape}',
    sep='\n',
)

Training Shape: (2934817, 18)
Testing Shape: (733705, 18)


In [29]:
# Print the frame summary: index range, per-column dtype and memory usage.
training.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2934817 entries, 3142762 to 96906
Data columns (total 18 columns):
 #   Column             Dtype  
---  ------             -----  
 0   seq                int64  
 1   min                float64
 2   max                float64
 3   mean               float64
 4   stddev             float64
 5   N_IN_Conn_P_SrcIP  int64  
 6   N_IN_Conn_P_DstIP  int64  
 7   drate              float64
 8   srate              float64
 9   saddr              object 
 10  sport              object 
 11  daddr              object 
 12  dport              object 
 13  proto              object 
 14  state_number       int64  
 15  attack             int64  
 16  category           object 
 17  subcategory        object 
dtypes: float64(6), int64(5), object(7)
memory usage: 425.4+ MB


In [30]:
import numpy as np

# Partition the columns by dtype.
# select_dtypes(include='number') selects genuinely numeric dtypes, so bool,
# datetime, category or string-dtype columns are not counted as "numerical"
# the way a bare `dtype != object` comparison would count them.
numerical = training.select_dtypes(include='number').columns.tolist()
non_numerical = training.select_dtypes(exclude='number').columns.tolist()

print(f'There are {len(numerical)} numerical features, and {len(non_numerical)} non numerical features')

There are 11 numerical features, and 7 non numerical features


In [39]:
# Number of missing values in each column (summing the NaN mask down the rows).
missing_per_column = training.isna().sum(axis=0)
missing_per_column

Unnamed: 0,0
seq,0
min,0
max,0
mean,0
stddev,0
N_IN_Conn_P_SrcIP,0
N_IN_Conn_P_DstIP,0
drate,0
srate,0
saddr,0


In [37]:
# Count fully duplicated rows. Series.sum() is vectorised, unlike the builtin
# sum(), which would iterate the boolean mask element by element in Python.
# int() keeps the displayed result a plain Python integer.
int(training.duplicated().sum())

0

The dataset has no duplicated rows and no missing values.

In [42]:
# Show the first five rows again for reference before inspecting the address columns.
training.head(n=5)

Unnamed: 0_level_0,seq,min,max,mean,stddev,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,drate,srate,saddr,sport,daddr,dport,proto,state_number,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3142762,251984,0.0,4.031619,2.687519,1.900363,100,100,0.0,0.494549,192.168.100.150,6551,192.168.100.3,80,udp,4,1,DDoS,UDP
2432264,256724,3.85693,4.012924,3.934927,0.078003,38,100,0.0,0.256493,192.168.100.150,5532,192.168.100.3,80,tcp,3,1,DDoS,TCP
1976315,62921,2.9741,3.609205,3.341429,0.268666,100,100,0.0,0.29488,192.168.100.147,27165,192.168.100.3,80,tcp,3,1,DDoS,TCP
1240757,99168,0.0,4.942302,3.222832,1.823185,63,63,0.0,0.461435,192.168.100.150,48719,192.168.100.3,80,udp,4,1,DoS,UDP
3257991,105063,2.979995,4.994452,3.983222,0.822418,100,100,0.0,1.002999,192.168.100.147,22461,192.168.100.3,80,udp,4,1,DDoS,UDP


In [47]:
# How often each source address occurs, most frequent first.
saddr_counts = training['saddr'].value_counts()
saddr_counts

Unnamed: 0_level_0,count
saddr,Unnamed: 1_level_1
192.168.100.147,761360
192.168.100.148,738642
192.168.100.150,712260
192.168.100.149,711466
192.168.100.3,6609
192.168.100.5,4107
192.168.100.6,272
192.168.100.7,34
192.168.100.4,17
192.168.100.1,14


Most of the source addresses belong to a local network (`192.168.100.x`), so they are not very useful in a real-world scenario.

**Conclusion:**
`saddr` should probably be dropped.

In [51]:
# Start the list of columns to remove; later cells add to it and it is
# passed to DataFrame.drop once the inspection is finished.
TO_DROP = ['saddr']

In [52]:
# How often each destination address occurs, most frequent first.
daddr_counts = training['daddr'].value_counts()
daddr_counts

Unnamed: 0_level_0,count
daddr,Unnamed: 1_level_1
192.168.100.3,1900562
192.168.100.5,361192
192.168.100.7,332161
192.168.100.6,329679
192.168.100.150,3040
...,...
205.251.194.167,1
224.0.0.252,1
216.239.38.10,1
205.251.194.84,1


In [53]:
def _is_non_local(addr):
    """Return True when the address does not start with the '192.168' prefix."""
    return not str(addr).startswith('192.168')


# Destination addresses outside the 192.168 prefix, and how they compare in
# number to the addresses inside it.
non_local_mask = training['daddr'].map(_is_non_local)
non_local_addr = training.loc[non_local_mask, 'daddr']

l_non_local = len(non_local_addr)
l_local = len(training['daddr']) - l_non_local

print(f"Non-Local to Local ratio: {l_non_local} / {l_local} = {l_non_local / l_local}")
non_local_addr.value_counts()

Non-Local to Local ratio: 242 / 2934575 = 8.246509290101632e-05


Unnamed: 0_level_0,count
daddr,Unnamed: 1_level_1
8.8.8.8,48
224.0.0.251,42
27.124.125.250,12
ff02::fb,7
ff02::2,6
...,...
205.251.194.167,1
224.0.0.252,1
216.239.38.10,1
205.251.194.84,1


The vast majority of destination addresses are on the local network (only 242 are non-local), which is unlikely to be the case in a real-world scenario.

**Conclusion:** `daddr` is to be dropped

In [54]:
# Queue `daddr` for removal. The membership check keeps the cell idempotent:
# re-running it does not append a second 'daddr' entry.
if 'daddr' not in TO_DROP:
    TO_DROP = TO_DROP + ['daddr']
print(TO_DROP)

['saddr', 'daddr']


In [55]:
# Preview the first five rows (nothing has been dropped from the frame yet).
training.head(n=5)

Unnamed: 0_level_0,seq,min,max,mean,stddev,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,drate,srate,saddr,sport,daddr,dport,proto,state_number,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3142762,251984,0.0,4.031619,2.687519,1.900363,100,100,0.0,0.494549,192.168.100.150,6551,192.168.100.3,80,udp,4,1,DDoS,UDP
2432264,256724,3.85693,4.012924,3.934927,0.078003,38,100,0.0,0.256493,192.168.100.150,5532,192.168.100.3,80,tcp,3,1,DDoS,TCP
1976315,62921,2.9741,3.609205,3.341429,0.268666,100,100,0.0,0.29488,192.168.100.147,27165,192.168.100.3,80,tcp,3,1,DDoS,TCP
1240757,99168,0.0,4.942302,3.222832,1.823185,63,63,0.0,0.461435,192.168.100.150,48719,192.168.100.3,80,udp,4,1,DoS,UDP
3257991,105063,2.979995,4.994452,3.983222,0.822418,100,100,0.0,1.002999,192.168.100.147,22461,192.168.100.3,80,udp,4,1,DDoS,UDP


In [56]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = training.describe()
summary_stats

Unnamed: 0,seq,min,max,mean,stddev,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,drate,srate,state_number,attack
count,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0,2934817.0
mean,121297.3,1.017208,3.019269,2.230471,0.8869639,82.54997,92.45766,0.4303064,3.12829,3.134219,0.9998739
std,75787.0,1.483551,1.860915,1.517766,0.8036391,24.39019,18.16651,56.23304,784.5494,1.187107,0.0112275
min,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
25%,54847.0,0.0,0.280417,0.181934,0.029997,69.0,100.0,0.0,0.155845,3.0,1.0
50%,117737.0,0.0,4.008429,2.689973,0.792575,100.0,100.0,0.0,0.28378,4.0,1.0
75%,184870.0,2.147949,4.292426,3.565061,1.74522,100.0,100.0,0.0,0.488,4.0,1.0
max,262211.0,4.980471,4.999999,4.981882,2.496763,100.0,100.0,58823.53,1000000.0,11.0,1.0


The `seq` column is defined as the Argus sequence number (Argus is an open-source tool used as a data source for an **ML for anomaly detection** use case).

Since this model is meant to be general and `seq` doesn't carry any useful information, it should be dropped.

In [57]:
# Queue `seq` for removal. The membership check keeps the cell idempotent:
# re-running it does not append a second 'seq' entry.
if 'seq' not in TO_DROP:
    TO_DROP += ['seq']

In [58]:
# Preview the first five rows before the queued columns are removed.
training.head(n=5)

Unnamed: 0_level_0,seq,min,max,mean,stddev,N_IN_Conn_P_SrcIP,N_IN_Conn_P_DstIP,drate,srate,saddr,sport,daddr,dport,proto,state_number,attack,category,subcategory
pkSeqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3142762,251984,0.0,4.031619,2.687519,1.900363,100,100,0.0,0.494549,192.168.100.150,6551,192.168.100.3,80,udp,4,1,DDoS,UDP
2432264,256724,3.85693,4.012924,3.934927,0.078003,38,100,0.0,0.256493,192.168.100.150,5532,192.168.100.3,80,tcp,3,1,DDoS,TCP
1976315,62921,2.9741,3.609205,3.341429,0.268666,100,100,0.0,0.29488,192.168.100.147,27165,192.168.100.3,80,tcp,3,1,DDoS,TCP
1240757,99168,0.0,4.942302,3.222832,1.823185,63,63,0.0,0.461435,192.168.100.150,48719,192.168.100.3,80,udp,4,1,DoS,UDP
3257991,105063,2.979995,4.994452,3.983222,0.822418,100,100,0.0,1.002999,192.168.100.147,22461,192.168.100.3,80,udp,4,1,DDoS,UDP


In [71]:
# Remove every column listed in TO_DROP from both splits.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: columns that are already gone are skipped
# rather than raising KeyError.
training = training.drop(columns=TO_DROP, errors='ignore')
testing = testing.drop(columns=TO_DROP, errors='ignore')

In [80]:
# Join category and subcategory into one "category subcategory" label per row,
# then count how many rows carry each combined label.
combined_label = training['category'] + ' ' + training['subcategory']
combined_label.value_counts()

Unnamed: 0,count
DoS UDP,826349
DDoS TCP,782228
DDoS UDP,758301
DoS TCP,492615
Reconnaissance Service_Scan,58626
Reconnaissance OS_Fingerprint,14293
DoS HTTP,1184
DDoS HTTP,786
Normal Normal,370
Theft Keylogging,59
