This section contains the preprocessing of the data.

In [4]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from sklearn.preprocessing import MinMaxScaler

In [5]:
dataset=pd.read_csv('Wednesday-workingHours.pcap_ISCX.csv')

First we should drop the redundant records.

In [6]:
newdata=dataset.drop_duplicates()
newdata.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.0,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


Now let's move on to handling null values.

In [7]:
final_data=newdata.dropna()
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 610733 entries, 0 to 692702
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             610733 non-null  int64  
 1    Flow Duration                610733 non-null  int64  
 2    Total Fwd Packets            610733 non-null  int64  
 3    Total Backward Packets       610733 non-null  int64  
 4   Total Length of Fwd Packets   610733 non-null  int64  
 5    Total Length of Bwd Packets  610733 non-null  int64  
 6    Fwd Packet Length Max        610733 non-null  int64  
 7    Fwd Packet Length Min        610733 non-null  int64  
 8    Fwd Packet Length Mean       610733 non-null  float64
 9    Fwd Packet Length Std        610733 non-null  float64
 10  Bwd Packet Length Max         610733 non-null  int64  
 11   Bwd Packet Length Min        610733 non-null  int64  
 12   Bwd Packet Length Mean       610733 non-null  fl

Our model will only need to detect DoS attacks so we are eliminating the 'heartbleed' labeled rows.

In [8]:
mask=final_data[' Label'] == 'Heartbleed'
htbleed=final_data[mask]
print(htbleed)

         Destination Port   Flow Duration   Total Fwd Packets  \
597130                444       119302728                2685   
597175                444       119262215                2792   
597209                444       119261118                2794   
597265                444       119260295                2791   
597277                444       119297996                2782   
597510                444       119259886                2782   
597536                444       119259012                2801   
597722                444       119257653                2802   
597807                444       119299621                2805   
597813                444       119296592                2797   
597825                444        24719667                 590   

         Total Backward Packets  Total Length of Fwd Packets  \
597130                     1729                         8299   
597175                     2110                        13712   
597209                     

In [9]:
dt=final_data[~mask]
dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 610722 entries, 0 to 692702
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             610722 non-null  int64  
 1    Flow Duration                610722 non-null  int64  
 2    Total Fwd Packets            610722 non-null  int64  
 3    Total Backward Packets       610722 non-null  int64  
 4   Total Length of Fwd Packets   610722 non-null  int64  
 5    Total Length of Bwd Packets  610722 non-null  int64  
 6    Fwd Packet Length Max        610722 non-null  int64  
 7    Fwd Packet Length Min        610722 non-null  int64  
 8    Fwd Packet Length Mean       610722 non-null  float64
 9    Fwd Packet Length Std        610722 non-null  float64
 10  Bwd Packet Length Max         610722 non-null  int64  
 11   Bwd Packet Length Min        610722 non-null  int64  
 12   Bwd Packet Length Mean       610722 non-null  fl

We should transform the categorical data in our set, the 'Label' column contains the labels of the packets traced, we give 0 value to BENIGN labels and 1 to all other DoS labels.

In [10]:
dt[' Label']=dt[" Label"].map({'BENIGN':0,'DoS slowloris':1, 'DoS Slowhttptest':1, 'DoS Hulk':1, 'DoS GoldenEye':1 })
dt[' Label'].info()
                                                          

<class 'pandas.core.series.Series'>
Index: 610722 entries, 0 to 692702
Series name:  Label
Non-Null Count   Dtype
--------------   -----
610722 non-null  int64
dtypes: int64(1)
memory usage: 9.3 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dt[' Label']=dt[" Label"].map({'BENIGN':0,'DoS slowloris':1, 'DoS Slowhttptest':1, 'DoS Hulk':1, 'DoS GoldenEye':1 })


Let's now remove the infinite values in our dataset.

In [11]:
inf_mask=np.isinf(dt).any(axis=1)
dtt=dt[~inf_mask]
rows_with_inf_values = dtt[inf_mask]
rows_with_inf_values.info()


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0    Destination Port             0 non-null      int64  
 1    Flow Duration                0 non-null      int64  
 2    Total Fwd Packets            0 non-null      int64  
 3    Total Backward Packets       0 non-null      int64  
 4   Total Length of Fwd Packets   0 non-null      int64  
 5    Total Length of Bwd Packets  0 non-null      int64  
 6    Fwd Packet Length Max        0 non-null      int64  
 7    Fwd Packet Length Min        0 non-null      int64  
 8    Fwd Packet Length Mean       0 non-null      float64
 9    Fwd Packet Length Std        0 non-null      float64
 10  Bwd Packet Length Max         0 non-null      int64  
 11   Bwd Packet Length Min        0 non-null      int64  
 12   Bwd Packet Length Mean       0 non-null      float64
 13   Bwd Packet Length Std

  rows_with_inf_values = dtt[inf_mask]


Scaling the data to make the learning easier for our model.

In [13]:
scaler = MinMaxScaler()
dtt_normalized=scaler.fit_transform(dtt)
dtt.head()
dtt.tail()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
692698,53,32215,4,2,112,152,28,28,28.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
692699,53,324,2,2,84,362,42,42,42.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
692700,58030,82,2,1,31,6,31,0,15.5,21.92031,...,32,0.0,0.0,0,0,0.0,0.0,0,0,0
692701,53,1048635,6,2,192,256,32,32,32.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
692702,53,94939,4,2,188,226,47,47,47.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


Finally we save our modified dataset to a csv file.

In [14]:
dtt.to_csv('dt.csv', index=False)