In [1]:
import sys
sys.path.append("../") # adding the path to read the utils file
import pandas as pd
import glob
from utils import *

pd.set_option('display.max_columns', None)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# Preprocessing the data

In [2]:
# Define the pattern to match the file names
file_pattern = '../datasets/CICIDS-2018/*.csv'

# Retrieve the list of file paths matching the pattern
file_paths = glob.glob(file_pattern)

def load_ds(path):
    print(path)
    return pd.read_csv(path, encoding="us-ascii", encoding_errors="ignore", usecols=cicids_18_col, dtype_backend="pyarrow")

ds = pd.concat([load_ds(file_path) for file_path in file_paths], ignore_index=True)
ds.columns = ['Protocol', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
       'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio',
       'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size',
       'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate',
       'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate',
       'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets',
       'Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean',
       'Idle Std', 'Idle Max', 'Idle Min', 'Label']
ds.head()

../datasets/CICIDS-2018/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Friday-16-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv
../datasets/CICIDS-2018/Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv


Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6,37953,5,3,135.0,127.0,135.0,0.0,27.0,60.373835,127.0,0.0,42.333333,73.323484,6903.275103,210.787026,5421.857,5403.58,12099.0,23.0,37953.0,9488.25,3245.485,12382.0,6013.0,19960.0,9980.0,13546.75171,19559.0,401.0,0,0,0,0,168,104,131.741891,79.045135,0.0,135.0,29.111111,57.800183,3340.861111,0,0,0,1,0,0,0,0,0.0,32.75,27.0,42.333333,0,0,0,0,0,0,5,135,3,127,29200,219,1,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,17,117573474,3,0,1500.0,0.0,500.0,500.0,500.0,0.0,0.0,0.0,0.0,0.0,12.75798,0.025516,58800000.0,23800000.0,75600000.0,42000000.0,118000000.0,58800000.0,23800000.0,75600000.0,42000000.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,24,0,0.025516,0.0,500.0,500.0,500.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,666.666667,500.0,0.0,0,0,0,0,0,0,3,1500,0,0,-1,-1,2,8,0.0,0.0,0.0,0.0,58800000.0,23800000.0,75600000.0,42000000.0,Benign
2,17,117573474,3,0,1500.0,0.0,500.0,500.0,500.0,0.0,0.0,0.0,0.0,0.0,12.75798,0.025516,58800000.0,23800000.0,75600000.0,42000000.0,118000000.0,58800000.0,23800000.0,75600000.0,42000000.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,24,0,0.025516,0.0,500.0,500.0,500.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,666.666667,500.0,0.0,0,0,0,0,0,0,3,1500,0,0,-1,-1,2,8,0.0,0.0,0.0,0.0,58800000.0,23800000.0,75600000.0,42000000.0,Benign
3,17,99743998,5,0,2500.0,0.0,500.0,500.0,500.0,0.0,0.0,0.0,0.0,0.0,25.064165,0.050128,24900000.0,34000000.0,75600000.0,4000290.0,99700000.0,24900000.0,34000000.0,75600000.0,4000290.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40,0,0.050128,0.0,500.0,500.0,500.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,600.0,500.0,0.0,0,0,0,0,0,0,5,2500,0,0,-1,-1,4,8,4000290.0,0.0,4000290.0,4000290.0,31900000.0,37900000.0,75600000.0,7200397.0,Benign
4,17,99743999,5,0,2500.0,0.0,500.0,500.0,500.0,0.0,0.0,0.0,0.0,0.0,25.064165,0.050128,24900000.0,34000000.0,75600000.0,4000286.0,99700000.0,24900000.0,34000000.0,75600000.0,4000286.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40,0,0.050128,0.0,500.0,500.0,500.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,600.0,500.0,0.0,0,0,0,0,0,0,5,2500,0,0,-1,-1,4,8,4000286.0,0.0,4000286.0,4000286.0,31900000.0,37900000.0,75600000.0,7200399.0,Benign


In [3]:
compute_ratio(ds,'Label')

                          Ratio
Label                          
Benign                    83.07
DDOS attack-HOIC           4.23
DDoS attacks-LOIC-HTTP     3.55
DoS attacks-Hulk           2.85
Bot                        1.76
FTP-BruteForce             1.19
SSH-Bruteforce             1.16
Infilteration              1.00
DoS attacks-SlowHTTPTest   0.86
DoS attacks-GoldenEye      0.26
DoS attacks-Slowloris      0.07
DDOS attack-LOIC-UDP       0.01
Brute Force -Web           0.00
Brute Force -XSS           0.00
SQL Injection              0.00


In [4]:
original_lenght = ds.shape[0]

In [5]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16232988 entries, 0 to 16232987
Data columns (total 78 columns):
 #   Column                       Dtype          
---  ------                       -----          
 0   Protocol                     int64[pyarrow] 
 1   Flow Duration                int64[pyarrow] 
 2   Total Fwd Packets            int64[pyarrow] 
 3   Total Backward Packets       int64[pyarrow] 
 4   Total Length of Fwd Packets  double[pyarrow]
 5   Total Length of Bwd Packets  double[pyarrow]
 6   Fwd Packet Length Max        double[pyarrow]
 7   Fwd Packet Length Min        double[pyarrow]
 8   Fwd Packet Length Mean       double[pyarrow]
 9   Fwd Packet Length Std        double[pyarrow]
 10  Bwd Packet Length Max        double[pyarrow]
 11  Bwd Packet Length Min        double[pyarrow]
 12  Bwd Packet Length Mean       double[pyarrow]
 13  Bwd Packet Length Std        double[pyarrow]
 14  Flow Bytes/s                 double[pyarrow]
 15  Flow Packets/s               d

In [6]:
def downcast_df(df):
    # Identify integer columns for downcast
    int_cols = df.select_dtypes(include='int64').columns

    # Downcast integer columns to unsigned integers
    for col in int_cols:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    # Identify float columns for downcast
    float_cols = df.select_dtypes(include='float64').columns

    # Downcast float columns to smaller floating-point types
    for col in float_cols:
        df[col] = pd.to_numeric(df[col], downcast='float')

    return df

In [7]:
ds = downcast_df(ds)

In [8]:
ds.dropna(inplace=True)

In [9]:
droped_na_lenght = ds.shape[0]

In [10]:
print("removed", original_lenght - droped_na_lenght, "null values")

removed 59766 null values


In [11]:
ds.drop_duplicates(inplace=True)

In [12]:
print("removed", droped_na_lenght - ds.shape[0], "duplicates")

removed 5351241 duplicates


removing the unecessary columns, like port, ip, timestap and flow id

In [13]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10821981 entries, 0 to 16232987
Data columns (total 78 columns):
 #   Column                       Dtype          
---  ------                       -----          
 0   Protocol                     uint8[pyarrow] 
 1   Flow Duration                int64[pyarrow] 
 2   Total Fwd Packets            uint32[pyarrow]
 3   Total Backward Packets       uint32[pyarrow]
 4   Total Length of Fwd Packets  double[pyarrow]
 5   Total Length of Bwd Packets  double[pyarrow]
 6   Fwd Packet Length Max        float[pyarrow] 
 7   Fwd Packet Length Min        float[pyarrow] 
 8   Fwd Packet Length Mean       double[pyarrow]
 9   Fwd Packet Length Std        double[pyarrow]
 10  Bwd Packet Length Max        float[pyarrow] 
 11  Bwd Packet Length Min        float[pyarrow] 
 12  Bwd Packet Length Mean       double[pyarrow]
 13  Bwd Packet Length Std        double[pyarrow]
 14  Flow Bytes/s                 double[pyarrow]
 15  Flow Packets/s               double

In [14]:
ds[np.isinf(ds.select_dtypes(include=np.number)).any(axis=1)].Label.value_counts()

Label
Benign           1099
Infilteration      46
Name: count, dtype: int64[pyarrow]

In [15]:
ds[np.isinf(ds.select_dtypes(include=np.number)).any(axis=1)]

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
1431849,6,0,2,0,31.0,0.0,31.0,0.0,15.5,21.920310,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,31.0,20.666667,17.897858,320.333333,0,1,0,0,1,0,0,0,0.0,31.0,15.5,0.0,0,0,0,0,0,0,2,31,0,0,980,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1433506,6,0,2,0,31.0,0.0,31.0,0.0,15.5,21.920310,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,31.0,20.666667,17.897858,320.333333,0,1,0,0,1,0,0,0,0.0,31.0,15.5,0.0,0,0,0,0,0,0,2,31,0,0,946,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1434527,6,0,2,0,31.0,0.0,31.0,0.0,15.5,21.920310,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,31.0,20.666667,17.897858,320.333333,0,1,0,0,1,0,0,0,0.0,31.0,15.5,0.0,0,0,0,0,0,0,2,31,0,0,294,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1434587,6,0,2,0,38.0,0.0,38.0,0.0,19.0,26.870058,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,38.0,25.333333,21.939310,481.333333,0,1,0,0,1,0,0,0,0.0,38.0,19.0,0.0,0,0,0,0,0,0,2,38,0,0,176,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1434831,6,0,2,0,53.0,0.0,53.0,0.0,26.5,37.476659,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,53.0,35.333333,30.599564,936.333333,0,1,0,0,1,0,0,0,0.0,53.0,26.5,0.0,0,0,0,0,0,0,2,53,0,0,245,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16161432,6,0,1,1,58.0,31.0,58.0,58.0,58.0,0.000000,31.0,31.0,31.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,20,20,0.0,0.0,31.0,58.0,49.000000,15.588457,243.000000,0,1,0,0,1,0,0,0,1.0,73.5,58.0,31.0,0,0,0,0,0,0,1,58,1,31,68,68,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infilteration
16177874,6,0,2,0,53.0,0.0,53.0,0.0,26.5,37.476659,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,53.0,35.333333,30.599564,936.333333,0,1,0,0,1,0,0,0,0.0,53.0,26.5,0.0,0,0,0,0,0,0,2,53,0,0,131,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infilteration
16181398,6,0,2,0,31.0,0.0,31.0,0.0,15.5,21.920310,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,40,0,0.0,0.0,0.0,31.0,20.666667,17.897858,320.333333,0,1,0,0,1,0,0,0,0.0,31.0,15.5,0.0,0,0,0,0,0,0,2,31,0,0,122,-1,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Infilteration
16201291,6,0,1,1,63.0,0.0,63.0,63.0,63.0,0.000000,0.0,0.0,0.0,0.0,inf,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,20,20,0.0,0.0,0.0,63.0,42.000000,36.373067,1323.000000,0,1,0,0,1,0,0,0,1.0,63.0,63.0,0.0,0,0,0,0,0,0,1,63,1,0,215,215,0,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [16]:
ds = ds[~np.isinf(ds.select_dtypes(include=np.number)).any(axis=1)]

In [17]:
ds["Attack"] = ds["Label"].apply(lambda x: 1 if x != "Benign" else 0).astype("bool")

In [18]:
ds.describe()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
count,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820836.0,10820840.0,10820836.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820836.0,10820836.0,10820836.0,10820836.0,10820836.0,10820836.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0,10820840.0
mean,8.087906,17550720.0,34.55673,9.19237,1450.181,7080.574,292.1326,8.675383,123.0545,105.6497,508.8795,23.33959,809.7099,198.57,596470.4,3980.312,26183760.0,81467060.0,9760533.0,3340714.0,17114900.0,30943940.0,99537230.0,9471133.0,3470634.0,11395210.0,23846180.0,22972870.0,3917800.0,434870.9,0.04291387,0.0,0.0001802079,0.0,372.1678,196.2204,2997.746,975.087,8.532274,566.493,348.7025,176.7857,193032.4,0.002312113,0.04291387,0.281115,0.5473757,0.2295595,0.01532303,0.0001802079,0.2811151,0.4709304,2783.823,123.0545,809.7099,0.0,0.0,0.0,0.0,0.0,0.0,34.55673,1450.181,9.19237,7080.54,10186.36,12873.43,29.82517,18.0968,476838.4,1333987.0,393608.5,173197.6,16668090.0,3793741.0,8023772.0,6909935.0
std,4.556638,604976400.0,1863.004,200.8376,76140.71,287146.6,335.9054,23.8323,2479.685,129.3054,542.5371,51.15383,17377.33,898.8252,28881980.0,410565.5,402551100.0,597621400.0,807748900.0,915685400.0,604974100.0,480440700.0,644477800.0,807747600.0,915688800.0,30983300.0,380355900.0,308698400.0,12333570.0,4656384.0,0.202663,0.0,0.01342294,0.0,15019.15,3998.1,408503.8,13244.71,21.50273,548.817,7424.123,257.2121,6094536.0,0.04802882,0.202663,0.4495435,0.4977505,0.4205496,0.1228342,0.01342294,0.4495435,1.167465,21098.45,2479.685,17377.33,0.0,0.0,0.0,0.0,0.0,0.0,1863.004,76140.71,200.8376,287134.0,17427.97,23997.03,1861.528,6.728805,29248130.0,61560770.0,4059054.0,2587479.0,439125000.0,243876000.0,765939600.0,77805080.0
min,0.0,-919011000000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.008895325,-828220000000.0,0.0,-828220000000.0,-947405000000.0,-919011000000.0,-828220000000.0,0.0,-828220000000.0,-947405000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,16983.0,2.0,1.0,32.0,51.0,31.0,0.0,22.0,0.0,47.0,0.0,32.25,0.0,53.15157,1.582906,7463.593,9.899495,14816.0,3.0,960.0,321.75,0.0,804.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,8.0,0.877527,0.2302696,0.0,54.0,36.44444,18.47521,341.3333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,22.0,32.25,0.0,0.0,0.0,0.0,0.0,0.0,2.0,32.0,1.0,51.0,219.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,1171164.0,3.0,3.0,161.0,274.0,103.0,0.0,47.0,49.21721,231.0,0.0,120.0,102.6649,926.6347,9.291658,118716.3,54120.72,805123.5,29.0,383921.0,129585.1,8558.514,210840.0,80.0,22167.0,9151.0,196.9678,18414.0,2.0,0.0,0.0,0.0,0.0,72.0,92.0,5.164577,3.336763,0.0,259.0,77.33333,109.2928,11944.92,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,96.0,47.0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,161.0,3.0,274.0,8192.0,149.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.0,5733249.0,8.0,7.0,935.0,1581.0,640.0,0.0,108.6667,195.8092,1173.0,0.0,225.8571,429.5,12525.26,178.3031,764250.0,599676.7,5234965.0,809.0,5128750.0,1083967.0,411425.3,4285852.0,7182.0,2101293.0,376376.6,345250.7,967002.0,16726.0,0.0,0.0,0.0,0.0,172.0,152.0,96.32983,62.21425,0.0,1173.0,151.6111,319.3302,101992.6,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,170.3125,108.6667,225.8571,0.0,0.0,0.0,0.0,0.0,0.0,8.0,935.0,7.0,1581.0,8192.0,7970.0,5.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,17.0,120000000.0,309629.0,123118.0,144391800.0,156360400.0,64440.0,1460.0,1299125.0,18401.58,65160.0,1460.0,1416624.0,632631.0,9857143000.0,1333333000.0,14999040000.0,474354500000.0,979781000000.0,120000000.0,120000000.0,14999990000.0,474354500000.0,979781000000.0,120000000.0,120000000.0,14985020000.0,24642060000.0,120000000.0,120000000.0,1.0,0.0,1.0,0.0,2477032.0,2462372.0,1333333000.0,2000000.0,1460.0,65160.0,1054925.0,476212.0,2774026000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,311.0,1187125.0,1299125.0,1416624.0,0.0,0.0,0.0,0.0,0.0,0.0,309629.0,144391800.0,123118.0,156360400.0,65535.0,65535.0,309628.0,56.0,9833661000.0,9969606000.0,114000000.0,114000000.0,395571400000.0,262247900000.0,979781000000.0,239934000000.0


In [19]:
ds.drop(columns=["Bwd PSH Flags","Bwd URG Flags","Fwd Avg Bytes/Bulk","Fwd Avg Packets/Bulk","Fwd Avg Bulk Rate","Bwd Avg Bytes/Bulk","Bwd Avg Packets/Bulk","Bwd Avg Bulk Rate"], inplace=True)

In [20]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10820836 entries, 0 to 16232987
Data columns (total 71 columns):
 #   Column                       Dtype          
---  ------                       -----          
 0   Protocol                     uint8[pyarrow] 
 1   Flow Duration                int64[pyarrow] 
 2   Total Fwd Packets            uint32[pyarrow]
 3   Total Backward Packets       uint32[pyarrow]
 4   Total Length of Fwd Packets  double[pyarrow]
 5   Total Length of Bwd Packets  double[pyarrow]
 6   Fwd Packet Length Max        float[pyarrow] 
 7   Fwd Packet Length Min        float[pyarrow] 
 8   Fwd Packet Length Mean       double[pyarrow]
 9   Fwd Packet Length Std        double[pyarrow]
 10  Bwd Packet Length Max        float[pyarrow] 
 11  Bwd Packet Length Min        float[pyarrow] 
 12  Bwd Packet Length Mean       double[pyarrow]
 13  Bwd Packet Length Std        double[pyarrow]
 14  Flow Bytes/s                 double[pyarrow]
 15  Flow Packets/s               double

In [21]:
compute_ratio(ds,'Label')

                          Ratio
Label                          
Benign                    87.75
DDoS attacks-LOIC-HTTP     5.32
DDOS attack-HOIC           1.84
DoS attacks-Hulk           1.34
Bot                        1.34
Infilteration              1.05
SSH-Bruteforce             0.87
DoS attacks-GoldenEye      0.38
DoS attacks-Slowloris      0.09
DDOS attack-LOIC-UDP       0.02
Brute Force -Web           0.01
Brute Force -XSS           0.00
SQL Injection              0.00
DoS attacks-SlowHTTPTest   0.00
FTP-BruteForce             0.00


In [None]:
ds.Label = ds.Label.replace('Infilteration', 'Infiltration')
ds.Label = ds.Label.replace('DoS attacks-Hulk', 'DoS Hulk')
ds.Label = ds.Label.replace('SSH-Bruteforce ', 'SSH-Patator')
ds.Label = ds.Label.replace('FTP-BruteForce', 'FTP-Patator')
ds.Label = ds.Label.replace('Brute Force -Web', 'Web Attack  Brute Force')
ds.Label = ds.Label.replace('Brute Force -XSS', 'Web Attack  XSS')
ds.Label = ds.Label.replace('DDOS attack-HOIC', 'Web Attack  XSS')

In [20]:
ds.to_parquet("../datasets/CICIDS-2018.parquet", index=False)