In [1]:
import os
import pandas as pd
import pickle
import json


from src.dataset.dataset_info import datasets, DatasetInfo
from src.dataset.clean_dataset import clean_dataset
from src.dataset.create_class_num_col import one_dataset_class_num_col
from src.dataset.features_analysis import feature_analysis_pipeline
from src.dataset.undersample_classes import undersample_classes
from src.graph.graph_measures import calculate_graph_measures
from local_variables import datasets_main_path


In [2]:
# Pipeline switches. `with_sort_timestamp` gates the optional chronological
# sort further down; `with_undersample_classes` is presumably consumed by a
# later undersampling step (not visible in this part of the notebook).
with_sort_timestamp = False
with_undersample_classes = False

# Dataset being prepared: a short name, the raw input file and its format
# ("parquet", "csv" or "pkl").
name = "cic_ids_2017_5_percent"
original_path = "./testing_dfs/cic_ids_2017_5_percent.parquet"
file_type = "parquet"

# name = "cic_ton_iot_5_percent"
# original_path = "./testing_dfs/cic_ton_iot_5_percent.parquet"
# file_type = "parquet"

# Every artefact of this dataset lives under datasets/<name>/.
folder_path = os.path.join("datasets", name)
output_path = os.path.join(folder_path, name + ".parquet")

# makedirs also creates the missing "datasets" parent on a fresh checkout
# (os.mkdir would raise FileNotFoundError) and exist_ok makes re-running the
# cell a no-op without a separate isdir() check.
os.makedirs(folder_path, exist_ok=True)

print(f"==>> original_path: {original_path}")
print(f"==>> folder_path: {folder_path}")
print(f"==>> output_path: {output_path}")


==>> original_path: ./testing_dfs/cic_ids_2017_5_percent.parquet
==>> folder_path: datasets\cic_ids_2017_5_percent
==>> output_path: datasets\cic_ids_2017_5_percent\cic_ids_2017_5_percent.parquet


# Preparing Datasets

### Reading and Cleaning

In [3]:
# Load the raw dataset with the reader that matches `file_type`.
# The branches are mutually exclusive, and an unknown type fails loudly here
# instead of leaving `df` undefined (or stale from a previous run) for the
# cells below.
if file_type == "parquet":
    df = pd.read_parquet(original_path)
elif file_type == "csv":
    df = pd.read_csv(original_path)
elif file_type == "pkl":
    # NOTE(review): unpickling can execute arbitrary code; only load pickle
    # files that come from a trusted source.
    df = pd.read_pickle(original_path, compression="zip")
else:
    raise ValueError(
        f"Unsupported file_type {file_type!r}; expected 'parquet', 'csv' or 'pkl'"
    )

In [4]:
# Let notebook output show every column of the frame instead of eliding the middle ones.
pd.set_option("display.max_columns", df.shape[1])

In [5]:
# Preview the first rows of the freshly loaded (uncleaned) frame.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack
66292,192.168.10.1-192.168.10.3-53-60671-17,192.168.10.3,60671.0,192.168.10.1,53.0,17.0,03/07/2017 10:23:37,30919.0,1.0,1.0,41.0,69.0,41.0,41.0,41.0,0.0,69.0,69.0,69.0,0.0,3557.683,64.68515,30919.0,0.0,30919.0,30919.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,20.0,32.342573,32.342573,41.0,69.0,50.333333,16.165808,261.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75.5,41.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,41.0,1.0,69.0,-1.0,-1.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
244828,172.16.0.1-192.168.10.50-39546-80-6,172.16.0.1,39546.0,192.168.10.50,80.0,6.0,5/7/2017 10:55,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,274.0,-1.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DoS Hulk
420414,192.168.10.8-52.84.64.212-51938-443-6,52.84.64.212,443.0,192.168.10.8,51938.0,6.0,03/07/2017 11:20:18,3.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,4000000.0,666666.7,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,333333.333333,333333.333333,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,9.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,6.0,119.0,16360.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
393869,172.217.10.2-192.168.10.8-80-55154-6,192.168.10.8,55154.0,172.217.10.2,80.0,6.0,5/7/2017 1:53,115525809.0,16.0,14.0,442.0,122.0,358.0,0.0,27.625,88.112712,116.0,0.0,8.714286,30.920316,4.882026,0.2596822,3983648.586,4849805.159,9994305.0,83.0,116000000.0,7701720.6,10000000.0,36714.0,111000000.0,8508555.231,3716723.776,10000000.0,452.0,0.0,0.0,0.0,0.0,332.0,424.0,0.138497,0.121185,0.0,358.0,18.193548,66.307073,4396.627957,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.8,27.625,8.714286,0.0,0.0,0.0,0.0,0.0,0.0,16.0,442.0,14.0,122.0,8192.0,343.0,15.0,20.0,78974.81818,140027.3781,501173.0,36578.0,9976579.636,58066.24315,9994305.0,9801504.0,0,BENIGN
399570,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,


In [6]:
# Format handed to pd.to_datetime by the optional timestamp-sorting cell.
# "mixed" asks pandas to infer the format of each value individually; the
# commented line is an explicit day-first, 12-hour alternative.
timestamp_format = "mixed"
# timestamp_format="%d/%m/%Y %I:%M:%S %p"

In [7]:
# List the column labels of the raw frame.
df.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count',
       'ECE Flag Cnt', 'Dow

In [9]:
# Summary statistics over every column (numeric and non-numeric) before cleaning.
# In the recorded output the mean/max of 'Flow Byts/s' and 'Flow Pkts/s' are inf,
# i.e. the raw data still contains infinite values at this point.
df.describe(include="all")

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack
count,141475,141475,141475.0,141475,141475.0,141475.0,141475,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141419.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,155968.0,141475
unique,116612,5755,,8567,,,13414,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14
top,8.0.6.4-8.6.0.1-0-0-0,172.16.0.1,,192.168.10.3,,,7/7/2017 2:55,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,BENIGN
freq,61,27906,,34087,,,2290,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,113655
mean,,,41124.306287,,8071.785602,9.872175,,14784190.0,8.318735,9.014794,589.2447,13037.23,211.295508,18.697056,59.040798,70.134894,876.867772,41.02457,307.899132,338.094831,inf,inf,1320839.0,2951200.0,9257020.0,168804.6,14476020.0,2663891.0,9115654.0,1075501.0,9866952.0,1839549.0,1484041.0,4734505.0,1010619.0,0.046849,0.0,0.00012,0.0,-7975.593,-7371.709,63625.4,7093.419,16.445683,959.729139,173.324832,297.950659,493811.4,0.03513,0.046849,0.000269,0.298328,0.316317,0.094815,0.00012,0.000269,0.684665,193.47888,59.040798,307.899132,0.0,0.0,0.0,0.0,0.0,0.0,8.318735,589.2447,9.014794,13040.03,7015.472847,1959.062909,4.35197,-4361.648,81599.86,41127.9,153801.5,58197.05,8388645.0,492192.8,8760911.0,8001546.0,0.271293,
std,,,22324.611736,,18285.407133,5.260464,,33577790.0,591.783955,796.922806,10511.41,1798510.0,740.70266,60.440359,189.692462,287.534594,1955.350714,69.483855,607.482483,844.526666,,,4574566.0,8116673.0,24540310.0,2992404.0,33493830.0,9706261.0,24611200.0,8804841.0,28636610.0,9025832.0,6261658.0,17259420.0,8490852.0,0.211317,0.0,0.010961,0.0,2863420.0,2854741.0,245459.4,39393.38,25.607089,2043.741131,307.003468,636.449437,1667792.0,0.184109,0.211317,0.016387,0.457526,0.46504,0.292961,0.010961,0.016387,0.668355,333.428395,189.692462,607.482483,0.0,0.0,0.0,0.0,0.0,0.0,591.783955,10511.41,796.922806,1799119.0,14403.889129,8408.899082,468.157127,1444665.0,635155.9,400257.8,1048516.0,561322.1,23716260.0,4547756.0,24433440.0,23458840.0,0.444628,
min,,,0.0,,0.0,0.0,,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12000000.0,-2000000.0,-1.0,0.0,-1.0,-11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1073741000.0,-1073741000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,,,32586.5,,53.0,6.0,,155.0,2.0,1.0,12.0,2.0,6.0,0.0,6.0,0.0,2.0,0.0,1.2,0.0,119.5242,3.390808,64.0,0.0,125.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,20.0,1.73864,0.1213363,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.75,6.0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,1.0,2.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
50%,,,50955.0,,80.0,6.0,,31351.0,2.0,2.0,62.0,123.0,37.0,2.0,34.0,0.0,79.0,0.0,72.0,0.0,4569.191,108.4666,11619.33,139.431,30869.0,4.0,46.0,38.0,46.0,3.0,3.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,64.0,40.0,61.09295,19.65718,2.0,88.0,57.333333,26.290683,691.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,72.5,34.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,62.0,2.0,123.0,251.0,-1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
75%,,,58473.0,,443.0,17.0,,3266403.0,5.0,4.0,192.0,487.0,84.0,36.0,50.0,26.162951,285.0,77.0,183.0,82.024387,166666.7,23121.39,342069.2,698969.7,2539600.0,64.0,1261595.0,210993.0,993521.0,48.0,103136.5,18848.83,15995.42,62839.0,45.0,0.0,0.0,0.0,0.0,120.0,104.0,11976.05,7194.245,36.0,565.0,121.0,179.555934,32240.33,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,149.75,50.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,192.0,4.0,487.0,8192.0,235.0,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,


In [10]:
# Column-role metadata for the dataset being prepared; the cells below read
# all column names (flow id, endpoints, timestamp, label, class) from it.
# `name` and `path` follow the configuration cell so the metadata describes
# the dataset actually being processed -- they used to be hard-coded to
# "cic_bot_iot", a leftover from another dataset. The variable keeps its
# historical spelling ("dateset") because other cells reference it.
datesetInfo = DatasetInfo(
    name=name,
    path=output_path,
    file_type="parquet",  # output_path always ends in ".parquet"
    src_ip_col="Src IP",
    src_port_col="Src Port",
    dst_ip_col="Dst IP",
    dst_port_col="Dst Port",
    flow_id_col="Flow ID",
    timestamp_col="Timestamp",
    label_col="Label",
    class_col="Attack",
    class_num_col="Class",
    # Single source of truth: same format the timestamp-sorting cell uses.
    timestamp_format=timestamp_format,
    # Identifier-like columns and the textual class, passed to the feature
    # analysis as `drop_columns`.
    drop_columns=["Flow ID", "Src IP", "Dst IP",
                  "Timestamp", "Src Port", "Dst Port", "Attack"],
)

In [11]:
# (rows, columns) of the raw frame, for comparison with the counts printed by the cleaning step.
df.shape

(155968, 84)

In [12]:
# import numpy as np
# df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [13]:
# Numeric-only summary of the raw frame (still before cleaning).
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141419.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,141475.0,155968.0
mean,41124.306287,8071.785602,9.872175,14784190.0,8.318735,9.014794,589.2447,13037.23,211.295508,18.697056,59.040798,70.134894,876.867772,41.02457,307.899132,338.094831,inf,inf,1320839.0,2951200.0,9257020.0,168804.6,14476020.0,2663891.0,9115654.0,1075501.0,9866952.0,1839549.0,1484041.0,4734505.0,1010619.0,0.046849,0.0,0.00012,0.0,-7975.593,-7371.709,63625.4,7093.419,16.445683,959.729139,173.324832,297.950659,493811.4,0.03513,0.046849,0.000269,0.298328,0.316317,0.094815,0.00012,0.000269,0.684665,193.47888,59.040798,307.899132,0.0,0.0,0.0,0.0,0.0,0.0,8.318735,589.2447,9.014794,13040.03,7015.472847,1959.062909,4.35197,-4361.648,81599.86,41127.9,153801.5,58197.05,8388645.0,492192.8,8760911.0,8001546.0,0.271293
std,22324.611736,18285.407133,5.260464,33577790.0,591.783955,796.922806,10511.41,1798510.0,740.70266,60.440359,189.692462,287.534594,1955.350714,69.483855,607.482483,844.526666,,,4574566.0,8116673.0,24540310.0,2992404.0,33493830.0,9706261.0,24611200.0,8804841.0,28636610.0,9025832.0,6261658.0,17259420.0,8490852.0,0.211317,0.0,0.010961,0.0,2863420.0,2854741.0,245459.4,39393.38,25.607089,2043.741131,307.003468,636.449437,1667792.0,0.184109,0.211317,0.016387,0.457526,0.46504,0.292961,0.010961,0.016387,0.668355,333.428395,189.692462,607.482483,0.0,0.0,0.0,0.0,0.0,0.0,591.783955,10511.41,796.922806,1799119.0,14403.889129,8408.899082,468.157127,1444665.0,635155.9,400257.8,1048516.0,561322.1,23716260.0,4547756.0,24433440.0,23458840.0,0.444628
min,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12000000.0,-2000000.0,-1.0,0.0,-1.0,-11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1073741000.0,-1073741000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32586.5,53.0,6.0,155.0,2.0,1.0,12.0,2.0,6.0,0.0,6.0,0.0,2.0,0.0,1.2,0.0,119.5242,3.390808,64.0,0.0,125.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,20.0,1.73864,0.1213363,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.75,6.0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,1.0,2.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50955.0,80.0,6.0,31351.0,2.0,2.0,62.0,123.0,37.0,2.0,34.0,0.0,79.0,0.0,72.0,0.0,4569.191,108.4666,11619.33,139.431,30869.0,4.0,46.0,38.0,46.0,3.0,3.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,64.0,40.0,61.09295,19.65718,2.0,88.0,57.333333,26.290683,691.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,72.5,34.0,72.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,62.0,2.0,123.0,251.0,-1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,58473.0,443.0,17.0,3266403.0,5.0,4.0,192.0,487.0,84.0,36.0,50.0,26.162951,285.0,77.0,183.0,82.024387,166666.7,23121.39,342069.2,698969.7,2539600.0,64.0,1261595.0,210993.0,993521.0,48.0,103136.5,18848.83,15995.42,62839.0,45.0,0.0,0.0,0.0,0.0,120.0,104.0,11976.05,7194.245,36.0,565.0,121.0,179.555934,32240.33,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,149.75,50.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,192.0,4.0,487.0,8192.0,235.0,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,65535.0,65527.0,17.0,119999900.0,184029.0,244716.0,2321478.0,539997800.0,23360.0,2065.0,3867.0,6692.644993,13032.0,1460.0,3877.333333,6715.738331,inf,inf,120000000.0,84751950.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,82300000.0,120000000.0,120000000.0,1.0,0.0,1.0,0.0,3875868.0,4894320.0,3000000.0,2000000.0,1359.0,23360.0,2595.555556,4731.522394,22400000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.0,2920.0,3867.0,3877.333333,0.0,0.0,0.0,0.0,0.0,0.0,184029.0,2321478.0,244716.0,539997800.0,65535.0,65535.0,175660.0,60.0,85900000.0,50400000.0,87400000.0,85900000.0,120000000.0,73400000.0,120000000.0,120000000.0,1.0


In [14]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

Flow ID     14493
Src IP      14493
Src Port    14493
Dst IP      14493
Dst Port    14493
            ...  
Idle Std    14493
Idle Max    14493
Idle Min    14493
Label           0
Attack      14493
Length: 84, dtype: int64

In [15]:
def to_categorical_codes(column):
    """Return the integer codes that ``pd.Categorical`` assigns to `column`'s values."""
    return pd.Categorical(column).codes


def encode_categorical_columns(frame, columns):
    """Overwrite each listed column of `frame` with its categorical codes, in place.

    Columns that are absent from `frame` are reported with a printed warning
    and skipped, so a partially matching schema does not abort the cell.
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = to_categorical_codes(frame[col])
        else:
            print(f"Warning: Column '{col}' not found in the DataFrame")


def zero_out_placeholders(frame, placeholders):
    """Return a copy of `frame` where every cell equal to one of the
    `placeholders` strings is replaced by 0 (single element-wise pass)."""
    markers = frozenset(placeholders)
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class (not the instance) so a
    # column that happens to be called "map" cannot shadow it, and fall back
    # to applymap on older pandas.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(
        frame,
        lambda x: 0 if isinstance(x, str) and x in markers else x,
    )


# Dataset-specific fix-ups, selected by the configured dataset `name`.
# The names are mutually exclusive, hence a single if/elif chain. With any
# other name this cell only defines the helpers above.
if name == 'x_iiot':
    encode_categorical_columns(df, [
        'Protocol', 'Service', 'is_syn_only', 'Is_SYN_ACK',
        'is_pure_ack', 'is_with_payload', 'FIN or RST',
        'Bad_checksum', 'is_SYN_with_RST', 'anomaly_alert',
    ])
    # Binary label: 1 for 'Attack', 0 for everything else.
    df['class3'] = (df['class3'] == 'Attack').astype(int)
    # Placeholder strings left in the remaining cells are mapped to 0. This
    # runs after the encoding above, exactly as before, so placeholders inside
    # the encoded columns were already turned into their own category code.
    df = zero_out_placeholders(df, ['-', '?', '#DIV/0!', 'excel', 'aza', ' '])

elif name == 'edge_iiot':
    encode_categorical_columns(df, [
        'mqtt.protoname', 'mqtt.topic', 'mqtt.conack.flags', 'mqtt.msg',
        'http.request.method', 'dns.qry.name.len', 'arp.src.proto_ipv4',
        'http.request.full_uri', 'http.file_data', 'http.request.version',
        'arp.dst.proto_ipv4', 'http.request.uri.query', 'tcp.srcport',
        'http.referer',
    ])

elif name == 'ccd_inid_modified':
    # 'requested_server_name' and 'client_fingerprint' used to be encoded here
    # and then dropped two lines later; encoding them was dead work.
    encode_categorical_columns(df, [
        'splt_direction', 'splt_ps', 'splt_piat_ms',
        'application_name', 'application_category_name',
    ])
    # Binary label: 1 for 'attack', 0 for everything else.
    df['traffic_type'] = (df['traffic_type'] == 'attack').astype(int)
    # Still raises KeyError if one of these columns is missing.
    df = df.drop(columns=['requested_server_name', 'client_fingerprint',
                          'server_fingerprint', 'user_agent', 'content_type'])

elif name == 'cic_ton_iot_modified':
    df = df.drop(columns=['datetime'])


    

In [16]:
# Run the project's cleaning step; per its printed log it reports the row
# count after dropping NA rows and after dropping duplicates.
df = clean_dataset(
    df,
    flow_id_col=datesetInfo.flow_id_col,
    timestamp_col=datesetInfo.timestamp_col,
)

==>> original df.shape[0]: 155968
==>> after drop na df.shape[0]: 141344
==>> after drop_duplicates df.shape[0]: 141255


In [17]:
# Turn the four endpoint columns (source/destination IP and port) into plain
# Python strings, one column at a time.
for endpoint_col in (
    datesetInfo.src_ip_col,
    datesetInfo.src_port_col,
    datesetInfo.dst_ip_col,
    datesetInfo.dst_port_col,
):
    df[endpoint_col] = df[endpoint_col].apply(str)

In [18]:
# Preview the frame again after cleaning and the endpoint string conversion.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack
66292,192.168.10.1-192.168.10.3-53-60671-17,192.168.10.3,60671.0,192.168.10.1,53.0,17.0,03/07/2017 10:23:37,30919.0,1.0,1.0,41.0,69.0,41.0,41.0,41.0,0.0,69.0,69.0,69.0,0.0,3557.683,64.685145,30919.0,0.0,30919.0,30919.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,20.0,32.342573,32.342573,41.0,69.0,50.333333,16.165808,261.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75.5,41.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,41.0,1.0,69.0,-1.0,-1.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
420414,192.168.10.8-52.84.64.212-51938-443-6,52.84.64.212,443.0,192.168.10.8,51938.0,6.0,03/07/2017 11:20:18,3.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,4000000.0,666666.666667,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,333333.333333,333333.333333,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,9.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,6.0,119.0,16360.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
393869,172.217.10.2-192.168.10.8-80-55154-6,192.168.10.8,55154.0,172.217.10.2,80.0,6.0,5/7/2017 1:53,115525809.0,16.0,14.0,442.0,122.0,358.0,0.0,27.625,88.112712,116.0,0.0,8.714286,30.920316,4.882026,0.259682,3983649.0,4849805.0,9994305.0,83.0,116000000.0,7701720.6,10000000.0,36714.0,111000000.0,8508555.231,3716723.776,10000000.0,452.0,0.0,0.0,0.0,0.0,332.0,424.0,0.138497,0.121185,0.0,358.0,18.193548,66.307073,4396.627957,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.8,27.625,8.714286,0.0,0.0,0.0,0.0,0.0,0.0,16.0,442.0,14.0,122.0,8192.0,343.0,15.0,20.0,78974.81818,140027.3781,501173.0,36578.0,9976579.636,58066.24315,9994305.0,9801504.0,0,BENIGN
319307,192.168.10.1-192.168.10.3-53-61248-17,192.168.10.3,61248.0,192.168.10.1,53.0,17.0,03/07/2017 04:15:49,60594.0,1.0,1.0,45.0,235.0,45.0,45.0,45.0,0.0,235.0,235.0,235.0,0.0,4620.92,33.006568,60594.0,0.0,60594.0,60594.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,16.503284,16.503284,45.0,235.0,108.333333,109.696551,12033.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,162.5,45.0,235.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,45.0,1.0,235.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN
419002,192.168.10.3-192.168.10.12-53-32968-17,192.168.10.12,32968.0,192.168.10.3,53.0,17.0,03/07/2017 09:40:12,310.0,2.0,2.0,68.0,412.0,34.0,34.0,34.0,0.0,206.0,206.0,206.0,0.0,1548387.0,12903.225806,103.3333,177.2465,308.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,40.0,64.0,6451.612903,6451.612903,34.0,206.0,102.8,94.20828,8875.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,128.5,34.0,206.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,68.0,2.0,412.0,-1.0,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN


In [19]:
# Numeric summary after cleaning; the port columns no longer appear because they are strings now.
df.describe()

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0
mean,9.873031,14807220.0,8.327422,9.028417,589.9394,13057.53,211.57302,18.676167,59.081966,70.243025,878.231114,41.086114,308.376325,338.621402,1513099.0,70321.38,1322896.0,2955796.0,9271438.0,169067.4,14498570.0,2668040.0,9129851.0,1077176.0,9882319.0,1842414.0,1486353.0,4741879.0,1012193.0,0.04683,0.0,0.000113,0.0,-7988.132,-7383.2,63290.14,7102.04,16.43619,961.172029,173.548602,298.405037,494570.4,0.035085,0.04683,0.000269,0.298687,0.315883,0.094878,0.000113,0.000269,0.685321,193.716278,59.081966,308.376325,0.0,0.0,0.0,0.0,0.0,0.0,8.327422,589.9394,9.028417,13060.33,7020.166677,1958.582769,4.356575,-4368.483,81726.95,41191.96,154041.1,58287.69,8401710.0,492959.3,8774555.0,8014008.0,0.196482
std,5.260671,33598860.0,592.244566,797.543085,10519.56,1799910.0,741.224496,60.22243,189.753834,287.745321,1956.567387,69.520074,607.834884,845.078583,26414220.0,252094.5,4577830.0,8122155.0,24556690.0,2994726.0,33515020.0,9713247.0,24627730.0,8811592.0,28656250.0,9032565.0,6266258.0,17271850.0,8497368.0,0.211276,0.0,0.010642,0.0,2865649.0,2856963.0,244936.8,39413.5,25.580409,2044.997003,307.165319,636.832775,1668975.0,0.183997,0.211276,0.0164,0.457684,0.464868,0.293047,0.010642,0.0164,0.668443,333.58474,189.753834,607.834884,0.0,0.0,0.0,0.0,0.0,0.0,592.244566,10519.56,797.543085,1800519.0,14405.828142,8408.985025,468.521524,1445789.0,635642.1,400566.1,1049314.0,561754.4,23732410.0,4551254.0,24450020.0,23474980.0,0.397338
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12000000.0,-2000000.0,-1.0,0.0,-1.0,-11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1073741000.0,-1073741000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,156.0,2.0,1.0,12.0,4.0,6.0,0.0,6.0,0.0,2.0,0.0,2.0,0.0,119.507,3.351313,64.0,0.0,126.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,20.0,1.751075,0.1221032,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,1.0,4.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,31389.0,2.0,2.0,62.0,124.0,37.0,2.0,34.0,0.0,80.0,0.0,73.0,0.0,4552.126,106.6524,11775.33,141.4508,30888.0,4.0,47.0,43.0,46.0,3.0,3.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,64.0,40.0,61.1733,19.7486,2.0,88.0,57.4,26.290683,691.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,72.5,34.0,73.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,62.0,2.0,124.0,251.0,-1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17.0,3319227.0,5.0,4.0,196.0,493.0,84.0,36.0,50.0,26.162951,288.0,77.0,183.0,84.0,164383.6,22988.51,345447.3,707447.4,2624044.0,64.0,1282444.0,218584.5,995900.5,48.0,104216.0,19017.33,16108.2,63208.5,45.0,0.0,0.0,0.0,0.0,120.0,108.0,11976.05,7246.377,36.0,575.0,121.212406,181.002257,32761.82,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,149.885621,50.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,196.0,4.0,493.0,8192.0,235.0,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,17.0,119999900.0,184029.0,244716.0,2321478.0,539997800.0,23360.0,1983.0,3867.0,6692.644993,13032.0,1460.0,3877.333333,6715.738331,2070000000.0,3000000.0,120000000.0,84751950.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,82300000.0,120000000.0,120000000.0,1.0,0.0,1.0,0.0,3875868.0,4894320.0,3000000.0,2000000.0,1359.0,23360.0,2595.555556,4731.522394,22400000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.0,2920.0,3867.0,3877.333333,0.0,0.0,0.0,0.0,0.0,0.0,184029.0,2321478.0,244716.0,539997800.0,65535.0,65535.0,175660.0,60.0,85900000.0,50400000.0,87400000.0,85900000.0,120000000.0,73400000.0,120000000.0,120000000.0,1.0


In [20]:
# Feature analysis over the cleaned frame. Only the two collections of
# dropped column names are kept (by name: variance-based and
# correlation-based); the first return value is discarded.
_, var_dropped, corr_dropped = feature_analysis_pipeline(
    df=df,
    drop_columns=datesetInfo.drop_columns,
    label_col=datesetInfo.label_col,
)
var_dropped, corr_dropped

(['Bwd PSH Flags',
  'Bwd URG Flags',
  'Fwd Byts/b Avg',
  'Fwd Pkts/b Avg',
  'Fwd Blk Rate Avg',
  'Bwd Byts/b Avg',
  'Bwd Pkts/b Avg',
  'Bwd Blk Rate Avg'],
 {'Active Mean',
  'Active Std',
  'Bwd Header Len',
  'Bwd IAT Mean',
  'Bwd IAT Std',
  'Bwd Pkt Len Max',
  'Bwd Pkt Len Mean',
  'Bwd Pkt Len Std',
  'Flow Duration',
  'Flow IAT Max',
  'Flow IAT Mean',
  'Flow IAT Std',
  'Flow Pkts/s',
  'Fwd Header Len',
  'Fwd IAT Max',
  'Fwd IAT Mean',
  'Fwd IAT Min',
  'Fwd IAT Tot',
  'Fwd PSH Flags',
  'Fwd Pkt Len Max',
  'Fwd Pkt Len Mean',
  'Fwd Pkt Len Std',
  'Fwd URG Flags',
  'Idle Max',
  'Idle Mean',
  'Pkt Len Max',
  'Pkt Len Mean',
  'Pkt Len Std',
  'Pkt Len Var',
  'Pkt Size Avg',
  'Protocol',
  'RST Flag Cnt',
  'Subflow Bwd Byts',
  'Subflow Bwd Pkts',
  'Subflow Fwd Pkts',
  'Tot Bwd Pkts',
  'Tot Fwd Pkts',
  'TotLen Bwd Pkts',
  'TotLen Fwd Pkts'})

In [21]:
# Debug aid: show which container type each result uses (list vs set) before merging them.
print(f"==>> type(var_dropped): {type(var_dropped)}")
print(f"==>> type(corr_dropped): {type(corr_dropped)}")

==>> type(var_dropped): <class 'list'>
==>> type(corr_dropped): <class 'set'>


In [22]:
# Normalise `var_dropped` to a set, then merge both groups of dropped
# features into one set of "weak" columns and display it.
var_dropped = set(var_dropped)
weak_columns = var_dropped | set(corr_dropped)
weak_columns

{'Active Mean',
 'Active Std',
 'Bwd Blk Rate Avg',
 'Bwd Byts/b Avg',
 'Bwd Header Len',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd PSH Flags',
 'Bwd Pkt Len Max',
 'Bwd Pkt Len Mean',
 'Bwd Pkt Len Std',
 'Bwd Pkts/b Avg',
 'Bwd URG Flags',
 'Flow Duration',
 'Flow IAT Max',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow Pkts/s',
 'Fwd Blk Rate Avg',
 'Fwd Byts/b Avg',
 'Fwd Header Len',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Fwd IAT Min',
 'Fwd IAT Tot',
 'Fwd PSH Flags',
 'Fwd Pkt Len Max',
 'Fwd Pkt Len Mean',
 'Fwd Pkt Len Std',
 'Fwd Pkts/b Avg',
 'Fwd URG Flags',
 'Idle Max',
 'Idle Mean',
 'Pkt Len Max',
 'Pkt Len Mean',
 'Pkt Len Std',
 'Pkt Len Var',
 'Pkt Size Avg',
 'Protocol',
 'RST Flag Cnt',
 'Subflow Bwd Byts',
 'Subflow Bwd Pkts',
 'Subflow Fwd Pkts',
 'Tot Bwd Pkts',
 'Tot Fwd Pkts',
 'TotLen Bwd Pkts',
 'TotLen Fwd Pkts'}

### Attack Types

In [23]:
# List the distinct values of the class column (the attack-type names).
df[datesetInfo.class_col].unique()

array(['BENIGN', 'PortScan', 'DoS Hulk', 'DoS slowloris', 'DDoS',
       'DoS Slowhttptest', 'FTP-Patator', 'SSH-Patator', 'DoS GoldenEye',
       'Web Attack � Brute Force', 'Infiltration', 'Bot',
       'Web Attack � XSS', 'Web Attack � Sql Injection'], dtype=object)

In [24]:
# Disabled: optional renaming of some raw class labels to shorter names.
# NOTE(review): the snippet refers to a bare `class_col`, whereas the active
# cells use `datesetInfo.class_col` — adjust before re-enabling it.
# df[class_col] = df[class_col].replace({"BENIGN": "Benign",
#                                        "DDoS": "ddos",
#                                        "Web Attack � Brute Force": "bruteforce",
#                                        "Web Attack � XSS": "xss"})

In [25]:
# Keep the distinct class names; this array is pickled together with
# `labels_names` further down.
classes = df[datesetInfo.class_col].unique()

### Sorting (optional)

In [26]:
# Optional chronological ordering: runs only when the flag is enabled and the
# dataset declares a timestamp column.
if with_sort_timestamp and datesetInfo.timestamp_col:
    ts_col = datesetInfo.timestamp_col
    # Strip surrounding whitespace from the raw strings before parsing them.
    # NOTE(review): `timestamp_format` is not defined in the visible part of
    # this notebook — confirm it is set before enabling this step.
    df[ts_col] = pd.to_datetime(df[ts_col].str.strip(), format=timestamp_format)
    df.sort_values(by=ts_col, inplace=True)

### Encoding Attacks into integers

In [27]:
# Add the numeric class column. The helper returns the updated frame and
# `labels_names`, which the output below shows as {integer code: class name}.
df, labels_names = one_dataset_class_num_col(
    df,
    datesetInfo.class_num_col,
    datesetInfo.class_col,
)

==>> labels_names: {0: 'BENIGN', 1: 'Bot', 2: 'DDoS', 3: 'DoS GoldenEye', 4: 'DoS Hulk', 5: 'DoS Slowhttptest', 6: 'DoS slowloris', 7: 'FTP-Patator', 8: 'Infiltration', 9: 'PortScan', 10: 'SSH-Patator', 11: 'Web Attack � Brute Force', 12: 'Web Attack � Sql Injection', 13: 'Web Attack � XSS'}


In [28]:
# Number of records per class.
df.groupby(datesetInfo.class_col).size()

Attack
BENIGN                        113501
Bot                               87
DDoS                            6391
DoS GoldenEye                    519
DoS Hulk                       11512
DoS Slowhttptest                 259
DoS slowloris                    283
FTP-Patator                      377
Infiltration                       6
PortScan                        7916
SSH-Patator                      289
Web Attack � Brute Force          76
Web Attack � Sql Injection         3
Web Attack � XSS                  36
dtype: int64

### Undersampling classes (optional)

In [29]:
if with_undersample_classes:
    # Records per class, ordered from the most to the least populated class.
    # `class_counts_sorted` is consumed by the undersampling cell that follows.
    class_counts = df.groupby(by=datesetInfo.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [30]:
if with_undersample_classes:
    # Tunables for this step, named so they are easy to find and change.
    n_largest_classes = 2   # how many of the most populated classes to shrink
    keep_fraction = 0.5     # fraction of rows kept from each of those classes
    undersample_seed = 42   # fixed seed so a re-run selects the same rows

    # `class_counts_sorted` (built in the previous cell) is in descending order,
    # so its leading entries are the classes with the most records.
    classes_to_undersample = class_counts_sorted.index[:n_largest_classes]

    # One frame per class: a random subset for the largest classes, every row
    # for all the others.
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        class_df = df[df[datesetInfo.class_col] == class_label]
        if class_label in classes_to_undersample:
            class_df = class_df.sample(frac=keep_fraction, random_state=undersample_seed)
        dfs.append(class_df)

    # Drop the reference to the original frame before concatenating so it is
    # not kept alive next to the per-class pieces and the combined result.
    del df

    # Recombine, shuffle the rows (seeded), and rebuild a clean 0..n-1 index.
    df = (
        pd.concat(dfs)
        .sample(frac=1, random_state=undersample_seed)
        .reset_index(drop=True)
    )


In [31]:
if with_undersample_classes:
    # Recompute the per-class record counts on the undersampled frame and print
    # them in descending order to show the effect of the previous step.
    class_counts = df.groupby(by=datesetInfo.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

### Saving label encodings and dataset properties

In [32]:
# Persist the class encoding next to the dataset: a two-item list holding the
# `labels_names` mapping and the `classes` array.
# The path is built with os.path.join, like the other paths in this notebook,
# instead of concatenating a hard-coded "/" separator.
with open(os.path.join(folder_path, 'labels_names.pkl'), 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [33]:
# Check which values the label column contains before counting them.
df[datesetInfo.label_col].unique()

array([0, 1], dtype=int64)

In [34]:
# Summarise the prepared dataset and store the summary as JSON in the dataset
# folder.
total_count = len(df)

# Label 0 is counted as benign and label 1 as attack. Summing the boolean mask
# avoids materialising a filtered copy of the frame; int() converts the numpy
# integer to a plain int so json.dump can serialise it.
num_benign = int((df[datesetInfo.label_col] == 0).sum())
num_attack = int((df[datesetInfo.label_col] == 1).sum())

properties = {
    "name": name,
    "length": total_count,
    "num_benign": num_benign,
    "percentage_of_benign_records": ((num_benign * 100)/total_count),
    "num_attack": num_attack,
    "percentage_of_attack_records": ((num_attack * 100)/total_count),
    "attacks": list(df[datesetInfo.class_col].unique()),
    # `weak_columns` is a set; sorting gives the saved file a stable order
    # from one run to the next.
    "weak_columns": sorted(weak_columns),
}

with open(os.path.join(folder_path, 'df_properties.json'), 'w') as f:
    json.dump(properties, f)

properties


{'name': 'cic_ids_2017_5_percent',
 'length': 141255,
 'num_benign': 113501,
 'percentage_of_benign_records': 80.35184595235567,
 'num_attack': 27754,
 'percentage_of_attack_records': 19.648154047644333,
 'attacks': ['BENIGN',
  'PortScan',
  'DoS Hulk',
  'DoS slowloris',
  'DDoS',
  'DoS Slowhttptest',
  'FTP-Patator',
  'SSH-Patator',
  'DoS GoldenEye',
  'Web Attack � Brute Force',
  'Infiltration',
  'Bot',
  'Web Attack � XSS',
  'Web Attack � Sql Injection'],
 'weak_columns': ['Fwd IAT Tot',
  'Fwd Pkt Len Std',
  'Subflow Bwd Pkts',
  'Fwd PSH Flags',
  'Bwd Header Len',
  'Bwd Byts/b Avg',
  'Bwd Pkts/b Avg',
  'Bwd Pkt Len Std',
  'Fwd Header Len',
  'Fwd Blk Rate Avg',
  'Fwd Byts/b Avg',
  'Bwd Pkt Len Mean',
  'Fwd Pkt Len Max',
  'Fwd IAT Max',
  'Subflow Bwd Byts',
  'Bwd URG Flags',
  'Tot Fwd Pkts',
  'Fwd Pkt Len Mean',
  'Flow Pkts/s',
  'TotLen Bwd Pkts',
  'Active Mean',
  'Idle Mean',
  'Idle Max',
  'RST Flag Cnt',
  'Bwd Blk Rate Avg',
  'Bwd Pkt Len Max',
  'Fw

In [35]:
# Write the prepared frame to `output_path` (set in the configuration cell).
df.to_parquet(output_path)

In [36]:
# Final (rows, columns) of the saved frame.
df.shape

(141255, 85)

In [37]:
# Preview the first rows of the saved frame.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,...,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
66292,192.168.10.1-192.168.10.3-53-60671-17,192.168.10.3,60671.0,192.168.10.1,53.0,17.0,03/07/2017 10:23:37,30919.0,1.0,1.0,41.0,69.0,41.0,41.0,41.0,0.0,69.0,69.0,69.0,0.0,3557.683,64.685145,30919.0,0.0,30919.0,30919.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,20.0,32.342573,...,41.0,69.0,50.333333,16.165808,261.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,75.5,41.0,69.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,41.0,1.0,69.0,-1.0,-1.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
420414,192.168.10.8-52.84.64.212-51938-443-6,52.84.64.212,443.0,192.168.10.8,51938.0,6.0,03/07/2017 11:20:18,3.0,1.0,1.0,6.0,6.0,6.0,6.0,6.0,0.0,6.0,6.0,6.0,0.0,4000000.0,666666.666667,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,333333.333333,...,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,9.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,1.0,6.0,119.0,16360.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
393869,172.217.10.2-192.168.10.8-80-55154-6,192.168.10.8,55154.0,172.217.10.2,80.0,6.0,5/7/2017 1:53,115525809.0,16.0,14.0,442.0,122.0,358.0,0.0,27.625,88.112712,116.0,0.0,8.714286,30.920316,4.882026,0.259682,3983649.0,4849805.0,9994305.0,83.0,116000000.0,7701720.6,10000000.0,36714.0,111000000.0,8508555.231,3716723.776,10000000.0,452.0,0.0,0.0,0.0,0.0,332.0,424.0,0.138497,...,0.0,358.0,18.193548,66.307073,4396.627957,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.8,27.625,8.714286,0.0,0.0,0.0,0.0,0.0,0.0,16.0,442.0,14.0,122.0,8192.0,343.0,15.0,20.0,78974.81818,140027.3781,501173.0,36578.0,9976579.636,58066.24315,9994305.0,9801504.0,0,BENIGN,0
319307,192.168.10.1-192.168.10.3-53-61248-17,192.168.10.3,61248.0,192.168.10.1,53.0,17.0,03/07/2017 04:15:49,60594.0,1.0,1.0,45.0,235.0,45.0,45.0,45.0,0.0,235.0,235.0,235.0,0.0,4620.92,33.006568,60594.0,0.0,60594.0,60594.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,16.503284,...,45.0,235.0,108.333333,109.696551,12033.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,162.5,45.0,235.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,45.0,1.0,235.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
419002,192.168.10.3-192.168.10.12-53-32968-17,192.168.10.12,32968.0,192.168.10.3,53.0,17.0,03/07/2017 09:40:12,310.0,2.0,2.0,68.0,412.0,34.0,34.0,34.0,0.0,206.0,206.0,206.0,0.0,1548387.0,12903.225806,103.3333,177.2465,308.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,40.0,64.0,6451.612903,...,34.0,206.0,102.8,94.20828,8875.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,128.5,34.0,206.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,68.0,2.0,412.0,-1.0,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0


In [38]:
# Summary statistics of the numeric columns in the final frame.
df.describe()

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Class
count,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0,141255.0
mean,9.873031,14807220.0,8.327422,9.028417,589.9394,13057.53,211.57302,18.676167,59.081966,70.243025,878.231114,41.086114,308.376325,338.621402,1513099.0,70321.38,1322896.0,2955796.0,9271438.0,169067.4,14498570.0,2668040.0,9129851.0,1077176.0,9882319.0,1842414.0,1486353.0,4741879.0,1012193.0,0.04683,0.0,0.000113,0.0,-7988.132,-7383.2,63290.14,7102.04,16.43619,961.172029,173.548602,298.405037,494570.4,0.035085,0.04683,0.000269,0.298687,0.315883,0.094878,0.000113,0.000269,0.685321,193.716278,59.081966,308.376325,0.0,0.0,0.0,0.0,0.0,0.0,8.327422,589.9394,9.028417,13060.33,7020.166677,1958.582769,4.356575,-4368.483,81726.95,41191.96,154041.1,58287.69,8401710.0,492959.3,8774555.0,8014008.0,0.196482,1.002641
std,5.260671,33598860.0,592.244566,797.543085,10519.56,1799910.0,741.224496,60.22243,189.753834,287.745321,1956.567387,69.520074,607.834884,845.078583,26414220.0,252094.5,4577830.0,8122155.0,24556690.0,2994726.0,33515020.0,9713247.0,24627730.0,8811592.0,28656250.0,9032565.0,6266258.0,17271850.0,8497368.0,0.211276,0.0,0.010642,0.0,2865649.0,2856963.0,244936.8,39413.5,25.580409,2044.997003,307.165319,636.832775,1668975.0,0.183997,0.211276,0.0164,0.457684,0.464868,0.293047,0.010642,0.0164,0.668443,333.58474,189.753834,607.834884,0.0,0.0,0.0,0.0,0.0,0.0,592.244566,10519.56,797.543085,1800519.0,14405.828142,8408.985025,468.521524,1445789.0,635642.1,400566.1,1049314.0,561754.4,23732410.0,4551254.0,24450020.0,23474980.0,0.397338,2.370643
min,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-12000000.0,-2000000.0,-1.0,0.0,-1.0,-11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1073741000.0,-1073741000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,156.0,2.0,1.0,12.0,4.0,6.0,0.0,6.0,0.0,2.0,0.0,2.0,0.0,119.507,3.351313,64.0,0.0,126.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,20.0,1.751075,0.1221032,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,1.0,4.0,-1.0,-1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,31389.0,2.0,2.0,62.0,124.0,37.0,2.0,34.0,0.0,80.0,0.0,73.0,0.0,4552.126,106.6524,11775.33,141.4508,30888.0,4.0,47.0,43.0,46.0,3.0,3.0,3.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,64.0,40.0,61.1733,19.7486,2.0,88.0,57.4,26.290683,691.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,72.5,34.0,73.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,62.0,2.0,124.0,251.0,-1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17.0,3319227.0,5.0,4.0,196.0,493.0,84.0,36.0,50.0,26.162951,288.0,77.0,183.0,84.0,164383.6,22988.51,345447.3,707447.4,2624044.0,64.0,1282444.0,218584.5,995900.5,48.0,104216.0,19017.33,16108.2,63208.5,45.0,0.0,0.0,0.0,0.0,120.0,108.0,11976.05,7246.377,36.0,575.0,121.212406,181.002257,32761.82,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,149.885621,50.0,183.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,196.0,4.0,493.0,8192.0,235.0,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,17.0,119999900.0,184029.0,244716.0,2321478.0,539997800.0,23360.0,1983.0,3867.0,6692.644993,13032.0,1460.0,3877.333333,6715.738331,2070000000.0,3000000.0,120000000.0,84751950.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,120000000.0,82300000.0,120000000.0,120000000.0,1.0,0.0,1.0,0.0,3875868.0,4894320.0,3000000.0,2000000.0,1359.0,23360.0,2595.555556,4731.522394,22400000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.0,2920.0,3867.0,3877.333333,0.0,0.0,0.0,0.0,0.0,0.0,184029.0,2321478.0,244716.0,539997800.0,65535.0,65535.0,175660.0,60.0,85900000.0,50400000.0,87400000.0,85900000.0,120000000.0,73400000.0,120000000.0,120000000.0,1.0,13.0
