In [37]:
import os
import pandas as pd
import pickle
import json


from src.dataset.dataset_info import datasets, DatasetInfo
from src.dataset.clean_dataset import clean_dataset
from src.dataset.create_class_num_col import one_dataset_class_num_col
from src.dataset.features_analysis import feature_analysis_pipeline
from src.dataset.undersample_classes import undersample_classes
from src.graph.graph_measures import calculate_graph_measures
from local_variables import datasets_main_path


In [38]:
# Pipeline switches.
# NOTE(review): neither flag is read in the part of the notebook visible here;
# presumably they are consumed by later cells - confirm before removing.
with_sort_timestamp = False
with_undersample_classes = False

# Registry of the raw inputs this notebook can load:
#   dataset name -> (path of the raw file, file type understood by the loader cell)
# Select a dataset by editing `name` below instead of commenting blocks in and
# out; this keeps `name`, `original_path` and `file_type` from drifting apart.
dataset_sources = {
    "cic_ids_2017_5_percent": ("./testing_dfs/cic_ids_2017_5_percent.parquet", "parquet"),
    "cic_ton_iot_5_percent": ("./testing_dfs/cic_ton_iot_5_percent.parquet", "parquet"),
    "cic_ton_iot": (datasets_main_path + "cic_ton_iot.parquet", "parquet"),
    "cic_ids_2017": (datasets_main_path + "cic_ids_2017.parquet", "parquet"),
    "cic_bot_iot": (datasets_main_path + "cic_bot_iot.parquet", "parquet"),
    "cic_ton_iot_modified": (datasets_main_path + "CIC-ToN-IoT-Modified.pkl", "pkl"),
    "nf_ton_iotv2_modified": (datasets_main_path + "NF-ToN-IoT-v2-Modified.pkl", "pkl"),
    "ccd_inid_modified": (datasets_main_path + "CCD-INID-Modified.pkl", "pkl"),
    "nf_uq_nids_modified": (datasets_main_path + "NF-UQ-NIDS-Modified.pkl", "pkl"),
}

# Dataset names that were listed in this cell without a raw path or file type.
# Add them to `dataset_sources` before selecting them:
#   nf_bot_iot, edge_iiot, nf_cse_cic_ids2018, nf_bot_iotv2, nf_uq_nids, x_iiot

name = "cic_bot_iot"

# Fail loudly on an unregistered name rather than silently reusing a stale
# `original_path` / `file_type` left in the kernel by a previous run.
if name not in dataset_sources:
    raise KeyError(
        f"No raw source registered for dataset {name!r}; "
        f"known datasets: {sorted(dataset_sources)}"
    )
original_path, file_type = dataset_sources[name]

# Everything produced for this dataset lives under datasets/<name>/.
folder_path = os.path.join("datasets", name)
output_path = os.path.join(folder_path, name + ".parquet")

# makedirs also creates the parent "datasets" directory when it is missing
# (os.mkdir would raise FileNotFoundError), and exist_ok=True makes the cell
# safe to re-run without a separate isdir() check.
os.makedirs(folder_path, exist_ok=True)

print(f"==>> original_path: {original_path}")
print(f"==>> folder_path: {folder_path}")
print(f"==>> output_path: {output_path}")


==>> original_path: c:\Users\Administrateur\Desktop\datasets\cic_bot_iot.parquet
==>> folder_path: datasets\cic_bot_iot
==>> output_path: datasets\cic_bot_iot\cic_bot_iot.parquet


# Preparing Datasets

### Reading and Cleaning

In [39]:
# Load the raw dataset with the reader that matches `file_type`.
# The branches are mutually exclusive, and an unknown type raises immediately
# instead of leaving `df` undefined (or stale from a previous kernel run).
if file_type == "parquet":
    df = pd.read_parquet(original_path)
elif file_type == "csv":
    df = pd.read_csv(original_path)
elif file_type == "pkl":
    # SECURITY: unpickling can execute arbitrary code - only load pickle files
    # that come from a trusted source.
    df = pd.read_pickle(original_path, compression="zip")
else:
    raise ValueError(
        f"Unsupported file_type {file_type!r}; expected 'parquet', 'csv' or 'pkl'"
    )

In [40]:
# Let pandas render every column of this frame instead of eliding the middle ones.
pd.set_option("display.max_columns", df.shape[1])

In [41]:
# Preview the first rows of the freshly loaded frame to check the load and the column layout.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack
0,192.168.100.150-192.168.100.3-54114-44581-6,192.168.100.150,54114.0,192.168.100.3,44581.0,6.0,21/05/2018 08:34:13 PM,64465.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.024587,64465.0,0.0,64465.0,64465.0,0.0,0.0,0.0,0.0,64465.0,64465.0,0.0,64465.0,64465.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,31.024587,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance
1,192.168.100.147-192.168.100.7-1649-80-6,192.168.100.147,1649.0,192.168.100.7,80.0,6.0,04/06/2018 05:38:04 PM,22646986.0,3.0,1.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,100.0,100.0,0.0,17.662394,0.176624,7548995.0,5561972.0,13839423.0,3281987.0,19364999.0,9682500.0,13839423.0,5525576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.132468,0.044156,100.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,300.0,1.0,100.0,-1.0,512.0,3.0,0.0,3281987.0,0.0,3281987.0,3281987.0,9682499.5,5878778.0,13839423.0,5525576.0,1,DDoS
2,192.168.100.148-192.168.100.3-59032-80-6,192.168.100.148,59032.0,192.168.100.3,80.0,6.0,04/06/2018 01:41:34 PM,153463.0,1.0,2.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,0.0,50.0,70.710678,651.622867,19.548686,76731.5,76717.55,130979.0,22484.0,0.0,0.0,0.0,0.0,22484.0,22484.0,0.0,22484.0,22484.0,0.0,0.0,0.0,0.0,20.0,44.0,6.516229,13.032457,0.0,100.0,50.0,57.735027,3333.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,66.666667,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,-1.0,29200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance
3,192.168.100.149-192.168.100.7-2820-80-17,192.168.100.149,2820.0,192.168.100.7,80.0,17.0,04/06/2018 02:29:28 PM,24915470.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200679,6228868.0,2337803.0,9613711.0,4568562.0,20138439.0,6712813.0,9613711.0,4568562.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,8.0,0.160543,0.040136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,4777031.0,0.0,4777031.0,4777031.0,7784938.5,2586275.0,9613711.0,5956166.0,1,DDoS
4,192.168.100.150-192.168.100.5-5146-80-17,192.168.100.150,5146.0,192.168.100.5,80.0,17.0,04/06/2018 06:17:12 PM,46437692.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193808,5804712.0,3854238.0,14796053.0,3089320.0,42448141.0,6064020.0,14796053.0,3089320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,8.0,0.172274,0.021534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,9597806.5,7235516.0,14714089.0,4481524.0,9080693.0,5048133.0,14796053.0,5230723.0,1,DDoS


In [42]:
# Timestamp parsing format for this dataset.
# NOTE(review): the sample rows look day-first ("21/05/2018 08:34:13 PM"), which
# matches the explicit format kept below. If "mixed" ends up in pd.to_datetime
# without dayfirst=True, ambiguous values such as "04/06/2018" may be read as
# month-first - confirm how the timestamp column is actually parsed.
timestamp_format = "mixed"
# timestamp_format="%d/%m/%Y %I:%M:%S %p"

In [43]:
# List the column names; the DatasetInfo definition further down refers to columns by these names.
df.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count',
       'ECE Flag Cnt', 'Dow

In [44]:
# Column dtypes as a plain positional list (no column labels), in column order.
list(df.dtypes)

[dtype('O'),
 dtype('O'),
 dtype('float64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('O'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64')

In [45]:
# Summary statistics for every column, including the non-numeric ones (include="all").
# On this dataset the result reports mean = inf for 'Flow Byts/s' and 'Flow Pkts/s',
# i.e. those columns still contain infinite values at this stage.
df.describe(include="all")

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack
count,13428595,13428595,13428600.0,13428595,13428600.0,13428600.0,13428595,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595.0,13428600.0,13428595.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595.0,13428595.0,13428595.0,13428595.0,13428595.0,13428595.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595.0,13428600.0,13428600.0,13428595.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595
unique,3801165,65,,289,,,52577,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5
top,192.168.100.3-192.168.100.6-80-80-6,192.168.100.149,,192.168.100.3,,,22/05/2018 10:34:03 AM,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,DDoS
freq,970,3444781,,6519604,,,38919,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4913917
mean,,,36281.75,,3083.07,10.52028,,23927390.0,6.244988,1.941958,1139.214,586.817,48.91711,30.14244,39.67568,10.21963,49.50581,24.72766,34.22074,14.93871,inf,inf,5043371.0,2544748.0,8632454.0,2982927.0,18688740.0,5385607.0,7609911.0,3626096.0,4194108.0,2680705.0,1432846.0,3704321.0,1581196.0,0.0,0.0003249037,0.0,5.138289e-05,77.9876,33.97108,697.2033,2864.551,24.59191,51.11406,38.56766,12.32099,2093.779,0.001846358,0.5767217,0.00198822,0.0003249037,0.01009651,5.138289e-05,1.891486e-05,1.90638e-05,0.1508969,47.18481,39.67568,34.22074,0.0,0.0,0.0,0.0,0.0,0.0,6.244988,1139.214,1.941958,586.817,-1.0,4367.415,2.56074,0.0,2057990.0,102466.2,2131767.0,1985147.0,7059793.0,796533.9,7841705.0,6423615.0,0.993354,
std,,,18377.59,,10778.31,5.412621,,16946070.0,70.32073,46.69613,106743.9,134863.1,93.52407,47.03005,59.02777,38.52885,101.7514,45.26063,60.04179,41.13049,,,4717544.0,3535748.0,7754256.0,4338853.0,14736550.0,5208623.0,7414981.0,4799812.0,10862540.0,7039556.0,5021360.0,9384352.0,6143070.0,0.0,0.01802216,0.0,0.007168002,1170.441,824.8526,23544.03,35226.37,43.56298,118.5539,54.46863,44.06781,823152.4,0.04292959,0.4940787,0.04454511,0.01802216,0.09997287,0.007168002,0.004349081,0.004366169,0.4145783,63.53947,59.02777,60.04179,0.0,0.0,0.0,0.0,0.0,0.0,70.32073,106743.9,46.69613,134863.1,0.0,10229.55,70.17818,0.0,4749506.0,745037.2,4860748.0,4697343.0,7926355.0,1555991.0,8369454.0,7749780.0,0.08125145,
min,,,0.0,,0.0,0.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01669377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,,,21104.0,,80.0,6.0,,14569900.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2029664,2095341.0,97765.18,3620151.0,39774.0,10086130.0,1690878.0,2731049.0,179061.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,8.0,0.09671012,0.03424554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,
50%,,,39446.0,,80.0,6.0,,22528720.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2757326,4385851.0,1266008.0,6640853.0,2128264.0,17261580.0,4251418.0,5485502.0,2545918.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.18272,0.05464118,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6169274.0,0.0,6637858.0,5302683.0,1.0,
75%,,,51716.0,,80.0,17.0,,33115880.0,7.0,2.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,0.0,97.6,0.0,16.64728,0.5380673,6066621.0,3009289.0,11477550.0,3959594.0,26790980.0,6514790.0,9975538.0,4410432.0,20591.0,20021.0,0.0,20576.0,15955.0,0.0,0.0,0.0,0.0,80.0,44.0,0.2787254,0.1082259,0.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.5,100.0,97.6,0.0,0.0,0.0,0.0,0.0,0.0,7.0,300.0,2.0,100.0,-1.0,512.0,3.0,0.0,247765.5,0.0,252894.0,228261.0,9067824.0,1004928.0,11474050.0,7353598.0,1.0,


In [46]:
# Column-role metadata for the selected dataset.
# `name`, `path` and `timestamp_format` are derived from the configuration cells
# so they cannot drift from the dataset actually loaded; for the current
# selection they evaluate to the same values that used to be hard-coded here.
# The column names match the columns shown by `df.columns` for this dataset;
# other datasets may use different column names and need their own values.
# NOTE(review): the variable name `datesetInfo` looks like a typo of
# `datasetInfo`; it is kept because later cells reference it.
datesetInfo = DatasetInfo(
    name=name,
    # Forward slashes are kept on purpose so the stored path is the same string
    # on every platform.
    path=f"datasets/{name}/{name}.parquet",
    # Type of the output file (always parquet), not of the raw input.
    file_type="parquet",
    src_ip_col="Src IP",
    src_port_col="Src Port",
    dst_ip_col="Dst IP",
    dst_port_col="Dst Port",
    flow_id_col="Flow ID",
    timestamp_col="Timestamp",
    label_col="Label",
    class_col="Attack",
    class_num_col="Class",
    timestamp_format=timestamp_format,
    drop_columns=["Flow ID", "Src IP", "Dst IP",
                  "Timestamp", "Src Port", "Dst Port", "Attack"],
)

In [47]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(13428595, 84)

In [48]:
# Disabled: explicit replacement of +/-inf with NaN.
# NOTE(review): describe() on the raw frame shows inf in 'Flow Byts/s' and
# 'Flow Pkts/s', while the same columns are finite after clean_dataset() -
# presumably clean_dataset() already handles infinities; confirm, then delete this cell.
# import numpy as np
# df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [49]:
# Summary statistics of the numeric columns only, still on the uncleaned frame.
df.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)


Unnamed: 0,Src Port,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595.0,13428600.0,13428595.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595.0,13428595.0,13428595.0,13428595.0,13428595.0,13428595.0,13428600.0,13428600.0,13428600.0,13428600.0,13428595.0,13428600.0,13428600.0,13428595.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0,13428600.0
mean,36281.75,3083.07,10.52028,23927390.0,6.244988,1.941958,1139.214,586.817,48.91711,30.14244,39.67568,10.21963,49.50581,24.72766,34.22074,14.93871,inf,inf,5043371.0,2544748.0,8632454.0,2982927.0,18688740.0,5385607.0,7609911.0,3626096.0,4194108.0,2680705.0,1432846.0,3704321.0,1581196.0,0.0,0.0003249037,0.0,5.138289e-05,77.9876,33.97108,697.2033,2864.551,24.59191,51.11406,38.56766,12.32099,2093.779,0.001846358,0.5767217,0.00198822,0.0003249037,0.01009651,5.138289e-05,1.891486e-05,1.90638e-05,0.1508969,47.18481,39.67568,34.22074,0.0,0.0,0.0,0.0,0.0,0.0,6.244988,1139.214,1.941958,586.817,-1.0,4367.415,2.56074,0.0,2057990.0,102466.2,2131767.0,1985147.0,7059793.0,796533.9,7841705.0,6423615.0,0.993354
std,18377.59,10778.31,5.412621,16946070.0,70.32073,46.69613,106743.9,134863.1,93.52407,47.03005,59.02777,38.52885,101.7514,45.26063,60.04179,41.13049,,,4717544.0,3535748.0,7754256.0,4338853.0,14736550.0,5208623.0,7414981.0,4799812.0,10862540.0,7039556.0,5021360.0,9384352.0,6143070.0,0.0,0.01802216,0.0,0.007168002,1170.441,824.8526,23544.03,35226.37,43.56298,118.5539,54.46863,44.06781,823152.4,0.04292959,0.4940787,0.04454511,0.01802216,0.09997287,0.007168002,0.004349081,0.004366169,0.4145783,63.53947,59.02777,60.04179,0.0,0.0,0.0,0.0,0.0,0.0,70.32073,106743.9,46.69613,134863.1,0.0,10229.55,70.17818,0.0,4749506.0,745037.2,4860748.0,4697343.0,7926355.0,1555991.0,8369454.0,7749780.0,0.08125145
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01669377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21104.0,80.0,6.0,14569900.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2029664,2095341.0,97765.18,3620151.0,39774.0,10086130.0,1690878.0,2731049.0,179061.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,8.0,0.09671012,0.03424554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,39446.0,80.0,6.0,22528720.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2757326,4385851.0,1266008.0,6640853.0,2128264.0,17261580.0,4251418.0,5485502.0,2545918.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.18272,0.05464118,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6169274.0,0.0,6637858.0,5302683.0,1.0
75%,51716.0,80.0,17.0,33115880.0,7.0,2.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,0.0,97.6,0.0,16.64728,0.5380673,6066621.0,3009289.0,11477550.0,3959594.0,26790980.0,6514790.0,9975538.0,4410432.0,20591.0,20021.0,0.0,20576.0,15955.0,0.0,0.0,0.0,0.0,80.0,44.0,0.2787254,0.1082259,0.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.5,100.0,97.6,0.0,0.0,0.0,0.0,0.0,0.0,7.0,300.0,2.0,100.0,-1.0,512.0,3.0,0.0,247765.5,0.0,252894.0,228261.0,9067824.0,1004928.0,11474050.0,7353598.0,1.0
max,65535.0,65535.0,17.0,120000000.0,49152.0,32519.0,186458300.0,196303200.0,64240.0,1448.0,51262.19,27020.02,65160.0,1458.0,53459.47,29231.92,inf,inf,119805200.0,84843630.0,119987200.0,119805200.0,119999700.0,117018800.0,117221700.0,117018800.0,120000000.0,119987300.0,77390570.0,119987300.0,119987300.0,0.0,1.0,0.0,1.0,1572864.0,1040608.0,2000000.0,2000000.0,1212.0,65160.0,29861.07,30290.77,917530900.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0,29869.58,51262.19,53459.47,0.0,0.0,0.0,0.0,0.0,0.0,49152.0,186458300.0,32519.0,196303200.0,-1.0,42408.0,49064.0,0.0,111442900.0,58869230.0,111442900.0,111442900.0,119987200.0,66874970.0,119987200.0,119987200.0,1.0


In [50]:
# Number of missing (NaN/None) values per column.
# Infinite values are not counted here: isna() does not treat inf as missing by default.
df.isna().sum()

Flow ID     0
Src IP      0
Src Port    0
Dst IP      0
Dst Port    0
           ..
Idle Std    0
Idle Max    0
Idle Min    0
Label       0
Attack      0
Length: 84, dtype: int64

In [51]:
def to_categorical_codes(column):
    """Encode the values of `column` as integer category codes.

    The values are wrapped in a ``pd.Categorical`` and its ``codes`` array is
    returned: one integer per input value, with -1 for missing values.
    """
    categorical = pd.Categorical(column)
    return categorical.codes
    
def _encode_columns(frame, columns):
    """Replace each listed column of `frame` (in place) with its category codes.

    Columns that are not present in `frame` are skipped with a printed warning
    instead of raising.
    """
    for col in columns:
        if col in frame.columns:
            frame[col] = to_categorical_codes(frame[col])
        else:
            print(f"Warning: Column '{col}' not found in the DataFrame")


# Dataset-specific preprocessing. The branches are mutually exclusive, so at
# most one of them runs for the dataset selected in `name`.
if name == 'x_iiot':
    columns_to_convert = [
        'Protocol', 'Service', 'is_syn_only', 'Is_SYN_ACK',
        'is_pure_ack', 'is_with_payload', 'FIN or RST',
        'Bad_checksum', 'is_SYN_with_RST', 'anomaly_alert']
    _encode_columns(df, columns_to_convert)

    # Binary label: 1 where class3 is 'Attack', 0 otherwise.
    df['class3'] = (df['class3'] == 'Attack').astype(int)

    # Replace every placeholder cell with 0 in ONE element-wise pass; this is
    # equivalent to six successive passes, one per placeholder value.
    placeholder_values = ('-', '?', '#DIV/0!', 'excel', 'aza', ' ')

    def _zero_if_placeholder(value):
        return 0 if value in placeholder_values else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; fall back to applymap on older pandas versions.
    if hasattr(pd.DataFrame, "map"):
        df = df.map(_zero_if_placeholder)
    else:
        df = df.applymap(_zero_if_placeholder)

elif name == 'edge_iiot':
    columns_to_convert = ['mqtt.protoname','mqtt.topic','mqtt.conack.flags','mqtt.msg','http.request.method','dns.qry.name.len','arp.src.proto_ipv4',
                          'http.request.full_uri','http.file_data','http.request.version','arp.dst.proto_ipv4','http.request.uri.query','tcp.srcport','http.referer']
    _encode_columns(df, columns_to_convert)

elif name == 'ccd_inid_modified':
    columns_to_convert = ['splt_direction','splt_ps','splt_piat_ms','application_name','application_category_name','requested_server_name','client_fingerprint']
    _encode_columns(df, columns_to_convert)

    # Binary label: 1 where traffic_type is 'attack', 0 otherwise.
    df['traffic_type'] = (df['traffic_type'] == 'attack').astype(int)
    # NOTE(review): 'requested_server_name' and 'client_fingerprint' are encoded
    # above and then dropped here, so encoding them looks redundant.
    df = df.drop(columns=['requested_server_name', 'client_fingerprint', 'server_fingerprint', 'user_agent', 'content_type'])

elif name == 'cic_ton_iot_modified':
    df = df.drop(columns=['datetime'])


    

In [52]:
# Project-level cleaning step; rebinds `df` to the cleaned frame.
# Per its printed output it reports the row count before cleaning, after dropping
# NA rows and after dropping duplicates.
# NOTE(review): this overwrites the raw `df`, so re-running earlier inspection
# cells afterwards shows the cleaned data, not the raw data.
df = clean_dataset(df, flow_id_col=datesetInfo.flow_id_col, timestamp_col=datesetInfo.timestamp_col)

==>> original df.shape[0]: 13428595
==>> after drop na df.shape[0]: 13427562
==>> after drop_duplicates df.shape[0]: 13427518


In [53]:
# Cast the four endpoint columns (source/destination IP and port) to strings.
# NOTE(review): the port columns are float64 in the raw frame, so their string
# form keeps the trailing ".0" (e.g. "54114.0") - confirm that is intended.
for endpoint_col in (
    datesetInfo.src_ip_col,
    datesetInfo.src_port_col,
    datesetInfo.dst_ip_col,
    datesetInfo.dst_port_col,
):
    df[endpoint_col] = df[endpoint_col].apply(str)

In [54]:
# Preview the frame again after cleaning and the string conversion of the endpoint columns.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack
0,192.168.100.150-192.168.100.3-54114-44581-6,192.168.100.150,54114.0,192.168.100.3,44581.0,6.0,21/05/2018 08:34:13 PM,64465.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.024587,64465.0,0.0,64465.0,64465.0,0.0,0.0,0.0,0.0,64465.0,64465.0,0.0,64465.0,64465.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,31.024587,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance
1,192.168.100.147-192.168.100.7-1649-80-6,192.168.100.147,1649.0,192.168.100.7,80.0,6.0,04/06/2018 05:38:04 PM,22646986.0,3.0,1.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,100.0,100.0,0.0,17.662394,0.176624,7548995.0,5561972.0,13839423.0,3281987.0,19364999.0,9682500.0,13839423.0,5525576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.132468,0.044156,100.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,300.0,1.0,100.0,-1.0,512.0,3.0,0.0,3281987.0,0.0,3281987.0,3281987.0,9682499.5,5878778.0,13839423.0,5525576.0,1,DDoS
2,192.168.100.148-192.168.100.3-59032-80-6,192.168.100.148,59032.0,192.168.100.3,80.0,6.0,04/06/2018 01:41:34 PM,153463.0,1.0,2.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,0.0,50.0,70.710678,651.622867,19.548686,76731.5,76717.55,130979.0,22484.0,0.0,0.0,0.0,0.0,22484.0,22484.0,0.0,22484.0,22484.0,0.0,0.0,0.0,0.0,20.0,44.0,6.516229,13.032457,0.0,100.0,50.0,57.735027,3333.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,66.666667,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,-1.0,29200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance
3,192.168.100.149-192.168.100.7-2820-80-17,192.168.100.149,2820.0,192.168.100.7,80.0,17.0,04/06/2018 02:29:28 PM,24915470.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200679,6228868.0,2337803.0,9613711.0,4568562.0,20138439.0,6712813.0,9613711.0,4568562.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,8.0,0.160543,0.040136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,4777031.0,0.0,4777031.0,4777031.0,7784938.5,2586275.0,9613711.0,5956166.0,1,DDoS
4,192.168.100.150-192.168.100.5-5146-80-17,192.168.100.150,5146.0,192.168.100.5,80.0,17.0,04/06/2018 06:17:12 PM,46437692.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193808,5804712.0,3854238.0,14796053.0,3089320.0,42448141.0,6064020.0,14796053.0,3089320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,8.0,0.172274,0.021534,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,9597806.5,7235516.0,14714089.0,4481524.0,9080693.0,5048133.0,14796053.0,5230723.0,1,DDoS


In [55]:
# Numeric summary after cleaning. The port columns no longer appear (they are
# strings now), and 'Flow Byts/s' / 'Flow Pkts/s' report finite mean and max values.
df.describe()

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
count,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427518.0,13427520.0,13427518.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427518.0,13427518.0,13427518.0,13427518.0,13427518.0,13427518.0,13427520.0,13427520.0,13427520.0,13427520.0,13427518.0,13427520.0,13427520.0,13427518.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0
mean,10.52062,23929310.0,6.245421,1.942021,1139.305,586.8635,48.92058,30.14441,39.67842,10.22045,49.50924,24.72911,34.22295,14.93991,17497.02,3561.901,5043776.0,2544952.0,8633147.0,2983166.0,18690240.0,5386039.0,7610522.0,3626387.0,4194444.0,2680920.0,1432961.0,3704618.0,1581323.0,0.0,0.0003232168,0.0,5.138701e-05,77.99179,33.97108,697.2383,2864.662,24.59343,51.11762,38.57025,12.32192,2093.946,0.001845836,0.5767571,0.001984134,0.0003232168,0.01003276,5.138701e-05,1.891638e-05,1.906533e-05,0.1508414,47.18783,39.67842,34.22295,0.0,0.0,0.0,0.0,0.0,0.0,6.245421,1139.305,1.942021,586.8635,-1.0,4367.75,2.560943,0.0,2058155.0,102474.4,2131938.0,1985306.0,7060359.0,796597.8,7842334.0,6424130.0,0.9933577
std,5.412682,16945390.0,70.32354,46.698,106748.2,134868.5,93.52652,47.03042,59.02855,38.53029,101.7542,45.26081,60.04267,41.13192,1029434.0,53600.78,4717517.0,3535817.0,7754182.0,4338945.0,14736190.0,5208608.0,7414965.0,4799894.0,10862910.0,7039797.0,5021545.0,9384669.0,6143300.0,0.0,0.01797533,0.0,0.007168289,1170.487,824.8857,23544.97,35227.66,43.56331,118.5575,54.46903,44.06944,823185.4,0.04292353,0.4940732,0.04449941,0.01797533,0.09965992,0.007168289,0.004349255,0.004366344,0.4145358,63.53914,59.02855,60.04267,0.0,0.0,0.0,0.0,0.0,0.0,70.32354,106748.2,46.698,134868.5,0.0,10229.89,70.18099,0.0,4749661.0,745066.5,4860906.0,4697498.0,7926421.0,1556037.0,8369495.0,7749877.0,0.08122936
min,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01669377,0.3333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008333335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,14571970.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2029618,2095667.0,97867.28,3620585.0,39784.0,10088950.0,1691135.0,2731493.0,179115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,8.0,0.09677616,0.03425194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,6.0,22529570.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2757213,4386073.0,1266178.0,6641591.0,2128488.0,17262610.0,4251708.0,5486075.0,2546196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.1827317,0.05464567,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6169748.0,0.0,6638622.0,5302814.0,1.0
75%,17.0,33117460.0,7.0,2.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,0.0,97.6,0.0,16.64995,0.5379276,6066778.0,3009487.0,11478470.0,3959719.0,26791690.0,6514945.0,9976006.0,4410581.0,20609.75,20034.0,0.0,20590.0,15970.0,0.0,0.0,0.0,0.0,80.0,44.0,0.2787262,0.1082315,0.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.5,100.0,97.6,0.0,0.0,0.0,0.0,0.0,0.0,7.0,300.0,2.0,100.0,-1.0,512.0,3.0,0.0,247885.0,0.0,252947.0,228334.0,9068176.0,1005116.0,11474520.0,7354286.0,1.0
max,17.0,120000000.0,49152.0,32519.0,186458300.0,196303200.0,64240.0,1448.0,51262.19,27020.02,65160.0,1458.0,53459.47,29231.92,494000000.0,4000000.0,119805200.0,84843630.0,119987200.0,119805200.0,119999700.0,117018800.0,117221700.0,117018800.0,120000000.0,119987300.0,77390570.0,119987300.0,119987300.0,0.0,1.0,0.0,1.0,1572864.0,1040608.0,2000000.0,2000000.0,1212.0,65160.0,29861.07,30290.77,917530900.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0,29869.58,51262.19,53459.47,0.0,0.0,0.0,0.0,0.0,0.0,49152.0,186458300.0,32519.0,196303200.0,-1.0,42408.0,49064.0,0.0,111442900.0,58869230.0,111442900.0,111442900.0,119987200.0,66874970.0,119987200.0,119987200.0,1.0


In [56]:
# Run the feature-analysis pipeline on df. Only the second and third returned
# values are kept; the first one is discarded into `_`.
# NOTE(review): presumably var_dropped / corr_dropped are the columns dropped for
# low variance and for high correlation respectively — confirm in the pipeline.
_, var_dropped, corr_dropped = feature_analysis_pipeline(
    df=df,
    drop_columns=datesetInfo.drop_columns,
    label_col=datesetInfo.label_col,
)
var_dropped, corr_dropped

(['Fwd PSH Flags',
  'Fwd URG Flags',
  'Fwd Byts/b Avg',
  'Fwd Pkts/b Avg',
  'Fwd Blk Rate Avg',
  'Bwd Byts/b Avg',
  'Bwd Pkts/b Avg',
  'Bwd Blk Rate Avg',
  'Init Fwd Win Byts',
  'Fwd Seg Size Min'],
 {'Active Max',
  'Active Mean',
  'Bwd Header Len',
  'Bwd IAT Mean',
  'Bwd IAT Std',
  'Bwd IAT Tot',
  'Bwd PSH Flags',
  'Bwd Pkt Len Max',
  'Bwd Pkt Len Mean',
  'Bwd Pkt Len Min',
  'Bwd Pkt Len Std',
  'Bwd URG Flags',
  'CWE Flag Count',
  'Flow Duration',
  'Flow IAT Max',
  'Flow IAT Mean',
  'Flow IAT Std',
  'Flow Pkts/s',
  'Fwd Header Len',
  'Fwd IAT Max',
  'Fwd IAT Mean',
  'Fwd Pkt Len Max',
  'Fwd Pkt Len Mean',
  'Fwd Pkt Len Min',
  'Idle Max',
  'Idle Mean',
  'Pkt Len Max',
  'Pkt Len Mean',
  'Pkt Size Avg',
  'Subflow Fwd Pkts',
  'Tot Bwd Pkts',
  'Tot Fwd Pkts',
  'TotLen Bwd Pkts',
  'TotLen Fwd Pkts'})

In [74]:
# Show the container type of each result before the two are combined.
print(f"==>> type(var_dropped): {type(var_dropped)}")
print(f"==>> type(corr_dropped): {type(corr_dropped)}")

==>> type(var_dropped): <class 'list'>
==>> type(corr_dropped): <class 'set'>


In [78]:
# Combine both groups of dropped column names into a single set of "weak" columns.
# var_dropped is rebound to a set as part of this step.
var_dropped = set(var_dropped)
weak_columns = var_dropped | set(corr_dropped)
weak_columns

{'Active Max',
 'Active Mean',
 'Bwd Blk Rate Avg',
 'Bwd Byts/b Avg',
 'Bwd Header Len',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Tot',
 'Bwd PSH Flags',
 'Bwd Pkt Len Max',
 'Bwd Pkt Len Mean',
 'Bwd Pkt Len Min',
 'Bwd Pkt Len Std',
 'Bwd Pkts/b Avg',
 'Bwd URG Flags',
 'CWE Flag Count',
 'Flow Duration',
 'Flow IAT Max',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow Pkts/s',
 'Fwd Blk Rate Avg',
 'Fwd Byts/b Avg',
 'Fwd Header Len',
 'Fwd IAT Max',
 'Fwd IAT Mean',
 'Fwd PSH Flags',
 'Fwd Pkt Len Max',
 'Fwd Pkt Len Mean',
 'Fwd Pkt Len Min',
 'Fwd Pkts/b Avg',
 'Fwd Seg Size Min',
 'Fwd URG Flags',
 'Idle Max',
 'Idle Mean',
 'Init Fwd Win Byts',
 'Pkt Len Max',
 'Pkt Len Mean',
 'Pkt Size Avg',
 'Subflow Fwd Pkts',
 'Tot Bwd Pkts',
 'Tot Fwd Pkts',
 'TotLen Bwd Pkts',
 'TotLen Fwd Pkts'}

### Attack Types

In [57]:
# Distinct values found in the class column of df.
df[datesetInfo.class_col].unique()

array(['Reconnaissance', 'DDoS', 'DoS', 'Benign', 'Theft'], dtype=object)

In [58]:
# df[class_col] = df[class_col].replace({"BENIGN": "Benign",
#                                        "DDoS": "ddos",
#                                        "Web Attack � Brute Force": "bruteforce",
#                                        "Web Attack � XSS": "xss"})

In [59]:
# Keep the distinct class values; this array is pickled later together with labels_names.
classes = df[datesetInfo.class_col].unique()

### Sorting (optional)

In [60]:
# Optional chronological ordering: runs only when sorting is switched on and the
# dataset info declares a timestamp column.
if with_sort_timestamp and datesetInfo.timestamp_col:
    # Strip surrounding whitespace from the raw strings, then parse them with an explicit format.
    # NOTE(review): timestamp_format is not assigned in this part of the notebook —
    # confirm an earlier cell defines it, otherwise this raises NameError when enabled.
    df[datesetInfo.timestamp_col] = pd.to_datetime(df[datesetInfo.timestamp_col].str.strip(), format=timestamp_format)
    # Reorders df itself (in place) by the parsed timestamps.
    df.sort_values(datesetInfo.timestamp_col, inplace= True)

### Encoding Attacks into integers

In [61]:
# Rebind df to the frame returned by the encoder and keep the mapping it returns
# alongside (integer code -> class name, as printed below the cell).
df, labels_names = one_dataset_class_num_col(
    df,
    datesetInfo.class_num_col,
    datesetInfo.class_col,
)

==>> labels_names: {np.int64(0): np.str_('Benign'), np.int64(1): np.str_('DDoS'), np.int64(2): np.str_('DoS'), np.int64(3): np.str_('Reconnaissance'), np.int64(4): np.str_('Theft')}


In [62]:
# Number of rows for each value of the class column.
df.groupby(datesetInfo.class_col).size()

Attack
Benign              89190
DDoS              4913860
DoS               4909359
Reconnaissance    3513408
Theft                1701
dtype: int64

### Undersampling classes (optional)

In [63]:
# Only runs when undersampling is enabled.
if with_undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df.groupby(datesetInfo.class_col).size()

    # Sort the counts in descending order, so the largest classes come first;
    # the undersampling cell relies on this ordering to pick the classes to reduce.
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [64]:
# Reduce the most populated classes by random sampling, then shuffle the result.
# Only runs when undersampling is enabled.
if with_undersample_classes:
    # Tunables for this step, named instead of being buried in the loop.
    n_largest_classes = 2        # how many of the most populated classes are reduced
    undersample_fraction = 0.5   # share of rows kept for each reduced class
    undersample_seed = 42        # fixed seed so reruns draw the same rows

    # class_counts_sorted is in descending order, so the first entries of its
    # index are the classes with the most records.
    classes_to_undersample = class_counts_sorted.index[:n_largest_classes]

    # Build one frame per class: a random subset for the largest classes,
    # every row for all the others.
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        class_df = df[df[datesetInfo.class_col] == class_label]
        if class_label in classes_to_undersample:
            undersampled_df = class_df.sample(
                frac=undersample_fraction,
                random_state=undersample_seed,
            )
            dfs.append(undersampled_df)
        else:
            dfs.append(class_df)

    # Rebind df before concatenating so this name no longer references the full
    # frame (presumably to lower peak memory while the combined frame is built).
    df = []
    # Concatenate the per-class frames, shuffle the rows (seeded), and renumber the index.
    df = (
        pd.concat(dfs)
        .sample(frac=1, random_state=undersample_seed)
        .reset_index(drop=True)
    )


In [65]:
# Recompute and print the per-class counts, this time on df as it stands after
# the undersampling cell. Only runs when undersampling is enabled.
if with_undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df.groupby(datesetInfo.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

### Saving label encodings and dataset properties

In [66]:
# Persist the label encoding inside the dataset folder: a two-element list holding
# labels_names and the classes array, pickled into labels_names.pkl.
with open(folder_path + '/labels_names.pkl', 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [67]:
# Distinct values present in the label column.
df[datesetInfo.label_col].unique()

array([1, 0])

In [68]:
# Summarise the processed dataset and store the summary as df_properties.json
# inside the dataset folder.
total_count = len(df)

# Count rows per label value with a boolean mask instead of building a filtered
# copy of the whole frame just to take its length. int(...) turns the NumPy
# integer returned by .sum() into a plain int, which json.dump can serialise.
num_benign = int((df[datesetInfo.label_col] == 0).sum())
num_attack = int((df[datesetInfo.label_col] == 1).sum())

properties = {
    "name": name,
    "length": total_count,
    "num_benign": num_benign,
    # Percentages fall back to 0.0 for an empty frame instead of dividing by zero.
    "percentage_of_benign_records": ((num_benign * 100) / total_count) if total_count else 0.0,
    "num_attack": num_attack,
    "percentage_of_attack_records": ((num_attack * 100) / total_count) if total_count else 0.0,
    "attacks": list(df[datesetInfo.class_col].unique()),
    # weak_columns is a set; sorting gives the JSON file a stable order across runs.
    "weak_columns": sorted(weak_columns),
}

with open(folder_path + '/df_properties.json', 'w') as f:
    json.dump(properties, f)

# Display the summary as the cell output.
properties


{'name': 'cic_bot_iot',
 'length': 13427518,
 'num_benign': 89190,
 'percentage_of_benign_records': 0.664232958019494,
 'num_attack': 13338328,
 'percentage_of_attack_records': 99.3357670419805,
 'attacks': ['Reconnaissance', 'DDoS', 'DoS', 'Benign', 'Theft']}

In [69]:
# graphs_properties_path = folder_path + '/graphs_properties.json'
# G = nx.from_pandas_edgelist(
#     df,
#     source=src_ip_col,
#     target=dst_ip_col,
#     create_using=nx.DiGraph()
# )
# calculate_graph_measures(G, graphs_properties_path)

In [70]:
# Write the processed frame to output_path (set in the configuration cell at the top of the notebook).
df.to_parquet(output_path)

In [73]:
# (rows, columns) of the processed frame.
df.shape

(13427518, 85)

In [71]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,...,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
0,192.168.100.150-192.168.100.3-54114-44581-6,192.168.100.150,54114.0,192.168.100.3,44581.0,6.0,21/05/2018 08:34:13 PM,64465.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,31.024587,64465.0,0.0,64465.0,64465.0,0.0,0.0,0.0,0.0,64465.0,64465.0,0.0,64465.0,64465.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance,3
1,192.168.100.147-192.168.100.7-1649-80-6,192.168.100.147,1649.0,192.168.100.7,80.0,6.0,04/06/2018 05:38:04 PM,22646986.0,3.0,1.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,100.0,100.0,0.0,17.662394,0.176624,7548995.0,5561972.0,13839423.0,3281987.0,19364999.0,9682500.0,13839423.0,5525576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.132468,...,100.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,125.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,300.0,1.0,100.0,-1.0,512.0,3.0,0.0,3281987.0,0.0,3281987.0,3281987.0,9682499.5,5878778.0,13839423.0,5525576.0,1,DDoS,1
2,192.168.100.148-192.168.100.3-59032-80-6,192.168.100.148,59032.0,192.168.100.3,80.0,6.0,04/06/2018 01:41:34 PM,153463.0,1.0,2.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0,0.0,50.0,70.710678,651.622867,19.548686,76731.5,76717.55,130979.0,22484.0,0.0,0.0,0.0,0.0,22484.0,22484.0,0.0,22484.0,22484.0,0.0,0.0,0.0,0.0,20.0,44.0,6.516229,...,0.0,100.0,50.0,57.735027,3333.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,66.666667,0.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,100.0,-1.0,29200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance,3
3,192.168.100.149-192.168.100.7-2820-80-17,192.168.100.149,2820.0,192.168.100.7,80.0,17.0,04/06/2018 02:29:28 PM,24915470.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.200679,6228868.0,2337803.0,9613711.0,4568562.0,20138439.0,6712813.0,9613711.0,4568562.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,8.0,0.160543,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,4777031.0,0.0,4777031.0,4777031.0,7784938.5,2586275.0,9613711.0,5956166.0,1,DDoS,1
4,192.168.100.150-192.168.100.5-5146-80-17,192.168.100.150,5146.0,192.168.100.5,80.0,17.0,04/06/2018 06:17:12 PM,46437692.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193808,5804712.0,3854238.0,14796053.0,3089320.0,42448141.0,6064020.0,14796053.0,3089320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,8.0,0.172274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,9597806.5,7235516.0,14714089.0,4481524.0,9080693.0,5048133.0,14796053.0,5230723.0,1,DDoS,1


In [72]:
# Summary statistics for the processed frame.
df.describe()

Unnamed: 0,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Class
count,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427518.0,13427520.0,13427518.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427518.0,13427518.0,13427518.0,13427518.0,13427518.0,13427518.0,13427520.0,13427520.0,13427520.0,13427520.0,13427518.0,13427520.0,13427520.0,13427518.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0,13427520.0
mean,10.52062,23929310.0,6.245421,1.942021,1139.305,586.8635,48.92058,30.14441,39.67842,10.22045,49.50924,24.72911,34.22295,14.93991,17497.02,3561.901,5043776.0,2544952.0,8633147.0,2983166.0,18690240.0,5386039.0,7610522.0,3626387.0,4194444.0,2680920.0,1432961.0,3704618.0,1581323.0,0.0,0.0003232168,0.0,5.138701e-05,77.99179,33.97108,697.2383,2864.662,24.59343,51.11762,38.57025,12.32192,2093.946,0.001845836,0.5767571,0.001984134,0.0003232168,0.01003276,5.138701e-05,1.891638e-05,1.906533e-05,0.1508414,47.18783,39.67842,34.22295,0.0,0.0,0.0,0.0,0.0,0.0,6.245421,1139.305,1.942021,586.8635,-1.0,4367.75,2.560943,0.0,2058155.0,102474.4,2131938.0,1985306.0,7060359.0,796597.8,7842334.0,6424130.0,0.9933577,1.882672
std,5.412682,16945390.0,70.32354,46.698,106748.2,134868.5,93.52652,47.03042,59.02855,38.53029,101.7542,45.26081,60.04267,41.13192,1029434.0,53600.78,4717517.0,3535817.0,7754182.0,4338945.0,14736190.0,5208608.0,7414965.0,4799894.0,10862910.0,7039797.0,5021545.0,9384669.0,6143300.0,0.0,0.01797533,0.0,0.007168289,1170.487,824.8857,23544.97,35227.66,43.56331,118.5575,54.46903,44.06944,823185.4,0.04292353,0.4940732,0.04449941,0.01797533,0.09965992,0.007168289,0.004349255,0.004366344,0.4145358,63.53914,59.02855,60.04267,0.0,0.0,0.0,0.0,0.0,0.0,70.32354,106748.2,46.698,134868.5,0.0,10229.89,70.18099,0.0,4749661.0,745066.5,4860906.0,4697498.0,7926421.0,1556037.0,8369495.0,7749877.0,0.08122936,0.800576
min,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01669377,0.3333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008333335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,14571970.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2029618,2095667.0,97867.28,3620585.0,39784.0,10088950.0,1691135.0,2731493.0,179115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,8.0,0.09677616,0.03425194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,6.0,22529570.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2757213,4386073.0,1266178.0,6641591.0,2128488.0,17262610.0,4251708.0,5486075.0,2546196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,20.0,0.1827317,0.05464567,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6169748.0,0.0,6638622.0,5302814.0,1.0,2.0
75%,17.0,33117460.0,7.0,2.0,300.0,100.0,100.0,100.0,100.0,0.0,100.0,0.0,97.6,0.0,16.64995,0.5379276,6066778.0,3009487.0,11478470.0,3959719.0,26791690.0,6514945.0,9976006.0,4410581.0,20609.75,20034.0,0.0,20590.0,15970.0,0.0,0.0,0.0,0.0,80.0,44.0,0.2787262,0.1082315,0.0,100.0,100.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.5,100.0,97.6,0.0,0.0,0.0,0.0,0.0,0.0,7.0,300.0,2.0,100.0,-1.0,512.0,3.0,0.0,247885.0,0.0,252947.0,228334.0,9068176.0,1005116.0,11474520.0,7354286.0,1.0,3.0
max,17.0,120000000.0,49152.0,32519.0,186458300.0,196303200.0,64240.0,1448.0,51262.19,27020.02,65160.0,1458.0,53459.47,29231.92,494000000.0,4000000.0,119805200.0,84843630.0,119987200.0,119805200.0,119999700.0,117018800.0,117221700.0,117018800.0,120000000.0,119987300.0,77390570.0,119987300.0,119987300.0,0.0,1.0,0.0,1.0,1572864.0,1040608.0,2000000.0,2000000.0,1212.0,65160.0,29861.07,30290.77,917530900.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0,29869.58,51262.19,53459.47,0.0,0.0,0.0,0.0,0.0,0.0,49152.0,186458300.0,32519.0,196303200.0,-1.0,42408.0,49064.0,0.0,111442900.0,58869230.0,111442900.0,111442900.0,119987200.0,66874970.0,119987200.0,119987200.0,1.0,4.0
