# Pre-processing the NF-BoT-IoT-v2 and NF-CSE-CIC-IDS2018-v2 datasets

Stratified sampling for the NF-BoT-IoT-v2 and NF-CSE-CIC-IDS2018-v2 datasets. These datasets need special treatment because they are too large to be comfortably loaded into RAM all at once.

## IMPORTANT

You NEED at least 30GB of RAM to pre-process those!!

In [1]:
import dask.dataframe as dd
import pandas as pd

In [2]:
from typing import List, Tuple

# Reading dataset

Reading the dataset using dask.

In [3]:
# Locations of the raw CSV exports, keyed by dataset name.
local = {
    'nf-bot-iot-v2': 'nf-bot-iot-vv2/befb58edf3428167_MOHANAD_A4706/data/NF-BoT-IoT-v2.csv',
    'NF-CSE-CIC-IDS2018-v2': 'NF-CSE-CIC-IDS2018-v2/data/NF-CSE-CIC-IDS2018-v2.csv',
}

# Dataset processed by this run; it must be one of the keys of `local`.
current_dataset_name = 'NF-CSE-CIC-IDS2018-v2'

# The CSV is opened through dask rather than pandas, so later cells have to
# call .compute() to obtain concrete pandas results.
csv_path = local[current_dataset_name]
df = dd.read_csv(csv_path)

df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,13.58.98.64,40894,172.31.69.25,22,6,92.0,3164,23,3765,21,...,26883,26847,0,0,0,0,0,0,1,SSH-Bruteforce
1,213.202.230.143,29622,172.31.66.103,3389,6,0.0,1919,14,2031,11,...,8192,64000,0,0,0,0,0,0,0,Benign
2,172.31.66.5,65456,172.31.0.2,53,17,0.0,116,2,148,2,...,0,0,0,0,2511,1,5,0,0,Benign
3,172.31.64.92,57918,172.31.0.2,53,17,0.0,70,1,130,1,...,0,0,0,0,3371,1,60,0,0,Benign
4,18.219.32.43,63269,172.31.69.25,80,6,7.0,232,5,1136,4,...,8192,26883,0,0,0,0,0,0,1,DDoS attacks-LOIC-HTTP


## Knowing the dataset

In [8]:
# Per-class row counts. The counts are materialised once and the proportions
# are derived from them in pandas, instead of triggering a second full
# value_counts computation on the dask frame with normalize=True.
total_count = df["Attack"].value_counts().compute().rename("count")
total_ratio = (total_count / total_count.sum()).rename("proportion")

# Both series are already in-memory pandas objects, so a plain pandas concat is
# sufficient. The column names are set explicitly above so that sorting by
# "count" does not depend on how the installed pandas names value_counts output.
# Rows are ordered from the rarest class to the most common one.
table_1 = pd.concat([total_count, total_ratio], axis=1).sort_values(by="count")
table_1

Unnamed: 0_level_0,count,proportion
Attack,Unnamed: 1_level_1,Unnamed: 2_level_1
SQL Injection,432,2.3e-05
Brute Force -XSS,927,4.9e-05
DDOS attack-LOIC-UDP,2112,0.000112
Brute Force -Web,2143,0.000113
DoS attacks-Slowloris,9512,0.000503
DoS attacks-SlowHTTPTest,14116,0.000747
FTP-BruteForce,25933,0.001373
DoS attacks-GoldenEye,27723,0.001467
SSH-Bruteforce,94979,0.005027
Infilteration,116361,0.006159


In [9]:
# Total number of rows in the loaded dataset (`df` is the dask frame read above).
total_records = len(df)

In [10]:
# Fraction of the full dataset to keep in the stratified sample (0.01 = 1%).
desired_frac = 0.01

In [12]:
# Target number of rows in the stratified sample, rounded to a whole row count.
sample_total = round(desired_frac * total_records)
(total_records, sample_total)

(18893708, 188937)

In [13]:
# Pair every attack class with its share of the full dataset, keeping the
# row order of `table_1`.
attack_names = table_1.index.tolist()
attack_shares = table_1["proportion"].values
distribution = [(name, share) for name, share in zip(attack_names, attack_shares)]
distribution

[('SQL Injection', 2.286475476386107e-05),
 ('Brute Force -XSS', 4.906395293078521e-05),
 ('DDOS attack-LOIC-UDP', 0.00011178324551220967),
 ('Brute Force -Web', 0.00011342400337720896),
 ('DoS attacks-Slowloris', 0.0005034480261894594),
 ('DoS attacks-SlowHTTPTest', 0.0007471270329783863),
 ('FTP-BruteForce', 0.0013725733455815026),
 ('DoS attacks-GoldenEye', 0.0014673138803669454),
 ('SSH-Bruteforce', 0.0050270174599925015),
 ('Infilteration', 0.006158716965457495),
 ('Bot', 0.007573791232509786),
 ('DDoS attacks-LOIC-HTTP', 0.016264673932718767),
 ('DoS attacks-Hulk', 0.022899051896006863),
 ('DDOS attack-HOIC', 0.05720729885314201),
 ('Benign', 0.8804818514184722)]

In [14]:
# Preview how many rows each class would receive in a sample of
# `sample_total` rows. Note that `frac` holds a row count here, not a fraction.
for attack_name, attack_proportion in distribution:
    frac = round(attack_proportion * sample_total)
    print(f"{attack_name} | {frac}")

SQL Injection | 4
Brute Force -XSS | 9
DDOS attack-LOIC-UDP | 21
Brute Force -Web | 21
DoS attacks-Slowloris | 95
DoS attacks-SlowHTTPTest | 141
FTP-BruteForce | 259
DoS attacks-GoldenEye | 277
SSH-Bruteforce | 950
Infilteration | 1164
Bot | 1431
DDoS attacks-LOIC-HTTP | 3073
DoS attacks-Hulk | 4326
DDOS attack-HOIC | 10809
Benign | 166356


In [None]:
# Remove the source/destination IP address columns before sampling.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising a KeyError.
df = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'], errors="ignore")

In [None]:
def get_stratified_sample(
    df: "dd.DataFrame",
    distribution: List[Tuple[str, float]],
    sample_total: int,
    random_state=None,
) -> pd.DataFrame:
    """Draw a stratified sample of ``df`` that follows ``distribution``.

    Each class is processed on its own: its rows are selected by the
    ``"Attack"`` column, materialised in memory, de-duplicated and then
    sampled without replacement.

    Args:
        df: Frame with an ``"Attack"`` column. A dask DataFrame is expected,
            but an in-memory pandas DataFrame is accepted as well.
        distribution: ``(attack_name, proportion)`` pairs; ``proportion`` is
            the share of ``sample_total`` assigned to that class.
        sample_total: Desired total number of rows in the sample.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample can be reproduced. ``None`` keeps the sampling unseeded.

    Returns:
        A single pandas DataFrame holding the sampled rows of every class,
        with a fresh 0..n-1 index. If a class has fewer unique rows than
        requested, all of its unique rows are returned instead of raising.
    """
    samples = []
    for attack_name, attack_proportion in distribution:
        class_rows = df[df["Attack"] == attack_name]
        # Only lazy (dask) selections need to be materialised; a pandas
        # selection is already in memory.
        if hasattr(class_rows, "compute"):
            class_rows = class_rows.compute()
        class_rows = class_rows.drop_duplicates()

        # Clamp to the rows actually available: sampling without replacement
        # raises a ValueError when n exceeds the population size.
        n = min(int(round(sample_total * attack_proportion)), len(class_rows))
        if n > 0:
            samples.append(class_rows.sample(n=n, random_state=random_state))

    # pd.concat refuses an empty list, so return an empty frame that still
    # carries the input's column names.
    if not samples:
        return pd.DataFrame(columns=list(df.columns))

    # Callers use the result as one DataFrame (column selection, to_parquet),
    # so the per-class samples are merged here rather than returned as a list.
    return pd.concat(samples, ignore_index=True)

# Build the stratified sample from the class proportions computed earlier.
stratified_sample = get_stratified_sample(
    df=df,
    distribution=distribution,
    sample_total=sample_total,
)

In [None]:
# Sanity check: rows per class in the sample, smallest class first.
sample_class_counts = stratified_sample["Attack"].value_counts()
sample_class_counts.sort_values()

In [50]:
# target = "Attack"
# total_desire = 339_871
# classes = {
#     "DDoS": {
#         "distribution": 0.485438,
#         "total": 18_331_847,
#     },
#     "DoS": {
#         "distribution": 0.441516,
#         "total": 16_673_183,
#     },
#     "Reconnaissance": {
#         "distribution": 0.069406,
#         "total": 2_620_999,
#     },
#     "Benign": {
#         "distribution": 0.003576,
#         "total": 135_037,
#     },
# }
# samples = []

# for class_name, class_data in classes.items():
#     frac = (total_desire * class_data["distribution"]) / class_data["total"]
#     samples.append(df[df[target] == class_name].sample(frac=frac).compute())
#     print(samples[-1].shape, class_name)

# # 'Theft' needs special treatment because there are very few records of it, and the sample()
# # function doesn't behave exactly the way you would expect
# theft_class = df[df[target] == "Theft"].sample(frac=0.1).compute().sample(frac=0.1)
# print(theft_class.shape)
# samples.append(theft_class)

# combined_samples = dd.concat(samples)
# combined_samples.shape[0].compute()

(164988, 45) DDoS
(150053, 45) DoS
(23587, 45) Reconnaissance
(1216, 45) Benign
(24, 45)


# Saving the dataset

The final pre-processed dataset will be saved as a '.parquet' file.

In [74]:
# The output file is named after the dataset selected at the top of the
# notebook and is written to the current working directory.
file_path = f'./{current_dataset_name}.parquet'

# Persist the sample as a gzip-compressed parquet file.
stratified_sample.to_parquet(path=file_path, compression='gzip')