# Pre-processing NF-BoT-IoT-v2 dataset

Stratified sampling of the NF-BoT-IoT-v2 dataset. This dataset needs special treatment (it is read with dask instead of plain pandas); otherwise it will not fit in RAM.

In [37]:
import dask.dataframe as dd
import pandas as pd

# Reading dataset

Reading the dataset using dask.

In [38]:
# Build the dask DataFrame over the full NF-BoT-IoT-v2 CSV file.
dataset_csv = './NF-BoT-IoT-v2/befb58edf3428167_MOHANAD_A4706/data/NF-BoT-IoT-v2.csv'
df = dd.read_csv(dataset_csv)  # records: 37_763_497

# Preview the first rows.
df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,192.168.100.149,63519,192.168.100.6,80,6,7.0,140,1,0,0,...,512,0,0,0,0,0,0,0,1,DoS
1,192.168.100.147,60774,192.168.100.5,80,6,7.0,280,2,0,0,...,512,0,0,0,0,0,0,0,1,DoS
2,192.168.100.148,32534,192.168.100.5,80,6,7.0,280,2,0,0,...,512,0,0,0,0,0,0,0,1,DoS
3,192.168.100.147,52113,192.168.100.7,80,17,188.0,56,2,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
4,192.168.100.150,36780,192.168.100.6,80,17,188.0,56,2,0,0,...,0,0,0,0,0,0,0,0,1,DDoS


In [39]:
# Dataset identifier; reused at the end of the notebook to name the output .parquet file.
current_dataset_name = "NF-BoT-IoT-v2"

## Knowing the dataset

Checking the number of rows and whether there are any missing values.

In [44]:
# Total number of records in the full dataset.
len(df)

37763497

In [47]:
# Count missing values per column across the whole dataset.
df.isna().sum().compute()

IPV4_SRC_ADDR                  0
L4_SRC_PORT                    0
IPV4_DST_ADDR                  0
L4_DST_PORT                    0
PROTOCOL                       0
L7_PROTO                       0
IN_BYTES                       0
IN_PKTS                        0
OUT_BYTES                      0
OUT_PKTS                       0
TCP_FLAGS                      0
CLIENT_TCP_FLAGS               0
SERVER_TCP_FLAGS               0
FLOW_DURATION_MILLISECONDS     0
DURATION_IN                    0
DURATION_OUT                   0
MIN_TTL                        0
MAX_TTL                        0
LONGEST_FLOW_PKT               0
SHORTEST_FLOW_PKT              0
MIN_IP_PKT_LEN                 0
MAX_IP_PKT_LEN                 0
SRC_TO_DST_SECOND_BYTES        0
DST_TO_SRC_SECOND_BYTES        0
RETRANSMITTED_IN_BYTES         0
RETRANSMITTED_IN_PKTS          0
RETRANSMITTED_OUT_BYTES        0
RETRANSMITTED_OUT_PKTS         0
SRC_TO_DST_AVG_THROUGHPUT      0
DST_TO_SRC_AVG_THROUGHPUT      0
NUM_PKTS_U

# NF-BoT-IoT-v2


|   **Attack**   | **Amount**  | **distribution**  
|:--------------:|:----------: |:-----------------:
|      DDoS      |  18_331_847 |      0.485438     
|       DoS      |  16_673_183 |      0.441516     
| Reconnaissance |   2_620_999 |      0.069406     
|     Benign     |   135_037   |      0.003576     
|      Theft     |    2_431    |      0.000064     

In [48]:
# Absolute number of records per attack class.
df.Attack.value_counts().compute()

Attack
DDoS              18331847
DoS               16673183
Reconnaissance     2620999
Benign              135037
Theft                 2431
Name: count, dtype: int64

In [49]:
# Share of each attack class relative to the whole dataset.
df.Attack.value_counts(normalize=True).compute()

Attack
DDoS              0.485438
DoS               0.441516
Reconnaissance    0.069406
Benign            0.003576
Theft             0.000064
Name: proportion, dtype: float64

In [50]:
target = "Attack"
total_desire = 339_871
# Fixed seed so that Restart & Run All draws the same subset every time.
random_seed = 42

# Per-class row totals and shares, copied from the value_counts() outputs above.
classes = {
    "DDoS": {
        "distribution": 0.485438,
        "total": 18_331_847,
    },
    "DoS": {
        "distribution": 0.441516,
        "total": 16_673_183,
    },
    "Reconnaissance": {
        "distribution": 0.069406,
        "total": 2_620_999,
    },
    "Benign": {
        "distribution": 0.003576,
        "total": 135_037,
    },
}
samples = []

for class_name, class_data in classes.items():
    # Rows wanted for this class (its share of total_desire) divided by the
    # rows available gives the fraction of the class to draw.
    frac = (total_desire * class_data["distribution"]) / class_data["total"]
    class_sample = (
        df[df[target] == class_name]
        .sample(frac=frac, random_state=random_seed)
        .compute()
    )
    samples.append(class_sample)
    print(class_sample.shape, class_name)

# 'Theft' needs special treatment because it has very few records (2_431).
# All of its rows are pulled into pandas and sampled there in a single 1% draw,
# which replaces the previous two chained 10% draws (the first one done in dask).
# NOTE(review): dask appears to apply `frac` partition by partition, which is
# presumably why sampling such a tiny class directly in dask misbehaves - confirm.
theft_class = (
    df[df[target] == "Theft"]
    .compute()
    .sample(frac=0.01, random_state=random_seed)
)
print(theft_class.shape)
samples.append(theft_class)

(164988, 45) DDoS
(150053, 45) DoS
(23587, 45) Reconnaissance
(1216, 45) Benign
(24, 45)


In [51]:
# Stack the per-class samples into one frame and report its row count.
combined_samples = dd.concat(samples)
len(combined_samples)

339868

In [None]:
import gc

# Drop the references first and collect afterwards: objects that are still
# bound to a name cannot be reclaimed, so collecting before `del` is too early.
del samples
del df

# Trailing ';' keeps the notebook from echoing the number of collected objects.
gc.collect();

In [54]:
# Materialise the concatenated sample into memory; the cells below use it as a
# pandas DataFrame.
# NOTE: this rebinds the same name, so the cell cannot be re-run on its own -
# a pandas DataFrame has no .compute() method.
combined_samples = combined_samples.compute()
combined_samples

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,192.168.100.149,57338,192.168.100.7,80,17,188.0,84,3,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
2,192.168.100.150,30962,192.168.100.3,80,17,188.0,56,2,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
3,192.168.100.150,61157,192.168.100.3,80,17,188.0,56,2,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
6,192.168.100.148,51294,192.168.100.3,80,17,188.0,56,2,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
9,192.168.100.147,49667,192.168.100.7,80,17,188.0,84,3,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364524,192.168.100.150,36802,192.168.100.3,22,6,92.0,1658,15,3380,16,...,29200,28960,0,0,0,0,0,0,1,Theft
376805,192.168.100.6,0,192.168.100.149,0,1,0.0,84000,1500,0,0,...,0,0,771,3,0,0,0,0,1,Theft
395093,192.168.100.3,43171,192.168.100.150,4433,6,0.0,60,1,40,1,...,29200,0,0,0,0,0,0,0,1,Theft
398276,192.168.100.3,42966,192.168.100.150,4433,6,0.0,60,1,40,1,...,29200,0,0,0,0,0,0,0,1,Theft


In [61]:
# Remove the source/destination IP address columns from the sample.
address_columns = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR']
combined_samples = combined_samples.drop(address_columns, axis=1)
combined_samples.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
194244,26893,80,17,188.0,56,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
208499,55939,80,6,7.0,280,2,0,0,2,2,...,512,0,35840,140,0,0,0,0,1,DoS
361571,39975,80,17,188.0,56,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
131692,13530,80,17,188.0,56,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,DDoS
241474,61124,80,17,188.0,56,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,DDoS


In [62]:
# Shuffle the rows so the classes are no longer grouped in concatenation order.
# The fixed seed keeps the row order of the saved file identical between runs.
combined_samples = combined_samples.sample(frac=1, random_state=42)

## Removing duplicate records

In [70]:
# Flag rows that repeat an earlier row, then report how many there are
# and which attack classes they belong to.
duplicate_mask = combined_samples.duplicated()
print(duplicate_mask.sum())
combined_samples[duplicate_mask]["Attack"].value_counts()

955


Attack
DDoS              509
DoS               420
Reconnaissance     26
Name: count, dtype: int64

In [71]:
# Keep only the first occurrence of every repeated row.
combined_samples = combined_samples[~combined_samples.duplicated()]

In [73]:
# Class balance of the de-duplicated sample.
combined_samples.Attack.value_counts()

Attack
DDoS              164479
DoS               149633
Reconnaissance     23561
Benign              1216
Theft                 24
Name: count, dtype: int64

# Saving the dataset

The final pre-processed dataset will be saved as a '.parquet' file.

In [74]:
# Write the pre-processed sample next to the notebook, named after the dataset.
file_path = f'./{current_dataset_name}.parquet'
combined_samples.to_parquet(path=file_path, compression='gzip')