# Pre-processing datasets

Doing stratified sampling for the two datasets (NF-ToN-IoT-v2 and NF-UNSW-NB15-v2).

## IMPORTANT

**Only run "nf-ton-iot-v2" dataset if you have more than 16GB of RAM**

In [1]:
import dask.dataframe as dd
import pandas as pd
from typing import List, Tuple

## Reading datasets


In [2]:
kaggle = {
    'nf-unsw-nb15-v2': '/kaggle/input/nf-unsw-nb15-v2/fe6cb615d161452c_MOHANAD_A4706/data/NF-UNSW-NB15-v2.csv', 
    'nf-ton-iot-v2': '/kaggle/input/nf-ton-iot-v2/9bafce9d380588c2_MOHANAD_A4706/data/NF-ToN-IoT-v2.csv'
}

local = {
    'nf-unsw-nb15-v2': 'nf-unsw-nb15-v2/fe6cb615d161452c_MOHANAD_A4706/data/NF-UNSW-NB15-v2.csv', 
    'nf-ton-iot-v2': 'nf-ton-iot-v2/9bafce9d380588c2_MOHANAD_A4706/data/NF-ToN-IoT-v2.csv'
}

current_dataset_name = "nf-unsw-nb15-v2"
df = pd.read_csv(local[current_dataset_name])

df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,59.166.0.5,1305,149.171.126.8,21,6,1.0,9,1,193,3,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,59.166.0.5,1305,149.171.126.8,21,6,1.0,261,5,469,7,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,59.166.0.5,1305,149.171.126.8,21,6,1.0,481,9,750,11,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,59.166.0.5,1305,149.171.126.8,21,6,1.0,701,13,1054,15,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,59.166.0.5,1305,149.171.126.8,21,6,1.0,1031,19,1474,21,...,14480,13032,64256,251,0,0,0,230.0,0,Benign


# Knowing the dataset

In [3]:
total_count = df['Attack'].value_counts()
total_ratio = df["Attack"].value_counts(normalize=True).round(4) * 100
expected_sample_count = df.groupby("Attack").apply(
    lambda group: group.sample(frac=0.15), include_groups=True,
).droplevel(0)["Attack"].value_counts()

table_1 = pd.concat([total_count, total_ratio, expected_sample_count], axis=1)

print(f"{table_1}")

                  count  proportion   count
Attack                                     
Benign          2295222       96.02  344283
Exploits          31551        1.32    4733
Fuzzers           22310        0.93    3346
Generic           16560        0.69    2484
Reconnaissance    12779        0.53    1917
DoS                5794        0.24     869
Analysis           2299        0.10     345
Backdoor           2169        0.09     325
Shellcode          1427        0.06     214
Worms               164        0.01      25


  expected_sample_count = df.groupby("Attack").apply(


In [4]:
attacks_names = table_1.index.tolist()
distribution = list(zip(attacks_names,expected_sample_count.values))
distribution

[('Benign', 344283),
 ('Exploits', 4733),
 ('Fuzzers', 3346),
 ('Generic', 2484),
 ('Reconnaissance', 1917),
 ('DoS', 869),
 ('Analysis', 345),
 ('Backdoor', 325),
 ('Shellcode', 214),
 ('Worms', 25)]

# Removing Columns

Removing desired columns

In [5]:
df = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'])
df.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,1305,21,6,1.0,9,1,193,3,24,24,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,1305,21,6,1.0,261,5,469,7,24,24,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,1305,21,6,1.0,481,9,750,11,24,24,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,1305,21,6,1.0,701,13,1054,15,24,24,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,1305,21,6,1.0,1031,19,1474,21,24,24,...,14480,13032,64256,251,0,0,0,230.0,0,Benign


In [6]:
print(df.duplicated().sum())

403530


# Stratified_sample

Getting the desired number of groups following the strata sample.

In [22]:
def get_stratified_sample(df: pd.DataFrame, distribution: List[Tuple[str, int]]):
    attack_group = df.groupby("Attack")
    samples = []
    for attack_name, desired_total in distribution:
        samples.append(attack_group.get_group(attack_name).drop_duplicates().sample(n=desired_total))
    stratified_sample = pd.concat(samples)
    return stratified_sample
        
stratified_sample = get_stratified_sample(df, distribution)

In [25]:
stratified_sample["Attack"].value_counts()

Attack
Benign            344283
Exploits            4733
Fuzzers             3346
Generic             2484
Reconnaissance      1917
DoS                  869
Analysis             345
Backdoor             325
Shellcode            214
Worms                 25
Name: count, dtype: int64

# Saving the dataset

Saving the new dataset into parquet format to be more efficient

In [28]:
# file_path = f'./{current_dataset_name}.parquet'
file_path = f'./pre-processed/{current_dataset_name.upper()}.parquet'
stratified_sample.sample(frac=1.0).to_parquet(file_path, compression='gzip')