# Pre-processing datasets

Stratified sampling of the two datasets (NF-ToN-IoT-v2 and NF-UNSW-NB15-v2). The notebook processes one dataset per run; select it in the "Reading datasets" cell.

In [1]:
import dask.dataframe as dd
import pandas as pd

## Reading datasets


In [2]:
# Select the CSV to load by keeping exactly one of the two paths active.
csv_path = '/kaggle/input/nf-ton-iot-v2/9bafce9d380588c2_MOHANAD_A4706/data/NF-ToN-IoT-v2.csv'  # records: 16_940_496
# csv_path = '/kaggle/input/nf-unsw-nb15-v2/fe6cb615d161452c_MOHANAD_A4706/data/NF-UNSW-NB15-v2.csv'  # records: 2_390_275
df = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,192.168.1.193,49235,192.168.1.33,4444,6,0.0,155392,202,34552,149,...,45555,4805,0,0,0,0,0,0,1,ransomware
1,192.168.1.193,49228,192.168.1.152,1880,6,0.0,1600,40,35741,65,...,16425,237,0,0,0,0,0,0,0,Benign
2,192.168.1.152,0,192.168.1.193,0,1,0.0,212,2,0,0,...,0,0,771,3,0,0,0,0,0,Benign
3,192.168.1.169,65317,239.255.255.250,1900,17,0.0,165,1,0,0,...,0,0,0,0,0,0,0,0,0,Benign
4,192.168.1.79,60766,192.168.1.255,15600,17,0.0,63,1,0,0,...,0,0,0,0,0,0,0,0,0,Benign


In [3]:
# Name of the dataset being processed: "NF-UNSW-NB15-v2" | "NF-ToN-IoT-v2".
# The value is used to build the output file name in the saving cell, so it
# has to be kept in sync with the CSV that was actually loaded.
current_dataset_name = "NF-ToN-IoT-v2"

# Knowing the dataset

Check if there are missing values.

In [4]:
# (rows, columns) of the loaded dataset.
df.shape

(16940496, 45)

In [5]:
# Number of missing values in each column.
df.isnull().sum()

IPV4_SRC_ADDR                  0
L4_SRC_PORT                    0
IPV4_DST_ADDR                  0
L4_DST_PORT                    0
PROTOCOL                       0
L7_PROTO                       0
IN_BYTES                       0
IN_PKTS                        0
OUT_BYTES                      0
OUT_PKTS                       0
TCP_FLAGS                      0
CLIENT_TCP_FLAGS               0
SERVER_TCP_FLAGS               0
FLOW_DURATION_MILLISECONDS     0
DURATION_IN                    0
DURATION_OUT                   0
MIN_TTL                        0
MAX_TTL                        0
LONGEST_FLOW_PKT               0
SHORTEST_FLOW_PKT              0
MIN_IP_PKT_LEN                 0
MAX_IP_PKT_LEN                 0
SRC_TO_DST_SECOND_BYTES        0
DST_TO_SRC_SECOND_BYTES        0
RETRANSMITTED_IN_BYTES         0
RETRANSMITTED_IN_PKTS          0
RETRANSMITTED_OUT_BYTES        0
RETRANSMITTED_OUT_PKTS         0
SRC_TO_DST_AVG_THROUGHPUT      0
DST_TO_SRC_AVG_THROUGHPUT      0
NUM_PKTS_U

In [6]:
# Number of records per attack class in the full dataset.
df["Attack"].value_counts()

Attack
Benign        6099469
scanning      3781419
xss           2455020
ddos          2026234
password      1153323
dos            712609
injection      684465
backdoor        16809
mitm             7723
ransomware       3425
Name: count, dtype: int64

# Stratified sampling

Getting the ratio of classes in the original dataset.

The function get_stratified_sample() prints the class ratios of the original dataset and of the sample, and returns the sample for the desired fraction.

In [7]:
# Relative frequency of each attack class in the full dataset
# (the reference the stratified sample should reproduce).
ratio = df["Attack"].value_counts(normalize=True)

ratio

Attack
Benign        0.360053
scanning      0.223218
xss           0.144920
ddos          0.119609
password      0.068081
dos           0.042065
injection     0.040404
backdoor      0.000992
mitm          0.000456
ransomware    0.000202
Name: proportion, dtype: float64

In [8]:
def get_stratified_sample(df, feature, sample_frac, random_state=None):
    """Draw a stratified random sample of ``df`` and print a ratio report.

    Rows are grouped by the values of ``feature`` and the same fraction is
    sampled from every group, so the class proportions of the sample follow
    those of ``df`` (up to per-class rounding of the sample size).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from.
    feature : str
        Name of the column whose values define the strata.
    sample_frac : float
        Fraction of each stratum to keep.
    random_state : int, optional
        Seed forwarded to the sampler so the draw can be reproduced.
        ``None`` (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all columns of ``df`` and their original
        index labels.

    Notes
    -----
    Prints a table comparing the class percentages of ``df`` ("Expected")
    with those of the sample ("Stratified"), followed by the per-class row
    counts of the sample. Very small classes may end up with no rows when
    their size multiplied by ``sample_frac`` rounds to zero.
    """
    # GroupBy.sample keeps the grouping column and the original index, so no
    # apply/droplevel round trip is needed.
    stratified_sample = df.groupby(feature).sample(
        frac=sample_frac, random_state=random_state
    )

    expected_ratio = df[feature].value_counts(normalize=True).round(4) * 100
    stratified_ratio = stratified_sample[feature].value_counts(normalize=True).round(4) * 100

    # Label the two columns explicitly; concatenating the raw Series would
    # leave both of them named "proportion".
    ratios_df = pd.DataFrame(
        {'Expected': expected_ratio, 'Stratified': stratified_ratio}
    ).reindex(expected_ratio.index)

    # Report counts for the stratification column that was actually used.
    print(f'{ratios_df}\n\nAmount of classes:\n{stratified_sample[feature].value_counts()}')

    return stratified_sample
    
# Keep 1.5% of every attack class.
# NOTE(review): no seed is passed, so every run draws a different sample.
stratified_sample = get_stratified_sample(df, feature="Attack", sample_frac=0.015)

            proportion  proportion
Attack                            
Benign           36.01       36.01
scanning         22.32       22.32
xss              14.49       14.49
ddos             11.96       11.96
password          6.81        6.81
dos               4.21        4.21
injection         4.04        4.04
backdoor          0.10        0.10
mitm              0.05        0.05
ransomware        0.02        0.02

Amount of classes:
Attack
Benign        91492
scanning      56721
xss           36825
ddos          30394
password      17300
dos           10689
injection     10267
backdoor        252
mitm            116
ransomware       51
Name: count, dtype: int64


In [20]:
# (rows, columns) of the stratified sample.
# NOTE(review): the recorded output (246770, 43) comes from out-of-order
# execution (In [20]): it already reflects the two dropped IP columns and the
# 7337 removed duplicates. On a fresh top-to-bottom run this cell would show
# the 45 columns of the raw frame and the full sample size.
stratified_sample.shape

(246770, 43)

# Removing Columns

Removing the unwanted columns (the source and destination IP addresses), shuffling the stratified sample, and dropping duplicate rows.

In [11]:
# The source/destination IP address columns are removed from the sample.
address_columns = ['IPV4_SRC_ADDR', 'IPV4_DST_ADDR']
stratified_sample = stratified_sample.drop(columns=address_columns)

In [12]:
# Shuffle all rows (frac=1 keeps every row). A fixed seed makes the row order
# reproducible, which also fixes which copy of a duplicated row survives the
# later drop_duplicates() call.
stratified_sample = stratified_sample.sample(frac=1, random_state=42)
stratified_sample

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
5787015,40637,8086,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
6052503,54473,4111,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
2828423,45025,24619,6,0.0,48,1,0,0,2,2,...,4096,0,0,0,0,0,0,0,1,scanning
4177606,45577,8099,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
4835504,36945,12000,6,0.0,44,1,40,1,22,2,...,1024,0,0,0,0,0,0,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14059700,56554,443,6,91.0,276,5,120,2,18,18,...,29200,28960,0,0,0,0,0,0,1,ddos
9319187,46295,53,17,0.0,68,1,100,1,0,0,...,0,0,0,0,14668,1,11,0,1,xss
10782789,58528,80,6,7.0,1843,7,590,5,27,27,...,29200,5792,0,0,0,0,0,0,1,xss
10611187,58442,53,17,0.0,68,1,68,1,0,0,...,0,0,0,0,7691,28,0,0,1,xss


In [13]:
# Flag every row that repeats an earlier row, then report the total and the
# per-class breakdown of those repeats.
duplicate_mask = stratified_sample.duplicated()
print(duplicate_mask.sum())
stratified_sample[duplicate_mask]["Attack"].value_counts()

7337


Attack
scanning     5985
Benign       1160
ddos           94
password       55
dos            32
injection      10
xss             1
Name: count, dtype: int64

In [15]:
# Inspect the rows flagged as duplicates of an earlier row.
stratified_sample.loc[stratified_sample.duplicated()]

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
9188626,41725,443,6,91.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,1,scanning
5039986,60727,80,6,7.0,40,1,0,0,16,16,...,1024,0,0,0,0,0,0,0,1,scanning
5815048,49945,5801,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
7438176,33322,443,6,91.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,1,scanning
7537959,38754,443,6,91.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,1,scanning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4793050,57507,2068,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
3896751,52089,777,6,0.0,44,1,40,1,22,2,...,1024,0,0,0,0,0,0,0,0,Benign
3767512,40617,80,6,7.0,40,1,0,0,16,16,...,1024,0,0,0,0,0,0,0,1,scanning
2233584,63998,80,6,7.0,40,1,0,0,16,16,...,1024,0,0,0,0,0,0,0,1,scanning


In [16]:
# Keep the first occurrence of every distinct row.
# NOTE(review): deduplicating after sampling shifts the class proportions
# (in the recorded outputs "scanning" goes from 22.32% to 20.56%), so the
# result is no longer exactly stratified.
stratified_sample = stratified_sample.drop_duplicates(keep="first")

In [17]:
# Per-class row counts of the sample after duplicate removal.
stratified_sample["Attack"].value_counts()

Attack
Benign        90332
scanning      50736
xss           36824
ddos          30300
password      17245
dos           10657
injection     10257
backdoor        252
mitm            116
ransomware       51
Name: count, dtype: int64

In [19]:
# Per-class percentages of the sample after duplicate removal
# (compare with the "Expected" ratios printed by get_stratified_sample).
stratified_sample["Attack"].value_counts(normalize=True).round(4) * 100

Attack
Benign        36.61
scanning      20.56
xss           14.92
ddos          12.28
password       6.99
dos            4.32
injection      4.16
backdoor       0.10
mitm           0.05
ransomware     0.02
Name: proportion, dtype: float64

In [33]:
# Final look at the frame that is written to disk in the next section.
stratified_sample

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
5787015,40637,8086,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
6052503,54473,4111,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
2828423,45025,24619,6,0.0,48,1,0,0,2,2,...,4096,0,0,0,0,0,0,0,1,scanning
4177606,45577,8099,6,0.0,44,1,0,0,2,2,...,1024,0,0,0,0,0,0,0,0,Benign
4835504,36945,12000,6,0.0,44,1,40,1,22,2,...,1024,0,0,0,0,0,0,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14059700,56554,443,6,91.0,276,5,120,2,18,18,...,29200,28960,0,0,0,0,0,0,1,ddos
9319187,46295,53,17,0.0,68,1,100,1,0,0,...,0,0,0,0,14668,1,11,0,1,xss
10782789,58528,80,6,7.0,1843,7,590,5,27,27,...,29200,5792,0,0,0,0,0,0,1,xss
10611187,58442,53,17,0.0,68,1,68,1,0,0,...,0,0,0,0,7691,28,0,0,1,xss


# Saving the dataset

Saving the new dataset in Parquet format (gzip-compressed), which is more efficient to store and load than CSV.

In [37]:
# Write the processed sample next to the notebook, named after the dataset.
file_path = './' + current_dataset_name + '.parquet'
stratified_sample.to_parquet(path=file_path, compression='gzip')