In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold

In [2]:
root = '../../../'

In [3]:
df_pre = pd.read_csv(root + 'datasets/multiclass/processed/CICDDoS_pre.csv', index_col=[0])

In [4]:
df_pre.shape

(5280, 78)

In [5]:
df = df_pre.drop(columns=[' Label'])

___
# Remove constant features

In [6]:
constant_features = [feature for feature in df.columns if df[feature].std() == 0]

In [7]:
len(constant_features)

12

In [8]:
constant_features

[' Bwd PSH Flags',
 ' Fwd URG Flags',
 ' Bwd URG Flags',
 'FIN Flag Count',
 ' PSH Flag Count',
 ' ECE Flag Count',
 'Fwd Avg Bytes/Bulk',
 ' Fwd Avg Packets/Bulk',
 ' Fwd Avg Bulk Rate',
 ' Bwd Avg Bytes/Bulk',
 ' Bwd Avg Packets/Bulk',
 'Bwd Avg Bulk Rate']

In [9]:
df_no_constant = df.drop(columns=constant_features)

In [10]:
df_no_constant.head()

Unnamed: 0_level_0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10431,1,2,0,2736.0,0.0,1368.0,1368.0,1368.0,0.0,0.0,...,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13090,1,2,0,2650.0,0.0,1325.0,1325.0,1325.0,0.0,0.0,...,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12990,2,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19624,46,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20691,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df_no_constant.shape

(5280, 65)

___
# Remove quasi-constant features

In [12]:
# find features with low variance
sel = VarianceThreshold(threshold=0.01)
sel.fit(df_no_constant)

# how many not quasi-constant?
sum(sel.get_support()) 

62

In [13]:
features_to_keep = df_no_constant.columns[sel.get_support()]
features_to_remove = df_no_constant.columns[~sel.get_support()]

features_to_remove

Index(['Fwd PSH Flags', ' SYN Flag Count', ' RST Flag Count'], dtype='object')

In [14]:
df_no_quasi = sel.transform(df_no_constant)

df_no_quasi = pd.DataFrame(df_no_quasi)
df_no_quasi.columns = features_to_keep

In [15]:
df_no_quasi.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,1.0,2.0,0.0,2736.0,0.0,1368.0,1368.0,1368.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,0.0,2650.0,0.0,1325.0,1325.0,1325.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2.0,0.0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,46.0,2.0,0.0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,2.0,0.0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df_no_quasi.shape

(5280, 62)

___
# Remove duplicated features

In [17]:
# check for duplicated features in the training set:

# create an empty dictionary, where we will store 
# the groups of duplicates
duplicated_feat_pairs = {}

# create an empty list to collect features
# that were found to be duplicated
_duplicated_feat = []


# iterate over every feature in our dataset:
for i in range(0, len(df_no_quasi.columns)):
    
    # this bit helps me understand where the loop is at:
    if i % 10 == 0:  
        print(i)
    
    # choose 1 feature:
    feat_1 = df_no_quasi.columns[i]
    
    # check if this feature has already been identified
    # as a duplicate of another one. If it was, it should be stored in
    # our _duplicated_feat list.
    
    # If this feature was already identified as a duplicate, we skip it, if
    # it has not yet been identified as a duplicate, then we proceed:
    if feat_1 not in _duplicated_feat:
    
        # create an empty list as an entry for this feature in the dictionary:
        duplicated_feat_pairs[feat_1] = []

        # now, iterate over the remaining features of the dataset:
        for feat_2 in df_no_quasi.columns[i + 1:]:

            # check if this second feature is identical to the first one
            if df_no_quasi[feat_1].equals(df_no_quasi[feat_2]):

                # if it is identical, append it to the list in the dictionary
                duplicated_feat_pairs[feat_1].append(feat_2)
                
                # and append it to our monitor list for duplicated variables
                _duplicated_feat.append(feat_2)
                
                # done!

0
10
20
30
40
50
60


In [18]:
# let's explore our list of duplicated features
len(_duplicated_feat)

7

In [19]:
# these are the ones:

_duplicated_feat

['Subflow Fwd Packets',
 ' Subflow Bwd Packets',
 ' Subflow Fwd Bytes',
 ' Subflow Bwd Bytes',
 ' Avg Fwd Segment Size',
 ' Avg Bwd Segment Size',
 ' Fwd Header Length.1']

In [20]:
duplicated_feat_pairs

{' Flow Duration': [],
 ' Total Fwd Packets': ['Subflow Fwd Packets'],
 ' Total Backward Packets': [' Subflow Bwd Packets'],
 'Total Length of Fwd Packets': [' Subflow Fwd Bytes'],
 ' Total Length of Bwd Packets': [' Subflow Bwd Bytes'],
 ' Fwd Packet Length Max': [],
 ' Fwd Packet Length Min': [],
 ' Fwd Packet Length Mean': [' Avg Fwd Segment Size'],
 ' Fwd Packet Length Std': [],
 'Bwd Packet Length Max': [],
 ' Bwd Packet Length Min': [],
 ' Bwd Packet Length Mean': [' Avg Bwd Segment Size'],
 ' Bwd Packet Length Std': [],
 'Flow Bytes/s': [],
 ' Flow Packets/s': [],
 ' Flow IAT Mean': [],
 ' Flow IAT Std': [],
 ' Flow IAT Max': [],
 ' Flow IAT Min': [],
 'Fwd IAT Total': [],
 ' Fwd IAT Mean': [],
 ' Fwd IAT Std': [],
 ' Fwd IAT Max': [],
 ' Fwd IAT Min': [],
 'Bwd IAT Total': [],
 ' Bwd IAT Mean': [],
 ' Bwd IAT Std': [],
 ' Bwd IAT Max': [],
 ' Bwd IAT Min': [],
 ' Fwd Header Length': [' Fwd Header Length.1'],
 ' Bwd Header Length': [],
 'Fwd Packets/s': [],
 ' Bwd Packets/s': []

In [21]:
df_no_duplicates = df_no_quasi.drop(columns=' Fwd Header Length.1')

In [22]:
df_no_duplicates.shape

(5280, 61)

___
# Save the dataset

In [23]:
df_pre.reset_index(inplace=True)
df_no_duplicates[' Label'] = df_pre[' Label']

In [24]:
df_no_duplicates.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,1.0,2.0,0.0,2736.0,0.0,1368.0,1368.0,1368.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
1,1.0,2.0,0.0,2650.0,0.0,1325.0,1325.0,1325.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
2,2.0,2.0,0.0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
3,46.0,2.0,0.0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS
4,1.0,2.0,0.0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DrDoS_DNS


In [25]:
df_no_duplicates.shape

(5280, 62)

In [26]:
df_no_duplicates.to_csv(root + "datasets/multiclass/processed/CICDDoS_basic.csv")