In [42]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None) #no max columns
pd.set_option('display.max_rows', None) #no max columns

# For the HLD-DDoSDN multiclass dataset, there are major inconsistencies between classes.
# All traffic for labels 1 (ICMP DDoS), 2 (TCP DDoS), and 3 (UDP DDoS) uses Protocol 6 (TCP),
# which contradicts the expected protocols (1 for ICMP, 6 for TCP, 17 for UDP) mentioned on the article.
#
# Similarly, in the binary classification datasets:
# - ICMP DDoS dataset (H-ICMP_...) uses TCP for label 0 (DDoS) and ICMP for label 1 (Normal), which is illogical.
# - UDP DDoS dataset (H-UDP_...) uses TCP for DDoS and UDP for normal traffic, again mismatching the expected attack type.
# - Only the TCP DDoS dataset is internally consistent but lacks protocol diversity.
#
# These inconsistencies cast doubt on the validity of the dataset and may mislead any machine learning models trained on it.

binary_path = "../1 dataset/HLD-DDoSDN_DatasetCSV/binary" # Path to binary datasets

In [60]:
##################################################################
#                                                 READ/LOAD BINARY
##################################################################

# Read multiple CSV files from folder
df = pd.concat(map(pd.read_csv, glob.glob(binary_path + "/*.csv"))) #output: (2413314, 72)
print("Dataset loaded...")

Dataset loaded...


In [62]:
# Check label by group

df.groupby("Label")["Protocol"].value_counts().unstack(fill_value=0)

# As mentioned previously, label 0 (representing DDoS traffic) contains 2,475,040 samples,
# all of which are TCP-based (Protocol 6), regardless of the DDoS type.
# This is inconsistent with the intended dataset design, which should include ICMP and UDP DDoS traffic.
#
# Meanwhile, label 1 (benign traffic) contains a realistic mix of protocols:
# - ICMP: 1,059,226 samples
# - TCP: 808,918 samples
# - UDP: 606,896 samples
#
# This discrepancy indicates a serious labeling or extraction issue in the dataset, where
# attack traffic does not represent protocol diversity, thereby undermining its suitability
# for training robust, real-world DDoS detection models.

Protocol,0,6,17
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,2475040,0
1,1059226,808918,606896


In [66]:
# dDisplay a summary of the DataFrame
df.info()

# There are 72 features, where:
# 71 columns are actual features (input variables).
# 1 column is the label (output/target).

<class 'pandas.core.frame.DataFrame'>
Index: 4950080 entries, 0 to 606895
Data columns (total 72 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Flow ID            int64  
 1   Src IP             int64  
 2   Src Port           int64  
 3   Dst IP             int64  
 4   Dst Port           int64  
 5   Protocol           int64  
 6   Timestamp          int64  
 7   Flow Duration      float64
 8   Tot Fwd Pkts       int64  
 9   Tot Bwd Pkts       int64  
 10  TotLen Fwd Pkts    int64  
 11  TotLen Bwd Pkts    int64  
 12  Fwd Pkt Len Max    int64  
 13  Fwd Pkt Len Min    int64  
 14  Fwd Pkt Len Mean   float64
 15  Fwd Pkt Len Std    float64
 16  Bwd Pkt Len Max    int64  
 17  Bwd Pkt Len Min    int64  
 18  Bwd Pkt Len Mean   int64  
 19  Bwd Pkt Len Std    int64  
 20  Flow Byts/s        float64
 21  Flow Pkts/s        float64
 22  Flow IAT Mean      float64
 23  Flow IAT Std       float64
 24  Flow IAT Max       float64
 25  Flow IAT Min       float

In [67]:
#### Number shows total rows and total columns available
#output shows total rows and total columns available

df.shape

#The output shows a total of 4950080 instances, with 72 features including Label.

(4950080, 72)

In [69]:
# Show all the unique values in the 'Protocol' column of your DataFrame.

print(df['Protocol'].unique())


[ 6  0 17]


In [70]:
df["Label"].value_counts()

# Display the label

Label
0    2475040
1    2475040
Name: count, dtype: int64

In [71]:
df.head(10)

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,437,198,33166,1,6633,6,0,68349.0,1,1,60,60,60,60,60.0,0.0,60,60,60,0,1755.695036,29.261584,68349.0,0.0,68349.0,68349.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,14.630792,14.630792,60,60,60.0,0.0,0.0,0,0,0,1,1,1,90.0,60.0,60,1,60,1,60,-1,7164,1,0,0,0,0,0.0,0.0,0.0,0.0,0
1,436,198,6633,1,33166,6,0,69939.0,1,1,154,154,154,154,154.0,0.0,154,154,154,0,4403.83763,28.596348,69939.0,0.0,69939.0,69939.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,14.298174,14.298174,154,154,154.0,0.0,0.0,0,0,0,1,1,1,231.0,154.0,154,1,154,1,154,-1,128,1,0,0,0,0,0.0,0.0,0.0,0.0,0
2,242,115,33166,1,6633,6,0,64508.0,1,1,60,60,60,60,60.0,0.0,60,60,60,0,1860.23439,31.003906,64508.0,0.0,64508.0,64508.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,15.501953,15.501953,60,60,60.0,0.0,0.0,0,0,0,1,1,1,90.0,60.0,60,1,60,1,60,-1,7164,1,0,0,0,0,0.0,0.0,0.0,0.0,0
3,241,115,6633,1,33166,6,0,64518.0,1,1,154,154,154,154,154.0,0.0,154,154,154,0,4773.861558,30.999101,64518.0,0.0,64518.0,64518.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,15.499551,15.499551,154,154,154.0,0.0,0.0,0,0,0,1,1,1,231.0,154.0,154,1,154,1,154,-1,128,1,0,0,0,0,0.0,0.0,0.0,0.0,0
4,223,106,33166,1,6633,6,0,67664.0,1,1,60,60,60,60,60.0,0.0,60,60,60,0,1773.468905,29.557815,67664.0,0.0,67664.0,67664.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,14.778908,14.778908,60,60,60.0,0.0,0.0,0,0,0,1,1,1,90.0,60.0,60,1,60,1,60,-1,7164,1,0,0,0,0,0.0,0.0,0.0,0.0,0
5,222,106,6633,1,33166,6,0,67659.0,1,1,154,154,154,154,154.0,0.0,154,154,154,0,4552.239909,29.559999,67659.0,0.0,67659.0,67659.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,14.78,14.78,154,154,154.0,0.0,0.0,0,0,0,1,1,1,231.0,154.0,154,1,154,1,154,-1,128,1,0,0,0,0,0.0,0.0,0.0,0.0,0
6,546,245,33166,1,6633,6,1,64100.0,1,1,60,60,60,60,60.0,0.0,60,60,60,0,1872.074883,31.201248,64100.0,0.0,64100.0,64100.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,15.600624,15.600624,60,60,60.0,0.0,0.0,0,0,0,1,1,1,90.0,60.0,60,1,60,1,60,-1,7164,1,0,0,0,0,0.0,0.0,0.0,0.0,0
7,545,245,6633,1,33166,6,1,64123.0,1,1,154,154,154,154,154.0,0.0,154,154,154,0,4803.268718,31.190057,64123.0,0.0,64123.0,64123.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0,0,1,32,32,15.595028,15.595028,154,154,154.0,0.0,0.0,0,0,0,1,1,1,231.0,154.0,154,1,154,1,154,-1,128,1,0,0,0,0,0.0,0.0,0.0,0.0,0
8,498,221,33166,1,6633,6,1,59716.0,3,1,180,60,60,60,60.0,0.0,60,60,60,0,4019.023377,66.983723,19905.33333,27584.63082,51440.0,254.0,59462.0,29731.0,30701.16223,51440.0,8022.0,0,0.0,0.0,0,0,0,1,96,32,50.237792,16.745931,60,60,60.0,0.0,0.0,0,0,0,1,1,0,75.0,60.0,60,3,180,1,60,-1,7164,3,0,0,0,0,0.0,0.0,0.0,0.0,0
9,497,221,6633,1,33166,6,1,59592.0,3,1,462,154,154,154,154.0,0.0,154,154,154,0,10336.95798,67.123104,19864.0,27437.39142,51240.0,372.0,59220.0,29610.0,30589.43935,51240.0,7980.0,0,0.0,0.0,0,0,0,1,96,32,50.342328,16.780776,154,154,154.0,0.0,0.0,0,0,0,1,1,0,192.5,154.0,154,3,462,1,154,-1,128,3,0,0,0,0,0.0,0.0,0.0,0.0,0


In [72]:
# Based on paper, the 0 is malicious and 1 is normal.
# In a binary experiment, the normal class is assigned a value of 1, and the malicious traffic is assigned a value of 0.
# In the multiclass experiment, every class is given a unique value. For example, 0, 1, 2, and 3 represent SDN normal traffic, ICMP, TCP, and UDP DDoS flooding attacks, respectively.

# The dataset contains 71 statistical features, with the last column indicating the class label.
# It can be used for both binary and multi-class classifications.
# In the binary experiment, normal traffic is assigned a value of 1, and malicious traffic a value of 0.
# In the multi-class experiment, each class is given a unique value:
# 0 for SDN normal traffic,
# 1 for ICMP DDoS,
# 2 for TCP DDoS, and
# 3 for UDP DDoS flooding attacks. -Quote from article.

In [74]:
# Export to csv file

# df.to_csv('../ds/hldddosdn_hlddos_combine_binary.csv', index=False)

In [76]:
# Display the protocol value counts.

df["Protocol"].value_counts()

# Legends:
# 6 - TCP
# 0 - ICMP
# 17 - UDP

Protocol
6     3283958
0     1059226
17     606896
Name: count, dtype: int64

In [78]:
# To check the protocol for each label.

# filtered_df = df[df["Label"] == 3]
# Get the value counts for the Protocol column
# protocol_counts = filtered_df["Protocol"].value_counts()
# print(protocol_counts)

# Analysis (HLD-DDoSDN multiclass dataset):
# for label 0 (Normal traffic), total protocol 6 (166667), 17 (166667), 0 (166667).
# for label 1 (ICMP traffic), total protocol 6 (500000)
# for label 2 (TCP traffic), total protocol 6 (500000)
# for label 3 (UDP traffic), total protocol 6 (500000)

In [79]:
#show the dataframe instances
#df.head() #show first 5 rows
#df.tail() #show last 5 rows

#df #show first 5 and last 5 rows

In [80]:
#df.describe()

In [81]:
##################################################################
#                          DATA CLEANING - CHECK FOR MISSING VALUE
##################################################################

#check all rows for missing value

#df.isna().sum() #check for missing value, or
df.isnull().sum() #check for missing value

Flow ID              0
Src IP               0
Src Port             0
Dst IP               0
Dst Port             0
Protocol             0
Timestamp            0
Flow Duration        0
Tot Fwd Pkts         0
Tot Bwd Pkts         0
TotLen Fwd Pkts      0
TotLen Bwd Pkts      0
Fwd Pkt Len Max      0
Fwd Pkt Len Min      0
Fwd Pkt Len Mean     0
Fwd Pkt Len Std      0
Bwd Pkt Len Max      0
Bwd Pkt Len Min      0
Bwd Pkt Len Mean     0
Bwd Pkt Len Std      0
Flow Byts/s          0
Flow Pkts/s          0
Flow IAT Mean        0
Flow IAT Std         0
Flow IAT Max         0
Flow IAT Min         0
Fwd IAT Tot          0
Fwd IAT Mean         0
Fwd IAT Std          0
Fwd IAT Max          0
Fwd IAT Min          0
Bwd IAT Tot          0
Bwd IAT Mean         0
Bwd IAT Std          0
Bwd IAT Max          0
Bwd IAT Min          0
Fwd PSH Flags        0
Bwd PSH Flags        0
Fwd Header Len       0
Bwd Header Len       0
Fwd Pkts/s           0
Bwd Pkts/s           0
Pkt Len Min          0
Pkt Len Max

In [82]:
#zero means no missing value - all rows is clean

In [83]:
#show columns name

df.columns

Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd Header Len',
       'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min',
       'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var',
       'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'PSH Flag Cnt',
       'ACK Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 'Fwd Seg Size Avg',
       'Bwd Seg Size Avg', 

In [90]:
##################################################################
#                             DATA PREPARATION - FEATURE SELECTION
##################################################################

# List of features to drop
# Since the 'Protocol' feature is inconsistent and unreliable, it will be dropped from the dataset.

features_to_drop = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', 'Protocol']

# Drop the specified features from the DataFrame
# Dropping the 'Protocol' feature due to inconsistencies and potential mislabeling.
new_df = df.drop(features_to_drop, axis=1)

In [92]:
# Number shows total rows and total columns available
# Check for dataframe details, supposed to show 66 features after features selection

new_df.shape

# Output: 4950080 instances, and 65 features

(4950080, 65)

In [94]:
#show latest columns name

new_df.columns

Index(['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts',
       'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min',
       'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s',
       'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'Down/Up Ratio',
       'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg',
       'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Subflow Bwd Pkts',
       'Subflow Bwd Byts', 

In [97]:
# Display protocol based on label

new_df.groupby("Label")["Protocol"].value_counts().unstack(fill_value=0)

Protocol,0,6,17
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,2475040,0
1,1059226,808918,606896


In [96]:
##################################################################
#                                      EXPORT THE DATASET (BINARY)
##################################################################

new_df.to_csv('../ds/hldddosdn_hlddos_combined_binary_cleaned_0d1n.csv', index=False)


In [None]:
################################################################## COMPLETED!

In [105]:
##################################################################
#                                 NOW WE GO FOR MULTICLASS DATASET
##################################################################

In [2]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split

path = "../1 dataset/HLD-DDoSDN_datasetCSV" # Path to multiclass datasets

# Read multiple CSV files from folder
multi_df = pd.concat(map(pd.read_csv, glob.glob(path + "/*.csv"))) #output: (2413314, 72)
print("Dataset loaded...")

Dataset loaded...


In [5]:
# Check label by group

multi_df.groupby("Label")["Protocol"].value_counts().unstack(fill_value=0)

Protocol,0,6,17
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,166666,166667,166667
1,0,500000,0
2,0,500000,0
3,0,500000,0


In [103]:
# check for value count for each class
# 0-Normal, 1-ICMP, 2-TCP, 3-UDP

multi_df["Label"].value_counts()

Label
0    500000
1    500000
2    500000
3    500000
Name: count, dtype: int64

In [6]:
#### Number shows total rows and total columns available
#check for dataframe details, supposed to show 66 features after features selection
#1mil instances, and 66 features
multi_df.shape

(2000000, 72)

In [7]:
##################################################################
#                             DATA PREPARATION - FEATURE SELECTION
##################################################################

# List of features to drop
# Since the 'Protocol' feature is inconsistent and unreliable, it will be dropped from the dataset.

features_to_drop = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', 'Protocol']

# Drop the specified features from the DataFrame
# Dropping the 'Protocol' feature due to inconsistencies and potential mislabeling.
new_multi_df = multi_df.drop(features_to_drop, axis=1)

In [15]:
new_multi_df.shape

(2000000, 65)

In [8]:
##################################################################
#                                       EXPORT THE DATASET (MULTI)
##################################################################

new_multi_df.to_csv('../ds/hldddosdn_hlddos_combined_multi_cleaned_0d1n.csv', index=False)

In [None]:
################################################################## COMPLETED!