In [1]:
### SINGLE FILE STATS ANALYSIS

In [9]:
import pandas as pd

# Dataset folders. binary_path is derived from multi_path so that both always
# agree on the spelling of the parent folder; a difference in letter case
# between the two would break loading on case-sensitive filesystems.
multi_path = "../1 dataset/HLD-DDoSDN_datasetCSV" # Path to multiclass datasets
binary_path = multi_path + "/binary" # Path to binary datasets

# Read single multiclass CSV file
# (file prefix "H-" = high-rate capture, "L-" = low-rate capture)

multi_single_df1 = pd.read_csv(multi_path + "/H-All_DDoS_Flooding_Attack_balanced_data.csv")
multi_single_df2 = pd.read_csv(multi_path + "/L-All_DDoS_Flooding_Attack_balanced_data.csv")

############################################

# Read single binary classification CSV file
# df1-df3 are the high-rate ICMP/TCP/UDP files, df4-df6 the low-rate ones.

binary_single_df1 = pd.read_csv(binary_path + "/H-ICMP_DDoS_Flooding_Attack_balanced_data.csv")
binary_single_df2 = pd.read_csv(binary_path + "/H-TCP_DDoS_Flooding_Attack_balanced_data.csv")
binary_single_df3 = pd.read_csv(binary_path + "/H-UDP_DDoS_Flooding_Attack_balanced_data.csv")
binary_single_df4 = pd.read_csv(binary_path + "/L-ICMP_DDoS_Flooding_Attack_balanced_data.csv")
binary_single_df5 = pd.read_csv(binary_path + "/L-TCP_DDoS_Flooding_Attack_balanced_data.csv")
binary_single_df6 = pd.read_csv(binary_path + "/L-UDP_DDoS_Flooding_Attack_balanced_data.csv")

In [10]:
def analyze_class_dataset(df, dataset_name):
    """Print a label/protocol statistics report for one dataset to stdout.

    The report has three parts: the row count per ``Label`` value, a
    Label-by-Protocol count table (combinations that never occur are shown
    as 0), and a short summary of the frame's size and cardinalities.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to summarise; it is indexed by the ``Label`` and
        ``Protocol`` columns, so both must be present.
    dataset_name : str
        Name printed in the report header.

    Returns
    -------
    None
        Everything is written with ``print``.
    """
    print(f"Dataset: {dataset_name}")

    # Part 1: how many rows carry each label.
    print("\n--- Label Distribution ---")
    label_counts = df["Label"].value_counts()
    print(label_counts)

    # Part 2: per-label protocol counts, pivoted so protocols become columns.
    print("\n--- Protocol Distribution by Label ---")
    per_label_protocols = df.groupby("Label")["Protocol"].value_counts()
    print(per_label_protocols.unstack(fill_value=0))

    # Part 3: overall size and cardinalities. "Total features" is the raw
    # column count of the frame.
    print("\n--- Summary ---")
    row_count, column_count = df.shape
    summary_lines = [
        f"Total samples: {row_count:,}",
        f"Total features: {column_count}",
        f"Unique labels: {df['Label'].nunique()}",
        f"Unique protocols: {df['Protocol'].nunique()}",
        "Stats analysis completed!",
    ]
    print("\n".join(summary_lines))
    print("-" * 60, "\n")

In [14]:
########################
#    MULTI-CLASS DATASET
########################

In [15]:
# Report the same statistics for the high-rate and low-rate multiclass files.
print("Multiclass Dataset Stats")
for frame, csv_name in (
    (multi_single_df1, "H-All_DDoS_Flooding_Attack_balanced_data.csv"),
    (multi_single_df2, "L-All_DDoS_Flooding_Attack_balanced_data.csv"),
):
    analyze_class_dataset(frame, csv_name)

Multiclass Dataset Stats
Dataset: H-All_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    250000
1    250000
2    250000
3    250000
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol     0       6      17
Label                         
0         83333   83333  83334
1             0  250000      0
2             0  250000      0
3             0  250000      0

--- Summary ---
Total samples: 1,000,000
Total features: 72
Unique labels: 4
Unique protocols: 3
Stats analysis completed!
------------------------------------------------------------ 

Dataset: L-All_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    250000
1    250000
2    250000
3    250000
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol     0       6      17
Label                         
0         83333   83334  83333
1             0  250000      0
2             0  250000      0
3             0  250000      0

--- Summary

In [16]:
# Multiclass Dataset Stats Analysis:

# In both the High-Rate and Low-Rate "All Attacks" datasets, the authors define four classes (Normal, ICMP DDoS, TCP DDoS, and
# UDP DDoS), each with 250,000 samples. Combining both datasets gives 2,000,000 samples in total.

# Based on the article, in the binary datasets 0 is malicious and 1 is normal; in the multiclass datasets 0 is normal and 1-3 are the attack classes.
# Author quote:
# In a binary experiment, the normal class is assigned a value of 1, and the malicious traffic is assigned a value of 0.
# In the multiclass experiment, every class is given a unique value. For example, 0, 1, 2, and 3 represent SDN normal traffic, 
# ICMP, TCP, and UDP DDoS flooding attacks, respectively.

# Based on the protocol distribution above, labels 1, 2, and 3 appear to be captured as TCP (protocol 6) 
# due to OpenFlow encapsulation, which delivers packet-in messages over a TCP channel (port 6653). 
# As a result, the 'Protocol' feature does not reflect the actual nature of the DDoS attack (e.g., ICMP or UDP flood). 
# Therefore, it is advisable to exclude this feature from the model, as it may introduce bias or mislead the learning process.

In [18]:
########################
#   BINARY-CLASS DATASET
########################

In [13]:
# Report the same statistics for each high-rate binary file (ICMP, TCP, UDP).
print("Binary Class Dataset Stats - High DDOS")
for frame, csv_name in (
    (binary_single_df1, "H-ICMP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df2, "H-TCP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df3, "H-UDP_DDoS_Flooding_Attack_balanced_data.csv"),
):
    analyze_class_dataset(frame, csv_name)

Binary Class Dataset Stats - High DDOS
Dataset: H-ICMP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    529613
1    529613
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       0       6
Label                   
0              0  529613
1         529613       0

--- Summary ---
Total samples: 1,059,226
Total features: 72
Unique labels: 2
Unique protocols: 2
Stats analysis completed!
------------------------------------------------------------ 

Dataset: H-TCP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    373596
1    373596
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       6
Label           
0         373596
1         373596

--- Summary ---
Total samples: 747,192
Total features: 72
Unique labels: 2
Unique protocols: 1
Stats analysis completed!
------------------------------------------------------------ 

Dataset: H-UDP_DDoS_Flooding_Attack_balanced_data.csv

--- Label

In [6]:
# Report the same statistics for each low-rate binary file (ICMP, TCP, UDP).
print("Binary Class Dataset Stats - Low DDOS")
for frame, csv_name in (
    (binary_single_df4, "L-ICMP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df5, "L-TCP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df6, "L-UDP_DDoS_Flooding_Attack_balanced_data.csv"),
):
    analyze_class_dataset(frame, csv_name)

Binary Class Dataset Stats - Low DDOS
Dataset: L-ICMP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    529613
1    529613
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       0       6
Label                   
0              0  529613
1         529613       0

--- Summary ---
Total samples: 1,059,226
Total features: 72
Unique labels: 2
Unique protocols: 2
Stats analysis completed!
------------------------------------------------------------ 

Dataset: L-TCP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    435322
1    435322
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       6
Label           
0         435322
1         435322

--- Summary ---
Total samples: 870,644
Total features: 72
Unique labels: 2
Unique protocols: 1
Stats analysis completed!
------------------------------------------------------------ 

Dataset: L-UDP_DDoS_Flooding_Attack_balanced_data.csv

--- Label 

In [None]:
# Binary Dataset Stats Analysis:

# The HLD-DDoSDN dataset includes six binary-classification scenarios:
# (i) High-rate ICMP, (ii) Low-rate ICMP, (iii) High-rate UDP, (iv) Low-rate UDP, (v) High-rate TCP, and (vi) Low-rate TCP DDoS flooding attacks.

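# ICMP DDoS Datasets (High & Low):
# Label 1 is recorded exclusively with Protocol = 0, and Label 0 exclusively with Protocol = 6 (TCP)
# NOTE: under the authors' binary encoding (1 = normal, 0 = malicious), Label 1 is the normal class and
# Label 0 is the ICMP attack class -- confirm against the dataset documentation before relying on either reading
# Protocol = 0 is not ICMP's IP protocol number (1); this likely indicates packet parsing failure due to OpenFlow encapsulation
# Because Protocol separates the two labels perfectly here, a model could learn it as a shortcut instead of real traffic behaviour
# Conclusion: The Protocol feature is unreliable for ICMP attack detection in this dataset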

In [None]:
# TCP DDoS Datasets (High & Low)
# Both attack and normal traffic are recorded with Protocol = 6 (TCP)
# This makes the Protocol feature useless for distinguishing between classes
# Conclusion: Protocol is not informative for TCP DDoS classification

In [None]:
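# UDP DDoS Datasets (High & Low)
# Label 1 appears as Protocol = 17 (UDP)
# Label 0 appears as Protocol = 6 (likely OpenFlow controller traffic)
# NOTE: under the authors' binary encoding (1 = normal, 0 = malicious), Label 1 is normal UDP traffic and
# Label 0 is the UDP attack class -- confirm against the dataset documentation before relying on either reading
# Conclusion: Protocol separates the two labels in the UDP datasets, but its reliability depends on whether the packets are encapsulated or directly observed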

In [None]:
# Overall Summary
# The Protocol feature:
# - Fails for ICMP, due to parsing issues (recorded as 0)
# - Is redundant for TCP, as all traffic uses protocol 6
# - Is only partially useful for UDP, depending on data extraction
# Recommendation: Exclude the Protocol feature from model training across all datasets, unless validated specifically for UDP traffic