In [1]:
### SINGLE FILE STATS ANALYSIS

In [2]:
import pandas as pd

# Dataset locations, relative to this notebook.
# NOTE(review): the two paths spell the dataset folder with different casing
# ("datasetCSV" vs "DatasetCSV"). Presumably only one matches the directory on disk,
# which matters on case-sensitive filesystems — TODO confirm and unify.
multi_path = "../1 dataset/HLD-DDoSDN_datasetCSV"  # multiclass CSV files
binary_path = "../1 dataset/HLD-DDoSDN_DatasetCSV/binary"  # binary-classification CSV files

# Multiclass files: the H-All and L-All CSVs, loaded in that order.
multi_single_df1, multi_single_df2 = (
    pd.read_csv(multi_path + csv_file)
    for csv_file in (
        "/H-All_DDoS_Flooding_Attack_balanced_data.csv",
        "/L-All_DDoS_Flooding_Attack_balanced_data.csv",
    )
)

# Binary files: df1-df3 are the H-* ICMP/TCP/UDP CSVs, df4-df6 the L-* ICMP/TCP/UDP CSVs.
(
    binary_single_df1,
    binary_single_df2,
    binary_single_df3,
    binary_single_df4,
    binary_single_df5,
    binary_single_df6,
) = (
    pd.read_csv(binary_path + csv_file)
    for csv_file in (
        "/H-ICMP_DDoS_Flooding_Attack_balanced_data.csv",
        "/H-TCP_DDoS_Flooding_Attack_balanced_data.csv",
        "/H-UDP_DDoS_Flooding_Attack_balanced_data.csv",
        "/L-ICMP_DDoS_Flooding_Attack_balanced_data.csv",
        "/L-TCP_DDoS_Flooding_Attack_balanced_data.csv",
        "/L-UDP_DDoS_Flooding_Attack_balanced_data.csv",
    )
)

In [3]:
def analyze_class_dataset(df, dataset_name):
    """Print a label/protocol overview of one dataset to stdout.

    The report contains, in order: the per-label row counts, a table of
    protocol counts per label (combinations that never occur are shown as 0),
    and a short summary (row count, column count, number of distinct labels
    and protocols), followed by a dashed separator line.

    Args:
        df: DataFrame that contains a "Label" column and a "Protocol" column.
        dataset_name: Text shown in the report header to identify the dataset.

    Returns:
        None. Everything is written with print().
    """
    print(f"Dataset: {dataset_name}")

    print("\n--- Label Distribution ---")
    label_counts = df["Label"].value_counts()
    print(label_counts)

    print("\n--- Protocol Distribution by Label ---")
    # One row per label, one column per protocol value; missing pairs become 0.
    counts_per_label = df.groupby("Label")["Protocol"].value_counts()
    print(counts_per_label.unstack(fill_value=0))

    print("\n--- Summary ---")
    # Note: the "features" figure is the raw column count of df (df.shape[1]).
    for summary_line in (
        f"Total samples: {df.shape[0]:,}",
        f"Total features: {df.shape[1]}",
        f"Unique labels: {df['Label'].nunique()}",
        f"Unique protocols: {df['Protocol'].nunique()}",
        "Stats analysis completed!",
    ):
        print(summary_line)
    print("-" * 60, "\n")

In [4]:
# Report label/protocol statistics for both multiclass ("All attacks") files.
print("Multiclass Dataset Stats")
for frame, csv_name in (
    (multi_single_df1, "H-All_DDoS_Flooding_Attack_balanced_data.csv"),
    (multi_single_df2, "L-All_DDoS_Flooding_Attack_balanced_data.csv"),
):
    analyze_class_dataset(frame, csv_name)

Multiclass Dataset Stats
Dataset: H-All_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    250000
1    250000
2    250000
3    250000
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol     0       6      17
Label                         
0         83333   83333  83334
1             0  250000      0
2             0  250000      0
3             0  250000      0

--- Summary ---
Total samples: 1,000,000
Total features: 72
Unique labels: 4
Unique protocols: 3
Stats analysis completed!
------------------------------------------------------------ 

Dataset: L-All_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    250000
1    250000
2    250000
3    250000
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol     0       6      17
Label                         
0         83333   83334  83333
1             0  250000      0
2             0  250000      0
3             0  250000      0

--- Summary

In [7]:
# Multiclass Dataset Stats Analysis:

# In both High-Rate and Low-Rate “All Attacks” datasets, the authors define four classes—Normal, ICMP DDoS, TCP DDoS, and
# UDP DDoS—each with 250 000 samples. Critically, each attack class should carry its own protocol (ICMP→1, TCP→6, UDP→17) 
# in the Protocol field.

# However, none of the ICMP or UDP attack samples in either H-All or L-All carries the expected protocol, even though
# the paper’s Table 7 says they should: in the output above, every row with Label 1, 2, or 3 has Protocol 6.

In [5]:
# Report label/protocol statistics for the three H-* binary files.
print("Binary Class Dataset Stats - High DDOS")
for frame, csv_name in (
    (binary_single_df1, "H-ICMP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df2, "H-TCP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df3, "H-UDP_DDoS_Flooding_Attack_balanced_data.csv"),
):
    analyze_class_dataset(frame, csv_name)

Binary Class Dataset Stats - High DDOS
Dataset: H-ICMP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    529613
1    529613
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       0       6
Label                   
0              0  529613
1         529613       0

--- Summary ---
Total samples: 1,059,226
Total features: 72
Unique labels: 2
Unique protocols: 2
Stats analysis completed!
------------------------------------------------------------ 

Dataset: H-TCP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    373596
1    373596
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       6
Label           
0         373596
1         373596

--- Summary ---
Total samples: 747,192
Total features: 72
Unique labels: 2
Unique protocols: 1
Stats analysis completed!
------------------------------------------------------------ 

Dataset: H-UDP_DDoS_Flooding_Attack_balanced_data.csv

--- Label

In [6]:
# Report label/protocol statistics for the three L-* binary files.
print("Binary Class Dataset Stats - Low DDOS")
for frame, csv_name in (
    (binary_single_df4, "L-ICMP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df5, "L-TCP_DDoS_Flooding_Attack_balanced_data.csv"),
    (binary_single_df6, "L-UDP_DDoS_Flooding_Attack_balanced_data.csv"),
):
    analyze_class_dataset(frame, csv_name)

Binary Class Dataset Stats - Low DDOS
Dataset: L-ICMP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    529613
1    529613
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       0       6
Label                   
0              0  529613
1         529613       0

--- Summary ---
Total samples: 1,059,226
Total features: 72
Unique labels: 2
Unique protocols: 2
Stats analysis completed!
------------------------------------------------------------ 

Dataset: L-TCP_DDoS_Flooding_Attack_balanced_data.csv

--- Label Distribution ---
Label
0    435322
1    435322
Name: count, dtype: int64

--- Protocol Distribution by Label ---
Protocol       6
Label           
0         435322
1         435322

--- Summary ---
Total samples: 870,644
Total features: 72
Unique labels: 2
Unique protocols: 1
Stats analysis completed!
------------------------------------------------------------ 

Dataset: L-UDP_DDoS_Flooding_Attack_balanced_data.csv

--- Label 

In [8]:
# Binary Dataset Stats Analysis:

# The HLD-DDoSDN dataset includes six binary-classification scenarios:
# (i) High-rate ICMP, (ii) Low-rate ICMP, (iii) High-rate UDP, (iv) Low-rate UDP, (v) High-rate TCP, and (vi) Low-rate TCP DDoS flooding attacks.
# In each file, the “Attack” samples must all use the attack’s protocol (ICMP→Protocol 1, UDP→17, TCP→6),
# and the “Normal” samples are generated with Scapy using a mixture of benign TCP/UDP/ICMP traffic.

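# However, the files do not follow this scheme. In both ICMP binary files, Protocol 1 never appears: every Label 0 row
# is Protocol 6 and every Label 1 row is Protocol 0. In both TCP binary files every row is Protocol 6, so the “Normal”
# samples there show no protocol diversity. The conclusion that all “Attack” flows in the ICMP and UDP files are
# mislabeled as TCP (Protocol 6), with the true attack protocol never appearing under the attack label, therefore
# depends on Label 0 being the attack class in these files, and the UDP outputs are truncated above — TODO: confirm both.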

# Implications
# ICMP-based attacks (should be Protocol 1) are entirely missing from the attack side of the ICMP files.
# UDP-based attacks (should be Protocol 17) likewise never appear.
# Models trained on the ICMP and UDP CSVs can simply learn a protocol shortcut (“Protocol 6 ⇒ attack, otherwise ⇒ normal”)
# rather than any real DDoS signature.