## CIC-DDoS2019 Dataset EDA

Dataset source: <https://data.mendeley.com/datasets/ssnc74xm6r/1>

In [None]:
# === Cell 1: Load the dataset and preview the first records ===
import pandas as pd

# Path to your downloaded CSV file (relative to the notebook's working directory)
csv_path = "cicddos2019_dataset.csv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the resulting
# DtypeWarning, at the cost of higher peak memory while parsing.
print("Loading the dataset... this may take a minute.")
df = pd.read_csv(csv_path, low_memory=False)

# "\u00d7" is the multiplication sign. It is written as an escape so the message
# survives encoding round-trips of the notebook file (the literal character had
# previously been corrupted into mojibake).
n_rows, n_cols = df.shape
print(f"Dataset loaded successfully! Shape: {n_rows:,} rows \u00d7 {n_cols} columns")

# Display the first 20 rows to inspect the structure
df.head(20)

In [None]:
# Inspect the schema: column names, dtypes, non-null counts and memory usage.

print("Dataset structure overview:\n")
df.info()

# Emit the heading and the comma-separated column names as two lines in one call.
print("\nList of columns:", ", ".join(df.columns), sep="\n")

In [None]:
# === Distribution of attack classes and labels ===
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", palette="tab10")

# 1. Distribution of attack classes
# Bars are ordered from the most to the least frequent class.
class_order = df["Class"].value_counts().index

fig_class, ax_class = plt.subplots(figsize=(10,5))
sns.countplot(
    data=df,
    x="Class",
    hue="Class",
    order=class_order,
    palette="tab10",
    ax=ax_class,
)
ax_class.set_title("Distribution of attack classes")
plt.setp(ax_class.get_xticklabels(), rotation=45, ha='right')
ax_class.set_ylabel("Number of flows")
ax_class.set_xlabel("Class")
plt.show()

# 2. Distribution of Label (top 15)
# value_counts() sorts descending, so head(15) keeps the 15 most frequent labels.
label_counts = df["Label"].value_counts()
top_labels = label_counts.head(15).reset_index()
top_labels.columns = ["Label", "Count"]

fig_label, ax_label = plt.subplots(figsize=(10,5))
sns.barplot(
    data=top_labels,
    x="Label",
    y="Count",
    hue="Label",
    legend=False,
    palette="tab20",
    ax=ax_label,
)
ax_label.set_title("Top 15 attack labels")
plt.setp(ax_label.get_xticklabels(), rotation=75, ha='right')
ax_label.set_ylabel("Number of flows")
ax_label.set_xlabel("Label")
plt.show()

In [None]:
# === Compare numeric features between normal and attack flows ===

# Choose a few meaningful features
features = ["Flow Duration", "Total Fwd Packets", "Total Backward Packets", "Flow Bytes/s"]

# Melt to long format for easy plotting: one row per (flow, feature) pair
subset = df.melt(id_vars="Class", value_vars=features, var_name="Feature", value_name="Value")

# Keep only finite measurements. A single +/-inf in a feature can push a whisker
# to infinity and break the autoscaled log axis.
# NOTE(review): rate features such as "Flow Bytes/s" presumably can be infinite
# (e.g. a zero-duration flow) - confirm against the data; the filter is a no-op
# when no such values are present.
is_finite = subset["Value"].notna() & subset["Value"].abs().ne(float("inf"))
subset = subset[is_finite]

plt.figure(figsize=(12,6))
sns.boxplot(data=subset, x="Feature", y="Value", hue="Class", showfliers=False)
# Log scale makes features with very different magnitudes comparable on one axis.
# NOTE(review): zero-valued quartiles cannot be placed on a log axis; consider
# 'symlog' if many flows have 0 backward packets.
plt.yscale('log')
plt.title("Distribution of flow features (log scale)")
plt.xlabel("")
plt.ylabel("Value (log scale)")
plt.legend(title="Class")
plt.show()

In [None]:
# === Pairwise feature visualization ===

# Plot a random subset (at most 2000 rows) to avoid slowdown. The size is capped
# at the number of available rows because DataFrame.sample raises ValueError
# when asked for more rows than exist (it samples without replacement).
# random_state pins the selection so re-runs draw the same rows.
sample = df.sample(n=min(2000, len(df)), random_state=42)

# corner=True draws only the lower triangle of the grid; the diagonal shows
# per-feature histograms split by Class.
sns.pairplot(
    data=sample,
    vars=["Flow Duration", "Total Fwd Packets", "Total Backward Packets", "Flow Bytes/s"],
    hue="Class",
    corner=True,
    diag_kind="hist",
    palette="tab10"
)
# y > 1 places the title just above the grid instead of over the top row.
plt.suptitle("Pairwise feature relationships (sampled)", y=1.02)
plt.show()

In [None]:
# === Feature correlation heatmap ===

# Select the numeric columns once and reuse the result. include="number" covers
# every integer/float width (int32, float32, ...), not only the 64-bit ones.
numeric_df = df.select_dtypes(include="number")

# Compute correlation matrix for numeric columns
corr = numeric_df.corr()

# To keep it readable, take only top 20 most variable columns.
# Variance is scale-dependent, so this favours features with large magnitudes.
top_var_cols = numeric_df.var().nlargest(20).index
corr_top = corr.loc[top_var_cols, top_var_cols]

plt.figure(figsize=(12,10))
sns.heatmap(corr_top, cmap="coolwarm", annot=False, linewidths=0.3)
plt.title("Correlation heatmap (top 20 numeric features)")
plt.show()

In [None]:
# === Quick summary ===
# Ten most frequent values of each categorical target column, printed as plain
# text followed by a blank separator.
for heading, column in (("Top 10 classes:", "Class"), ("Top 10 labels:", "Label")):
    print(heading)
    top_counts = df[column].value_counts().head(10)
    print(top_counts.to_string(), "\n")

# Transposed describe(): one row per feature, one column per statistic.
summary_columns = ["Flow Duration", "Total Fwd Packets", "Total Backward Packets", "Flow Bytes/s"]
print("Basic numeric statistics:")
numeric_summary = df[summary_columns].describe().T
print(numeric_summary)