In [1]:
# --- Notebook: 02_label_and_feature_analysis.ipynb ---
# Objective:
# - Analyse label distributions (device_mac, attack_label)
# - Quantify class imbalance and coverage
# - Inspect numeric features (variance, correlations)
# - Produce tables/figures to justify multitask CNN–Transformer design

In [None]:
# ============================================================
# 1. Environment setup & Configuration
# ============================================================

import sys
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Default size for every matplotlib figure produced in this notebook.
plt.rcParams["figure.figsize"] = (10, 5)

# Assumes this notebook is in <project_root>/notebooks/ and that the kernel's
# working directory is that folder, so the parent directory is the project root.
PROJECT_ROOT = Path.cwd().resolve().parent

# Make `src.*` importable. The membership check keeps the cell idempotent:
# re-running it must not keep appending the same entry to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.data.load_data import IoTDatasetLoader


# Root folder of the CIC IoT IDAD 2024 dataset.
# Set the CIC_IOT_IDAD_DATASET_DIR environment variable to point at a different
# location; when it is unset the original author's local path is used.
BASE_PATH = Path(
    os.environ.get(
        "CIC_IOT_IDAD_DATASET_DIR",
        "/Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024",
    )
).expanduser()

# Sample configuration: at most 8 x 70,000 = 560k rows. This is an upper bound,
# files with fewer rows than the per-file cap contribute fewer (the recorded
# run of this notebook produced 320,000 rows).
MAX_PACKET_FILES_FOR_SAMPLE = 8        # first 8 packet files
ROWS_PER_PACKET_FILE_SAMPLE = 70_000   # read at most 70k rows per file

# Path substring -> attack label.
# Order matters: keys are tested in insertion order and the first hit wins, so
# "ddos" has to stay ahead of "dos" (every "ddos" path also contains "dos").
ATTACK_KEYWORDS = {
    "ddos": "ddos",
    "mirai": "mirai",
    "dos": "dos",
    "benign": "benign",
    "recon": "recon",
    "spoof": "spoofing",
    "web": "web-based",
    "bruteforce": "brute force",
    "brute_force": "brute force",
    "brute-force": "brute force",
}

# Output folder for generated tables/figures; created on first run.
REPORTS_DIR = PROJECT_ROOT / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)


In [3]:
# ============================================================
# 2. Helper functions
# ============================================================

def infer_attack_label_from_filename(file_path: "str | os.PathLike", keywords=None) -> str:
    """
    Infer a coarse attack label from keywords found in a file path.

    Used as a fallback label source. The complete path (folder names
    included) is lower-cased and scanned for each keyword; keywords are
    tried in the mapping's iteration order and the first hit wins. A keyword
    that is a substring of another one (e.g. "dos" inside "ddos") must
    therefore come after the longer keyword in the mapping.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the CSV file. ``pathlib.Path`` objects are accepted.
    keywords : mapping of str -> str, optional
        Substring -> label table. Defaults to the module-level
        ``ATTACK_KEYWORDS``.

    Returns
    -------
    str
        The label of the first matching keyword, or "unknown" when no
        keyword occurs in the path.
    """
    if keywords is None:
        keywords = ATTACK_KEYWORDS

    # str() lets callers pass pathlib.Path objects as well as plain strings.
    p = str(file_path).lower()
    for k, v in keywords.items():
        if str(k).lower() in p:
            return v
    return "unknown"


def detect_attack_label_column(df: pd.DataFrame, verbose: bool = True, keywords=None):
    """
    Try to locate the attack/benign label column of ``df``.

    Two heuristics are combined:
    1) column name contains one of label/attack/family/category
    2) a text column holds at least one value that (lower-cased) equals one
       of the known attack labels

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    verbose : bool, default True
        Print the candidate lists found by each heuristic.
    keywords : mapping of str -> str, optional
        Keyword -> label table whose *values* are the known labels.
        Defaults to the module-level ``ATTACK_KEYWORDS``.

    Returns
    -------
    column label or None
        The first candidate whose name contains "label" if there is one,
        otherwise the first candidate (name-based candidates come before
        content-based ones); ``None`` when nothing matches.
    """
    if keywords is None:
        keywords = ATTACK_KEYWORDS
    # Built once instead of once per inspected column.
    known_labels = {str(v).lower() for v in keywords.values()}

    name_hints = ("label", "attack", "family", "category")
    # str(c): column labels are not guaranteed to be strings (e.g. integer
    # labels when a CSV is read without a header row).
    name_based = [
        c for c in df.columns
        if any(h in str(c).lower() for h in name_hints)
    ]

    # Text-like columns: object dtype plus pandas' dedicated string dtypes.
    obj_cols = [
        c for c, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]

    content_based = []
    for col in obj_cols:
        # De-duplicate first, then normalise only the distinct values; this
        # avoids lower-casing every row of a large column.
        distinct_vals = df[col].dropna().unique()
        if any(str(val).lower() in known_labels for val in distinct_vals):
            content_based.append(col)

    # dict.fromkeys de-duplicates while keeping first-seen order.
    candidates = list(dict.fromkeys(name_based + content_based))

    if verbose:
        print("Attack label column detection:")
        print("  Name-based candidates   :", name_based)
        print("  Content-based candidates:", content_based)
        print("  Final candidate list    :", candidates)

    if not candidates:
        return None

    # Prefer a column that is explicitly named like a label.
    for c in candidates:
        if "label" in str(c).lower():
            return c
    return candidates[0]


def memory_usage_mb(df: pd.DataFrame) -> float:
    """Return the deep memory footprint of ``df`` (index included) in MiB."""
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_mib = 1024 * 1024
    return total_bytes / bytes_per_mib

In [4]:
# ============================================================
# 3. Prepare packet-based file list
# ============================================================

loader = IoTDatasetLoader(BASE_PATH)
all_files = loader.list_files()
print(f"Total CSV files found: {len(all_files)}")

# Lower-case substrings that mark a path as packet-based / flow-based.
# They are matched against the complete lower-cased path, folders included.
PACKET_PATH_MARKERS = (
    "packet",
    "device identification_anomaly detection - packet based features",
)
FLOW_PATH_MARKERS = ("flow", "cicflow")


def _path_has_marker(path, markers):
    """Return True when any of ``markers`` occurs in the lower-cased path."""
    lowered = path.lower()
    return any(marker in lowered for marker in markers)


packet_files = [f for f in all_files if _path_has_marker(f, PACKET_PATH_MARKERS)]
flow_files = [f for f in all_files if _path_has_marker(f, FLOW_PATH_MARKERS)]

# Fallback: when the heuristic finds no packet-based file, treat every file
# as packet-based so the following cells still have data to work on.
if not packet_files:
    packet_files = all_files

print(f"Packet-based CSV files (heuristic): {len(packet_files)}")
print(f"Flow-based   CSV files (heuristic): {len(flow_files)}")

print("\nExample packet-based files:")
for f in packet_files[:5]:
    print("  ", f)

Total CSV files found: 312
Packet-based CSV files (heuristic): 180
Flow-based   CSV files (heuristic): 132

Example packet-based files:
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic1.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic2.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic3.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BruteForce/DictionaryBruteForce/D

In [5]:

# ============================================================
# 4. Build combined packet sample for analysis
# ============================================================

print("\nBuilding combined packet-based sample for deeper analysis...")


def _load_labelled_sample(csv_path, nrows):
    """Load up to ``nrows`` rows of one CSV and tag them with label and source file."""
    frame = loader.load_single(csv_path, nrows=nrows)
    frame["attack_label_from_file"] = infer_attack_label_from_filename(csv_path)
    frame["source_file"] = Path(csv_path).name
    return frame


# NOTE(review): only the first MAX_PACKET_FILES_FOR_SAMPLE files of the list are
# used, so the sample covers just the attack folders that happen to come first
# in `packet_files` - confirm this is acceptable for the imbalance analysis.
selected_packet_files = packet_files[:MAX_PACKET_FILES_FOR_SAMPLE]
n_selected = len(selected_packet_files)

sample_dfs = []
for i, fp in enumerate(selected_packet_files, start=1):
    print(f"  [{i}/{n_selected}] Loading sample from {fp}")
    df_tmp = _load_labelled_sample(fp, ROWS_PER_PACKET_FILE_SAMPLE)
    sample_dfs.append(df_tmp)

# Stack the per-file samples into one frame with a fresh 0..n-1 index.
packet_sample = pd.concat(sample_dfs, ignore_index=True)

print("\nCombined packet_sample shape:", packet_sample.shape)
print("Approx memory usage (MB):", round(memory_usage_mb(packet_sample), 2))
packet_sample.info()


Building combined packet-based sample for deeper analysis...
  [1/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic.csv
  [2/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic1.csv
  [3/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic2.csv
  [4/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic3.csv
  [5/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Pack

  return pd.read_csv(filepath, nrows=nrows)


  [7/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/DDoS/DDoS-ACK_Fragmentation/DDoS-ACK_Fragmentation1.csv


  return pd.read_csv(filepath, nrows=nrows)


  [8/8] Loading sample from /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/DDoS/DDoS-ACK_Fragmentation/DDoS-ACK_Fragmentation10.csv


  return pd.read_csv(filepath, nrows=nrows)



Combined packet_sample shape: (320000, 137)
Approx memory usage (MB): 617.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320000 entries, 0 to 319999
Columns: 137 entries, stream to source_file
dtypes: float64(97), int64(22), object(18)
memory usage: 334.5+ MB
