In [None]:
# --- Notebook: 03_preprocessing_packet.ipynb ---
# Goal:
# - Construct a training-ready multitask dataset from CIC IoT-IDAD 2024 packet-based CSV files
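# - Target size: up to ~3.6M rows (at most 450k rows per attack family across 8 families).
#   Families with fewer available packets are kept in full, so the resulting dataset
#   is ~3.0M rows and only approximately balanced (web-based and brute force are smaller).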
# - Provide two supervised targets:
#       * device_id   (encoded from device_mac)  → Device Identification
#       * attack_id   (encoded from attack_label) → Intrusion Detection
# - Export:
#       * packets_train.csv
#       * packets_val.csv
#       * packets_test.csv
#       * device_label_mapping.json
#       * attack_label_mapping.json
# - This dataset is used by the final multitask 1D-CNN + MLP model (shared backbone + two heads).


In [2]:
# ============================================================
# 1. Environment setup & Configuration
# ============================================================

import sys
import os
from pathlib import Path
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Progress bars are a convenience for the streaming read, not a requirement:
# when tqdm is not installed, substitute a pass-through so loops still run.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(x, **kwargs):
        """No-op stand-in for tqdm: hand back the iterable, ignoring all options."""
        return x

# Reproducibility: seed NumPy's legacy global RNG once, up front.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Assume the notebook is launched from <project_root>/notebooks/, so the
# project root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent

# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.data.load_data import IoTDatasetLoader



# Raw CIC IoT-IDAD 2024 base directory.
# The default is the original author's local path. Set the
# CIC_IOT_IDAD_BASE_PATH environment variable to point at the dataset on
# another machine without editing the notebook.
BASE_PATH = Path(
    os.environ.get(
        "CIC_IOT_IDAD_BASE_PATH",
        "/Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024",
    )
)

# Project data layout; the processed directory is created on first run.
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Target dataset size (chosen to fit in memory on 16 GB M1 while still being large)
TARGET_TOTAL_ROWS = 3_600_000          # total rows across all 8 attack families
USEABLE_FRACTION = 1.0                 # keep simple; can reduce if memory is tight
FINAL_TARGET_ROWS = int(TARGET_TOTAL_ROWS * USEABLE_FRACTION)

# Attack families of interest (benign + 7 attack categories).
# These strings are the values produced by infer_attack_label_from_filename.
ATTACK_FAMILIES = [
    "benign",
    "ddos",
    "dos",
    "mirai",
    "recon",
    "spoofing",
    "web-based",
    "brute force",
]

# Per-family row cap. It is an upper bound, not a guarantee: a family with
# fewer available rows than this contributes everything it has.
TARGET_PER_ATTACK = FINAL_TARGET_ROWS // len(ATTACK_FAMILIES)

# Chunk size for streaming CSVs
CHUNK_SIZE = 200_000  # rows per chunk when reading large files

# Output format (CSV used to avoid pyarrow dependency issues)
USE_PARQUET = False  # kept for completeness; not used in current export



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ============================================================
# 2. Helper functions
# ============================================================

def infer_attack_label_from_filename(file_path: str) -> str:
    """
    Infer a coarse attack-family label from a CSV file path.

    The whole path (directories included) is lower-cased and searched for
    family keywords; the first keyword that matches, in the priority order
    below, decides the label. The result is the 'attack_label' string that is
    attached to rows before integer encoding.

    Priority order matters: "ddos" must be tested before "dos" because every
    path containing "ddos" also contains "dos", and "benign" / "mirai" are
    tested first so such paths are never claimed by a later, more generic
    keyword.

    Parameters
    ----------
    file_path : str
        Path of the CSV file. pathlib.Path objects are accepted as well.

    Returns
    -------
    str
        One of "benign", "mirai", "ddos", "brute force", "spoofing", "recon",
        "web-based", "dos", or "unknown" when no keyword matches.
    """
    p = str(file_path).lower()

    # Separator-insensitive view used only for the two-word family name, so
    # "BruteForce", "brute_force", "brute-force" and "Brute Force" all match.
    # (The space-separated spelling previously fell through to "unknown".)
    compact = p.replace(" ", "").replace("_", "").replace("-", "")

    if "benign" in p:
        return "benign"
    if "mirai" in p:
        return "mirai"
    if "ddos" in p:
        return "ddos"
    if "bruteforce" in compact:
        return "brute force"
    if "spoof" in p:
        return "spoofing"
    if "recon" in p:
        return "recon"
    if "web" in p:
        return "web-based"
    if "dos" in p:
        return "dos"

    return "unknown"


def memory_usage_mb(df: pd.DataFrame) -> float:
    """Return the deep memory footprint of a DataFrame (object contents included), in MB (1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / bytes_per_mb


def label_distribution(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """
    Summarise how often each class occurs in ``label_col``.

    The returned frame is indexed by class value (sorted ascending) and has
    two columns: "count" (number of rows) and "percentage" (share of all
    counted rows, on a 0-100 scale).
    """
    class_counts = df[label_col].value_counts().sort_index()
    n_rows = class_counts.sum()
    return pd.DataFrame(
        {
            "count": class_counts,
            "percentage": class_counts / n_rows * 100.0,
        }
    )


In [4]:

# ============================================================
# 3. List packet-based files
# ============================================================

loader = IoTDatasetLoader(BASE_PATH)
all_files = loader.list_files()
print(f"Total CSV files found under BASE_PATH: {len(all_files)}")

# Packet-based files live under the directory
# "Device Identification_Anomaly Detection - Packet Based Features".
# A case-insensitive match on the distinctive tail of that name is sufficient:
# the longer variants checked previously were identical after lower-casing and
# were already covered by this shorter substring.
PACKET_DIR_MARKER = "packet based features"
packet_files = [f for f in all_files if PACKET_DIR_MARKER in f.lower()]

# Fallback heuristic if naming varies
if not packet_files:
    packet_files = [f for f in all_files if "packet" in f.lower()]

print(f"Packet-based CSV files detected: {len(packet_files)}")
print("Example packet-based files:")
for f in packet_files[:10]:
    print("  ", f)

Total CSV files found under BASE_PATH: 312
Packet-based CSV files detected: 180
Example packet-based files:
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic1.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic2.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BenignTraffic/BenignTraffic3.csv
   /Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024/Device Identification_Anomaly Detection - Packet Based Features/BruteForce/DictionaryBruteForce/DictionaryBruteForce.csv
   /

In [5]:
# ============================================================
# 4. Balanced sampling across attack families (streamed)
# ============================================================

# Track how many rows have been collected per attack family
attack_counts = defaultdict(int)
selected_chunks = []
total_selected = 0  # running total, so the global stop check needs no re-summing

print("\nTarget total rows:", FINAL_TARGET_ROWS)
print("Target per attack family:", TARGET_PER_ATTACK)
print("Attack families considered:", ATTACK_FAMILIES)

# NOTE: selection is not a uniform random sample of each family. Rows are taken
# in file/chunk order until the family quota is reached; only the chunk that
# crosses the quota is randomly down-sampled.
for fp in tqdm(packet_files, desc="Processing packet CSV files"):
    # Infer the coarse attack family for this file from its path
    label = infer_attack_label_from_filename(fp)

    # Skip files that are not part of the 8 target families
    if label not in ATTACK_FAMILIES:
        continue

    # If this attack already reached its target quota, skip the file
    if attack_counts[label] >= TARGET_PER_ATTACK:
        continue

    # Stream the CSV in chunks to avoid loading the entire file into memory.
    # The context manager closes the underlying file handle even when the loop
    # exits early via `break`.
    with pd.read_csv(fp, chunksize=CHUNK_SIZE, low_memory=False) as reader:
        for chunk in reader:
            remaining_for_label = TARGET_PER_ATTACK - attack_counts[label]

            # Take the whole chunk when it fits in the remaining quota;
            # otherwise randomly down-sample to exactly what is needed.
            if len(chunk) > remaining_for_label:
                chunk = chunk.sample(
                    n=remaining_for_label,
                    random_state=RANDOM_SEED,
                )

            # Attach the labels we control. `assign` returns a new frame, so
            # neither the raw chunk nor a sampled slice is mutated in place.
            sampled_chunk = chunk.assign(
                attack_label=label,
                source_file=Path(fp).name,
            )

            attack_counts[label] += len(sampled_chunk)
            total_selected += len(sampled_chunk)
            selected_chunks.append(sampled_chunk)

            # Stop reading this file as soon as the family quota or the global
            # target is met, instead of parsing one more chunk just to discard it.
            if (
                attack_counts[label] >= TARGET_PER_ATTACK
                or total_selected >= FINAL_TARGET_ROWS
            ):
                break

    # Global early-stop check
    if total_selected >= FINAL_TARGET_ROWS:
        break

print("\nAttack counts collected (approx.):")
for k in ATTACK_FAMILIES:
    print(f"  {k:12s}: {attack_counts[k]:,}")

print(f"\nTotal selected rows (approx.): {total_selected:,}")

# Make under-filled families explicit: they mean the dataset is smaller than
# FINAL_TARGET_ROWS and not perfectly balanced.
underfilled = [k for k in ATTACK_FAMILIES if attack_counts[k] < TARGET_PER_ATTACK]
if underfilled:
    print("\nFamilies below the per-family target (not enough rows available):")
    for k in underfilled:
        print(f"  {k:12s}: short by {TARGET_PER_ATTACK - attack_counts[k]:,}")



Target total rows: 3600000
Target per attack family: 450000
Attack families considered: ['benign', 'ddos', 'dos', 'mirai', 'recon', 'spoofing', 'web-based', 'brute force']


Processing packet CSV files: 100%|██████████| 180/180 [01:10<00:00,  2.55it/s]


Attack counts collected (approx.):
  benign      : 450,000
  ddos        : 450,000
  dos         : 450,000
  mirai       : 450,000
  recon       : 450,000
  spoofing    : 450,000
  web-based   : 206,067
  brute force : 131,477

Total selected rows (approx.): 3,037,544





In [6]:
# ============================================================
# 5. Concatenate selected data and basic checks
# ============================================================

# Fail early with an actionable message: pd.concat on an empty list would only
# report a generic "No objects to concatenate".
if not selected_chunks:
    raise ValueError(
        "No rows were selected; check BASE_PATH and the packet-file filter "
        "before concatenating."
    )

# Stack all sampled chunks into one frame with a fresh 0..n-1 index.
df_all = pd.concat(selected_chunks, ignore_index=True)
print("\nConcatenated dataframe shape:", df_all.shape)
print("Approx memory usage (MB):", round(memory_usage_mb(df_all), 2))

# Ensure device identifier column is present (it is the source of device_id)
if "device_mac" not in df_all.columns:
    raise ValueError("device_mac column not found in concatenated dataframe.")

# Check that attack_label is attached correctly
print("\nUnique attack_label values in df_all:", df_all["attack_label"].unique())


Concatenated dataframe shape: (3037544, 137)
Approx memory usage (MB): 5874.76

Unique attack_label values in df_all: ['benign' 'brute force' 'ddos' 'dos' 'mirai' 'recon' 'spoofing'
 'web-based']


In [7]:
# ============================================================
# 6. Label encoding for device_mac and attack_label
# ============================================================

# Device label encoding: device_mac → device_id
# Cast to str once and reuse the result for both the label space and the mapping
# (the cast is a full pass over the column).
# NOTE: astype(str) turns a missing MAC into the literal string "nan", which
# then becomes a device class of its own.
device_mac_str = df_all["device_mac"].astype(str)
device_labels = device_mac_str.unique()
device_labels_sorted = sorted(device_labels)

# IDs follow the sorted order of the MAC strings, so they are stable for a
# given set of devices.
device_to_id = {dev: i for i, dev in enumerate(device_labels_sorted)}
id_to_device = {i: dev for dev, i in device_to_id.items()}

df_all["device_id"] = device_mac_str.map(device_to_id)

# Attack label encoding: attack_label → attack_id
# IDs are assigned over the labels actually present in df_all, in sorted order.
attack_label_str = df_all["attack_label"].astype(str)
attack_labels_present = sorted(attack_label_str.unique())
attack_to_id = {lab: i for i, lab in enumerate(attack_labels_present)}
id_to_attack = {i: lab for lab, i in attack_to_id.items()}

df_all["attack_id"] = attack_label_str.map(attack_to_id)

print("\nDevice label space size:", len(device_to_id))
print("Attack label space size:", len(attack_to_id))

print("\nSample device mapping (first 10):")
for idx, dev in enumerate(device_labels_sorted[:10]):
    print(f"  {idx:3d} -> {dev}")

print("\nAttack mapping:")
for lab, idx in attack_to_id.items():
    print(f"  {idx:2d} -> {lab}")

# Persist label mappings for downstream training / deployment.
# JSON object keys are always strings, so the integer keys of id_to_device /
# id_to_attack are written as "0", "1", ...; convert with int() after loading.
with open(PROCESSED_DIR / "device_label_mapping.json", "w", encoding="utf-8") as f:
    json.dump({"device_to_id": device_to_id, "id_to_device": id_to_device}, f, indent=2)

with open(PROCESSED_DIR / "attack_label_mapping.json", "w", encoding="utf-8") as f:
    json.dump({"attack_to_id": attack_to_id, "id_to_attack": id_to_attack}, f, indent=2)


Device label space size: 94
Attack label space size: 8

Sample device mapping (first 10):
    0 -> 00:0c:29:03:b2:98
    1 -> 00:0c:29:07:63:da
    2 -> 00:0c:29:1c:55:4a
    3 -> 00:0c:29:20:ab:ec
    4 -> 00:0c:29:3e:f0:e0
    5 -> 00:0c:29:c3:9b:8a
    6 -> 00:0c:29:dd:6e:c7
    7 -> 00:0c:29:f2:f3:74
    8 -> 00:a3:d1:07:6f:03
    9 -> 00:a3:d1:07:6f:05

Attack mapping:
   0 -> benign
   1 -> brute force
   2 -> ddos
   3 -> dos
   4 -> mirai
   5 -> recon
   6 -> spoofing
   7 -> web-based


In [8]:
# ============================================================
# 7. Train / validation / test split (stratified by attack_id)
# ============================================================

# Stage 1: keep 70% for training and hold out 30%, preserving the attack_id
# class proportions in both parts.
train_df, temp_df = train_test_split(
    df_all,
    test_size=0.30,
    stratify=df_all["attack_id"],
    random_state=RANDOM_SEED,
)

# Stage 2: halve the 30% hold-out into validation and test, i.e. 15% / 15% of
# the full dataset, again stratified by attack_id.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["attack_id"],
    random_state=RANDOM_SEED,
)

print("\nSplit sizes (stratified by attack_id):")
print(f"  Train: {len(train_df):,} rows")
print(f"  Val  : {len(val_df):,} rows")
print(f"  Test : {len(test_df):,} rows")

# Report the per-class attack distribution of every split with one loop.
for split_header, split_frame in (
    ("\nAttack distribution in Train (attack_label):", train_df),
    ("\nAttack distribution in Val (attack_label):", val_df),
    ("\nAttack distribution in Test (attack_label):", test_df),
):
    print(split_header)
    print(label_distribution(split_frame, "attack_label"))


Split sizes (stratified by attack_id):
  Train: 2,126,280 rows
  Val  : 455,632 rows
  Test : 455,632 rows

Attack distribution in Train (attack_label):
               count  percentage
attack_label                    
benign        315000   14.814606
brute force    92034    4.328405
ddos          315000   14.814606
dos           315000   14.814606
mirai         315000   14.814606
recon         315000   14.814606
spoofing      315000   14.814606
web-based     144246    6.783961

Attack distribution in Val (attack_label):
              count  percentage
attack_label                   
benign        67500   14.814587
brute force   19721    4.328274
ddos          67500   14.814587
dos           67500   14.814587
mirai         67500   14.814587
recon         67500   14.814587
spoofing      67500   14.814587
web-based     30911    6.784203

Attack distribution in Test (attack_label):
              count  percentage
attack_label                   
benign        67500   14.814587
brute force

In [9]:
# ============================================================
# 8. Save processed datasets (CSV format)
# ============================================================

# Destination files, one per split, inside the processed-data directory.
train_path, val_path, test_path = (
    PROCESSED_DIR / "packets_train.csv",
    PROCESSED_DIR / "packets_val.csv",
    PROCESSED_DIR / "packets_test.csv",
)

# CSV is explicit to avoid environment-specific parquet issues.
# NOTE(review): the frames still carry device_mac, attack_label and
# source_file next to the encoded targets; confirm the training code excludes
# them from the feature matrix.
for split_frame, destination in (
    (train_df, train_path),
    (val_df, val_path),
    (test_df, test_path),
):
    split_frame.to_csv(destination, index=False)

print(f"\nSaved processed CSV files to {PROCESSED_DIR}:")
for destination in (train_path, val_path, test_path):
    print("  ", destination.name)


Saved processed CSV files to /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS/data/processed:
   packets_train.csv
   packets_val.csv
   packets_test.csv


In [11]:
# ============================================================
# 9. Summary (for report / supervisor / methodology section)
# ============================================================

# Human-readable recap of what the preceding cells produced: dataset size,
# per-family counts, device class count, split shapes and output locations.
print("\n================ PREPROCESSING SUMMARY ================")
print(f"- Total selected rows for multitask dataset : {len(df_all):,}")
print("- Attack families included and per-class counts (df_all):")
# Count every family in a single pass, then look each label up.
rows_per_family = df_all["attack_label"].value_counts()
for lab in attack_labels_present:
    cnt = int(rows_per_family.get(lab, 0))
    print(f"    {lab:12s}: {cnt:,}")
print(f"- Number of device classes (device_id)      : {len(device_to_id)}")
print("- Train / Val / Test shapes (rows, columns):")
print(f"    Train: {train_df.shape}")
print(f"    Val  : {val_df.shape}")
print(f"    Test : {test_df.shape}")
print("- Label mappings saved to:")
for mapping_file in ("device_label_mapping.json", "attack_label_mapping.json"):
    print(f"    {PROCESSED_DIR / mapping_file}")
print("- Datasets saved to:")
for saved_path in (train_path, val_path, test_path):
    print(f"    {saved_path}")
print("- This preprocessed dataset is used by the final multitask")
print("  1D-CNN + MLP model with a shared backbone and two heads")
print("  (attack_id and device_id).")
print("=======================================================")


- Total selected rows for multitask dataset : 3,037,544
- Attack families included and per-class counts (df_all):
    benign      : 450,000
    brute force : 131,477
    ddos        : 450,000
    dos         : 450,000
    mirai       : 450,000
    recon       : 450,000
    spoofing    : 450,000
    web-based   : 206,067
- Number of device classes (device_id)      : 94
- Train / Val / Test shapes (rows, columns):
    Train: (2126280, 139)
    Val  : (455632, 139)
    Test : (455632, 139)
- Label mappings saved to:
    /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS/data/processed/device_label_mapping.json
    /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS/data/processed/attack_label_mapping.json
- Datasets saved to:
    /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS/data/processed/packets_train.csv
    /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS/data/processed/packets_val.csv
    /Users/naeemulhassan/naeem-p/Cloud-Deployed-M