In [None]:
# --- Notebook: 03_preprocessing_packet.ipynb ---
# Goal:
# - Construct a training-ready multitask dataset from CIC IoT-IDAD 2024 packet-based CSV files
# - Target size: ~3.6M rows, approximately balanced across 8 traffic families (benign + 7 attack categories)
# - Provide two supervised targets:
#       * device_id   (encoded from device_mac)  → Device Identification
#       * attack_id   (encoded from attack_label) → Intrusion Detection
# - Export:
#       * packets_train.csv
#       * packets_val.csv
#       * packets_test.csv
#       * device_label_mapping.json
#       * attack_label_mapping.json
# - This dataset is used by the final multitask 1D-CNN + MLP model (shared backbone + two heads).


In [None]:
# ============================================================
# 1. Environment setup & Configuration
# ============================================================

import sys
import os
from pathlib import Path
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# tqdm is an optional dependency: it only adds a progress bar to streaming reads
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(x, **kwargs):
        """No-op stand-in used when tqdm is not installed: hand back `x` untouched."""
        return x

# Reproducibility: seed NumPy's legacy global RNG once, up front
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Assume the kernel's working directory is <project_root>/notebooks/, so the
# project root is the parent of the current working directory.
PROJECT_ROOT = Path(os.getcwd()).resolve().parent

# Put the project root on the import path. The membership check keeps this
# cell idempotent: re-running it in a live kernel no longer appends a
# duplicate sys.path entry each time.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.data.load_data import IoTDatasetLoader



# Raw CIC IoT-IDAD 2024 base directory.
# Set the CIC_IOT_IDAD_BASE_PATH environment variable to point at the dataset
# on another machine; when it is unset or empty, the original hard-coded
# location below is used, so existing setups keep working unchanged.
BASE_PATH = Path(
    os.environ.get("CIC_IOT_IDAD_BASE_PATH")
    or "/Users/naeemulhassan/naeem-p/CIC_IoT_IDAD_2024/CIC_IoT_IDAD_Dataset_2024"
).expanduser()  # allows "~/..." in the override; no effect on the default path

# Project-local data layout, derived from PROJECT_ROOT
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
# Only the processed-output directory is created here (with any missing
# parents); exist_ok=True makes re-running the cell safe.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Dataset budget: sized to fit in memory on a 16 GB M1 while still being large
TARGET_TOTAL_ROWS = 3_600_000  # row budget summed over all 8 families
USEABLE_FRACTION = 1.0  # lower this if memory gets tight
FINAL_TARGET_ROWS = int(USEABLE_FRACTION * TARGET_TOTAL_ROWS)

# Families to keep: benign traffic plus 7 attack categories
ATTACK_FAMILIES = [
    "benign", "ddos", "dos", "mirai",
    "recon", "spoofing", "web-based", "brute force",
]

# Per-family row quota for an approximately balanced dataset (integer division)
TARGET_PER_ATTACK = FINAL_TARGET_ROWS // len(ATTACK_FAMILIES)

# Rows pulled per chunk while streaming the large CSV files
CHUNK_SIZE = 200_000

# Parquet toggle: output is CSV to avoid pyarrow dependency issues, so this
# flag is kept for completeness and is not used by the current export.
USE_PARQUET = False

