In [1]:
#!/usr/bin/env python
# --------------------------------------------------------------
# 06_baseline_xgb_attack.py
# Baseline: Single-task XGBoost for attack_id prediction
# Dataset: CIC IoT-IDAD 2024 (packet-based processed subset)
# --------------------------------------------------------------

# ==============================================================
# 1. Imports, Paths, Configuration, Constants
# ==============================================================

import sys
import os
from pathlib import Path
import json

import numpy as np
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

from xgboost import XGBClassifier

# Project root is taken as the parent of the kernel's working directory
# (assumes the notebook is launched from a direct child folder of the repo).
PROJECT_ROOT = Path.cwd().resolve().parents[0]
sys.path.append(str(PROJECT_ROOT))  # expose project-local modules to `import`

# Directory layout: <root>/data/processed holds the pre-split CSVs and label map.
DATA_DIR = PROJECT_ROOT.joinpath("data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

TRAIN_PATH = PROCESSED_DIR.joinpath("packets_train.csv")
VAL_PATH = PROCESSED_DIR.joinpath("packets_val.csv")
TEST_PATH = PROCESSED_DIR.joinpath("packets_test.csv")
ATTACK_LABEL_MAP_PATH = PROCESSED_DIR.joinpath("attack_label_mapping.json")

# Name of the label column predicted by this baseline.
TARGET_COL = "attack_id"
# Upper bound on training rows handed to XGBoost; None disables subsampling.
SUBSAMPLE_N = 300_000

# Keyword arguments forwarded verbatim to XGBClassifier.
# "num_class" is added later, once the label mapping has been loaded.
XGB_CONFIG = dict(
    n_estimators=400,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softmax",
    # Original note: use 'gpu_hist' if a GPU is available.
    # NOTE(review): newer XGBoost releases select GPUs via device="cuda" —
    # confirm against the installed version before switching.
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("Using processed data from:", PROCESSED_DIR)

PROJECT_ROOT: /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS
Using processed data from: /Users/naeemulhassan/naeem-p/Cloud-Deployed-Multitask-IoT-IDS/data/processed


In [2]:
# ==============================================================
# 2. Load Processed Data
# ==============================================================

# Read the train / validation / test CSVs, in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

print("Train shape:", train_df.shape)
print("Val   shape:", val_df.shape)
print("Test  shape:", test_df.shape)

# The mapping file stores the id -> attack-name table under "id_to_attack".
with ATTACK_LABEL_MAP_PATH.open("r") as f:
    attack_label_mapping = json.load(f)["id_to_attack"]

num_attacks = len(attack_label_mapping)
print("Number of attack classes:", num_attacks)

# Tell XGBoost how many classes to expect, based on the mapping size.
XGB_CONFIG.update(num_class=num_attacks)

Train shape: (2126280, 139)
Val   shape: (455632, 139)
Test  shape: (455632, 139)
Number of attack classes: 8


In [4]:
# ==============================================================
# 3. Feature Selection (Numeric Only)
# ==============================================================

# Columns that are numeric but must never be fed to the model as inputs:
# the prediction target itself and the "device_id" column.
non_feature_cols = {TARGET_COL, "device_id"}

# Only int64 / float64 columns are considered as candidate features.
numeric_cols = list(train_df.select_dtypes(include=["int64", "float64"]).columns)
feature_cols = [col for col in numeric_cols if col not in non_feature_cols]

print("\nNumber of feature columns:", len(feature_cols))
print("Example features:", feature_cols[:15])


Number of feature columns: 119
Example features: ['stream', 'src_port', 'dst_port', 'inter_arrival_time', 'time_since_previously_displayed_frame', 'port_class_dst', 'l4_tcp', 'l4_udp', 'ttl', 'eth_size', 'tcp_window_size', 'payload_entropy', 'handshake_cipher_suites_length', 'handshake_ciphersuites', 'handshake_extensions_length']


In [5]:
# ==============================================================
# 4. Cleaning: NaN / Inf Handling
# ==============================================================

def clean_df(df: pd.DataFrame, feature_cols, name: str) -> pd.DataFrame:
    """Return a copy of ``df`` whose feature columns contain no NaN or +/-Inf.

    Infinite values are first converted to NaN; if any NaN is then present in
    the feature columns, the count is reported and every NaN is set to 0.

    Parameters
    ----------
    df : DataFrame to clean. It is not modified; a copy is returned.
    feature_cols : column label(s) to clean. Other columns are left untouched.
    name : label used in the progress message (e.g. "train").

    Returns
    -------
    The cleaned copy of ``df``.
    """
    cleaned = df.copy()

    # Treat +/-Inf as missing so a single fill handles both cases.
    features = cleaned[feature_cols].replace([np.inf, -np.inf], np.nan)

    # Total missing cells across all feature columns (includes former Infs).
    missing_total = features.isna().sum().sum()
    if missing_total > 0:
        print(f"  [{name}] NaN before fill: {missing_total}, filling with 0.")
        features = features.fillna(0)

    cleaned[feature_cols] = features
    return cleaned

print("\nCleaning NaN/Inf...")
# Each split is replaced by its cleaned copy, one at a time, so the previous
# frame can be released before the next split is processed.
train_df = clean_df(df=train_df, feature_cols=feature_cols, name="train")
val_df = clean_df(df=val_df, feature_cols=feature_cols, name="val")
test_df = clean_df(df=test_df, feature_cols=feature_cols, name="test")


Cleaning NaN/Inf...
  [train] NaN before fill: 20652772, filling with 0.
  [val] NaN before fill: 4432636, filling with 0.
  [test] NaN before fill: 4426542, filling with 0.


In [6]:
# ==============================================================
# 5. Standardisation (Z-score)
# ==============================================================

print("\nStandardising features (z-score on train stats)...")
# Statistics come from the training split only, so val/test are scaled with
# parameters that never saw their data.
means = train_df[feature_cols].mean()
stds = train_df[feature_cols].std()
# A zero standard deviation (constant column) would divide by zero; use 1.0.
stds = stds.mask(stds == 0, 1.0)

def standardise(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Return a z-scored copy of ``df`` using the module-level train statistics.

    The feature columns (module-level ``feature_cols``) are shifted by ``means``
    and divided by ``stds``, then clipped to the range [-10, 10]. Remaining NaN
    and Inf counts are printed for the split.

    Parameters
    ----------
    df : DataFrame holding the feature columns. It is not modified.
    name : label used in the printed summary and in the error message.

    Returns
    -------
    A copy of ``df`` with standardised, clipped feature columns.

    Raises
    ------
    ValueError
        If any NaN or Inf is counted in the feature columns after scaling.
    """
    scaled = df.copy()

    # Centre, scale and clip in one pass; clipping bounds extreme outliers.
    z_scores = ((scaled[feature_cols] - means) / stds).clip(-10, 10)
    scaled[feature_cols] = z_scores

    # Note: the Inf count is taken after clipping to [-10, 10].
    n_missing = z_scores.isna().sum().sum()
    n_infinite = np.isinf(z_scores.values).sum()
    print(f"  [{name}] NaN after std: {n_missing}, Inf: {n_infinite}")

    if n_missing > 0 or n_infinite > 0:
        raise ValueError(f"Found NaN/Inf in {name} after standardisation.")
    return scaled

# Replace each split with its standardised copy (train statistics throughout).
train_df = standardise(df=train_df, name="train")
val_df = standardise(df=val_df, name="val")
test_df = standardise(df=test_df, name="test")



Standardising features (z-score on train stats)...
  [train] NaN after std: 0, Inf: 0
  [val] NaN after std: 0, Inf: 0
  [test] NaN after std: 0, Inf: 0


In [7]:
# ==============================================================
# 6. Numpy Arrays + Optional Subsampling
# ==============================================================

# Materialise each split as plain arrays: features (X_*) and labels (y_*).
X_train = train_df[feature_cols].values
y_train = train_df[TARGET_COL].values

X_val   = val_df[feature_cols].values
y_val   = val_df[TARGET_COL].values

X_test  = test_df[feature_cols].values
y_test  = test_df[TARGET_COL].values

print("\nFull training size:", X_train.shape[0])

if SUBSAMPLE_N is not None and X_train.shape[0] > SUBSAMPLE_N:
    # Draw the subsample from a seeded generator so that the training subset,
    # and therefore every metric reported below, is reproducible under
    # Restart & Run All. The seed is shared with the model's random_state.
    subsample_rng = np.random.default_rng(XGB_CONFIG["random_state"])
    # Row indices sampled without replacement (no row is picked twice).
    idx = subsample_rng.choice(X_train.shape[0], size=SUBSAMPLE_N, replace=False)
    X_train_sub = X_train[idx]
    y_train_sub = y_train[idx]
    print(f"Subsampled training size for XGBoost: {X_train_sub.shape[0]}")
else:
    # Subsampling disabled, or the training set already fits within the cap.
    X_train_sub = X_train
    y_train_sub = y_train
    print("Using full training set for XGBoost.")


Full training size: 2126280
Subsampled training size for XGBoost: 300000


In [8]:

# ==============================================================
# 7. Evaluation Helper
# ==============================================================

def evaluate_classifier(name, clf, X_val, y_val, X_test, y_test):
    """Print accuracy, macro-F1, a per-class report and a confusion matrix.

    The same metrics are printed first for the validation split and then for
    the test split. Class names are read from the module-level
    ``attack_label_mapping``, whose keys are looked up as the strings
    "0", "1", ..., "n-1".

    Parameters
    ----------
    name : label placed in square brackets at the start of each printed header.
    clf : fitted model; only ``clf.predict(X)`` is called.
    X_val, y_val : validation features and true labels.
    X_test, y_test : test features and true labels.

    Returns
    -------
    None. All results are printed.
    """
    # Fix the label set up front so the report and confusion matrix always
    # cover every known class. Without this, a split that happens to lack a
    # class makes classification_report raise (target_names length mismatch)
    # and changes the confusion-matrix shape.
    class_ids = list(range(len(attack_label_mapping)))
    attack_names = [attack_label_mapping[str(i)] for i in class_ids]

    for split_name, X, y in [("Val", X_val, y_val), ("Test", X_test, y_test)]:
        y_pred = clf.predict(X)
        acc = accuracy_score(y, y_pred)
        macro_f1 = f1_score(y, y_pred, average="macro")

        print(f"\n[{name}] {split_name} Accuracy: {acc:.4f}, Macro-F1: {macro_f1:.4f}")
        print(f"[{name}] {split_name} classification report:")
        print(classification_report(
            y,
            y_pred,
            labels=class_ids,
            target_names=attack_names,
            digits=4,
            zero_division=0,  # report 0 instead of warning for empty classes
        ))
        print(f"[{name}] {split_name} confusion matrix:")
        # Rows are true classes, columns are predicted classes, in class_ids order.
        print(confusion_matrix(y, y_pred, labels=class_ids))


In [9]:
# ==============================================================
# 8. Train XGBoost Baseline
# ==============================================================

print("\nTraining XGBoost baseline with config:", XGB_CONFIG)
# Build the classifier from the shared config and fit it on the
# (possibly subsampled) training arrays.
xgb = XGBClassifier(**XGB_CONFIG)
xgb.fit(X=X_train_sub, y=y_train_sub)
print("XGBoost training complete.")


Training XGBoost baseline with config: {'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'multi:softmax', 'tree_method': 'hist', 'eval_metric': 'mlogloss', 'n_jobs': -1, 'random_state': 42, 'num_class': 8}
XGBoost training complete.


In [10]:

# ==============================================================
# 9. Evaluate
# ==============================================================

# Report validation and test metrics for the fitted baseline.
evaluate_classifier(
    name="XGBoost",
    clf=xgb,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
)

print("\nDone (XGBoost baseline).")


[XGBoost] Val Accuracy: 0.9955, Macro-F1: 0.9945
[XGBoost] Val classification report:
              precision    recall  f1-score   support

      benign     0.9944    0.9967    0.9956     67500
 brute force     0.9983    0.9923    0.9953     19721
        ddos     0.9989    0.9968    0.9978     67500
         dos     0.9956    0.9968    0.9962     67500
       mirai     0.9995    0.9970    0.9982     67500
       recon     0.9973    0.9933    0.9953     67500
    spoofing     0.9975    0.9951    0.9963     67500
   web-based     0.9713    0.9915    0.9813     30911

    accuracy                         0.9955    455632
   macro avg     0.9941    0.9949    0.9945    455632
weighted avg     0.9955    0.9955    0.9955    455632

[XGBoost] Val confusion matrix:
[[67275     9     6    27     2    11    21   149]
 [   47 19569     4    13     1     5     5    77]
 [   56     8 67281    92     3    13     3    44]
 [   62     6    33 67282     7    24    15    71]
 [   17     1     5    22 