# IoT IDS – BGWO + RFE-XGBoost Complete Pipeline

This notebook implements the full intrusion detection pipeline with:

- BGWO (Binary Grey Wolf Optimizer) + RFE-XGBoost feature selection  
- BO-TPE (Optuna TPE sampler) for hyperparameter optimization  
- Binary and multi-class classification (Normal vs. Attack, plus 7-class: Normal + 6 attack types)  
- SMOTE for class imbalance  
- Attack-specific evaluation metrics  
- Checkpoint saving (models, scalers, selected features, Optuna study)

The notebook is self-contained and runs end-to-end on a **synthetic IoT dataset** so that you can execute it without any external files.

To use your real datasets (N-BaIoT, BoT-IoT, WUSTL-IIoT-2021, WUSTL-EHMS-2020, NSL-KDD), replace the synthetic `df_syn` with your preprocessed merged dataframe and keep the rest of the pipeline unchanged.


In [None]:

# BGWO + RFE-XGBoost IoT IDS Pipeline
# This notebook cell defines a complete, end-to-end implementation
# that you can run in Google Colab. It includes:
# - Synthetic example data (so the pipeline runs without external files)
# - BGWO + RFE feature selection for XGBoost
# - BO-TPE hyperparameter optimization using Optuna
# - Binary and multi-class classification support
# - SMOTE for class imbalance
# - Attack-specific analysis
# - Checkpoint saving for later reuse

# =========================
# 1. Install dependencies (for Colab / any Python env)
# =========================

import importlib
import subprocess
import sys

# pip distribution names whose importable module name is different.
_IMPORT_NAME_OVERRIDES = {
    "imbalanced-learn": "imblearn",
    "scikit-learn": "sklearn",
}


def ensure_package(package_name: str, import_name: "str | None" = None):
    """
    Install ``package_name`` with pip unless its module can already be imported.

    Args:
        package_name: Distribution name handed to ``pip install``.
        import_name: Module name used for the import probe. When omitted it is
            looked up in ``_IMPORT_NAME_OVERRIDES`` and otherwise derived from
            ``package_name`` by replacing ``-`` with ``_``.

    Raises:
        subprocess.CalledProcessError: if the pip invocation fails.
    """
    if import_name is None:
        import_name = _IMPORT_NAME_OVERRIDES.get(
            package_name, package_name.replace("-", "_")
        )

    try:
        # Probe with the *module* name: probing with the distribution name
        # (e.g. "imbalanced-learn") can never succeed and would re-run pip
        # on every execution.
        importlib.import_module(import_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # Let the import system notice the freshly installed files.
        importlib.invalidate_caches()

# Bootstrap every third-party distribution the rest of this cell imports.
for pkg in ("optuna", "imbalanced-learn", "xgboost", "joblib"):
    ensure_package(pkg)

# =========================
# 2. Imports
# =========================

import os
import math
import json
import random
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.feature_selection import RFE

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
import optuna
from optuna.samplers import TPESampler
import joblib

# Single seed shared by every component that accepts one, and applied to
# both global random number generators so runs are repeatable.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# =========================
# 3. Configuration
# =========================

# Output folder for every saved artifact (models, feature lists, studies).
# To keep checkpoints on Google Drive when running in Colab, first run:
#     from google.colab import drive
#     drive.mount('/content/drive')
# and then point the constant below at a Drive folder, for example:
#     CHECKPOINT_DIR = "/content/drive/MyDrive/iot_ids_checkpoints"

CHECKPOINT_DIR = "./checkpoints"

# Create the folder up front; an already existing folder is not an error.
os.makedirs(name=CHECKPOINT_DIR, exist_ok=True)

# Attack labels as described in your project.
# The position of a name in this list is its integer class ID.
ATTACK_LABELS = [
    "Normal",
    "Data Tampering",
    "Impersonation/Spoofing",
    "DDoS/DoS",
    "Silent Exfiltration",
    "Backdoor",
    "Reconnaissance",
]
# Class name -> integer class ID.
ATTACK_LABEL_TO_ID = {name: idx for idx, name in enumerate(ATTACK_LABELS)}
# Integer class ID -> class name. enumerate() yields (index, value) pairs,
# so dict(enumerate(...)) maps each ID to its name.
ATTACK_ID_TO_LABEL = dict(enumerate(ATTACK_LABELS))

# =========================
# 4. Synthetic dataset builder
# =========================

def build_synthetic_iot_dataset(
    n_samples_per_class: int = 800,
    n_features: int = 40,
    random_state: int = RANDOM_STATE,
) -> Dict[str, object]:
    """
    Build a synthetic dataset that mimics Normal + 6 attack types.
    This lets the full pipeline run end-to-end even without real CSVs.
    Replace this later with real merged IoT datasets.

    Args:
        n_samples_per_class: Number of rows generated for every class.
        n_features: Number of numeric feature columns (named f_0, f_1, ...).
        random_state: Seed of the private random generator used here.

    Returns:
        A dict with two entries:
          - "dataframe": DataFrame holding the feature columns plus
            "label_binary" (0 = Normal, 1 = Attack), "label_multi"
            (integer class ID) and "attack_name" (class name).
          - "feature_names": list of the feature column names.
    """
    rng = np.random.RandomState(random_state)

    X = []
    y_multi = []
    attack_names = []

    # One Gaussian blob per class; the class ID is the index in ATTACK_LABELS,
    # so the name is read straight from that list.
    for cls_id, cls_name in enumerate(ATTACK_LABELS):
        # Base mean vector differs per class (random offset plus a class shift)
        mean = rng.uniform(-2, 2, size=n_features) + cls_id * 0.5
        # Isotropic covariance with a per-class random scale
        cov = np.eye(n_features) * rng.uniform(0.5, 1.5)
        samples = rng.multivariate_normal(
            mean, cov, size=n_samples_per_class
        )
        X.append(samples)
        y_multi.extend([cls_id] * n_samples_per_class)
        attack_names.extend([cls_name] * n_samples_per_class)

    X = np.vstack(X)
    y_multi = np.array(y_multi, dtype=int)

    # Binary labels: 0 = Normal, 1 = Attack
    y_binary = (y_multi != ATTACK_LABEL_TO_ID["Normal"]).astype(int)

    feature_names = [f"f_{i}" for i in range(n_features)]
    df = pd.DataFrame(X, columns=feature_names)
    df["label_binary"] = y_binary
    df["label_multi"] = y_multi
    df["attack_name"] = attack_names

    return {
        "dataframe": df,
        "feature_names": feature_names,
    }

# =========================
# 5. BGWO Feature Selector
# =========================

class BGWOFeatureSelector:
    """
    Binary Grey Wolf Optimizer for feature selection using XGBoost as evaluator.

    Every wolf holds one continuous position in [0, 1] per feature. A position
    vector is turned into a 0/1 feature mask by thresholding, and a mask is
    scored with 3-fold cross-validated XGBoost (ROC-AUC when y has two
    classes, accuracy otherwise).

    Attributes set by fit():
        best_mask_: 0/1 array of length n_features, the best mask seen.
        best_score_: fitness value of best_mask_.
    """

    def __init__(
        self,
        n_features: int,
        max_iter: int = 15,
        n_wolves: int = 8,
        random_state: int = RANDOM_STATE,
    ):
        """
        Args:
            n_features: Number of columns of the matrices passed to fit().
            max_iter: Number of position-update iterations.
            n_wolves: Pack size; at least 3 because three leaders are needed.
            random_state: Seed for the optimizer's random generator and for
                the cross-validation splits.

        Raises:
            ValueError: if n_features < 1 or n_wolves < 3.
        """
        if n_features < 1:
            raise ValueError("n_features must be at least 1.")
        if n_wolves < 3:
            raise ValueError(
                "n_wolves must be at least 3 (alpha, beta and delta leaders)."
            )

        self.n_features = n_features
        self.max_iter = max_iter
        self.n_wolves = n_wolves
        self.random_state = random_state

        self.best_mask_: Optional[np.ndarray] = None
        self.best_score_: float = -np.inf

        self._rng = np.random.RandomState(self.random_state)

    def _initialize_positions(self) -> np.ndarray:
        # Continuous positions in [0,1] for each wolf and feature
        return self._rng.uniform(0, 1, size=(self.n_wolves, self.n_features))

    @staticmethod
    def _continuous_to_binary(positions: np.ndarray) -> np.ndarray:
        # Sigmoid + threshold to get binary mask. The sigmoid is centred on
        # 0.5, so a feature is kept exactly when its position is >= 0.5.
        sigmoid = 1 / (1 + np.exp(-10 * (positions - 0.5)))
        return (sigmoid >= 0.5).astype(int)

    @staticmethod
    def _top_three(fitness: np.ndarray) -> Tuple[int, int, int]:
        # Indices of the alpha, beta and delta wolves (three highest fitness).
        order = np.argsort(-fitness)  # descending
        return order[0], order[1], order[2]

    def _fitness(
        self, mask: np.ndarray, X: np.ndarray, y: np.ndarray
    ) -> float:
        """
        Score a feature mask: mean over 3 stratified folds of ROC-AUC
        (two classes) or accuracy (otherwise). An empty mask scores 0.0.
        """
        # If no features selected, return very poor score
        if mask.sum() == 0:
            return 0.0

        X_sel = X[:, mask == 1]

        # The class count does not change between folds; compute it once.
        n_classes = len(np.unique(y))
        is_binary = n_classes == 2

        # Lightweight XGBoost for evaluation
        params = dict(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            subsample=0.9,
            colsample_bytree=0.9,
            objective="binary:logistic" if is_binary else "multi:softprob",
            tree_method="hist",
            n_jobs=-1,
        )
        if n_classes > 2:
            params["num_class"] = n_classes

        cv = StratifiedKFold(
            n_splits=3, shuffle=True, random_state=self.random_state
        )
        scores = []
        for train_idx, valid_idx in cv.split(X_sel, y):
            X_tr, X_va = X_sel[train_idx], X_sel[valid_idx]
            y_tr, y_va = y[train_idx], y[valid_idx]

            model = XGBClassifier(**params)
            model.fit(X_tr, y_tr)

            if is_binary:
                probs = model.predict_proba(X_va)[:, 1]
                score = roc_auc_score(y_va, probs)
            else:
                preds = model.predict(X_va)
                score = (preds == y_va).mean()
            scores.append(score)

        return float(np.mean(scores))

    def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Run BGWO to select a subset of features. Returns indices of selected features.

        Args:
            X: 2-D array with n_features columns.
            y: Class labels, one per row of X.

        Raises:
            ValueError: if X is not 2-D with n_features columns.
        """
        if X.ndim != 2 or X.shape[1] != self.n_features:
            raise ValueError(
                f"X must be 2-D with {self.n_features} columns, got shape {X.shape}."
            )

        # Several wolves often share one mask; score each distinct mask once
        # per fit() call instead of re-running the cross-validation.
        fitness_cache: Dict[bytes, float] = {}

        def evaluate(mask: np.ndarray) -> float:
            key = mask.tobytes()
            if key not in fitness_cache:
                fitness_cache[key] = self._fitness(mask, X, y)
            return fitness_cache[key]

        positions = self._initialize_positions()
        binary_masks = self._continuous_to_binary(positions)

        # Evaluate initial fitness
        fitness = np.array([evaluate(mask) for mask in binary_masks], dtype=float)

        # Identify alpha, beta, delta wolves (best three)
        alpha_idx, beta_idx, delta_idx = self._top_three(fitness)

        self.best_mask_ = binary_masks[alpha_idx].copy()
        self.best_score_ = float(fitness[alpha_idx])

        # Main loop
        for iter_idx in range(self.max_iter):
            a = 2 - 2 * (iter_idx / (self.max_iter - 1 + 1e-9))  # linearly from 2 to 0

            # Snapshot the leaders before any wolf moves. The leaders are
            # themselves rows of `positions` and get rewritten during the
            # sweep; without the copy, later wolves would follow a leader
            # position that was never evaluated.
            leaders = positions[[alpha_idx, beta_idx, delta_idx]].T.copy()  # (n_features, 3)

            for i in range(self.n_wolves):
                # Six uniform draws per feature, ordered (r1, r2) for alpha,
                # then beta, then delta.
                draws = self._rng.rand(self.n_features, 3, 2)
                A = 2 * a * draws[:, :, 0] - a
                C = 2 * draws[:, :, 1]

                current = positions[i][:, np.newaxis]
                D = np.abs(C * leaders - current)
                candidates = leaders - A * D

                new_pos = (
                    candidates[:, 0] + candidates[:, 1] + candidates[:, 2]
                ) / 3.0
                # Keep within [0,1]
                positions[i] = np.clip(new_pos, 0.0, 1.0)

            # Update binary masks and fitness
            binary_masks = self._continuous_to_binary(positions)
            fitness = np.array(
                [evaluate(mask) for mask in binary_masks], dtype=float
            )

            alpha_idx, beta_idx, delta_idx = self._top_three(fitness)

            if fitness[alpha_idx] > self.best_score_:
                self.best_score_ = float(fitness[alpha_idx])
                self.best_mask_ = binary_masks[alpha_idx].copy()

            print(
                f"[BGWO] Iter {iter_idx+1}/{self.max_iter} - "
                f"Best fitness: {self.best_score_:.4f}, "
                f"Selected features: {int(self.best_mask_.sum())}"
            )

        # Safety: if all-zero mask, fall back to top-k features by variance
        if self.best_mask_.sum() == 0:
            print("[BGWO] Warning: empty mask, falling back to variance-based selection.")
            variances = np.var(X, axis=0)
            k = max(1, int(0.2 * self.n_features))
            top_idx = np.argsort(-variances)[:k]
            self.best_mask_[top_idx] = 1

        selected_indices = np.where(self.best_mask_ == 1)[0]
        return selected_indices

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Return the columns of X kept by the fitted mask."""
        if self.best_mask_ is None:
            raise RuntimeError("BGWOFeatureSelector must be fitted before transform().")
        return X[:, self.best_mask_ == 1]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit on (X, y) and return X reduced to the selected columns."""
        self.fit(X, y)
        return self.transform(X)

# =========================
# 6. RFE with XGBoost (Stage 2)
# =========================

def rfe_xgboost(
    X: np.ndarray,
    y: np.ndarray,
    feature_indices: np.ndarray,
    n_features_to_select: Optional[int] = None,
) -> np.ndarray:
    """
    Second selection stage: recursive feature elimination with XGBoost,
    restricted to the columns listed in ``feature_indices``.

    Args:
        X: Full feature matrix (all original columns).
        y: Class labels, one per row of X.
        feature_indices: Column indices of X that are candidates.
        n_features_to_select: How many candidates to keep; defaults to half
            of the candidates, but never fewer than one.

    Returns:
        The kept column indices, expressed in the original feature space.
    """
    if n_features_to_select is None:
        n_features_to_select = max(1, int(len(feature_indices) * 0.5))

    candidate_idx = np.array(feature_indices, dtype=int)
    X_candidates = X[:, candidate_idx]

    n_classes = len(np.unique(y))

    # Plain XGBoost settings; the objective follows the number of classes.
    xgb_kwargs = {
        "n_estimators": 200,
        "max_depth": 5,
        "learning_rate": 0.1,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "objective": "binary:logistic" if n_classes == 2 else "multi:softprob",
        "tree_method": "hist",
        "n_jobs": -1,
    }
    if n_classes > 2:
        xgb_kwargs["num_class"] = n_classes

    # Drop one feature per elimination round.
    selector = RFE(
        estimator=XGBClassifier(**xgb_kwargs),
        n_features_to_select=n_features_to_select,
        step=1,
    )
    selector.fit(X_candidates, y)

    # support_ marks the surviving candidates; map them back to X's columns.
    final_indices = candidate_idx[np.where(selector.support_)[0]]

    print(
        f"[RFE] Reduced from {len(candidate_idx)} to {len(final_indices)} features."
    )
    return final_indices

# =========================
# 7. Hyperparameter optimization with BO-TPE (Optuna)
# =========================

def optuna_objective(
    trial: optuna.trial.Trial,
    X: np.ndarray,
    y: np.ndarray,
    task_type: str = "binary",
) -> float:
    """
    Optuna objective that scores one XGBoost hyperparameter configuration.

    The configuration is drawn from ``trial`` and evaluated with 3-fold
    stratified cross-validation. The returned value is the mean fold score:
    ROC-AUC when ``task_type`` is "binary", accuracy otherwise.
    """
    n_classes = len(np.unique(y))
    is_binary = task_type == "binary"

    # Search space. The suggest calls are kept in a fixed order.
    xgb_params = {}
    xgb_params["n_estimators"] = trial.suggest_int("n_estimators", 100, 400)
    xgb_params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
    xgb_params["learning_rate"] = trial.suggest_float(
        "learning_rate", 0.01, 0.3, log=True
    )
    xgb_params["subsample"] = trial.suggest_float("subsample", 0.6, 1.0)
    xgb_params["colsample_bytree"] = trial.suggest_float(
        "colsample_bytree", 0.6, 1.0
    )
    xgb_params["gamma"] = trial.suggest_float("gamma", 0.0, 5.0)
    xgb_params["min_child_weight"] = trial.suggest_int("min_child_weight", 1, 10)
    xgb_params["reg_lambda"] = trial.suggest_float(
        "reg_lambda", 1e-3, 10.0, log=True
    )

    # Fixed settings shared by every trial.
    xgb_params["tree_method"] = "hist"
    xgb_params["n_jobs"] = -1

    # Task-dependent objective and training metric.
    if is_binary:
        xgb_params["objective"] = "binary:logistic"
        xgb_params["eval_metric"] = "auc"
    else:
        xgb_params["objective"] = "multi:softprob"
        xgb_params["num_class"] = n_classes
        xgb_params["eval_metric"] = "mlogloss"

    splitter = StratifiedKFold(
        n_splits=3, shuffle=True, random_state=RANDOM_STATE
    )

    fold_scores = []
    for fit_rows, val_rows in splitter.split(X, y):
        clf = XGBClassifier(**xgb_params)
        clf.fit(X[fit_rows], y[fit_rows])

        y_val = y[val_rows]
        if is_binary:
            positive_prob = clf.predict_proba(X[val_rows])[:, 1]
            fold_scores.append(roc_auc_score(y_val, positive_prob))
        else:
            predicted = clf.predict(X[val_rows])
            fold_scores.append((predicted == y_val).mean())

    return float(np.mean(fold_scores))


def tune_hyperparameters(
    X: np.ndarray,
    y: np.ndarray,
    task_type: str = "binary",
    n_trials: int = 25,
    study_name: str = "xgb_optuna_study",
) -> Tuple[Dict, optuna.Study]:
    """
    Search XGBoost hyperparameters with Optuna's TPE sampler (BO-TPE).

    Args:
        X: Feature matrix used for the cross-validated objective.
        y: Class labels, one per row of X.
        task_type: "binary" or anything else for the multi-class setup.
        n_trials: Number of Optuna trials to run.
        study_name: Name given to the created study.

    Returns:
        (best_params, study): the best trial's parameters completed with the
        fixed XGBoost settings and the task objective, and the Optuna study.
    """
    study = optuna.create_study(
        study_name=study_name,
        direction="maximize",
        sampler=TPESampler(seed=RANDOM_STATE),
    )

    def objective(trial):
        return optuna_objective(trial, X, y, task_type=task_type)

    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    # Tuned values first, then the settings that are not part of the search.
    best_params = {**study.best_params, "tree_method": "hist", "n_jobs": -1}

    if task_type == "binary":
        best_params["objective"] = "binary:logistic"
    else:
        best_params["objective"] = "multi:softprob"
        best_params["num_class"] = len(np.unique(y))

    params_json = json.dumps(best_params, indent=2)
    print(
        f"[Optuna] Best score: {study.best_value:.4f} with params:\n{params_json}"
    )
    return best_params, study

# =========================
# 8. Attack-specific metrics
# =========================

def print_attack_specific_report(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    label_map: Dict[int, str],
) -> pd.DataFrame:
    """
    Print and return a per-attack classification report.

    Args:
        y_true: Ground-truth class IDs.
        y_pred: Predicted class IDs.
        label_map: Class ID -> display name. IDs without an entry are shown
            as their string form.

    Returns:
        DataFrame built from sklearn's classification report (one row per
        class plus the summary rows).
    """
    # Report on every class seen in either array. Passing the label list
    # explicitly keeps it in step with target_names; classification_report
    # raises when the two lengths differ, which would happen if a class was
    # predicted but is absent from y_true.
    labels = np.union1d(y_true, y_pred).tolist()
    target_names = [label_map.get(lbl, str(lbl)) for lbl in labels]

    report_dict = classification_report(
        y_true,
        y_pred,
        labels=labels,
        target_names=target_names,
        output_dict=True,
        zero_division=0,
    )
    df_report = pd.DataFrame(report_dict).transpose()
    print("\n=== Attack-specific classification report ===")
    print(df_report[["precision", "recall", "f1-score", "support"]])
    return df_report

# =========================
# 9. Full BGWO-RFE-XGBoost pipeline
# =========================

def _save_pipeline_checkpoints(
    checkpoint_prefix: str,
    model,
    scaler,
    feature_indices: np.ndarray,
    feature_names: List[str],
    study,
) -> None:
    """Write the fitted artifacts into CHECKPOINT_DIR and print each path."""
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)

    def artifact_path(suffix: str) -> str:
        return os.path.join(CHECKPOINT_DIR, f"{checkpoint_prefix}_{suffix}")

    model_path = artifact_path("xgb_model.pkl")
    scaler_path = artifact_path("scaler.pkl")
    feature_idx_path = artifact_path("feature_indices.pkl")
    feature_names_path = artifact_path("feature_names.json")
    study_path = artifact_path("optuna_study.pkl")

    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    joblib.dump(feature_indices, feature_idx_path)
    # Explicit encoding so the file does not depend on the platform default.
    with open(feature_names_path, "w", encoding="utf-8") as f:
        json.dump(feature_names, f, indent=2)
    joblib.dump(study, study_path)

    print(f"\n[Checkpoint] Saved model to: {model_path}")
    print(f"[Checkpoint] Saved scaler to: {scaler_path}")
    print(f"[Checkpoint] Saved feature indices to: {feature_idx_path}")
    print(f"[Checkpoint] Saved feature names to: {feature_names_path}")
    print(f"[Checkpoint] Saved Optuna study to: {study_path}")


def run_bgwo_rfe_xgb_pipeline(
    df: pd.DataFrame,
    feature_cols: List[str],
    label_col: str,
    task_type: str,
    checkpoint_prefix: str,
    label_map: Optional[Dict[int, str]] = None,
    bgwo_max_iter: int = 10,
    bgwo_n_wolves: int = 6,
    n_optuna_trials: int = 15,
) -> Dict:
    """
    Run the complete pipeline:
      - Train/test split
      - Scaling
      - SMOTE
      - BGWO (Stage 1)
      - RFE-XGBoost (Stage 2)
      - BO-TPE hyperparameter tuning
      - Final training and evaluation
      - Checkpoint saving

    Args:
        df: Table holding the feature columns and the label column.
        feature_cols: Names of the feature columns in df.
        label_col: Name of the integer label column in df.
        task_type: "binary" or "multiclass".
        checkpoint_prefix: Prefix of every saved file and of the study name.
        label_map: Class ID -> name, used for the per-attack report
            (multiclass only; the report is skipped when None).
        bgwo_max_iter: Iterations of the BGWO stage.
        bgwo_n_wolves: Pack size of the BGWO stage.
        n_optuna_trials: Number of Optuna trials.

    Returns:
        Dict with keys "model", "scaler", "selected_feature_indices",
        "selected_feature_names", "study", "confusion_matrix" and
        "attack_report_df" (None unless the per-attack report was produced).

    Raises:
        ValueError: if task_type is not "binary" or "multiclass".
    """
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if task_type not in ("binary", "multiclass"):
        raise ValueError(
            f"task_type must be 'binary' or 'multiclass', got {task_type!r}"
        )

    X = df[feature_cols].values.astype(np.float32)
    y = df[label_col].values.astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=RANDOM_STATE,
    )

    # The scaler is fitted on the training rows only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Handle class imbalance with SMOTE (on training data only)
    smote = SMOTE(random_state=RANDOM_STATE)
    X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

    print(f"[Data] Train shape (balanced): {X_train_bal.shape}")
    print(f"[Data] Test shape: {X_test_scaled.shape}")

    # Stage 1: BGWO
    bgwo = BGWOFeatureSelector(
        n_features=X_train_bal.shape[1],
        max_iter=bgwo_max_iter,
        n_wolves=bgwo_n_wolves,
    )
    bgwo_selected_idx = bgwo.fit(X_train_bal, y_train_bal)
    print(f"[BGWO] Selected {len(bgwo_selected_idx)} features via BGWO.")

    # Stage 2: RFE keeps half of the BGWO survivors (at least one)
    final_feature_idx = rfe_xgboost(
        X_train_bal,
        y_train_bal,
        bgwo_selected_idx,
        n_features_to_select=max(1, int(len(bgwo_selected_idx) * 0.5)),
    )
    print(f"[RFE] Final selected features: {len(final_feature_idx)}")

    selected_feature_names = [feature_cols[i] for i in final_feature_idx]

    # Reduce data to final selected features
    X_train_fs = X_train_bal[:, final_feature_idx]
    X_test_fs = X_test_scaled[:, final_feature_idx]

    # Hyperparameter tuning with BO-TPE (Optuna)
    best_params, study = tune_hyperparameters(
        X_train_fs,
        y_train_bal,
        task_type=task_type,
        n_trials=n_optuna_trials,
        study_name=f"{checkpoint_prefix}_optuna",
    )

    # Final model training
    model = XGBClassifier(**best_params)
    model.fit(X_train_fs, y_train_bal)

    # Evaluation on the held-out test rows
    y_pred = model.predict(X_test_fs)

    print("\n=== Global classification report ===")
    print(
        classification_report(
            y_test,
            y_pred,
            zero_division=0,
        )
    )

    if task_type == "binary":
        y_prob = model.predict_proba(X_test_fs)[:, 1]
        try:
            auc = roc_auc_score(y_test, y_prob)
            print(f"[Binary] Test ROC-AUC: {auc:.4f}")
        except ValueError:
            print("[Binary] ROC-AUC could not be computed (single class in y_test).")

    # Attack-specific analysis only for multi-class
    attack_report_df = None
    if task_type == "multiclass" and label_map is not None:
        attack_report_df = print_attack_specific_report(
            y_true=y_test,
            y_pred=y_pred,
            label_map=label_map,
        )

    # Confusion matrix
    print("\n=== Confusion matrix ===")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Save checkpoints
    _save_pipeline_checkpoints(
        checkpoint_prefix,
        model,
        scaler,
        final_feature_idx,
        selected_feature_names,
        study,
    )

    results = {
        "model": model,
        "scaler": scaler,
        "selected_feature_indices": final_feature_idx,
        "selected_feature_names": selected_feature_names,
        "study": study,
        "confusion_matrix": cm,
        "attack_report_df": attack_report_df,
    }
    return results

# =========================
# 10. Example: run pipeline on synthetic IoT dataset
# =========================

# Demo run on generated data. The synthetic set stands in for:
# - Normal traffic
# - Data Tampering
# - Impersonation/Spoofing
# - DDoS/DoS
# - Silent Exfiltration
# - Backdoor
# - Reconnaissance

synthetic_data = build_synthetic_iot_dataset(
    n_features=32,
    n_samples_per_class=400,  # reduce/increase for speed vs. accuracy
)
feature_cols_syn = synthetic_data["feature_names"]
df_syn = synthetic_data["dataframe"]

print("Synthetic dataset shape:", df_syn.shape)
print("Class distribution (multi):")
print(df_syn["label_multi"].value_counts().sort_index())

# ---- Run 1: binary task (Normal vs Attack) ----
binary_results = run_bgwo_rfe_xgb_pipeline(
    df=df_syn,
    feature_cols=feature_cols_syn,
    label_col="label_binary",
    task_type="binary",
    checkpoint_prefix="synthetic_binary",
    label_map=None,  # no per-attack report for the binary task
    n_optuna_trials=10,
    bgwo_n_wolves=5,
    bgwo_max_iter=5,  # small for demo; increase for better performance
)

# ---- Run 2: multi-class task (7 classes) ----
multi_results = run_bgwo_rfe_xgb_pipeline(
    df=df_syn,
    feature_cols=feature_cols_syn,
    label_col="label_multi",
    task_type="multiclass",
    checkpoint_prefix="synthetic_multiclass",
    label_map=ATTACK_ID_TO_LABEL,
    n_optuna_trials=10,
    bgwo_n_wolves=5,
    bgwo_max_iter=5,
)

print("\nPipeline execution on synthetic data is complete.")
print("You can now plug in your real merged IoT datasets in place of df_syn.")
