## Installing dependencies and setting up

In [None]:
# + pyarrow (fast CSV/Parquet IO) + polars (fast ETL) + fastparquet (optional)
!pip install pandas numpy tqdm scikit-learn xgboost lightgbm matplotlib pyarrow polars fastparquet


In [None]:
#!/usr/bin/env python3
import os
import polars as pl

folder = os.getcwd()

# Gather every CSV in the working directory (extension match is case-insensitive).
csv_files = []
for entry in os.listdir(folder):
    if entry.lower().endswith(".csv"):
        csv_files.append(entry)

# Exactly one input file is expected; anything else aborts the cell.
if len(csv_files) == 0:
    raise SystemExit("No CSV files found in this folder.")
if len(csv_files) > 1:
    raise SystemExit(f"Expected a single CSV, found {len(csv_files)}: {csv_files}")

csv_name = csv_files[0]
stem, _ext = os.path.splitext(csv_name)
src = os.path.join(folder, csv_name)
dst = os.path.join(folder, stem + "_clean.parquet")

# Lazy scan: nothing is read until the sink below pulls data through.
# Optional scan_csv knobs: quote_char=None (only if there are NO quoted fields),
# ignore_errors=True (if occasional bad lines may be dropped).
csv_scan = pl.scan_csv(
    src,
    try_parse_dates=True,         # parse date-like columns if present
    infer_schema_length=200_000,  # rows sampled for schema inference
)

# Stream straight to Parquet without collecting the full table.
# row_group_size can be tuned for very wide/huge files.
csv_scan.sink_parquet(
    dst,
    compression="zstd",
    compression_level=1,   # favour write speed over file size
    statistics=False,      # skip column statistics for faster writes
    maintain_order=False,
)

print(f"CSV -> Parquet: {src}  ➜  {dst}")


In [None]:
#!/usr/bin/env python3
import os
import pandas as pd

# Work inside the notebook's current directory.
folder_path = os.getcwd()

# Every *.parquet file here (case-insensitive extension) is cleaned in place.
parquet_files = [
    entry
    for entry in os.listdir(folder_path)
    if entry.lower().endswith('.parquet')
]

if not parquet_files:
    print("No Parquet files found in this folder.")

# With an empty list the loop body simply never runs.
for parquet_file in parquet_files:
    file_path = os.path.join(folder_path, parquet_file)

    # Load the whole file, discard every row containing at least one NaN,
    # then overwrite the same path (change the filename if you prefer a copy).
    df = pd.read_parquet(file_path)
    df_cleaned = df.dropna()
    df_cleaned.to_parquet(file_path, index=False)

    print(f"Cleaned {parquet_file} → {len(df_cleaned)} rows kept")


In [None]:
#!/usr/bin/env python3
import os
import polars as pl

folder = os.getcwd()
pattern = os.path.join(folder, "*_clean.parquet")
out_file = os.path.join(folder, "label_counts.txt")

# Lazy query restricted to the (non-null) Label column, so only that column is read.
label_expr = pl.col("Label")
q = pl.scan_parquet(pattern).select(label_expr.cast(pl.Categorical))
q = q.filter(label_expr.is_not_null())

# Row total: a one-cell frame is all that gets materialized.
row_total = q.select(pl.len().alias("rows")).collect(engine="streaming")
n_rows = row_total["rows"][0]

# Per-label frequencies, most frequent first.
counts_lazy = q.group_by("Label").len().sort("len", descending=True)
label_counts = counts_lazy.collect(engine="streaming").rename({"len": "count"})

# Assemble the report in memory, then write it in one go.
report_lines = [
    "Analyzing original Parquet files (glob): *_clean.parquet\n",
    f"Total rows: {n_rows}\n\n",
    "Unique labels and their counts:\n",
]
for label, count in label_counts.select("Label", "count").iter_rows():
    report_lines.append(f"  {label}: {count}\n")
report_lines.append(f"\nTotal unique labels: {label_counts.shape[0]}\n")

with open(out_file, "w") as f:
    f.writelines(report_lines)

print(f"Results written to {out_file}")


In [None]:
#!/usr/bin/env python3
import os
import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

folder = os.getcwd()
pattern = os.path.join(folder, "*_clean.parquet")

# 1) Lazily scan all parts (single call; supports glob)
lazy_all = pl.scan_parquet(pattern)

# Validate the schema up front: selecting a missing column below would fail
# with a less helpful error, so this is the only place the check can fire.
if "Label" not in lazy_all.collect_schema().names():
    raise RuntimeError("Expected a 'Label' column in the dataset.")

# 2) Add a row id and normalise the Label dtype to string.
#    Only (rid, Label) is collected for splitting, not the whole table.
lazy_all = lazy_all.with_row_index("rid").with_columns(pl.col("Label").cast(pl.Utf8))

# 3) Collect the small metadata needed for splitting
meta = lazy_all.select("rid", "Label").collect(engine="streaming")

# 4) Stratify only when it is possible: >=2 classes and every class has >=2 rows
use_stratify = False
counts = meta.group_by("Label").len().select("len").to_series().to_numpy()
if meta["Label"].n_unique() >= 2 and (counts >= 2).all():
    use_stratify = True

# 5) Split on row ids (NumPy arrays), not on the feature table
rid = meta["rid"].to_numpy()
labels = meta["Label"].to_numpy()

train_idx, test_idx = train_test_split(
    rid,
    test_size=0.20,
    shuffle=True,
    random_state=42,
    stratify=labels if use_stratify else None,
)

# 6) Build small row-id tables and semi-join to select each split's rows
train_ids = pl.DataFrame({"rid": train_idx}).lazy()
test_ids  = pl.DataFrame({"rid": test_idx}).lazy()

lazy_with_id = lazy_all  # already has "rid" column

# "rid" is only a splitting helper. It is numeric, and the later cells treat
# every numeric column except Label as a feature, so it must not be written
# to the split files (a row position is not a real feature and can leak the
# original file ordering into the models).
lazy_train = lazy_with_id.join(train_ids, on="rid", how="semi").drop("rid")
lazy_test  = lazy_with_id.join(test_ids,  on="rid", how="semi").drop("rid")

train_parquet = os.path.join(folder, "train.parquet")
test_parquet  = os.path.join(folder, "test.parquet")

# 7) Stream directly to Parquet; fast codec/level, tweak as you prefer.
lazy_train.sink_parquet(train_parquet, compression="zstd", compression_level=1, statistics=False)
lazy_test.sink_parquet(test_parquet,   compression="zstd", compression_level=1, statistics=False)

print(
    "Data split complete:\n"
    f" • Pattern: {pattern}\n"
    f" • Training rows: {len(train_idx)}\n"
    f" • Test rows: {len(test_idx)}\n"
    f" • Stratified: {use_stratify}\n"
    f" • Saved: {train_parquet}, {test_parquet}"
)


From the label counts we deduce that there are 17 classes (16 attack labels + 1 benign label).

In [None]:
#!/usr/bin/env python3
import os
import pickle
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.preprocessing import StandardScaler

folder = os.getcwd()
train_parquet = os.path.join(folder, "train.parquet")

# 1) Load train
train_pl = pl.read_parquet(train_parquet)
y_column = "Label"
if y_column not in train_pl.columns:
    raise RuntimeError("Expected 'Label' in train.parquet")

# 2) Numeric cols (exclude label)
numeric_cols = [c for c in train_pl.select(cs.numeric()).columns if c != y_column]
if not numeric_cols:
    raise RuntimeError("No numeric feature columns found.")

# 3) Medians on TRAIN ONLY.
#    ±inf is masked to null first, because the imputation step treats inf as
#    missing; leaving it in could yield an infinite median that would then be
#    written back into the data as the "fill" value.
meds_s = train_pl.select([
    pl.when(pl.col(c).is_infinite()).then(None).otherwise(pl.col(c)).median().alias(c)
    for c in numeric_cols
])
medians = {}
for c in numeric_cols:
    m = meds_s[c][0]
    # A column with no usable values has no median (None/NaN); fall back to 0.0
    # instead of crashing on float(None) or imputing a non-finite value.
    medians[c] = float(m) if m is not None and np.isfinite(m) else 0.0

# 4) Impute to float32 → view as float64 for stats
def to_imputed_f32(df: pl.DataFrame, cols: list[str], med: dict[str, float]) -> np.ndarray:
    """Return the `cols` of `df` as a float32 matrix with missing values imputed.

    For each column, ±inf is turned into null, then nulls and NaNs are replaced
    by `med[column]`, and the result is cast to Float32. Columns appear in the
    order given by `cols`.
    """
    imputed = []
    for name in cols:
        col = pl.col(name)
        finite_or_null = pl.when(col.is_infinite()).then(None).otherwise(col)
        fill = med[name]
        imputed.append(
            finite_or_null.fill_null(fill).fill_nan(fill).cast(pl.Float32).alias(name)
        )
    features = df.with_columns(imputed).select(cols)
    return features.to_numpy().astype(np.float32, copy=False)

# Imputed float32 features, viewed as float64 so the statistics below are computed in double precision.
X_train_for_scaler = to_imputed_f32(train_pl, numeric_cols, medians)
X_train_for_scaler = X_train_for_scaler.astype(np.float64, copy=False)

# 4.1) Remove (near-)constant columns once, before fitting the scaler
EPS_STD = 1e-12
stds = X_train_for_scaler.std(axis=0, dtype=np.float64)  # population std (ddof=0)
keep_mask = stds > EPS_STD
if (~keep_mask).any():
    dropped = [c for i, c in enumerate(numeric_cols) if not keep_mask[i]]
    print(f"Dropping {len(dropped)} ~zero-variance cols:", dropped)
    numeric_cols = [c for i, c in enumerate(numeric_cols) if keep_mask[i]]
    X_train_for_scaler = X_train_for_scaler[:, keep_mask]
    medians = {c: medians[c] for c in numeric_cols}

# 5) Fit scaler (fit returns the estimator itself)
scaler = StandardScaler().fit(X_train_for_scaler)

# 5.1) Indices of columns whose fitted scale is large enough for the LR path
EPS_SCALE = 1e-6
lr_keep_mask = scaler.scale_ >= EPS_SCALE
lr_keep_idx = np.flatnonzero(lr_keep_mask).astype(np.int32)
if (~lr_keep_mask).any():
    dropped_lr = [c for i, c in enumerate(numeric_cols) if not lr_keep_mask[i]]
    print(f"[LR path] Dropping {len(dropped_lr)} near-constant cols by scale:", dropped_lr)

# 6) Persist everything later cells need to reproduce the preprocessing
artifacts = dict(
    numeric_cols=numeric_cols,
    medians=medians,
    scaler_mean=scaler.mean_.astype(np.float64),
    scaler_scale=scaler.scale_.astype(np.float64),  # raw, unclipped scales
    lr_keep_idx=lr_keep_idx,
)
preproc_path = os.path.join(folder, "preproc.pkl")
with open(preproc_path, "wb") as f:
    pickle.dump(artifacts, f)

print(f"Saved preproc.pkl with {len(numeric_cols)} cols (LR cols kept: {lr_keep_idx.size}).")


## 17(16+1) Classes (Attack Labels)

In [None]:
#!/usr/bin/env python3
# --- Threading: let libs use all cores (don't cap OMP to 1)
import os, time, warnings
# Number of worker threads to advertise; falls back to 4 when the count is unknown.
CORES = os.cpu_count() or 4

# Thread-count environment variables read by the numeric back-ends
# (OpenMP, MKL, OpenBLAS, NumExpr). NumExpr is capped at 8.
os.environ.update({
    "OMP_NUM_THREADS": str(CORES),
    "MKL_NUM_THREADS": str(CORES),
    "OPENBLAS_NUM_THREADS": str(CORES),
    "NUMEXPR_NUM_THREADS": str(min(CORES, 8)),
})

# Silence UserWarning (and subclasses) for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

import polars as pl
import numpy as np
import pickle
from tqdm import tqdm

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
)
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

# --------------------------
# 1) Load train/test Parquet (Polars)
# --------------------------
folder = os.getcwd()
train_parquet = os.path.join(folder, "train.parquet")
test_parquet  = os.path.join(folder, "test.parquet")
if not (os.path.exists(train_parquet) and os.path.exists(test_parquet)):
    raise RuntimeError("Missing train.parquet/test.parquet. Run the splitter step first.")

train_pl = pl.read_parquet(train_parquet)
test_pl  = pl.read_parquet(test_parquet)

# --------------------------
# 2) Columns: numeric features + label
# --------------------------
y_column = "Label"
if y_column not in train_pl.columns or y_column not in test_pl.columns:
    raise RuntimeError("Expected a 'Label' column in both train and test dataframes.")

# Feature set = every numeric column of the TRAIN schema except the label.
# DataType.is_numeric() replaces the deprecated pl.NUMERIC_DTYPES group constant.
schema = train_pl.schema
numeric_cols = [c for c, dt in schema.items() if c != y_column and dt.is_numeric()]
if not numeric_cols:
    raise RuntimeError("No numeric feature columns found.")

# The same columns are selected from the test frame later; fail early and
# clearly if the two splits do not share them.
missing_in_test = [c for c in numeric_cols if c not in test_pl.columns]
if missing_in_test:
    raise RuntimeError(f"test.parquet is missing feature columns: {missing_in_test}")

# --------------------------
# 3) Feature hygiene in Polars (inf -> null; fill null/nan with median; cast to f32)
# --------------------------
def _finite_or_null(name: str) -> pl.Expr:
    """Expression for column `name` with ±inf replaced by null."""
    col = pl.col(name)
    return pl.when(col.is_infinite()).then(None).otherwise(col)


def numeric_medians(df: pl.DataFrame, cols: list[str]) -> dict:
    """Per-column medians of `cols` in `df`, computed after masking ±inf.

    A column without any usable value has no median; 0.0 is used for it so
    the fill step below always receives a concrete value.
    """
    row = df.select([_finite_or_null(c).median().alias(c) for c in cols])
    medians = {}
    for c in cols:
        value = row[c][0]
        medians[c] = 0.0 if value is None else value
    return medians


def prep_numeric(df: pl.DataFrame, cols: list[str], medians=None) -> pl.DataFrame:
    """Clean the numeric feature columns of `df`.

    ±inf becomes null, nulls/NaNs are filled with the column median and every
    column in `cols` is cast to Float32.

    Parameters
    ----------
    df : frame to clean.
    cols : names of the numeric columns to process.
    medians : optional mapping column -> fill value. When omitted, the medians
        are computed from `df` itself (the original behaviour). Pass the
        TRAIN medians when cleaning the test split so no test statistics are
        used for imputation.
    """
    if medians is None:
        medians = numeric_medians(df, cols)
    return df.with_columns([
        _finite_or_null(c).fill_null(medians[c]).fill_nan(medians[c]).cast(pl.Float32).alias(c)
        for c in cols
    ])


# Medians come from the training split only and are reused for the test split,
# matching the "medians on TRAIN ONLY" rule of the preprocessing step.
train_medians = numeric_medians(train_pl, numeric_cols)
train_pl = prep_numeric(train_pl, numeric_cols, train_medians)
test_pl  = prep_numeric(test_pl,  numeric_cols, train_medians)

# --------------------------
# 4) To NumPy (float32, optionally Fortran order)
# --------------------------
def to_f32_f_order(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Return `cols` of `df` as a float32, Fortran-ordered (column-major) matrix."""
    return np.asarray(df.select(cols).to_numpy(), dtype=np.float32, order="F")

X_train_full = to_f32_f_order(train_pl, numeric_cols)
X_test       = to_f32_f_order(test_pl,  numeric_cols)

# Raw label values, one per row, in the same row order as the matrices above
y_train_raw = train_pl[y_column].to_numpy()
y_test_raw  = test_pl[y_column].to_numpy()

# --------------------------
# 5) Label encoding on union(train,test) to avoid unseen-class errors
# --------------------------
all_labels = np.concatenate([y_train_raw, y_test_raw]).astype(str)
le = LabelEncoder().fit(all_labels)   # fit returns the encoder itself
y_train_full = le.transform(y_train_raw.astype(str))
y_test       = le.transform(y_test_raw.astype(str))

# --------------------------
# 6) Scaling for LR in float32 (apply StandardScaler params in f32)
# --------------------------
scaler_path = os.path.join(folder, "preproc.pkl")
if not os.path.exists(scaler_path):
    raise RuntimeError("preproc.pkl not found. Run the preprocessing step first.")
# NOTE: unpickling executes code from the file; only load a preproc.pkl that
# this notebook produced itself.
with open(scaler_path, "rb") as f:
    artifacts = pickle.load(f)

if any(k not in artifacts for k in ("numeric_cols", "scaler_mean", "scaler_scale")):
    raise RuntimeError("Loaded scaler lacks mean_/scale_. Refit with StandardScaler.")

scaler_cols = list(artifacts["numeric_cols"])
saved_mean  = np.asarray(artifacts["scaler_mean"],  dtype=np.float32)
saved_scale = np.asarray(artifacts["scaler_scale"], dtype=np.float32)
if not (saved_mean.shape == saved_scale.shape == (len(scaler_cols),)):
    raise RuntimeError("preproc.pkl is inconsistent: mean/scale length does not match numeric_cols.")

# The preprocessing step drops zero-variance columns before fitting, so the
# saved parameters can cover fewer columns than `numeric_cols` here. Align them
# BY NAME to the feature layout of this cell; a plain positional broadcast
# would fail (or silently mis-scale) whenever a column was dropped.
col_pos = {c: i for i, c in enumerate(numeric_cols)}
unknown = [c for c in scaler_cols if c not in col_pos]
if unknown:
    raise RuntimeError(
        f"preproc.pkl was fitted on columns absent from train.parquet: {unknown}. "
        "Re-run the preprocessing step."
    )

# Columns without saved parameters keep mean 0 / scale 1, i.e. pass through unscaled.
mean_f32  = np.zeros(len(numeric_cols), dtype=np.float32)
scale_f32 = np.ones(len(numeric_cols),  dtype=np.float32)
target_idx = np.array([col_pos[c] for c in scaler_cols], dtype=np.intp)
mean_f32[target_idx]  = saved_mean
scale_f32[target_idx] = saved_scale

fitted = set(scaler_cols)
unscaled_cols = [c for c in numeric_cols if c not in fitted]
if unscaled_cols:
    print(f"[LR path] {len(unscaled_cols)} cols have no scaler params and are left unscaled:", unscaled_cols)

# Guard against division by zero for any exactly-zero scale
scale_safe = np.where(scale_f32 == 0.0, 1.0, scale_f32)

def apply_scaler_f32(X_f):
    """Standardize `X_f` column-wise as (X - mean) / scale, in float32.

    Reads the module-level `mean_f32` and `scale_safe`, which are aligned to
    the column order of `numeric_cols`. Returns a new Fortran-ordered float32
    array; `X_f` is not modified.
    """
    out = np.empty_like(X_f, dtype=np.float32, order='F')
    np.subtract(X_f, mean_f32, out=out)
    np.divide(out, scale_safe, out=out)
    return out

X_train_lr_full = apply_scaler_f32(X_train_full)
X_test_lr       = apply_scaler_f32(X_test)

# --------------------------
# 7) Validation split for early stopping
# --------------------------
# One stratified 85/15 split applied to the raw features, the scaled (LR)
# features and the labels together, so all three stay row-aligned.
(
    X_tr_num, X_val_num,
    X_tr_lr,  X_val_lr,
    y_tr,     y_val,
) = train_test_split(
    X_train_full,
    X_train_lr_full,
    y_train_full,
    test_size=0.15,
    random_state=42,
    stratify=y_train_full,
)

# --------------------------
# 8) Models (light defaults) + boosters with early stopping
# --------------------------
RANDOM_STATE = 42

# Estimators in training order. Each is paired below with its class name,
# which the training loops use for branching and for output file names.
_estimators = [
    LogisticRegression(
        solver="lbfgs", penalty="l2", max_iter=1000, tol=2e-3,
        random_state=RANDOM_STATE,
    ),
    DecisionTreeClassifier(
        criterion="entropy", max_depth=5, random_state=RANDOM_STATE,
    ),
    RandomForestClassifier(
        n_estimators=100, max_features="sqrt", n_jobs=-1,
        random_state=RANDOM_STATE,
    ),
    AdaBoostClassifier(
        n_estimators=50, learning_rate=0.5, random_state=RANDOM_STATE,
    ),
    BaggingClassifier(
        n_estimators=10, n_jobs=-1, random_state=RANDOM_STATE,
    ),
    # Large n_estimators: the training loop fits this model with early stopping.
    LGBMClassifier(
        n_estimators=4000, num_leaves=48, learning_rate=0.06,
        subsample=0.8, colsample_bytree=0.8,
        n_jobs=CORES, random_state=RANDOM_STATE,
    ),
]

ML_models = [(type(est).__name__, est) for est in _estimators]

# --------------------------
# 9) Report
# --------------------------
report_path = os.path.join(folder, "report.txt")
# Start from an empty report file; log_metrics appends to it.
with open(report_path, "w"):
    pass

def log_metrics(name, model, X_te, y_te):
    """Predict on `X_te` and append accuracy and macro recall/precision/F1 to the report.

    `name` is used as the section header; results go to the module-level
    `report_path`, opened in append mode.
    """
    y_pred = model.predict(X_te)
    macro = {"average": "macro"}
    acc  = accuracy_score(y_te, y_pred)
    rec  = recall_score(y_te, y_pred, **macro)
    prec = precision_score(y_te, y_pred, **macro)
    f1   = f1_score(y_te, y_pred, **macro)
    section = [
        f"####### {name} #######\n",
        f"Accuracy : {acc:.4f}\n",
        f"Recall   : {rec:.4f}\n",
        f"Precision: {prec:.4f}\n",
        f"F1 Score : {f1:.4f}\n\n",
    ]
    with open(report_path, "a") as fp:
        fp.writelines(section)

# --------------------------
# 10) Train / time / save / evaluate
# --------------------------
for name, model in tqdm(ML_models, total=len(ML_models), desc="Models"):
    print(f"\n▶ Training {name}")
    t0 = time.perf_counter()

    # Only LogisticRegression consumes the standardized matrices; every other
    # model is trained and tested on the raw numeric features.
    is_lr = name == "LogisticRegression"
    Xte = X_test_lr if is_lr else X_test

    if is_lr:
        model.fit(X_tr_lr, y_tr)

    elif name == "XGBClassifier":
        # Not part of ML_models as defined above; kept for when XGB is added.
        val_set = [(X_val_num, y_val)]
        try:
            # Callback-based early stopping
            from xgboost import callback as xgb_callback
            es = [xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=False)]
            model.fit(X_tr_num, y_tr, eval_set=val_set, callbacks=es)
        except (TypeError, ImportError):
            try:
                # Fallback: early_stopping_rounds passed to fit
                model.fit(
                    X_tr_num, y_tr,
                    eval_set=val_set,
                    early_stopping_rounds=50,
                    verbose=False,
                )
            except TypeError:
                # Last resort: plain fit without early stopping
                model.fit(X_tr_num, y_tr)

    elif name == "LGBMClassifier":
        import lightgbm as lgb
        stopper = lgb.early_stopping(50, verbose=False)
        model.fit(
            X_tr_num, y_tr,
            eval_set=[(X_val_num, y_val)],
            eval_metric="multi_logloss",
            callbacks=[stopper],
        )

    else:
        model.fit(X_tr_num, y_tr)

    secs = time.perf_counter() - t0
    print(f"⏱ {name} trained in {secs:.1f}s")

    # Persist the fitted model next to the data files
    model_file = os.path.join(folder, f"{name}_model.pkl")
    with open(model_file, "wb") as f:
        pickle.dump(model, f)

    print(f"▶ Testing {name}")
    log_metrics(name, model, Xte, y_test)

print(f"\nAll models trained, tested, and metrics logged to '{report_path}'.")


In [None]:
# Confusion Matrix (counts + row-normalized) for any saved model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os, pickle

# --- choose the model you want to visualize (must match the filename you saved) ---
model_name = "LGBMClassifier"        # e.g. "LogisticRegression", "RandomForestClassifier", ...
model_path = os.path.join(folder, f"{model_name}_model.pkl")

with open(model_path, "rb") as f:
    model = pickle.load(f)

# LogisticRegression was trained on standardized features; all others on raw ones.
Xte = X_test if model_name != "LogisticRegression" else X_test_lr
y_pred = model.predict(Xte)

# y_test is integer-encoded, so the matrix is indexed by 0..K-1 and the
# encoder's class names are used only as tick labels.
display_lbls = le.classes_
idx_labels   = np.arange(len(le.classes_))

# --- Raw counts: CSV + figure ---
cm = confusion_matrix(y_test, y_pred, labels=idx_labels)
np.savetxt(os.path.join(folder, f"cm_counts_{model_name}.csv"), cm, fmt="%d", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
counts_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_lbls)
counts_disp.plot(ax=ax, xticks_rotation="vertical")
ax.set_title(f"Confusion Matrix (counts) — {model_name}")
plt.savefig(os.path.join(folder, f"cm_counts_{model_name}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

# --- Row-normalized (each true-class row sums to 1): CSV + figure ---
with np.errstate(invalid="ignore", divide="ignore"):
    row_sums = cm.sum(axis=1, keepdims=True)
    row_sums = np.where(row_sums == 0, 1, row_sums)  # empty rows: divide by 1 instead of 0
    cm_norm = cm / row_sums

np.savetxt(os.path.join(folder, f"cm_normalized_{model_name}.csv"), cm_norm, fmt="%.4f", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
norm_disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=display_lbls)
norm_disp.plot(ax=ax, xticks_rotation="vertical", values_format=".2f", cmap=plt.cm.Blues)
ax.set_title(f"Confusion Matrix (row-normalized) — {model_name}")
plt.savefig(os.path.join(folder, f"cm_normalized_{model_name}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

saved_files = (
    f"cm_counts_{model_name}.png / .csv",
    f"cm_normalized_{model_name}.png / .csv",
)
print("Saved:", *saved_files, sep="\n- ")


In [None]:
import os
import polars as pl

# Combine train/test labels
# Stack the Label column of both splits into a single one-column frame
label_frames = [frame.select("Label") for frame in (train_pl, test_pl)]
df = pl.concat(label_frames)

# Frequency per label, most frequent first
label_counts = df.group_by("Label").len().sort("len", descending=True)

# One "label: count" line per class
out_path = os.path.join(folder, "label_counts.txt")
with open(out_path, "w") as f:
    f.writelines(f"{label}: {count}\n" for label, count in label_counts.iter_rows())

print(f"✅ Saved labels and counts to {out_path}")


## 6(5+1) Classes (Attack Labels)


We group the individual attack labels into broader attack classes.

In [12]:
# Coarse class -> the fine-grained labels it absorbs.
_groups_6classes = {
    'DDoS': ('DDoS_ICMP', 'DDoS_UDP', 'DDoS_TCP', 'DDoS_SYN'),
    'DoS': ('DoS_UDP', 'DoS_TCP', 'DoS_SYN', 'DoS_ICMP'),
    'Benign': ('Benign',),
    'Spoofing': ('Spoofing_ARP',),
    'Recon': (
        'Recon_Ping_Sweep',
        'Recon_OS_Scan',
        'Recon_Port_Scan',
        'Recon_Vulnerability_Scan',
    ),
    'MQTT': ('MQTT_DDoS_Flooding', 'MQTT_DoS_Flooding', 'MQTT_Malformed_Data'),
}

# Inverted lookup used for relabeling: fine-grained label -> coarse class.
dict_6classes = {
    fine: coarse
    for coarse, members in _groups_6classes.items()
    for fine in members
}


In [None]:
# === 6-class grouping run (Polars → NumPy) ===
import polars as pl
import numpy as np
import os, pickle, time
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split



y_column = "Label"

# --- Map labels to 6 classes (drop anything unmapped) ---
def map_labels_pl(df: pl.DataFrame, mapping: dict) -> pl.DataFrame:
    """Relabel the `y_column` of `df` through `mapping` and drop unmapped rows.

    The label column is cast to string, every value found in `mapping` is
    replaced by its mapped class, and every other value becomes null. Rows
    whose label ended up null are then removed.

    `replace_strict(..., default=None)` is used because passing `default` to
    `Expr.replace` is deprecated in current Polars.
    """
    mapped = df.with_columns(
        pl.col(y_column).cast(pl.Utf8).replace_strict(mapping, default=None).alias(y_column)
    )
    # Drop rows where mapping failed
    return mapped.drop_nulls(subset=[y_column])

train_pl_6 = map_labels_pl(train_pl, dict_6classes)
test_pl_6  = map_labels_pl(test_pl,  dict_6classes)

# --- Hygiene per split: replace ±inf→null, fill with medians, cast float32 ---
def prep_numeric_pl(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    """Clean `cols` of `df`: ±inf -> null, fill null/NaN with the column median, cast to Float32.

    Medians are taken from `df` itself, after the ±inf masking.
    """
    masked_exprs = []
    for name in cols:
        col = pl.col(name)
        masked_exprs.append(pl.when(col.is_infinite()).then(None).otherwise(col).alias(name))
    masked = df.with_columns(masked_exprs)

    meds = masked.select([pl.col(name).median().alias(name) for name in cols])
    fill_values = {name: meds[name][0] for name in cols}

    filled_exprs = [
        pl.col(name).fill_null(fill_values[name]).fill_nan(fill_values[name]).cast(pl.Float32).alias(name)
        for name in cols
    ]
    return masked.with_columns(filled_exprs)

train_pl_6 = prep_numeric_pl(train_pl_6, numeric_cols)
test_pl_6  = prep_numeric_pl(test_pl_6,  numeric_cols)

# --- To NumPy (float32, Fortran order) ---
def to_f32_f_order(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Return `cols` of `df` as a float32, Fortran-ordered (column-major) matrix."""
    return np.asarray(df.select(cols).to_numpy(), dtype=np.float32, order="F")

X_train_full_6 = to_f32_f_order(train_pl_6, numeric_cols)
X_test_6       = to_f32_f_order(test_pl_6,  numeric_cols)

# --- Labels as strings, encoded with one LabelEncoder fitted on train+test ---
y_train_6_raw = train_pl_6[y_column].to_numpy().astype(str)
y_test_6_raw  = test_pl_6[y_column].to_numpy().astype(str)

all_labels_6 = np.concatenate([y_train_6_raw, y_test_6_raw])
le6 = LabelEncoder().fit(all_labels_6)   # fit returns the encoder itself
y_train_full_6 = le6.transform(y_train_6_raw)
y_test_6       = le6.transform(y_test_6_raw)

# --- Scaling for LR (reuse your pre-fitted scaler params already loaded) ---
def apply_scaler_f32(X_f):
    """Return (X_f - mean_f32) / scale_safe as a new Fortran-ordered float32 array.

    `mean_f32` and `scale_safe` are the scaler parameters loaded earlier in
    the session; `X_f` is left untouched.
    """
    scaled = np.empty_like(X_f, dtype=np.float32, order='F')
    np.subtract(X_f, mean_f32, out=scaled)
    scaled /= scale_safe
    return scaled

X_train_lr_full_6 = apply_scaler_f32(X_train_full_6)
X_test_lr_6       = apply_scaler_f32(X_test_6)

# --- One stratified 85/15 split shared by raw features, scaled features and labels ---
(
    X_tr_num6, X_val_num6,
    X_tr_lr6,  X_val_lr6,
    y_tr6,     y_val6,
) = train_test_split(
    X_train_full_6,
    X_train_lr_full_6,
    y_train_full_6,
    test_size=0.15,
    random_state=42,
    stratify=y_train_full_6,
)

# --- Section header appended to the shared report.txt ---
report_path = os.path.join(folder, "report.txt")
class_names_6 = ", ".join(str(c) for c in le6.classes_)
with open(report_path, "a") as fp:
    fp.writelines([
        "\n\n===== Grouped 6-Class Run =====\n",
        "Classes (encoded order): " + class_names_6 + "\n",
    ])

# --- Train / evaluate each model on 6 classes ---
for name, model in tqdm(ML_models, total=len(ML_models), desc="Models (6-class)"):
    print(f"\n▶ Training {name} (6-class)")
    t0 = time.perf_counter()

    # LogisticRegression uses the standardized matrices; all other models the raw ones.
    is_lr = name == "LogisticRegression"
    Xte = X_test_lr_6 if is_lr else X_test_6

    if is_lr:
        model.fit(X_tr_lr6, y_tr6)

    elif name == "XGBClassifier":
        # Only reached if XGB is added to ML_models; the list above does not include it.
        val_set = [(X_val_num6, y_val6)]
        try:
            from xgboost import callback as xgb_callback
            es = [xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=False)]
            model.fit(X_tr_num6, y_tr6, eval_set=val_set, callbacks=es)
        except (TypeError, ImportError):
            try:
                # Fallback: early_stopping_rounds passed to fit
                model.fit(
                    X_tr_num6, y_tr6,
                    eval_set=val_set,
                    early_stopping_rounds=50,
                    verbose=False,
                )
            except TypeError:
                # Last resort: plain fit without early stopping
                model.fit(X_tr_num6, y_tr6)

    elif name == "LGBMClassifier":
        import lightgbm as lgb
        stopper = lgb.early_stopping(50, verbose=False)
        model.fit(
            X_tr_num6, y_tr6,
            eval_set=[(X_val_num6, y_val6)],
            eval_metric="multi_logloss",
            callbacks=[stopper],
        )

    else:
        model.fit(X_tr_num6, y_tr6)

    secs = time.perf_counter() - t0
    print(f"⏱ {name} (6-class) trained in {secs:.1f}s")

    # Persist the fitted model under a 6-class specific file name
    model_file = os.path.join(folder, f"{name}_6classes_model.pkl")
    with open(model_file, "wb") as f:
        pickle.dump(model, f)

    # Score on the 6-class test labels (macro averages over classes)
    print(f"▶ Testing {name} (6-class)")
    y_pred = model.predict(Xte)
    macro = {"average": "macro"}
    acc  = accuracy_score(y_test_6, y_pred)
    rec  = recall_score(y_test_6, y_pred, **macro)
    prec = precision_score(y_test_6, y_pred, **macro)
    f1   = f1_score(y_test_6, y_pred, **macro)

    section = [
        f"####### {name} (6 Classes) #######\n",
        f"Accuracy : {acc:.4f}\n",
        f"Recall   : {rec:.4f}\n",
        f"Precision: {prec:.4f}\n",
        f"F1 Score : {f1:.4f}\n\n",
    ]
    with open(report_path, "a") as fp:
        fp.writelines(section)

print("\n✅ All 6-class models trained & logged to 'report.txt'.")
print("Encoded class order (6-class):", list(le6.classes_))


In [None]:
# Confusion Matrix (counts + row-normalized) for 6-class models
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os, pickle

# --- choose the model you want to visualize (must match what you saved with _6classes) ---
model_name = "LGBMClassifier"        # e.g. "LogisticRegression", "RandomForestClassifier", ...
suffix = "6classes"                   # keeps filenames distinct

model_path = os.path.join(folder, f"{model_name}_{suffix}_model.pkl")
with open(model_path, "rb") as f:
    model = pickle.load(f)

# LogisticRegression was trained on standardized features; all others on raw ones.
Xte = X_test_6 if model_name != "LogisticRegression" else X_test_lr_6
y_true = y_test_6
y_pred = model.predict(Xte)

# Index by 0..K-1 so the matrix always has one row/column per encoded class;
# the encoder's class names serve as tick labels.
display_lbls = le6.classes_
idx_labels   = np.arange(len(le6.classes_))

# --- Raw counts: CSV + figure ---
cm = confusion_matrix(y_true, y_pred, labels=idx_labels)
np.savetxt(os.path.join(folder, f"cm_counts_{model_name}_{suffix}.csv"), cm, fmt="%d", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
counts_disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_lbls)
counts_disp.plot(ax=ax, xticks_rotation="vertical")
ax.set_title(f"Confusion Matrix (counts) — {model_name} ({suffix})")
plt.savefig(os.path.join(folder, f"cm_counts_{model_name}_{suffix}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

# --- Row-normalized (each true-class row sums to 1): CSV + figure ---
with np.errstate(invalid="ignore", divide="ignore"):
    row_sums = cm.sum(axis=1, keepdims=True)
    row_sums = np.where(row_sums == 0, 1, row_sums)  # empty rows: divide by 1 instead of 0
    cm_norm = cm / row_sums

np.savetxt(os.path.join(folder, f"cm_normalized_{model_name}_{suffix}.csv"), cm_norm, fmt="%.4f", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
norm_disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=display_lbls)
norm_disp.plot(ax=ax, xticks_rotation="vertical", values_format=".2f", cmap=plt.cm.Blues)
ax.set_title(f"Confusion Matrix (row-normalized) — {model_name} ({suffix})")
plt.savefig(os.path.join(folder, f"cm_normalized_{model_name}_{suffix}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

saved_files = (
    f"cm_counts_{model_name}_{suffix}.png / .csv",
    f"cm_normalized_{model_name}_{suffix}.png / .csv",
)
print("Saved:", *saved_files, sep="\n- ")


## 2(1+1) Classes (Attack or Benign)


Now, let's move to just 2 classes (attack, benign).

In [16]:
# Every non-benign label, listed by family, collapses to 'Attack'.
_attack_labels = (
    # DDoS
    'DDoS_ICMP', 'DDoS_UDP', 'DDoS_TCP', 'DDoS_SYN',
    # DoS
    'DoS_UDP', 'DoS_TCP', 'DoS_SYN', 'DoS_ICMP',
    # Spoofing
    'Spoofing_ARP',
    # Recon
    'Recon_Ping_Sweep', 'Recon_OS_Scan', 'Recon_Port_Scan', 'Recon_Vulnerability_Scan',
    # MQTT
    'MQTT_DDoS_Flooding', 'MQTT_DoS_Flooding', 'MQTT_Malformed_Data',
)

# Binary lookup: 'Benign' stays as is, everything else becomes 'Attack'.
dict_2classes = {'Benign': 'Benign'}
dict_2classes.update(dict.fromkeys(_attack_labels, 'Attack'))


In [None]:
# === 2-class grouping run (Polars → NumPy) ===
import polars as pl
import numpy as np
import os, pickle, time
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

y_column = "Label"

# --- Map labels to 2 classes (drop anything unmapped) ---
def map_labels_pl(df: pl.DataFrame, mapping: dict) -> pl.DataFrame:
    """Relabel the `y_column` of `df` through `mapping` and drop unmapped rows.

    The label column is cast to string, every value found in `mapping` is
    replaced by its mapped class, and every other value becomes null. Rows
    whose label ended up null are then removed.

    `replace_strict(..., default=None)` is used because passing `default` to
    `Expr.replace` is deprecated in current Polars.
    """
    mapped = df.with_columns(
        pl.col(y_column).cast(pl.Utf8).replace_strict(mapping, default=None).alias(y_column)
    )
    return mapped.drop_nulls(subset=[y_column])

train_pl_2 = map_labels_pl(train_pl, dict_2classes)
test_pl_2  = map_labels_pl(test_pl,  dict_2classes)

# --- Hygiene per split: ±inf→null, fill medians, cast float32 ---
def prep_numeric_pl(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    """Clean `cols` of `df`: ±inf -> null, fill null/NaN with the column median, cast to Float32.

    Medians are taken from `df` itself, after the ±inf masking.
    """
    masked_exprs = []
    for name in cols:
        col = pl.col(name)
        masked_exprs.append(pl.when(col.is_infinite()).then(None).otherwise(col).alias(name))
    masked = df.with_columns(masked_exprs)

    meds = masked.select([pl.col(name).median().alias(name) for name in cols])
    fill_values = {name: meds[name][0] for name in cols}

    filled_exprs = [
        pl.col(name).fill_null(fill_values[name]).fill_nan(fill_values[name]).cast(pl.Float32).alias(name)
        for name in cols
    ]
    return masked.with_columns(filled_exprs)

train_pl_2 = prep_numeric_pl(train_pl_2, numeric_cols)
test_pl_2  = prep_numeric_pl(test_pl_2,  numeric_cols)

# --- To NumPy (float32, Fortran order) ---
def to_f32_f_order(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Return `cols` of `df` as a float32, Fortran-ordered (column-major) matrix."""
    return np.asarray(df.select(cols).to_numpy(), dtype=np.float32, order="F")

X_train_full_2 = to_f32_f_order(train_pl_2, numeric_cols)
X_test_2       = to_f32_f_order(test_pl_2,  numeric_cols)

# --- Labels as strings, encoded with one LabelEncoder fitted on train+test ---
y_train_2_raw = train_pl_2[y_column].to_numpy().astype(str)
y_test_2_raw  = test_pl_2[y_column].to_numpy().astype(str)

all_labels_2 = np.concatenate([y_train_2_raw, y_test_2_raw])
le2 = LabelEncoder().fit(all_labels_2)   # fit returns the encoder itself
y_train_full_2 = le2.transform(y_train_2_raw)
y_test_2       = le2.transform(y_test_2_raw)

# --- Scaling for LR (reuse your pre-fitted scaler params already loaded) ---
def apply_scaler_f32(X_f):
    """Return ``(X_f - mean_f32) / scale_safe`` as a new float32 Fortran-ordered array.

    The input is left untouched; ``mean_f32`` and ``scale_safe`` are the
    scaler parameters already present in the notebook namespace.
    """
    scaled = np.empty_like(X_f, dtype=np.float32, order='F')
    np.subtract(X_f, mean_f32, out=scaled)  # centre directly into the output buffer
    scaled /= scale_safe                    # then rescale in place
    return scaled

X_train_lr_full_2, X_test_lr_2 = (
    apply_scaler_f32(features) for features in (X_train_full_2, X_test_2)
)

# --- Train/val split (stratified) for early stopping boosters ---
# Raw features, scaled features and labels are split in a single call, so all
# three are partitioned by the same row indices.
(
    X_tr_num2, X_val_num2,
    X_tr_lr2,  X_val_lr2,
    y_tr2,     y_val2,
) = train_test_split(
    X_train_full_2, X_train_lr_full_2, y_train_full_2,
    test_size=0.15, random_state=42, stratify=y_train_full_2,
)

# --- Append header to the same report.txt ---
report_path = os.path.join(folder, "report.txt")
encoded_classes = ", ".join(str(cls) for cls in le2.classes_)
with open(report_path, "a") as fp:
    fp.writelines([
        "\n\n===== Grouped 2-Class Run =====\n",
        "Classes (encoded order): " + encoded_classes + "\n",
    ])

# --- Train / evaluate each model on 2 classes ---
# For every (name, model) pair in ML_models: fit on the training part of the
# split, pickle the fitted model, predict on the test split and append the
# macro-averaged metrics to report.txt.
for name, model in tqdm(ML_models, total=len(ML_models), desc="Models (2-class)"):
    print(f"\n▶ Training {name} (2-class)")
    t0 = time.perf_counter()

    if name == "LogisticRegression":
        # Logistic regression is the only model fed the scaled matrices.
        model.fit(X_tr_lr2, y_tr2)
        Xte = X_test_lr_2

    elif name == "XGBClassifier":
        # Only if XGB is in your ML_models
        Xte = X_test_2
        # Three attempts, from most to least specific: callback-based early
        # stopping, then the `early_stopping_rounds` keyword, then a plain fit.
        # A TypeError (unsupported keyword) triggers the next attempt.
        try:
            from xgboost import callback as xgb_callback
            es = [xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=False)]
            model.fit(
                X_tr_num2, y_tr2,
                eval_set=[(X_val_num2, y_val2)],
                callbacks=es,
            )
        except (TypeError, ImportError):
            try:
                model.fit(
                    X_tr_num2, y_tr2,
                    eval_set=[(X_val_num2, y_val2)],
                    early_stopping_rounds=50,
                    verbose=False,
                )
            except TypeError:
                model.fit(X_tr_num2, y_tr2)

    elif name == "LGBMClassifier":
        # Early stopping on the validation part, monitored with binary log-loss.
        import lightgbm as lgb
        model.fit(
            X_tr_num2, y_tr2,
            eval_set=[(X_val_num2, y_val2)],
            eval_metric="binary_logloss",
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )
        Xte = X_test_2

    else:
        # Every other estimator: plain fit on the unscaled features.
        model.fit(X_tr_num2, y_tr2)
        Xte = X_test_2

    secs = time.perf_counter() - t0
    print(f"⏱ {name} (2-class) trained in {secs:.1f}s")

    # Save model
    with open(os.path.join(folder, f"{name}_2classes_model.pkl"), "wb") as f:
        pickle.dump(model, f)

    # Evaluate
    print(f"▶ Testing {name} (2-class)")
    y_pred = model.predict(Xte)
    acc  = accuracy_score(y_test_2, y_pred)
    # Recall, precision and F1 are macro-averaged over the classes.
    rec  = recall_score(y_test_2, y_pred, average="macro")
    prec = precision_score(y_test_2, y_pred, average="macro")
    f1   = f1_score(y_test_2, y_pred, average="macro")

    # Append this model's section to the shared report file.
    with open(report_path, "a") as fp:
        fp.write(f"####### {name} (2 Classes) #######\n")
        fp.write(f"Accuracy : {acc:.4f}\n")
        fp.write(f"Recall   : {rec:.4f}\n")
        fp.write(f"Precision: {prec:.4f}\n")
        fp.write(f"F1 Score : {f1:.4f}\n\n")

print("\n✅ All 2-class models trained & logged to 'report.txt'.")
print("Encoded class order (2-class):", list(le2.classes_))


In [None]:
# Confusion Matrix (counts + row-normalized) for 2-class models
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os, pickle

# --- choose the model you want to visualize (must match what you saved with _2classes) ---
model_name = "LGBMClassifier"   # e.g. "LogisticRegression", "RandomForestClassifier", ...
suffix = "2classes"             # keeps filenames distinct

# Load the pickled estimator written by the 2-class training run.
# NOTE(review): pickle.load executes code from the file; only load trusted files.
model_path = os.path.join(folder, f"{model_name}_{suffix}_model.pkl")
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Logistic regression gets the scaled test matrix, every other model the raw one.
if model_name == "LogisticRegression":
    Xte = X_test_lr_2
else:
    Xte = X_test_2
y_true = y_test_2
y_pred = model.predict(Xte)

# Passing the full index range as labels fixes the matrix shape even when a
# class is absent; the tick names come from the label encoder.
idx_labels   = np.arange(len(le2.classes_))   # 0..1
display_lbls = le2.classes_

# Raw counts, and the same matrix with every row divided by its total
# (rows that sum to zero are divided by 1 instead).
cm = confusion_matrix(y_true, y_pred, labels=idx_labels)
with np.errstate(invalid="ignore", divide="ignore"):
    row_sums = cm.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1
    cm_norm = cm / row_sums

# (file stem, title wording, matrix, CSV number format, extra plot options)
cm_outputs = [
    ("cm_counts", "counts", cm, "%d", {}),
    ("cm_normalized", "row-normalized", cm_norm, "%.4f",
     {"values_format": ".2f", "cmap": plt.cm.Blues}),
]

for stem, title_kind, matrix, num_fmt, plot_kwargs in cm_outputs:
    out_base = os.path.join(folder, f"{stem}_{model_name}_{suffix}")
    np.savetxt(out_base + ".csv", matrix, fmt=num_fmt, delimiter=",")

    fig, ax = plt.subplots(figsize=(8,8), dpi=200)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=display_lbls).plot(
        ax=ax, xticks_rotation="vertical", **plot_kwargs
    )
    ax.set_title(f"Confusion Matrix ({title_kind}) — {model_name} ({suffix})")
    plt.savefig(out_base + ".png", bbox_inches="tight")
    plt.show(); plt.close(fig)

print("Saved:",
      *(f"{stem}_{model_name}_{suffix}.png / .csv" for stem, *_ in cm_outputs),
      sep="\n- ")


# Metrics for LR, RF, AdaBoost, Bagging, DT

In [None]:
import os, glob, pickle, gc
import numpy as np, polars as pl, pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.preprocessing import LabelEncoder

# ====== CONFIG ======
MODELS_DIR   = "."              # folder with your *.pkl models
TRAIN_PATH   = "train.parquet"  # training split, read with polars below
TEST_PATH    = "test.parquet"   # test split, read with polars below
PREPROC_PATH = "preproc.pkl"    # optional: contains {'scaler_mean','scaler_scale'}
LABEL_COL    = "Label"          # target column; excluded from the features
SAVE_CSV     = "overfitting_report.csv"  # set None to avoid saving
# ====================

# ---------- Load data ----------
train_df = pl.read_parquet(TRAIN_PATH)
test_df  = pl.read_parquet(TEST_PATH)

# numeric columns only
# The feature list is derived from the training schema alone: every integer
# or float column except the label.
num_dtypes = {pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64}
feat_cols_all = [c for c, dt in train_df.schema.items() if c != LABEL_COL and dt in num_dtypes]

# ---------- Clean once (±inf -> null -> median) and cast to float32 ----------
def clean_float32(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    """Impute non-finite values in ``cols`` with the column median, as float32.

    Every value that is ±inf, NaN or null is treated as missing. The median
    is taken over the remaining finite values only, so NaNs cannot leak into
    the fill value. A column with no finite value at all is filled with 0.0,
    because there is no median to fall back on.

    Args:
        df: Frame holding the columns to clean.
        cols: Names of numeric columns to clean; other columns are untouched.

    Returns:
        A new frame in which each column of ``cols`` is Float32 and contains
        no null, NaN or infinite values.
    """
    if not cols:
        return df

    # Cast to Float64 first so the finiteness test is defined for integer
    # columns as well as float ones.
    as_f64 = {c: pl.col(c).cast(pl.Float64) for c in cols}
    df2 = df.with_columns([
        pl.when(as_f64[c].is_finite()).then(as_f64[c]).otherwise(None).alias(c)
        for c in cols
    ])

    # One aggregated row: the median of each column (None if all values are missing).
    medians = df2.select([pl.col(c).median().alias(c) for c in cols]).row(0)

    df2 = df2.with_columns([
        pl.col(c).fill_null(0.0 if med is None else med).cast(pl.Float32).alias(c)
        for c, med in zip(cols, medians)
    ])
    return df2

# Each split is cleaned with its own medians.
train_df = clean_float32(train_df, feat_cols_all)
test_df  = clean_float32(test_df,  feat_cols_all)

# ---------- Convert once to NumPy float32; per-model column subsets are taken from these arrays by index ----------
# `astype(..., copy=False)` avoids a second copy when to_numpy() already returns float32.
X_train_all = train_df.select(feat_cols_all).to_numpy().astype(np.float32, copy=False)
X_test_all  = test_df.select(feat_cols_all).to_numpy().astype(np.float32, copy=False)
# Labels are kept in their original form here; they are encoded further below.
y_train_raw = train_df[LABEL_COL].to_numpy()
y_test_raw  = test_df[LABEL_COL].to_numpy()

# ---------- Optional scaler for linear models (precompute scaled buffers once) ----------
# Defaults for the case where no usable scaler parameters are found.
# `scaler` itself is not read again in the code visible in this cell.
scaler = None
X_train_scaled = None
X_test_scaled  = None
if os.path.exists(PREPROC_PATH):
    # NOTE(review): pickle.load executes code from the file; only use a trusted PREPROC_PATH.
    with open(PREPROC_PATH, "rb") as f:
        art = pickle.load(f)
    mean = art.get("scaler_mean")
    scale = art.get("scaler_scale")
    if mean is not None and scale is not None:
        mean  = np.asarray(mean,  dtype=np.float32)
        scale = np.asarray(scale, dtype=np.float32)
        # Zero scales are replaced by 1 so the division below never divides by zero.
        scale_safe = np.where(scale == 0.0, 1.0, scale)
        def apply_scaler_inplace(X):
            """Standardise ``X`` in place with the loaded ``mean`` and ``scale_safe``."""
            X -= mean
            X /= scale_safe
        # create scaled copies once (Fortran order helps some estimators)
        # The unscaled arrays are left untouched because the copies are scaled, not the originals.
        X_train_scaled = X_train_all.copy(order='F')
        X_test_scaled  = X_test_all.copy(order='F')
        apply_scaler_inplace(X_train_scaled)
        apply_scaler_inplace(X_test_scaled)

# ---------- Label encoder on union ----------
# Fit on the string form of train + test labels so both splits share one
# integer encoding, then encode each split.
le_union = LabelEncoder()
le_union.fit(np.concatenate([y_train_raw, y_test_raw]).astype(str))
y_train_num = le_union.transform(y_train_raw.astype(str))
y_test_num  = le_union.transform(y_test_raw.astype(str))

# ---------- Column name -> position in the feature matrices, used to align per-model column subsets ----------
col_index = {c: i for i, c in enumerate(feat_cols_all)}
def align_indices(model, cols_all, col_index_map):
    """Return the feature-matrix column positions that ``model`` expects.

    If the model exposes ``feature_names_in_``, the result lists the position
    of each of those names in model order, skipping names that are absent
    from ``col_index_map``. Otherwise every column of ``cols_all`` is
    selected in its stored order. The result is an int32 array.
    """
    model_cols = getattr(model, "feature_names_in_", None)
    if model_cols is not None:
        positions = []
        for name in model_cols:
            # Unknown names are skipped rather than raising (defensive).
            if name in col_index_map:
                positions.append(col_index_map[name])
        return np.asarray(positions, dtype=np.int32)
    return np.arange(len(cols_all), dtype=np.int32)

rows = []

# Every .pkl in MODELS_DIR, in sorted order, is a candidate model file.
all_pkls = sorted(glob.glob(os.path.join(MODELS_DIR, "*.pkl")))
preproc_name = Path(PREPROC_PATH).name.lower()

# Skip LGBM* models and the preprocessing artefact, matched on the
# lower-cased file name.
pkls = []
for candidate in all_pkls:
    fname = Path(candidate).name.lower()
    if fname.startswith("lgbmclassifier") or fname == preproc_name:
        continue
    pkls.append(candidate)

if not pkls:
    print("No valid .pkl model files found in", MODELS_DIR)

def _log_loss_aligned(model, y_true, proba):
    """Compute log-loss with the label order that matches ``proba``'s columns.

    ``predict_proba`` columns follow ``model.classes_``, so those classes are
    passed as ``labels`` when the model exposes them. This keeps the call
    valid when a split does not contain every class. Without ``classes_``
    the labels fall back to the unique values of ``y_true``.

    Raises:
        ValueError: if ``y_true`` holds labels the model does not know, since
            the resulting loss would be meaningless.
    """
    seen = np.unique(y_true)
    classes = getattr(model, "classes_", None)
    if classes is None:
        return log_loss(y_true, proba, labels=seen)
    labels = np.asarray(classes)
    if seen.dtype.kind in "US":
        # y_true was stringified by the caller; compare like with like.
        labels = labels.astype(str)
    unknown = np.setdiff1d(seen, labels)
    if unknown.size:
        raise ValueError(f"y_true has labels unknown to the model: {unknown[:5].tolist()}")
    return log_loss(y_true, proba, labels=labels)


# Score every pickled model on the full train and test splits. A failure for
# one model is recorded as an error row instead of aborting the whole run.
# NOTE(review): models trained on grouped labels (e.g. *_2classes_model.pkl)
# would be scored against the ungrouped labels here — confirm MODELS_DIR does
# not contain such files.
for pkl_path in pkls:
    try:
        # NOTE(review): pickle.load executes code from the file; only point
        # MODELS_DIR at trusted model files.
        with open(pkl_path, "rb") as f:
            model = pickle.load(f)
        model_name = Path(pkl_path).name

        # Column positions the model expects, in model order.
        idxs = align_indices(model, feat_cols_all, col_index)
        uses_all_cols = np.array_equal(idxs, np.arange(len(feat_cols_all)))

        # Linear models use the scaled buffers when a scaler was loaded.
        is_linear = any(k in type(model).__name__.lower() for k in ["logistic", "sgd", "linear"])
        use_scaled = is_linear and (X_train_scaled is not None)
        src_tr = X_train_scaled if use_scaled else X_train_all
        src_te = X_test_scaled if use_scaled else X_test_all
        # Integer-array indexing always copies, so it is skipped when the
        # model uses every column in the stored order.
        Xtr_full = src_tr if uses_all_cols else src_tr[:, idxs]
        Xte_full = src_te if uses_all_cols else src_te[:, idxs]

        # Predict to infer label type (full train & test; no sampling)
        y_pred_tr = model.predict(Xtr_full)
        y_pred_te = model.predict(Xte_full)

        # Match y_true type to y_pred type
        if np.issubdtype(np.asarray(y_pred_tr).dtype, np.number):
            y_tr = y_train_num
            y_te = y_test_num
        else:
            y_tr = y_train_raw.astype(str)
            y_te = y_test_raw.astype(str)
            y_pred_tr = y_pred_tr.astype(str)
            y_pred_te = y_pred_te.astype(str)

        # Metrics
        acc_tr = accuracy_score(y_tr, y_pred_tr)
        acc_te = accuracy_score(y_te, y_pred_te)
        f1_tr  = f1_score(y_tr, y_pred_tr, average="macro")
        f1_te  = f1_score(y_te, y_pred_te, average="macro")

        # Log-loss is best effort: it stays NaN on failure, and the reason is
        # kept so the NaN can be explained from the report.
        ll_tr = ll_te = np.nan
        ll_error = None
        if hasattr(model, "predict_proba"):
            try:
                proba_tr = model.predict_proba(Xtr_full)
                proba_te = model.predict_proba(Xte_full)
                ll_tr = _log_loss_aligned(model, y_tr, proba_tr)
                ll_te = _log_loss_aligned(model, y_te, proba_te)
                # free large arrays ASAP
                del proba_tr, proba_te
            except Exception as ll_exc:
                ll_error = repr(ll_exc)

        gap = acc_tr - acc_te
        row = {
            "model": model_name,
            "n_features_used": int(len(idxs)),
            "train_acc": acc_tr, "test_acc": acc_te, "acc_gap": gap,
            "train_f1_macro": f1_tr, "test_f1_macro": f1_te,
            "train_logloss": ll_tr, "test_logloss": ll_te
        }
        if ll_error is not None:
            row["logloss_error"] = ll_error
        rows.append(row)

        # Free per-model temps
        del Xtr_full, Xte_full, src_tr, src_te, y_pred_tr, y_pred_te
        gc.collect()

    except Exception as e:
        rows.append({
            "model": Path(pkl_path).name,
            "n_features_used": np.nan,
            "train_acc": np.nan, "test_acc": np.nan, "acc_gap": np.nan,
            "train_f1_macro": np.nan, "test_f1_macro": np.nan,
            "train_logloss": np.nan, "test_logloss": np.nan,
            "error": repr(e)
        })

# Build the report, sorted with the largest train/test accuracy gap first.
report = pd.DataFrame(rows)
if report.empty:
    # No model was evaluated: use an empty frame that still has the expected
    # columns, so the sort and filter below work instead of raising KeyError.
    report = pd.DataFrame(columns=[
        "model", "n_features_used",
        "train_acc", "test_acc", "acc_gap",
        "train_f1_macro", "test_f1_macro",
        "train_logloss", "test_logloss",
        "error",
    ])
report = report.sort_values(["acc_gap"], ascending=False).reset_index(drop=True)
pd.set_option("display.max_colwidth", 120)
display(report)

if SAVE_CSV:
    report.to_csv(SAVE_CSV, index=False)
    print(f"\nSaved report to {SAVE_CSV}")

# Quick textual flag
# Rows that failed have a NaN gap and are excluded from the flag list.
flagged = report[(report["acc_gap"].notna()) & (report["acc_gap"] > 0.05)]
print(f"\nModels with potential overfitting (acc_gap > 0.05): {len(flagged)}")
for m in flagged["model"].tolist():
    print("  •", m)


# Metrics for LGBM Classifier

In [None]:
#!/usr/bin/env python3
import os, numpy as np, polars as pl, matplotlib.pyplot as plt, lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# ==========================================================
#                 LABEL GROUPING DICTIONARIES
# ==========================================================
# Original labels listed under the family they are grouped into.
_label_families = {
    'DDoS': ('DDoS_ICMP', 'DDoS_UDP', 'DDoS_TCP', 'DDoS_SYN'),
    'DoS': ('DoS_UDP', 'DoS_TCP', 'DoS_SYN', 'DoS_ICMP'),
    'Benign': ('Benign',),
    'Spoofing': ('Spoofing_ARP',),
    'Recon': (
        'Recon_Ping_Sweep', 'Recon_OS_Scan',
        'Recon_Port_Scan', 'Recon_Vulnerability_Scan',
    ),
    'MQTT': ('MQTT_DDoS_Flooding', 'MQTT_DoS_Flooding', 'MQTT_Malformed_Data'),
}

# Original label -> family (6 classes).
dict_6classes = {
    label: family
    for family, labels in _label_families.items()
    for label in labels
}

# Original label -> 'Benign' / 'Attack' (2 classes): everything that is not
# benign traffic counts as an attack.
dict_2classes = {
    'Benign': 'Benign',
    **{label: 'Attack' for label in dict_6classes if label != 'Benign'},
}

# ==========================================================
#                   DATA LOADING & CLEANING
# ==========================================================
LABEL = "Label"  # target column; never treated as a feature
train = pl.read_parquet("train.parquet")
test  = pl.read_parquet("test.parquet")

# Feature columns: every numeric column of the training frame except the label.
# The dtype's own `is_numeric()` check is used instead of the top-level
# `pl.NUMERIC_DTYPES` constant, which newer Polars releases deprecate.
num_cols = [c for c, dt in train.schema.items() if c != LABEL and dt.is_numeric()]

def clean_numeric(df: pl.DataFrame) -> pl.DataFrame:
    """Impute non-finite values in the ``num_cols`` columns with the column median.

    Every value that is ±inf, NaN or null is treated as missing. The median
    is taken over the remaining finite values only, so NaNs cannot leak into
    the fill value. A column with no finite value at all is filled with 0.0,
    because there is no median to fall back on.

    Args:
        df: Frame containing all columns listed in the module-level ``num_cols``.

    Returns:
        A new frame in which each ``num_cols`` column is Float32 and contains
        no null, NaN or infinite values; other columns are untouched.
    """
    if not num_cols:
        return df

    # Cast to Float64 first so the finiteness test is defined for integer
    # columns as well as float ones.
    as_f64 = {c: pl.col(c).cast(pl.Float64) for c in num_cols}
    df2 = df.with_columns([
        pl.when(as_f64[c].is_finite()).then(as_f64[c]).otherwise(None).alias(c)
        for c in num_cols
    ])

    # One aggregated row: the median of each column (None if all values are missing).
    medians = df2.select([pl.col(c).median().alias(c) for c in num_cols]).row(0)

    df2 = df2.with_columns([
        pl.col(c).fill_null(0.0 if med is None else med).cast(pl.Float32).alias(c)
        for c, med in zip(num_cols, medians)
    ])
    return df2

# Clean each split once up front; each split is imputed with its own medians.
train_clean = clean_numeric(train)
test_clean  = clean_numeric(test)

# ==========================================================
#                      HELPER FUNCTIONS
# ==========================================================
def map_labels(df: pl.DataFrame, mapping: dict | None) -> pl.DataFrame:
    """Relabel the ``LABEL`` column through ``mapping`` and drop unmapped rows.

    With ``mapping=None`` the frame is returned unchanged. Otherwise labels
    are cast to strings and looked up in ``mapping``; labels without an entry
    become null and their rows are removed.
    """
    if mapping is not None:
        grouped = pl.col(LABEL).cast(pl.Utf8).replace(mapping, default=None)
        df = df.with_columns(grouped.alias(LABEL)).drop_nulls(subset=[LABEL])
    return df

def prepare_Xy(train_df: pl.DataFrame, test_df: pl.DataFrame, mapping: dict | None):
    """Turn the two frames into train/validation/test arrays for one labelling scheme.

    Both frames are relabelled with ``mapping`` and cleaned with
    ``clean_numeric``. Labels are encoded with a ``LabelEncoder`` fitted on
    the train and test labels together. The training data is then split
    85/15 (stratified, seed 42) into train and validation parts.

    Returns:
        ``(X_tr, X_val, y_tr, y_val, X_test, y_test_enc, le)`` where the X
        arrays are float32 and ``le`` is the fitted label encoder.
    """
    def _features_and_labels(frame):
        # Relabel, clean, then pull out the float32 features and string labels.
        cleaned = clean_numeric(map_labels(frame, mapping))
        feats = cleaned.select(num_cols).to_numpy().astype(np.float32)
        labels = cleaned.select(LABEL).to_series().to_numpy().astype(str)
        return feats, labels

    X_full, y_full = _features_and_labels(train_df)
    X_test, y_test = _features_and_labels(test_df)

    le = LabelEncoder()
    le.fit(np.concatenate([y_full, y_test]))
    y_full_enc = le.transform(y_full)
    y_test_enc = le.transform(y_test)

    X_tr, X_val, y_tr, y_val = train_test_split(
        X_full,
        y_full_enc,
        test_size=0.15,
        random_state=42,
        stratify=y_full_enc,
    )
    return X_tr, X_val, y_tr, y_val, X_test, y_test_enc, le

def lightgbm_fit_and_curves(X_tr, y_tr, X_val, y_val, X_te, y_te, tag: str, base_params: dict):
    """Fit an early-stopped LGBMClassifier, print its metrics and save its learning curves.

    The objective is binary when ``y_tr`` holds two distinct labels and
    multiclass otherwise. Training is evaluated on both the training and
    validation sets, with early stopping after 50 rounds. Accuracy curves are
    derived as ``1 - error`` from the recorded evaluation results.

    Args:
        X_tr, y_tr: Training features and encoded labels.
        X_val, y_val: Validation features and labels, used as the second eval set.
        X_te, y_te: Test features and labels, used only for the final metrics.
        tag: Short scheme name used in the printout, plot titles and file name.
        base_params: LGBMClassifier keyword arguments; not modified.

    Returns:
        A dict with the tag, best iteration, per-iteration train/validation
        accuracy and loss arrays, the saved figure path, the class count and
        the validation/test metric dicts.
    """
    n_classes = int(len(np.unique(y_tr)))
    if n_classes == 2:
        objective_params = dict(objective="binary")
        eval_metric = ["binary_logloss", "binary_error"]
    else:
        objective_params = dict(objective="multiclass", num_class=n_classes)
        eval_metric = ["multi_logloss", "multi_error"]
    params = {**base_params, **objective_params}

    model = LGBMClassifier(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        eval_metric=eval_metric,
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )

    # The first two entries of the results follow the order of eval_set above.
    res = model.evals_result_
    k0, k1 = list(res.keys())[:2]

    def pick(d, opts):
        # First candidate metric name that is actually present in d.
        found = next((o for o in opts if o in d), None)
        if found is None:
            raise KeyError(f"Metric not found. Have: {list(d.keys())}")
        return found

    loss_name = pick(res[k0], ["multi_logloss","binary_logloss","logloss"])
    err_name  = pick(res[k0], ["multi_error","binary_error","error"])

    def _curve(set_key, metric_name):
        return np.array(res[set_key][metric_name], dtype=float)

    train_loss = _curve(k0, loss_name)
    val_loss   = _curve(k1, loss_name)
    train_acc  = 1.0 - _curve(k0, err_name)
    val_acc    = 1.0 - _curve(k1, err_name)

    epochs  = np.arange(1, len(train_loss)+1)
    best_it = getattr(model, "best_iteration_", len(epochs))

    # ---- Metrics on validation & test ----
    def _macro_scores(y_true, y_hat):
        return dict(
            acc=accuracy_score(y_true, y_hat),
            rec=recall_score(y_true, y_hat, average="macro"),
            prec=precision_score(y_true, y_hat, average="macro"),
            f1=f1_score(y_true, y_hat, average="macro")
        )

    y_val_pred = model.predict(X_val)
    y_te_pred  = model.predict(X_te)
    metrics_val  = _macro_scores(y_val, y_val_pred)
    metrics_test = _macro_scores(y_te, y_te_pred)

    print(f"\n===== {tag.upper()} RESULTS =====")
    print(f"Classes: {n_classes} | Best iteration: {best_it}")
    for heading, scores in (("Validation  - ", metrics_val), ("Test        - ", metrics_test)):
        print(f"{heading}Acc: {scores['acc']:.4f} | Rec: {scores['rec']:.4f} | "
              f"Prec: {scores['prec']:.4f} | F1: {scores['f1']:.4f}")

    # ---- Plot training curves ----
    fig, axes = plt.subplots(1, 2, figsize=(12,5))
    panels = (
        # (full name, legend suffix, train curve, validation curve, legend location)
        ("Accuracy", "Acc", train_acc, val_acc, "lower right"),
        ("Loss", "Loss", train_loss, val_loss, "upper right"),
    )
    for ax, (full_name, short_name, train_curve, val_curve, legend_loc) in zip(axes, panels):
        ax.plot(epochs, train_curve, label=f"Train {short_name}")
        ax.plot(epochs, val_curve,   label=f"Val {short_name}")
        ax.axvline(best_it, linestyle="--", alpha=0.7, label=f"best_iter={best_it}")
        ax.set_title(f"[{tag}] {full_name}")
        ax.set_xlabel("Epoch")
        ax.set_ylabel(full_name)
        ax.grid(True, alpha=0.3)
        ax.legend(loc=legend_loc)

    plt.tight_layout()
    out_path = f"lgbm_training_validation_curves_{tag}.png"
    plt.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.close(fig)

    return {
        "tag": tag,
        "best_iter": int(best_it),
        "epochs": epochs,
        "train_acc": train_acc,
        "val_acc": val_acc,
        "train_loss": train_loss,
        "val_loss": val_loss,
        "figure_path": out_path,
        "n_classes": n_classes,
        "val_metrics": metrics_val,
        "test_metrics": metrics_test,
    }

# ==========================================================
#                 TRAINING CONFIG & EXECUTION
# ==========================================================
# LGBMClassifier settings shared by every labelling scheme; the objective is
# added per run inside lightgbm_fit_and_curves.
BASE_PARAMS = dict(
    n_estimators=4000,
    num_leaves=48,
    learning_rate=0.06,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=os.cpu_count() or 4,  # fall back to 4 when the CPU count is unknown
    random_state=42,
)

# (tag, label mapping) pairs; a mapping of None keeps the original labels.
experiments = [
    ("orig", None),
    ("6c",   dict_6classes),
    ("2c",   dict_2classes),
]

# Run every scheme and keep its result dict for the comparison plots below.
all_runs = []
for tag, mapping in experiments:
    X_tr, X_val, y_tr, y_val, X_te, y_te, le = prepare_Xy(train_clean, test_clean, mapping)
    run = lightgbm_fit_and_curves(X_tr, y_tr, X_val, y_val, X_te, y_te, tag, BASE_PARAMS)
    all_runs.append(run)
    print(f"Saved: {run['figure_path']} | tag={tag} | best_iter={run['best_iter']}")

# ==========================================================
#                 COMPARISON FIGURE (VAL CURVES)
# ==========================================================
# One figure, two panels: validation accuracy (left) and validation loss
# (right), with one curve per labelling scheme.
fig, axes = plt.subplots(1, 2, figsize=(12,5))

comparison_panels = (
    # (axis, result key, quantity name, legend location, show best iteration in legend)
    (axes[0], "val_acc", "Accuracy", "lower right", True),
    (axes[1], "val_loss", "Loss", "upper right", False),
)
for panel_ax, curve_key, quantity, legend_loc, show_best in comparison_panels:
    for run in all_runs:
        if show_best:
            legend_text = f"{run['tag']} (best={run['best_iter']})"
        else:
            legend_text = f"{run['tag']}"
        panel_ax.plot(run["epochs"], run[curve_key], label=legend_text)
    panel_ax.set_title(f"Validation {quantity} (orig vs 6c vs 2c)")
    panel_ax.set_xlabel("Epoch")
    panel_ax.set_ylabel(quantity)
    panel_ax.grid(True, alpha=0.3)
    panel_ax.legend(loc=legend_loc)

plt.tight_layout()
cmp_path = "lgbm_validation_curves_comparison.png"
plt.savefig(cmp_path, dpi=160, bbox_inches="tight")
plt.show()

print("Comparison figure saved:", cmp_path)


In [None]:
# === Train–Val Difference Plots for each scheme + comparison ===
import numpy as np
import matplotlib.pyplot as plt

# ---- Per-scheme gap plots ----
def _padded_ylim(values, pad=0.1):
    """Return ``(low, high)`` y-limits that contain ``values`` and the zero line.

    The limits extend ``pad`` times the covered range beyond both ends.
    Scaling the minimum and maximum by a factor instead would cut off the
    curve whenever all values share a sign. A flat curve at zero gets a
    ``±pad`` window. Returns ``None`` when there is no finite value.
    """
    finite = np.asarray(values, dtype=float)
    finite = finite[np.isfinite(finite)]
    if finite.size == 0:
        return None
    low = min(float(finite.min()), 0.0)
    high = max(float(finite.max()), 0.0)
    span = high - low
    margin = pad * span if span > 0 else pad
    return low - margin, high + margin


# One figure per labelling scheme: train-minus-validation accuracy (left) and
# validation-minus-train loss (right) for every boosting iteration.
for run in all_runs:
    tag       = run["tag"]
    epochs    = run["epochs"]
    best_it   = run["best_iter"]
    train_acc = run["train_acc"]
    val_acc   = run["val_acc"]
    train_loss= run["train_loss"]
    val_loss  = run["val_loss"]

    acc_diff  = train_acc - val_acc          # positive => train > val (possible overfit)
    loss_diff = val_loss - train_loss        # positive => val loss > train loss (possible overfit)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Accuracy gap
    ax = axes[0]
    ax.plot(epochs, acc_diff, label="Train - Val Accuracy Gap")
    ax.axhline(0, color="gray", linestyle="--", linewidth=1)
    ax.axvline(best_it, color="red", linestyle="--", alpha=0.7, label=f"best_iter={best_it}")
    ax.set_title(f"[{tag}] Accuracy Difference (Train - Validation)")
    ax.set_xlabel("Epoch"); ax.set_ylabel("Accuracy Gap")
    ax.grid(True, alpha=0.3); ax.legend(loc="upper right")
    acc_limits = _padded_ylim(acc_diff)
    if acc_limits is not None:
        ax.set_ylim(*acc_limits)

    # Loss gap
    ax = axes[1]
    ax.plot(epochs, loss_diff, label="Validation - Train Loss Gap")
    ax.axhline(0, color="gray", linestyle="--", linewidth=1)
    ax.axvline(best_it, color="red", linestyle="--", alpha=0.7, label=f"best_iter={best_it}")
    ax.set_title(f"[{tag}] Loss Difference (Validation - Train)")
    ax.set_xlabel("Epoch"); ax.set_ylabel("Loss Gap")
    ax.grid(True, alpha=0.3); ax.legend(loc="upper right")
    loss_limits = _padded_ylim(loss_diff)
    if loss_limits is not None:
        ax.set_ylim(*loss_limits)

    plt.tight_layout()
    out_path = f"lgbm_train_val_diff_curves_{tag}.png"
    plt.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.show()
    plt.close(fig)  # release the figure before the next scheme is drawn
    print("Saved:", out_path)

# ---- Combined comparison overlay (train/validation gaps of all schemes on shared axes) ----
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for run in all_runs:
    tag = run["tag"]
    # Same sign convention as the per-scheme plots: positive values mean the
    # model does better on the training data than on the validation data.
    acc_diff  = run["train_acc"] - run["val_acc"]
    loss_diff = run["val_loss"]  - run["train_loss"]

    axes[0].plot(run["epochs"], acc_diff, label=f"{tag} (best={run['best_iter']})")
    axes[1].plot(run["epochs"], loss_diff, label=f"{tag}")

# Zero reference line: no gap between training and validation.
axes[0].axhline(0, color="gray", linestyle="--", linewidth=1)
axes[0].set_title("Accuracy Gap: Train - Val (orig vs 6c vs 2c)")
axes[0].set_xlabel("Epoch"); axes[0].set_ylabel("Accuracy Gap")
axes[0].grid(True, alpha=0.3); axes[0].legend(loc="upper right")

axes[1].axhline(0, color="gray", linestyle="--", linewidth=1)
axes[1].set_title("Loss Gap: Val - Train (orig vs 6c vs 2c)")
axes[1].set_xlabel("Epoch"); axes[1].set_ylabel("Loss Gap")
axes[1].grid(True, alpha=0.3); axes[1].legend(loc="upper right")

plt.tight_layout()
cmp_gap_path = "lgbm_train_val_diff_curves_comparison.png"
plt.savefig(cmp_gap_path, dpi=160, bbox_inches="tight")
plt.show()
print("Saved:", cmp_gap_path)
