## Installing dependencies and setting up

In [None]:
# + pyarrow (fast CSV/Parquet IO) + polars (fast ETL) + fastparquet (optional)
!pip install pandas numpy tqdm scikit-learn xgboost lightgbm matplotlib pyarrow polars fastparquet


In [None]:
#!/usr/bin/env python3
# Prep: make your combined file compatible with the existing splitter.
import os, shutil, uuid
import polars as pl

# Prepare the single merged Parquet file so the glob-based splitter can consume it:
#   1) remove a stale 'rid' column, 2) expose the file under a "*_clean.parquet" name.
folder = os.getcwd()
combined = os.path.join(folder, "combined_2.parquet")   # your single merged file
alias    = os.path.join(folder, "single_clean.parquet")       # matches "*_clean.parquet" glob

if not os.path.exists(combined):
    raise FileNotFoundError(f"Missing: {combined}")

# 1) Strip pre-existing 'rid' so the splitter can add its own.
#    collect_schema() resolves the schema without loading the data
#    (reading LazyFrame.schema directly is deprecated in current Polars).
schema = pl.scan_parquet(combined).collect_schema()
if "rid" in schema:
    tmp_path = combined + f".tmp.{uuid.uuid4().hex}.parquet"
    print(f"→ 'rid' found in {combined}. Rewriting without it → {tmp_path}")
    try:
        (
            pl.scan_parquet(combined)
              .select(pl.all().exclude("rid"))
              .sink_parquet(
                  tmp_path,
                  compression="zstd",
                  compression_level=1,
                  statistics=False,
                  maintain_order=False,
              )
        )
        # The temp file sits next to the target, so os.replace swaps it in
        # with a single overwrite on every platform.
        os.replace(tmp_path, combined)
    finally:
        # If the sink or the swap failed, do not leave a partial temp file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("✓ 'rid' removed.")
else:
    print("✓ No 'rid' in combined file (nothing to remove).")

# 2) Ensure a *_clean.parquet exists (so your splitter's glob works unchanged)
# A dangling symlink makes os.path.exists() return False while os.symlink()
# would still fail on it, so remove it first.
if os.path.islink(alias) and not os.path.exists(alias):
    os.remove(alias)

if os.path.exists(alias):
    print(f"✓ Alias already exists: {alias}")
else:
    try:
        # Relative link target: the alias lives in the same folder as the file.
        os.symlink(os.path.basename(combined), alias)
        print(f"→ Created symlink: {alias} → {combined}")
    except (OSError, NotImplementedError):
        # Fallback if symlink not allowed (e.g., Windows without perms)
        shutil.copyfile(combined, alias)
        print(f"→ Symlink not allowed; copied file: {alias}")

print("\nAll set. Run your original split script as-is.")


In [None]:
#!/usr/bin/env python3
import os
import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of all "*_clean.parquet" parts, streamed to train/test Parquet.
folder = os.getcwd()
pattern = os.path.join(folder, "*_clean.parquet")

# 1) Lazily scan all parts (single call; supports glob)
lazy_all = pl.scan_parquet(pattern)

# Validate the schema up front: every later step references "Label", so a check
# placed after the first collect could never fire.
if "Label" not in lazy_all.collect_schema().names():
    raise RuntimeError("Expected a 'Label' column in the dataset.")

# 2) Add a stable row id (after concat) and ensure Label dtype
#    We will ONLY collect (rid, Label), not the whole table.
lazy_all = lazy_all.with_row_index("rid").with_columns(pl.col("Label").cast(pl.Utf8))

# 3) Collect tiny metadata needed for splitting
meta = lazy_all.select("rid", "Label").collect(engine="streaming")

# 4) Decide if we can stratify (≥2 classes and each has ≥2 rows)
use_stratify = False
counts = meta.group_by("Label").len().select("len").to_series().to_numpy()
if meta["Label"].n_unique() >= 2 and (counts >= 2).all():
    use_stratify = True

# 5) Split on *indices* (NumPy) — tiny and fast
rid = meta["rid"].to_numpy()
labels = meta["Label"].to_numpy()

train_idx, test_idx = train_test_split(
    rid,
    test_size=0.20,
    shuffle=True,
    random_state=42,
    stratify=labels if use_stratify else None,
)

# 6) Build small row-id tables and join (semi) to stream rows to Parquet
train_ids = pl.DataFrame({"rid": train_idx}).lazy()
test_ids  = pl.DataFrame({"rid": test_idx}).lazy()

lazy_with_id = lazy_all  # already has "rid" column

# "rid" is only a join key. It is dropped from the outputs because the later cells
# treat every numeric column as a model feature, and a row index can leak the label
# whenever the source rows are grouped by class.
lazy_train = lazy_with_id.join(train_ids, on="rid", how="semi").drop("rid")
lazy_test  = lazy_with_id.join(test_ids,  on="rid", how="semi").drop("rid")

train_parquet = os.path.join(folder, "train.parquet")
test_parquet  = os.path.join(folder, "test.parquet")

# 7) Stream directly to Parquet (fast path; no full materialization)
#    Use a fast codec/level for speed; tweak as you prefer.
lazy_train.sink_parquet(train_parquet, compression="zstd", compression_level=1, statistics=False)
lazy_test.sink_parquet(test_parquet,   compression="zstd", compression_level=1, statistics=False)

print(
    "Data split complete:\n"
    f" • Pattern: {pattern}\n"
    f" • Training rows: {len(train_idx)}\n"
    f" • Test rows: {len(test_idx)}\n"
    f" • Stratified: {use_stratify}\n"
    f" • Saved: {train_parquet}, {test_parquet}"
)


In [None]:
#!/usr/bin/env python3
import os
import polars as pl

# Report the label distribution of the original "*_clean.parquet" parts.
folder = os.getcwd()
pattern = os.path.join(folder, "*_clean.parquet")

# Scan, then select only the Label column (projection pushdown will keep it fast).
# Rows with a null Label are excluded from everything reported below.
q = (
    pl.scan_parquet(pattern)
      .select(pl.col("Label").cast(pl.Categorical))
      .filter(pl.col("Label").is_not_null())
)

# Label counts (materialize only the small result)
label_counts = (
    q.group_by("Label")
     .len()
     .sort("len", descending=True)
     .collect(engine="streaming")
     .rename({"len": "count"})
)

# Total rows = sum of the per-label counts. Deriving it from the result above
# avoids scanning the dataset a second time. The Int64 cast keeps the sum from
# wrapping in the narrower count dtype.
n_rows = int(label_counts["count"].cast(pl.Int64).sum())

print("Analyzing original Parquet files (glob): *_clean.parquet")
print(f"Total rows: {n_rows}\n")
print("Unique labels and their counts:")
for label, count in zip(label_counts["Label"].to_list(),
                        label_counts["count"].to_list()):
    print(f"  {label}: {count}")
print(f"\nTotal unique labels: {label_counts.shape[0]}")


From the counts above we deduce that there are 20 classes (19 attack labels + 1 benign).

In [None]:
#!/usr/bin/env python3
import os
import pickle
import numpy as np
import polars as pl
import polars.selectors as cs
from sklearn.preprocessing import StandardScaler

# Load the training split and learn per-column imputation medians from it.
folder = os.getcwd()
train_parquet = os.path.join(folder, "train.parquet")

# 1) Load train
train_pl = pl.read_parquet(train_parquet)
y_column = "Label"
if y_column not in train_pl.columns:
    raise RuntimeError("Expected 'Label' in train.parquet")

# 2) Numeric cols (exclude label)
numeric_cols = [c for c in train_pl.select(cs.numeric()).columns if c != y_column]
if not numeric_cols:
    raise RuntimeError("No numeric feature columns found.")

# 3) Medians on TRAIN ONLY, computed over finite values.
#    ±inf and NaN are masked to null first: they are exactly the values the median
#    will later replace, so they must not take part in computing it.
meds_s = train_pl.select([
    pl.when(pl.col(c).is_finite()).then(pl.col(c)).otherwise(None).median().alias(c)
    for c in numeric_cols
])
# A column with no finite value has no median (None); fall back to 0.0 instead of
# crashing in float(None).
medians = {
    c: (0.0 if meds_s[c][0] is None else float(meds_s[c][0]))
    for c in numeric_cols
}

# 4) Impute to float32 → view as float64 for stats
def to_imputed_f32(df: pl.DataFrame, cols: list[str], med: dict[str, float]) -> np.ndarray:
    """Return ``df[cols]`` as a float32 NumPy matrix with missing values imputed.

    Infinite entries are first turned into nulls; nulls and NaNs are then replaced
    by the per-column value from ``med`` and every column is cast to Float32.
    """
    inf_to_null = [
        pl.when(pl.col(name).is_infinite()).then(None).otherwise(pl.col(name)).alias(name)
        for name in cols
    ]
    fill_and_cast = [
        pl.col(name).fill_null(med[name]).fill_nan(med[name]).cast(pl.Float32).alias(name)
        for name in cols
    ]
    imputed = df.with_columns(inf_to_null).with_columns(fill_and_cast).select(cols)
    return imputed.to_numpy().astype(np.float32, copy=False)

# Imputed train matrix, widened to float64 so the statistics below are accurate.
X_train_for_scaler = to_imputed_f32(train_pl, numeric_cols, medians).astype(np.float64, copy=False)

# 4.1) Remove columns whose population std (ddof=0, as StandardScaler uses) is ~0.
EPS_STD = 1e-12
stds = X_train_for_scaler.std(axis=0, dtype=np.float64)
keep_mask = stds > EPS_STD
if (~keep_mask).any():
    dropped = [col for col, keep in zip(numeric_cols, keep_mask) if not keep]
    print(f"Dropping {len(dropped)} ~zero-variance cols:", dropped)
    numeric_cols = [col for col, keep in zip(numeric_cols, keep_mask) if keep]
    X_train_for_scaler = X_train_for_scaler[:, keep_mask]
    medians = {col: medians[col] for col in numeric_cols}

# 5) Fit the scaler on the surviving columns.
scaler = StandardScaler().fit(X_train_for_scaler)

# 5.1) Indices of columns whose fitted scale is large enough for the LR path.
EPS_SCALE = 1e-6
lr_keep_mask = scaler.scale_ >= EPS_SCALE
lr_keep_idx = np.flatnonzero(lr_keep_mask).astype(np.int32)
if (~lr_keep_mask).any():
    dropped_lr = [col for col, keep in zip(numeric_cols, lr_keep_mask) if not keep]
    print(f"[LR path] Dropping {len(dropped_lr)} near-constant cols by scale:", dropped_lr)

# 6) Persist everything later cells need to reproduce this preprocessing.
artifacts = dict(
    numeric_cols=numeric_cols,
    medians=medians,
    scaler_mean=scaler.mean_.astype(np.float64),
    scaler_scale=scaler.scale_.astype(np.float64),  # raw scale, not clipped
    lr_keep_idx=lr_keep_idx,
)
with open(os.path.join(folder, "preproc.pkl"), "wb") as f:
    pickle.dump(artifacts, f)

print(f"Saved preproc.pkl with {len(numeric_cols)} cols (LR cols kept: {lr_keep_idx.size}).")


## 20(19+1) Classes (Attack Labels)

In [None]:
#!/usr/bin/env python3
# --- Threading: let libs use all cores (don't cap OMP to 1)
import os, time, warnings
# Number of worker threads to request; falls back to 4 when the count is unknown.
CORES = os.cpu_count() or 4
# NOTE(review): these variables are presumably only read when the native libraries
# initialise — confirm they still take effect if numpy was imported by an earlier cell.
os.environ.update({
    "OMP_NUM_THREADS": str(CORES),               # XGBoost / OpenMP users
    "MKL_NUM_THREADS": str(CORES),               # BLAS for LR/NumPy
    "OPENBLAS_NUM_THREADS": str(CORES),
    "NUMEXPR_NUM_THREADS": str(min(CORES, 8)),   # capped at 8
})
warnings.filterwarnings("ignore", category=UserWarning)

import polars as pl
import numpy as np
import pickle
from tqdm import tqdm

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    BaggingClassifier,
)
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

# --------------------------
# 1) Load train/test Parquet (Polars)
# --------------------------
# Load both splits produced by the splitter cell and pick the feature columns.
folder = os.getcwd()
train_parquet = os.path.join(folder, "train.parquet")
test_parquet  = os.path.join(folder, "test.parquet")
if not (os.path.exists(train_parquet) and os.path.exists(test_parquet)):
    raise RuntimeError("Missing train.parquet/test.parquet. Run the splitter step first.")

train_pl = pl.read_parquet(train_parquet)
test_pl  = pl.read_parquet(test_parquet)

# --------------------------
# 2) Columns: numeric features + label
# --------------------------
y_column = "Label"
if y_column not in train_pl.columns or y_column not in test_pl.columns:
    raise RuntimeError("Expected a 'Label' column in both train and test dataframes.")

# Every numeric column of the TRAIN schema except the label is a feature.
# dtype.is_numeric() replaces the deprecated pl.NUMERIC_DTYPES constant and also
# handles parametrised dtypes.
schema = train_pl.schema
numeric_cols = [c for c, dt in schema.items() if c != y_column and dt.is_numeric()]
if not numeric_cols:
    raise RuntimeError("No numeric feature columns found.")

# --------------------------
# 3) Feature hygiene in Polars (inf -> null; fill null/nan with median; cast to f32)
# --------------------------
def _finite_medians(df: pl.DataFrame, cols: list[str]) -> dict:
    """Per-column median over finite values only (±inf, NaN and null are ignored)."""
    meds = df.select([
        pl.when(pl.col(c).is_finite()).then(pl.col(c)).otherwise(None).median().alias(c)
        for c in cols
    ])
    return {c: meds[c][0] for c in cols}


def prep_numeric(df: pl.DataFrame, cols: list[str],
                 medians: "dict[str, float] | None" = None) -> pl.DataFrame:
    """Clean the numeric feature columns of ``df`` and cast them to Float32.

    Non-finite entries (±inf, NaN) become null, then every null is replaced by the
    column's median.

    Parameters
    ----------
    df : frame holding at least the columns in ``cols``.
    cols : names of the numeric columns to clean.
    medians : optional ``{column: fill value}``. Pass the medians learned on the
        training split so other splits are imputed with the same values. When
        omitted, medians are computed from ``df`` itself (the previous behaviour).

    Returns
    -------
    A new frame; columns not listed in ``cols`` are left untouched.
    """
    if medians is None:
        medians = _finite_medians(df, cols)
    # A column without any finite value has no median; use 0.0 so fill_null()
    # always receives a concrete value.
    fill = {c: (0.0 if medians.get(c) is None else medians[c]) for c in cols}

    # Mask ±inf and NaN in one step (null stays null), then impute and cast.
    df = df.with_columns([
        pl.when(pl.col(c).is_finite()).then(pl.col(c)).otherwise(None).alias(c)
        for c in cols
    ])
    return df.with_columns([
        pl.col(c).fill_null(fill[c]).cast(pl.Float32).alias(c)
        for c in cols
    ])

# Medians are learned on TRAIN only and reused for TEST, so no test-set statistic
# influences the features the models are evaluated on.
train_medians = _finite_medians(train_pl, numeric_cols)
train_pl = prep_numeric(train_pl, numeric_cols, train_medians)
test_pl  = prep_numeric(test_pl,  numeric_cols, train_medians)

# --------------------------
# 4) To NumPy (float32, optionally Fortran order)
# --------------------------
def to_f32_f_order(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Return ``df[cols]`` as a float32 NumPy array in Fortran (column-major) order."""
    return np.asfortranarray(df.select(cols).to_numpy(), dtype=np.float32)

# Feature matrices (unscaled numeric features).
X_train_full = to_f32_f_order(train_pl, numeric_cols)
X_test       = to_f32_f_order(test_pl,  numeric_cols)

# Raw label columns as NumPy arrays.
y_train_raw = train_pl[y_column].to_numpy()
y_test_raw  = test_pl[y_column].to_numpy()

# --------------------------
# 5) Label encoding on union(train,test) to avoid unseen-class errors
# --------------------------
# Labels are stringified before fitting and before transforming.
le = LabelEncoder().fit(np.concatenate([y_train_raw, y_test_raw]).astype(str))
y_train_full, y_test = (
    le.transform(raw.astype(str)) for raw in (y_train_raw, y_test_raw)
)

# --------------------------
# 6) Scaling for LR in float32 (apply StandardScaler params in f32)
# --------------------------
# Load the StandardScaler parameters saved by the preprocessing cell and align them
# with `numeric_cols`, so `mean_f32` / `scale_safe` always have one entry per feature.
scaler_path = os.path.join(folder, "preproc.pkl")
if not os.path.exists(scaler_path):
    raise RuntimeError("preproc.pkl not found. Run the preprocessing step first.")
# NOTE: pickle.load can execute arbitrary code; only load a preproc.pkl that this
# notebook produced.
with open(scaler_path, "rb") as f:
    artifacts = pickle.load(f)

missing_keys = [k for k in ("numeric_cols", "scaler_mean", "scaler_scale") if k not in artifacts]
if missing_keys:
    raise RuntimeError(
        f"preproc.pkl lacks {missing_keys}. Re-run the preprocessing step."
    )

scaler_cols  = list(artifacts["numeric_cols"])
fitted_mean  = np.asarray(artifacts["scaler_mean"],  dtype=np.float32)
fitted_scale = np.asarray(artifacts["scaler_scale"], dtype=np.float32)
if not (len(scaler_cols) == fitted_mean.size == fitted_scale.size):
    raise RuntimeError("preproc.pkl is inconsistent: column list and scaler vectors differ in length.")

col_pos = {c: i for i, c in enumerate(numeric_cols)}
unknown_cols = [c for c in scaler_cols if c not in col_pos]
if unknown_cols:
    raise RuntimeError(f"preproc.pkl refers to columns missing from train.parquet: {unknown_cols}")

# The preprocessing cell may have dropped ~zero-variance columns, so the saved
# vectors can be shorter than `numeric_cols`. Place the fitted values by column
# name. Columns without fitted values are centred on their train mean and left
# unscaled (scale 1), which is what StandardScaler itself does for a constant feature.
mean_f32  = np.zeros(len(numeric_cols), dtype=np.float32)
scale_f32 = np.ones(len(numeric_cols),  dtype=np.float32)
fitted_pos = np.array([col_pos[c] for c in scaler_cols], dtype=np.intp)
unfitted = np.ones(len(numeric_cols), dtype=bool)
unfitted[fitted_pos] = False
if unfitted.any():
    mean_f32[unfitted] = X_train_full[:, unfitted].mean(axis=0, dtype=np.float64)
mean_f32[fitted_pos]  = fitted_mean
scale_f32[fitted_pos] = fitted_scale

# Guard the division against an exactly-zero scale.
scale_safe = np.where(scale_f32 == 0.0, 1.0, scale_f32)

def apply_scaler_f32(X_f):
    """Standardize ``X_f`` as ``(X_f - mean_f32) / scale_safe`` into a new float32 array.

    Reads the module-level ``mean_f32`` and ``scale_safe`` vectors; the input is not
    modified. The result is allocated in Fortran order.
    """
    scaled = np.empty_like(X_f, dtype=np.float32, order='F')
    np.subtract(X_f, mean_f32, out=scaled)
    scaled /= scale_safe
    return scaled

# Standardized copies of the feature matrices (used by logistic regression only).
X_train_lr_full = apply_scaler_f32(X_train_full)
X_test_lr       = apply_scaler_f32(X_test)

# --------------------------
# 7) Validation split for early stopping
# --------------------------
# One stratified 85/15 split applied to the raw matrix, the scaled matrix and the
# labels together, so all three share exactly the same row partition.
(
    X_tr_num, X_val_num,
    X_tr_lr,  X_val_lr,
    y_tr,     y_val,
) = train_test_split(
    X_train_full, X_train_lr_full, y_train_full,
    test_size=0.15, random_state=42, stratify=y_train_full,
)

# --------------------------
# 8) Models (light defaults) + boosters with early stopping
# --------------------------
# Seed shared by every estimator below.
RANDOM_STATE = 42

# (name, estimator) pairs; the name selects the training path in the loop below
# and becomes part of the saved model's filename.
ML_models = [
    (
        "LogisticRegression",
        LogisticRegression(
            penalty="l2",
            solver="lbfgs",
            tol=2e-3,
            max_iter=1000,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "DecisionTreeClassifier",
        DecisionTreeClassifier(
            max_depth=5,
            criterion="entropy",
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "RandomForestClassifier",
        RandomForestClassifier(
            n_estimators=100,
            max_features="sqrt",
            random_state=RANDOM_STATE,
            n_jobs=-1,
        ),
    ),
    (
        "AdaBoostClassifier",
        AdaBoostClassifier(
            n_estimators=50,
            learning_rate=0.5,
            random_state=RANDOM_STATE,
        ),
    ),
    (
        "BaggingClassifier",
        BaggingClassifier(
            n_estimators=10,
            random_state=RANDOM_STATE,
            n_jobs=-1,
        ),
    ),
    (
        "LGBMClassifier",
        LGBMClassifier(
            n_estimators=4000,      # upper bound; the training loop uses early stopping
            learning_rate=0.06,
            num_leaves=48,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=RANDOM_STATE,
            n_jobs=CORES,
        ),
    ),
]

# --------------------------
# 9) Report
# --------------------------
report_path = os.path.join(folder, "report.txt")
# Start from an empty report: opening in "w" mode truncates any previous content.
with open(report_path, "w"):
    pass

def log_metrics(name, model, X_te, y_te):
    """Evaluate ``model`` on ``(X_te, y_te)`` and append the scores to the report file.

    Accuracy plus macro-averaged recall, precision and F1 are written under a
    ``name`` header to the module-level ``report_path``.
    """
    y_pred = model.predict(X_te)
    block = [
        f"####### {name} #######\n",
        f"Accuracy : {accuracy_score(y_te, y_pred):.4f}\n",
        f"Recall   : {recall_score(y_te, y_pred, average='macro'):.4f}\n",
        f"Precision: {precision_score(y_te, y_pred, average='macro'):.4f}\n",
        f"F1 Score : {f1_score(y_te, y_pred, average='macro'):.4f}\n\n",
    ]
    with open(report_path, "a") as fp:
        fp.writelines(block)

# --------------------------
# 10) Train / time / save / evaluate
# --------------------------
# Fit every model, time it, pickle it, then log its test metrics.
for name, model in tqdm(ML_models, total=len(ML_models), desc="Models"):
    print(f"\n▶ Training {name}")
    t0 = time.perf_counter()

    # Logistic regression is the only model evaluated on the standardized matrix.
    Xte = X_test_lr if name == "LogisticRegression" else X_test

    if name == "LogisticRegression":
        model.fit(X_tr_lr, y_tr)

    elif name == "XGBClassifier":
        # Newest API first (callbacks), then the older fit-time argument,
        # finally plain fitting without early stopping.
        try:
            from xgboost import callback as xgb_callback  # may not exist on very old versions
            es = [xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=False)]
            model.fit(
                X_tr_num, y_tr,
                eval_set=[(X_val_num, y_val)],
                callbacks=es,
            )
        except (TypeError, ImportError):
            try:
                model.fit(
                    X_tr_num, y_tr,
                    eval_set=[(X_val_num, y_val)],
                    early_stopping_rounds=50,
                    verbose=False,
                )
            except TypeError:
                model.fit(X_tr_num, y_tr)

    elif name == "LGBMClassifier":
        import lightgbm as lgb
        # Stop once the validation metric has not improved for 50 rounds.
        model.fit(
            X_tr_num, y_tr,
            eval_set=[(X_val_num, y_val)],
            eval_metric="multi_logloss",
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )

    else:
        model.fit(X_tr_num, y_tr)

    secs = time.perf_counter() - t0
    print(f"⏱ {name} trained in {secs:.1f}s")

    # Persist the fitted model next to the data.
    with open(os.path.join(folder, f"{name}_model.pkl"), "wb") as f:
        pickle.dump(model, f)

    print(f"▶ Testing {name}")
    log_metrics(name, model, Xte, y_test)

print(f"\nAll models trained, tested, and metrics logged to '{report_path}'.")


In [None]:
# Confusion Matrix (counts + row-normalized) for any saved model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os, pickle

# --- pick the saved model to visualize (must match the filename used when saving) ---
model_name = "LGBMClassifier"        # e.g. "LogisticRegression", "RandomForestClassifier", ...
model_path = os.path.join(folder, f"{model_name}_model.pkl")

with open(model_path, "rb") as f:
    model = pickle.load(f)

# Logistic regression was trained on standardized features; everything else on raw ones.
Xte = X_test if model_name != "LogisticRegression" else X_test_lr
y_pred = model.predict(Xte)

# y_test is integer-encoded, so the matrix is indexed 0..K-1 and the original
# class names from the encoder are used only as tick labels.
idx_labels   = np.arange(len(le.classes_))
display_lbls = le.classes_

# ---- raw counts ----
cm = confusion_matrix(y_test, y_pred, labels=idx_labels)
np.savetxt(os.path.join(folder, f"cm_counts_{model_name}.csv"), cm, fmt="%d", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_lbls).plot(
    ax=ax, xticks_rotation="vertical"
)
ax.set_title(f"Confusion Matrix (counts) — {model_name}")
fig.savefig(os.path.join(folder, f"cm_counts_{model_name}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

# ---- row-normalized (each row divided by the number of true samples of that class) ----
with np.errstate(invalid="ignore", divide="ignore"):
    row_sums = cm.sum(axis=1, keepdims=True)
    row_sums = np.where(row_sums == 0, 1, row_sums)  # a class absent from y_test keeps an all-zero row
    cm_norm = cm / row_sums

np.savetxt(os.path.join(folder, f"cm_normalized_{model_name}.csv"), cm_norm, fmt="%.4f", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=display_lbls).plot(
    ax=ax, xticks_rotation="vertical", values_format=".2f", cmap=plt.cm.Blues
)
ax.set_title(f"Confusion Matrix (row-normalized) — {model_name}")
fig.savefig(os.path.join(folder, f"cm_normalized_{model_name}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

print("Saved:",
      f"cm_counts_{model_name}.png / .csv",
      f"cm_normalized_{model_name}.png / .csv",
      sep="\n- ")


In [None]:
import os
import polars as pl

# Stack the Label columns of both splits into one frame.
df = pl.concat([train_pl.select("Label"), test_pl.select("Label")])

# Rows per label, most frequent first.
label_counts = df.group_by("Label").len().sort("len", descending=True)

# One "label: count" line per class.
out_path = os.path.join(folder, "label_counts.txt")
with open(out_path, "w") as f:
    for label, count in label_counts.iter_rows():
        f.write(f"{label}: {count}\n")

print(f"✅ Saved labels and counts to {out_path}")


## 7(6+1) Classes (Attack Labels)


We group the 19 attack labels into 6 attack categories; together with Benign this gives 7 classes.

In [9]:
# Fine-grained label -> coarse category (6 attack families + Benign).
# Built from (category, members) groups; insertion order follows the groups below.
dict_7classes = {
    label: category
    for category, members in (
        ("DDoS",       ("DDoS_ICMP", "DDoS_UDP", "DDoS_TCP", "DDoS_SYN")),
        ("DoS",        ("DoS_UDP", "DoS_TCP", "DoS_SYN", "DoS_ICMP")),
        ("Benign",     ("Benign",)),
        ("Spoofing",   ("Spoofing_ARP",)),
        ("Recon",      ("Recon_Ping_Sweep", "Recon_OS_Scan",
                        "Recon_Port_Scan", "Recon_Vulnerability_Scan")),
        ("MQTT",       ("MQTT_DDoS_Flooding", "MQTT_DoS_Flooding", "MQTT_Malformed_Data",
                        "MQTT_DoS_Attack", "MQTT_SlowITe")),
        ("BruteForce", ("Bruteforce",)),
    )
    for label in members
}



In [None]:
# === 7-class grouping run (Polars → NumPy) ===
import polars as pl
import numpy as np
import os, pickle, time
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split



y_column = "Label"

# --- Map labels to 7 classes (drop anything unmapped) ---
def map_labels_pl(df: pl.DataFrame, mapping: dict) -> pl.DataFrame:
    """Replace the label column of ``df`` by its grouped class and drop unmapped rows.

    Parameters
    ----------
    df : frame containing the ``y_column`` label column.
    mapping : ``{original label: grouped label}``.

    Returns
    -------
    A new frame whose label column holds the grouped labels (as strings). Rows whose
    label is not a key of ``mapping`` are removed.
    """
    mapped = df.with_columns(
        # replace_strict(..., default=None) turns every label missing from `mapping`
        # into null. It is the supported replacement for the deprecated
        # replace(..., default=...).
        pl.col(y_column)
          .cast(pl.Utf8)
          .replace_strict(mapping, default=None, return_dtype=pl.Utf8)
          .alias(y_column)
    )
    # Drop rows where mapping failed
    return mapped.drop_nulls(subset=[y_column])

train_pl_7 = map_labels_pl(train_pl, dict_7classes)
test_pl_7  = map_labels_pl(test_pl,  dict_7classes)

# --- Hygiene per split: replace ±inf→null, fill with medians, cast float32 ---
def prep_numeric_pl(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    """Return ``df`` with ``cols`` cleaned: ±inf → null, null/NaN → median, cast to Float32.

    Medians are computed from ``df`` itself, after the infinities have been nulled.
    """
    without_inf = df.with_columns([
        pl.when(pl.col(name).is_infinite()).then(None).otherwise(pl.col(name)).alias(name)
        for name in cols
    ])
    medians_row = without_inf.select([pl.col(name).median().alias(name) for name in cols])

    fill_exprs = []
    for name in cols:
        fill_value = medians_row[name][0]
        fill_exprs.append(
            pl.col(name).fill_null(fill_value).fill_nan(fill_value).cast(pl.Float32).alias(name)
        )
    return without_inf.with_columns(fill_exprs)

train_pl_7 = prep_numeric_pl(train_pl_7, numeric_cols)
test_pl_7  = prep_numeric_pl(test_pl_7,  numeric_cols)

# --- To NumPy (float32, Fortran order) ---
def to_f32_f_order(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Return ``df[cols]`` as a float32 NumPy array in Fortran (column-major) order."""
    return np.asfortranarray(df.select(cols).to_numpy(), dtype=np.float32)

# Unscaled feature matrices for the 7-class run.
X_train_full_7 = to_f32_f_order(train_pl_7, numeric_cols)
X_test_7       = to_f32_f_order(test_pl_7,  numeric_cols)

# --- Labels (string → LabelEncoder over union) ---
# Grouped labels as string arrays.
y_train_7_raw = train_pl_7[y_column].to_numpy().astype(str)
y_test_7_raw  = test_pl_7[y_column].to_numpy().astype(str)

# Fit the encoder on train ∪ test so both splits share one integer coding.
le7 = LabelEncoder().fit(np.concatenate([y_train_7_raw, y_test_7_raw]))
y_train_full_7 = le7.transform(y_train_7_raw)
y_test_7       = le7.transform(y_test_7_raw)

# --- Scaling for LR (reuse your pre-fitted scaler params already loaded) ---
def apply_scaler_f32(X_f):
    """Standardize ``X_f`` as ``(X_f - mean_f32) / scale_safe`` into a new float32 array.

    Relies on ``mean_f32`` and ``scale_safe`` already defined earlier in the session;
    the input is not modified. The result is allocated in Fortran order.
    """
    scaled = np.empty_like(X_f, dtype=np.float32, order='F')
    np.subtract(X_f, mean_f32, out=scaled)
    scaled /= scale_safe
    return scaled

# Standardized matrices for logistic regression (7-class run).
X_train_lr_full_7 = apply_scaler_f32(X_train_full_7)
X_test_lr_7       = apply_scaler_f32(X_test_7)

# --- Train/val split (stratified) for early stopping boosters ---
# One stratified 85/15 split applied to the raw matrix, the scaled matrix and the
# labels together, so all three share exactly the same row partition.
(
    X_tr_num7, X_val_num7,
    X_tr_lr7,  X_val_lr7,
    y_tr7,     y_val7,
) = train_test_split(
    X_train_full_7, X_train_lr_full_7, y_train_full_7,
    test_size=0.15, random_state=42, stratify=y_train_full_7,
)

# --- Append header to the same report.txt ---
report_path = os.path.join(folder, "report.txt")
# Append (not overwrite): the earlier results in the report are kept.
with open(report_path, "a") as fp:
    fp.writelines([
        "\n\n===== Grouped 7-Class Run =====\n",
        "Classes (encoded order): " + ", ".join(str(c) for c in le7.classes_) + "\n",
    ])

# --- Train / evaluate each model on 7 classes ---
# --- Train / evaluate each model on 7 classes ---
# Every estimator in ML_models is re-fitted on the grouped labels, timed, pickled
# under a "_7classes" filename and scored on the 7-class test set.
for name, model in tqdm(ML_models, total=len(ML_models), desc="Models (7-class)"):
    print(f"\n▶ Training {name} (7-class)")
    t0 = time.perf_counter()

    if name == "LogisticRegression":
        # LR is the only model trained/evaluated on standardized features.
        model.fit(X_tr_lr7, y_tr7)
        Xte = X_test_lr_7

    elif name == "XGBClassifier":
        # (Only if you have XGB in ML_models; your current list does not include it.)
        Xte = X_test_7
        try:
            from xgboost import callback as xgb_callback
            es = [xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=False)]
            model.fit(
                X_tr_num7, y_tr7,
                eval_set=[(X_val_num7, y_val7)],
                callbacks=es,
            )
        except (TypeError, ImportError):
            # Older API: early stopping passed directly to fit().
            try:
                model.fit(
                    X_tr_num7, y_tr7,
                    eval_set=[(X_val_num7, y_val7)],
                    early_stopping_rounds=50,
                    verbose=False,
                )
            except TypeError:
                # Last resort: no early stopping.
                model.fit(X_tr_num7, y_tr7)

    elif name == "LGBMClassifier":
        import lightgbm as lgb
        model.fit(
            X_tr_num7, y_tr7,
            eval_set=[(X_val_num7, y_val7)],
            eval_metric="multi_logloss",
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )
        Xte = X_test_7

    else:
        model.fit(X_tr_num7, y_tr7)
        Xte = X_test_7

    secs = time.perf_counter() - t0
    print(f"⏱ {name} (7-class) trained in {secs:.1f}s")

    # Save model
    with open(os.path.join(folder, f"{name}_7classes_model.pkl"), "wb") as f:
        pickle.dump(model, f)

    # Evaluate with the shared helper from the training cell instead of a copy of
    # its body. It appends the same four metrics to `report_path` under this header.
    print(f"▶ Testing {name} (7-class)")
    log_metrics(f"{name} (7 Classes)", model, Xte, y_test_7)

print("\n✅ All 7-class models trained & logged to 'report.txt'.")
print("Encoded class order (7-class):", list(le7.classes_))


In [None]:
# Confusion Matrix (counts + row-normalized) for 7-class models
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os, pickle

# --- pick the saved 7-class model to visualize (must match what was saved with _7classes) ---
model_name = "LGBMClassifier"        # e.g. "LogisticRegression", "RandomForestClassifier", ...
suffix = "7classes"                   # keeps filenames distinct

model_path = os.path.join(folder, f"{model_name}_{suffix}_model.pkl")
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Logistic regression was trained on standardized features; everything else on raw ones.
Xte = X_test_7 if model_name != "LogisticRegression" else X_test_lr_7
y_true = y_test_7
y_pred = model.predict(Xte)

# Index the matrix by the full 0..K-1 range so its shape never depends on which
# classes occur; class names from le7 are used only as tick labels.
idx_labels   = np.arange(len(le7.classes_))
display_lbls = le7.classes_

# --- Raw counts CM ---
cm = confusion_matrix(y_true, y_pred, labels=idx_labels)
np.savetxt(os.path.join(folder, f"cm_counts_{model_name}_{suffix}.csv"), cm, fmt="%d", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_lbls).plot(
    ax=ax, xticks_rotation="vertical"
)
ax.set_title(f"Confusion Matrix (counts) — {model_name} ({suffix})")
fig.savefig(os.path.join(folder, f"cm_counts_{model_name}_{suffix}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

# --- Row-normalized CM (each row divided by the number of true samples of that class) ---
with np.errstate(invalid="ignore", divide="ignore"):
    row_sums = cm.sum(axis=1, keepdims=True)
    row_sums = np.where(row_sums == 0, 1, row_sums)  # a class absent from y_true keeps an all-zero row
    cm_norm = cm / row_sums

np.savetxt(os.path.join(folder, f"cm_normalized_{model_name}_{suffix}.csv"), cm_norm, fmt="%.4f", delimiter=",")

fig, ax = plt.subplots(figsize=(10,10), dpi=200)
ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=display_lbls).plot(
    ax=ax, xticks_rotation="vertical", values_format=".2f", cmap=plt.cm.Blues
)
ax.set_title(f"Confusion Matrix (row-normalized) — {model_name} ({suffix})")
fig.savefig(os.path.join(folder, f"cm_normalized_{model_name}_{suffix}.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

print("Saved:",
      f"cm_counts_{model_name}_{suffix}.png / .csv",
      f"cm_normalized_{model_name}_{suffix}.png / .csv",
      sep="\n- ")


## 2(1+1) Classes (Attack or Benign)


Now let's reduce the problem to just 2 classes (Attack vs. Benign).

In [12]:
# Binary grouping: "Benign" stays "Benign", every other known label becomes "Attack".
# The tuple fixes the insertion order of the resulting dict.
dict_2classes = {
    label: ("Benign" if label == "Benign" else "Attack")
    for label in (
        # --- Benign ---
        "Benign",
        # --- DDoS ---
        "DDoS_ICMP", "DDoS_UDP", "DDoS_TCP", "DDoS_SYN",
        # --- DoS ---
        "DoS_UDP", "DoS_TCP", "DoS_SYN", "DoS_ICMP",
        # --- Spoofing ---
        "Spoofing_ARP",
        # --- Recon ---
        "Recon_Ping_Sweep", "Recon_OS_Scan", "Recon_Port_Scan", "Recon_Vulnerability_Scan",
        # --- BruteForce ---
        "Bruteforce",
        # --- MQTT ---
        "MQTT_DDoS_Flooding", "MQTT_DoS_Flooding", "MQTT_DoS_Attack",
        "MQTT_Malformed_Data", "MQTT_SlowITe",
    )
}


In [None]:
# === 2-class grouping run (Polars → NumPy) ===
import polars as pl
import numpy as np
import os, pickle, time
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

y_column = "Label"

# --- Map labels to 2 classes (drop anything unmapped) ---
def map_labels_pl(df: pl.DataFrame, mapping: dict) -> pl.DataFrame:
    """Replace the label column of ``df`` by its grouped class and drop unmapped rows.

    Parameters
    ----------
    df : frame containing the ``y_column`` label column.
    mapping : ``{original label: grouped label}``.

    Returns
    -------
    A new frame whose label column holds the grouped labels (as strings). Rows whose
    label is not a key of ``mapping`` are removed.
    """
    mapped = df.with_columns(
        # replace_strict(..., default=None) turns every label missing from `mapping`
        # into null. It is the supported replacement for the deprecated
        # replace(..., default=...).
        pl.col(y_column)
          .cast(pl.Utf8)
          .replace_strict(mapping, default=None, return_dtype=pl.Utf8)
          .alias(y_column)
    )
    return mapped.drop_nulls(subset=[y_column])

train_pl_2 = map_labels_pl(train_pl, dict_2classes)
test_pl_2  = map_labels_pl(test_pl,  dict_2classes)

# --- Hygiene per split: ±inf→null, fill medians, cast float32 ---
def prep_numeric_pl(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    """Return ``df`` with ``cols`` cleaned: ±inf → null, null/NaN → median, cast to Float32.

    Medians are computed from ``df`` itself, after the infinities have been nulled.
    """
    without_inf = df.with_columns([
        pl.when(pl.col(name).is_infinite()).then(None).otherwise(pl.col(name)).alias(name)
        for name in cols
    ])
    medians_row = without_inf.select([pl.col(name).median().alias(name) for name in cols])

    fill_exprs = []
    for name in cols:
        fill_value = medians_row[name][0]
        fill_exprs.append(
            pl.col(name).fill_null(fill_value).fill_nan(fill_value).cast(pl.Float32).alias(name)
        )
    return without_inf.with_columns(fill_exprs)

train_pl_2 = prep_numeric_pl(train_pl_2, numeric_cols)
test_pl_2  = prep_numeric_pl(test_pl_2,  numeric_cols)

# --- To NumPy (float32, Fortran order) ---
def to_f32_f_order(df: pl.DataFrame, cols: list[str]) -> np.ndarray:
    """Return ``df[cols]`` as a float32 NumPy array in Fortran (column-major) order."""
    return np.asfortranarray(df.select(cols).to_numpy(), dtype=np.float32)

# Unscaled feature matrices for the 2-class run.
X_train_full_2 = to_f32_f_order(train_pl_2, numeric_cols)
X_test_2       = to_f32_f_order(test_pl_2,  numeric_cols)

# --- Labels (string → LabelEncoder over union) ---
# Grouped labels as string arrays.
y_train_2_raw = train_pl_2[y_column].to_numpy().astype(str)
y_test_2_raw  = test_pl_2[y_column].to_numpy().astype(str)

# Fit the encoder on train ∪ test so both splits share one integer coding.
le2 = LabelEncoder().fit(np.concatenate([y_train_2_raw, y_test_2_raw]))
y_train_full_2 = le2.transform(y_train_2_raw)
y_test_2       = le2.transform(y_test_2_raw)

# --- Scaling for LR (reuse your pre-fitted scaler params already loaded) ---
def apply_scaler_f32(X_f):
    """Return `(X_f - mean_f32) / scale_safe` in a new float32, Fortran-ordered array.

    `mean_f32` and `scale_safe` are read from the enclosing notebook namespace
    (defined outside this block). `X_f` itself is not modified.
    """
    scaled = np.empty_like(X_f, dtype=np.float32, order='F')
    # Each ufunc returns its `out` buffer, so centering and scaling reuse the
    # single preallocated array without any temporaries.
    return np.divide(np.subtract(X_f, mean_f32, out=scaled), scale_safe, out=scaled)

# Scaled copies of the train/test matrices, used only by logistic regression.
X_train_lr_full_2, X_test_lr_2 = map(apply_scaler_f32, (X_train_full_2, X_test_2))

# --- Train/val split (stratified) for early stopping boosters ---
# Split the raw features, the scaled features and the labels in ONE call so all
# three are cut with the same row indices by construction. Two separate calls
# stay aligned only as long as seed, test_size and stratify vector are kept in
# sync by hand. The resulting partitions are identical to the two-call version.
(
    X_tr_num2, X_val_num2,
    X_tr_lr2,  X_val_lr2,
    y_tr2,     y_val2,
) = train_test_split(
    X_train_full_2, X_train_lr_full_2, y_train_full_2,
    test_size=0.15, random_state=42, stratify=y_train_full_2,
)

# --- Append the 2-class section header to the shared report.txt ---
report_path = os.path.join(folder, "report.txt")
encoded_classes = ", ".join(str(cls) for cls in le2.classes_)
with open(report_path, "a") as fp:
    fp.writelines([
        "\n\n===== Grouped 2-Class Run =====\n",
        "Classes (encoded order): " + encoded_classes + "\n",
    ])

# --- Fit, persist and score every estimator on the 2-class task ---
for name, model in tqdm(ML_models, total=len(ML_models), desc="Models (2-class)"):
    print(f"\n▶ Training {name} (2-class)")

    # Logistic regression is fitted and scored on the scaled matrices;
    # every other estimator uses the unscaled float32 features.
    Xte = X_test_lr_2 if name == "LogisticRegression" else X_test_2
    t0 = time.perf_counter()

    if name == "LogisticRegression":
        model.fit(X_tr_lr2, y_tr2)

    elif name == "LGBMClassifier":
        import lightgbm as lgb
        # Early stopping monitors binary log-loss on the held-out validation part.
        model.fit(
            X_tr_num2, y_tr2,
            eval_set=[(X_val_num2, y_val2)],
            eval_metric="binary_logloss",
            callbacks=[lgb.early_stopping(50, verbose=False)],
        )

    elif name == "XGBClassifier":
        # Only reached if XGB is present in ML_models.
        val_set = [(X_val_num2, y_val2)]
        try:
            # First choice: early stopping supplied as a fit-time callback.
            from xgboost import callback as xgb_callback
            stoppers = [xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=False)]
            model.fit(X_tr_num2, y_tr2, eval_set=val_set, callbacks=stoppers)
        except (TypeError, ImportError):
            try:
                # fit() rejected the callback keyword (or the import failed):
                # pass early stopping as a plain keyword instead.
                model.fit(
                    X_tr_num2, y_tr2,
                    eval_set=val_set,
                    early_stopping_rounds=50,
                    verbose=False,
                )
            except TypeError:
                # Neither keyword is accepted: train without early stopping.
                model.fit(X_tr_num2, y_tr2)

    else:
        model.fit(X_tr_num2, y_tr2)

    secs = time.perf_counter() - t0
    print(f"⏱ {name} (2-class) trained in {secs:.1f}s")

    # Persist the fitted estimator next to the data files.
    pkl_path = os.path.join(folder, f"{name}_2classes_model.pkl")
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(model, pkl_file)

    # Score on the held-out test split (macro-averaged over the two classes).
    print(f"▶ Testing {name} (2-class)")
    y_pred = model.predict(Xte)
    acc = accuracy_score(y_test_2, y_pred)
    rec, prec, f1 = (
        scorer(y_test_2, y_pred, average="macro")
        for scorer in (recall_score, precision_score, f1_score)
    )

    with open(report_path, "a") as fp:
        fp.writelines([
            f"####### {name} (2 Classes) #######\n",
            f"Accuracy : {acc:.4f}\n",
            f"Recall   : {rec:.4f}\n",
            f"Precision: {prec:.4f}\n",
            f"F1 Score : {f1:.4f}\n\n",
        ])

print("\n✅ All 2-class models trained & logged to 'report.txt'.")
print("Encoded class order (2-class):", list(le2.classes_))


In [None]:
# Confusion Matrix (counts + row-normalized) for 2-class models
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os, pickle

# --- Which saved model to visualize (must match a pickle written with the _2classes suffix) ---
model_name = "LGBMClassifier"   # e.g. "LogisticRegression", "RandomForestClassifier", ...
suffix = "2classes"             # keeps output filenames distinct

# NOTE: pickle.load can execute arbitrary code; only load pickles you wrote yourself.
model_path = os.path.join(folder, f"{model_name}_{suffix}_model.pkl")
with open(model_path, "rb") as pkl_file:
    model = pickle.load(pkl_file)

# Ground truth plus the matching test features: logistic regression gets the
# scaled matrix, every other model the unscaled one.
y_true = y_test_2
if model_name == "LogisticRegression":
    Xte = X_test_lr_2
else:
    Xte = X_test_2

# Predict
y_pred = model.predict(Xte)

# Explicit label indices (0..n_classes-1) force a fixed matrix shape;
# the tick names shown on the plots come from le2.
display_lbls = le2.classes_
idx_labels   = np.arange(len(display_lbls))

def _plot_and_save_cm(matrix, title, png_name, **plot_kwargs):
    """Draw `matrix` as a confusion-matrix figure, save it under `folder`, show it, close it.

    Extra keyword arguments are forwarded to `ConfusionMatrixDisplay.plot`.
    """
    fig, ax = plt.subplots(figsize=(8,8), dpi=200)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=display_lbls).plot(
        ax=ax, xticks_rotation="vertical", **plot_kwargs
    )
    ax.set_title(title)
    plt.savefig(os.path.join(folder, png_name), bbox_inches="tight")
    plt.show()
    plt.close(fig)


# --- Raw counts: CSV + figure ---
cm = confusion_matrix(y_true, y_pred, labels=idx_labels)
np.savetxt(os.path.join(folder, f"cm_counts_{model_name}_{suffix}.csv"), cm, fmt="%d", delimiter=",")
_plot_and_save_cm(
    cm,
    f"Confusion Matrix (counts) — {model_name} ({suffix})",
    f"cm_counts_{model_name}_{suffix}.png",
)

# --- Row-normalized (each row divided by its true-class total): CSV + figure ---
with np.errstate(invalid="ignore", divide="ignore"):
    row_sums = cm.sum(axis=1, keepdims=True)
    # A true class with no samples would divide by zero; use 1 so its row stays all zeros.
    row_sums = np.where(row_sums == 0, 1, row_sums)
    cm_norm = np.divide(cm, row_sums)

np.savetxt(os.path.join(folder, f"cm_normalized_{model_name}_{suffix}.csv"), cm_norm, fmt="%.4f", delimiter=",")
_plot_and_save_cm(
    cm_norm,
    f"Confusion Matrix (row-normalized) — {model_name} ({suffix})",
    f"cm_normalized_{model_name}_{suffix}.png",
    values_format=".2f",
    cmap=plt.cm.Blues,
)

# List the artefacts written above, one bullet per file pair.
saved_outputs = [
    f"cm_counts_{model_name}_{suffix}.png / .csv",
    f"cm_normalized_{model_name}_{suffix}.png / .csv",
]
print("\n- ".join(["Saved:", *saved_outputs]))
