In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ============================
# Isolation Forest for CIC-IDS-2017
# ============================

# --- Imports ---
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score

# --- Paths ---
DATA_DIR = Path("/content/drive/MyDrive/zeusOps/data/CIC-IDS-2017")
MODEL_DIR = Path("/content/drive/MyDrive/zeusOps/models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# --- Load preprocessed (scaled) data ---
X_train = pd.read_pickle(DATA_DIR / "cicids_x_train.pkl").values.astype("float32")
X_test  = pd.read_pickle(DATA_DIR / "cicids_x_test.pkl").values.astype("float32")
y_train_raw = np.array(pd.read_pickle(DATA_DIR / "cicids_y_train.pkl")).ravel()
y_test_raw  = np.array(pd.read_pickle(DATA_DIR / "cicids_y_test.pkl")).ravel()

# --- Detect normal label and binarize labels ---
unique, counts = np.unique(y_train_raw, return_counts=True)
normal_label = unique[np.argmax(counts)]
print("Detected normal label (treated as normal):", normal_label)

y_train = np.where(y_train_raw == normal_label, 0, 1)
y_test  = np.where(y_test_raw  == normal_label, 0, 1)

# --- Hyperparameter grid for randomized search ---
param_grid = {
    "n_estimators": [100, 200],
    "max_samples": [0.6, 0.8],
    "contamination": [0.05, 0.1, 0.15]  # adjust according to attack ratio
}

best_model = None
best_f1 = -1
best_params = {}

# --- Randomized search loop (faster than full grid) ---
for n in param_grid["n_estimators"]:
    for ms in param_grid["max_samples"]:
        for c in param_grid["contamination"]:
            print(f"\nTraining IF with n_estimators={n}, max_samples={ms}, contamination={c}")
            if_model = IsolationForest(
                n_estimators=n,
                max_samples=ms,
                contamination=c,
                random_state=42,
                n_jobs=-1
            )
            if_model.fit(X_train)

            # Predict: 1 = normal, -1 = anomaly
            y_pred = if_model.predict(X_test)
            y_pred_binary = np.where(y_pred == 1, 0, 1)

            f1 = f1_score(y_test, y_pred_binary)
            print(f"F1 = {f1:.4f}")

            if f1 > best_f1:
                best_f1 = f1
                best_model = if_model
                best_params = {"n_estimators": n, "max_samples": ms, "contamination": c}

# --- Final Evaluation with Best Model ---
y_pred_best = best_model.predict(X_test)
y_pred_binary_best = np.where(y_pred_best == 1, 0, 1)

print("\n=== Best Params ===")
print(best_params)
print(f"Best F1 Score: {best_f1:.4f}")

print("\n=== Confusion Matrix ===")
cm = confusion_matrix(y_test, y_pred_binary_best)
print(cm)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred_binary_best, target_names=['normal','attack'], digits=4))

roc_auc = roc_auc_score(y_test, y_pred_binary_best)
print(f"ROC-AUC: {roc_auc:.4f}")

# --- Save Best Model ---
joblib.dump(best_model, MODEL_DIR / "cicids_if.pkl")
print("Saved Isolation Forest model to:", MODEL_DIR / "cicids_if.pkl")


Detected normal label (treated as normal): 4

Training IF with n_estimators=100, max_samples=0.6, contamination=0.05
F1 = 0.1613

Training IF with n_estimators=100, max_samples=0.6, contamination=0.1
F1 = 0.3396

Training IF with n_estimators=100, max_samples=0.6, contamination=0.15
F1 = 0.4847

Training IF with n_estimators=100, max_samples=0.8, contamination=0.05
F1 = 0.1558

Training IF with n_estimators=100, max_samples=0.8, contamination=0.1
F1 = 0.3342

Training IF with n_estimators=100, max_samples=0.8, contamination=0.15
F1 = 0.4719

Training IF with n_estimators=200, max_samples=0.6, contamination=0.05
F1 = 0.1709

Training IF with n_estimators=200, max_samples=0.6, contamination=0.1
F1 = 0.3464

Training IF with n_estimators=200, max_samples=0.6, contamination=0.15
F1 = 0.4860

Training IF with n_estimators=200, max_samples=0.8, contamination=0.05
F1 = 0.1721

Training IF with n_estimators=200, max_samples=0.8, contamination=0.1
F1 = 0.3505

Training IF with n_estimators=200,