In [None]:
# Attach Google Drive to the Colab runtime; the data and model paths used
# later in this notebook live underneath this mount point.
from google.colab import drive

COLAB_DRIVE_MOUNT = "/content/drive"
drive.mount(COLAB_DRIVE_MOUNT)

Mounted at /content/drive


In [None]:
# --- Imports ---
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score ,f1_score
from xgboost import XGBClassifier
from tensorflow.keras.models import load_model
import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
# NOTE(review): this also hides deprecation and metric warnings raised by the
# libraries used below — consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

In [None]:
# --- Locations of the preprocessed CICIDS data and the pretrained models ---
DATA_DIR = Path("/content/drive/MyDrive/zeusOps/data/CIC-IDS-2017")
MODEL_DIR = Path("/content/drive/MyDrive/zeusOps/models")


def _as_ndarray(frame_or_array):
    """Return ``.values`` when the object exposes it, else wrap it with ``np.array``."""
    if hasattr(frame_or_array, "values"):
        return frame_or_array.values
    return np.array(frame_or_array)


# --- Preprocessed splits (multiclass labels for CICIDS) ---
# Features go through the helper above; labels are wrapped with np.array directly.
# NOTE: read_pickle unpickles arbitrary objects — only load files you produced yourself.
X_train = _as_ndarray(pd.read_pickle(DATA_DIR / "cicids_x_train.pkl"))
X_test = _as_ndarray(pd.read_pickle(DATA_DIR / "cicids_x_test.pkl"))
y_train = np.array(pd.read_pickle(DATA_DIR / "cicids_y_train.pkl"))
y_test = np.array(pd.read_pickle(DATA_DIR / "cicids_y_test.pkl"))

# --- Pretrained anomaly detectors ---
# Isolation Forest (trained on CICIDS)
if_model = joblib.load(MODEL_DIR / "cicids_if_model.pkl")
# Denoising Autoencoder (trained on CICIDS); compile=False because it is only
# used for inference in this notebook.
dae_model = load_model(MODEL_DIR / "cicids_dae_model.h5", compile=False)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

Train shape: (1764525, 52)  Test shape: (756226, 52)


In [None]:
# --- Step 1: Anomaly scores from the two pretrained detectors ---


def _reconstruction_mse(features, reconstruction):
    """Per-row mean of the squared difference between input and reconstruction."""
    return np.square(features - reconstruction).mean(axis=1)


# Isolation Forest score: the negated decision_function output.
# (Assuming a scikit-learn IsolationForest, negation makes larger = more anomalous.)
if_scores_train = -if_model.decision_function(X_train)
if_scores_test = -if_model.decision_function(X_test)

# Autoencoder score: how badly each row is reconstructed.
X_train_recon = dae_model.predict(X_train, verbose=0)
X_test_recon = dae_model.predict(X_test, verbose=0)

dae_scores_train = _reconstruction_mse(X_train, X_train_recon)
dae_scores_test = _reconstruction_mse(X_test, X_test_recon)

In [None]:
# --- Step 2: Scale the anomaly scores and append them as extra features ---
# Two columns per split: [isolation-forest score, autoencoder reconstruction error].
train_score_matrix = np.column_stack((if_scores_train, dae_scores_train))
test_score_matrix = np.column_stack((if_scores_test, dae_scores_test))

# The scaler is fitted on the training scores only and then reused for the test scores.
scaler = MinMaxScaler()
scaler.fit(train_score_matrix)
train_scores = scaler.transform(train_score_matrix)
test_scores = scaler.transform(test_score_matrix)

# Original features first, the two scaled scores as the last two columns.
X_train_aug = np.concatenate([X_train, train_scores], axis=1)
X_test_aug = np.concatenate([X_test, test_scores], axis=1)

# Persist the fitted scaler next to the models so the same scaling can be reapplied later.
joblib.dump(scaler, MODEL_DIR / "cicids_minmax_scaler.pkl")
print("Original features:", X_train.shape[1])
print("New features (augmented):", X_train_aug.shape[1])

Original features: 52
New features (augmented): 54


In [None]:

# --- Step 3: Compute Class Weights ---
# "Balanced" weighting: weight_c = n_samples / (n_classes * count_c), so rare
# classes receive proportionally larger weights.
# return_inverse gives, for every label in y_train, the index of its class in
# `classes`; it is used below to look up per-sample weights without a Python loop.
classes, class_index, counts = np.unique(
    y_train, return_inverse=True, return_counts=True
)
total = len(y_train)
n_classes = len(classes)

class_weights = {c: total / (n_classes * cnt) for c, cnt in zip(classes, counts)}
print("\nClass Weights:", class_weights)

# Assign per-sample weights: one vectorised gather instead of a dict lookup per
# training row (the training set has well over a million rows).
weight_per_class = np.array([class_weights[c] for c in classes])
sample_weights = weight_per_class[class_index]


Class Weights: {np.int32(0): np.float64(184.80571847507332), np.int32(1): np.float64(39.3559718969555), np.int32(2): np.float64(2.8130231001004353), np.int32(3): np.float64(1.8586723295064924), np.int32(4): np.float64(0.17188427992709365), np.int32(5): np.float64(3.9705604385218787), np.int32(6): np.float64(168.05)}


In [None]:
# --- Step 4: Train Enhanced Multi-class XGBoost ---
# The classifier is trained on the augmented features (original columns plus the
# two scaled anomaly scores) with the per-sample class-balancing weights.
num_classes = n_classes

xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 8,
    "subsample": 0.8,          # row subsampling per tree
    "colsample_bytree": 0.8,   # column subsampling per tree
    "objective": "multi:softprob",
    "num_class": num_classes,
    "n_jobs": -1,
    "eval_metric": "mlogloss",
}
xgb_model = XGBClassifier(**xgb_params)

xgb_model.fit(X_train_aug, y_train, sample_weight=sample_weights)

In [None]:
# --- Step 5: Evaluate Default Predictions ---
# Per-class probabilities (used later for ROC-AUC and threshold tuning) ...
y_prob = xgb_model.predict_proba(X_test_aug)
# ... and the model's own hard label predictions for the test set.
y_pred = xgb_model.predict(X_test_aug)

In [None]:
# Confusion matrix of the default predictions against the test labels.
cm_default = confusion_matrix(y_test, y_pred)
print("\n=== Confusion Matrix (Default) ===")
print(cm_default)


=== Confusion Matrix (Default) ===
[[   560      0      0      0     24      0      0]
 [     0   2742      0      2      1      0      0]
 [     0      0  38399      0      5      0      0]
 [     0      0      0  58112      7      4      1]
 [   246      0      6    105 627859    297      5]
 [     0      0      0      9      2  27193      4]
 [     0      0      0      0      3      0    640]]


In [None]:
# Per-class precision / recall / F1 for the default predictions, 4 decimal places.
report_default = classification_report(y_test, y_pred, digits=4)
print("\n=== Classification Report (Default) ===")
print(report_default)


=== Classification Report (Default) ===
              precision    recall  f1-score   support

           0     0.6948    0.9589    0.8058       584
           1     1.0000    0.9989    0.9995      2745
           2     0.9998    0.9999    0.9999     38404
           3     0.9980    0.9998    0.9989     58124
           4     0.9999    0.9990    0.9994    628518
           5     0.9891    0.9994    0.9942     27208
           6     0.9846    0.9953    0.9899       643

    accuracy                         0.9990    756226
   macro avg     0.9523    0.9930    0.9697    756226
weighted avg     0.9991    0.9990    0.9991    756226



In [None]:
# Macro-averaged one-vs-rest ROC-AUC: binarise the test labels into one indicator
# column per class id 0..num_classes-1 and score them against the probabilities.
class_ids = np.arange(num_classes)
y_test_bin = label_binarize(y_test, classes=class_ids)
roc_macro = roc_auc_score(
    y_test_bin,
    y_prob,
    multi_class="ovr",
    average="macro",
)
print("\nMacro ROC-AUC:", roc_macro)


Macro ROC-AUC: 0.9999089442731155


In [None]:
# --- Step 6: Threshold Tuning (per class) ---
# For every class, sweep a coarse grid of probability cut-offs and keep the one
# with the best one-vs-rest F1. If no cut-off yields a positive F1 the default
# of 0.5 is kept.
# NOTE(review): the sweep is scored against y_test / y_prob (test-set
# probabilities), so the "threshold adjusted" metrics reported afterwards are
# optimistic; a held-out validation split would be needed for an unbiased estimate.
best_thresholds = {}
y_pred_thresh = np.zeros_like(y_test)  # filled in when the thresholds are applied

candidate_thresholds = np.linspace(0.1, 0.9, 17)  # 0.10, 0.15, ..., 0.90

for c in range(num_classes):
    is_class_c = (y_test == c).astype(int)  # one-vs-rest ground truth
    best_f1, best_t = 0, 0.5
    for t in candidate_thresholds:
        f1 = f1_score(is_class_c, (y_prob[:, c] >= t).astype(int))
        # strict ">" keeps the lowest threshold among equally good ones
        if f1 > best_f1:
            best_f1, best_t = f1, t
    best_thresholds[c] = best_t


In [None]:
# Apply thresholds: a class qualifies when its probability reaches its tuned
# threshold. If several classes qualify, take the MOST PROBABLE of them (taking
# the first one would systematically favour low class indices regardless of
# confidence). If none qualifies, fall back to the plain argmax.
threshold_vector = np.array([best_thresholds[c] for c in range(num_classes)])
qualifies = y_prob >= threshold_vector  # (n_samples, n_classes), broadcast per column

# Non-qualifying entries are masked to -inf so argmax only sees qualifying classes.
best_qualifying = np.where(qualifies, y_prob, -np.inf).argmax(axis=1)
plain_argmax = y_prob.argmax(axis=1)

# Vectorised over all rows; result keeps the label dtype of y_test.
y_pred_thresh = np.where(
    qualifies.any(axis=1), best_qualifying, plain_argmax
).astype(y_test.dtype)

In [None]:
# Confusion matrix of the threshold-adjusted predictions against the test labels.
cm_thresholded = confusion_matrix(y_test, y_pred_thresh)
print("\n=== Confusion Matrix (Threshold Adjusted) ===")
print(cm_thresholded)



=== Confusion Matrix (Threshold Adjusted) ===
[[   541      0      0      0     43      0      0]
 [     0   2743      0      0      2      0      0]
 [     0      0  38398      0      6      0      0]
 [     0      0      0  58104     15      4      1]
 [   199      0      2     87 627932    294      4]
 [     0      0      0      9      3  27193      3]
 [     0      0      0      0      6      0    637]]


In [None]:
# Per-class precision / recall / F1 for the threshold-adjusted predictions.
report_thresholded = classification_report(y_test, y_pred_thresh, digits=4)
print("\n=== Classification Report (Threshold Adjusted) ===")
print(report_thresholded)



=== Classification Report (Threshold Adjusted) ===
              precision    recall  f1-score   support

           0     0.7311    0.9264    0.8172       584
           1     1.0000    0.9993    0.9996      2745
           2     0.9999    0.9998    0.9999     38404
           3     0.9984    0.9997    0.9990     58124
           4     0.9999    0.9991    0.9995    628518
           5     0.9892    0.9994    0.9943     27208
           6     0.9876    0.9907    0.9891       643

    accuracy                         0.9991    756226
   macro avg     0.9580    0.9878    0.9712    756226
weighted avg     0.9992    0.9991    0.9991    756226



In [None]:
# --- Step 7: Save Enhanced Model ---
# The fitted classifier is pickled with joblib into the shared model directory.
enhanced_model_path = MODEL_DIR / "cicids_xgb_enhanced.pkl"
joblib.dump(xgb_model, enhanced_model_path)
print("\nEnhanced CICIDS XGBoost model saved!")



Enhanced CICIDS XGBoost model saved!
