In [1]:
# Colab-only setup: attach Google Drive at /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# --- Imports ---
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
from xgboost import XGBClassifier
from tensorflow.keras.models import load_model
import warnings
warnings.filterwarnings("ignore")

# --- Paths (UNSW-NB15 dataset) ---
# Both directories live on the mounted Drive; edit here to relocate the project.
MODEL_DIR = Path("/content/drive/MyDrive/zeusOps/models")
DATA_DIR = Path("/content/drive/MyDrive/zeusOps/data/UNSW-NB15")

# --- Load Preprocessed Data (multiclass labels for UNSW) ---
# NOTE(review): read_pickle unpickles arbitrary objects -- only point it at
# files you produced yourself.
# The four splits are read in the order: train features, test features,
# train labels, test labels.
X_train, X_test, y_train, y_test = [
    pd.read_pickle(DATA_DIR / split_file)
    for split_file in (
        "unsw_x_train.pkl",
        "unsw_x_test.pkl",
        "unsw_y_train.pkl",
        "unsw_y_test.pkl",
    )
]

# --- Load Pretrained Models ---
def _as_ndarray(table_like):
    """Return ``table_like.values`` if that attribute exists, else ``np.array(table_like)``."""
    if hasattr(table_like, "values"):
        return table_like.values
    return np.array(table_like)


# NOTE(review): joblib.load unpickles arbitrary objects -- trusted files only.
if_model = joblib.load(MODEL_DIR / "unsw_if.pkl")
# compile=False: the model is only used for predict() in this notebook.
dae_model = load_model(MODEL_DIR / "unsw_dae_model.h5", compile=False)

# Convert to numpy: features keep their 2-D layout, labels are flattened to 1-D.
X_train = _as_ndarray(X_train)
X_test = _as_ndarray(X_test)
y_train = np.ravel(np.array(y_train))
y_test = np.ravel(np.array(y_test))

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

# --- Step 1: Get Anomaly Scores ---
def _row_mse(inputs, reconstruction):
    """Per-row mean of the squared element-wise difference (mean over axis 1)."""
    residual = inputs - reconstruction
    return (residual ** 2).mean(axis=1)


# decision_function output is negated before use.
# NOTE(review): presumably so that larger values mean "more anomalous" -- confirm
# against the estimator stored in unsw_if.pkl.
if_scores_train = np.negative(if_model.decision_function(X_train))
if_scores_test = np.negative(if_model.decision_function(X_test))

# Reconstructions from the pretrained dae_model; their error against the input
# is the second anomaly score.
X_train_recon = dae_model.predict(X_train, verbose=0)
X_test_recon = dae_model.predict(X_test, verbose=0)

dae_scores_train = _row_mse(X_train, X_train_recon)
dae_scores_test = _row_mse(X_test, X_test_recon)

# --- Step 2: Normalize & Augment ---
# The two anomaly scores become two columns; the scaler is fitted on the
# training scores only and then reused for the test scores.
scaler = MinMaxScaler()
train_scores = scaler.fit_transform(
    np.column_stack((if_scores_train, dae_scores_train))
)
test_scores = scaler.transform(
    np.column_stack((if_scores_test, dae_scores_test))
)

# Append the scaled scores to the right of the original feature columns.
X_train_aug = np.concatenate((X_train, train_scores), axis=1)
X_test_aug = np.concatenate((X_test, test_scores), axis=1)

# Persist the fitted scaler on its own as well.
joblib.dump(scaler, MODEL_DIR / "unsw_minmax_scaler.pkl")
print("Original features:", X_train.shape[1])
print("New features (augmented):", X_train_aug.shape[1])

# --- Step 3: Compute Class Weights ---
# Balanced weighting: weight_c = total / (n_classes * count_c), so rarer
# classes receive proportionally larger weights.
# `class_index[i]` is the position of y_train[i] inside `classes`; it lets the
# per-sample weights be gathered in one vectorized step instead of a Python
# loop with a dict lookup per training row, and it does not require the labels
# to be castable with int().
classes, class_index, counts = np.unique(
    y_train, return_inverse=True, return_counts=True
)
total = len(y_train)
n_classes = len(classes)

per_class_weight = total / (n_classes * counts)
class_weights = dict(zip(classes, per_class_weight))
print("\nClass Weights:", class_weights)

# One weight per training row, aligned with y_train.
sample_weights = per_class_weight[class_index]

# --- Step 4: Train Enhanced Multi-class XGBoost ---
num_classes = n_classes

# Hyperparameters gathered in one place so they are easy to read and tweak.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.1,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "multi:softprob",
    "num_class": num_classes,
    "n_jobs": -1,
    "eval_metric": "mlogloss",
}
xgb_model = XGBClassifier(**xgb_params)

# Fit on the augmented features, weighting each row by its class weight.
xgb_model.fit(X_train_aug, y_train, sample_weight=sample_weights)

# --- Step 5: Evaluate Default Predictions ---
def _ovr_auc(average):
    """ROC-AUC of y_prob against the binarized test labels with the given averaging."""
    return roc_auc_score(y_test_bin, y_prob, average=average, multi_class="ovr")


# Hard labels and per-class probabilities from the trained model.
y_pred = xgb_model.predict(X_test_aug)
y_prob = xgb_model.predict_proba(X_test_aug)

for heading, body in (
    ("\n=== Confusion Matrix (Default) ===", confusion_matrix(y_test, y_pred)),
    (
        "\n=== Classification Report (Default) ===",
        classification_report(y_test, y_pred, digits=4),
    ),
):
    print(heading)
    print(body)

# One indicator column per class id in 0..num_classes-1.
y_test_bin = label_binarize(y_test, classes=np.arange(num_classes))
roc_macro = _ovr_auc("macro")
roc_per_class = _ovr_auc(None)
print("\nMacro ROC-AUC:", roc_macro)
print(
    "Per-Class ROC-AUC:",
    {cls_id: auc for cls_id, auc in zip(range(num_classes), roc_per_class)},
)

# --- Step 6: Threshold Tuning (per class) ---
# For each class, scan 17 evenly spaced cut-offs in [0.1, 0.9] and keep the
# first one that achieves the highest one-vs-rest F1; if every cut-off scores
# an F1 of 0 the threshold stays at 0.5.
# NOTE(review): the thresholds are selected against y_test, and the
# "Threshold Adjusted" metrics printed afterwards are computed on that same
# data, so they are optimistically biased. Consider tuning on a held-out
# validation split instead.
candidate_thresholds = np.linspace(0.1, 0.9, 17)
y_pred_thresh = np.zeros_like(y_test)
best_thresholds = {}

for cls_id in range(num_classes):
    is_cls = (y_test == cls_id).astype(int)
    f1_per_cutoff = [
        f1_score(is_cls, (y_prob[:, cls_id] >= cutoff).astype(int))
        for cutoff in candidate_thresholds
    ]
    # np.argmax returns the first maximum, matching a strict ">" scan.
    top = int(np.argmax(f1_per_cutoff))
    best_thresholds[cls_id] = (
        candidate_thresholds[top] if f1_per_cutoff[top] > 0 else 0.5
    )

# Apply thresholds: among the classes whose probability clears their tuned
# threshold, pick the most probable one; if no class clears its threshold,
# fall back to the plain highest-probability class.
# (Taking the first passing class would favour low class ids whenever several
# classes pass, regardless of how confident the model is in each.)
threshold_vec = np.array([best_thresholds[c] for c in range(num_classes)])
passes = y_prob >= threshold_vec  # boolean mask, one column per class

# Non-passing classes are pushed to -inf so argmax only sees passing ones.
best_passing = np.where(passes, y_prob, -np.inf).argmax(axis=1)
y_pred_thresh = np.where(
    passes.any(axis=1), best_passing, y_prob.argmax(axis=1)
).astype(y_test.dtype)  # keep the label dtype used by y_test

print("\n=== Confusion Matrix (Threshold Adjusted) ===")
print(confusion_matrix(y_test, y_pred_thresh))

print("\n=== Classification Report (Threshold Adjusted) ===")
print(classification_report(y_test, y_pred_thresh, digits=4))

# --- Step 7: Save Enhanced Model + Thresholds ---
# Everything needed at inference time goes into one artifact: the classifier,
# the score scaler, and the per-class thresholds.
enhanced_bundle = {
    "model": xgb_model,
    "scaler": scaler,
    "thresholds": best_thresholds,
}
bundle_path = MODEL_DIR / "unsw_xgb_enhanced.pkl"
joblib.dump(enhanced_bundle, bundle_path)

print("\nEnhanced UNSW XGBoost model + thresholds saved!")

Train shape: (1647530, 24)  Test shape: (411883, 24)
Original features: 24
New features (augmented): 26

Class Weights: {np.int32(0): np.float64(85.73294478846854), np.int32(1): np.float64(111.19187419855571), np.int32(2): np.float64(626.6755420311906), np.int32(3): np.float64(33.04842333306588), np.int32(4): np.float64(6.783615858755131), np.int32(5): np.float64(8.590012304740453), np.int32(6): np.float64(7.377374374221976), np.int32(7): np.float64(0.09553127059900814), np.int32(8): np.float64(14.016044782468141), np.int32(9): np.float64(123.8837506579442), np.int32(10): np.float64(1093.2514930325149)}

=== Confusion Matrix (Default) ===
[[   105    216     49     38      8      3      5      6      7      0
       0]
 [   209     63      0     36     14      5      3      0      5      2
       0]
 [    37      0     10      5      4      2      0      0      2      0
       0]
 [   111    191     32    283    382     27     18      1     41     44
       3]
 [   131    197     46   