# Model Comparison

### Environment and File Checks

In [None]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import xgboost as xgb

# Top-level packages are imported here only so their version strings can be
# reported alongside the interpreter and the other libraries.
import imblearn
import sklearn

print("Environment check:")

# (label, version string) pairs, reported in this order.
reported_versions = (
    ("Python:", sys.version),
    ("Pandas:", pd.__version__),
    ("NumPy:", np.__version__),
    ("scikit-learn:", sklearn.__version__),
    ("imbalanced-learn:", imblearn.__version__),
    ("XGBoost:", xgb.__version__),
)
for pkg_label, pkg_version in reported_versions:
    print(pkg_label, pkg_version)


### Load Training data

In [None]:
# Resolve the project root from the working directory: if Jupyter was started
# inside notebooks/, the root is its parent; otherwise it is the cwd itself.
cwd = os.getcwd()
project_root = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd
print(f"Project root: {project_root}")

# Expected layout: <project_root>/data/processed/train.csv
data_dir = os.path.join(project_root, "data")
processed_dir = os.path.join(data_dir, "processed")
train_path = os.path.join(processed_dir, "train.csv")

# Stop here with an actionable message if the processed file is missing.
if os.path.exists(train_path):
    print(f"Found training data: {train_path}")
else:
    raise FileNotFoundError(
        f"Missing {train_path}. "
        "Make sure you ran 01_preprocessing.ipynb successfully first."
    )


In [None]:
# Load the processed training split located by the previous cell.
train_df = pd.read_csv(train_path)
# Report (rows, columns), then preview the first rows as the cell output.
print("Train shape:", train_df.shape)
train_df.head()


### Separate Features and Labels

In [None]:
# The binary target column is mandatory; everything below depends on it.
if "label" not in train_df.columns:
    raise KeyError("Expected column 'label' in train.csv (binary insider / normal).")

# Columns kept out of the feature matrix: the target itself, plus 'insider'
# (scenario ID) when the file contains it.
drop_cols = [col for col in ("label", "insider") if col in train_df.columns]

# Target as integers; features are every remaining column.
y = train_df["label"].astype(int)
X = train_df.drop(columns=drop_cols)

print("Feature matrix shape:", X.shape)
print("Label vector shape:", y.shape)
print("\nLabel distribution (train):", y.value_counts(), sep="\n")


### Define Models and SMOTE pipeline

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb
from collections import Counter

def smote_pipeline(model, sampling=0.3, scale=False):
    """
    Build an imbalanced-learn pipeline that oversamples the minority class
    with SMOTE before fitting ``model`` (used here for Logistic Regression
    and Linear SVM).

    When ``scale`` is true, a ``StandardScaler`` step is placed *before*
    SMOTE. SMOTE creates synthetic samples by interpolating between nearest
    neighbours, so running it on unscaled features lets large-range columns
    dominate the neighbour search; scaling first avoids that.

    Parameters
    ----------
    model : estimator
        Final estimator of the pipeline (step name ``'model'``).
    sampling : float or str, default=0.3
        Passed to SMOTE as ``sampling_strategy``.
    scale : bool, default=False
        Whether to add a ``StandardScaler`` step (step name ``'scale'``).

    Returns
    -------
    imblearn.pipeline.Pipeline
        Steps ``['scale' (optional), 'smote', 'model']``.
    """
    steps = []
    if scale:
        # Standardise first so SMOTE's distance computations are scale-free.
        steps.append(('scale', StandardScaler()))
    steps.append(('smote', SMOTE(random_state=42, sampling_strategy=sampling)))
    steps.append(('model', model))
    return ImbPipeline(steps=steps)
    
# Define Models

# Class counts of the training labels. XGBoost receives the negative/positive
# count ratio as scale_pos_weight instead of being wrapped in SMOTE.
counter = Counter(y)
scale_pos_weight = counter[0] / counter[1]
print(f"scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

# Linear models: wrapped in SMOTE + scaling when registered below.
log_reg = LogisticRegression(
    solver="lbfgs",
    max_iter=500,
    tol=1e-3,
    class_weight="balanced",
    random_state=42,
)
linear_svm = LinearSVC(
    class_weight="balanced",
    max_iter=5000,
    tol=1e-3,
    random_state=42,
)

# Tree ensembles: no SMOTE; imbalance is handled through class weighting.
random_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    class_weight="balanced",
    random_state=42,
)
xgboost_clf = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
)

# Display name -> estimator. Insertion order is the order in which the
# models are iterated.
models = {
    "LogReg + SMOTE": smote_pipeline(log_reg, sampling=0.3, scale=True),
    "Linear SVM + SMOTE": smote_pipeline(linear_svm, sampling=0.3, scale=True),
    "Random Forest": random_forest,
    "XGBoost": xgboost_clf,
}


### Cross-Validation
This may take a while to run: every model is trained and evaluated on each of the 5 stratified folds.

In [None]:
# Stratified 5-fold split, shuffled with a fixed seed so runs are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {"precision": "precision", "recall": "recall", "f1": "f1"}

# Console prefix per metric, padded so the printed values line up.
metric_prefixes = {
    "precision": "Precision: ",
    "recall": "Recall:    ",
    "f1": "F1-score:  ",
}

results = []

for name, model in models.items():
    print(f"\n***Evaluating: {name}***")
    cv_result = cross_validate(
        model, X, y, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=False
    )

    # One summary row per model: mean and std of each metric across folds.
    row = {"model": name}
    for metric, prefix in metric_prefixes.items():
        fold_scores = cv_result[f"test_{metric}"]
        fold_mean = fold_scores.mean()
        fold_std = fold_scores.std()
        print(f"{prefix}{fold_mean:.4f} ± {fold_std:.4f}")
        row[f"{metric}_mean"] = fold_mean
        row[f"{metric}_std"] = fold_std
    results.append(row)

results_df = pd.DataFrame(results)
results_df


### Sort results to determine best model

In [None]:
import pandas as pd
import numpy as np

df = results_df.copy()

# Rank every metric with 1 = best (highest mean).
# method="min" gives tied models the same whole-number rank; the default
# "average" yields fractional ranks (e.g. 2.0 for a three-way tie at the top)
# that the integer cast would distort. na_option="bottom" ranks a missing
# score last instead of leaving NaN, which cannot be cast to int.
for metric in ("precision", "recall", "f1"):
    df[f"{metric}_rank"] = (
        df[f"{metric}_mean"]
        .rank(ascending=False, method="min", na_option="bottom")
        .astype(int)
    )

# Presentation table: scores rounded to 3 decimals next to their ranks.
summary = pd.DataFrame({
    "Model": df["model"],
    "Precision": df["precision_mean"].round(3).astype(str),
    "Recall": df["recall_mean"].round(3).astype(str),
    "F1 Score": df["f1_mean"].round(3).astype(str),
    "Precision Rank": df["precision_rank"],
    "Recall Rank": df["recall_rank"],
    "F1 Rank": df["f1_rank"],
})

# Sort by best model (highest F1); a stable sort keeps tied models in their
# original order.
summary_sorted = summary.sort_values(by="F1 Rank", kind="stable")

summary_sorted


# Key Observations

### Random Forest

RF becomes overly conservative, predicting threats only in extremely confident cases.

Its high precision but poor recall means it would miss most insider threats.

### Logistic Regression + SMOTE

Produces many false positives

High recall comes at the cost of extremely poor precision

### Linear SVM + SMOTE

More balanced than logistic regression and RF

Moderate precision and recall

Still underperforms compared to XGBoost

### XGBoost

Best overall performance

Strong precision and recall balance

# Conclusion
XGBoost is the strongest baseline model and will make a strong foundation for the next stage of the pipeline.

The next notebook (03_modelTuning) will focus on hyperparameter optimization using RandomizedSearchCV to further improve performance before feature selection and threshold calibration.