# Model Comparison

### Environment and File Checks

In [1]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import xgboost as xgb

# The top-level sklearn / imblearn packages are imported here only so that
# their version strings can be reported below.
import sklearn, imblearn

# Print the interpreter and library versions, one "<name>: <version>" line each,
# so the results in this notebook can be tied to a concrete environment.
print("Environment check:")
for pkg_label, pkg_version in (
    ("Python", sys.version),
    ("Pandas", pd.__version__),
    ("NumPy", np.__version__),
    ("scikit-learn", sklearn.__version__),
    ("imbalanced-learn", imblearn.__version__),
    ("XGBoost", xgb.__version__),
):
    print(f"{pkg_label}:", pkg_version)


Environment check:
Python: 3.10.18 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:08:55) [MSC v.1929 64 bit (AMD64)]
Pandas: 2.3.3
NumPy: 2.2.6
scikit-learn: 1.7.2
imbalanced-learn: 0.14.0
XGBoost: 3.0.5


### Load Training data

In [2]:
# Resolve the project root from the current working directory: when Jupyter
# was launched from inside notebooks/, the root is one level up; otherwise the
# working directory itself is taken as the root.
cwd = os.getcwd()
project_root = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd

print(f"Project root: {project_root}")

# Expected layout: <project_root>/data/processed/train.csv
data_dir = os.path.join(project_root, "data")
processed_dir = os.path.join(data_dir, "processed")
train_path = os.path.join(processed_dir, "train.csv")

# Fail early, with a pointer to the preprocessing notebook, if the file is absent.
train_file_found = os.path.exists(train_path)
if not train_file_found:
    missing_msg = (
        f"Missing {train_path}. "
        "Make sure you ran 01_preprocessing.ipynb successfully first."
    )
    raise FileNotFoundError(missing_msg)

print(f"Found training data: {train_path}")


Project root: C:\Users\MayaW\OneDrive - North Dakota University System\UND 2025-2026\CS492 - Senior Project\Final Notebooks
Found training data: C:\Users\MayaW\OneDrive - North Dakota University System\UND 2025-2026\CS492 - Senior Project\Final Notebooks\data\processed\train.csv


In [3]:
# Read the preprocessed training split into memory.
train_df = pd.read_csv(train_path)

# Report (rows, columns) as a quick sanity check, then preview the first rows.
print(f"Train shape: {train_df.shape}")
train_df.head()


Train shape: (40299, 665)


Unnamed: 0,role,b_unit,f_unit,dept,team,ITAdmin,O,C,E,A,...,weekendhttp_hackf_mean_url_len,weekendhttp_hackf_mean_url_depth,weekendhttp_hackf_mean_http_c_len,weekendhttp_hackf_mean_http_c_nwords,weekendhttp_hackf_n-pc0,weekendhttp_hackf_n-pc1,weekendhttp_hackf_n-pc2,weekendhttp_hackf_n-pc3,insider,label
0,39,0,4,15,29,0,46,20,32,26,...,0,0,0,0,0,0,0,0,0,0
1,39,0,4,15,4,0,36,41,45,19,...,0,0,0,0,0,0,0,0,0,0
2,39,0,4,15,11,0,11,20,39,20,...,0,0,0,0,0,0,0,0,0,0
3,39,0,1,14,37,0,26,23,16,30,...,0,0,0,0,0,0,0,0,0,0
4,24,0,1,5,28,0,27,35,20,36,...,0,0,0,0,0,0,0,0,0,0


### Separate Features and Labels

In [4]:
# Split the training frame into a feature matrix X and a binary target y.
if "label" not in train_df.columns:
    raise KeyError("Expected column 'label' in train.csv (binary insider / normal).")

# Columns that must not be used as model inputs:
#   - 'label'      : the binary target itself
#   - 'insider'    : scenario ID, dropped if present
#   - 'Unnamed: N' : leftover row-index column(s). The preview of train.csv
#                    shows an 'Unnamed: 0' column that just holds the saved
#                    row number (0, 1, 2, ...); it is not a real feature and
#                    would let the models key on row order.
drop_cols = ["label"]
if "insider" in train_df.columns:
    drop_cols.append("insider")
drop_cols.extend(
    col for col in train_df.columns if str(col).startswith("Unnamed:")
)

X = train_df.drop(columns=drop_cols)
y = train_df["label"].astype(int)

print("Feature matrix shape:", X.shape)
print("Label vector shape:", y.shape)
print("\nLabel distribution (train):")
print(y.value_counts())


Feature matrix shape: (40299, 663)
Label vector shape: (40299,)

Label distribution (train):
label
0    40109
1      190
Name: count, dtype: int64


### Define Models and SMOTE pipeline

In [11]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb
from collections import Counter

def smote_pipeline(model, sampling=0.3, scale=False, random_state=42):
    """
    Create a pipeline with optional scaling + SMOTE for models
    that benefit from oversampling (Logistic Regression, SVM).

    When ``scale`` is true the StandardScaler is placed *before* SMOTE.
    SMOTE builds synthetic samples from nearest neighbours, so running it on
    unscaled features lets the largest-magnitude columns dominate the
    neighbour search. Scaling first also means the scaler statistics are
    computed from real samples only, not from synthetic ones.

    Parameters
    ----------
    model : estimator
        Final estimator, registered under the step name 'model'.
    sampling : float, default=0.3
        Passed to SMOTE as ``sampling_strategy``.
    scale : bool, default=False
        If true, add a StandardScaler step named 'scale' ahead of SMOTE.
    random_state : int, default=42
        Seed passed to SMOTE.

    Returns
    -------
    ImbPipeline
        Pipeline with steps 'scale' (only if ``scale``), 'smote', 'model'.
    """
    steps = []
    if scale:
        steps.append(('scale', StandardScaler()))
    steps.append(
        ('smote', SMOTE(random_state=random_state, sampling_strategy=sampling))
    )
    steps.append(('model', model))
    return ImbPipeline(steps=steps)
    
# Define Models
# Registry of candidate classifiers keyed by display name; the insertion order
# is the order in which the cross-validation loop evaluates them.
models = {}

# 1) Logistic Regression + SMOTE + scaling
# NOTE(review): class_weight='balanced' is applied on top of SMOTE, so the
# minority class is compensated twice — confirm this is intended.
models["LogReg + SMOTE"] = smote_pipeline(
    LogisticRegression(
        solver='lbfgs',
        max_iter=500,
        tol=1e-3,
        class_weight='balanced',
        random_state=42
    ),
    sampling=0.3,
    scale=True
)

# 2) Linear SVM + SMOTE + scaling
# NOTE(review): same double compensation (SMOTE + class_weight) as above.
models["Linear SVM + SMOTE"] = smote_pipeline(
    LinearSVC(
        class_weight='balanced',
        max_iter=5000,
        tol=1e-3,
        random_state=42
    ),
    sampling=0.3,
    scale=True
)

# 3) Random Forest (NO SMOTE, uses class_weight)
models["Random Forest (balanced)"] = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42
)

# 4) XGBoost (NO SMOTE, uses scale_pos_weight)
counter = Counter(y)
# Both classes must be present: with no positives the ratio below would raise
# a bare ZeroDivisionError, and with no negatives it would silently become 0.
if counter[0] == 0 or counter[1] == 0:
    raise ValueError(
        "Both classes (0 and 1) must be present in y to compute "
        f"scale_pos_weight; got class counts {dict(counter)}."
    )
# Negative-to-positive ratio, used to up-weight the rare positive class.
scale_pos_weight = counter[0] / counter[1]
print(f"scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

models["XGBoost (no SMOTE)"] = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)


scale_pos_weight for XGBoost: 211.10


### Cross-Validation

In [12]:
# Stratified 5-fold CV with a fixed seed, so every model sees the same folds
# and each fold keeps the class ratio of the full training set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics computed on each held-out fold (all for the positive class).
scoring = {
    "precision": "precision",
    "recall": "recall",
    "f1": "f1",
}

# Console label for each metric, padded so the printed values line up.
metric_labels = {
    "precision": "Precision:",
    "recall": "Recall:   ",
    "f1": "F1-score: ",
}

results = []

for name, model in models.items():
    print(f"\n=== Evaluating: {name} ===")
    cv_result = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )

    # One summary row per model: mean and standard deviation across folds.
    summary_row = {"model": name}
    for metric, metric_label in metric_labels.items():
        fold_scores = cv_result[f"test_{metric}"]
        fold_mean = fold_scores.mean()
        fold_std = fold_scores.std()
        summary_row[f"{metric}_mean"] = fold_mean
        summary_row[f"{metric}_std"] = fold_std
        print(f"{metric_label} {fold_mean:.4f} ± {fold_std:.4f}")

    results.append(summary_row)

results_df = pd.DataFrame(results)
results_df



=== Evaluating: LogReg + SMOTE ===



KeyboardInterrupt



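# Key Observations

> **Note:** the saved output of the cross-validation cell above ends in a `KeyboardInterrupt`, so the metrics quoted below are not reproduced in this notebook as saved. Re-run that cell to completion to regenerate them.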
### Logistic Regression + SMOTE

Very high recall (~0.85)

Extremely low precision (~0.06)

Produces too many false positives to be operationally useful

### Linear SVM + SMOTE

More balanced than logistic regression

Moderate precision and recall

Still underperforms compared to tree-based models

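### Random Forest (balanced)

Good precision (~0.54)

Lower recall (~0.40)

Performance strongly affected by oversampling. Note that this notebook trains Random Forest with `class_weight='balanced'` and no SMOTE, so this observation should be re-verified against the current configuration.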

### XGBoost (no SMOTE)

Best overall performance

Highest F1-score (~0.71)

Strong precision and recall balance

Handles class imbalance natively through `scale_pos_weight` (set to the negative/positive ratio, ≈ 211 here)

# Conclusion
XGBoost is the strongest baseline model and is selected as the primary candidate for further optimization.
Its superior performance, stability under class imbalance, and flexible tuning options make it the best foundation for the next stage of the pipeline.

The next notebook (03_modelTuning) will focus on hyperparameter optimization using RandomizedSearchCV to further improve performance before feature selection and threshold calibration.