# Model Tuning

### Load data & imports + Environment check

In [None]:
import os
import sys
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer

import xgboost as xgb
import joblib

# Report the interpreter and library versions in use, so results can be tied
# to a specific environment.
print("Environment check:")
for _label, _version in (
    ("Python:", sys.version.split()[0]),
    ("Pandas:", pd.__version__),
    ("NumPy:", np.__version__),
    ("XGBoost:", xgb.__version__),
):
    print(_label, _version)


### Load Train Split

In [None]:
# Resolve the project root: when the working directory is the "notebooks"
# folder the root is its parent, otherwise the working directory itself.
cwd = os.getcwd()
project_root = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd

print("Project root:", project_root)

# Location of the processed training split.
data_dir = os.path.join(project_root, "data", "processed")
train_path = os.path.join(data_dir, "train.csv")

# Stop early with an actionable message when the split is absent.
train_file_missing = not os.path.exists(train_path)
if train_file_missing:
    raise FileNotFoundError(f"Missing {train_path}. Run 01_preprocessing.ipynb first.")

train_df = pd.read_csv(train_path)
print("Train shape:", train_df.shape)
# Last expression of the cell: shows the first rows in the notebook output.
train_df.head()


### Build feature matrix 

In [None]:
# The target column is mandatory; abort immediately when it is missing.
if "label" not in train_df.columns:
    raise KeyError("Expected 'label' column in train.csv")

# Target vector, cast to integers.
y_train = train_df["label"].astype(int)

# Columns excluded from the features: always the target, plus "insider"
# whenever that column is present in the frame.
drop_cols = ["label", "insider"] if "insider" in train_df.columns else ["label"]

# Everything that is not dropped becomes a feature.
X_train = train_df.drop(columns=drop_cols)

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("\nLabel distribution:")
label_counts = y_train.value_counts()
print(label_counts)


In [None]:
# Class-imbalance weight for XGBoost: the ratio of negative (label 0) to
# positive (label 1) training samples.
counter = Counter(y_train)
n_neg = counter[0]
n_pos = counter[1]

# Both classes must be present. Without positives the ratio is undefined
# (division by zero); without negatives it collapses to 0, which would
# remove all weight from the positive class.
if n_pos == 0 or n_neg == 0:
    raise ValueError(
        "Cannot compute scale_pos_weight: training labels must contain both "
        f"classes 0 and 1 (found counts: {dict(counter)})."
    )

scale_pos_weight = n_neg / n_pos

print("Class counts:", counter)
print(f"scale_pos_weight: {scale_pos_weight:.2f}")


### Define Base XGBoost Model and Search Space

In [None]:
# Centre of the class-weight grid: the negative/positive ratio computed from
# the training labels.
spw = scale_pos_weight

# Untuned estimator used as the template for the search; only the
# hyperparameters listed in `param_dist` are varied.
xgb_base = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=42,
    eval_metric="logloss",
    objective="binary:logistic",
)

# Discrete candidate values for each tuned hyperparameter.
param_dist = dict(
    learning_rate=[0.03, 0.05, 0.07, 0.1],
    max_depth=[3, 4, 5, 6],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8, 1.0],
    n_estimators=[300, 400, 500, 600, 700],
    gamma=[0, 0.5, 1, 3],
    reg_lambda=[1, 3, 5, 10],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    # Class weights from half of the computed ratio up to 1.25 times it.
    scale_pos_weight=[factor * spw for factor in (0.5, 0.75, 1.0, 1.25)],
)

### Setup Randomized CV Search

In [None]:
# Shuffled 5-fold stratified splitter with a fixed seed.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)

# Model-selection metric: F1 with f1_score's default settings.
f1_scorer = make_scorer(f1_score)

# Randomized search: 40 sampled configurations, each scored on the 5 folds
# (200 fits in total), with a fixed seed for the sampling.
# NOTE(review): the base estimator also requests n_jobs=-1, so the two levels
# of parallelism may compete for cores; confirm this is intended.
search = RandomizedSearchCV(
    xgb_base,
    param_dist,
    n_iter=40,
    scoring=f1_scorer,
    cv=cv,
    random_state=42,
    verbose=1,
    n_jobs=-1,
)


### Run Tuning
This step will also take a while, since it has to run 200 fits (40 sampled parameter combinations × 5 CV folds). Again, don't be afraid!

**If you don't want to wait, you can hardcode the best params provided in the README**

In [None]:
# Run the randomized search on the training data.
search.fit(X_train, y_train)

# Summarise the winning configuration.
best_cv_f1 = search.best_score_
print("Best mean CV F1:", best_cv_f1)
print("Best params:")
# Last expression of the cell: the notebook displays the parameter dict.
search.best_params_


### Train tuned model using best parameters

In [None]:
# Hard-coded hyperparameters, used only when the randomized search has not
# been run in this session (the documented "skip the wait" path).
HARDCODED_BEST_PARAMS = {
    "subsample": 0.7,
    "scale_pos_weight": 158.325,
    "reg_lambda": 3,
    "reg_alpha": 0,
    "n_estimators": 300,
    "min_child_weight": 1,
    "max_depth": 6,
    "learning_rate": 0.1,
    "gamma": 0,
    "colsample_bytree": 0.8,
}

try:
    best_params = search.best_params_
except (NameError, AttributeError):
    # NameError: the search object was never created in this session.
    # AttributeError: the search object exists but has not been fitted.
    best_params = dict(HARDCODED_BEST_PARAMS)
    print("Search results not available; using hard-coded best params.")

# Final model: the selected hyperparameters plus the same fixed settings as
# the base estimator.
xgb_tuned = xgb.XGBClassifier(
    **best_params,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

# Fit on the full training split; as the cell's last expression the fitted
# estimator is also displayed by the notebook.
xgb_tuned.fit(X_train, y_train)


### Save tuned model & Params

In [None]:
# Make sure the output directory exists (no error if it is already there).
models_dir = os.path.join(project_root, "models")
os.makedirs(models_dir, exist_ok=True)

# Serialise the fitted classifier with joblib.
model_path = os.path.join(models_dir, "xgb_tuned_model.joblib")
joblib.dump(value=xgb_tuned, filename=model_path)

print("Saved tuned model to:", model_path)

In [None]:
import json

# Store the selected hyperparameters next to the model as 4-space-indented JSON.
params_path = os.path.join(models_dir, "xgb_best_params.json")
with open(params_path, "w") as f:
    f.write(json.dumps(best_params, indent=4))

print("Saved best params to:", params_path)
# Last expression of the cell: the notebook displays the saved dict.
best_params

# Conclusion
In this notebook, we tuned the XGBoost classifier using a 40-iteration
`RandomizedSearchCV` over a 5-fold Stratified Cross-Validation split on the
training data.

The search optimized for F1 score, which balances precision and recall on
the highly imbalanced CERT insider-threat dataset. The best parameters were then used to train a final, tuned model on the training set.

The tuned model and its hyperparameters are saved to:

- `models/xgb_tuned_model.joblib`
- `models/xgb_best_params.json`

These will be used in:

- Notebook 04 for feature importance and feature selection
- Notebook 05 for threshold evaluation and calibration