# 03 – Model Prototyping (MP)

This notebook prototypes candidate models and hyperparameters for probability-of-default (PD) prediction.

Goals:
- Compare baseline models (Logistic Regression, XGBoost, LightGBM if available)
- Use time-based splits to mimic production
- Identify a strong configuration to promote into `src/models/train.py` and `src/models/tune.py`

In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

try:
    import lightgbm as lgb

    HAS_LGB = True
except ImportError:
    HAS_LGB = False

# When the kernel starts inside notebooks/, step up one directory so the
# relative data paths below resolve from the project root.
if Path.cwd().name == "notebooks":
    os.chdir(os.pardir)

# Locations of the processed feature table, relative to the working directory.
DATA_DIR = Path("data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
PROCESSED_FILE = PROCESSED_DIR.joinpath("loans_features.parquet")

# Load the feature table and preview its first rows.
df = pd.read_parquet(PROCESSED_FILE)
df.head()

Unnamed: 0,loan_amnt,annual_inc,int_rate,term,loan_status,dti,grade,sub_grade,emp_length,home_ownership,issue_d,default,loan_to_income,term_months,grade_numeric,sub_grade_numeric
0,3600.0,55000.0,13.99,36 months,Fully Paid,5.91,C,C4,10+ years,MORTGAGE,Dec-2015,0,6.5455,36,3,34
1,24700.0,65000.0,11.99,36 months,Fully Paid,16.06,C,C1,10+ years,MORTGAGE,Dec-2015,0,38.0,36,3,31
2,20000.0,63000.0,10.78,60 months,Fully Paid,10.78,B,B4,10+ years,MORTGAGE,Dec-2015,0,31.746,60,2,24
3,10400.0,104433.0,22.45,60 months,Fully Paid,25.37,F,F1,3 years,MORTGAGE,Dec-2015,0,9.9585,60,6,61
4,11950.0,34000.0,13.44,36 months,Fully Paid,10.2,C,C3,4 years,RENT,Dec-2015,0,35.1471,36,3,33


## Train / Test Split (Time-Based)

In [None]:
# Name of the label column.
target_col = "default"

# Collect every column whose lower-cased name is one of the accepted
# date-column names, then keep the first match (None when there is none).
_accepted_date_names = ("issue_d", "origination_date")
date_col_candidates = [name for name in df.columns if name.lower() in _accepted_date_names]
date_col = next(iter(date_col_candidates), None)
date_col

In [None]:
# Columns that describe the loan's outcome and therefore must not be features.
# `loan_status` (e.g. "Fully Paid") appears next to the `default` label in the
# data preview and is presumably what the label was derived from — confirm
# against the feature pipeline. Leaving it in X leaks the target to the model.
leakage_cols = ["loan_status"]

if date_col is not None:
    # Parse the date column and order rows chronologically so the positional
    # split below puts earlier loans in train and later loans in test.
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col)
else:
    # Without a date column the 80/20 cut below follows the existing row order.
    print("No date column found; the split below is positional, not time-based.")

# Features: everything except the label and the known outcome columns.
# errors="ignore" keeps the cell working if a leakage column is already absent.
X = df.drop(columns=[target_col]).drop(columns=leakage_cols, errors="ignore")
y = df[target_col]

# First 80% of rows -> train, remaining 20% -> test.
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx].copy(), X.iloc[split_idx:].copy()
y_train, y_test = y.iloc[:split_idx].copy(), y.iloc[split_idx:].copy()
X_train.shape, X_test.shape

## Preprocessing Pipeline

In [None]:
# Partition the training columns by dtype: numeric/boolean columns are
# standardised, object/category columns are one-hot encoded.
num_cols = list(X_train.select_dtypes(include=["number", "bool"]).columns)
cat_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)

numeric_step = ("num", StandardScaler(), num_cols)
# handle_unknown="ignore": categories not seen during fit do not raise at transform time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## Baseline: Logistic Regression

In [None]:
# Baseline: the shared preprocessor followed by a logistic regression.
log_reg = LogisticRegression(max_iter=1000, n_jobs=-1)
pipe_lr = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", log_reg),
    ]
)
pipe_lr.fit(X_train, y_train)

# Score X_test with column 1 of predict_proba and compute both ranking metrics.
y_proba_lr = pipe_lr.predict_proba(X_test)[:, 1]
roc_auc_lr, pr_auc_lr = (
    roc_auc_score(y_test, y_proba_lr),
    average_precision_score(y_test, y_proba_lr),
)
roc_auc_lr, pr_auc_lr

## XGBoost Prototype

In [None]:
# Hyperparameters for the XGBoost prototype, gathered in one place so they are
# easy to read and copy elsewhere.
xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 300,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
}
xgb = XGBClassifier(**xgb_params)

# Same preprocessing as the other prototypes, with XGBoost as the estimator.
pipe_xgb = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", xgb),
    ]
)
pipe_xgb.fit(X_train, y_train)

# Score X_test with column 1 of predict_proba and compute both ranking metrics.
y_proba_xgb = pipe_xgb.predict_proba(X_test)[:, 1]
roc_auc_xgb, pr_auc_xgb = (
    roc_auc_score(y_test, y_proba_xgb),
    average_precision_score(y_test, y_proba_xgb),
)
roc_auc_xgb, pr_auc_xgb

## LightGBM Prototype (if installed)

In [None]:
# Optional LightGBM prototype; runs only when the import at the top succeeded.
if HAS_LGB:
    lgb_model = lgb.LGBMClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        # LightGBM applies row subsampling only when subsample_freq > 0; with
        # the default of 0 the `subsample` value above would be ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        random_state=42,
    )

    pipe_lgb = Pipeline(steps=[("preprocess", preprocessor), ("model", lgb_model)])

    pipe_lgb.fit(X_train, y_train)
    # Score X_test with column 1 of predict_proba and compute both metrics.
    y_proba_lgb = pipe_lgb.predict_proba(X_test)[:, 1]
    roc_auc_lgb = roc_auc_score(y_test, y_proba_lgb)
    pr_auc_lgb = average_precision_score(y_test, y_proba_lgb)
    lgb_scores = (roc_auc_lgb, pr_auc_lgb)
else:
    lgb_scores = None
    print("LightGBM not installed; skipping LGB prototype.")

# A bare expression nested inside the `if` body is not rendered by the
# notebook, so expose the scores as the cell's final top-level expression.
# When LightGBM is unavailable this is None and nothing is displayed.
lgb_scores

## Model Comparison

In [None]:
# One row per fitted model; metrics come from the prototype cells above.
results = [
    {"model": "LogisticRegression", "roc_auc": roc_auc_lr, "pr_auc": pr_auc_lr},
    {"model": "XGBoost", "roc_auc": roc_auc_xgb, "pr_auc": pr_auc_xgb},
]

# Gate on the import flag rather than catching NameError: a missing variable
# then fails loudly, and a stale `roc_auc_lgb` left in the kernel from an
# earlier session cannot slip into the table when LightGBM is unavailable.
if HAS_LGB:
    results.append({"model": "LightGBM", "roc_auc": roc_auc_lgb, "pr_auc": pr_auc_lgb})

# Rank models from best to worst ROC AUC.
pd.DataFrame(results).sort_values("roc_auc", ascending=False)

## Next Steps

- Take the best-performing configuration and promote its hyperparameters into `src/models/tune.py`.
- Ensure feature engineering used here matches `src/features/transforms.py`.
- Re-run the full pipeline (`tune → train → evaluate`) and update documentation/plots.