Modeling (baseline + NN) & Inference

Setup & load

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path

# ROOT is the parent of the current working directory, resolved to an absolute path.
ROOT = Path("..").resolve()
DATA = ROOT.joinpath("data")
FIGS = ROOT.joinpath("reports", "figures")
# Create the figure directory (with any missing parents); a no-op when it already exists.
FIGS.mkdir(exist_ok=True, parents=True)
def savefig(fig, name):
    """Save ``fig`` as ``<name>.png`` inside ``FIGS`` and print the destination.

    The image is written at 300 dpi with ``bbox_inches="tight"``.
    Nothing is returned.
    """
    destination = FIGS / f"{name}.png"
    fig.savefig(destination, bbox_inches="tight", dpi=300)
    print("Saved:", destination)

def read_data(path: Path) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, skipping an optional one-line banner.

    The first line of the file is inspected; when it begins (case-insensitively)
    with ``"data derived"`` it is treated as a banner and skipped, so the
    following line becomes the CSV header.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame (read with ``low_memory=False``).
    """
    # Decode the probe with utf-8-sig so a leading byte-order mark does not end
    # up in front of the banner text and defeat the startswith() check below.
    # Undecodable bytes are dropped because only the banner prefix matters here.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        first = f.readline()
    skip = 1 if first.lower().startswith("data derived") else 0
    return pd.read_csv(path, low_memory=False, skiprows=skip)

# Read the training and testing CSVs through read_data.
train = read_data(DATA / "training_loan_data.csv")
test = read_data(DATA / "testing_loan_data.csv")

# First training column whose name equals "bad_flag" after trimming whitespace
# and lower-casing; None when no column matches.
target_col = next(
    filter(lambda col: col.strip().lower() == "bad_flag", train.columns),
    None,
)
assert target_col

# Of "id" and "member_id", keep those present in the training frame.
drop_cols = [col for col in ("id", "member_id") if col in train.columns]

# Cell output: both frame shapes and the resolved label column name.
train.shape, test.shape, target_col


((199121, 23), (102505, 23), 'bad_flag')

Logistic Regression baseline (preprocessing + ROC figure)

In [None]:
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

# Coerce the label column to numeric; entries that cannot be parsed become NaN.
y_raw = pd.to_numeric(train[target_col], errors="coerce")

# Keep only rows that still have a label after coercion.
mask = y_raw.notna()
y = y_raw[mask].astype(int)

# Features: the same rows, minus the dropped columns and the label itself.
X = train.loc[mask].drop(columns=[*drop_cols, target_col])

# Object-dtype columns are treated as categorical; everything else as numeric.
cat_cols = [name for name in X.columns if X[name].dtype == "object"]
num_cols = [name for name in X.columns if name not in cat_cols]

# Preprocessing applied per column group:
#   num: median imputation, then scaling with with_mean=False
#   cat: most-frequent imputation, then one-hot encoding with handle_unknown="ignore"
pre = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    ("imp", SimpleImputer(strategy="median")),
                    ("sc", StandardScaler(with_mean=False)),
                ]
            ),
            num_cols,
        ),
        (
            "cat",
            Pipeline(
                steps=[
                    ("imp", SimpleImputer(strategy="most_frequent")),
                    ("oh", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            cat_cols,
        ),
    ]
)

# Hold out 20% of the rows for validation, stratified on the label, with a fixed seed.
X_tr, X_va, y_tr, y_va = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Preprocessing and logistic regression in one pipeline, fitted on the training split only.
clf = Pipeline(steps=[("pre", pre), ("lr", LogisticRegression(max_iter=400))])
clf.fit(X_tr, y_tr)

# Column index 1 of predict_proba is used as the score on the validation split.
va_probs = clf.predict_proba(X_va)[:, 1]
va_auc = roc_auc_score(y_va, va_probs)
print("Validation AUC (LR):", round(va_auc, 4))

# Classification report for hard predictions at a 0.5 threshold on the score.
print(
    classification_report(
        y_va,
        (va_probs >= 0.5).astype(int),
        digits=3,
    )
)

# ROC curve on the validation split, saved to disk and then displayed.
fpr, tpr, _ = roc_curve(y_va, va_probs)

fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.set_title(f"ROC — Logistic (AUC={va_auc:.3f})")

plt.tight_layout()
savefig(fig, "roc_logistic_baseline")
plt.show()