
# Model & Split Sanity Check

This notebook helps you **verify the saved model pickle (`.pkl`)** and the **train/test/OOT CSV dumps**.

## What it does
- Loads the pickled artefact and prints metadata (config, metrics).
- Shows pipeline structure and feature columns.
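- Loads `X_train.csv`, `X_test.csv`, `X_oot.csv` (plus the matching `y_train.csv`, `y_test.csv`, `y_oot.csv` label files, each with a `label` column) and checks column schemas, basic stats, and leakage risks.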
- (Optional) Runs the saved pipeline to compute quick AUCs on the CSVs to cross-check with stored metrics.

## How to use
1. Set the two paths below:
   - `MODEL_PKL`: path to your saved model, e.g. `model_bank/credit_model_2024_07_01.pkl`
   - `DUMPS_DIR`: directory that holds `X_train.csv`, `X_test.csv`, `X_oot.csv` and the corresponding `y_train.csv`, `y_test.csv`, `y_oot.csv`, e.g. `model_bank/credit_model_2024_07_01/`
2. Run all cells.


In [3]:

# --- Configure paths ---
# Path of the pickled model artefact; the next cell opens and unpickles it.
MODEL_PKL = "scripts/model_bank/credit_model_2024_07_01.pkl"   # <-- change me
# Directory passed to load_split_csvs; it must contain the X_*.csv and
# y_*.csv files for the train, test and oot splits.
DUMPS_DIR = "scripts/model_bank/credit_model_2024_07_01/"      # <-- change me

import os, json, pickle
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.metrics import roc_auc_score


In [4]:

# SECURITY: unpickling can execute arbitrary code stored in the file, so only
# load artefacts that come from a trusted source.
try:
    with open(MODEL_PKL, "rb") as f:
        artefact = pickle.load(f)
except ModuleNotFoundError as exc:
    # A pickle records the import path of every class it contains. When one of
    # those modules cannot be imported here (for example
    # "No module named 'numpy._core'"), the installed library versions most
    # likely differ from the ones used when the artefact was written.
    raise ModuleNotFoundError(
        f"Could not unpickle {MODEL_PKL!r}: {exc}. "
        "The artefact was probably saved with different library versions "
        "(e.g. numpy / scikit-learn); load it in an environment that matches "
        "the one used for training.",
        name=exc.name,
    ) from exc

# The artefact is expected to be a mapping with at least the keys
# "config", "results", "pipeline" and "feature_columns" (all read below).
print("Keys in artefact:", list(artefact.keys()))
print("\n== CONFIG ==")
pprint(artefact["config"])
print("\n== TRAIN/TEST/OOT results ==")
pprint(artefact["results"])

# `pipe` is reused by the evaluation cell further down.
pipe = artefact["pipeline"]
print("\n== Pipeline ==")
print(pipe)

print("\n== Feature columns ==")
feature_columns = artefact["feature_columns"]
# Only the first ten numeric names are printed to keep the output short.
print("Numeric:", feature_columns["numeric"][:10], "... total", len(feature_columns["numeric"]))
print("Categorical:", feature_columns["categorical"])


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ModuleNotFoundError: No module named 'numpy._core'

In [None]:

def load_split_csvs(base_dir):
    """Read the feature and label CSV dumps for the train, test and OOT splits.

    For every split the feature file is read in full, and the ``label``
    column of the label file is extracted and cast to ``int``.

    Parameters
    ----------
    base_dir : str
        Directory that contains the six CSV files.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, x_oot, y_oot)`` where each
        ``x_*`` is a DataFrame and each ``y_*`` is an integer Series.
    """
    file_pairs = (
        ("X_train.csv", "y_train.csv"),
        ("X_test.csv", "y_test.csv"),
        ("X_oot.csv", "y_oot.csv"),
    )
    loaded = []
    for features_file, labels_file in file_pairs:
        features = pd.read_csv(os.path.join(base_dir, features_file))
        labels_frame = pd.read_csv(os.path.join(base_dir, labels_file))
        loaded.append(features)
        loaded.append(labels_frame["label"].astype(int))
    return tuple(loaded)

X_train, y_train, X_test, y_test, X_oot, y_oot = load_split_csvs(DUMPS_DIR)

# Pair every feature frame with its labels so each check runs once per split.
split_frames = {
    "train": (X_train, y_train),
    "test": (X_test, y_test),
    "oot": (X_oot, y_oot),
}

print("Shapes:")
for split_name, (features, labels) in split_frames.items():
    # Names are left-padded to five characters so the columns line up.
    print(f"  X_{split_name:<5}", features.shape, f"y_{split_name:<5}", labels.shape)

# Features and labels are stored in separate files; a row-count mismatch means
# they no longer line up, and any metric computed from them would be invalid.
for split_name, (features, labels) in split_frames.items():
    assert len(features) == len(labels), (
        f"Row-count mismatch in {split_name}: "
        f"X has {len(features)} rows, y has {len(labels)}"
    )

print("\nLabel rates:")
for split_name, (_, labels) in split_frames.items():
    print(f"  {split_name:<5}:", labels.mean().round(4))

# Quickly check that sets use the same feature columns (names and order)
assert list(X_train.columns) == list(X_test.columns) == list(X_oot.columns), "Column mismatch across splits!"
print("\nColumn schema consistent across splits ✓")


In [None]:

# Preview the first three rows of each split, in train / test / oot order.
display(X_train.head(3), X_test.head(3), X_oot.head(3))

# Summary statistics for the numeric columns of the training frame.
num_cols = [
    col
    for col, col_dtype in X_train.dtypes.items()
    if np.issubdtype(col_dtype, np.number)
]
# Transposed so that each row of `desc` describes one feature.
desc = X_train.loc[:, num_cols].describe().T
# Only the first ten features are shown to keep the output compact.
display(desc.head(10))


In [None]:

# Columns whose name contains "date" or "snapshot" (case-insensitive) are
# treated as date-like; their min/max are printed for every split.
date_cols = [
    c for c in X_train.columns
    if any(token in c.lower() for token in ("date", "snapshot"))
]
if not date_cols:
    print("No date-like columns detected in X_*.")
else:
    print("Date-like columns detected:", date_cols)
    for dc in date_cols:
        try:
            # errors='coerce' turns unparseable values into NaT instead of raising.
            tr, te, oo = (
                pd.to_datetime(frame[dc], errors='coerce')
                for frame in (X_train, X_test, X_oot)
            )
            print(f"  {dc}: train [{tr.min()} .. {tr.max()}], test [{te.min()} .. {te.max()}], oot [{oo.min()} .. {oo.max()}]")
        except Exception as e:
            # Best effort: report the problem and carry on with the next column.
            print(f"  Skipping {dc} (parse error):", e)


In [None]:

def quick_eval(pipe, X, y, name):
    """Score one split with the pipeline and print its AUC and Gini.

    Parameters
    ----------
    pipe : object
        Fitted estimator exposing ``predict_proba``.
    X : DataFrame
        Features passed unchanged to ``pipe.predict_proba``.
    y : array-like
        True labels for the rows of ``X``.
    name : str
        Split label used in the printed line (right-aligned to width 6).

    Returns
    -------
    tuple
        ``(auc, gini)`` where ``gini = 2 * auc - 1``.
    """
    # Column index 1 of predict_proba is used as the score
    # (presumably the positive class -- confirm against the pipeline's classes_).
    scores = pipe.predict_proba(X)[:, 1]
    auc_value = roc_auc_score(y, scores)
    gini_value = 2 * auc_value - 1
    print(f"{name:>6} AUC={auc_value:.4f} | Gini={gini_value:.4f}")
    return auc_value, gini_value

# Re-score every split with the loaded pipeline; each call prints one line
# and returns the (AUC, Gini) pair for that split.
print("\n== Quick AUC cross-checks ==")
auc_tr, g_tr = quick_eval(pipe=pipe, X=X_train, y=y_train, name="TRAIN")
auc_te, g_te = quick_eval(pipe=pipe, X=X_test, y=y_test, name=" TEST")
auc_oo, g_oo = quick_eval(pipe=pipe, X=X_oot, y=y_oot, name="  OOT")


In [None]:

# If keys are present in the dumps, check for overlap across splits
possible_keys = ["Customer_ID", "label_snapshot_date"]
# Only the training frame is inspected to decide which key columns exist.
present_keys = [k for k in possible_keys if k in X_train.columns]

if present_keys:
    print("Checking overlaps on keys:", present_keys)

    def kset(df):
        """Return the set of key tuples in ``df``, with every value cast to str."""
        # itertuples(name=None) already yields plain tuples, which are hashable.
        return set(df[present_keys].astype(str).itertuples(index=False, name=None))

    # Build each split's key set once and reuse it for all pairwise comparisons.
    keys_train, keys_test, keys_oot = kset(X_train), kset(X_test), kset(X_oot)

    inter_train_test = keys_train & keys_test
    inter_train_oot  = keys_train & keys_oot
    inter_test_oot   = keys_test & keys_oot
    print("  train ∩ test:", len(inter_train_test))
    print("  train ∩ oot :", len(inter_train_oot))
    print("  test  ∩ oot :", len(inter_test_oot))
else:
    print("Key columns not found in X_*; skipping overlap checks.")
