# Read & Explore Model Predictions

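This notebook loads **prediction Parquet files** and **summary JSON files** from the pipeline output directory (paths below are relative to `BASE_DIR`, which defaults to the directory the notebook is run from):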

- `datamart/gold/model_predictions/<model_version>/<model_version>_predictions_YYYY_MM_DD.parquet`
- `datamart/gold/model_predictions/<model_version>/summary_YYYY-MM-DD.json`

It then:
1. Lists all available model versions / snapshot dates
2. Loads and concatenates predictions
3. Loads summaries into a table
4. Shows quick stats
5. Plots (a) score distribution and (b) average score by snapshot date


In [109]:

# --- Setup ---
from pathlib import Path
import json, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Repository root, resolved from the current working directory.
# Edit this if the notebook is launched from somewhere other than the repo root.
BASE_DIR = Path(".").resolve()
# Predictions are expected under <BASE_DIR>/datamart/gold/model_predictions.
PRED_DIR = BASE_DIR.joinpath("datamart", "gold", "model_predictions")

print("Looking for predictions under:", PRED_DIR)
if not PRED_DIR.exists():
    # Only a warning: later cells will simply find no files.
    print(
        "WARNING: Predictions directory does not exist yet. "
        "Adjust BASE_DIR if your repo is elsewhere."
    )


Looking for predictions under: /app/scripts/datamart/gold/model_predictions


In [110]:

# --- Discover files ---
# Recursive globs under PRED_DIR; sorted so listing and load order are deterministic.
pred_paths = sorted(PRED_DIR.glob("**/*_predictions_*.parquet"))
sum_paths = sorted(PRED_DIR.glob("**/summary_*.json"))

print(f"Found {len(pred_paths)} prediction file(s) and {len(sum_paths)} summary file(s).")

# Preview at most five files of each kind, shown relative to BASE_DIR.
for tag, found in (("PRED:", pred_paths), ("SUM :", sum_paths)):
    for path in found[:5]:
        print(tag, path.relative_to(BASE_DIR))


Found 3 prediction file(s) and 3 summary file(s).
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_04_01/credit_model_xgb_2024_04_01_predictions_2024_05_01.parquet
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_07_01/credit_model_xgb_2024_07_01_predictions_2024_06_01.parquet
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_07_01/credit_model_xgb_2024_07_01_predictions_2024_07_01.parquet
SUM : datamart/gold/model_predictions/credit_model_xgb_2024_04_01/summary_2024-05-01.json
SUM : datamart/gold/model_predictions/credit_model_xgb_2024_07_01/summary_2024-06-01.json
SUM : datamart/gold/model_predictions/credit_model_xgb_2024_07_01/summary_2024-07-01.json


## Predictions Registry

In [111]:

# --- Load & concatenate predictions ---
# Every readable file contributes one frame; unreadable files are reported and skipped.
pred_dfs = []
for pred_path in pred_paths:
    try:
        frame = pd.read_parquet(pred_path)
        # Fall back to the parent folder name when the file has no model_version column.
        if "model_version" not in frame.columns:
            frame["model_version"] = pred_path.parent.name
        # Normalise snapshot_date to datetime when the column is present.
        if "snapshot_date" in frame.columns:
            frame["snapshot_date"] = pd.to_datetime(frame["snapshot_date"])
    except Exception as err:
        print(f"Failed to read {pred_path}: {err}")
    else:
        pred_dfs.append(frame)

if not pred_dfs:
    # Nothing could be loaded: keep `preds` defined as an empty frame.
    preds = pd.DataFrame()
    print("No predictions loaded.")
else:
    preds = pd.concat(pred_dfs, ignore_index=True)
    print("Predictions loaded:", preds.shape)
    display(preds.head(10))


Predictions loaded: (1465, 5)


Unnamed: 0,Customer_ID,snapshot_date,model_version,predicted_default_risk,predicted_default
0,CUS_0x1011,2024-05-01,credit_model_xgb_2024_04_01,0.093188,0
1,CUS_0x1018,2024-05-01,credit_model_xgb_2024_04_01,0.619254,1
2,CUS_0x1041,2024-05-01,credit_model_xgb_2024_04_01,0.101161,0
3,CUS_0x105b,2024-05-01,credit_model_xgb_2024_04_01,0.06018,0
4,CUS_0x107c,2024-05-01,credit_model_xgb_2024_04_01,0.497473,0
5,CUS_0x1107,2024-05-01,credit_model_xgb_2024_04_01,0.699487,1
6,CUS_0x117f,2024-05-01,credit_model_xgb_2024_04_01,0.031496,0
7,CUS_0x11a4,2024-05-01,credit_model_xgb_2024_04_01,0.495613,0
8,CUS_0x123d,2024-05-01,credit_model_xgb_2024_04_01,0.675317,1
9,CUS_0x1281,2024-05-01,credit_model_xgb_2024_04_01,0.460368,0


In [112]:
# --- Load summaries ---
# Each summary JSON becomes one record, tagged with the model version taken from
# its parent folder. Unreadable files are reported and skipped (best effort).
sums = []
for s in sum_paths:
    try:
        # Read as UTF-8 explicitly rather than relying on the platform's default encoding.
        with open(s, "r", encoding="utf-8") as f:
            j = json.load(f)
        # Attach model_version from parent folder
        model_version = s.parent.name
        j["model_version"] = model_version
        sums.append(j)
    except Exception as e:
        print(f"Failed to read {s}: {e}")

if sums:
    # Nested objects are flattened into dotted column names (e.g. "sources.model_file").
    summaries = pd.json_normalize(sums, sep=".")
    # model_version is always present (attached above); evaluation_time is only
    # used as a sort key when the summary files actually provide it, so a
    # missing column no longer raises KeyError in sort_values.
    sort_cols = ["model_version"]
    if "evaluation_time" in summaries.columns:
        # Coerce evaluation_time to datetime for sorting; unparseable values become NaT.
        summaries["evaluation_time"] = pd.to_datetime(summaries["evaluation_time"], errors="coerce")
        sort_cols.insert(0, "evaluation_time")
    print("Summaries loaded:", summaries.shape)
    display(summaries.sort_values(sort_cols).tail(20))
else:
    # Keep `summaries` defined as an empty frame when nothing was loaded.
    summaries = pd.DataFrame()
    print("No summaries loaded.")


Summaries loaded: (3, 20)


Unnamed: 0,model_version,evaluation_time,sample_size,predicted_default_risk_min,predicted_default_risk_max,predicted_default_risk_mean,threshold,auc_roc,accuracy,precision,recall,f1_macro,f1_weighted,tp,tn,fp,fn,sources.model_file,sources.features_dir,sources.labels_dir
0,credit_model_xgb_2024_04_01,2024-05-01,491,0.013396,0.899246,0.254559,0.5,0.910256,0.835031,0.816327,0.559441,0.777294,0.824638,80,330,18,63,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
1,credit_model_xgb_2024_07_01,2024-06-01,489,0.008839,0.937832,0.273539,0.5,0.876367,0.836401,0.808696,0.615894,0.793444,0.829466,93,316,22,58,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
2,credit_model_xgb_2024_07_01,2024-07-01,485,0.012511,0.947349,0.25443,0.5,0.932775,0.859794,0.88,0.611111,0.813824,0.851401,88,329,12,56,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/


## Evaluation Summary

In [114]:
# Compact view of the summaries table: identifying columns plus headline metrics.
summaries.loc[
    :,
    [
        "model_version",
        "evaluation_time",
        "sample_size",
        "predicted_default_risk_mean",
        "auc_roc",
        "threshold",
        "precision",
        "recall",
        "f1_macro",
        "f1_weighted",
    ],
]

Unnamed: 0,model_version,evaluation_time,sample_size,predicted_default_risk_mean,auc_roc,threshold,precision,recall,f1_macro,f1_weighted
0,credit_model_xgb_2024_04_01,2024-05-01,491,0.254559,0.910256,0.5,0.816327,0.559441,0.777294,0.824638
1,credit_model_xgb_2024_07_01,2024-06-01,489,0.273539,0.876367,0.5,0.808696,0.615894,0.793444,0.829466
2,credit_model_xgb_2024_07_01,2024-07-01,485,0.25443,0.932775,0.5,0.88,0.611111,0.813824,0.851401
