# Read & Explore Model Predictions

This notebook loads **predictions Parquet files** and **summary JSONs** from your pipeline output directory:

- `datamart/gold/model_predictions/<model_version>/<model_version>_predictions_YYYY_MM_DD.parquet`
- `datamart/gold/model_predictions/<model_version>/summary_YYYY-MM-DD.json`

It then:
1. Lists all available model versions / snapshot dates
2. Loads and concatenates predictions
3. Loads summaries into a table
4. Shows quick stats
5. Plots (a) score distribution and (b) average score by snapshot date


In [1]:
# --- Setup ---
from pathlib import Path
import json, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Root of the repository checkout; change this if the notebook is run from elsewhere.
BASE_DIR = Path(".").resolve()
# Gold-layer folder expected to hold one sub-directory per model version.
PRED_DIR = BASE_DIR.joinpath("datamart", "gold", "model_predictions")

print("Looking for predictions under:", PRED_DIR)
# Warn (rather than fail) so the reader can fix BASE_DIR and re-run this cell.
if not PRED_DIR.exists():
    print("WARNING: Predictions directory does not exist yet. Adjust BASE_DIR if your repo is elsewhere.")


Looking for predictions under: /app/scripts/datamart/gold/model_predictions


In [2]:

# --- Discover files ---
# Recursive search below PRED_DIR; sorting gives a stable, reproducible listing order.
pred_paths = sorted(PRED_DIR.rglob("*_predictions_*.parquet"))
sum_paths = sorted(PRED_DIR.rglob("summary_*.json"))

print(f"Found {len(pred_paths)} prediction file(s) and {len(sum_paths)} summary file(s).")
# Preview at most five entries of each kind, shown relative to the repo root.
for label, found in (("PRED:", pred_paths), ("SUM :", sum_paths)):
    for path in found[:5]:
        print(label, path.relative_to(BASE_DIR))


Found 8 prediction file(s) and 8 summary file(s).
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_03_01/credit_model_xgb_2024_03_01_predictions_2024_03_01.parquet
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_09_01/credit_model_xgb_2024_09_01_predictions_2024_04_01.parquet
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_09_01/credit_model_xgb_2024_09_01_predictions_2024_05_01.parquet
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_12_01/credit_model_xgb_2024_12_01_predictions_2024_06_01.parquet
PRED: datamart/gold/model_predictions/credit_model_xgb_2024_12_01/credit_model_xgb_2024_12_01_predictions_2024_08_01.parquet
SUM : datamart/gold/model_predictions/credit_model_xgb_2024_03_01/summary_2024-03-01.json
SUM : datamart/gold/model_predictions/credit_model_xgb_2024_09_01/summary_2024-04-01.json
SUM : datamart/gold/model_predictions/credit_model_xgb_2024_09_01/summary_2024-05-01.json
SUM : datamart/gold/model_predictions/credit_model_xgb_

## Predictions Registry

In [3]:

# --- Load & concatenate predictions ---
# Each readable Parquet file contributes one frame; unreadable files are reported and skipped.
pred_dfs = []
for pred_file in pred_paths:
    try:
        frame = pd.read_parquet(pred_file)
        # Files without a model_version column inherit it from their parent folder name
        if "model_version" not in frame.columns:
            frame["model_version"] = pred_file.parent.name
        # Normalise snapshot_date to datetime when the column is present
        if "snapshot_date" in frame.columns:
            frame["snapshot_date"] = pd.to_datetime(frame["snapshot_date"])
    except Exception as err:
        print(f"Failed to read {pred_file}: {err}")
    else:
        pred_dfs.append(frame)

if not pred_dfs:
    # Keep `preds` defined so later cells can still reference it
    preds = pd.DataFrame()
    print("No predictions loaded.")
else:
    preds = pd.concat(pred_dfs, ignore_index=True)
    print("Predictions loaded:", preds.shape)
    display(preds.head(10))


Predictions loaded: (3939, 5)


Unnamed: 0,Customer_ID,snapshot_date,model_version,predicted_default_risk,predicted_default
0,CUS_0x1057,2024-03-01,credit_model_xgb_2024_03_01,0.070483,0
1,CUS_0x1169,2024-03-01,credit_model_xgb_2024_03_01,0.048414,0
2,CUS_0x117d,2024-03-01,credit_model_xgb_2024_03_01,0.029788,0
3,CUS_0x1182,2024-03-01,credit_model_xgb_2024_03_01,0.074065,0
4,CUS_0x11c6,2024-03-01,credit_model_xgb_2024_03_01,0.082058,0
5,CUS_0x1246,2024-03-01,credit_model_xgb_2024_03_01,0.24642,0
6,CUS_0x12bf,2024-03-01,credit_model_xgb_2024_03_01,0.225495,0
7,CUS_0x12ec,2024-03-01,credit_model_xgb_2024_03_01,0.537461,1
8,CUS_0x1330,2024-03-01,credit_model_xgb_2024_03_01,0.063383,0
9,CUS_0x141c,2024-03-01,credit_model_xgb_2024_03_01,0.097078,0


In [4]:
# --- Load summaries ---
# Each summary JSON becomes one record; unreadable or malformed files are reported and skipped.
sums = []
for summary_path in sum_paths:
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's default encoding
        with open(summary_path, "r", encoding="utf-8") as fh:
            record = json.load(fh)
        if not isinstance(record, dict):
            raise TypeError(f"expected a JSON object, got {type(record).__name__}")
        # The parent folder name is used as model_version, replacing any value stored in the file
        record["model_version"] = summary_path.parent.name
        sums.append(record)
    except Exception as err:
        print(f"Failed to read {summary_path}: {err}")

if sums:
    # Nested objects are flattened into dotted column names (e.g. "sources.model_file")
    summaries = pd.json_normalize(sums, sep=".")
    # Coerce evaluation_time to datetime for sorting; unparseable values become NaT
    if "evaluation_time" in summaries.columns:
        summaries["evaluation_time"] = pd.to_datetime(summaries["evaluation_time"], errors="coerce")
    print("Summaries loaded:", summaries.shape)
    # Sort only by keys that are actually present, so summaries lacking
    # evaluation_time are still displayed instead of raising KeyError
    sort_cols = [col for col in ("evaluation_time", "model_version") if col in summaries.columns]
    display(summaries.sort_values(sort_cols).tail(20))
else:
    # Keep `summaries` defined so later cells can still reference it
    summaries = pd.DataFrame()
    print("No summaries loaded.")


Summaries loaded: (8, 20)


Unnamed: 0,model_version,evaluation_time,sample_size,predicted_default_risk_min,predicted_default_risk_max,predicted_default_risk_mean,threshold,auc_roc,accuracy,precision,recall,f1_macro,f1_weighted,tp,tn,fp,fn,sources.model_file,sources.features_dir,sources.labels_dir
0,credit_model_xgb_2024_03_01,2024-03-01,454,0.015278,0.921646,0.263482,0.5,0.883532,0.834802,0.72449,0.596639,0.77292,0.829319,71,308,27,48,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
1,credit_model_xgb_2024_09_01,2024-04-01,487,0.003844,0.98026,0.265991,0.5,0.958033,0.913758,0.88785,0.76,0.881181,0.911458,95,350,12,30,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
2,credit_model_xgb_2024_09_01,2024-05-01,491,0.004943,0.970289,0.274461,0.5,0.960775,0.90835,0.901639,0.769231,0.883714,0.906061,110,336,12,33,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
3,credit_model_xgb_2024_12_01,2024-06-01,489,0.001171,0.99242,0.293373,0.5,0.950351,0.90184,0.899225,0.768212,0.879902,0.899531,116,325,13,35,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
4,credit_model_xgb_2024_12_01,2024-08-01,518,0.001813,0.986468,0.28595,0.5,0.95724,0.895753,0.854962,0.761905,0.867258,0.893853,112,352,19,35,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
5,credit_model_xgb_2024_12_01,2024-09-01,511,0.001194,0.993286,0.316059,0.5,0.975809,0.925636,0.927632,0.839286,0.913559,0.924624,141,332,11,27,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
6,credit_model_xgb_2024_12_01,2024-11-01,491,0.001584,0.984644,0.292452,0.5,0.934946,0.867617,0.823077,0.718121,0.837282,0.864898,107,319,23,42,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
7,credit_model_xgb_2024_12_01,2024-12-01,498,0.003517,0.979292,0.280059,0.5,0.958996,0.885542,0.908333,0.703226,0.856835,0.881037,109,332,11,46,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/


In [5]:
# Render the complete `summaries` DataFrame (all rows and columns) as this cell's output.
summaries

Unnamed: 0,model_version,evaluation_time,sample_size,predicted_default_risk_min,predicted_default_risk_max,predicted_default_risk_mean,threshold,auc_roc,accuracy,precision,recall,f1_macro,f1_weighted,tp,tn,fp,fn,sources.model_file,sources.features_dir,sources.labels_dir
0,credit_model_xgb_2024_03_01,2024-03-01,454,0.015278,0.921646,0.263482,0.5,0.883532,0.834802,0.72449,0.596639,0.77292,0.829319,71,308,27,48,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
1,credit_model_xgb_2024_09_01,2024-04-01,487,0.003844,0.98026,0.265991,0.5,0.958033,0.913758,0.88785,0.76,0.881181,0.911458,95,350,12,30,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
2,credit_model_xgb_2024_09_01,2024-05-01,491,0.004943,0.970289,0.274461,0.5,0.960775,0.90835,0.901639,0.769231,0.883714,0.906061,110,336,12,33,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
3,credit_model_xgb_2024_12_01,2024-06-01,489,0.001171,0.99242,0.293373,0.5,0.950351,0.90184,0.899225,0.768212,0.879902,0.899531,116,325,13,35,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
4,credit_model_xgb_2024_12_01,2024-08-01,518,0.001813,0.986468,0.28595,0.5,0.95724,0.895753,0.854962,0.761905,0.867258,0.893853,112,352,19,35,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
5,credit_model_xgb_2024_12_01,2024-09-01,511,0.001194,0.993286,0.316059,0.5,0.975809,0.925636,0.927632,0.839286,0.913559,0.924624,141,332,11,27,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
6,credit_model_xgb_2024_12_01,2024-11-01,491,0.001584,0.984644,0.292452,0.5,0.934946,0.867617,0.823077,0.718121,0.837282,0.864898,107,319,23,42,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/
7,credit_model_xgb_2024_12_01,2024-12-01,498,0.003517,0.979292,0.280059,0.5,0.958996,0.885542,0.908333,0.703226,0.856835,0.881037,109,332,11,46,model_bank/production/best/model.pkl,datamart/pretrain_gold/features/,datamart/gold/labels/


## Evaluation Summary

In [6]:
# Columns shown in the evaluation overview: identifiers, sample size, and the main metrics.
headline_cols = [
    "model_version",
    "evaluation_time",
    "sample_size",
    "predicted_default_risk_mean",
    "auc_roc",
    "threshold",
    "precision",
    "recall",
    "f1_macro",
    "f1_weighted",
]
summaries[headline_cols]

Unnamed: 0,model_version,evaluation_time,sample_size,predicted_default_risk_mean,auc_roc,threshold,precision,recall,f1_macro,f1_weighted
0,credit_model_xgb_2024_03_01,2024-03-01,454,0.263482,0.883532,0.5,0.72449,0.596639,0.77292,0.829319
1,credit_model_xgb_2024_09_01,2024-04-01,487,0.265991,0.958033,0.5,0.88785,0.76,0.881181,0.911458
2,credit_model_xgb_2024_09_01,2024-05-01,491,0.274461,0.960775,0.5,0.901639,0.769231,0.883714,0.906061
3,credit_model_xgb_2024_12_01,2024-06-01,489,0.293373,0.950351,0.5,0.899225,0.768212,0.879902,0.899531
4,credit_model_xgb_2024_12_01,2024-08-01,518,0.28595,0.95724,0.5,0.854962,0.761905,0.867258,0.893853
5,credit_model_xgb_2024_12_01,2024-09-01,511,0.316059,0.975809,0.5,0.927632,0.839286,0.913559,0.924624
6,credit_model_xgb_2024_12_01,2024-11-01,491,0.292452,0.934946,0.5,0.823077,0.718121,0.837282,0.864898
7,credit_model_xgb_2024_12_01,2024-12-01,498,0.280059,0.958996,0.5,0.908333,0.703226,0.856835,0.881037
