# Model Registry Browser (Parquet)

This notebook inspects your **model registry** stored as Parquet and helps you:

- Load and merge all registry parquet files
- Preview schema and recent entries
- Find the latest **Production** model (if any)
- Otherwise, pick the **best OOT AUC** model
- Export a small CSV summary to `/mnt/data/`

> Update `REG_DIR` below to match your project (examples: `datamart/gold/model_registry`, `/app/datamart/gold/model_registry`, `/opt/airflow/datamart/gold/model_registry`).

In [24]:
import os, glob, json
from datetime import datetime, date, timedelta

import numpy as np
import pandas as pd

# Pretty display
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

# <<< CHANGE THIS TO YOUR ACTUAL REGISTRY FOLDER >>>
REG_DIR = "datamart/gold/model_registry"
print("Using registry dir:", REG_DIR)

Using registry dir: datamart/gold/model_registry


In [25]:
def read_registry_parquet(dir_path=REG_DIR) -> pd.DataFrame:
    if not os.path.exists(dir_path):
        raise FileNotFoundError(f"Registry directory not found: {dir_path}")
    files = sorted(glob.glob(os.path.join(dir_path, "*.parquet")))
    if not files:
        raise FileNotFoundError(f"No parquet files found under: {dir_path}")
    dfs = []
    for p in files:
        try:
            dfs.append(pd.read_parquet(p))
        except Exception as e:
            print(f"[WARN] Failed to read {p}: {e}")
    if not dfs:
        raise RuntimeError("No registry parquet files could be read.")
    df = pd.concat(dfs, ignore_index=True)
    return df

# Try to read
try:
    reg_df = read_registry_parquet(REG_DIR)
except Exception as e:
    print(f"[ERROR] {e}")
    reg_df = pd.DataFrame(columns=[
        "model_version","flavor","train_test_start_date","train_test_end_date",
        "oot_start_date","oot_end_date","auc_train","auc_test","auc_oot",
        "promoted","promoted_at_iso","artefact_dir"
    ])

In [27]:
subset_keys = [c for c in reg_df.columns if c != "promoted_flag"]
sort_cols = ["promoted_flag", "promoted_at"]

reg_df = (
    reg_df.sort_values(sort_cols, ascending=False)
          .drop_duplicates(subset=subset_keys, keep="first")
          .reset_index(drop=True)
)


In [28]:
reg_df

Unnamed: 0,model_version,train_start,train_end,oot_start,oot_end,auc_train,auc_test,auc_oot,promoted_flag,promoted_at
0,credit_model_xgb_2024_04_01,2023-02-01,2024-01-31,2024-02-01,2024-03-31,0.986841,0.88776,0.892744,True,2025-10-29 03:58:00.506667
1,credit_model_xgb_2024_01_01,2022-11-01,2023-10-31,2023-11-01,2023-12-31,0.933453,0.78261,0.859513,True,2025-10-28 15:43:33.592478
2,credit_model_xgb_2024_01_01,2022-11-01,2023-10-31,2023-11-01,2023-12-31,0.933453,0.78261,0.859513,False,NaT
3,credit_model_xgb_2024_04_01,2023-02-01,2024-01-31,2024-02-01,2024-03-31,0.986841,0.88776,0.892744,False,NaT
4,credit_model_logreg_2024_04_01,2023-02-01,2024-01-31,2024-02-01,2024-03-31,0.826603,0.846408,0.842912,False,NaT
5,credit_model_logreg_2024_01_01,2022-11-01,2023-10-31,2023-11-01,2023-12-31,0.821314,0.763695,0.833376,False,NaT
