# Model Registry Browser (Parquet)

This notebook inspects your **model registry** stored as Parquet and helps you:

- Load and merge all registry parquet files
- Preview schema and recent entries
- Find the latest **Production** model (if any)
- Otherwise, pick the model with the best **out-of-time (OOT) AUC**
- Export a small CSV summary to `/mnt/data/`

> Update `REG_DIR` below to match your project (examples: `datamart/gold/model_registry`, `/app/datamart/gold/model_registry`, `/opt/airflow/datamart/gold/model_registry`).

In [44]:
import os, glob, json
from datetime import datetime, date, timedelta

import numpy as np
import pandas as pd

# Display options: show up to 200 columns and use a 160-character display width
pd.options.display.max_columns = 200
pd.options.display.width = 160

# <<< CHANGE THIS TO YOUR ACTUAL REGISTRY FOLDER >>>
REG_DIR = "datamart/gold/model_registry"
# Echo the configured folder so the value in effect is visible in the output
print("Using registry dir:", REG_DIR)

Using registry dir: datamart/gold/model_registry


In [45]:
def read_registry_parquet(dir_path=None) -> pd.DataFrame:
    """Load every ``*.parquet`` entry directly under ``dir_path`` into one DataFrame.

    Entries are read in sorted path order and concatenated row-wise with a
    fresh 0..n-1 index. An entry that fails to read is reported with a
    ``[WARN]`` line and skipped, so one bad file does not block the rest.

    Parameters
    ----------
    dir_path : str or path-like, optional
        Registry directory. When omitted, the current value of the
        notebook-level ``REG_DIR`` is used. It is looked up at call time
        (not when this cell is executed), so editing ``REG_DIR`` takes
        effect without re-running this definition.

    Returns
    -------
    pd.DataFrame
        Concatenation of all entries that could be read.

    Raises
    ------
    FileNotFoundError
        If ``dir_path`` is not an existing directory, or it contains no
        ``*.parquet`` entries.
    RuntimeError
        If parquet entries exist but none of them could be read.
    """
    if dir_path is None:
        dir_path = REG_DIR
    # isdir rather than exists: a regular file at this path is not a registry
    # folder and should be reported as such, not as "no parquet files".
    if not os.path.isdir(dir_path):
        raise FileNotFoundError(f"Registry directory not found: {dir_path}")
    files = sorted(glob.glob(os.path.join(dir_path, "*.parquet")))
    if not files:
        raise FileNotFoundError(f"No parquet files found under: {dir_path}")
    dfs = []
    for p in files:
        try:
            dfs.append(pd.read_parquet(p))
        except Exception as e:
            # Deliberately best-effort: warn and continue with the other files.
            print(f"[WARN] Failed to read {p}: {e}")
    if not dfs:
        raise RuntimeError("No registry parquet files could be read.")
    return pd.concat(dfs, ignore_index=True)

# Try to read the registry. On any failure, report it and fall back to an
# empty frame so the remaining cells can still run.
try:
    reg_df = read_registry_parquet(REG_DIR)
except Exception as e:
    print(f"[ERROR] {e}")
    # Empty fallback frame. It must carry the column names the registry files
    # actually use, because the de-duplication cell sorts on
    # promoted_flag / promoted_at and would raise KeyError without them.
    reg_df = pd.DataFrame(columns=[
        "model_version","train_start","train_end","oot_start","oot_end",
        "auc_train","auc_test","auc_oot","promoted_flag","promoted_at",
        # Names from the previous fallback list, kept in case other cells
        # still reference them.
        "flavor","train_test_start_date","train_test_end_date",
        "oot_start_date","oot_end_date","promoted","promoted_at_iso",
        "artefact_dir",
    ])

In [46]:
# Collapse duplicate registry rows. Two rows count as duplicates when they
# agree on every column except the promotion fields. Sorting in descending
# order on (promoted_flag, promoted_at) puts promoted rows first, so
# keep="first" retains the promoted copy of each duplicate group.
# NOTE(review): the displayed output still shows two visually identical
# credit_model_logreg_2024_03_01 rows, so some key column presumably differs
# beyond display precision -- confirm whether a narrower key (for example
# model_version alone) is intended.
promotion_cols = ["promoted_flag", "promoted_at"]
subset_keys = [c for c in reg_df.columns if c not in promotion_cols]
# Sort only on promotion columns that are present: the empty fallback frame
# built when loading fails may lack them, and sort_values would raise KeyError.
sort_cols = [c for c in promotion_cols if c in reg_df.columns]

_reg_sorted = reg_df.sort_values(sort_cols, ascending=False) if sort_cols else reg_df
reg_df = (
    _reg_sorted.drop_duplicates(subset=subset_keys, keep="first")
               .reset_index(drop=True)
)


In [47]:
# Display the de-duplicated registry as this cell's output.
reg_df

Unnamed: 0,model_version,train_start,train_end,oot_start,oot_end,auc_train,auc_test,auc_oot,promoted_flag,promoted_at
0,credit_model_xgb_2024_12_01,2023-10-01,2024-09-30,2024-10-01,2024-11-30,0.969035,0.9045,0.930115,True,2025-10-29 09:54:23.237161
1,credit_model_xgb_2024_09_01,2023-07-01,2024-06-30,2024-07-01,2024-08-31,0.955181,0.887973,0.926537,True,2025-10-29 09:51:52.554377
2,credit_model_xgb_2024_08_01,2023-06-01,2024-05-31,2024-06-01,2024-07-31,0.997877,0.873929,0.916328,True,2025-10-29 09:51:37.389903
3,credit_model_xgb_2024_06_01,2023-04-01,2024-03-31,2024-04-01,2024-05-31,0.961449,0.88297,0.910732,True,2025-10-29 09:37:22.167299
4,credit_model_xgb_2024_03_01,2023-01-01,2023-12-31,2024-01-01,2024-02-29,0.988702,0.828027,0.886269,True,2025-10-29 09:15:57.408058
5,credit_model_xgb_2024_07_01,2023-05-01,2024-04-30,2024-05-01,2024-06-30,0.977888,0.851782,0.897121,False,NaT
6,credit_model_xgb_2024_11_01,2023-09-01,2024-08-31,2024-09-01,2024-10-31,0.998222,0.898124,0.923353,False,NaT
7,credit_model_logreg_2024_03_01,2023-01-01,2023-12-31,2024-01-01,2024-02-29,0.830631,0.804494,0.839258,False,NaT
8,credit_model_logreg_2024_06_01,2023-04-01,2024-03-31,2024-04-01,2024-05-31,0.835313,0.843728,0.880166,False,NaT
9,credit_model_logreg_2024_03_01,2023-01-01,2023-12-31,2024-01-01,2024-02-29,0.830631,0.804494,0.839258,False,NaT
