# 07 — Error analysis & audit (season-wise ranking)

This notebook focuses on **interpreting** model behavior:
- Inspect winner ranks by season
- Identify failure seasons and common error modes
- Compare awards and model variants (baseline vs tree models)
- Produce auditable tables suitable for a report/paper

Inputs:
- Result artifacts exported by Notebook 05 (baseline) and Notebook 06 (tree models):
  - `metrics.json`
  - `val_winner_ranks.parquet`
  - `test_winner_ranks.parquet`


In [1]:
# =============================
# Setup: paths + run discovery
# =============================
from pathlib import Path
import json
import pandas as pd

# Resolve the project root: when the kernel runs from inside `notebooks/`,
# the root is one level up; otherwise the working directory is used as is.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

# Two historical output layouts are supported:
#   data/processed/modeling*   (older)
#   data/experiments/*         (newer, preferred)
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
EXPERIMENTS_DIR = PROJECT_ROOT.joinpath("data", "experiments")

# Where baseline (logistic regression) results may live, in priority order.
BASELINE_DIR_CANDIDATES = [
    EXPERIMENTS_DIR.joinpath("logreg_baseline"),
    PROCESSED_DIR.joinpath("modeling"),
]

# Where tree-model results may live, in priority order.
# New convention: data/experiments/tree_models/<model_name>/{award}/{timestamp}
TREE_MODEL_NAME = "xgb"  # change to "lgb" or "cat" if you ran those
TREE_DIR_CANDIDATES = [
    EXPERIMENTS_DIR.joinpath("tree_models", TREE_MODEL_NAME),
    PROCESSED_DIR.joinpath("modeling_tree"),
]

def _first_existing(paths):
    """Return the first entry of ``paths`` that exists on disk, else ``None``.

    Entries are checked in iteration order, so earlier candidates take
    priority over later ones.
    """
    return next((candidate for candidate in paths if candidate.exists()), None)

# For each model family keep the first candidate location that is present on
# disk; the value is None when no candidate exists.
BASELINE_DIR, TREE_DIR = map(
    _first_existing,
    (BASELINE_DIR_CANDIDATES, TREE_DIR_CANDIDATES),
)

# Echo the resolved locations so a wrong working directory is obvious early.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("BASELINE_DIR:", BASELINE_DIR)
print("TREE_DIR    :", TREE_DIR)

# Award identifiers; each one is used as a sub-folder name under a results dir.
AWARDS = [
    "mvp",
    "dpoy",
    "smoy",
    "roy",
    "mip",
]

def latest_run_dir(award_dir: Path) -> Path | None:
    """Return the latest timestamped subdir (YYYYMMDD_HHMMSS) if any."""
    if not award_dir.exists():
        return None
    subdirs = [p for p in award_dir.iterdir() if p.is_dir()]
    if not subdirs:
        return None
    # timestamps sort lexicographically
    subdirs = sorted(subdirs, key=lambda p: p.name)
    return subdirs[-1]

def load_run(run_dir: Path):
    """Read the result artifacts stored in a single run directory.

    Missing files are tolerated: an absent ``metrics.json`` yields ``{}`` and
    an absent winner-rank parquet file yields an empty DataFrame.

    Args:
        run_dir: Directory that may contain ``metrics.json``,
            ``val_winner_ranks.parquet`` and ``test_winner_ranks.parquet``.

    Returns:
        A ``(metrics, val_wr, test_wr)`` tuple.
    """
    def _read_ranks(filename):
        # Empty frame instead of an error when the table was never exported.
        table_path = run_dir / filename
        if table_path.exists():
            return pd.read_parquet(table_path)
        return pd.DataFrame()

    metrics_file = run_dir / "metrics.json"
    if metrics_file.exists():
        metrics = json.loads(metrics_file.read_text(encoding="utf-8"))
    else:
        metrics = {}

    val_wr = _read_ranks("val_winner_ranks.parquet")
    test_wr = _read_ranks("test_winner_ranks.parquet")
    return metrics, val_wr, test_wr

def load_latest_runs(base_dir: Path | None, awards: list[str]) -> dict:
    """Return dict[award] = (run_dir, metrics, val_wr, test_wr)."""
    runs = {}
    if base_dir is None or not base_dir.exists():
        return runs
    for a in awards:
        a_dir = base_dir / a
        rdir = latest_run_dir(a_dir)
        if rdir is None:
            continue
        metrics, val_wr, test_wr = load_run(rdir)
        runs[a] = (rdir, metrics, val_wr, test_wr)
    return runs


PROJECT_ROOT: c:\Users\Luc\Documents\projets-data\nba-awards-predictor
BASELINE_DIR: c:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\logreg_baseline
TREE_DIR    : c:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\tree_models\xgb


## Load the latest run per award

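We assume each award folder contains timestamped run subfolders named `YYYYMMDD_HHMMSS`, so sorting the folder names lexicographically also sorts them chronologically.  
This helper loads the **most recent** run per award.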


In [2]:
def latest_run_dir(base_dir: Path, award: str) -> Path:
    award_dir = base_dir / award
    assert award_dir.exists(), f"Missing: {award_dir}"
    runs = [p for p in award_dir.iterdir() if p.is_dir()]
    assert runs, f"No runs found in {award_dir}"
    return sorted(runs, key=lambda p: p.name)[-1]


def load_run(base_dir: Path, award: str):
    run_dir = latest_run_dir(base_dir, award)
    metrics = json.loads((run_dir / "metrics.json").read_text(encoding="utf-8"))
    val_wr = pd.read_parquet(run_dir / "val_winner_ranks.parquet")
    test_wr = pd.read_parquet(run_dir / "test_winner_ranks.parquet")
    return run_dir, metrics, val_wr, test_wr


AWARDS = ["mvp", "dpoy", "smoy", "roy", "mip"]


In [3]:
# ---------------------------------------------------------------
# Load the most recent run of every award for both model families
# ---------------------------------------------------------------
baseline, tree = (
    load_latest_runs(results_dir, AWARDS)
    for results_dir in (BASELINE_DIR, TREE_DIR)
)

print(f"Baseline runs found: {len(baseline)}/{len(AWARDS)}")
print(f"Tree runs found    : {len(tree)}/{len(AWARDS)}")

# List the awards that could not be loaded, to make gaps easy to spot.
missing_b = list(filter(lambda award: award not in baseline, AWARDS))
missing_t = list(filter(lambda award: award not in tree, AWARDS))
if len(missing_b) > 0:
    print("[WARN] missing baseline awards:", missing_b)
if len(missing_t) > 0:
    print("[WARN] missing tree awards:", missing_t)


TypeError: latest_run_dir() missing 1 required positional argument: 'award'

## Compare metrics across awards (baseline)

Use this as a quick health-check and for report tables.


In [None]:
def metrics_table(runs: dict) -> pd.DataFrame:
    """Flatten per-award run metrics into one table.

    Args:
        runs: ``dict[award] = (run_dir, metrics, val_wr, test_wr)``.

    Returns:
        One row per award with ``award``, ``run_dir`` and every metric whose
        value is an ``int``, ``float`` or ``str`` (other values are skipped).
        Rows are sorted in descending order on whichever of the known metric
        columns are present. An empty DataFrame is returned when ``runs`` is
        empty.
    """
    records = [
        {
            "award": award,
            "run_dir": str(run_dir),
            **{
                key: value
                for key, value in metrics.items()
                if isinstance(value, (int, float, str))
            },
        }
        for award, (run_dir, metrics, _val_wr, _test_wr) in runs.items()
    ]
    if len(records) == 0:
        print("[WARN] No runs found -> empty table.")
        return pd.DataFrame()

    table = pd.DataFrame(records)

    # Sort on the known metric columns that exist, in this priority order.
    preferred = ("val_mrr", "test_mrr", "val_top1", "test_top1", "val_aucpr", "test_aucpr")
    sort_keys = [name for name in preferred if name in table.columns]
    if not sort_keys:
        print("[WARN] No known metric columns to sort on. Available:", list(table.columns))
        return table
    return table.sort_values(sort_keys, ascending=False)

# Build the baseline metrics overview (one row per loaded award) and render it.
baseline_tbl = metrics_table(baseline)
display(baseline_tbl)


## Winner rank distribution (diagnostic)

We look at the rank of the true winner season-by-season.


In [None]:
def summarize_winner_ranks(winner_ranks: pd.DataFrame, split_name: str):
    """Summarise the ranks in a winner-rank table.

    Args:
        winner_ranks: Table with a ``rank`` and a ``season`` column, or
            ``None`` / an empty frame.
        split_name: Prefix put in front of every key (e.g. ``"val"``).

    Returns:
        ``{}`` for a missing or empty table. Otherwise a dict keyed
        ``<split_name>_<stat>`` with: the number of distinct seasons, the
        share of rows with rank 1 / <=3 / <=5 / <=10, the mean reciprocal
        rank, and the median and maximum rank.
    """
    if winner_ranks is None or winner_ranks.empty:
        return {}

    ranks = winner_ranks["rank"].astype(int)

    def _share_within(cutoff):
        # Fraction of rows whose rank is at most `cutoff`.
        return float((ranks <= cutoff).mean())

    stats = {
        "seasons": int(winner_ranks["season"].nunique()),
        "top1": float((ranks == 1).mean()),
        "top3": _share_within(3),
        "top5": _share_within(5),
        "top10": _share_within(10),
        "mrr": float((1.0 / ranks).mean()),
        "rank_median": float(ranks.median()),
        "rank_max": int(ranks.max()),
    }
    return {f"{split_name}_{stat}": value for stat, value in stats.items()}

# One summary row per baseline award, combining the val and test splits.
rows = []
for a, (run_dir, metrics, val_wr, test_wr) in baseline.items():
    rows.append(
        {
            "award": a,
            "run_dir": str(run_dir),
            **summarize_winner_ranks(val_wr, "val"),
            **summarize_winner_ranks(test_wr, "test"),
        }
    )

if rows:
    baseline_rank_tbl = pd.DataFrame(rows)
    # Best awards first, using whichever ranking metrics were produced.
    sort_cols = [
        c
        for c in ("val_mrr", "test_mrr", "val_top1", "test_top1")
        if c in baseline_rank_tbl.columns
    ]
    if len(sort_cols) > 0:
        baseline_rank_tbl = baseline_rank_tbl.sort_values(sort_cols, ascending=False)
    display(baseline_rank_tbl)
else:
    print("[WARN] No baseline winner-rank tables found (baseline dict empty or missing parquet files).")
    baseline_rank_tbl = pd.DataFrame()


## Drill-down: seasons where the winner is badly ranked

This helps you understand whether failures come from:
- low minutes / sample size issues,
- missing defensive signal (DPOY),
- narrative components not captured by features,
- injuries / shortened seasons, etc.


In [None]:
AWARD = "dpoy"  # pick one
SPLIT = "test"    # "val" or "test"

# The loaders are lenient: an award can be absent from `baseline`, and a
# missing parquet file is loaded as an empty frame without a "rank" column.
# Guard both cases so this cell reports the gap instead of raising KeyError.
if AWARD not in baseline:
    print(f"[WARN] No baseline run loaded for award {AWARD!r}. Available: {sorted(baseline)}")
else:
    run_dir, metrics, val_wr, test_wr = baseline[AWARD]
    wr = test_wr if SPLIT == "test" else val_wr

    if wr.empty:
        print(f"[WARN] No {SPLIT} winner-rank table for award {AWARD!r} in {run_dir}")
    else:
        # Sort once (largest rank first) and reuse the result for both the
        # table and the "worst season" line.
        wr_sorted = wr.sort_values("rank", ascending=False)
        display(wr_sorted)
        print("Worst season:", int(wr_sorted.iloc[0]["season"]))


## Compare baseline vs tree models (if available)

This gives you a clear “did boosting help?” story, award by award.


In [None]:
if not tree:
    print("No tree runs found yet. Run Notebook 06 first.")
else:
    # One row per award that has both a baseline run and a tree run.
    comp_rows = []
    for a in AWARDS:
        if not (a in baseline and a in tree):
            continue
        b_dir, b_metrics, b_val, b_test = baseline[a]
        t_dir, t_metrics, t_val, t_test = tree[a]

        entry = {"award": a, "baseline_run": str(b_dir), "tree_run": str(t_dir)}
        # Columns come in baseline/tree pairs: val_mrr, test_mrr, val_top1, test_top1.
        for metric in ("mrr", "top1"):
            for split in ("val", "test"):
                key = f"{split}_{metric}"
                entry[f"baseline_{key}"] = b_metrics.get(key)
                entry[f"tree_{key}"] = t_metrics.get(key)
        comp_rows.append(entry)

    if comp_rows:
        comp = pd.DataFrame(comp_rows)
        # Every row carries this key, so the column is always present.
        sort_col = "tree_val_mrr"
        comp = comp.sort_values(sort_col, ascending=False)
        display(comp)
    else:
        print("[WARN] No overlapping awards between baseline and tree runs.")


## Optional: export report-ready tables

This writes CSVs you can include in a report/paper.


In [None]:
OUT = PROJECT_ROOT / "data" / "processed" / "reports"
OUT.mkdir(parents=True, exist_ok=True)

# Look both tables up through globals() so that a summary cell that was not
# run (or failed) produces a warning here rather than a NameError.
exports = {
    "baseline_metrics_summary.csv": globals().get("baseline_tbl"),
    "baseline_winner_rank_summary.csv": globals().get("baseline_rank_tbl"),
}

written = []
for filename, table in exports.items():
    if table is None or table.empty:
        print(f"[WARN] skipped {filename}: table is missing or empty")
        continue
    table.to_csv(OUT / filename, index=False)
    written.append(filename)

# Only report success when at least one file was actually written.
if written:
    print("[OK] exported to:", OUT)
    print("     files:", written)
else:
    print("[WARN] nothing exported to:", OUT)
