# Modeling pipeline (time-aware ranking)

This notebook **only** consumes the award-specific datasets exported as Parquet files by `04_award_datasets.ipynb`.
It trains models and evaluates them with **season-wise ranking metrics** (Top-1 / Top-k / MRR).

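**No award business logic lives here**: eligibility, target creation, and core feature engineering all happen upstream. The only features derived in this notebook are the lightweight season-wise volume-context percentiles added just before training.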

In [1]:
# =============================
# Imports & project paths
# =============================
from __future__ import annotations

from pathlib import Path
import json
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

# Notebook-safe project root detection: when the kernel's working directory
# is the "notebooks" folder, step up one level; otherwise the working
# directory itself is taken as the project root.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

# Input location: one sub-folder per award (read by load_award_dataset).
DATA_DIR = PROJECT_ROOT / "data" / "interim" / "awards"

# Output locations are created eagerly so later writes cannot fail on a
# missing folder.
EXPERIMENTS_DIR = PROJECT_ROOT / "data" / "experiments"
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the multi-award cell rebinds RESULTS_DIR to
# EXPERIMENTS_DIR / "logreg_baseline"; nothing visible in this notebook
# writes to the "processed/modeling" folder created here -- confirm it is
# still needed.
RESULTS_DIR = PROJECT_ROOT / "data" / "processed" / "modeling"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved paths so a wrong working directory is spotted early.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR     :", DATA_DIR)


PROJECT_ROOT: C:\Users\Luc\Documents\projets-data\nba-awards-predictor
DATA_DIR     : C:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\interim\awards


In [2]:
# =============================
# Award configs + loader
# =============================
# Award key -> config. Every target column follows the same
# ``is_<award>_winner`` pattern, so the mapping is derived from the keys.
AWARDS = {
    name: {"target_col": f"is_{name}_winner"}
    for name in ("mvp", "dpoy", "smoy", "roy", "mip")
}

def load_award_dataset(award: str):
    """Load the exported frames for one award and check that they line up.

    Reads ``{award}_df.parquet``, ``X_{award}.parquet`` and
    ``y_{award}.parquet`` from ``DATA_DIR / award`` and prints a one-line
    summary (row count, season count, winners per season).

    Args:
        award: A key of ``AWARDS``.

    Returns:
        ``(df, X, y, target_col)`` where ``y`` is the ``target_col`` column
        of the y-parquet.

    Raises:
        KeyError: If ``award`` is not a key of ``AWARDS``.
        AssertionError: If df/X/y differ in length or index, or if ``df``
            has no ``season`` column.
    """
    config = AWARDS.get(award)
    if config is None:
        raise KeyError(f"Unknown award '{award}'. Available: {list(AWARDS)}")

    target_col = config["target_col"]
    award_dir = DATA_DIR / award

    # Files are read in the order df, X, y.
    df, X, y_frame = (
        pd.read_parquet(award_dir / file_name)
        for file_name in (
            f"{award}_df.parquet",
            f"X_{award}.parquet",
            f"y_{award}.parquet",
        )
    )
    y = y_frame[target_col]

    # Hard invariants: the three objects must describe the same rows.
    n_rows = len(df)
    assert n_rows == len(X) == len(y), "Length mismatch df/X/y"
    assert df.index.equals(X.index), "Index mismatch df vs X"
    assert df.index.equals(y.index), "Index mismatch df vs y"
    assert "season" in df.columns, "df must contain 'season'"

    # Winner constraint is only reported, not enforced (expected to be
    # 1/season after filtering; anything else should be fixed upstream).
    w = y.groupby(df["season"]).sum()
    mn = int(w.min())
    med = int(w.median())
    mx = int(w.max())
    print(f"[{award.upper()}] rows={len(df):,} seasons={df['season'].nunique()} winners/season(min/med/max)={mn}/{med}/{mx}")

    return df, X, y, target_col

# Quick smoke test (uncomment)
# _df, _X, _y, _t = load_award_dataset("mvp")


## Time-aware split

Splits are defined **by season** (never random).
You can change the split strategy freely here without regenerating datasets.

In [3]:
# =============================
# Time-aware split helpers
# =============================
def time_split_masks(
    df: pd.DataFrame,
    train_end: int = 2018,
    val_end: int = 2021,
    season_col: str = "season",
):
    """Build boolean train/val/test row masks from season boundaries.

    Args:
        df: Frame holding the season column.
        train_end: Last season (inclusive) of the training split.
        val_end: Last season (inclusive) of the validation split; later
            seasons form the test split.
        season_col: Name of the season column; its values are cast to int.

    Returns:
        ``(train_mask, val_mask, test_mask)`` as NumPy boolean arrays, one
        entry per row of ``df``.

    Raises:
        AssertionError: If any of the three splits has no rows.
    """
    season_values = df[season_col].astype(int)

    in_train = season_values <= train_end
    in_test = season_values > val_end
    # Validation is whatever lies strictly after train_end and up to val_end.
    in_val = ~(in_train | in_test)

    # Sanity
    splits = (in_train, in_val, in_test)
    assert all(part.any() for part in splits), "Empty split detected. Adjust train_end/val_end."

    return in_train.values, in_val.values, in_test.values

def describe_split(df, y, train_mask, val_mask, test_mask):
    """Print one summary line per split: rows, seasons, season range, positives.

    Args:
        df: Frame with a ``season`` column.
        y: Target series aligned with ``df``.
        train_mask, val_mask, test_mask: Boolean row masks for each split.
    """
    labelled_masks = (
        ("train", train_mask),
        ("val", val_mask),
        ("test", test_mask),
    )
    for name, m in labelled_masks:
        seasons = df.loc[m, "season"]
        pos = int(y.loc[m].sum())
        print(f"{name:>5}: rows={int(m.sum()):,} seasons={seasons.nunique()} range={int(seasons.min())}→{int(seasons.max())} positives={pos}")



## Ranking metrics (season-wise)

We evaluate models by how well they rank players **within each season**:
- **Top-1**: winner ranked #1
- **MRR**: mean reciprocal rank of the winner
- **Top-k**: winner appears in Top-k candidates

In [4]:
# =============================
# Ranking metrics
# =============================
def rank_players(df: pd.DataFrame, scores: np.ndarray, season_col: str = "season"):
    """Return a copy of ``df`` with ``score`` and per-season ``rank`` columns.

    Rank 1 is the highest score within a season. Ties are broken by row
    order (``method="first"``), so every row of a season gets a distinct rank.

    Args:
        df: Candidate rows; left unmodified.
        scores: One score per row of ``df``.
        season_col: Column used to group rows into seasons.

    Returns:
        A new DataFrame with the two extra columns.
    """
    scored = df.assign(score=scores)
    per_season_scores = scored.groupby(season_col)["score"]
    scored["rank"] = per_season_scores.rank(method="first", ascending=False)
    return scored

def topk_accuracy(df, scores, y_true, k: int = 5, season_col: str = "season"):
    """Return the share of winner rows ranked within the top ``k`` of their season.

    Args:
        df: Candidate rows containing ``season_col``.
        scores: One model score per row of ``df``.
        y_true: Labels aligned with ``df``; rows equal to 1 are winners.
        k: Rank cut-off (inclusive).
        season_col: Column used to group rows into seasons.

    Returns:
        Fraction of winner rows whose within-season rank is ``<= k``. The
        mean is taken over winner rows, so a season with several winners
        contributes several rows.
    """
    ranked = rank_players(df, scores, season_col=season_col)
    # Only the rank is needed. Selecting a literal "season" column here
    # raised KeyError whenever the caller used a different ``season_col``.
    winner_ranks = ranked.loc[y_true == 1, "rank"]
    return float((winner_ranks <= k).mean())

def top1_accuracy(df, scores, y_true, season_col: str = "season"):
    """Return the share of winner rows ranked first in their season.

    Shorthand for ``topk_accuracy`` with ``k=1``.
    """
    return topk_accuracy(
        df,
        scores,
        y_true,
        season_col=season_col,
        k=1,
    )

def mean_reciprocal_rank(df, scores, y_true, season_col: str = "season"):
    """Return the mean of ``1 / rank`` over winner rows.

    Ranks are computed within each season by ``rank_players``. The mean is
    taken over rows where ``y_true == 1``, not over seasons.
    """
    is_winner = y_true == 1
    ranked = rank_players(df, scores, season_col=season_col)
    reciprocal_ranks = 1.0 / ranked.loc[is_winner, "rank"]
    return float(reciprocal_ranks.mean())

def winner_rank_table(df, scores, y_true, season_col: str = "season"):
    """Return one row per winner with its season, score and integer rank.

    The table keeps the original row index of each winner and is sorted by
    ``season_col``.
    """
    ranked = rank_players(df, scores, season_col=season_col)

    is_winner = y_true == 1
    wanted_columns = [season_col, "score", "rank"]
    table = ranked.loc[is_winner, wanted_columns]
    table = table.sort_values(season_col)

    # Ranks come back as floats from the groupby rank; show them as ints.
    table["rank"] = table["rank"].astype(int)
    return table



## Volume/context features (award-specific)

Some awards (especially MVP/DPOY/SMOY/MIP) are implicitly constrained by playing time and sample size.
To avoid ranking low-minute players unrealistically high, we **do not filter any rows** but add
season-wise volume context features (percentiles of G/GS/MP/MPG) to the feature matrix *before training*.

ROY is left unchanged by default, since it already performs very well in the current setup.


In [5]:
# =============================
# Volume/context features (award-specific)
# =============================
# Raw volume columns that are turned into season-wise percentiles when
# they are present in the award frame.
VOLUME_CANDIDATES = ["G", "GS", "MP", "MPG"]


def add_volume_features(
    df_award: pd.DataFrame,
    X_award: pd.DataFrame,
    award: str,
    *,
    skip_awards: tuple = ("roy",),
    low_volume_threshold: float = 0.10,
    season_col: str = "season",
) -> pd.DataFrame:
    """Add season-wise volume context features to X without filtering rows.

    For every column of ``VOLUME_CANDIDATES`` found in ``df_award`` a
    ``<col>_vol_pct`` column (float32 percentile rank within the season,
    average method for ties) is added. When ``MP`` is available, an int8
    ``low_volume_flag`` marks rows whose ``MP`` percentile is below
    ``low_volume_threshold``.

    Args:
        df_award: Award frame holding ``season_col`` and the raw volume
            columns; its index must match ``X_award``.
        X_award: Feature matrix to extend; never modified in place.
        award: Award key; compared case-insensitively with ``skip_awards``.
        skip_awards: Awards for which ``X_award`` is returned untouched.
            Defaults to ROY only; pass ``()`` to add the features for
            every award.
        low_volume_threshold: Percentile below which ``low_volume_flag``
            is set to 1.
        season_col: Name of the season column in ``df_award``.

    Returns:
        ``X_award`` itself for skipped awards, otherwise an extended copy
        (an unextended copy when no volume column is found).
    """
    skipped = {name.lower() for name in skip_awards}
    if award.lower() in skipped:
        return X_award

    X2 = X_award.copy()

    cols = [c for c in VOLUME_CANDIDATES if c in df_award.columns]
    if not cols:
        print(f"[WARN] No volume columns found in df for award={award}.")
        return X2

    def _season_pct(col: str) -> pd.Series:
        # Percentile rank within season, in (0, 1].
        return df_award.groupby(season_col)[col].rank(pct=True, method="average")

    for c in cols:
        X2[f"{c}_vol_pct"] = _season_pct(c).astype("float32")

    # Optional contextual flag for extreme low volume (still no filtering).
    # Rows with a missing MP get a NaN percentile and therefore flag 0.
    if "MP" in df_award.columns:
        X2["low_volume_flag"] = (_season_pct("MP") < low_volume_threshold).astype("int8")

    return X2


## Baseline model: Logistic Regression

Important note (empirically validated on MVP):  
`class_weight="balanced"` can **hurt ranking** because it optimizes global classification rather than season-wise ordering.

We start with:
- `SimpleImputer(median)` + `StandardScaler`
- `LogisticRegression` (no class weighting)

Then we can move to non-linear models (LightGBM/XGBoost/CatBoost).

In [6]:
# =============================
# Model runner (baseline)
# =============================
def make_logreg_baseline():
    """Build the unfitted baseline pipeline: median imputation, scaling, logistic regression.

    The classifier uses the ``liblinear`` solver with ``max_iter=3000`` and
    no class weighting.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline`` with steps ``imputer``,
        ``scaler`` and ``clf``.
    """
    # X should already be numeric if your upstream pipeline is correct.
    # If you still have categorical columns, switch to a ColumnTransformer-based preprocess.
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(solver="liblinear", max_iter=3000)),
    ]
    return Pipeline(steps)

def fit_eval_award(award: str, train_end: int = 2018, val_end: int = 2021, topk=(3,5,10)):
    """Train the baseline on one award and score the validation and test seasons.

    Args:
        award: A key of ``AWARDS``.
        train_end: Last training season (inclusive).
        val_end: Last validation season (inclusive); later seasons are test.
        topk: Rank cut-offs for which Top-k accuracy is reported.

    Returns:
        ``(model, df, X_num, y, out)`` where ``model`` is the fitted
        pipeline, ``X_num`` the numeric feature matrix actually used, and
        ``out`` a dict with, for each of ``val`` and ``test``, the keys
        ``<split>_aucpr``, ``<split>_top1``, ``<split>_mrr``,
        ``<split>_top{k}`` and ``<split>_winner_ranks`` (a DataFrame).
    """
    df, X, y, _target_col = load_award_dataset(award)

    # Add award-specific volume context (no filtering), then keep numeric
    # columns only so the pipeline never sees non-numeric data.
    X_num = (
        add_volume_features(df, X, award=award)
        .select_dtypes(include=["number"])
        .copy()
    )

    masks = time_split_masks(df, train_end=train_end, val_end=val_end)
    describe_split(df, y, *masks)
    train_mask, val_mask, test_mask = masks

    # The whole pipeline (imputer, scaler, classifier) is fitted on the
    # training seasons only.
    model = make_logreg_baseline()
    model.fit(X_num.loc[train_mask], y.loc[train_mask])

    out = {}
    for split_name, mask in (("val", val_mask), ("test", test_mask)):
        df_split = df.loc[mask]
        y_split = y.loc[mask]
        # Probability of the positive class is used as the ranking score.
        scores = model.predict_proba(X_num.loc[mask])[:, 1]

        out[f"{split_name}_aucpr"] = float(average_precision_score(y_split, scores))
        out[f"{split_name}_top1"] = float(top1_accuracy(df_split, scores, y_split))
        out[f"{split_name}_mrr"] = float(mean_reciprocal_rank(df_split, scores, y_split))

        for k in topk:
            out[f"{split_name}_top{k}"] = float(topk_accuracy(df_split, scores, y_split, k=k))

        out[f"{split_name}_winner_ranks"] = winner_rank_table(df_split, scores, y_split)

    return model, df, X_num, y, out



## Run a single award (recommended first)

Start with a single award to validate the pipeline end-to-end (MVP is the suggested first check; the cell below is currently set to SMOY), then run the same code for the other awards.

In [7]:
# =============================
# Single-award run
# =============================
# NOTE: `train_end` and `val_end` defined here are also read by the
# multi-award cell further down, so run this cell first.
award = "smoy"          # any key of AWARDS: "mvp", "dpoy", "smoy", "roy", "mip"
train_end = 2018
val_end = 2021

# `X` is the numeric feature matrix returned by fit_eval_award (volume
# features included), not the raw X parquet.
model, df, X, y, res = fit_eval_award(award, train_end=train_end, val_end=val_end)

# Scalar metrics only; the winner-rank tables are DataFrames and are shown separately.
print("\n--- Validation metrics ---")
print({k: v for k, v in res.items() if k.startswith("val_") and k != "val_winner_ranks"})

print("\n--- Test metrics ---")
print({k: v for k, v in res.items() if k.startswith("test_") and k != "test_winner_ranks"})

# `display` is not imported in this notebook; presumably the IPython/Jupyter built-in.
print("\nWinner ranks (VAL):")
display(res["val_winner_ranks"])

# Optional: show top-10 candidates for a couple of validation seasons
# The validation rows are re-selected by season range (train_end, val_end]
# and re-scored with the fitted model.
val_df = df[df["season"].between(train_end + 1, val_end)].copy()
val_scores = model.predict_proba(X.loc[val_df.index])[:, 1]
ranked_val = rank_players(val_df, val_scores)

# At most the first three validation seasons are shown. The header takes the
# first winner row of the season from the winner-rank table.
for s in sorted(val_df["season"].unique())[:3]:
    print(f"\n=== {award.upper()} | season {s} | winner rank = {int(res['val_winner_ranks'].loc[res['val_winner_ranks']['season']==s,'rank'].iloc[0])} ===")
    display(ranked_val[ranked_val["season"] == s].sort_values("score", ascending=False).head(10)[
        ["Player", "Team", "score", "rank"]
    ])


[SMOY] rows=9,346 seasons=30 winners/season(min/med/max)=1/1/1
train: rows=6,709 seasons=23 range=1996→2018 positives=23
  val: rows=1,065 seasons=3 range=2019→2021 positives=3
 test: rows=1,572 seasons=4 range=2022→2025 positives=4



--- Validation metrics ---
{'val_aucpr': 0.9166666666666665, 'val_top1': 0.6666666666666666, 'val_mrr': 0.8333333333333334, 'val_top3': 1.0, 'val_top5': 1.0, 'val_top10': 1.0}

--- Test metrics ---
{'test_aucpr': 0.6392045454545454, 'test_top1': 0.5, 'test_mrr': 0.675, 'test_top3': 0.75, 'test_top5': 1.0, 'test_top10': 1.0}

Winner ranks (VAL):


Unnamed: 0,season,score,rank
10860,2019,0.41008,1
11436,2020,0.248671,2
11853,2021,0.345673,1



=== SMOY | season 2019 | winner rank = 1 ===


Unnamed: 0,Player,Team,score,rank
10860,Lou Williams,LAC,0.41008,1.0
10674,Dwyane Wade,MIA,0.202099,2.0
10978,Spencer Dinwiddie,BRK,0.145427,3.0
10904,Montrezl Harrell,LAC,0.101635,4.0
10659,Domantas Sabonis,IND,0.078883,5.0
10992,Terrence Ross,ORL,0.075599,6.0
10742,Jae Crowder,UTA,0.060615,7.0
10799,Jordan Clarkson,CLE,0.057387,8.0
10643,Dennis SchrAPder,OKC,0.050953,9.0
10648,Derrick Rose,MIN,0.027943,10.0



=== SMOY | season 2020 | winner rank = 2 ===


Unnamed: 0,Player,Team,score,rank
11174,Dennis SchrAPder,OKC,0.275725,1.0
11436,Montrezl Harrell,LAC,0.248671,2.0
11386,Lou Williams,LAC,0.144853,3.0
11454,Norman Powell,TOR,0.124575,4.0
11237,Goran DragiA,MIA,0.10815,5.0
11317,Jordan Clarkson,2TM,0.093308,6.0
11500,Serge Ibaka,TOR,0.087192,7.0
11528,Terrence Ross,ORL,0.062713,8.0
11094,Bogdan BogdanoviA,SAC,0.03421,9.0
11221,Frank Mason III,MIL,0.030381,10.0



=== SMOY | season 2021 | winner rank = 1 ===


Unnamed: 0,Player,Team,score,rank
11853,Jordan Clarkson,UTA,0.345673,1.0
11965,Montrezl Harrell,LAL,0.109138,2.0
12060,T.J. McConnell,IND,0.091805,3.0
11844,Joe Ingles,UTA,0.076621,4.0
12078,Tim Hardaway Jr.,DAL,0.076445,5.0
11660,Chris Boucher,TOR,0.056179,6.0
11908,Kyle Kuzma,LAL,0.044123,7.0
11714,Derrick Rose,2TM,0.04015,8.0
11731,Doug McDermott,IND,0.029648,9.0
12048,Shake Milton,PHI,0.028581,10.0


## Run all awards and compare (once MVP is stable)

This produces a small summary table and exports per-award results (metrics + winner ranks).

In [8]:
# =============================
# Multi-award run (optional)
# =============================

run_all = True  # set to False to skip the multi-award run and its exports

# Results go to <PROJECT_ROOT>/data/experiments/logreg_baseline/<award>/<timestamp>/
# NOTE: this rebinds RESULTS_DIR, which the first cell pointed at
# data/processed/modeling.
EXPERIMENTS_DIR = PROJECT_ROOT / "data" / "experiments"
RESULTS_DIR = EXPERIMENTS_DIR / "logreg_baseline"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

if run_all:
    rows = []
    # One timestamp for the whole run so every award lands in a folder of the same name.
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")

    for a in AWARDS:
        print(f"\n### Running {a.upper()} ###")
        # `train_end` / `val_end` come from the single-award cell. This call also
        # overwrites the notebook-level model/df/X/y/res left by that cell.
        model, df, X, y, res = fit_eval_award(a, train_end=train_end, val_end=val_end)

        # Keep scalar metrics only; the winner-rank DataFrames are exported separately below.
        row = {"award": a, "train_end": train_end, "val_end": val_end}
        for k, v in res.items():
            if isinstance(v, (float, int)):
                row[k] = float(v)
        rows.append(row)

        # Export result artifacts
        out_dir = RESULTS_DIR / a / ts
        out_dir.mkdir(parents=True, exist_ok=True)

        # Save metrics
        (out_dir / "metrics.json").write_text(json.dumps(row, indent=2), encoding="utf-8")

        # Save winner rank tables
        res["val_winner_ranks"].to_parquet(out_dir / "val_winner_ranks.parquet")
        res["test_winner_ranks"].to_parquet(out_dir / "test_winner_ranks.parquet")

        print(f"[OK] exported results to {out_dir}")

    # Best validation MRR first.
    summary = pd.DataFrame(rows).sort_values("val_mrr", ascending=False)
    display(summary)



### Running MVP ###
[MVP] rows=14,411 seasons=30 winners/season(min/med/max)=1/1/1
train: rows=10,527 seasons=23 range=1996→2018 positives=23
  val: rows=1,599 seasons=3 range=2019→2021 positives=3
 test: rows=2,285 seasons=4 range=2022→2025 positives=4


[OK] exported results to C:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\logreg_baseline\mvp\20260124_072725

### Running DPOY ###
[DPOY] rows=14,411 seasons=30 winners/season(min/med/max)=1/1/1
train: rows=10,527 seasons=23 range=1996→2018 positives=23
  val: rows=1,599 seasons=3 range=2019→2021 positives=3
 test: rows=2,285 seasons=4 range=2022→2025 positives=4


[OK] exported results to C:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\logreg_baseline\dpoy\20260124_072725

### Running SMOY ###
[SMOY] rows=9,346 seasons=30 winners/season(min/med/max)=1/1/1
train: rows=6,709 seasons=23 range=1996→2018 positives=23
  val: rows=1,065 seasons=3 range=2019→2021 positives=3
 test: rows=1,572 seasons=4 range=2022→2025 positives=4


[OK] exported results to C:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\logreg_baseline\smoy\20260124_072725

### Running ROY ###
[ROY] rows=2,422 seasons=30 winners/season(min/med/max)=1/1/2
train: rows=1,689 seasons=23 range=1996→2018 positives=24
  val: rows=317 seasons=3 range=2019→2021 positives=3
 test: rows=416 seasons=4 range=2022→2025 positives=4


[OK] exported results to C:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\logreg_baseline\roy\20260124_072725

### Running MIP ###
[MIP] rows=9,026 seasons=28 winners/season(min/med/max)=1/1/1
train: rows=6,611 seasons=21 range=1998→2018 positives=21
  val: rows=990 seasons=3 range=2019→2021 positives=3
 test: rows=1,425 seasons=4 range=2022→2025 positives=4


[OK] exported results to C:\Users\Luc\Documents\projets-data\nba-awards-predictor\data\experiments\logreg_baseline\mip\20260124_072725


Unnamed: 0,award,train_end,val_end,val_aucpr,val_top1,val_mrr,val_top3,val_top5,val_top10,test_aucpr,test_top1,test_mrr,test_top3,test_top5,test_top10
3,roy,2018,2021,0.833333,1.0,1.0,1.0,1.0,1.0,0.541667,0.5,0.75,1.0,1.0,1.0
2,smoy,2018,2021,0.916667,0.666667,0.833333,1.0,1.0,1.0,0.639205,0.5,0.675,0.75,1.0,1.0
1,dpoy,2018,2021,0.512195,0.333333,0.513333,0.666667,0.666667,0.666667,0.107707,0.0,0.158645,0.25,0.25,0.25
0,mvp,2018,2021,0.191805,0.333333,0.412698,0.333333,0.333333,0.666667,0.403788,0.5,0.675,0.75,1.0,1.0
4,mip,2018,2021,0.105802,0.0,0.189033,0.333333,0.333333,0.666667,0.557418,0.5,0.5625,0.5,0.5,0.75


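A linear, interpretable baseline trained on percentile-based season-relative features achieves strong ranking performance on several subjective NBA awards: the winner is in the top 5 for every validation and test season on the eligibility-constrained ROY and on SMOY, and MVP reaches competitive shortlist accuracy on the test seasons (Top-5 = 1.0) but not on validation (Top-5 ≈ 0.33). Results are clearly weaker for DPOY (test Top-1 = 0.0, test MRR ≈ 0.16) and unstable for MIP (validation Top-1 = 0.0). With only 3 validation and 4 test seasons per award, all of these estimates have high variance and should be read as indicative rather than conclusive.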