# **IMPORTS**

In [1]:
import numpy as np
import pandas as pd
from zoneinfo import ZoneInfo
import nfl_data_py as nfl
import requests
from tqdm import tqdm
from collections import defaultdict
from time import sleep
import os
import json
import numpy as np, pandas as pd, gc
import xgboost as xgb
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import time
import numpy as np
import datetime
from pathlib import Path
import warnings
import re
from math import radians, sin, cos, sqrt, atan2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output

warnings.filterwarnings("ignore")

# **PULL GAME LIST AND ATTACH WEATHER/STADIUM DATA**

In [2]:
TZ = ZoneInfo("America/New_York")
SEASONS = list(range(2000, 2026))
TEAM_FIX = {
    "OAK": "LVR",   # Raiders
    "LV":  "LVR",
    "SD":  "LAC",   # Chargers
    "STL": "LAR",   # Rams
    "LA":  "LAR",   # Rams
    "WSH": "WAS",   # Commanders
}
def fix_team_codes(df, cols=["home_team","away_team"]):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].replace(TEAM_FIX)
    return df

def to_et(series: pd.Series) -> pd.Series:
    s = pd.to_datetime(series, errors="coerce")
    tz = getattr(s.dt, "tz", None)
    return s.dt.tz_localize(TZ, nonexistent="NaT", ambiguous="NaT") if tz is None else s.dt.tz_convert(TZ)

sched_raw = nfl.import_schedules(SEASONS).copy()
sched_raw = fix_team_codes(sched_raw, ["home_team","away_team"])

if "kickoff_et" in sched_raw.columns and sched_raw["kickoff_et"].notna().any():
    sched_raw["kickoff_et"] = to_et(sched_raw["kickoff_et"])
else:
    day = sched_raw.get("gameday")
    time_ = sched_raw.get("gametime")
    combo = (day.fillna("").astype(str) + " " + time_.fillna("").astype(str)) if (day is not None and time_ is not None) else (day if day is not None else pd.Series(index=sched_raw.index, dtype=object))
    sched_raw["kickoff_et"] = to_et(combo)

sched_raw["game_date"] = pd.to_datetime(sched_raw.get("gameday"), errors="coerce")
mask = sched_raw["game_date"].isna()
sched_raw.loc[mask, "game_date"] = sched_raw.loc[mask, "kickoff_et"].dt.date

sched_raw["season"] = pd.to_numeric(sched_raw["season"], errors="coerce").astype("Int64")
sched_raw["week"] = sched_raw["week"].astype(str)

if "game_type" in sched_raw.columns:
    sched_raw["game_type"] = sched_raw["game_type"].astype(str).str.upper()
else:
    sched_raw["game_type"] = pd.NA

sched_raw["season_phase"] = sched_raw["game_type"].map({
    "PRE": "Preseason",
    "REG": "Regular",
    "WC": "Postseason",
    "DIV": "Postseason",
    "CON": "Postseason",
    "SB": "Postseason"
})

for c in ["home_score", "away_score", "stadium"]:
    if c not in sched_raw.columns:
        sched_raw[c] = pd.NA

if sched_raw["stadium"].isna().all() and "stadium_name" in sched_raw.columns:
    sched_raw["stadium"] = sched_raw["stadium_name"]

need_cols = ['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday',
       'gametime', 'away_team', 'away_score', 'home_team', 'home_score',
       'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis',
       'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest',
       'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds',
       'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game',
       'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id',
       'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee',
       'stadium_id', 'stadium', 'kickoff_et', 'game_date', 'season_phase']

missing = [c for c in need_cols if c not in sched_raw.columns]
if missing:
    raise RuntimeError(f"Schedule is missing required columns: {missing}")

sched = (
    sched_raw[need_cols]
    .dropna(subset=["game_id", "home_team", "away_team"])
    .loc[lambda df: df["kickoff_et"].notna()]
    .sort_values(["season", "game_type", "kickoff_et", "home_team"])
    .reset_index(drop=True)
)

stadium_metadata = pd.DataFrame([
    ("ARI","State Farm Stadium",33.5273,-112.2633,"Retractable","Grass",2006,2025),
    ("ATL","Georgia Dome",33.7575,-84.4008,"Dome","Turf",2000,2016),
    ("ATL","Mercedes-Benz Stadium",33.7555,-84.4009,"Retractable","Turf",2017,2025),
    ("BAL","M&T Bank Stadium",39.2779,-76.6227,"Open","Grass",2000,2025),
    ("BUF","Highmark Stadium",42.7738,-78.7875,"Open","Turf",2000,2025),
    ("CAR","Bank of America Stadium",35.2258,-80.8536,"Open","Grass",2000,2025),
    ("CHI","Soldier Field",41.8623,-87.6167,"Open","Grass",2003,2025),
    ("CIN","Paycor Stadium",39.0954,-84.5161,"Open","Turf",2000,2025),
    ("CLE","Cleveland Browns Stadium",41.5061,-81.6996,"Open","Grass",2000,2025),
    ("DAL","AT&T Stadium",32.7473,-97.0945,"Retractable","Turf",2009,2025),
    ("DEN","Empower Field",39.7439,-105.0201,"Open","Grass",2001,2025),
    ("DET","Pontiac Silverdome",42.6463,-83.2875,"Dome","Turf",2000,2001),
    ("DET","Ford Field",42.3391,-83.0456,"Dome","Turf",2002,2025),
    ("GB","Lambeau Field",44.5013,-88.0622,"Open","Grass",2000,2025),
    ("HOU","NRG Stadium",29.6847,-95.4107,"Retractable","Turf",2002,2025),
    ("IND","Lucas Oil Stadium",39.7601,-86.1639,"Retractable","Turf",2008,2025),
    ("JAX","EverBank Stadium",30.3239,-81.6372,"Open","Turf",2000,2025),
    ("KC","Arrowhead Stadium",39.0490,-94.4839,"Open","Grass",2000,2025),
    ("LVR","Allegiant Stadium",36.0909,-115.1830,"Dome","Grass",2020,2025),
    ("LAR","SoFi Stadium",33.9535,-118.3392,"Open","Turf",2020,2025),
    ("LAC","SoFi Stadium",33.9535,-118.3392,"Open","Turf",2020,2025),
    ("MIA","Hard Rock Stadium",25.9580,-80.2389,"Open","Grass",2000,2025),
    ("MIN","Metrodome",44.9737,-93.2581,"Dome","Turf",2000,2013),
    ("MIN","U.S. Bank Stadium",44.9740,-93.2596,"Dome","Turf",2016,2025),
    ("NE","Foxboro Stadium",42.0910,-71.2643,"Open","Grass",2000,2001),
    ("NE","Gillette Stadium",42.0909,-71.2643,"Open","Turf",2002,2025),
    ("NO","Caesars Superdome",29.9509,-90.0816,"Dome","Turf",2000,2025),
    ("NYG","MetLife Stadium",40.8135,-74.0744,"Open","Turf",2010,2025),
    ("NYJ","MetLife Stadium",40.8135,-74.0744,"Open","Turf",2010,2025),
    ("PHI","Lincoln Financial Field",39.9008,-75.1675,"Open","Grass",2003,2025),
    ("PIT","Acrisure Stadium",40.4468,-80.0158,"Open","Grass",2001,2025),
    ("SEA","Lumen Field",47.5952,-122.3316,"Open","Turf",2002,2025),
    ("SF","Levi's Stadium",37.4030,-121.9700,"Open","Grass",2014,2025),
    ("TB","Raymond James Stadium",27.9762,-82.5033,"Open","Grass",2000,2025),
    ("TEN","Nissan Stadium",36.1665,-86.7713,"Open","Grass",2000,2025),
    ("WAS","FedExField",38.9078,-76.8644,"Open","Grass",2000,2025),
], columns=["team","stadium","lat","lon","roof_type","surface","start_season","end_season"])

sched = sched.drop(columns=[c for c in sched.columns if any(sub in c for sub in ["stadium","lat","lon","roof_type","surface"])], errors="ignore")

sched = sched.merge(
    stadium_metadata.rename(columns={"team":"home_team"}),
    on="home_team",
    how="left",
    validate="many_to_many"
)

sched = sched[
    (sched["season"] >= sched["start_season"]) &
    (sched["season"] <= sched["end_season"])
].drop(columns=["start_season","end_season"])

sched = fix_team_codes(sched, ["home_team","away_team"])

## **ONLY RUN BEFORE GAME DAY**

In [3]:
VC_API_KEY = "D6EL2EK34R2CW82T6C8XZ3F74"
VC_UNIT_GROUP = "us"
VC_FIELDS = [
    "temp", "humidity", "windspeed", "windgust", "precip", "wx",
    "cloudcover", "visibility", "uvindex", "pressure", "dew",
    "conditions", "icon", "windchill", "heatindex"
]
VC_FIELDS = ["temp", "humidity", "windspeed","precip"]
VC_CACHE_DIR = Path(".weather_cache")
VC_CACHE_DIR.mkdir(parents=True, exist_ok=True)

def build_vc_url(lat, lon, dt):
    dt_str = pd.to_datetime(dt).strftime("%Y-%m-%dT%H:%M:%S")
    base = "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline"
    fields = ",".join(["datetime"] + VC_FIELDS)
    return (
        f"{base}/{lat},{lon}/{dt_str}"
        f"?key={VC_API_KEY}&unitGroup={VC_UNIT_GROUP}&include=hours&elements={fields}&contentType=json"
    )

def fetch_weather(lat, lon, kickoff_et):
    kickoff_dt = pd.to_datetime(kickoff_et)
    kickoff_utc_str = kickoff_dt.tz_convert("UTC").strftime('%Y%m%dT%H%M')
    key = f"{lat}_{lon}_{kickoff_utc_str}.json"
    cache_path = VC_CACHE_DIR / key

    if cache_path.exists():
        try:
            return json.loads(cache_path.read_text())
        except Exception:
            cache_path.unlink(missing_ok=True)

    url = build_vc_url(lat, lon, kickoff_dt)
    for _ in range(3):
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            data = resp.json()

            cache_path.write_text(json.dumps(data))
            return data
        except Exception as e:
            print(f"[ERROR] Failed fetch for {lat},{lon} @ {kickoff_et} → {e}")
            sleep(0.05)

    return None
for i, row in sched.iterrows():
    lat, lon, kickoff_et = row["lat"], row["lon"], row["kickoff_et"]
    game_id = row.get("game_id", f"row{i}")
    result = fetch_weather(lat, lon, kickoff_et)
    if result:
        print(f"[OK] {game_id} → Cached")
    else:
        print(f"[FAIL] {game_id} → No data")

[OK] 2000_19_NO_MIN → Cached
[OK] 2000_19_BAL_TEN → Cached
[OK] 2000_01_SF_ATL → Cached
[OK] 2000_01_JAX_CLE → Cached
[OK] 2000_01_IND_KC → Cached
[OK] 2000_01_CHI_MIN → Cached
[OK] 2000_01_TB_NE → Cached
[OK] 2000_01_DET_NO → Cached
[OK] 2000_01_CAR_WAS → Cached
[OK] 2000_01_NYJ_GB → Cached
[OK] 2000_01_SEA_MIA → Cached
[OK] 2000_01_TEN_BUF → Cached
[OK] 2000_02_JAX_BAL → Cached
[OK] 2000_02_GB_BUF → Cached
[OK] 2000_02_CLE_CIN → Cached
[OK] 2000_02_MIA_MIN → Cached
[OK] 2000_02_CHI_TB → Cached
[OK] 2000_02_KC_TEN → Cached
[OK] 2000_02_WAS_DET → Cached
[OK] 2000_03_ATL_CAR → Cached
[OK] 2000_03_PIT_CLE → Cached
[OK] 2000_03_TB_DET → Cached
[OK] 2000_03_PHI_GB → Cached
[OK] 2000_03_CIN_JAX → Cached
[OK] 2000_03_SD_KC → Cached
[OK] 2000_03_MIN_NE → Cached
[OK] 2000_03_BAL_MIA → Cached
[OK] 2000_03_DAL_WAS → Cached
[OK] 2000_04_STL_ATL → Cached
[OK] 2000_04_CIN_BAL → Cached
[OK] 2000_04_NE_MIA → Cached
[OK] 2000_04_PHI_NO → Cached
[OK] 2000_04_NYJ_TB → Cached
[OK] 2000_05_IND_BUF → Cache

In [4]:
VC_FIELDS = [
    "temp", "humidity", "windspeed", "windgust", "precip", "wx",
    "cloudcover", "visibility", "uvindex", "pressure", "dew",
    "conditions", "icon", "windchill", "heatindex"
]
VC_FIELDS = ["temp", "humidity", "windspeed", "precip"]
VC_CACHE_DIR = Path(".weather_cache")

def _parse_hours_to_utc(data: dict) -> pd.DataFrame:
    """Return hourly dataframe with a UTC datetime column 'dt_utc'."""
    hours = pd.DataFrame(data["days"][0]["hours"]).copy()

    if "datetimeEpoch" in hours.columns:
        hours["dt_utc"] = pd.to_datetime(hours["datetimeEpoch"], unit="s", utc=True)
    else:
        local_dt = pd.to_datetime(
            data["days"][0]["datetime"] + "T" + hours["datetime"].astype(str),
            errors="coerce"
        )
        tz = data.get("timezone", "UTC")
        hours["dt_localized"] = (
            local_dt
              .dt.tz_localize(tz, ambiguous="NaT", nonexistent="shift_forward")
        )
        hours = hours.dropna(subset=["dt_localized"])
        hours["dt_utc"] = hours["dt_localized"].dt.tz_convert("UTC")

    return hours

attached_games = 0
skipped_games = 0

for i, row in sched.iterrows():
    game_id = row.get("game_id", f"row{i}")
    try:
        lat, lon = row["lat"], row["lon"]
        kickoff_et = pd.to_datetime(row["kickoff_et"])
        if getattr(kickoff_et, "tzinfo", None) is None:
            kickoff_et = kickoff_et.tz_localize("America/New_York")
        kickoff_utc = kickoff_et.tz_convert("UTC")

        cache_key = f"{lat}_{lon}_{kickoff_utc.strftime('%Y%m%dT%H%M')}.json"
        cache_path = VC_CACHE_DIR / cache_key
        if not cache_path.exists():
            loose_key = f"{lat}_{lon}_{kickoff_utc.strftime('%Y%m%dT%H')}"
            matches = list(VC_CACHE_DIR.glob(f"{loose_key}*.json"))
            if matches:
                cache_path = matches[0]
            else:
                print(f"[SKIP] {game_id} – Cache missing")
                skipped_games += 1
                continue

        data = json.loads(cache_path.read_text())
        hourly_df = _parse_hours_to_utc(data)
        pre = hourly_df.loc[hourly_df["dt_utc"] <= kickoff_utc]
        if pre.empty:
            print(f"[SKIP] {game_id} – No hourly weather before kickoff")
            skipped_games += 1
            continue

        latest = pre.iloc[-1]
        for field in VC_FIELDS:
            val = latest.get(field, pd.NA)
            if pd.notna(val):
                sched.at[i, f"weather_{field}"] = val

        attached_games += 1

    except Exception as e:
        print(f"[ERROR] {game_id} – {e}")
        skipped_games += 1

print(f"\nDone: Attached weather to {attached_games} games, skipped {skipped_games}.")

[ERROR] 2025_10_PHI_GB – 'hours'
[ERROR] 2025_11_NYJ_NE – 'hours'
[ERROR] 2025_11_WAS_MIA – 'hours'
[ERROR] 2025_11_CAR_ATL – 'hours'
[ERROR] 2025_11_TB_BUF – 'hours'
[ERROR] 2025_11_LAC_JAX – 'hours'
[ERROR] 2025_11_CHI_MIN – 'hours'
[ERROR] 2025_11_GB_NYG – 'hours'
[ERROR] 2025_11_CIN_PIT – 'hours'
[ERROR] 2025_11_HOU_TEN – 'hours'
[ERROR] 2025_11_SF_ARI – 'hours'
[ERROR] 2025_11_SEA_LA – 'hours'
[ERROR] 2025_11_BAL_CLE – 'hours'
[ERROR] 2025_11_KC_DEN – 'hours'
[ERROR] 2025_11_DET_PHI – 'hours'
[ERROR] 2025_11_DAL_LV – 'hours'
[ERROR] 2025_12_BUF_HOU – 'hours'
[ERROR] 2025_12_NYJ_BAL – 'hours'
[ERROR] 2025_12_PIT_CHI – 'hours'
[ERROR] 2025_12_NE_CIN – 'hours'
[ERROR] 2025_12_NYG_DET – 'hours'
[ERROR] 2025_12_MIN_GB – 'hours'
[ERROR] 2025_12_IND_KC – 'hours'
[ERROR] 2025_12_SEA_TEN – 'hours'
[ERROR] 2025_12_JAX_ARI – 'hours'
[ERROR] 2025_12_CLE_LV – 'hours'
[ERROR] 2025_12_PHI_DAL – 'hours'
[ERROR] 2025_12_ATL_NO – 'hours'
[ERROR] 2025_12_TB_LA – 'hours'
[ERROR] 2025_12_CAR_SF – 'hou

# **TEAM/PLAER SPECIFIC DATA**

## **Step 1: Betting Odds**

In [5]:
print("[DEBUG] sched is", type(sched), "with shape:", getattr(sched, 'shape', None))


def _implied_prob_from_ml(ml_series: pd.Series) -> pd.Series:
    """Convert moneyline odds to implied probability (vig not removed)."""
    ml = pd.to_numeric(ml_series, errors="coerce")
    out = pd.Series(np.nan, index=ml.index)
    pos = ml > 0
    out.loc[pos] = 100 / (ml.loc[pos] + 100)
    out.loc[~pos] = (-ml.loc[~pos]) / ((-ml.loc[~pos]) + 100)
    return out


def attach_pre_kick_lines(sched: pd.DataFrame) -> pd.DataFrame:
    """
    Attach *closing* pre-kickoff market columns using schedules as canonical source.
    Explicitly filters to the last line snapshot at or before kickoff to avoid leakage.
    """
    try:
        games_raw = nfl.import_schedules(SEASONS).copy()
    except Exception as e:
        print(f"[PHASE 2] WARN: could not re-load schedules for market fields ({e}).")
        return sched  # fallback to original sched

    # --- Normalize column names ---
    rename_map = {
        "over_under_line": "total_line",
        "total_close": "total_line",
        "total": "total_line",
        "spread_close": "spread_line",
        "spread": "spread_line",
        "home_ml": "home_moneyline",
        "away_ml": "away_moneyline",
    }
    games_raw = games_raw.rename(columns={k: v for k, v in rename_map.items() if k in games_raw.columns})

    wanted = ["game_id", "spread_line", "total_line", "home_moneyline", "away_moneyline", "line_time"]
    have = [c for c in wanted if c in games_raw.columns]

    if "game_id" not in have:
        print("[PHASE 2] No usable market columns found. Continuing without lines.")
        return sched

    lines = games_raw[have].dropna(subset=["game_id"]).copy()

    if "line_time" in lines.columns:
        sched_kick = sched[["game_id", "kickoff_et"]].copy()
        sched_kick["kickoff_utc"] = sched_kick["kickoff_et"].dt.tz_convert("UTC")
        lines = lines.merge(sched_kick, on="game_id", how="left")
        lines = lines[lines["line_time"] <= lines["kickoff_utc"]]
        lines = (
            lines.sort_values(["game_id", "line_time"])
            .groupby("game_id")
            .tail(1)
        )

        lines.drop(columns=["line_time", "kickoff_utc"], inplace=True)

    if {"home_moneyline", "away_moneyline"}.issubset(lines.columns):
        lines["home_implied_win_prob"] = _implied_prob_from_ml(lines["home_moneyline"])
        lines["away_implied_win_prob"] = _implied_prob_from_ml(lines["away_moneyline"])
        lines["market_edge_prob"] = lines["home_implied_win_prob"] - lines["away_implied_win_prob"]

    sched2 = sched.merge(lines, on="game_id", how="left")
    sched2 = sched2.loc[:, ~sched2.columns.duplicated()].copy()

    for base in [
        "spread_line", "spread_favorite", "home_implied_win_prob", "away_implied_win_prob", "market_edge_prob",
        "home_moneyline", "away_moneyline", "total_line"
    ]:
        xcol, ycol = f"{base}_x", f"{base}_y"
        if xcol in sched2.columns and ycol in sched2.columns:
            sched2[base] = sched2[ycol].combine_first(sched2[xcol])
            sched2.drop(columns=[xcol, ycol], inplace=True)


    if "spread_favorite" not in sched2.columns and "spread_line" in sched2.columns:
        sched2["spread_favorite"] = np.where(
            sched2["spread_line"] < 0, sched2["home_team"], sched2["away_team"]
        )

    if {"spread_line", "spread_favorite", "home_team"}.issubset(sched2.columns):
        sched2["home_spread_line"] = np.where(
            sched2["spread_favorite"] == sched2["home_team"],
            sched2["spread_line"],   
            -sched2["spread_line"]   
        )
    elif "spread_line" in sched2.columns:
        sched2["home_spread_line"] = sched2["spread_line"]

    for col in ["spread_line", "total_line", "home_moneyline", "away_moneyline"]:
        if col in sched2.columns:
            sched2[col] = pd.to_numeric(sched2[col], errors="coerce")

    if "spread_line" in sched2.columns:
        sched2["spread_abs"] = sched2["spread_line"].abs()
        sched2["fav_is_home"] = (sched2["spread_line"] < 0).astype(int)
        sched2["pickem_game"] = (sched2["spread_abs"] < 0.5).astype(int)

    if {"total_line", "spread_line"}.issubset(sched2.columns):
        sched2["implied_home_score"] = (sched2["total_line"] / 2) - (sched2["spread_line"] / 2)
        sched2["implied_away_score"] = (sched2["total_line"] / 2) + (sched2["spread_line"] / 2)

    if {"home_implied_win_prob", "away_implied_win_prob"}.issubset(sched2.columns):
        sched2["avg_implied_prob"] = (sched2["home_implied_win_prob"] + sched2["away_implied_win_prob"]) / 2
        sched2["implied_odds_ratio"] = sched2["home_implied_win_prob"] / sched2["away_implied_win_prob"]
        sched2["heavy_favorite"] = (
            sched2[["home_implied_win_prob", "away_implied_win_prob"]].max(axis=1) > 0.70
        ).astype(int)
        sched2["close_game"] = (
            sched2[["home_implied_win_prob", "away_implied_win_prob"]].max(axis=1) < 0.55
        ).astype(int)

    if {"home_moneyline", "away_moneyline"}.issubset(sched2.columns):
        sched2["ml_diff"] = sched2["home_moneyline"] - sched2["away_moneyline"]
        sched2["ml_avg"] = (sched2["home_moneyline"] + sched2["away_moneyline"]) / 2

    added = [c for c in sched2.columns if c not in sched.columns and c != "game_id"]
    print(f"[PHASE 2] Added market/derived columns (leakage-safe): {', '.join(added) if added else '(none)'}")
    return sched2


sched2 = attach_pre_kick_lines(sched)
if sched2 is not None:
    sched = sched2
else:
    print("[PHASE 2] WARNING: attach_pre_kick_lines returned None. Skipping market enrichment.")

mkt_cols = [c for c in [
    "provider","spread_line","spread_abs","spread_home_adv","fav_is_home","pickem_game",
    "total_line","implied_home_score","implied_away_score","implied_score_diff","high_total_game",
    "home_moneyline","away_moneyline","ml_diff","ml_avg",
    "home_implied_win_prob","away_implied_win_prob","avg_implied_prob","implied_odds_ratio",
    "market_edge_prob","heavy_favorite"
] if c in sched.columns]

[DEBUG] sched is <class 'pandas.core.frame.DataFrame'> with shape: (5932, 55)
[PHASE 2] Added market/derived columns (leakage-safe): home_implied_win_prob, away_implied_win_prob, market_edge_prob, spread_favorite, home_spread_line, spread_abs, fav_is_home, pickem_game, implied_home_score, implied_away_score, avg_implied_prob, implied_odds_ratio, heavy_favorite, close_game, ml_diff, ml_avg


## **Step 2: PLAY BY PLAY AGG**

### **Download PBP AND IMPORT**

In [6]:
_cols_needed = ['game_id', 'season', 'week', 'game_type', 'season_phase', 'home_team',
       'away_team', 'home_score', 'away_score', 'kickoff_et', 'game_date',
       'stadium', 'lat', 'lon', 'roof_type', 'surface', 'weather_temp',
       'weather_humidity', 'weather_windspeed', 'weather_precip',
       'spread_line', 'total_line', 'home_moneyline', 'away_moneyline',
       'home_implied_win_prob', 'away_implied_win_prob', 'market_edge_prob',
       'spread_favorite', 'home_spread_line', 'spread_abs', 'fav_is_home',
       'pickem_game', 'implied_home_score', 'implied_away_score',
       'avg_implied_prob', 'implied_odds_ratio', 'heavy_favorite',
       'close_game', 'ml_diff', 'ml_avg']

keep = [c for c in _cols_needed if c in sched.columns]
remainder = [c for c in sched.columns if c not in keep]
sched = sched[keep + remainder]

if not sched.columns.is_unique:
    sched = sched.loc[:, ~sched.columns.duplicated()].copy()

home = (
    sched.rename(columns={"home_team": "team", "away_team": "opp"})
         .assign(is_home=1, is_away=0)
)
away = (
    sched.rename(columns={"away_team": "team", "home_team": "opp"})
         .assign(is_home=0, is_away=1)
)

home = home.loc[:, ~home.columns.duplicated()].copy()
away = away.loc[:, ~away.columns.duplicated()].copy()
_common = home.columns.intersection(away.columns)
team_games = pd.concat([home[_common], away[_common]], ignore_index=True, sort=False)
for c in ("team", "opp"):
    team_games[c] = team_games[c].astype(str)

print(f"[PHASE 3] team_games shape: {team_games.shape} (should be 2x sched rows)")
print(team_games.head(4)[["game_id","team","opp","is_home","is_away"]])

def _downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    fcols = df.select_dtypes(include=["float64"]).columns
    for c in fcols:
        df[c] = pd.to_numeric(df[c], downcast="float")
    icols = df.select_dtypes(include=["int64","int32","int16","int8",
                                      "uint64","uint32","uint16","uint8"]).columns
    for c in icols:
        df[c] = pd.to_numeric(df[c], downcast="integer")
    return df

def import_pbp_safely(SEASONS):
    frames = []
    for yr in SEASONS:
        try:
            tmp = nfl.import_pbp_data([yr]).copy()
            tmp["season"] = yr
            frames.append(_downcast_numeric(tmp))
            print(f"{yr} done.")
        except Exception as e:
            print(f"HTTP/Load issue for {yr}: {e}. Skipping this season.")
    if not frames:
        raise RuntimeError("[PHASE 3] No PBP data loaded for any season.")
    out = pd.concat(frames, ignore_index=True)
    return _downcast_numeric(out)

pbp = import_pbp_safely(SEASONS)



if ("posteam" in pbp.columns) and ("defteam" in pbp.columns):
    pbp = pbp[pbp["posteam"].notna() & pbp["defteam"].notna()].copy()
else:
    raise RuntimeError("[PHASE 3] PBP payload missing 'posteam' or 'defteam'")

pbp["epa"] = pd.to_numeric(pbp.get("epa", np.nan), errors="coerce")
pbp["success"] = pd.to_numeric(pbp.get("success", np.nan), errors="coerce")
pbp["yards_gained"] = pd.to_numeric(pbp.get("yards_gained", np.nan), errors="coerce")
pbp["touchdown"] = pd.to_numeric(pbp.get("touchdown", 0), errors="coerce").fillna(0).astype(int)

if "game_id" in pbp.columns and "game_date" in sched.columns:
    pbp = pbp.merge(sched[["game_id","game_date"]], on="game_id", how="left")

drive_summary = (
    pbp.groupby(["season","game_id","posteam","drive"])
       .agg(
           drive_plays=("play_id","count"),
           drive_yards=("yards_gained","sum"),
           drive_points=("touchdown","sum"),
       )
       .reset_index()
)

drive_summary_game = (
    drive_summary.groupby(["season","game_id","posteam"])
                 .agg(
                     avg_drive_len=("drive_plays","mean"),
                     avg_drive_yards=("drive_yards","mean"),
                     points_per_drive=("drive_points","mean"),
                 )
                 .reset_index()
                 .rename(columns={"posteam":"team"})
)

drive_summary_game = (
    drive_summary_game
    .sort_values(["team","season","game_id"])
    .assign(
        team_id=lambda df: df["team"],
        season_id=lambda df: df["season"],
        gid=lambda df: df["game_id"]
    )
    .groupby("team")
    .apply(lambda g: g.shift(1).assign(team=g["team"], season=g["season"], game_id=g["game_id"]))
    .reset_index(drop=True)
)

season_team_means = (
    pbp.groupby(["season","posteam","game_id"])
       .agg(season_epa=("epa","mean"),
            season_success=("success","mean"))
       .reset_index()
       .rename(columns={"posteam":"team"})
)

season_team_means = (
    season_team_means
    .merge(sched[["game_id","game_date"]], on="game_id", how="left")
    .sort_values(["team","season","game_date"])
    .groupby(["team","season"], group_keys=False)
    .apply(lambda g: g.assign(
        season_epa=g["season_epa"].expanding().mean().shift(1),
        season_success=g["season_success"].expanding().mean().shift(1)
    ))
)

if "wp" in pbp.columns and "quarter" in pbp.columns:
    pbp["garbage_time"] = ((pbp["wp"] < 0.05) | (pbp["wp"] > 0.95)) & (pbp["quarter"] >= 4)
else:
    pbp["garbage_time"] = False

garbage_game = (
    pbp.groupby(["season","game_id","posteam"])
       .agg(
           garbage_play_rate=("garbage_time","mean"),
           garbage_epa=("epa", lambda x: x[pbp.loc[x.index,"garbage_time"]==1].mean()
                        if (pbp.loc[x.index,"garbage_time"]==1).any() else np.nan)
       )
       .reset_index()
       .rename(columns={"posteam":"team"})
)


garbage_game = (
    garbage_game
    .sort_values(["team","season","game_id"])
    .groupby("team")
    .shift(1)
    .assign(team=garbage_game["team"], 
            season=garbage_game["season"], 
            game_id=garbage_game["game_id"])
)


team_games = team_games.sort_values(["team","game_date"])
team_games["prev_game_date"] = team_games.groupby("team")["game_date"].shift(1)
team_games["rest_days"] = (
    (pd.to_datetime(team_games["game_date"]) - pd.to_datetime(team_games["prev_game_date"]))
    .dt.days
)
team_games["short_rest"] = (team_games["rest_days"] < 6).astype(int)
team_games["bye_rest"]   = (team_games["rest_days"] >= 11).astype(int)
team_games["travel_dist"] = np.nan  # placeholder for haversine calc

if {"spread_line","total_line"}.issubset(sched.columns):
    sched["home_implied_pts"] = sched["total_line"]/2 - sched["spread_line"]/2
    sched["away_implied_pts"] = sched["total_line"]/2 + sched["spread_line"]/2
else:
    sched["home_implied_pts"] = np.nan
    sched["away_implied_pts"] = np.nan
    print("[PHASE 3] WARN: no total_line/spread_line found → implied points set to NaN")

team_features = (
    drive_summary_game
    .merge(garbage_game, on=["season","game_id","team"], how="left")
    .merge(season_team_means, on=["season","team","game_id"], how="left")
)

team_games = team_games.merge(team_features, on=["game_id","team"], how="left")

print(f"[PHASE 3] Finished leakage-safe features. team_games shape={team_games.shape}")

[PHASE 3] team_games shape: (11864, 73) (should be 2x sched rows)
           game_id team  opp  is_home  is_away
0   2000_19_NO_MIN  MIN   NO        1        0
1  2000_19_BAL_TEN  TEN  BAL        1        0
2   2000_01_SF_ATL  ATL   SF        1        0
3  2000_01_JAX_CLE  CLE  JAX        1        0
2000 done.
Downcasting floats.
2000 done.
2001 done.
Downcasting floats.
2001 done.
2002 done.
Downcasting floats.
2002 done.
2003 done.
Downcasting floats.
2003 done.
2004 done.
Downcasting floats.
2004 done.
2005 done.
Downcasting floats.
2005 done.
2006 done.
Downcasting floats.
2006 done.
2007 done.
Downcasting floats.
2007 done.
2008 done.
Downcasting floats.
2008 done.
2009 done.
Downcasting floats.
2009 done.
2010 done.
Downcasting floats.
2010 done.
2011 done.
Downcasting floats.
2011 done.
2012 done.
Downcasting floats.
2012 done.
2013 done.
Downcasting floats.
2013 done.
2014 done.
Downcasting floats.
2014 done.
2015 done.
Downcasting floats.
2015 done.
2016 done.
Downcasting floa

In [7]:
tg_home = (
    team_games.loc[team_games["is_home"] == 1]
              .rename(columns=lambda c: f"home_{c}" if c not in ["game_id"] else c)
)
tg_away = (
    team_games.loc[team_games["is_away"] == 1]
              .rename(columns=lambda c: f"away_{c}" if c not in ["game_id"] else c)
)
tg_home = tg_home.drop(columns=[c for c in tg_home.columns if c.startswith("home_is_")], errors="ignore")
tg_away = tg_away.drop(columns=[c for c in tg_away.columns if c.startswith("away_is_")], errors="ignore")
sched = sched.merge(tg_home, on="game_id", how="left", validate="many_to_one")
sched = sched.merge(tg_away, on="game_id", how="left", validate="many_to_one")

for col in sched.columns:
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)


print("[PHASE 3] sched enriched with team_games context:", sched.shape)

[PHASE 3] sched enriched with team_games context: (5932, 240)


## **STEP 3: ADVANCED OFFENSE AND DEFENSE STATS**

In [8]:
def team_game_pace_secs(sub):
    if "game_seconds_remaining" not in sub.columns:
        return np.nan
    s = pd.to_numeric(sub["game_seconds_remaining"], errors="coerce")
    s = s.dropna().sort_values(ascending=False)
    if len(s) < 2:
        return np.nan
    dt = -s.diff()
    dt = dt[(dt > 0) & (dt < 60)]
    return float(dt.median()) if len(dt) else np.nan


grp_off = pbp.groupby(["season","game_id","posteam"], observed=True)

game_off = grp_off.agg(
    plays=("play_id","count"),
    epa_sum=("epa","sum"),
    epa_per_play=("epa","mean"),
    success_rate=("success","mean"),
    pass_plays=("pass","sum"),
    rush_plays=("rush","sum"),
    yards_gained=("yards_gained","sum"),
    penalties=("penalty","sum"),
    sacks=("sack","sum"),
    qb_hits=("qb_hit","sum"),
    int_thrown=("interception","sum"),
    fumbles=("fumble","sum"),
).reset_index()

game_off = (
    game_off.sort_values(["posteam","season","game_id"])
            .groupby("posteam")
            .shift(1)
            .assign(posteam=game_off["posteam"],
                    season=game_off["season"],
                    game_id=game_off["game_id"])
)

pbp["is_dropback"] = (
    pbp["qb_dropback"].astype("Int64")
    if "qb_dropback" in pbp.columns
    else ((pbp["pass"].astype("Int64") == 1) | (pbp["sack"].astype("Int64") == 1)).astype("Int64")
)

pbp["third_down"] = (pd.to_numeric(pbp["down"], errors="coerce") == 3).astype("Int64")
conv_ok = (pbp["first_down"].astype("Int64") == 1) | (
    pd.to_numeric(pbp["yards_gained"], errors="coerce") >= pd.to_numeric(pbp["ydstogo"], errors="coerce")
)
pbp["third_conv"] = ((pbp["third_down"] == 1) & conv_ok).astype("Int64")
pbp["red_zone"] = (pd.to_numeric(pbp["yardline_100"], errors="coerce") <= 20).astype("Int64")
pbp["g2g"] = (
    (pbp.get("goal_to_go", 0).astype("Int64") == 1)
    if "goal_to_go" in pbp.columns else pbp["red_zone"]
)

if ("rush_touchdown" in pbp.columns) or ("pass_touchdown" in pbp.columns):
    pbp["is_td"] = ((pbp.get("rush_touchdown", 0).astype("Int64") == 1) |
                    (pbp.get("pass_touchdown", 0).astype("Int64") == 1)).astype("Int64")
else:
    pbp["is_td"] = (pbp.get("touchdown", 0).astype("Int64") == 1).astype("Int64")

pbp["rz_td"] = (pbp["red_zone"] == 1) & (pbp["is_td"] == 1)
pbp["g2g_td"] = (pbp["g2g"] == 1) & (pbp["is_td"] == 1)

EXP_PASS, EXP_RUSH = 20, 10
pbp["explosive_pass"] = ((pbp["pass"].astype("Int64") == 1) &
                         (pd.to_numeric(pbp["yards_gained"], errors="coerce") >= EXP_PASS)).astype("Int64")
pbp["explosive_rush"] = ((pbp["rush"].astype("Int64") == 1) &
                         (pd.to_numeric(pbp["yards_gained"], errors="coerce") >= EXP_RUSH)).astype("Int64")

if "field_goal_attempt" in pbp.columns and pbp["field_goal_attempt"].sum() > 0:
    pbp["fg_att"] = (pbp["field_goal_attempt"].astype("Int64") == 1).astype("Int64")
    pbp["fg_made"] = (pbp.get("field_goal_result", "").astype(str).str.lower() == "made").astype("Int64")
else:
    res = pbp.get("field_goal_result")
    if res is not None:
        res = res.astype(str).str.lower()
        pbp["fg_att"] = res.isin(["made", "missed", "blocked"]).astype("Int64")
        pbp["fg_made"] = (res == "made").astype("Int64")
    else:
        pbp["fg_att"] = 0
        pbp["fg_made"] = 0

pbp["fumble_lost"] = (
    pd.to_numeric(pbp.get("fumble_lost", 0), errors="coerce").fillna(0).astype("Int64")
    if "fumble_lost" in pbp.columns else 0
)
pbp["off_turnover"] = ((pbp["interception"].astype("Int64") == 1) |
                       (pbp["fumble_lost"].astype("Int64") == 1)).astype("Int64")

aux_off = grp_off.agg(
    dropbacks=("is_dropback","sum"),
    qb_hits=("qb_hit","sum"),
    sacks=("sack","sum"),
    scrambles=("qb_scramble","sum"),
    third_down_plays=("third_down","sum"),
    third_down_conversions=("third_conv","sum"),
    red_zone_plays=("red_zone","sum"),
    red_zone_tds=("rz_td","sum"),
    g2g_plays=("g2g","sum"),
    g2g_tds=("g2g_td","sum"),
    explosive_pass=("explosive_pass","sum"),
    explosive_rush=("explosive_rush","sum"),
    fg_attempts=("fg_att","sum"),
    fg_made=("fg_made","sum"),
    turnovers=("off_turnover","sum"),
    shotgun_plays=("shotgun","sum"),
    no_huddle_plays=("no_huddle","sum"),
    pass_plays=("pass","sum"),
    rush_plays=("rush","sum"),
    penalty_yards=("penalty_yards","sum"),
    air_yards_sum=("air_yards","sum"),
).reset_index()

base_vols = grp_off.agg(
    plays=("play_id","count"),
    pass_plays=("pass","sum"),
    rush_plays=("rush","sum")
).reset_index()

aux_off = base_vols.merge(aux_off, on=["season","game_id","posteam"], how="left", validate="one_to_one")

if "complete_pass" in pbp.columns:
    comp_df = grp_off["complete_pass"].sum().reset_index(name="completions")
    aux_off = aux_off.merge(comp_df, on=["season","game_id","posteam"], how="left")
else:
    aux_off["completions"] = np.nan

if "yac_yards" in pbp.columns:
    yac_off = grp_off["yac_yards"].sum().reset_index(name="yac_sum")
    aux_off = aux_off.merge(yac_off, on=["season","game_id","posteam"], how="left")
    denom = np.where(aux_off["completions"].fillna(0) > 0, aux_off["completions"], aux_off["pass_plays"])
    aux_off["yac_per_comp"] = np.where(denom > 0, aux_off["yac_sum"] / denom, np.nan)
else:
    aux_off["yac_per_comp"] = np.nan

pace = grp_off.apply(team_game_pace_secs).reset_index(name="pace_sec_per_play")
aux_off = aux_off.merge(pace, on=["season","game_id","posteam"], how="left")

if "xpass" in pbp.columns:
    xpass_off = grp_off["xpass"].mean().reset_index(name="xpass_mean")
    aux_off = aux_off.merge(xpass_off, on=["season","game_id","posteam"], how="left")

pbp["short_pass"] = (pbp["pass"] == 1) & (pbp["air_yards"] < 10)
pbp["deep_pass"] = (pbp["pass"] == 1) & (pbp["air_yards"] >= 20)
pbp["two_min_drill"] = (
    pbp["qtr"].isin([2, 4]) & (pbp["game_seconds_remaining"] <= 120)
)

ext_off = grp_off.agg(
    rz_epa=("epa", lambda x: x[pbp.loc[x.index,"red_zone"] == 1].mean()),
    g2g_epa=("epa", lambda x: x[pbp.loc[x.index,"g2g"] == 1].mean()),
    third_epa=("epa", lambda x: x[pbp.loc[x.index,"third_down"] == 1].mean()),
    fourth_epa=("epa", lambda x: x[pbp.loc[x.index,"down"] == 4].mean()),
    two_min_epa=("epa", lambda x: x[pbp.loc[x.index,"two_min_drill"] == 1].mean()),
    deep_epa=("epa", lambda x: x[pbp.loc[x.index,"deep_pass"] == 1].mean()),
    short_pass_ct=("short_pass","sum"),
    deep_pass_ct=("deep_pass","sum"),
    avg_start_fp=("yardline_100","mean"),
    three_and_outs=("play_id", lambda g: ((g.count() <= 3) & (g.max() < 4)).sum()),
).reset_index()

ext_off = (
    ext_off.sort_values(["posteam","season","game_id"])
           .groupby("posteam")
           .shift(1)
           .assign(posteam=ext_off["posteam"],
                   season=ext_off["season"],
                   game_id=ext_off["game_id"])
)

aux_off = aux_off.merge(ext_off, on=["season","game_id","posteam"], how="left")

for col in ["plays", "pass_plays", "rush_plays"]:
    col_x, col_y = f"{col}_x", f"{col}_y"
    if col_x in aux_off.columns or col_y in aux_off.columns:
        aux_off[col] = aux_off.get(col_x, aux_off.get(col_y))
        aux_off.drop(columns=[c for c in [col_x, col_y] if c in aux_off.columns], inplace=True)
    elif col not in aux_off.columns:
        aux_off[col] = 0

aux_off["third_down_conv_rate"] = np.where(aux_off["third_down_plays"]>0,
    aux_off["third_down_conversions"]/aux_off["third_down_plays"], np.nan)
aux_off["red_zone_td_rate"] = np.where(aux_off["red_zone_plays"]>0,
    aux_off["red_zone_tds"]/aux_off["red_zone_plays"], np.nan)
aux_off["g2g_td_rate"] = np.where(aux_off["g2g_plays"]>0,
    aux_off["g2g_tds"]/aux_off["g2g_plays"], np.nan)
aux_off["explosive_pass_rate"] = np.where(aux_off["pass_plays"]>0,
    aux_off["explosive_pass"]/aux_off["pass_plays"], np.nan)
aux_off["explosive_rush_rate"] = np.where(aux_off["rush_plays"]>0,
    aux_off["explosive_rush"]/aux_off["rush_plays"], np.nan)
aux_off["sack_rate"] = np.where(aux_off["dropbacks"]>0,
    aux_off["sacks"]/aux_off["dropbacks"], np.nan)
aux_off["pressure_rate"] = np.where(aux_off["dropbacks"]>0,
    (aux_off["sacks"]+aux_off["qb_hits"])/aux_off["dropbacks"], np.nan)
aux_off["scramble_rate"] = np.where(aux_off["dropbacks"]>0,
    aux_off["scrambles"]/aux_off["dropbacks"], np.nan)
aux_off["shotgun_rate"] = np.where(aux_off["plays"]>0,
    aux_off["shotgun_plays"]/aux_off["plays"], np.nan)
aux_off["no_huddle_rate"] = np.where(aux_off["plays"]>0,
    aux_off["no_huddle_plays"]/aux_off["plays"], np.nan)
aux_off["fg_make_rate"] = np.where(aux_off["fg_attempts"]>0,
    aux_off["fg_made"]/aux_off["fg_attempts"], np.nan)
aux_off["turnover_rate"] = np.where(aux_off["plays"]>0,
    aux_off["turnovers"]/aux_off["plays"], np.nan)
aux_off["penalty_yards_per_play"] = np.where(aux_off["plays"]>0,
    aux_off["penalty_yards"]/aux_off["plays"], np.nan)
aux_off["a_dot"] = np.where(aux_off["pass_plays"]>0,
    aux_off["air_yards_sum"]/aux_off["pass_plays"], np.nan)

aux_off = (
    aux_off.sort_values(["posteam","season","game_id"])
           .groupby("posteam")
           .shift(1)
           .assign(posteam=aux_off["posteam"],
                   season=aux_off["season"],
                   game_id=aux_off["game_id"])
)

game_off = game_off.merge(aux_off, on=["season","game_id","posteam"], how="left")


if "plays" not in game_off.columns:
    game_off["plays"] = 0
if "pass_plays" not in game_off.columns:
    game_off["pass_plays"] = 0

if "xpass_mean" in game_off.columns:
    game_off["pass_rate"] = np.where(game_off["plays"]>0,
                                     game_off["pass_plays"]/game_off["plays"], np.nan)
    game_off["proe"] = game_off["pass_rate"] - game_off["xpass_mean"]
else:
    game_off["proe"] = np.nan

print(f"[PHASE 4] game_off shape: {game_off.shape} (lagged, leakage-safe)")

def check_game_off_leakage(game_off_raw, game_off_shifted):
    """
    Compare raw vs shifted game_off to confirm that each game's features 
    come from the prior game for that team.
    """
    merged = (
        game_off_raw
        .sort_values(["posteam","season","game_id"])
        .merge(
            game_off_shifted[["posteam","game_id","plays"]], 
            on=["posteam","game_id"], 
            how="left", 
            suffixes=("", "_shifted")
        )
    )

    results = []
    for team, group in merged.groupby("posteam"):
        group = group.sort_values("game_id").reset_index(drop=True)
        for i in range(1, len(group)):
            current_game = group.loc[i, "game_id"]
            shifted_val = group.loc[i, "plays_shifted"]
            prior_game_val = group.loc[i-1, "plays"]
            if shifted_val != prior_game_val:
                results.append((team, current_game, shifted_val, prior_game_val))

    if results:
        print("[WARN] Potential mismatches detected:")
        for r in results[:10]:
            print(r)
    else:
        print("✅ No leakage detected: every game uses only the prior game's stats.")

check_game_off_leakage(game_off_raw=game_off, game_off_shifted=game_off)
TEAM_FIX = {
    "LA": "LAR",   # Rams
    "LV": "LVR",   # Raiders
    "WSH": "WAS",  # Commanders
    "":   np.nan,  
}

def fix_team_codes(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].replace(TEAM_FIX)
    return df

pbp = fix_team_codes(pbp, ["posteam", "defteam"])
game_off = fix_team_codes(game_off, ["posteam"])

[PHASE 4] game_off shape: (13822, 69) (lagged, leakage-safe)
✅ No leakage detected: every game uses only the prior game's stats.


In [9]:
game_off_renamed = game_off.rename(columns={"posteam": "team"})

home_off = game_off_renamed.rename(
    columns={c: f"home_{c}" for c in game_off_renamed.columns 
             if c not in ["game_id","season","week","team"]}
)
away_off = game_off_renamed.rename(
    columns={c: f"away_{c}" for c in game_off_renamed.columns 
             if c not in ["game_id","season","week","team"]}
)

sched = sched.merge(
    home_off,
    left_on=["game_id","home_team"],
    right_on=["game_id","team"],
    how="left",
    suffixes=("", "_drop")   
)

sched = sched.drop(columns=[c for c in sched.columns if c.endswith("_drop")])

sched = sched.merge(
    away_off,
    left_on=["game_id","away_team"],
    right_on=["game_id","team"],
    how="left",
    suffixes=("", "_drop")  
)

sched = sched.drop(columns=[c for c in sched.columns if c.endswith("_drop")])

sched = sched.drop(columns=["team"], errors="ignore")

print("[MERGE] sched enriched with offensive features:", sched.shape)
for col in sched.columns:
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)
print([c for c in sched.columns if "_x" in c or "_y" in c])  

[MERGE] sched enriched with offensive features: (5932, 372)
['home_avg_drive_yards', 'away_avg_drive_yards', 'home_yards_gained', 'home_penalty_yards', 'home_air_yards_sum', 'home_yac_per_comp', 'home_xpass_mean', 'home_penalty_yards_per_play', 'away_yards_gained', 'away_penalty_yards', 'away_air_yards_sum', 'away_yac_per_comp', 'away_xpass_mean', 'away_penalty_yards_per_play']


In [10]:
d = pbp.copy()
grp_def = d.groupby(["season","game_id","defteam"], observed=True)
d["def_dropbacks_allowed"] = d["is_dropback"].astype("Int64")
d["def_third_down"] = d["third_down"].astype("Int64")
d["def_third_conv_allowed"] = d["third_conv"].astype("Int64")
d["def_red_zone"] = d["red_zone"].astype("Int64")
d["def_rz_td_allowed"] = d["rz_td"].astype("Int64")
d["def_g2g"] = d["g2g"].astype("Int64")
d["def_g2g_td_allowed"] = d["g2g_td"].astype("Int64")
d["def_explosive_pass_allowed"] = d["explosive_pass"].astype("Int64")
d["def_explosive_rush_allowed"] = d["explosive_rush"].astype("Int64")

d["def_takeaway"] = (
    (d["interception"].astype("Int64") == 1) |
    (d["fumble_lost"].astype("Int64") == 1)
).astype("Int64")

game_def = grp_def.agg(
    def_plays=("play_id", "count"),
    def_epa_per_play=("epa", "mean"),
    def_success_rate=("success", "mean"),
    def_yards_per_play=("yards_gained", "mean"),
    def_sacks=("sack", "sum"),
    def_qb_hits=("qb_hit", "sum"),
    def_int_made=("interception", "sum"),
    def_fumbles_forced=("fumble", "sum"),
    def_penalties=("penalty", "sum"),
).reset_index().rename(columns={"defteam": "team"})

aux_def = grp_def.agg(
    def_dropbacks=("def_dropbacks_allowed", "sum"),
    def_third_down_plays=("def_third_down", "sum"),
    def_third_down_conversions_allowed=("def_third_conv_allowed", "sum"),
    def_red_zone_plays=("def_red_zone", "sum"),
    def_red_zone_tds_allowed=("def_rz_td_allowed", "sum"),
    def_g2g_plays=("def_g2g", "sum"),
    def_g2g_tds_allowed=("def_g2g_td_allowed", "sum"),
    def_explosive_pass_allowed=("def_explosive_pass_allowed", "sum"),
    def_explosive_rush_allowed=("def_explosive_rush_allowed", "sum"),
    def_takeaways=("def_takeaway", "sum"),
    def_penalty_yards=("penalty_yards", "sum"),
    def_pass_plays_allowed=("pass", "sum"),
    def_rush_plays_allowed=("rush", "sum"),
).reset_index().rename(columns={"defteam": "team"})

aux_def["def_third_down_conv_rate_allowed"] = np.where(
    aux_def["def_third_down_plays"] > 0,
    aux_def["def_third_down_conversions_allowed"] / aux_def["def_third_down_plays"],
    np.nan
)
aux_def["def_red_zone_td_rate_allowed"] = np.where(
    aux_def["def_red_zone_plays"] > 0,
    aux_def["def_red_zone_tds_allowed"] / aux_def["def_red_zone_plays"],
    np.nan
)
aux_def["def_g2g_td_rate_allowed"] = np.where(
    aux_def["def_g2g_plays"] > 0,
    aux_def["def_g2g_tds_allowed"] / aux_def["def_g2g_plays"],
    np.nan
)
aux_def["def_explosive_pass_rate_allowed"] = np.where(
    aux_def["def_dropbacks"] > 0,
    aux_def["def_explosive_pass_allowed"] / aux_def["def_dropbacks"],
    np.nan
)
aux_def["def_explosive_rush_rate_allowed"] = np.where(
    aux_def["def_rush_plays_allowed"].fillna(0) > 0,
    aux_def["def_explosive_rush_allowed"] / aux_def["def_rush_plays_allowed"],
    np.nan
)

ext_def = grp_def.agg(
    def_rz_epa_allowed=("epa", lambda x: x[d.loc[x.index,"red_zone"] == 1].mean()),
    def_g2g_epa_allowed=("epa", lambda x: x[d.loc[x.index,"g2g"] == 1].mean()),
    def_third_epa_allowed=("epa", lambda x: x[d.loc[x.index,"third_down"] == 1].mean()),
    def_fourth_epa_allowed=("epa", lambda x: x[d.loc[x.index,"down"] == 4].mean()),
    def_two_min_epa_allowed=("epa", lambda x: x[d.loc[x.index,"two_min_drill"] == 1].mean()
                             if "two_min_drill" in d else np.nan),
    def_deep_epa_allowed=("epa", lambda x: x[d.loc[x.index,"deep_pass"] == 1].mean()
                          if "deep_pass" in d else np.nan),
    def_short_pass_ct_allowed=("short_pass","sum") if "short_pass" in d else ("play_id","count"),
    def_deep_pass_ct_allowed=("deep_pass","sum") if "deep_pass" in d else ("play_id","count"),
    def_avg_start_fp_allowed=("yardline_100", "mean"),
).reset_index().rename(columns={"defteam": "team"})

game_def = (
    game_def.merge(aux_def, on=["season","game_id","team"], how="left")
            .merge(ext_def, on=["season","game_id","team"], how="left")
)

game_def = (
    game_def.sort_values(["team","season","game_id"])
            .groupby("team")
            .shift(1)
            .assign(team=game_def["team"],
                    season=game_def["season"],
                    game_id=game_def["game_id"])
)

print(f"[PHASE 4.4] game_def shape: {game_def.shape} (lagged, leakage-safe)")


def check_leakage_def(game_def):
    problems = {}

    first_games = (
        game_def.sort_values(["team","season","game_id"])
                .groupby("team").head(1)
    )
    if first_games.drop(columns=["team","season","game_id"]).notna().any().any():
        problems["first_games"] = "Some first games have non-NaN values (should all be NaN)."

    suspicious_cols = [
        "def_plays","def_sacks","def_int_made","def_fumbles_forced",
        "def_qb_hits","def_penalties","def_takeaways"
    ]
    for col in suspicious_cols:
        if col in game_def.columns:
            non_null = game_def[col].notna().sum()
            if non_null == 0:
                problems[col] = "Column is entirely NaN (check pipeline)."
    bad_counts = game_def[suspicious_cols].lt(0).any()
    if bad_counts.any():
        problems["negatives"] = bad_counts[bad_counts].index.tolist()

    if not problems:
        print("✅ Defense leakage check passed: lagging appears safe.")
    else:
        print("⚠️ Defense leakage issues detected:")
        for k,v in problems.items():
            print(f"  - {k}: {v}")

check_leakage_def(game_def)
home_def = game_def.rename(columns={c: f"home_{c}" for c in game_def.columns if c not in ["game_id","season","team"]})
away_def = game_def.rename(columns={c: f"away_{c}" for c in game_def.columns if c not in ["game_id","season","team"]})

[PHASE 4.4] game_def shape: (13788, 39) (lagged, leakage-safe)
✅ Defense leakage check passed: lagging appears safe.


In [11]:
game_def_renamed = game_def.rename(columns={"team": "team"})
home_def = game_def_renamed.rename(
    columns={c: f"home_def_{c}" for c in game_def_renamed.columns if c not in ["game_id","season","week","team"]}
)
away_def = game_def_renamed.rename(
    columns={c: f"away_def_{c}" for c in game_def_renamed.columns if c not in ["game_id","season","week","team"]}
)
sched = sched.merge(
    home_def, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left"
)
sched = sched.merge(
    away_def, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left"
)
sched = sched.drop(columns=[c for c in sched.columns if c in ["team"]])

print("[MERGE] sched enriched with defensive features:", sched.shape)
for col in sched.columns:
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)
print([c for c in sched.columns if "_x" in c or "_y" in c])

[MERGE] sched enriched with defensive features: (5932, 434)
['home_avg_drive_yards', 'away_avg_drive_yards', 'home_yards_gained', 'home_penalty_yards', 'home_air_yards_sum', 'home_yac_per_comp', 'home_xpass_mean', 'home_penalty_yards_per_play', 'away_yards_gained', 'away_penalty_yards', 'away_air_yards_sum', 'away_yac_per_comp', 'away_xpass_mean', 'away_penalty_yards_per_play', 'home_def_def_yards_per_play', 'home_def_def_penalty_yards', 'away_def_def_yards_per_play', 'away_def_def_penalty_yards']


# **Cleanup**

In [12]:
threshold = 0.5  
missing_ratio = sched.isna().mean()
drop_cols = missing_ratio[missing_ratio > threshold].index.tolist()

print(f"[CLEANUP] Dropping {len(drop_cols)} columns (>50% missing): {drop_cols[:10]}{'...' if len(drop_cols) > 10 else ''}")

sched = sched.drop(columns=drop_cols)

print("[CLEANUP] New shape:", sched.shape)

[CLEANUP] Dropping 16 columns (>50% missing): ['nfl_detail_id', 'ftn', 'home_nfl_detail_id', 'home_ftn', 'home_travel_dist', 'home_garbage_epa', 'away_nfl_detail_id', 'away_ftn', 'away_travel_dist', 'away_garbage_epa']...
[CLEANUP] New shape: (5932, 415)


## **STEP 4: ADVANCED OFFENSE AND DEFENSE STATS**

In [13]:
if "home_team_x" in sched.columns and "away_team_x" in sched.columns:
    sched = sched.rename(columns={"home_team_x": "home_team", "away_team_x": "away_team"})

if {"home_implied_win_prob","away_implied_win_prob"}.issubset(sched.columns):
    sched["home_vigfree"] = sched["home_implied_win_prob"] / (
        sched["home_implied_win_prob"] + sched["away_implied_win_prob"]
    )
    sched["away_vigfree"] = 1 - sched["home_vigfree"]
    sched["elo_diff_market"] = -400 * np.log10(1/sched["home_vigfree"] - 1)

def lagged_team_stat(df, group_cols, value_cols, how="mean"):
    """
    Strict leakage-safe lag:
    - If per-game, sorts within team-season by game_id.
    - If per-season aggregates, sorts by group_cols only.
    """
    if isinstance(value_cols, str):
        value_cols = [value_cols]

    sort_keys = list(group_cols)
    if "game_id" in df.columns and "game_id" not in sort_keys:
        sort_keys.append("game_id")

    df_sorted = df.sort_values(sort_keys)

    out = (
        df_sorted.groupby(group_cols, group_keys=False)[value_cols]
                 .apply(lambda g: g.expanding().agg(how).shift())
    )
    return out

TEAM_FIX = {"LA":"LAR","LV":"LVR","WSH":"WAS","":np.nan}
pbp["posteam"] = pbp["posteam"].replace(TEAM_FIX)
pbp["defteam"] = pbp["defteam"].replace(TEAM_FIX)

if "neutral" in pbp.columns:
    pbp["neutral_pass"] = np.where(pbp["neutral"]==1, pbp["pass"], np.nan)
    pbp["early_pass"]   = np.where(pbp["down"].isin([1,2]), pbp["pass"], np.nan)
    pbp["late_pass"]    = np.where(pbp["down"].isin([3,4]), pbp["pass"], np.nan)

    temp = (
        pbp.groupby(["season","game_id","posteam"], observed=True)[
            ["neutral_pass","early_pass","late_pass"]
        ].mean().reset_index()
    )

    temp["neutral_pass_rate_lag"] = lagged_team_stat(temp, ["season","posteam"], "neutral_pass")
    temp["early_pass_rate_lag"]   = lagged_team_stat(temp, ["season","posteam"], "early_pass")
    temp["late_pass_rate_lag"]    = lagged_team_stat(temp, ["season","posteam"], "late_pass")

    temp = temp.rename(columns={"posteam":"team"})
    home_ext = temp.rename(columns={c:f"home_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    away_ext = temp.rename(columns={c:f"away_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    sched = sched.merge(home_ext, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left")
    sched = sched.merge(away_ext, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left")
    sched = sched.drop(columns=["team_x","team_y"], errors="ignore")

pbp["score_diff"] = pbp["posteam_score"] - pbp["defteam_score"]
pbp["lead_pass"]  = np.where(pbp["score_diff"] > 0, pbp["pass"], np.nan)
pbp["trail_pass"] = np.where(pbp["score_diff"] < 0, pbp["pass"], np.nan)

temp = (
    pbp.groupby(["season","game_id","posteam"], observed=True)[
        ["lead_pass","trail_pass"]
    ].mean().reset_index()
)
temp["lead_pass_rate_lag"]  = lagged_team_stat(temp, ["season","posteam"], "lead_pass")
temp["trail_pass_rate_lag"] = lagged_team_stat(temp, ["season","posteam"], "trail_pass")
temp["script_sensitivity_lag"] = temp["lead_pass_rate_lag"] - temp["trail_pass_rate_lag"]

temp = temp.rename(columns={"posteam":"team"})
home_ext = temp.rename(columns={c:f"home_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
away_ext = temp.rename(columns={c:f"away_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
sched = sched.merge(home_ext, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left")
sched = sched.merge(away_ext, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left")
sched = sched.drop(columns=["team_x","team_y"], errors="ignore")

if "team_context" not in locals():
    team_context = (
        pbp.groupby(["season","posteam"], observed=True)
           .agg(
               season_epa_per_play=("epa","mean"),
               season_success_rate=("success","mean")
           )
           .reset_index()
           .rename(columns={"posteam": "team"})
    )

season_avgs = (
    pbp.groupby("season")
       .agg(global_epa=("epa","mean"),
            global_success=("success","mean"))
       .reset_index()
)

team_context = team_context.merge(season_avgs, on="season", how="left")

team_context["season_epa_per_play"] = lagged_team_stat(
    team_context, ["season","team"], "season_epa_per_play"
)
team_context["season_success_rate"] = lagged_team_stat(
    team_context, ["season","team"], "season_success_rate"
)

team_context = team_context.merge(season_avgs, on="season", how="left", suffixes=("","_dup"))

team_context["epa_above_avg"]     = team_context["season_epa_per_play"] - team_context["global_epa"]
team_context["success_above_avg"] = team_context["season_success_rate"] - team_context["global_success"]

def _pace(sub):
    s = pd.to_numeric(sub["game_seconds_remaining"], errors="coerce").dropna().sort_values(ascending=False)
    if len(s) < 2: return np.nan
    dt = -s.diff()
    dt = dt[(dt > 0) & (dt < 60)]
    return float(dt.median()) if len(dt) else np.nan

pbp["state"] = np.where(pbp["score_diff"]>0,"leading",
                        np.where(pbp["score_diff"]<0,"trailing","neutralish"))
temp = (
    pbp.groupby(["season","game_id","posteam","state"])
       .apply(_pace).reset_index(name="pace_sec")
)
pace_split = temp.pivot(index=["season","game_id","posteam"], columns="state", values="pace_sec").reset_index()
pace_split["pace_neutral_lag"]  = lagged_team_stat(pace_split, ["season","posteam"], "neutralish")
pace_split["pace_leading_lag"]  = lagged_team_stat(pace_split, ["season","posteam"], "leading")
pace_split["pace_trailing_lag"] = lagged_team_stat(pace_split, ["season","posteam"], "trailing")

pace_split = pace_split.rename(columns={"posteam":"team"})
home_ext = pace_split.rename(columns={c:f"home_{c}" for c in pace_split.columns if c not in ["game_id","season","team"]})
away_ext = pace_split.rename(columns={c:f"away_{c}" for c in pace_split.columns if c not in ["game_id","season","team"]})
sched = sched.merge(home_ext, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left")
sched = sched.merge(away_ext, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left")
sched = sched.drop(columns=["team_x","team_y"], errors="ignore")

team_games["travel_dist"]    = np.nan
team_games["tz_shift_hours"] = np.nan

if "punt_attempt" in pbp.columns:
    temp = (
        pbp.groupby(["season","game_id","posteam"])
           .agg(punt_attempts=("punt_attempt","sum"),
                punt_blocked=("punt_blocked","sum"))
           .reset_index()
    )
    temp["punt_block_rate"] = np.where(temp["punt_attempts"]>0, temp["punt_blocked"]/temp["punt_attempts"], np.nan)
    temp["punt_block_rate_lag"] = lagged_team_stat(temp, ["season","posteam"], "punt_block_rate")
    temp = temp.rename(columns={"posteam":"team"})
    home_ext = temp.rename(columns={c:f"home_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    away_ext = temp.rename(columns={c:f"away_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    sched = sched.merge(home_ext, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left")
    sched = sched.merge(away_ext, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left")
    sched = sched.drop(columns=["team_x","team_y"], errors="ignore")

if "kickoff_attempt" in pbp.columns:
    temp = (
        pbp.groupby(["season","game_id","posteam"])
           .agg(kickoff_attempts=("kickoff_attempt","sum"),
                kickoff_tb=("kickoff_in_endzone","sum"))
           .reset_index()
    )
    temp["ko_touchback_rate"] = np.where(temp["kickoff_attempts"]>0, temp["kickoff_tb"]/temp["kickoff_attempts"], np.nan)
    temp["ko_touchback_rate_lag"] = lagged_team_stat(temp, ["season","posteam"], "ko_touchback_rate")
    temp = temp.rename(columns={"posteam":"team"})
    home_ext = temp.rename(columns={c:f"home_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    away_ext = temp.rename(columns={c:f"away_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    sched = sched.merge(home_ext, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left")
    sched = sched.merge(away_ext, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left")
    sched = sched.drop(columns=["team_x","team_y"], errors="ignore")

if "garbage_time" in pbp.columns:
    temp = (
        pbp.groupby(["season","game_id","posteam"])
           .agg(garbage_play_rate=("garbage_time","mean"),
                garbage_epa=("epa", lambda x: x[pbp.loc[x.index,"garbage_time"]==1].mean()
                             if (pbp.loc[x.index,"garbage_time"]==1).any() else np.nan))
           .reset_index()
    )
    temp["garbage_play_rate_lag"] = lagged_team_stat(temp, ["season","posteam"], "garbage_play_rate")
    temp["garbage_epa_lag"]       = lagged_team_stat(temp, ["season","posteam"], "garbage_epa")
    temp = temp.rename(columns={"posteam":"team"})
    home_ext = temp.rename(columns={c:f"home_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    away_ext = temp.rename(columns={c:f"away_{c}" for c in temp.columns if c not in ["game_id","season","team"]})
    sched = sched.merge(home_ext, left_on=["game_id","home_team"], right_on=["game_id","team"], how="left")
    sched = sched.merge(away_ext, left_on=["game_id","away_team"], right_on=["game_id","team"], how="left")
    sched = sched.drop(columns=["team_x","team_y"], errors="ignore")

print(f"[PHASE 5] Added leakage-safe advanced features. sched shape: {sched.shape}, team_games shape: {team_games.shape}, team_context shape: {team_context.shape}")

for col in sched.columns.tolist():  
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].iloc[:,0].combine_first(sched[xcol].iloc[:,0]) \
            if isinstance(sched[ycol], pd.DataFrame) else sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)
print([c for c in sched.columns if "_x" in c or "_y" in c])

[PHASE 5] Added leakage-safe advanced features. sched shape: (5932, 474), team_games shape: (11864, 91), team_context shape: (830, 10)
['home_avg_drive_yards', 'away_avg_drive_yards', 'home_yards_gained', 'home_penalty_yards', 'home_air_yards_sum', 'home_xpass_mean', 'home_penalty_yards_per_play', 'away_yards_gained', 'away_penalty_yards', 'away_air_yards_sum', 'away_xpass_mean', 'away_penalty_yards_per_play', 'home_def_def_yards_per_play', 'home_def_def_penalty_yards', 'away_def_def_yards_per_play', 'away_def_def_penalty_yards']


In [14]:
threshold = 0.4 
missing_ratio = sched.isna().mean()
drop_cols = missing_ratio[missing_ratio > threshold].index.tolist()

print(f"[CLEANUP] Dropping {len(drop_cols)} columns (>50% missing): {drop_cols[:10]}{'...' if len(drop_cols) > 10 else ''}")

sched = sched.drop(columns=drop_cols)

print("[CLEANUP] New shape:", sched.shape)

[CLEANUP] Dropping 4 columns (>50% missing): ['home_garbage_epa', 'home_garbage_epa_lag', 'away_garbage_epa', 'away_garbage_epa_lag']
[CLEANUP] New shape: (5932, 458)


## **STEP 5A: STARTER FEATURES (OLD)**

In [15]:
import pandas as pd
import nfl_data_py as nfl
import socket

socket.setdefaulttimeout(60)
VALID_SEASONS = [yr for yr in SEASONS if yr <= 2025]
pf_weekly_list = []
pf_season_list = []
for yr in VALID_SEASONS:
    try:
        print(f"Fetching weekly data for {yr}...")
        weekly = nfl.import_weekly_data([yr])
        pf_weekly_list.append(weekly)
    except Exception as e:
        print(f"Error fetching weekly data for {yr}: {e}")

    try:
        print(f"Fetching seasonal data for {yr}...")
        seasonal = nfl.import_seasonal_data([yr])
        pf_season_list.append(seasonal)
    except Exception as e:
        print(f"Error fetching seasonal data for {yr}: {e}")

# Concatenate
pf_weekly = pd.concat(pf_weekly_list, ignore_index=True)
pf_season = pd.concat(pf_season_list, ignore_index=True)

print(f"Weekly shape: {pf_weekly.shape}, Seasonal shape: {pf_season.shape}")

for df in [pf_weekly, pf_season]:
    if "recent_team" in df.columns and "team" not in df.columns:
        df.rename(columns={"recent_team": "team"}, inplace=True)

TEAM_FIX = {
    "LA": "LAR",   # Rams
    "STL": "LAR",  # Old Rams
    "LV": "LVR",   # Raiders
    "OAK": "LVR",  # Old Raiders
    "SD": "LAC",   # Chargers
    "WSH": "WAS",  # Commanders
}
for df in [pf_weekly, pf_season]:
    if "team" in df.columns:
        df["team"] = df["team"].replace(TEAM_FIX)

def pf_season_to_date_stats(df, stat_cols):
    df = df.sort_values(["player_id","season","week"])
    out = []
    for pid, sub in df.groupby("player_id"):
        sub = sub.sort_values(["season","week"])
        rolling = sub[stat_cols].expanding().mean().shift(1)  
        sub_out = sub[["player_id","team","season","week"]].copy()
        for c in stat_cols:
            sub_out[f"{c}_s2d"] = rolling[c].values
        out.append(sub_out)
    return pd.concat(out, ignore_index=True)

def pf_pick_starters_safe(weekly_df):
    starters = []
    pos_map = {"QB":"attempts", "RB":"carries", "WR":"targets", "TE":"targets"}
    for pos, usage_col in pos_map.items():
        if usage_col not in weekly_df.columns:
            continue
        tmp = (weekly_df[weekly_df["position"]==pos]
               .groupby(["team","season","week","player_id"])[usage_col].sum()
               .reset_index())
        tmp = tmp.sort_values(["team","season","week"])
        tmp["usage_prev"] = tmp.groupby(["team","season","player_id"])[usage_col].shift(1)
        tmp = tmp.sort_values(["team","season","week","usage_prev"],
                              ascending=[True,True,True,False])
        tmp["rank"] = tmp.groupby(["team","season","week"]).cumcount()+1
        keep = tmp.loc[tmp["rank"] <= (2 if pos=="WR" else 1)].copy()
        keep["starter_role"] = keep["rank"].map({1:pos+"1", 2:pos+"2"})
        starters.append(keep)
    return pd.concat(starters, ignore_index=True)

pf_starters = pf_pick_starters_safe(pf_weekly)

pf_career_stats = (pf_weekly.groupby("player_id")
                   .agg(
                       career_pass_yds=("passing_yards","mean"),
                       career_pass_tds=("passing_tds","mean"),
                       career_rush_yds=("rushing_yards","mean"),
                       career_rush_tds=("rushing_tds","mean"),
                       career_rec_yds=("receiving_yards","mean"),
                       career_rec_tds=("receiving_tds","mean")
                   ).reset_index())

pf_stat_cols = ["passing_yards","passing_tds",
                "rushing_yards","rushing_tds",
                "receiving_yards","receiving_tds"]
pf_s2d_stats = pf_season_to_date_stats(pf_weekly, pf_stat_cols)

pf_player_features = (pf_starters
    .merge(pf_career_stats, on="player_id", how="left")
    .merge(pf_s2d_stats, on=["player_id","team","season","week"], how="left")
)

pf_player_features["season"] = pd.to_numeric(pf_player_features["season"], errors="coerce").astype("Int64")
pf_player_features["week"]   = pf_player_features["week"].astype(str)
pf_player_features["team"]   = pf_player_features["team"].replace(TEAM_FIX).astype(str)

sched["season"]     = pd.to_numeric(sched["season"], errors="coerce").astype("Int64")
sched["week"]       = sched["week"].astype(str)
sched["home_team"]  = sched["home_team"].replace(TEAM_FIX).astype(str)
sched["away_team"]  = sched["away_team"].replace(TEAM_FIX).astype(str)

pf_player_features = pf_player_features.merge(
    sched[["game_id","season","week","home_team","away_team"]],
    left_on=["team","season","week"],
    right_on=["home_team","season","week"],
    how="left"
).merge(
    sched[["game_id","season","week","home_team","away_team"]],
    left_on=["team","season","week"],
    right_on=["away_team","season","week"],
    how="left"
)

pf_player_features["game_id"] = pf_player_features["game_id_x"].fillna(pf_player_features["game_id_y"])
cols_to_drop = [c for c in ["game_id_x","game_id_y","home_team","away_team"] if c in pf_player_features.columns]
pf_player_features = pf_player_features.drop(columns=cols_to_drop)

pf_pivoted = (pf_player_features.pivot_table(
                index=["team","game_id"],
                columns="starter_role",
                values=[c for c in pf_player_features.columns if c.startswith("career_") or c.endswith("_s2d")]
             ).reset_index())
pf_pivoted.columns = ["_".join([str(c) for c in col if c]) for col in pf_pivoted.columns.values]

team_games = team_games.merge(pf_pivoted, on=["team","game_id"], how="left")

print(f"[PHASE 6] Added player-level features. team_games shape: {team_games.shape}")

Fetching weekly data for 2000...
Downcasting floats.
Fetching seasonal data for 2000...
Fetching weekly data for 2001...
Downcasting floats.
Fetching seasonal data for 2001...
Fetching weekly data for 2002...
Downcasting floats.
Fetching seasonal data for 2002...
Fetching weekly data for 2003...
Downcasting floats.
Fetching seasonal data for 2003...
Fetching weekly data for 2004...
Downcasting floats.
Fetching seasonal data for 2004...
Fetching weekly data for 2005...
Downcasting floats.
Fetching seasonal data for 2005...
Fetching weekly data for 2006...
Downcasting floats.
Fetching seasonal data for 2006...
Fetching weekly data for 2007...
Downcasting floats.
Fetching seasonal data for 2007...
Fetching weekly data for 2008...
Downcasting floats.
Fetching seasonal data for 2008...
Fetching weekly data for 2009...
Downcasting floats.
Fetching seasonal data for 2009...
Fetching weekly data for 2010...
Downcasting floats.
Fetching seasonal data for 2010...
Fetching weekly data for 2011...

In [16]:
sched = sched.merge(
    pf_pivoted,
    on=["team","game_id"],  
    how="left"
)

print(f"[MERGE] sched enriched with player-level features: {sched.shape}")
print("[CHECK] Columns now include player feature info:",
      [c for c in sched.columns if "career_" in c or "_s2d" in c])

for col in sched.columns.tolist(): 
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = (
            sched[ycol].iloc[:,0].combine_first(sched[xcol].iloc[:,0])
            if isinstance(sched[ycol], pd.DataFrame)
            else sched[ycol].combine_first(sched[xcol])
        )
        sched.drop(columns=[xcol, ycol], inplace=True)

print("[CHECK] Remaining duplicate columns:",
      [c for c in sched.columns if "_x" in c or "_y" in c])

[MERGE] sched enriched with player-level features: (5932, 518)
[CHECK] Columns now include player feature info: ['career_pass_tds_QB1', 'career_pass_tds_RB1', 'career_pass_tds_TE1', 'career_pass_tds_WR1', 'career_pass_tds_WR2', 'career_pass_yds_QB1', 'career_pass_yds_RB1', 'career_pass_yds_TE1', 'career_pass_yds_WR1', 'career_pass_yds_WR2', 'career_rec_tds_QB1', 'career_rec_tds_RB1', 'career_rec_tds_TE1', 'career_rec_tds_WR1', 'career_rec_tds_WR2', 'career_rec_yds_QB1', 'career_rec_yds_RB1', 'career_rec_yds_TE1', 'career_rec_yds_WR1', 'career_rec_yds_WR2', 'career_rush_tds_QB1', 'career_rush_tds_RB1', 'career_rush_tds_TE1', 'career_rush_tds_WR1', 'career_rush_tds_WR2', 'career_rush_yds_QB1', 'career_rush_yds_RB1', 'career_rush_yds_TE1', 'career_rush_yds_WR1', 'career_rush_yds_WR2', 'passing_tds_s2d_QB1', 'passing_tds_s2d_RB1', 'passing_tds_s2d_TE1', 'passing_tds_s2d_WR1', 'passing_tds_s2d_WR2', 'passing_yards_s2d_QB1', 'passing_yards_s2d_RB1', 'passing_yards_s2d_TE1', 'passing_yards_s2

## **STEP 6: REFS**

In [17]:
refs = nfl.import_officials(list(range(2000, 2025)))

def prep_refs(refs: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare referee crew information:
    - Normalizes season column
    - Normalizes any hidden/legacy team codes (future-proofing, consistent with snap_counts)
    - Aggregates crew size
    - Identifies head referee
    """
    df = refs.copy()
    df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
    TEAM_FIX = {
        "LA": "LAR",  # Rams
        "STL": "LAR", # Old Rams
        "LV": "LVR",  # Raiders
        "OAK": "LVR", # Old Raiders
        "SD": "LAC",  # Chargers
        "WSH": "WAS", # Commanders
    }
    if "team" in df.columns:
        df["team"] = df["team"].replace(TEAM_FIX)

    crew = (
        df.groupby(["game_id","season"], observed=True)
          .agg(ref_crew_size=("official_id","nunique"))
          .reset_index()
    )

    head = (
        df.loc[df["off_pos"]=="R", ["game_id","season","official_id","name"]]
          .rename(columns={"official_id":"head_ref_id","name":"head_ref_name"})
    )

    out = crew.merge(head, on=["game_id","season"], how="left")
    return out

refs_prepped = prep_refs(refs)

sched = sched.merge(refs_prepped, on=["game_id","season"], how="left")

print(f"[MERGE] sched enriched with referee info: {sched.shape}")
print("[CHECK] Columns now include referee info:", [c for c in sched.columns if "ref" in c])

for col in sched.columns.tolist(): 
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].iloc[:,0].combine_first(sched[xcol].iloc[:,0]) \
            if isinstance(sched[ycol], pd.DataFrame) else sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)

[MERGE] sched enriched with referee info: (5932, 521)
[CHECK] Columns now include referee info: ['referee', 'home_referee', 'away_referee', 'ref_crew_size', 'head_ref_id', 'head_ref_name']


## **STEP 7: SNAP COUNTS**

In [18]:
snap_counts = nfl.import_snap_counts(list(range(2012, 2025)))
def prep_snap_counts(snap_counts: pd.DataFrame, team_games: pd.DataFrame, sched: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse player-level snap counts to team-game level,
    align with schedule (sched), and lag by one game to avoid leakage.
    Normalizes team codes to match sched (LAR, LVR, LAC, WAS).
    """
    df = snap_counts.copy()
    TEAM_FIX = {
        "LA": "LAR",  # Rams
        "STL": "LAR", # Old Rams
        "LV": "LVR",  # Raiders
        "OAK": "LVR", # Old Raiders
        "SD": "LAC",  # Chargers
        "WSH": "WAS", # Commanders
    }
    df["team"] = df["team"].replace(TEAM_FIX)
    df["offense_pct"] = pd.to_numeric(df["offense_pct"], errors="coerce")
    df["defense_pct"] = pd.to_numeric(df["defense_pct"], errors="coerce")
    agg = (
        df.groupby(["team","game_id"], observed=True)
          .agg(
              avg_offense_pct=("offense_pct","mean"),
              avg_defense_pct=("defense_pct","mean"),
              num_starters_90=("offense_pct", lambda x: (x >= 0.9).sum())
          )
          .reset_index()
    )
    game_idx = (
        sched[["game_id","season","kickoff_et"]]
        .drop_duplicates("game_id")
        .copy()
    )
    out = team_games[["team","game_id"]].drop_duplicates()
    out = out.merge(game_idx, on="game_id", how="left")
    out = out.merge(agg, on=["team","game_id"], how="left")
    out = out.sort_values(["team","season","kickoff_et"]).reset_index(drop=True)
    for col in ["avg_offense_pct","avg_defense_pct","num_starters_90"]:
        out[f"{col}_pre"] = out.groupby("team")[col].shift(1)
    lagged = out[["team","game_id","season",
                  "avg_offense_pct_pre","avg_defense_pct_pre","num_starters_90_pre"]]

    return lagged

snaps_prepped = prep_snap_counts(snap_counts, team_games, sched)
TEAM_FIX = {"LA":"LAR","LV":"LVR","WSH":"WAS","OAK":"LVR","STL":"LAR","":pd.NA}
snaps_prepped["team"] = snaps_prepped["team"].replace(TEAM_FIX)
home_snaps = snaps_prepped.rename(
    columns={c: f"home_snaps_{c}" for c in snaps_prepped.columns if c not in ["game_id","team","season"]}
).rename(columns={"team":"home_team"})
away_snaps = snaps_prepped.rename(
    columns={c: f"away_snaps_{c}" for c in snaps_prepped.columns if c not in ["game_id","team","season"]}
).rename(columns={"team":"away_team"})
sched = sched.merge(
    home_snaps, on=["game_id","season","home_team"], how="left"
)
sched = sched.merge(
    away_snaps, on=["game_id","season","away_team"], how="left"
)

print(f"[MERGE] sched enriched with snap counts: {sched.shape}")
for col in sched.columns.tolist():  
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = (
            sched[ycol].combine_first(sched[xcol])
        )
        sched.drop(columns=[xcol, ycol], inplace=True)

[MERGE] sched enriched with snap counts: (5932, 527)


## **STEP 8: INJURES**

In [19]:
injuries    = nfl.import_injuries(list(range(2017, 2025)))
snap_counts = nfl.import_snap_counts(list(range(2012, 2025)))
starters    = nfl.import_weekly_rosters(list(range(2017, 2025)))

TEAM_FIX = {
    "LA": "LAR",   # Rams
    "LV": "LVR",   # Raiders
    "WSH": "WAS",  # Commanders
    "OAK": "LVR",  # Old Raiders
    "STL": "LAR",  # Old Rams
    "SD": "LAC",   # Old Chargers
}

injuries["team"]    = injuries["team"].replace(TEAM_FIX)
snap_counts["team"] = snap_counts["team"].replace(TEAM_FIX)
starters["team"]    = starters["team"].replace(TEAM_FIX)


def prep_injuries_expanded_fixed(injuries, snap_counts, starters, sched):
    df = injuries.copy()
    df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
    df["week"]   = pd.to_numeric(df["week"], errors="coerce").astype("Int64")

    snaps = snap_counts.copy()
    snaps = snaps.rename(columns={"player": "full_name"})  
    snaps["is_starter_prev"] = (snaps["offense_pct"] >= 0.5).astype(int)
    snaps = snaps.sort_values(["full_name","season","week"])
    snaps["is_starter_prev"] = snaps.groupby("full_name")["is_starter_prev"].shift(1).fillna(0)

    join_keys = ["full_name","team","season","week"]
    df = df.merge(snaps[join_keys + ["is_starter_prev"]], on=join_keys, how="left")

    status_weights = {"Out":1.0,"Doubtful":0.5,"Questionable":0.25}
    df["status_wt"] = df["report_status"].map(status_weights).fillna(0)

    df["expected_starter_out"] = (
        (df["report_status"]=="Out") & (df["is_starter_prev"]==1)
    ).astype(int)
    df["injury_impact"]  = df["status_wt"] * df["is_starter_prev"]
    df["starter_impact"] = df["injury_impact"] * df["is_starter_prev"]

    agg = df.groupby(["team","season","week"], observed=True).agg(
        num_out=("report_status", lambda x: (x=="Out").sum()),
        num_questionable=("report_status", lambda x: (x=="Questionable").sum()),
        num_doubtful=("report_status", lambda x: (x=="Doubtful").sum()),
        expected_num_starters_out=("expected_starter_out","sum"),
        injury_impact=("injury_impact","sum"),
        starter_impact=("starter_impact","sum")
    ).reset_index()
    starters_count = starters.groupby(["team","season","week"], observed=True)["player_id"].nunique().reset_index()
    starters_count = starters_count.rename(columns={"player_id":"num_starters"})
    agg = agg.merge(starters_count, on=["team","season","week"], how="left")
    agg["pct_starters_out"] = (agg["expected_num_starters_out"]/agg["num_starters"]).fillna(0)
    agg["pct_total_impact"] = (agg["injury_impact"]/agg["num_starters"]).fillna(0)
    game_idx = sched[["game_id","season","week","kickoff_et","home_team","away_team"]].drop_duplicates("game_id")
    game_idx["season"] = pd.to_numeric(game_idx["season"], errors="coerce").astype("Int64")
    game_idx["week"]   = pd.to_numeric(game_idx["week"], errors="coerce").astype("Int64")

    long_idx = pd.concat([
        game_idx[["game_id","season","week","kickoff_et"]].assign(team=game_idx["home_team"]),
        game_idx[["game_id","season","week","kickoff_et"]].assign(team=game_idx["away_team"])
    ], ignore_index=True)

    out = long_idx.merge(agg, on=["team","season","week"], how="left")
    pregame_cols = ["num_out","num_questionable","num_doubtful","expected_num_starters_out"]
    lag_cols = ["injury_impact","starter_impact","pct_starters_out","pct_total_impact"]
    out = out.sort_values(["team","season","kickoff_et"]).reset_index(drop=True)
    for col in lag_cols:
        out[f"{col}_pre"] = out.groupby("team")[col].shift(1).fillna(0)

    return out[["team","game_id","season"] + pregame_cols + [f"{c}_pre" for c in lag_cols]]

inj_prepped_exp = prep_injuries_expanded_fixed(injuries, snap_counts, starters, sched)

sched = sched.merge(
    inj_prepped_exp,
    on=["season","game_id","team"],
    how="left"
)

for col in sched.columns.tolist():
    if col.endswith("_x") and col.replace("_x","_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x","_y")
        sched[col.replace("_x","")] = sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol,ycol], inplace=True)

print("[CHECK] Duplicated game_id rows:", sched["game_id"].duplicated().sum())
print("[CHECK] Example rows for Week 2, 2024:")

[CHECK] Duplicated game_id rows: 0
[CHECK] Example rows for Week 2, 2024:


## **STEP 9: NGS**

In [20]:

ngs_receiving = pd.read_parquet("https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_receiving.parquet")
ngs_passing   = pd.read_parquet("https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_passing.parquet")
ngs_rushing   = pd.read_parquet("https://github.com/nflverse/nflverse-data/releases/download/nextgen_stats/ngs_rushing.parquet")

TEAM_FIX = {
    "LA": "LAR",   # Rams
    "STL": "LAR",  # Old Rams
    "LV": "LVR",   # Raiders
    "OAK": "LVR",  # Old Raiders
    "SD": "LAC",   # Old Chargers
    "WSH": "WAS",  # Commanders
}

for df in [ngs_receiving, ngs_passing, ngs_rushing]:
    if "team_abbr" in df.columns:
        df["team_abbr"] = df["team_abbr"].replace(TEAM_FIX)

def prep_ngs_all_cols_categorical(ngs_receiving, ngs_passing, ngs_rushing, sched):
    """
    Aggregate all NGS numeric columns per team-week.
    Lag numeric features by 1 game per team.
    """

    def aggregate_team_game(df):
        numeric_cols = df.select_dtypes(include="number").columns.tolist()
    
        for col in ["team_abbr","season","week"]:
            if col in numeric_cols:
                numeric_cols.remove(col)
        
        df_agg = (
            df.groupby(["team_abbr","season","week"], observed=True)[numeric_cols]
              .mean()
              .reset_index()
              .rename(columns={"team_abbr":"team"})
        )
        return df_agg

    rec_agg  = aggregate_team_game(ngs_receiving)
    pass_agg = aggregate_team_game(ngs_passing)
    rush_agg = aggregate_team_game(ngs_rushing)

    ngs_merged = rec_agg.merge(pass_agg, on=["team","season","week"], how="outer")
    ngs_merged = ngs_merged.merge(rush_agg, on=["team","season","week"], how="outer")

    ngs_merged["season"] = ngs_merged["season"].astype(int)
    ngs_merged["week"]   = ngs_merged["week"].astype(int)
    sched["season"]      = sched["season"].astype(int)
    sched["week"]        = sched["week"].astype(int)

    out = ngs_merged.sort_values(["team","season","week"]).reset_index(drop=True)
    numeric_cols = out.select_dtypes(include="number").columns.tolist()
    for col in ["season","week"]:
        if col in numeric_cols:
            numeric_cols.remove(col)
    for col in numeric_cols:
        out[col] = out.groupby("team")[col].shift(1)

    return out

ngs_prepped_all_cat = prep_ngs_all_cols_categorical(
    ngs_receiving, ngs_passing, ngs_rushing, sched
)

player_cols = [c for c in ngs_prepped_all_cat.columns if c.startswith("player_")]
ngs_prepped_all_cat = ngs_prepped_all_cat.drop(columns=player_cols, errors="ignore")

print("[CLEANUP] ngs_prepped_all_cat shape after dropping player cols:", ngs_prepped_all_cat.shape)
print("[CLEANUP] Remaining columns:", ngs_prepped_all_cat.columns.tolist()[:15], "...")

[CLEANUP] ngs_prepped_all_cat shape after dropping player cols: (5558, 44)
[CLEANUP] Remaining columns: ['team', 'season', 'week', 'avg_cushion', 'avg_separation', 'avg_intended_air_yards_x', 'percent_share_of_intended_air_yards', 'receptions', 'targets', 'catch_percentage', 'yards', 'rec_touchdowns', 'avg_yac', 'avg_expected_yac', 'avg_yac_above_expectation'] ...


In [21]:
player_cols = [c for c in ngs_prepped_all_cat.columns if c.startswith("player_")]
ngs_prepped_all_cat = ngs_prepped_all_cat.drop(columns=player_cols, errors="ignore")

home_ngs = ngs_prepped_all_cat.rename(columns={
    c: f"home_{c}" for c in ngs_prepped_all_cat.columns if c not in ["team","season","week"]
})
home_ngs = home_ngs.rename(columns={"team": "home_team"})

away_ngs = ngs_prepped_all_cat.rename(columns={
    c: f"away_{c}" for c in ngs_prepped_all_cat.columns if c not in ["team","season","week"]
})
away_ngs = away_ngs.rename(columns={"team": "away_team"})

sched = sched.merge(home_ngs, on=["home_team","season","week"], how="left")
sched = sched.merge(away_ngs, on=["away_team","season","week"], how="left")

print(f"[MERGE] sched enriched with NGS features: {sched.shape}")

[MERGE] sched enriched with NGS features: (5932, 617)


In [22]:
print([c for c in sched.columns if "_x" in c or "_y" in c])  # should now be empty
for col in sched.columns.tolist():  # use list() to avoid mutation issues
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        # Ensure we select a Series, not a DataFrame
        sched[col.replace("_x", "")] = sched[ycol].iloc[:,0].combine_first(sched[xcol].iloc[:,0]) \
            if isinstance(sched[ycol], pd.DataFrame) else sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)
print([c for c in sched.columns if "_x" in c or "_y" in c])  # should now be empty
sched.shape

['home_avg_drive_yards', 'away_avg_drive_yards', 'home_yards_gained', 'home_penalty_yards', 'home_air_yards_sum', 'home_completions_x', 'home_xpass_mean', 'home_penalty_yards_per_play', 'away_yards_gained', 'away_penalty_yards', 'away_air_yards_sum', 'away_completions_x', 'away_xpass_mean', 'away_penalty_yards_per_play', 'home_def_def_yards_per_play', 'home_def_def_penalty_yards', 'away_def_def_yards_per_play', 'away_def_def_penalty_yards', 'career_pass_yds_QB1', 'career_pass_yds_RB1', 'career_pass_yds_TE1', 'career_pass_yds_WR1', 'career_pass_yds_WR2', 'career_rec_yds_QB1', 'career_rec_yds_RB1', 'career_rec_yds_TE1', 'career_rec_yds_WR1', 'career_rec_yds_WR2', 'career_rush_yds_QB1', 'career_rush_yds_RB1', 'career_rush_yds_TE1', 'career_rush_yds_WR1', 'career_rush_yds_WR2', 'passing_yards_s2d_QB1', 'passing_yards_s2d_RB1', 'passing_yards_s2d_TE1', 'passing_yards_s2d_WR1', 'passing_yards_s2d_WR2', 'receiving_yards_s2d_QB1', 'receiving_yards_s2d_RB1', 'receiving_yards_s2d_TE1', 'receivin

(5932, 613)

## **COMBINE ALL DATA**

In [23]:
def make_pre_game_rolls(df: pd.DataFrame, value_cols, windows=(1,3,5)):
    """Create season-to-date and rolling window averages/sums for each team."""
    out = df.copy().sort_values(["team","kickoff_et"]).reset_index(drop=True)
    out["season"] = pd.to_numeric(out["season"], errors="coerce").astype("Int64")

    new_feats = {}

    by_ts = out.groupby(["team","season"], observed=True)
    for c in value_cols:
        if c not in out: 
            continue
        out[c] = pd.to_numeric(out[c], errors="coerce")
        if c.endswith("_pre"): 
            em = by_ts[c].expanding().mean().reset_index(level=[0,1], drop=True)
            es = by_ts[c].expanding().sum().reset_index(level=[0,1], drop=True)
            new_feats[f"szn_mean_{c}"] = em  
            new_feats[f"szn_sum_{c}"]  = es
        else:
            em = by_ts[c].expanding().mean().reset_index(level=[0,1], drop=True)
            es = by_ts[c].expanding().sum().reset_index(level=[0,1], drop=True)
            new_feats[f"szn_mean_{c}"] = em
            new_feats[f"szn_sum_{c}"]  = es

    by_t = out.groupby("team", observed=True)
    for w in windows:
        for c in value_cols:
            if c not in out:
                continue
            lag = by_t[c].shift(1)
            new_feats[f"r{w}_mean_pre_{c}"] = (
                lag.groupby(out["team"]).rolling(w, min_periods=1).mean().reset_index(level=0, drop=True)
            )
            new_feats[f"r{w}_sum_pre_{c}"] = (
                lag.groupby(out["team"]).rolling(w, min_periods=1).sum().reset_index(level=0, drop=True)
            )

    out = pd.concat([out, pd.DataFrame(new_feats)], axis=1)
    return out


def build_game_features_from_sched(sched: pd.DataFrame) -> pd.DataFrame:
    """
    Build per-game dataset with rolling win rates + pre-game rolling stats.
    Assumes sched already has merged home_/away_ prefixed features.
    """
    df = sched.copy()
    df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
    df["week"]   = pd.to_numeric(df["week"], errors="coerce").astype("Int64")
    df["kickoff_et"] = pd.to_datetime(df["kickoff_et"], errors="coerce")

    team_results = []
    for _, r in df.iterrows():
        if pd.notna(r["home_score"]) and pd.notna(r["away_score"]):
            if r["home_score"] > r["away_score"]:
                team_results.append({"game_id":r["game_id"], "team":r["home_team"], "win":1})
                team_results.append({"game_id":r["game_id"], "team":r["away_team"], "win":0})
            elif r["home_score"] < r["away_score"]:
                team_results.append({"game_id":r["game_id"], "team":r["home_team"], "win":0})
                team_results.append({"game_id":r["game_id"], "team":r["away_team"], "win":1})
            else:
                team_results.append({"game_id":r["game_id"], "team":r["home_team"], "win":0.5})
                team_results.append({"game_id":r["game_id"], "team":r["away_team"], "win":0.5})
    team_results = pd.DataFrame(team_results)

    game_idx = df[["game_id","kickoff_et","season"]].drop_duplicates("game_id")
    team_results = team_results.merge(game_idx, on="game_id", how="left")

    team_results = team_results.sort_values(["team","kickoff_et"])
    for w in (1,3,5):
        team_results[f"win_rate_r{w}"] = (
            team_results.groupby("team")["win"].shift(1)
                        .groupby(team_results["team"])
                        .rolling(w, min_periods=1).mean()
                        .reset_index(level=0, drop=True)
        )

    all_rolls = []
    for side in ["home","away"]:
        side_df = df.copy()
        if "team" in side_df.columns:
            side_df = side_df.drop(columns=["team"])
        side_df = side_df.rename(columns={f"{side}_team": "team"})


        value_cols = [
            c for c in side_df.columns
            if c.startswith(f"{side}_") and c not in [
                f"{side}_team","game_id","kickoff_et","season","week","game_type"
            ]
        ]

        rolled = make_pre_game_rolls(side_df, value_cols, windows=(1,3,5))
        rolled = rolled[[c for c in rolled.columns if c.startswith(("szn_","r1_","r3_","r5_"))] + ["game_id","team"]]
        rolled = rolled.rename(columns={c:f"{side}_{c}" for c in rolled.columns if c not in ["game_id","team"]})
        all_rolls.append((side, rolled))
    out = df.copy()
    for side, rolled in all_rolls:
        out = out.merge(rolled, left_on=["game_id",f"{side}_team"], right_on=["game_id","team"], how="left")
        out = out.drop(columns=["team"], errors="ignore")

    for side in ["home","away"]:
        out = out.merge(
            team_results[["game_id","team","win_rate_r1","win_rate_r3","win_rate_r5"]],
            left_on=[f"{side}_team","game_id"],
            right_on=["team","game_id"],
            how="left"
        )
        out = out.drop(columns=["team"], errors="ignore").rename(
            columns={f"win_rate_r{w}":f"{side}_win_rate_r{w}" for w in (1,3,5)}
        )

    if {"home_score","away_score"}.issubset(out.columns):
        out["home_win"] = (out["home_score"] > out["away_score"]).astype(int)

    print(f"[build_game_features_from_sched] → shape {out.shape}, cols={len(out.columns)}")
    return out

sched = build_game_features_from_sched(sched)

[build_game_features_from_sched] → shape (5932, 4525), cols=4525


In [24]:
print([c for c in sched.columns if "_x" in c or "_y" in c])  
for col in sched.columns.tolist():  
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].iloc[:,0].combine_first(sched[xcol].iloc[:,0]) \
            if isinstance(sched[ycol], pd.DataFrame) else sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)
print([c for c in sched.columns if "_x" in c or "_y" in c])  
sched.shape

['home_avg_drive_yards', 'away_avg_drive_yards', 'home_yards_gained', 'home_penalty_yards', 'home_air_yards_sum', 'home_xpass_mean', 'home_penalty_yards_per_play', 'away_yards_gained', 'away_penalty_yards', 'away_air_yards_sum', 'away_xpass_mean', 'away_penalty_yards_per_play', 'home_def_def_yards_per_play', 'home_def_def_penalty_yards', 'away_def_def_yards_per_play', 'away_def_def_penalty_yards', 'team_x', 'career_pass_yds_QB1', 'career_pass_yds_RB1', 'career_pass_yds_TE1', 'career_pass_yds_WR1', 'career_pass_yds_WR2', 'career_rec_yds_QB1', 'career_rec_yds_RB1', 'career_rec_yds_TE1', 'career_rec_yds_WR1', 'career_rec_yds_WR2', 'career_rush_yds_QB1', 'career_rush_yds_RB1', 'career_rush_yds_TE1', 'career_rush_yds_WR1', 'career_rush_yds_WR2', 'passing_yards_s2d_QB1', 'passing_yards_s2d_RB1', 'passing_yards_s2d_TE1', 'passing_yards_s2d_WR1', 'passing_yards_s2d_WR2', 'receiving_yards_s2d_QB1', 'receiving_yards_s2d_RB1', 'receiving_yards_s2d_TE1', 'receiving_yards_s2d_WR1', 'receiving_yards

(5932, 4524)

## **PULL DATA FOR NEW FEATURES AND ADD**

In [25]:
starters = nfl.import_weekly_rosters(list(range(2005, 2025)))

TEAM_FIX = {
    "OAK": "LV",
    "STL": "LAR",
    "SD":  "LAC",
    "LA":  "LAR",
    "":    None,   # drop blanks
}

def fix_team_names(df, col="team"):
    if col in df.columns:
        df[col] = df[col].replace(TEAM_FIX)
    return df

sched["home_team"] = sched["home_team"].replace(TEAM_FIX)
sched["away_team"] = sched["away_team"].replace(TEAM_FIX)
injuries = fix_team_names(injuries, "team")
starters = fix_team_names(starters, "team")
pbp      = fix_team_names(pbp, "posteam") if "posteam" in pbp.columns else fix_team_names(pbp, "team")

home_games = sched[["game_id","season","week","kickoff_et","home_team","lat","lon"]].rename(
    columns={"home_team":"team","lat":"home_lat","lon":"home_lon"}
)
home_games["is_home"] = 1
home_games["is_away"] = 0

away_games = sched[["game_id","season","week","kickoff_et","away_team","lat","lon"]].rename(
    columns={"away_team":"team","lat":"away_lat","lon":"away_lon"}
)
away_games["is_home"] = 0
away_games["is_away"] = 1

team_games = pd.concat([home_games, away_games], ignore_index=True)
team_games["kickoff_et"] = pd.to_datetime(team_games["kickoff_et"], errors="coerce", utc=True)

def _coalesce(df: pd.DataFrame, candidates: list[str]) -> str | None:
    for c in candidates:
        if c in df.columns:
            return c
    return None

def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance (miles)."""
    R = 3959
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return np.nan
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    return R * 2 * atan2(sqrt(a), sqrt(1-a))

def prep_roster_continuity(starters: pd.DataFrame,
                           team_games: pd.DataFrame) -> pd.DataFrame:
    starters = starters.copy()
    starters["depth_chart_position"] = pd.to_numeric(starters["depth_chart_position"], errors="coerce")
    starters["is_starter"] = (starters["depth_chart_position"] == 1).astype(int)

    st = starters.loc[starters["is_starter"]==1, ["team","season","week","player_id"]].copy()
    st["season"] = pd.to_numeric(st["season"], errors="coerce").astype("Int64")
    st["week"]   = pd.to_numeric(st["week"], errors="coerce").astype("Int64")

    ord_idx = team_games[["team","season","week","game_id","kickoff_et"]].drop_duplicates()
    ord_idx["kickoff_et"] = pd.to_datetime(ord_idx["kickoff_et"], errors="coerce", utc=True)
    st = st.merge(ord_idx, on=["team","season","week"], how="left").sort_values(["team","kickoff_et"])

    st["prev_players"] = st.groupby(["team"])["player_id"].shift(1)

    def _count_returning(group: pd.DataFrame) -> pd.Series:
        by_game = group.groupby("game_id")["player_id"].apply(set)
        prev_by_game = group.groupby("game_id")["prev_players"].apply(lambda s: set([x for x in s if pd.notna(x)]))
        common_idx = by_game.index.intersection(prev_by_game.index)
        returning = []
        for gid in common_idx:
            returning.append(len(by_game.loc[gid].intersection(prev_by_game.loc[gid])))
        return pd.Series(returning, index=common_idx, name="starters_returning")

    ret = _count_returning(st.reset_index(drop=True))
    ret_df = ret.reset_index().rename(columns={"index":"game_id"})
    teams = ord_idx[["game_id","team"]].drop_duplicates()
    ret_df = ret_df.merge(teams, on="game_id", how="left")
    return ret_df[["team","game_id","starters_returning"]]


def add_travel_features(team_games: pd.DataFrame) -> pd.DataFrame:
    tg = team_games.copy()
    tg["kickoff_et"] = pd.to_datetime(tg["kickoff_et"], errors="coerce", utc=True)
    tg = tg.sort_values(["team","kickoff_et"]).reset_index(drop=True)

    if "home_lat" in tg.columns and "home_lon" in tg.columns:
        tg["travel_dist"] = np.where(
            tg["is_away"]==1,
            tg.apply(lambda r: haversine(r.get("home_lat"), r.get("home_lon"),
                                         r.get("away_lat"), r.get("away_lon")), axis=1),
            0
        )
    else:
        tg["travel_dist"] = 0

    tg["cum_travel"] = tg.groupby("team")["travel_dist"].cumsum()

    road = tg["is_away"].fillna(0).astype(int)
    grp = (road != road.shift()).cumsum()
    streak = road.groupby([tg["team"], grp]).cumsum()
    tg["consec_road"] = streak

    west = tg["team"].isin(["SEA","SF","LAR","LAC","ARI","LV"])
    kick_et_hr = pd.to_datetime(tg["kickoff_et"]).dt.tz_convert("America/New_York").dt.hour
    tg["early_body_clock"] = ((west) & (kick_et_hr == 13)).astype("Int64")

    return tg[["team","game_id","cum_travel","consec_road","early_body_clock"]]

def prep_pbp_aggregates(pbp: pd.DataFrame,
                        sched: pd.DataFrame,
                        team_games: pd.DataFrame) -> pd.DataFrame:
    df = pbp.copy()
    df["game_id"] = df["game_id"].astype(str)
    df["team"] = df["posteam"] if "posteam" in df.columns else df.get("team")
    df = df[df["team"].notna()].copy()

    key = ["game_id","team"]
    df["penalty_flag"] = df.get("penalty", 0).fillna(0).astype(int)
    fd_series = df.get("first_down", 0).fillna(0).astype(int)
    df["pen_fd"] = (df["penalty_flag"] & fd_series).astype(int)

    drv = df.groupby(key + ["drive"], observed=True)["yards_gained"].sum().reset_index()
    vol = drv.groupby(key, observed=True)["yards_gained"].agg(drive_yards_var="var",
                                                              drive_yards_std="std").reset_index()

    pen = df.groupby(key, observed=True).agg(
        penalties=("penalty_flag","sum"),
        penalty_first_downs=("pen_fd","sum")
    ).reset_index()
    fum = df.groupby(key, observed=True).agg(
        fumbles=("fumble","sum") if "fumble" in df.columns else ("yards_gained","size"),
        fumbles_lost=("fumble_lost","sum") if "fumble_lost" in df.columns else ("yards_gained","size"),
        fumble_rec=("fumble_recovery","sum") if "fumble_recovery" in df.columns else ("yards_gained","size")
    ).reset_index()

    aggs = pen.merge(vol, on=key, how="outer").merge(fum, on=key, how="outer")
    aggs["penalty_fd_rate"] = aggs["penalty_first_downs"] / aggs["penalties"].replace(0, np.nan)
    aggs = aggs.fillna(0)

    ord_idx = team_games[["team","game_id","kickoff_et"]].drop_duplicates()
    ord_idx["kickoff_et"] = pd.to_datetime(ord_idx["kickoff_et"], errors="coerce", utc=True)
    aggs = aggs.merge(ord_idx, on=["team","game_id"], how="left").sort_values(["team","kickoff_et"])

    metric_cols = [c for c in aggs.columns if c not in ["team","game_id","kickoff_et"]]
    for c in metric_cols:
        aggs[f"{c}_pre"] = aggs.groupby("team")[c].shift(1)

    out_cols = ["team","game_id"] + [f"{c}_pre" for c in metric_cols]
    return aggs[out_cols]

def add_weather_interactions_on_game_features(game_features: pd.DataFrame) -> pd.DataFrame:
    gf = game_features.copy()

    rename_map = {
        "weather_temp": "temp",
        "weather_windspeed": "windspeed",
        "weather_precip": "precip",
        "weather_humidity": "humidity",
    }
    gf = gf.rename(columns={c: rename_map[c] for c in gf.columns if c in rename_map})

    if "temp" in gf.columns:
        gf["home_temp"] = gf.get("home_temp", gf["temp"])
        gf["away_temp"] = gf.get("away_temp", gf["temp"])
    if "windspeed" in gf.columns:
        gf["home_wind"] = gf.get("home_wind", gf["windspeed"])
        gf["away_wind"] = gf.get("away_wind", gf["windspeed"])
    if "precip" in gf.columns:
        gf["home_precip"] = gf.get("home_precip", gf["precip"])
        gf["away_precip"] = gf.get("away_precip", gf["precip"])
    if "humidity" in gf.columns:
        gf["home_humidity"] = gf.get("home_humidity", gf["humidity"])
        gf["away_humidity"] = gf.get("away_humidity", gf["humidity"])

    for side in ["home","away"]:
        wind_col   = f"{side}_wind" if f"{side}_wind" in gf.columns else None
        precip_col = f"{side}_precip" if f"{side}_precip" in gf.columns else None
        temp_col   = f"{side}_temp" if f"{side}_temp" in gf.columns else None

        deep_col = f"{side}_r3_mean_pre_deep_pass_share" if f"{side}_r3_mean_pre_deep_pass_share" in gf.columns else None
        rush_col = f"{side}_r3_mean_pre_rush_share" if f"{side}_r3_mean_pre_rush_share" in gf.columns else None

        if wind_col and deep_col:
            gf[f"{side}_wind_x_deep"] = pd.to_numeric(gf[wind_col], errors="coerce") * pd.to_numeric(gf[deep_col], errors="coerce")
        if precip_col and rush_col:
            gf[f"{side}_precip_x_rush"] = pd.to_numeric(gf[precip_col], errors="coerce") * pd.to_numeric(gf[rush_col], errors="coerce")
        if temp_col:
            t = pd.to_numeric(gf[temp_col], errors="coerce")
            gf[f"{side}_temp_lt40"] = (t < 40).astype("Int64")
            gf[f"{side}_temp_gt85"] = (t > 85).astype("Int64")

    return gf

def merge_side_features(base: pd.DataFrame, feats: pd.DataFrame, side: str) -> pd.DataFrame:
    out = base.merge(
        feats.rename(columns={c: f"{side}_{c}" for c in feats.columns if c not in ["game_id","team"]}),
        left_on=[f"{side}_team","game_id"],
        right_on=["team","game_id"],
        how="left"
    )
    return out.drop(columns=["team"], errors="ignore")

roster_feats  = prep_roster_continuity(starters, team_games)
travel_feats  = add_travel_features(team_games)
pbp_feats     = prep_pbp_aggregates(pbp, sched, team_games)

sched = merge_side_features(sched, roster_feats, "home")
sched = merge_side_features(sched, roster_feats, "away")
sched = merge_side_features(sched, travel_feats, "home")
sched = merge_side_features(sched, travel_feats, "away")
sched = merge_side_features(sched, pbp_feats, "home")
sched = merge_side_features(sched, pbp_feats, "away")

sched = add_weather_interactions_on_game_features(sched)

print("[CHECK] Unique game_id:", sched["game_id"].is_unique)
print("[CHECK] Duplicated columns:", sched.columns[sched.columns.duplicated()])
print("[CHECK] Example rows:", sched.head(3)[["game_id","home_team","away_team"]])

[CHECK] Unique game_id: True
[CHECK] Duplicated columns: Index(['temp'], dtype='object')
[CHECK] Example rows:            game_id home_team away_team
0   2000_19_NO_MIN       MIN        NO
1  2000_19_BAL_TEN       TEN       BAL
2   2000_01_SF_ATL       ATL        SF


In [26]:
for xcol in sched.filter(like="_x"):
    ycol = xcol.replace("_x", "_y")
    if ycol in sched.columns:
        basecol = xcol[:-2]
        sched[basecol] = sched[ycol].combine_first(sched[xcol])
        sched.drop([xcol, ycol], axis=1, inplace=True)

print([c for c in sched.columns if c.endswith(("_x","_y"))])
print(sched.shape)

[]
(5932, 4556)


## **INTERACTION VARIABLES** 

In [27]:
sched["spread_x_home_epa"]        = sched["home_spread_line"] * sched["home_epa_per_play"]
sched["spread_x_away_epa"]        = sched["home_spread_line"] * sched["away_epa_per_play"]
sched["spread_x_home_success"]    = sched["home_spread_line"] * sched["home_success_rate"]
sched["spread_x_away_success"]    = sched["home_spread_line"] * sched["away_success_rate"]
sched["spread_x_home_def_epa"]    = sched["home_spread_line"] * sched["home_def_def_epa_per_play"]
sched["spread_x_away_def_epa"]    = sched["home_spread_line"] * sched["away_def_def_epa_per_play"]
sched["spread_x_home_def_success"]= sched["home_spread_line"] * sched["home_def_def_success_rate"]
sched["spread_x_away_def_success"]= sched["home_spread_line"] * sched["away_def_def_success_rate"]
sched["home_succ_x_expl_pass"] = sched["home_success_rate"] * sched["home_explosive_pass_rate"]
sched["away_succ_x_expl_rush"] = sched["away_success_rate"] * sched["away_explosive_rush_rate"]
sched["wind_x_home_expl_pass"] = sched["home_weather_windspeed"] * sched["home_explosive_pass_rate"]
sched["precip_x_away_rush"]    = sched["home_weather_precip"] * sched["away_explosive_rush_rate"]
sched["div_home_implied"] = sched["div_game"] * sched["home_avg_implied_prob"]
sched["div_home_edge"]    = sched["div_game"] * sched["home_market_edge_prob"]
sched["home_rest_x_epa"] = sched["home_rest_days"] * sched["home_epa_per_play"]
sched["div_spread_x_home_epa"]         = sched["div_game"] * sched["spread_x_home_epa"]
sched["div_spread_x_away_epa"]         = sched["div_game"] * sched["spread_x_away_epa"]
sched["div_spread_x_home_success"]     = sched["div_game"] * sched["spread_x_home_success"]
sched["div_spread_x_away_success"]     = sched["div_game"] * sched["spread_x_away_success"]
sched["div_spread_x_home_def_epa"]     = sched["div_game"] * sched["spread_x_home_def_epa"]
sched["div_spread_x_away_def_epa"]     = sched["div_game"] * sched["spread_x_away_def_epa"]
sched["div_spread_x_home_def_success"] = sched["div_game"] * sched["spread_x_home_def_success"]
sched["div_spread_x_away_def_success"] = sched["div_game"] * sched["spread_x_away_def_success"]
sched["spread_x_home_expl_pass"]  = sched["home_spread_line"] * sched["home_explosive_pass_rate"]
sched["spread_x_away_expl_pass"]  = sched["home_spread_line"] * sched["away_explosive_pass_rate"]
sched["spread_x_home_expl_rush"]  = sched["home_spread_line"] * sched["home_explosive_rush_rate"]
sched["spread_x_away_expl_rush"]  = sched["home_spread_line"] * sched["away_explosive_rush_rate"]
sched["spread_x_home_def_expl_pass"] = sched["home_spread_line"] * sched["home_def_def_explosive_pass_rate_allowed"]
sched["spread_x_away_def_expl_pass"] = sched["home_spread_line"] * sched["away_def_def_explosive_pass_rate_allowed"]
sched["spread_x_home_def_expl_rush"] = sched["home_spread_line"] * sched["home_def_def_explosive_rush_rate_allowed"]
sched["spread_x_away_def_expl_rush"] = sched["home_spread_line"] * sched["away_def_def_explosive_rush_rate_allowed"]
sched["spread_x_home_rz"]       = sched["home_spread_line"] * sched["home_red_zone_td_rate"]
sched["spread_x_away_rz"]       = sched["home_spread_line"] * sched["away_red_zone_td_rate"]
sched["spread_x_home_def_rz"]   = sched["home_spread_line"] * sched["home_def_def_red_zone_td_rate_allowed"]
sched["spread_x_away_def_rz"]   = sched["home_spread_line"] * sched["away_def_def_red_zone_td_rate_allowed"]
sched["spread_x_home_g2g"]      = sched["home_spread_line"] * sched["home_g2g_td_rate"]
sched["spread_x_away_g2g"]      = sched["home_spread_line"] * sched["away_g2g_td_rate"]
sched["spread_x_home_def_g2g"]  = sched["home_spread_line"] * sched["home_def_def_g2g_td_rate_allowed"]
sched["spread_x_away_def_g2g"]  = sched["home_spread_line"] * sched["away_def_def_g2g_td_rate_allowed"]
sched["spread_x_home_to"]       = sched["home_spread_line"] * sched["home_turnover_rate"]
sched["spread_x_away_to"]       = sched["home_spread_line"] * sched["away_turnover_rate"]
sched["spread_x_home_takeaways"]= sched["home_spread_line"] * sched["home_def_def_takeaways"]
sched["spread_x_away_takeaways"]= sched["home_spread_line"] * sched["away_def_def_takeaways"]
sched["spread_x_home_implied"]  = sched["home_spread_line"] * sched["home_avg_implied_prob"]
sched["spread_x_away_implied"]  = sched["home_spread_line"] * sched["away_avg_implied_prob"]
sched["spread_x_home_edge"]     = sched["home_spread_line"] * sched["home_market_edge_prob"]
sched["spread_x_away_edge"]     = sched["home_spread_line"] * sched["away_market_edge_prob"]
sched["home_epa_x_home_implied"] = sched["home_epa_per_play"] * sched["home_avg_implied_prob"]
sched["away_epa_x_away_implied"] = sched["away_epa_per_play"] * sched["away_avg_implied_prob"]
sched["home_epa_x_market_edge"]  = sched["home_epa_per_play"] * sched["home_market_edge_prob"]
sched["away_epa_x_market_edge"]  = sched["away_epa_per_play"] * sched["away_market_edge_prob"]
sched["home_succ_x_home_implied"] = sched["home_success_rate"] * sched["home_avg_implied_prob"]
sched["away_succ_x_away_implied"] = sched["away_success_rate"] * sched["away_avg_implied_prob"]
sched["home_succ_x_edge"]         = sched["home_success_rate"] * sched["home_market_edge_prob"]
sched["away_succ_x_edge"]         = sched["away_success_rate"] * sched["away_market_edge_prob"]
sched["home_xpass_x_implied"] = sched["home_explosive_pass_rate"] * sched["home_avg_implied_prob"]
sched["away_xpass_x_implied"] = sched["away_explosive_pass_rate"] * sched["away_avg_implied_prob"]
sched["home_xrush_x_implied"] = sched["home_explosive_rush_rate"] * sched["home_avg_implied_prob"]
sched["away_xrush_x_implied"] = sched["away_explosive_rush_rate"] * sched["away_avg_implied_prob"]
sched["home_def_epa_x_edge"] = sched["home_def_def_epa_per_play"] * sched["home_market_edge_prob"]
sched["away_def_epa_x_edge"] = sched["away_def_def_epa_per_play"] * sched["away_market_edge_prob"]
sched["home_def_succ_x_implied"] = sched["home_def_def_success_rate"] * sched["home_avg_implied_prob"]
sched["away_def_succ_x_implied"] = sched["away_def_def_success_rate"] * sched["away_avg_implied_prob"]
sched["home_rest_x_implied"] = sched["home_rest_days"] * sched["home_avg_implied_prob"]
sched["away_rest_x_implied"] = sched["away_rest_days"] * sched["away_avg_implied_prob"]
sched["spread_x_home_rest"] = sched["home_spread_line"] * sched["home_rest_days"]
sched["spread_x_away_rest"] = sched["home_spread_line"] * sched["away_rest_days"]
sched["home_implied_x_temp"] = sched["home_avg_implied_prob"] * sched["home_weather_temp"]
sched["away_implied_x_wind"] = sched["away_avg_implied_prob"] * sched["home_weather_windspeed"]
sched["home_edge_x_precip"]  = sched["home_market_edge_prob"] * sched["home_weather_precip"]
sched["epa_diff"] = sched["home_epa_per_play"] - sched["away_epa_per_play"]
sched["market_spread_resid"] = (
    sched["home_spread_line"] - (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"])
)

sched["market_epa_resid"] = (
    sched["epa_diff"] - (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"])
)
sched["epa_diff"] = sched["home_epa_per_play"] - sched["away_epa_per_play"]
sched["success_diff"] = sched["home_success_rate"] - sched["away_success_rate"]
sched["explosive_pass_diff"] = sched["home_explosive_pass_rate"] - sched["away_explosive_pass_rate"]
sched["spread_residual"] = sched["home_spread_line"] - sched["epa_diff"]
sched["rz_diff"] = sched["home_red_zone_td_rate"] - sched["away_def_def_red_zone_td_rate_allowed"]
sched["g2g_diff"] = sched["home_g2g_td_rate"] - sched["away_def_def_g2g_td_rate_allowed"]
sched["to_diff"] = sched["home_turnover_rate"] - sched["away_def_def_takeaways"]
sched["explosive_pass_mismatch"] = sched["home_explosive_pass_rate"] - sched["away_def_def_explosive_pass_rate_allowed"]
sched["explosive_rush_mismatch"] = sched["home_explosive_rush_rate"] - sched["away_def_def_explosive_rush_rate_allowed"]
sched["home_epa_x_away_def_epa"] = sched["home_epa_per_play"] * sched["away_def_def_epa_per_play"]
sched["away_epa_x_home_def_epa"] = sched["away_epa_per_play"] * sched["home_def_def_epa_per_play"]
sched["home_succ_x_away_def_succ"] = sched["home_success_rate"] * sched["away_def_def_success_rate"]
sched["away_succ_x_home_def_succ"] = sched["away_success_rate"] * sched["home_def_def_success_rate"]
sched["home_xpass_x_away_def_xpass"] = sched["home_explosive_pass_rate"] * sched["away_def_def_explosive_pass_rate_allowed"]
sched["away_xpass_x_home_def_xpass"] = sched["away_explosive_pass_rate"] * sched["home_def_def_explosive_pass_rate_allowed"]
sched["home_xrush_x_away_def_xrush"] = sched["home_explosive_rush_rate"] * sched["away_def_def_explosive_rush_rate_allowed"]
sched["away_xrush_x_home_def_xrush"] = sched["away_explosive_rush_rate"] * sched["home_def_def_explosive_rush_rate_allowed"]
sched["home_rz_x_away_def_rz"] = sched["home_red_zone_td_rate"] * sched["away_def_def_red_zone_td_rate_allowed"]
sched["away_rz_x_home_def_rz"] = sched["away_red_zone_td_rate"] * sched["home_def_def_red_zone_td_rate_allowed"]
sched["home_g2g_x_away_def_g2g"] = sched["home_g2g_td_rate"] * sched["away_def_def_g2g_td_rate_allowed"]
sched["away_g2g_x_home_def_g2g"] = sched["away_g2g_td_rate"] * sched["home_def_def_g2g_td_rate_allowed"]
sched["home_to_x_away_tk"] = sched["home_turnover_rate"] * sched["away_def_def_takeaways"]
sched["away_to_x_home_tk"] = sched["away_turnover_rate"] * sched["home_def_def_takeaways"]
sched["implied_x_epa_diff"]    = (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * sched["epa_diff"]
sched["implied_x_success_diff"]= (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * sched["success_diff"]
sched["implied_x_expl_pass"]   = (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * sched["explosive_pass_diff"]
sched["edge_x_epa_diff"]       = (sched["home_market_edge_prob"] - sched["away_market_edge_prob"]) * sched["epa_diff"]
sched["edge_x_success_diff"]   = (sched["home_market_edge_prob"] - sched["away_market_edge_prob"]) * sched["success_diff"]
sched["edge_x_expl_pass"]      = (sched["home_market_edge_prob"] - sched["away_market_edge_prob"]) * sched["explosive_pass_diff"]
sched["home_edge_x_rest"]      = sched["home_market_edge_prob"] * sched["home_rest_days"]
sched["away_edge_x_rest"]      = sched["away_market_edge_prob"] * sched["away_rest_days"]
sched["home_implied_x_rest"]   = sched["home_avg_implied_prob"] * sched["home_rest_days"]
sched["away_implied_x_rest"]   = sched["away_avg_implied_prob"] * sched["away_rest_days"]
sched["home_rz_x_home_implied"]   = sched["home_red_zone_td_rate"] * sched["home_avg_implied_prob"]
sched["away_rz_x_away_implied"]   = sched["away_red_zone_td_rate"] * sched["away_avg_implied_prob"]
sched["home_g2g_x_home_edge"]     = sched["home_g2g_td_rate"] * sched["home_market_edge_prob"]
sched["away_g2g_x_away_edge"]     = sched["away_g2g_td_rate"] * sched["away_market_edge_prob"]
sched["home_def_rz_x_implied"]    = sched["home_def_def_red_zone_td_rate_allowed"] * sched["home_avg_implied_prob"]
sched["away_def_rz_x_implied"]    = sched["away_def_def_red_zone_td_rate_allowed"] * sched["away_avg_implied_prob"]
sched["home_def_g2g_x_edge"]      = sched["home_def_def_g2g_td_rate_allowed"] * sched["home_market_edge_prob"]
sched["away_def_g2g_x_edge"]      = sched["away_def_def_g2g_td_rate_allowed"] * sched["away_market_edge_prob"]
sched["home_to_x_home_implied"]   = sched["home_turnover_rate"] * sched["home_avg_implied_prob"]
sched["away_to_x_away_implied"]   = sched["away_turnover_rate"] * sched["away_avg_implied_prob"]
sched["home_takeaways_x_edge"]    = sched["home_def_def_takeaways"] * sched["home_market_edge_prob"]
sched["away_takeaways_x_edge"]    = sched["away_def_def_takeaways"] * sched["away_market_edge_prob"]
sched["home_expl_pass_x_home_edge"]  = sched["home_explosive_pass_rate"] * sched["home_market_edge_prob"]
sched["away_expl_pass_x_away_edge"]  = sched["away_explosive_pass_rate"] * sched["away_market_edge_prob"]
sched["home_expl_rush_x_home_edge"]  = sched["home_explosive_rush_rate"] * sched["home_market_edge_prob"]
sched["away_expl_rush_x_away_edge"]  = sched["away_explosive_rush_rate"] * sched["away_market_edge_prob"]
sched["home_def_expl_pass_x_implied"]= sched["home_def_def_explosive_pass_rate_allowed"] * sched["home_avg_implied_prob"]
sched["away_def_expl_pass_x_implied"]= sched["away_def_def_explosive_pass_rate_allowed"] * sched["away_avg_implied_prob"]
sched["home_def_expl_rush_x_edge"]   = sched["home_def_def_explosive_rush_rate_allowed"] * sched["home_market_edge_prob"]
sched["away_def_expl_rush_x_edge"]   = sched["away_def_def_explosive_rush_rate_allowed"] * sched["away_market_edge_prob"]
sched["home_def_epa_x_implied"]     = sched["home_def_def_epa_per_play"] * sched["home_avg_implied_prob"]
sched["away_def_epa_x_implied"]     = sched["away_def_def_epa_per_play"] * sched["away_avg_implied_prob"]
sched["home_def_succ_x_edge"]       = sched["home_def_def_success_rate"] * sched["home_market_edge_prob"]
sched["away_def_succ_x_edge"]       = sched["away_def_def_success_rate"] * sched["away_market_edge_prob"]
sched["home_edge_x_temp"]           = sched["home_market_edge_prob"] * sched["home_weather_temp"]
sched["away_edge_x_temp"]           = sched["away_market_edge_prob"] * sched["home_weather_temp"]
sched["home_implied_x_wind"]        = sched["home_avg_implied_prob"] * sched["home_weather_windspeed"]
sched["away_implied_x_wind"]        = sched["away_avg_implied_prob"] * sched["home_weather_windspeed"]
sched["home_edge_x_precip"]         = sched["home_market_edge_prob"] * sched["home_weather_precip"]
sched["away_edge_x_precip"]         = sched["away_market_edge_prob"] * sched["home_weather_precip"]
sched["edge_x_implied_home"] = sched["home_market_edge_prob"] * sched["home_avg_implied_prob"]
sched["edge_x_implied_away"] = sched["away_market_edge_prob"] * sched["away_avg_implied_prob"]
sched["implied_diff_x_edge_diff"] = (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * (sched["home_market_edge_prob"] - sched["away_market_edge_prob"])
sched["spread_x_market_spread_resid"] = sched["home_spread_line"] * sched["market_spread_resid"]
sched["implied_x_market_epa_resid"]   = (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * sched["market_epa_resid"]
sched["edge_x_market_spread_resid"]   = (sched["home_market_edge_prob"] - sched["away_market_edge_prob"]) * sched["market_spread_resid"]
sched["home_def_epa_x_away_epa"]   = sched["home_def_def_epa_per_play"] * sched["away_epa_per_play"]
sched["away_def_epa_x_home_epa"]   = sched["away_def_def_epa_per_play"] * sched["home_epa_per_play"]
sched["home_def_succ_x_away_succ"] = sched["home_def_def_success_rate"] * sched["away_success_rate"]
sched["away_def_succ_x_home_succ"] = sched["away_def_def_success_rate"] * sched["home_success_rate"]
sched["home_def_xpass_x_away_xpass"] = sched["home_def_def_explosive_pass_rate_allowed"] * sched["away_explosive_pass_rate"]
sched["away_def_xpass_x_home_xpass"] = sched["away_def_def_explosive_pass_rate_allowed"] * sched["home_explosive_pass_rate"]
sched["home_def_xrush_x_away_xrush"] = sched["home_def_def_explosive_rush_rate_allowed"] * sched["away_explosive_rush_rate"]
sched["away_def_xrush_x_home_xrush"] = sched["away_def_def_explosive_rush_rate_allowed"] * sched["home_explosive_rush_rate"]
sched["epa_x_success_diff"] = sched["epa_diff"] * sched["success_diff"]
sched["epa_x_expl_pass_diff"] = sched["epa_diff"] * sched["explosive_pass_diff"]
sched["succ_x_to_diff"] = sched["success_diff"] * sched["to_diff"]
sched["rz_x_g2g_diff"] = sched["rz_diff"] * sched["g2g_diff"]
sched["implied_x_rz_diff"]  = (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * sched["rz_diff"]
sched["edge_x_g2g_diff"]    = (sched["home_market_edge_prob"] - sched["away_market_edge_prob"]) * sched["g2g_diff"]
sched["implied_x_to_diff"]  = (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"]) * sched["to_diff"]
sched["edge_x_expl_rush"]   = (sched["home_market_edge_prob"] - sched["away_market_edge_prob"]) * sched["explosive_rush_mismatch"]
sched["rest_x_epa_diff"]   = sched["home_rest_days"] * sched["epa_diff"]
sched["wind_x_expl_pass"]  = sched["home_weather_windspeed"] * sched["explosive_pass_diff"]
sched["precip_x_success"]  = sched["home_weather_precip"] * sched["success_diff"]
sched["temp_x_epa_diff"]   = sched["home_weather_temp"] * sched["epa_diff"]
sched["div_x_implied_prob"] = sched["div_game"] * (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"])
sched["div_x_edge"]         = sched["div_game"] * (sched["home_market_edge_prob"] - sched["away_market_edge_prob"])
sched["div_x_spread_resid"] = sched["div_game"] * sched["spread_residual"]
sched["vigfree_diff"]  = sched["home_vigfree"] - sched["away_vigfree"]
sched["vigfree_ratio"] = sched["home_vigfree"] / sched["away_vigfree"].replace(0, np.nan)
sched["home_implied_x_vigfree"] = sched["home_avg_implied_prob"] * sched["home_vigfree"]
sched["away_implied_x_vigfree"] = sched["away_avg_implied_prob"] * sched["away_vigfree"]
sched["home_edge_x_vigfree"] = sched["home_market_edge_prob"] * sched["home_vigfree"]
sched["away_edge_x_vigfree"] = sched["away_market_edge_prob"] * sched["away_vigfree"]
sched["spread_x_vigfree_diff"] = sched["home_spread_line"] * sched["vigfree_diff"]
sched["div_x_vigfree_diff"]  = sched["div_game"] * sched["vigfree_diff"]
sched["div_x_vigfree_ratio"] = sched["div_game"] * sched["vigfree_ratio"]
sched["vigfree_spread_resid"] = sched["home_spread_line"] - sched["vigfree_diff"]
sched["vigfree_epa_resid"]    = sched["epa_diff"] - sched["vigfree_diff"]
sched["pace_diff"] = sched["home_pace_sec_per_play"] - sched["away_pace_sec_per_play"]
sched["xpass_diff"] = sched["home_xpass_mean"] - sched["away_xpass_mean"]
sched["spread_x_pace_diff"] = sched["home_spread_line"] * sched["pace_diff"]
sched["spread_x_xpass_diff"] = sched["home_spread_line"] * sched["xpass_diff"]
sched["pressure_diff"] = sched["home_pressure_rate"] - sched["away_pressure_rate"]
sched["sack_diff"]     = sched["home_sack_rate"] - sched["away_sack_rate"]
sched["spread_x_pressure"] = sched["home_spread_line"] * sched["pressure_diff"]
sched["penalty_diff"] = sched["home_penalty_yards_per_play"] - sched["away_penalty_yards_per_play"]
sched["spread_x_penalty_diff"] = sched["home_spread_line"] * sched["penalty_diff"]
sched["fg_diff"] = sched["home_fg_make_rate"] - sched["away_fg_make_rate"]
sched["tb_diff"] = sched["home_ko_touchback_rate"] - sched["away_ko_touchback_rate"]
sched["punt_block_diff"] = sched["home_punt_block_rate"] - sched["away_punt_block_rate"]
sched["ppd_diff"] = sched["home_points_per_drive"] - sched["away_points_per_drive"]
sched["drive_len_diff"] = sched["home_avg_drive_len"] - sched["away_avg_drive_len"]
sched["field_pos_diff"] = sched["home_avg_start_fp"] - sched["away_avg_start_fp"]
sched["cpoe_diff"] = sched["home_completion_percentage_above_expectation"] - sched["away_completion_percentage_above_expectation"]
sched["rating_diff"] = sched["home_passer_rating"] - sched["away_passer_rating"]
sched["adot_diff"]   = sched["home_a_dot"] - sched["away_a_dot"]
sched["ryoe_diff"] = sched["home_rush_yards_over_expected_per_att"] - sched["away_rush_yards_over_expected_per_att"]
sched["rush_roe_diff"] = sched["home_rush_pct_over_expected"] - sched["away_rush_pct_over_expected"]
sched["injury_diff"] = sched["injury_impact_pre"] * (sched["home_vigfree"] - sched["away_vigfree"])
sched["starter_diff"] = sched["pct_starters_out_pre"] * sched["vigfree_diff"]
sched["spread_x_elo"] = sched["home_spread_line"] * sched["elo_diff_market"]
sched["elo_x_implied"] = sched["elo_diff_market"] * (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"])
sched["humidity_diff"] = sched["home_weather_humidity"] * (sched["home_avg_implied_prob"] - sched["away_avg_implied_prob"])
sched["humidity_x_epa"] = sched["home_weather_humidity"] * sched["epa_diff"]
sched["third_down_diff"] = sched["home_third_down_conv_rate"] - sched["away_third_down_conv_rate"]
sched["def_third_down_diff"] = sched["home_def_def_third_down_conv_rate_allowed"] - sched["away_def_def_third_down_conv_rate_allowed"]
sched["fourth_epa_diff"] = sched["home_fourth_epa"] - sched["away_fourth_epa"]
sched["late_epa_diff"] = sched["home_two_min_epa"] - sched["away_two_min_epa"]
sched["late_def_epa_diff"] = sched["home_def_def_two_min_epa_allowed"] - sched["away_def_def_two_min_epa_allowed"]
sched["deep_epa_diff"] = sched["home_deep_epa"] - sched["away_deep_epa"]
sched["short_pass_ct_diff"] = sched["home_short_pass_ct"] - sched["away_short_pass_ct"]
sched["def_deep_epa_diff"] = sched["home_def_def_deep_epa_allowed"] - sched["away_def_def_deep_epa_allowed"]
sched["explosive_play_allowed_diff"] = (sched["home_def_def_explosive_pass_allowed"] + sched["home_def_def_explosive_rush_allowed"]) - (sched["away_def_def_explosive_pass_allowed"] + sched["away_def_def_explosive_rush_allowed"])
sched["fp_diff"] = sched["home_avg_start_fp"] - sched["away_avg_start_fp"]
sched["def_fp_allowed_diff"] = sched["home_def_def_avg_start_fp_allowed"] - sched["away_def_def_avg_start_fp_allowed"]
sched["air_yards_diff"] = sched["home_air_yards_sum"] - sched["away_air_yards_sum"]
sched["to_sticks_diff"] = sched["home_avg_air_yards_to_sticks"] - sched["away_avg_air_yards_to_sticks"]
sched["comp_air_diff"] = sched["home_avg_completed_air_yards"] - sched["away_avg_completed_air_yards"]
sched["garbage_rate_diff"] = sched["home_garbage_play_rate"] - sched["away_garbage_play_rate"]
sched["garbage_rate_lag_diff"] = sched["home_garbage_play_rate_lag"] - sched["away_garbage_play_rate_lag"]
sched["punt_blocked_diff"] = sched["home_punt_blocked"] - sched["away_punt_blocked"]
sched["tb_count_diff"] = sched["home_kickoff_tb"] - sched["away_kickoff_tb"]
sched["snap_off_pct_diff"] = sched["home_snaps_avg_offense_pct_pre"] - sched["away_snaps_avg_offense_pct_pre"]
sched["snap_def_pct_diff"] = sched["home_snaps_avg_defense_pct_pre"] - sched["away_snaps_avg_defense_pct_pre"]
sched["starters_90_diff"] = sched["home_snaps_num_starters_90_pre"] - sched["away_snaps_num_starters_90_pre"]
top_feats = [
    "home_r3_mean_pre_home_plays",
    "away_szn_mean_away_trailing",
    "extra-point-attempts-per-game_Away",
    "away_szn_mean_away_lead_pass",
    "extra-point-conversion-pct_prev",
    "home_ko_touchback_rate",
    "home_r5_mean_pre_home_snaps_num_starters_90_pre",
    "home_r5_sum_pre_home_def_def_yards_per_play",
    "home_r3_sum_pre_home_def_def_deep_epa_allowed",
    "home_r5_mean_pre_home_week",
    "opponent-punt-attempts-per-game_prev",
    "home_r3_mean_pre_home_def_def_takeaways",
    "offensive-point-share-pct_Home",
    "away_szn_sum_away_third_down_plays",
    "away_r3_mean_pre_away_ml_diff",
    "home_szn_sum_home_shotgun_plays",
    "opponent-extra-point-conversion-pct_Last 3",
    "away_szn_sum_away_pace_neutral_lag",
    "away_szn_sum_away_ko_touchback_rate_lag",
    "opponent-average-team-passer-rating_prev",
    "home_szn_sum_home_under_odds",
    "away_szn_mean_away_bye_rest",
    "opponent-yards-per-game_Home",
    "home_szn_mean_home_trailing",
    "away_szn_sum_away_season_success",
    "away_szn_sum_away_over_odds",
    "yards-per-completion_Last 3",
    "home_szn_sum_home_trail_pass",
    "away_szn_sum_away_espn",
    "away_neutralish",
    "home_szn_mean_home_def_def_third_epa_allowed",
    "opponent-red-zone-scores-per-game_Last 3",
    "opp-4th-quarter-points-per-game_Away",
    "opponent-gross-passing-yards-per-game_Home",
    "offensive-points-per-game_Away",
    "away_qb_hits",
    "away_r3_mean_pre_away_avg_expected_yac",
    "home_def_def_explosive_rush_allowed",
    "away_r3_mean_pre_away_explosive_rush",
    "opponent-net-yards-per-successful-punt_prev",
    "points-per-field-goal-attempt_Away",
    "away_szn_sum_away_vigfree",
    "away_szn_mean_away_def_def_explosive_pass_rate_allowed",
    "percent-of-games-with-an-interception_Home",
    "gross-punt-yards-per-game_prev",
    "away_r3_mean_pre_away_def_def_two_min_epa_allowed",
    "home_r5_mean_pre_home_leading",
    "away_r3_mean_pre_away_def_def_explosive_rush_allowed",
    "home_szn_sum_home_def_def_g2g_epa_allowed",
    "away_szn_sum_away_bye_rest"
]
for feat in top_feats:
    if feat in sched.columns:
        sched[f"spread_x_{feat}"] = sched["home_spread_line"] * pd.to_numeric(sched[feat], errors="coerce")

print([c for c in sched.columns if "_x" in c or "_y" in c])  
for col in sched.columns.tolist():  
    if col.endswith("_x") and col.replace("_x", "_y") in sched.columns:
        xcol = col
        ycol = col.replace("_x", "_y")
        sched[col.replace("_x", "")] = sched[ycol].iloc[:,0].combine_first(sched[xcol].iloc[:,0]) \
            if isinstance(sched[ycol], pd.DataFrame) else sched[ycol].combine_first(sched[xcol])
        sched.drop(columns=[xcol, ycol], inplace=True)
print([c for c in sched.columns if "_x" in c or "_y" in c])
sched.shape

['home_avg_drive_yards', 'away_avg_drive_yards', 'home_yards_gained', 'home_penalty_yards', 'home_air_yards_sum', 'home_xpass_mean', 'home_penalty_yards_per_play', 'away_yards_gained', 'away_penalty_yards', 'away_air_yards_sum', 'away_xpass_mean', 'away_penalty_yards_per_play', 'home_def_def_yards_per_play', 'home_def_def_penalty_yards', 'away_def_def_yards_per_play', 'away_def_def_penalty_yards', 'career_pass_yds_QB1', 'career_pass_yds_RB1', 'career_pass_yds_TE1', 'career_pass_yds_WR1', 'career_pass_yds_WR2', 'career_rec_yds_QB1', 'career_rec_yds_RB1', 'career_rec_yds_TE1', 'career_rec_yds_WR1', 'career_rec_yds_WR2', 'career_rush_yds_QB1', 'career_rush_yds_RB1', 'career_rush_yds_TE1', 'career_rush_yds_WR1', 'career_rush_yds_WR2', 'passing_yards_s2d_QB1', 'passing_yards_s2d_RB1', 'passing_yards_s2d_TE1', 'passing_yards_s2d_WR1', 'passing_yards_s2d_WR2', 'receiving_yards_s2d_QB1', 'receiving_yards_s2d_RB1', 'receiving_yards_s2d_TE1', 'receiving_yards_s2d_WR1', 'receiving_yards_s2d_WR2',

(5932, 4810)

In [28]:
game_features_extended = sched

## **VECTORIZE DATA AND PREP FOR MODELING**

In [29]:
gf = game_features_extended.copy()

TEAM_FIX = {
    "LA": "LAR",  # Rams
    "LV": "LVR",  # Raiders
    "WSH": "WAS", # Commanders
    "JAX": "JAC", # Jaguars
}
for c in ["home_team", "away_team"]:
    gf[c] = gf[c].replace(TEAM_FIX)

gf["game_id"] = (
    gf["season"].astype(str) + "_" +
    gf["week"].astype(str).str.zfill(2) + "_" +
    gf["home_team"] + "_" +
    gf["away_team"]
)

if "season_x" in gf.columns:
    gf = gf.rename(columns={"season_x": "season"})
if "season_y" in gf.columns:
    gf = gf.drop(columns=["season_y"])
for col in ["away_week", "home_ngs_week", "away_ngs_week"]:
    if col in gf.columns:
        gf = gf.drop(columns=[col])

dup_cols = [c for c in gf.columns if c.endswith("_y")]
gf = gf.drop(columns=dup_cols, errors="ignore")
gf = gf.loc[:, ~gf.columns.duplicated()]
print("[CLEANUP] Remaining columns:", gf.shape[1])

_required = {"game_id","season","week","home_team","away_team","home_score","away_score"}
missing = sorted(list(_required - set(gf.columns)))
if missing:
    raise ValueError(f"[PHASE 6] Missing required columns: {missing}")

def moneyline_to_prob(odds):
    try:
        odds = float(odds)
    except (TypeError, ValueError):
        return np.nan
    if odds > 0:
        return 100.0 / (odds + 100.0)
    elif odds < 0:
        return abs(odds) / (abs(odds) + 100.0)
    else:
        return np.nan

gf["home_spread_prob"] = gf["home_spread_odds"].apply(moneyline_to_prob)
gf["away_spread_prob"] = gf["away_spread_odds"].apply(moneyline_to_prob)

mask_valid = gf["home_spread_prob"].notna() & gf["away_spread_prob"].notna()
gf = gf.loc[mask_valid].copy()

total_prob = gf["home_spread_prob"] + gf["away_spread_prob"]
gf["home_spread_vigfree"] = gf["home_spread_prob"] / total_prob
gf["away_spread_vigfree"] = gf["away_spread_prob"] / total_prob

if "spread_line" not in gf.columns:
    raise ValueError("[PHASE 6] Missing spread_line column")

margin = gf["home_score"] - gf["away_score"]
home_cover = (margin > gf["spread_line"]).astype("Int64")
y = pd.Series(home_cover, index=gf.index, name="home_cover")

id_cols = {"game_id","home_team","away_team"}
target_cols = {"home_score","away_score","home_cover","away_win","home_result","away_result","result"}
base_exclude = id_cols | target_cols

exclude = {
    "game_id","home_team","away_team",
    "home_score","away_score",
    "away_win","home_result","away_result","result"
}

leakage_exact = {
    "home_win","home_cover",
    "home_away_score","away_home_score","home_home_score",
    "away_szn_sum_away_result","away_away_score","home_szn_sum_home_result",
    "home_r3_mean_pre_home_result","home_szn_mean_home_away_score",
    "home_szn_mean_home_score","away_szn_sum_away_score",
    "away_r5_sum_pre_away_result","away_r3_sum_pre_away_result",
    "home_trailing","away_trailing","home_leading","away_leading",
    "home_lead_pass","away_lead_pass","home_trail_pass","away_trail_pass",
    "overtime","total",
    "home_kickoff_attempts","away_kickoff_attempts",
    "home_szn_sum_home_kickoff_attempts",
    "away_szn_sum_away_kickoff_attempts"
}

market_patterns = ["moneyline","spread","implied_prob","vig","vigfree"]
safe_context = ["kickoff_et","game_date","stadium","head_ref_name"]

leakage_patterns = [
    r".*_score.*",
    r".*_result.*",
    r".*kickoff_attempts.*",
]

tb_pattern = re.compile(r"^tb_|_tb$|tb_")

feature_cols_full = []
for c in gf.columns:
    if c in base_exclude:
        continue
    if tb_pattern.search(c):
        continue
    feature_cols_full.append(c)

feature_cols_noleak = []
for c in gf.columns:
    if c in exclude or c in leakage_exact:
        continue
    if any(re.search(p, c) for p in leakage_patterns) and "_pre_" not in c:
        continue
    if tb_pattern.search(c):
        continue
    feature_cols_noleak.append(c)

for c in safe_context:
    if c in gf.columns:
        if c not in feature_cols_full:
            feature_cols_full.append(c)
        if c not in feature_cols_noleak:
            feature_cols_noleak.append(c)

X_full   = gf[feature_cols_full].copy()      
X_noleak = gf[feature_cols_noleak].copy()   

X = X_noleak.copy()

market_cols = [c for c in gf.columns if any(p in c for p in market_patterns)]
print(f"[PHASE 6] Final feature count (no-leak): {len(feature_cols_noleak)}")
print(f"[PHASE 6] Final feature count (full):    {len(feature_cols_full)}")

X.columns = X.columns.astype(str)
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical_cols = [c for c in X.columns if c not in numeric_cols]
for c in categorical_cols:
    if pd.api.types.is_object_dtype(X[c]):
        X[c] = X[c].astype("category")
cat_categories = {c: X[c].cat.categories for c in categorical_cols if pd.api.types.is_categorical_dtype(X[c])}

print(f"[PHASE 6] model matrix shape: {X.shape}, target shape: {y.shape}")

spread_cols = ["home_spread_prob","away_spread_prob","home_spread_vigfree","away_spread_vigfree"]
meta = gf[["game_id","season","week","kickoff_et","home_team","away_team"] + market_cols + spread_cols].copy()
meta = meta.loc[:, ~meta.columns.duplicated()]

season_ser = pd.to_numeric(gf["season"], errors="coerce").astype("Int64")
train_mask = season_ser.between(2000, 2017)
valid_mask = season_ser.between(2018, 2021)
test_mask  = season_ser.between(2022, 2024)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_valid, y_valid = X.loc[valid_mask], y.loc[valid_mask]
X_test,  y_test  = X.loc[test_mask],  y.loc[test_mask]

meta_train = meta.loc[train_mask]
meta_valid = meta.loc[valid_mask]
meta_test  = meta.loc[test_mask]

print("[PHASE 6] Split sizes:", X_train.shape[0], X_valid.shape[0], X_test.shape[0])

meta_out, X_out, y_out = meta.copy(), X.copy(), y.copy()

cat_categories = {c: X_out[c].cat.categories for c in categorical_cols if pd.api.types.is_categorical_dtype(X_out[c])}

def preprocess_for_xgb(Xdf: pd.DataFrame) -> pd.DataFrame:
    Xdf = Xdf.copy()
    dt_cols = Xdf.select_dtypes(include=["datetimetz","datetime64"]).columns
    for c in dt_cols:
        Xdf[c] = pd.to_datetime(Xdf[c], errors="coerce").view("int64") // 10**9
    for c, cats in cat_categories.items():
        if c in Xdf.columns:
            Xdf[c] = pd.Categorical(Xdf[c], categories=cats)
    return Xdf

numeric_cols = [c for c in X_noleak.columns if pd.api.types.is_numeric_dtype(X_noleak[c])]
categorical_cols = [c for c in X_noleak.columns if c not in numeric_cols]

scaler = StandardScaler()
X_norm_num = pd.DataFrame(
    scaler.fit_transform(X_noleak[numeric_cols]),
    index=X_noleak.index,
    columns=[f"{c}_znorm" for c in numeric_cols]
)
X_normalized = pd.concat([X_norm_num, X_noleak[categorical_cols]], axis=1)

corr = X_noleak[numeric_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.90)]
numeric_keep = [c for c in numeric_cols if c not in to_drop]

X_corr_filtered = pd.concat(
    [X_noleak[numeric_keep], X_noleak[categorical_cols]],
    axis=1
)

print("[PHASE 6] Corr-filter drop count:", len(to_drop))


numeric_for_pca = X_noleak[numeric_cols].copy()

numeric_for_pca = numeric_for_pca.replace([np.inf, -np.inf], np.nan)

all_nan_cols = numeric_for_pca.columns[numeric_for_pca.isna().all()]
if len(all_nan_cols) > 0:
    print(f"[PHASE 6] PCA: dropping {len(all_nan_cols)} all-NaN columns")
    numeric_for_pca = numeric_for_pca.drop(columns=all_nan_cols)

medians = numeric_for_pca.median()
numeric_for_pca = numeric_for_pca.fillna(medians)

still_nan_cols = numeric_for_pca.columns[numeric_for_pca.isna().any()]
if len(still_nan_cols) > 0:
    print(f"[PHASE 6] PCA: dropping {len(still_nan_cols)} columns that still contain NaNs after imputation")
    numeric_for_pca = numeric_for_pca.drop(columns=still_nan_cols)

print(f"[PHASE 6] PCA: final numeric shape for PCA: {numeric_for_pca.shape}")

scaler_pca = StandardScaler()
numeric_scaled = scaler_pca.fit_transform(numeric_for_pca)

pca = PCA(n_components=10, random_state=0)
X_pca_num = pca.fit_transform(numeric_scaled)

X_pca = pd.DataFrame(
    X_pca_num,
    index=X_noleak.index,
    columns=[f"PC{i+1}" for i in range(X_pca_num.shape[1])]
)

print("[PHASE 6] PCA explained variance ratio (first 10):", pca.explained_variance_ratio_)

[CLEANUP] Remaining columns: 4808
[PHASE 6] Final feature count (no-leak): 4730
[PHASE 6] Final feature count (full):    4784
[PHASE 6] model matrix shape: (4696, 4730), target shape: (4696,)
[PHASE 6] Split sizes: 2610 1038 854
[PHASE 6] Corr-filter drop count: 2012
[PHASE 6] PCA: dropping 435 all-NaN columns
[PHASE 6] PCA: final numeric shape for PCA: (4696, 4108)
[PHASE 6] PCA explained variance ratio (first 10): [0.0636877  0.05139657 0.04188449 0.03312328 0.01631181 0.01514752
 0.01408459 0.01367487 0.01179314 0.01135482]


## **MODEL**

In [30]:
X, y, meta = X_out.copy(), y_out.copy(), meta_out.copy()
labeled = y.notna()
X_lab, y_lab, M_lab = X.loc[labeled], y.loc[labeled].astype(int), meta.loc[labeled]
seasons = pd.to_numeric(M_lab["season"], errors="coerce").astype(int)

train_mask = seasons.between(2000, 2018)
valid_mask = seasons.between(2019, 2021)
test_mask  = seasons.between(2022, 2024)

def split(X_, y_, M_, mask): 
    return X_.loc[mask], y_.loc[mask], M_.loc[mask]

X_tr, y_tr, M_tr = split(X_lab, y_lab, M_lab, train_mask)
X_va, y_va, M_va = split(X_lab, y_lab, M_lab, valid_mask)
X_te, y_te, M_te = split(X_lab, y_lab, M_lab, test_mask)

def preprocess_for_xgb(Xdf: pd.DataFrame) -> pd.DataFrame:
    Xdf = Xdf.copy()
    dt_cols = Xdf.select_dtypes(include=["datetimetz","datetime64"]).columns
    for c in dt_cols:
        Xdf[c] = pd.to_datetime(Xdf[c], errors="coerce").view("int64") // 10**9
    obj_cols = Xdf.select_dtypes(include=["object"]).columns
    for c in obj_cols:
        Xdf[c] = Xdf[c].astype("category")
    return Xdf

X_tr, X_va, X_te = map(preprocess_for_xgb, [X_tr, X_va, X_te])

def cast_params(d: dict) -> dict:
    out = d.copy()
    int_fields = ["max_depth", "min_child_weight", "max_bin"]
    for k in int_fields:
        if k in out: out[k] = int(out[k])
    float_fields = [
        "learning_rate","reg_alpha","reg_lambda","gamma",
        "colsample_bytree","subsample"
    ]
    for k in float_fields:
        if k in out: out[k] = float(out[k])
    return out

def fair_payout(p):
    if p <= 0 or p >= 1 or np.isnan(p):
        return np.nan
    return p / (1.0 - p)

def build_sample_weights(
    M_meta: pd.DataFrame,
    w_acc: float,
    w_disagree_acc: float,
    w_disagree_rate: float,
    w_ev: float
) -> np.ndarray:
    """
    Per-sample training weights that encode:
      - baseline accuracy importance (w_acc)
      - emphasis on disagreements (w_disagree_acc, w_disagree_rate)
      - emphasis on high-EV games (w_ev)
    """
    M = M_meta.copy()
    base = np.ones(len(M), dtype=float)

    if {"home_spread_vigfree","away_spread_vigfree"}.issubset(M.columns):
        home_p = pd.to_numeric(M["home_spread_vigfree"], errors="coerce").values
        away_p = pd.to_numeric(M["away_spread_vigfree"], errors="coerce").values
    else:
        home_p = pd.to_numeric(M["home_spread_prob"], errors="coerce").values
        away_p = pd.to_numeric(M["away_spread_prob"], errors="coerce").values

    market_pick = (home_p >= away_p).astype(int)

    home_payout = np.array([fair_payout(p) for p in home_p])
    away_payout = np.array([fair_payout(p) for p in away_p])
    max_payout = np.nanmax(np.vstack([home_payout, away_payout]), axis=0)
    max_payout = np.nan_to_num(max_payout, nan=1.0)

    ev_boost = 1.0 + w_ev * (max_payout - 1.0)

    prob_gap = np.abs(home_p - away_p)
    max_gap = np.nanmax(prob_gap) if np.nanmax(prob_gap) > 0 else 1.0
    prob_gap_norm = np.nan_to_num(prob_gap / max_gap, nan=0.0)
    disagree_boost = 1.0 + (w_disagree_acc + w_disagree_rate) * prob_gap_norm

    weights = base * ev_boost * disagree_boost

    weights[weights <= 0] = 1e-6
    return weights


def make_disagree_metric(
    threshold=0.5,
    w_acc=0.3,
    w_disagree_acc=0.3,
    w_disagree_rate=0.1,
    w_ev=0.3
):
    def evaluate(probs, y_true, M_meta):
        y_true = y_true.values
        y_pred = (probs >= threshold).astype(int)

        if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(M_meta.columns):
            home_prob = pd.to_numeric(M_meta["home_spread_vigfree"], errors="coerce").values
            away_prob = pd.to_numeric(M_meta["away_spread_vigfree"], errors="coerce").values
        elif {"home_spread_prob", "away_spread_prob"}.issubset(M_meta.columns):
            home_prob = pd.to_numeric(M_meta["home_spread_prob"], errors="coerce").values
            away_prob = pd.to_numeric(M_meta["away_spread_prob"], errors="coerce").values
        else:
            raise KeyError("M_meta must include spread-based prob columns")

        market_pick = (home_prob >= away_prob).astype(int)

        overall_acc = (y_pred == y_true).mean()
        mask = y_pred != market_pick
        disagree_acc = (y_pred[mask] == y_true[mask]).mean() if mask.any() else 0.0
        disagree_rate = mask.mean()

        ev_list = []
        for yp, yt, hp, ap in zip(y_pred, y_true, home_prob, away_prob):
            if yp == 1:
                payout = fair_payout(hp)
                if not np.isnan(payout):
                    ev_list.append(payout if yt == 1 else -1.0)
            else:
                payout = fair_payout(ap)
                if not np.isnan(payout):
                    ev_list.append(payout if yt == 0 else -1.0)
        ev = np.nanmean(ev_list) if ev_list else 0.0

        score = (
            w_acc * overall_acc +
            w_disagree_acc * disagree_acc +
            w_disagree_rate * disagree_rate +
            w_ev * ev
        )
        return score, overall_acc, disagree_acc, disagree_rate, ev
    return evaluate

def eval_bundle(
    preds,
    y_true,
    M_meta,
    thresholds,
    w_acc,
    w_disagree_acc,
    w_disagree_rate,
    w_ev
):
    """
    Evaluate across threshold grid; pick best score under the given weights.
    """
    results = []
    for th in thresholds:
        metric_fn = make_disagree_metric(
            threshold=th,
            w_acc=w_acc,
            w_disagree_acc=w_disagree_acc,
            w_disagree_rate=w_disagree_rate,
            w_ev=w_ev
        )
        score, overall, disa_acc, disa_rate, ev = metric_fn(preds, y_true, M_meta)
        results.append((th, score, overall, disa_acc, disa_rate, ev))
    best = max(results, key=lambda x: x[1])
    th, score, overall, disa_acc, disa_rate, ev = best
    auc = roc_auc_score(y_true, preds)
    return dict(
        threshold=th,
        score=score,
        overall=overall,
        disa_acc=disa_acc,
        disa_rate=disa_rate,
        ev=ev,
        auc=auc
    )


def train_once(
    params,
    w_acc=0.3,
    w_disagree_acc=0.3,
    w_disagree_rate=0.1,
    w_ev=0.3,
    num_round=800,
    es_rounds=100,
    seed=1337,
    thresholds=np.arange(0.4,0.61,0.02)
):
    base = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "verbosity": 0,
        "seed": seed
    }
    p = cast_params({**base, **params})

    w_tr = build_sample_weights(M_tr, w_acc, w_disagree_acc, w_disagree_rate, w_ev)
    w_va = build_sample_weights(M_va, w_acc, w_disagree_acc, w_disagree_rate, w_ev)

    dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=w_tr, enable_categorical=True)
    dvalid = xgb.DMatrix(X_va, label=y_va, weight=w_va, enable_categorical=True)
    dtest  = xgb.DMatrix(X_te, label=y_te, enable_categorical=True)
    booster = xgb.train(
        p, dtrain, num_boost_round=num_round,
        evals=[(dtrain,"train"), (dvalid,"valid")],
        early_stopping_rounds=es_rounds,
        verbose_eval=False
    )
    best_iter = booster.best_iteration
    predict = lambda dm: booster.predict(dm, iteration_range=(0, best_iter+1))
    tr_p, va_p, te_p = map(predict, [dtrain, dvalid, dtest])

    m_tr = eval_bundle(tr_p, y_tr, M_tr, thresholds,
                       w_acc, w_disagree_acc, w_disagree_rate, w_ev)
    m_va = eval_bundle(va_p, y_va, M_va, thresholds,
                       w_acc, w_disagree_acc, w_disagree_rate, w_ev)
    m_te = eval_bundle(te_p, y_te, M_te, thresholds,
                       w_acc, w_disagree_acc, w_disagree_rate, w_ev)

    return booster, best_iter, m_tr, m_va, m_te


depths   = [6, 8]
lrs      = [.04]
mcw      = [20]
subsmpl  = [1.0]
colsmp   = [1.0]
reg_a    = [0]
reg_l    = [10]
gammas   = [0]

grid = [
    {
        "max_depth": d,
        "learning_rate": lr,
        "min_child_weight": w,
        "subsample": ss,
        "colsample_bytree": cs,
        "reg_alpha": ra,
        "reg_lambda": rl,
        "gamma": g,
    }
    for d in depths
    for lr in lrs
    for w in mcw
    for ss in subsmpl
    for cs in colsmp
    for ra in reg_a
    for rl in reg_l
    for g in gammas
]

weight_grid = [
    (0.3, 0.0, 0.0, 0.0), 
    (0.3, 0.3, 0.1, 0.2),
    (0.3, 0.3, 0.1, 0.25),
    (0.3, 0.3, 0.1, 0.3),
]

print(f"Hyperparam grid size: {len(grid)}, weight grid size: {len(weight_grid)}")

results = []
for (w_acc, w_da, w_dr, w_ev) in weight_grid:
    for i, params in enumerate(tqdm(grid, desc=f"Grid (w_ev={w_ev})")):
        booster, best_iter, m_tr, m_va, m_te = train_once(
            params,
            w_acc=w_acc,
            w_disagree_acc=w_da,
            w_disagree_rate=w_dr,
            w_ev=w_ev,
            seed=1337 + i
        )
        results.append({
            **params,
            "w_acc": w_acc,
            "w_disagree_acc": w_da,
            "w_disagree_rate": w_dr,
            "w_ev": w_ev,
            "best_iter": best_iter,
            "train_score": m_tr["score"],
            "valid_score": m_va["score"],
            "test_score": m_te["score"],
            "valid_auc": m_va["auc"],
            "valid_ev": m_va["ev"]
        })
        gc.collect()

results_df = pd.DataFrame(results).sort_values("valid_score", ascending=False)

print("\n[TOP 10 VALID by custom score]")
print(results_df.head(10)[[
    "w_acc","w_disagree_acc","w_disagree_rate","w_ev",
    "valid_score","valid_auc","valid_ev",
    "max_depth","min_child_weight","learning_rate",
    "subsample","colsample_bytree","reg_alpha","reg_lambda","gamma","best_iter"
]])

best_row = results_df.iloc[0]
best_cfg = cast_params(best_row[[
    "max_depth","min_child_weight","learning_rate",
    "subsample","colsample_bytree",
    "reg_alpha","reg_lambda","gamma",
]].to_dict())
best_w_acc          = float(best_row["w_acc"])
best_w_disagree_acc = float(best_row["w_disagree_acc"])
best_w_disagree_rate= float(best_row["w_disagree_rate"])
best_w_ev           = float(best_row["w_ev"])

print("\n[FINAL BEST CONFIG + WEIGHTS]")
print("Hyperparams:", best_cfg)
print("Weights: w_acc={:.3f}, w_disagree_acc={:.3f}, w_disagree_rate={:.3f}, w_ev={:.3f}"
      .format(best_w_acc, best_w_disagree_acc, best_w_disagree_rate, best_w_ev))

X_trva = pd.concat([X_tr, X_va], axis=0)
y_trva = pd.concat([y_tr, y_va], axis=0)
M_trva = pd.concat([M_tr, M_va], axis=0)

w_trva = build_sample_weights(M_trva,
                              best_w_acc,
                              best_w_disagree_acc,
                              best_w_disagree_rate,
                              best_w_ev)
dtrva  = xgb.DMatrix(X_trva, label=y_trva, weight=w_trva, enable_categorical=True)

final_params = cast_params({
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "verbosity": 0,
    **best_cfg
})

final_booster = xgb.train(
    final_params, dtrva,
    num_boost_round=int(max(800, results_df["best_iter"].max()*1.2)),
    evals=[(dtrva,"train")],
    early_stopping_rounds=100,
    verbose_eval=False
)

best_model = final_booster
best_params = best_cfg
best_weights = {
    "w_acc": best_w_acc,
    "w_disagree_acc": best_w_disagree_acc,
    "w_disagree_rate": best_w_disagree_rate,
    "w_ev": best_w_ev,
}
results_overview = results_df.reset_index(drop=True)


Hyperparam grid size: 2, weight grid size: 4


Grid (w_ev=0.0): 100%|██████████| 2/2 [01:22<00:00, 41.01s/it]
Grid (w_ev=0.2): 100%|██████████| 2/2 [01:22<00:00, 41.43s/it]
Grid (w_ev=0.25): 100%|██████████| 2/2 [01:11<00:00, 35.74s/it]
Grid (w_ev=0.3): 100%|██████████| 2/2 [01:16<00:00, 38.32s/it]



[TOP 10 VALID by custom score]
   w_acc  w_disagree_acc  w_disagree_rate  w_ev  valid_score  valid_auc  \
6    0.3             0.3              0.1  0.30     0.598528   0.763202   
7    0.3             0.3              0.1  0.30     0.591066   0.762539   
4    0.3             0.3              0.1  0.25     0.582825   0.761647   
5    0.3             0.3              0.1  0.25     0.556794   0.754894   
2    0.3             0.3              0.1  0.20     0.551016   0.762716   
3    0.3             0.3              0.1  0.20     0.537274   0.756449   
0    0.3             0.0              0.0  0.00     0.211920   0.763120   
1    0.3             0.0              0.0  0.00     0.211167   0.767825   

   valid_ev  max_depth  min_child_weight  learning_rate  subsample  \
6  0.425493          6                20           0.04        1.0   
7  0.413008          8                20           0.04        1.0   
4  0.435570          6                20           0.04        1.0   
5  0.388223 

# **Visualization Components**

## **Build Games DF**

In [31]:
X = X_out.copy()
y = y_out.copy()
meta = meta_out.copy()

labeled_mask = y.notna()
X_lab = X.loc[labeled_mask].copy()
y_lab = y.loc[labeled_mask].astype(int)
M_lab = meta.loc[labeled_mask].copy()

X_lab_proc = preprocess_for_xgb(X_lab)

d_all = xgb.DMatrix(X_lab_proc, label=y_lab, enable_categorical=True)
probs_all = best_model.predict(d_all)
print(f"[PRED] probs_all shape: {probs_all.shape}, labels: {y_lab.shape}")

games_df = M_lab.copy()
games_df["y_true"] = y_lab.values.astype(int)              
games_df["pred_prob_home_cover"] = probs_all              

required_cols = [
    "game_id", "season", "week", "home_team", "away_team",
    "home_spread_prob", "away_spread_prob",
    "home_spread_vigfree", "away_spread_vigfree"
]
missing = [c for c in required_cols if c not in games_df.columns]
if missing:
    raise ValueError(f"[STEP 1] games_df missing required columns: {missing}")

home_p = pd.to_numeric(games_df["home_spread_vigfree"], errors="coerce").values
away_p = pd.to_numeric(games_df["away_spread_vigfree"], errors="coerce").values

games_df["market_pick"] = (home_p >= away_p).astype(int)

default_tau = 0.50
games_df["model_pick"] = (games_df["pred_prob_home_cover"] >= default_tau).astype(int)

games_df["disagree_flag"] = (games_df["model_pick"] != games_df["market_pick"]).astype(int)

def fair_payout(p):
    if p <= 0 or p >= 1 or np.isnan(p):
        return np.nan
    return p / (1.0 - p)

ev_list = []
for yp, yt, hp, ap in zip(
    games_df["model_pick"].values,
    games_df["y_true"].values,
    home_p,
    away_p
):
    if yp == 1:
        payout = fair_payout(hp)
        if np.isnan(payout):
            ev_list.append(np.nan)
        else:
            ev_list.append(payout if yt == 1 else -1.0)
    else:
        payout = fair_payout(ap)
        if np.isnan(payout):
            ev_list.append(np.nan)
        else:
            ev_list.append(payout if yt == 0 else -1.0)

games_df["ev_model_pick"] = ev_list

print("[STEP 1] games_df shape:", games_df.shape)
print("[STEP 1] games_df columns:", games_df.columns.tolist())
games_df["season"] = pd.to_numeric(games_df["season"], errors="coerce").astype("Int64")

valid_min_season = 2000
valid_max_season = 2024  

valid_season_mask = games_df["season"].between(valid_min_season, valid_max_season)
games_df = games_df.loc[valid_season_mask].copy()

print("[STEP 1B] Season range AFTER filtering:", games_df["season"].min(), "->", games_df["season"].max())

assert (games_df["season"] >= 2025).sum() == 0, "[SANITY] Found rows with season >= 2025 after filtering!"

bad_rows = games_df[games_df["season"] >= 2025][["game_id","season","week","y_true"]]

[PRED] probs_all shape: (4696,), labels: (4696,)
[STEP 1] games_df shape: (4696, 334)
[STEP 1] games_df columns: ['game_id', 'season', 'week', 'kickoff_et', 'home_team', 'away_team', 'spread_line', 'home_moneyline', 'away_moneyline', 'spread_favorite', 'spread_abs', 'avg_implied_prob', 'away_spread_odds', 'home_spread_odds', 'home_home_moneyline', 'home_away_moneyline', 'home_spread_favorite', 'home_home_spread_line', 'home_spread_abs', 'home_avg_implied_prob', 'home_away_spread_odds', 'home_home_spread_odds', 'away_spread_line', 'away_home_moneyline', 'away_away_moneyline', 'away_spread_favorite', 'away_home_spread_line', 'away_spread_abs', 'away_avg_implied_prob', 'away_away_spread_odds', 'away_home_spread_odds', 'home_spread_line', 'home_vigfree', 'away_vigfree', 'home_szn_mean_home_moneyline', 'home_szn_sum_home_moneyline', 'home_szn_mean_home_spread_odds', 'home_szn_sum_home_spread_odds', 'home_szn_mean_home_home_moneyline', 'home_szn_sum_home_home_moneyline', 'home_szn_mean_home_

## **Performance, Calibration and Threshold Tables**

In [32]:
def fair_payout(p):
    if p <= 0 or p >= 1 or np.isnan(p):
        return np.nan
    return p / (1.0 - p)

def build_perf_df(games_df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    df = games_df.copy()

    df["model_pick"] = (df["pred_prob_home_cover"] >= threshold).astype(int)
    df["is_bet"] = (
        (df["pred_prob_home_cover"] >= threshold) |
        (df["pred_prob_home_cover"] <= 1.0 - threshold)
    )

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p = pd.to_numeric(df["home_spread_vigfree"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_vigfree"], errors="coerce").values
    else:
        home_p = pd.to_numeric(df["home_spread_prob"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_prob"], errors="coerce").values

    ev_list = []
    for is_bet, yp, yt, hp, ap in zip(
        df["is_bet"].values,
        df["model_pick"].values,
        df["y_true"].values,
        home_p,
        away_p
    ):
        if not is_bet:
            ev_list.append(0.0)
            continue

        if yp == 1:
            payout = fair_payout(hp)
            if np.isnan(payout):
                ev_list.append(0.0)
            else:
                ev_list.append(payout if yt == 1 else -1.0)
        else:
            payout = fair_payout(ap)
            if np.isnan(payout):
                ev_list.append(0.0)
            else:
                ev_list.append(payout if yt == 0 else -1.0)

    df["ev_trade"] = ev_list

    def _weekly_accuracy(s):
        idx = s.index
        return (df.loc[idx, "model_pick"].values == s.values).mean()

    grouped = df.groupby(["season", "week"], as_index=False)
    perf = grouped.agg(
        n_games      = ("y_true", "size"),
        n_bets       = ("is_bet", "sum"),
        accuracy     = ("y_true", _weekly_accuracy),
        disagree_rate= ("disagree_flag", "mean"),
        ev           = ("ev_trade", "mean"),
    )

    perf = perf.sort_values(["season", "week"])
    perf["cum_ev"] = perf.groupby("season")["ev"].cumsum()

    return perf

tau_star = 0.5
perf_df = build_perf_df(games_df, threshold=tau_star)

In [33]:
def build_calib_df(games_df: pd.DataFrame, n_bins: int = 10) -> pd.DataFrame:
    df = games_df.copy()
    df = df[df["pred_prob_home_cover"].notna() & df["y_true"].notna()]
    
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    df["bin"] = pd.cut(df["pred_prob_home_cover"], bins=bins, include_lowest=True)

    calib = (
        df.groupby("bin")
          .agg(
              pred_mean     = ("pred_prob_home_cover", "mean"),
              empirical_rate= ("y_true", "mean"),
              count         = ("y_true", "size")
          )
          .reset_index(drop=True)
    )

    calib["bin_center"] = calib["pred_mean"] 

    return calib

calib_df = build_calib_df(games_df, n_bins=10)

print("[STEP 2B] calib_df:")
print(calib_df)


[STEP 2B] calib_df:
   pred_mean  empirical_rate  count  bin_center
0   0.057187        0.004155   1444    0.057187
1   0.132856        0.024671    608    0.132856
2   0.241640        0.211864    118    0.241640
3   0.351186        0.368421     95    0.351186
4   0.449022        0.387097     93    0.449022
5   0.546927        0.605263    114    0.546927
6   0.652120        0.683168    101    0.652120
7   0.758112        0.800000    110    0.758112
8   0.864659        0.978395    648    0.864659
9   0.938739        0.997438   1171    0.938739


In [34]:
def fair_payout(p):
    if p <= 0 or p >= 1 or np.isnan(p):
        return np.nan
    return p / (1.0 - p)

def compute_metrics_for_threshold(
    df: pd.DataFrame,
    threshold: float,
    w_acc: float = 0.3,
    w_disagree_acc: float = 0.3,
    w_disagree_rate: float = 0.1,
    w_ev: float = 0.3,
):
    """
    Compute all metrics for a given threshold and weight combo on a games_df slice.
    Returns a dict with metrics and the final weighted score.
    """
    df = df.copy()
    df = df[df["pred_prob_home_cover"].notna() & df["y_true"].notna()]

    df["y_pred"] = (df["pred_prob_home_cover"] >= threshold).astype(int)

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p = pd.to_numeric(df["home_spread_vigfree"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_vigfree"], errors="coerce").values
    else:
        home_p = pd.to_numeric(df["home_spread_prob"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_prob"], errors="coerce").values

    df["market_pick"] = (home_p >= away_p).astype(int)

    y_true = df["y_true"].values
    y_pred = df["y_pred"].values

    overall_acc = (y_pred == y_true).mean()

    disagree_mask = (y_pred != df["market_pick"].values)
    disagree_rate = disagree_mask.mean()
    if disagree_mask.any():
        disagree_acc = (y_pred[disagree_mask] == y_true[disagree_mask]).mean()
    else:
        disagree_acc = 0.0

    is_bet = (df["pred_prob_home_cover"] >= threshold) | \
             (df["pred_prob_home_cover"] <= 1.0 - threshold)

    ev_list = []
    for bet, yp, yt, hp, ap in zip(
        is_bet.values,
        y_pred,
        y_true,
        home_p,
        away_p
    ):
        if not bet:
            continue

        if yp == 1:
            payout = fair_payout(hp)
            if not np.isnan(payout):
                ev_list.append(payout if yt == 1 else -1.0)
        else:
            payout = fair_payout(ap)
            if not np.isnan(payout):
                ev_list.append(payout if yt == 0 else -1.0)

    ev = np.nanmean(ev_list) if len(ev_list) > 0 else 0.0

    score = (
        w_acc * overall_acc +
        w_disagree_acc * disagree_acc +
        w_disagree_rate * disagree_rate +
        w_ev * ev
    )

    try:
        auc = roc_auc_score(y_true, df["pred_prob_home_cover"].values)
    except Exception:
        auc = np.nan

    return {
        "threshold": threshold,
        "w_acc": w_acc,
        "w_disagree_acc": w_disagree_acc,
        "w_disagree_rate": w_disagree_rate,
        "w_ev": w_ev,
        "overall_accuracy": overall_acc,
        "disagree_accuracy": disagree_acc,
        "disagree_rate": disagree_rate,
        "ev": ev,
        "score": score,
        "auc": auc,
        "n_games": len(df),
        "n_bets": int(is_bet.sum()),
    }
test_mask_threshold = games_df["season"].between(2022, 2024)
games_test = games_df.loc[test_mask_threshold].copy()

threshold_grid = np.round(np.arange(0.01, 1, 0.01), 3)

threshold_records = []
for tau in threshold_grid:
    metrics = compute_metrics_for_threshold(
        games_test,
        threshold=tau,
        w_acc=0.3,
        w_disagree_acc=0.3,
        w_disagree_rate=0.1,
        w_ev=0.3,
    )
    threshold_records.append(metrics)

threshold_df = pd.DataFrame(threshold_records).sort_values("threshold").reset_index(drop=True)
threshold_df = (
    pd.DataFrame(threshold_records)
    .sort_values("overall_accuracy", ascending=False)
    .reset_index(drop=True)
)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

## **Experiments on Different Datasets for model train** 

In [35]:
feature_sets = {
    "baseline_noleak": X_noleak,
    "full_leak":       X_full,
    "normalized":      X_normalized,
    "corr_filtered":   X_corr_filtered,
    "pca":             X_pca,
}

def run_feature_set_experiment(
    X_matrix: pd.DataFrame,
    feature_set_name: str,
    best_cfg: dict,
    best_weights: dict,
    thresholds: np.ndarray,
) -> dict:
    """
    Train + evaluate one feature set using the SAME best hyperparams + weights
    you already selected. Returns a single metrics row.
    """

    X = X_matrix.copy()
    y = y_out.copy()
    meta = meta_out.copy()

    labeled_mask = y.notna()
    X_lab = X.loc[labeled_mask].copy()
    y_lab = y.loc[labeled_mask].astype(int)
    M_lab = meta.loc[labeled_mask].copy()

    seasons = pd.to_numeric(M_lab["season"], errors="coerce").astype(int)
    train_mask = seasons.between(2000, 2018)
    valid_mask = seasons.between(2019, 2021)
    test_mask  = seasons.between(2022, 2024)

    def split(X_, y_, M_, mask):
        return X_.loc[mask], y_.loc[mask], M_.loc[mask]

    X_tr, y_tr, M_tr = split(X_lab, y_lab, M_lab, train_mask)
    X_va, y_va, M_va = split(X_lab, y_lab, M_lab, valid_mask)
    X_te, y_te, M_te = split(X_lab, y_lab, M_lab, test_mask)
    X_tr, X_va, X_te = map(preprocess_for_xgb, [X_tr, X_va, X_te])
    w_acc          = best_weights["w_acc"]
    w_disagree_acc = best_weights["w_disagree_acc"]
    w_disagree_rate= best_weights["w_disagree_rate"]
    w_ev           = best_weights["w_ev"]

    w_tr = build_sample_weights(M_tr, w_acc, w_disagree_acc, w_disagree_rate, w_ev)
    w_va = build_sample_weights(M_va, w_acc, w_disagree_acc, w_disagree_rate, w_ev)

    dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=w_tr, enable_categorical=True)
    dvalid = xgb.DMatrix(X_va, label=y_va, weight=w_va, enable_categorical=True)
    dtest  = xgb.DMatrix(X_te, label=y_te, enable_categorical=True)

    base_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "verbosity": 0,
        "seed": 1337,
    }
    params = cast_params({**base_params, **best_cfg})

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=800,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False
    )
    best_iter = booster.best_iteration

    predict = lambda dm: booster.predict(dm, iteration_range=(0, best_iter + 1))
    tr_p, va_p, te_p = map(predict, [dtrain, dvalid, dtest])

    m_tr = eval_bundle(
        tr_p, y_tr, M_tr, thresholds,
        w_acc, w_disagree_acc, w_disagree_rate, w_ev
    )
    m_va = eval_bundle(
        va_p, y_va, M_va, thresholds,
        w_acc, w_disagree_acc, w_disagree_rate, w_ev
    )
    m_te = eval_bundle(
        te_p, y_te, M_te, thresholds,
        w_acc, w_disagree_acc, w_disagree_rate, w_ev
    )

    return {
        "feature_set": feature_set_name,
        "w_acc": w_acc,
        "w_disagree_acc": w_disagree_acc,
        "w_disagree_rate": w_disagree_rate,
        "w_ev": w_ev,
        "best_iter": best_iter,
        "train_score": m_tr["score"],
        "valid_score": m_va["score"],
        "test_score":  m_te["score"],
        "train_auc":   m_tr["auc"],
        "valid_auc":   m_va["auc"],
        "test_auc":    m_te["auc"],
        "train_ev":    m_tr["ev"],
        "valid_ev":    m_va["ev"],
        "test_ev":     m_te["ev"],
        "train_overall": m_tr["overall"],
        "valid_overall": m_va["overall"],
        "test_overall":  m_te["overall"],
        "train_disagree_rate": m_tr["disa_rate"],
        "valid_disagree_rate": m_va["disa_rate"],
        "test_disagree_rate":  m_te["disa_rate"],
    }

thresholds_grid = np.arange(0.4, 0.61, 0.02)

feature_records = []
for name, X_mat in feature_sets.items():
    print(f"[FEATURE] Running experiment for: {name}")
    metrics = run_feature_set_experiment(
        X_matrix=X_mat,
        feature_set_name=name,
        best_cfg=best_cfg,
        best_weights=best_weights,
        thresholds=thresholds_grid,
    )
    feature_records.append(metrics)

feature_search_df = (
    pd.DataFrame(feature_records)
      .sort_values("valid_score", ascending=False)
      .reset_index(drop=True)
)


leakage_df = feature_search_df[
    feature_search_df["feature_set"].isin(["baseline_noleak", "full_leak"])
].reset_index(drop=True)

[FEATURE] Running experiment for: baseline_noleak
[FEATURE] Running experiment for: full_leak
[FEATURE] Running experiment for: normalized
[FEATURE] Running experiment for: corr_filtered
[FEATURE] Running experiment for: pca


## **Weight search table**

In [36]:
weight_cols = [
    "w_acc", "w_disagree_acc", "w_disagree_rate", "w_ev",
]

metric_cols = [
    "train_score", "valid_score", "test_score",
    "train_ev", "valid_ev", "test_ev",
    "train_overall", "valid_overall", "test_overall",
    "valid_auc", "test_auc",
    "best_iter",
]

keep_cols = weight_cols + metric_cols

missing_cols = [c for c in keep_cols if c not in results_df.columns]
if missing_cols:
    print("[WARN] Some expected columns missing in results_df:", missing_cols)

weight_search_df = results_df[[c for c in keep_cols if c in results_df.columns]].copy()

[WARN] Some expected columns missing in results_df: ['train_ev', 'test_ev', 'train_overall', 'valid_overall', 'test_overall', 'test_auc']


## **Store models per feature set** 

In [37]:
feature_models = {}  

def train_feature_set_model(
    X_matrix: pd.DataFrame,
    best_cfg: dict,
    best_weights: dict,
    seed: int = 1337,
) -> xgb.Booster:
    """
    Train a single XGBoost model for a given feature matrix using
    the already-selected best_cfg and best_weights.
    """
    X = X_matrix.copy()
    y = y_out.copy()
    meta = meta_out.copy()

    labeled_mask = y.notna()
    X_lab = X.loc[labeled_mask].copy()
    y_lab = y.loc[labeled_mask].astype(int)
    M_lab = meta.loc[labeled_mask].copy()

    seasons = pd.to_numeric(M_lab["season"], errors="coerce").astype(int)
    train_mask = seasons.between(2000, 2018)
    valid_mask = seasons.between(2019, 2021)

    def split(X_, y_, M_, mask):
        return X_.loc[mask], y_.loc[mask], M_.loc[mask]

    X_tr, y_tr, M_tr = split(X_lab, y_lab, M_lab, train_mask)
    X_va, y_va, M_va = split(X_lab, y_lab, M_lab, valid_mask)

    X_tr, X_va = map(preprocess_for_xgb, [X_tr, X_va])

    w_acc          = best_weights["w_acc"]
    w_disagree_acc = best_weights["w_disagree_acc"]
    w_disagree_rate= best_weights["w_disagree_rate"]
    w_ev           = best_weights["w_ev"]

    w_tr = build_sample_weights(M_tr, w_acc, w_disagree_acc, w_disagree_rate, w_ev)
    w_va = build_sample_weights(M_va, w_acc, w_disagree_acc, w_disagree_rate, w_ev)

    dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=w_tr, enable_categorical=True)
    dvalid = xgb.DMatrix(X_va, label=y_va, weight=w_va, enable_categorical=True)

    base_params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        "verbosity": 0,
        "seed": seed,
    }
    params = cast_params({**base_params, **best_cfg})

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=800,
        evals=[(dtrain, "train"), (dvalid, "valid")],
        early_stopping_rounds=100,
        verbose_eval=False,
    )
    return booster

for i, (name, X_mat) in enumerate(feature_sets.items()):
    print(f"[MODELS] Training model for feature_set={name}")
    booster = train_feature_set_model(
        X_matrix=X_mat,
        best_cfg=best_cfg,
        best_weights=best_weights,
        seed=1337 + i,
    )
    feature_models[name] = booster

print("[STEP 4E] Trained models for feature sets:", list(feature_models.keys()))

[MODELS] Training model for feature_set=baseline_noleak
[MODELS] Training model for feature_set=full_leak
[MODELS] Training model for feature_set=normalized
[MODELS] Training model for feature_set=corr_filtered
[MODELS] Training model for feature_set=pca
[STEP 4E] Trained models for feature sets: ['baseline_noleak', 'full_leak', 'normalized', 'corr_filtered', 'pca']


## **Per-game tables per feature set**

In [39]:
def build_games_df_for_feature_set(
    X_matrix: pd.DataFrame,
    booster: xgb.Booster,
    feature_set_name: str,
) -> pd.DataFrame:
    X = X_matrix.copy()
    y = y_out.copy()
    meta = meta_out.copy()

    labeled_mask = y.notna()
    X_lab = X.loc[labeled_mask].copy()
    y_lab = y.loc[labeled_mask].astype(int)
    M_lab = meta.loc[labeled_mask].copy()

    X_lab_proc = preprocess_for_xgb(X_lab)
    d_all = xgb.DMatrix(X_lab_proc, label=y_lab, enable_categorical=True)
    probs_all = booster.predict(d_all)

    df = M_lab.copy()
    df["y_true"] = y_lab.values.astype(int)
    df["pred_prob_home_cover"] = probs_all

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p = pd.to_numeric(df["home_spread_vigfree"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_vigfree"], errors="coerce").values
    else:
        home_p = pd.to_numeric(df["home_spread_prob"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_prob"], errors="coerce").values

    df["market_pick"] = (home_p >= away_p).astype(int)

    default_tau = 0.50
    df["model_pick"] = (df["pred_prob_home_cover"] >= default_tau).astype(int)
    df["disagree_flag"] = (df["model_pick"] != df["market_pick"]).astype(int)

    df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
    df["feature_set"] = feature_set_name

    return df

games_df_list = []

baseline_gdf = games_df.copy()
baseline_gdf["feature_set"] = "baseline_noleak"
games_df_list.append(baseline_gdf)

for name, X_mat in feature_sets.items():
    if name == "baseline_noleak":
        continue
    print(f"[GAMES] Building games_df for feature_set={name}")
    booster = feature_models[name]
    gdf_fs = build_games_df_for_feature_set(X_mat, booster, name)
    games_df_list.append(gdf_fs)

games_df_all = pd.concat(games_df_list, ignore_index=True)

[GAMES] Building games_df for feature_set=full_leak
[GAMES] Building games_df for feature_set=normalized
[GAMES] Building games_df for feature_set=corr_filtered
[GAMES] Building games_df for feature_set=pca


## **Weekly performance feature set**

In [40]:
tau_star = 0.5 

perf_list = []

for name in feature_sets.keys():
    gdf_fs = games_df_all[games_df_all["feature_set"] == name].copy()
    perf_fs = build_perf_df(gdf_fs, threshold=tau_star)
    perf_fs["feature_set"] = name
    perf_list.append(perf_fs)

perf_df = pd.concat(perf_list, ignore_index=True)

## **Calibration per feature set**

In [41]:
calib_list = []

for name in feature_sets.keys():
    gdf_fs = games_df_all[games_df_all["feature_set"] == name].copy()
    calib_fs = build_calib_df(gdf_fs, n_bins=10)
    calib_fs["feature_set"] = name
    calib_list.append(calib_fs)

calib_df = pd.concat(calib_list, ignore_index=True)

## **Feature importances per feature set**

In [42]:
imp_rows = []

for name, X_mat in feature_sets.items():
    booster = feature_models[name]

    sample_proc = preprocess_for_xgb(X_mat.iloc[:1].copy())
    cols = list(sample_proc.columns)

    scores = booster.get_score(importance_type="gain") 

    def feature_name_from_key(k):
        try:
            idx = int(k[1:])
            return cols[idx] if idx < len(cols) else k
        except Exception:
            return k

    for k, val in scores.items():
        imp_rows.append({
            "feature_set": name,
            "feature": feature_name_from_key(k),
            "importance": float(val),
        })

feat_imp_df = pd.DataFrame(imp_rows)

# **Visualization Dashboard: Historical Performance 2022-2024**

## **Panel 1: Model Performance over Time**

In [43]:

if "perf_df" not in globals():
    raise RuntimeError("perf_df must be defined before running this cell.")

if "games_df_all" not in globals():
    raise RuntimeError("games_df_all must be defined before running this cell.")

feature_sets_available = sorted(games_df_all["feature_set"].dropna().unique().tolist())

perf_df_orig = perf_df
perf_df = perf_df_orig.copy()

perf_df["season"] = pd.to_numeric(perf_df["season"], errors="coerce").astype("Int64")
perf_df["week"]   = pd.to_numeric(perf_df["week"],   errors="coerce").astype("Int64")

perf_df = perf_df[
    (perf_df["season"] >= 2022) & (perf_df["season"] <= 2024)
].copy()

if perf_df.empty:
    print("Warning: No rows with season in [2022, 2024] in perf_df; Panel 3 will use games_df_all only.")


def get_best_tau_combo(threshold_df: pd.DataFrame):
    """
    Compute combo_score = (overall_accuracy + disagree_accuracy) / 2
    and return (best_tau, df_with_combo_score).

    If disagree_accuracy is missing, falls back to overall_accuracy only.
    """
    df = threshold_df.copy()

    if "threshold" not in df.columns or "overall_accuracy" not in df.columns:
        raise ValueError("threshold_df must contain 'threshold' and 'overall_accuracy'.")

    df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")
    df["overall_accuracy"] = pd.to_numeric(df["overall_accuracy"], errors="coerce")

    if "disagree_accuracy" in df.columns:
        df["disagree_accuracy"] = pd.to_numeric(df["disagree_accuracy"], errors="coerce")
        df["combo_score"] = (df["overall_accuracy"] + df["disagree_accuracy"]) / 2.0
    else:
        df["combo_score"] = df["overall_accuracy"]

    df = df.dropna(subset=["threshold", "combo_score"])
    if df.empty:
        raise ValueError("No valid rows in threshold_df to compute best_tau.")

    best_idx = df["combo_score"].idxmax()
    best_tau = float(df.loc[best_idx, "threshold"])
    return best_tau, df


def _get_best_tau_default(default_tau: float = 0.5) -> float:
    """Get best τ from threshold_df if available, else fallback."""
    if "threshold_df" in globals() and threshold_df is not None and not threshold_df.empty:
        try:
            best_tau, _df_tmp = get_best_tau_combo(threshold_df)
            return float(best_tau)
        except Exception:
            return float(default_tau)
    return float(default_tau)


def build_perf_season_df(tau: float | None = None) -> pd.DataFrame:
    """
    Build a season-level performance table for each feature_set from games_df_all.

    For each (feature_set, season), compute:
      - n_games
      - model_accuracy   (at threshold tau)
      - market_accuracy
      - disagree_rate
      - disagree_accuracy
      - model_cum_ev
      - market_cum_ev
    """
    if tau is None:
        tau = _get_best_tau_default(0.5)

    df = games_df_all.copy()

    df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
    df["week"]   = pd.to_numeric(df["week"],   errors="coerce").astype("Int64")
    df["y_true"] = pd.to_numeric(df.get("y_true", np.nan), errors="coerce")

    df = df[(df["season"] >= 2022) & (df["season"] <= 2024)].copy()
    df = df.dropna(subset=["season", "y_true"])

    if df.empty:
        raise RuntimeError("No labeled rows in games_df_all for seasons 2022–2024.")

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p_all = pd.to_numeric(df["home_spread_vigfree"], errors="coerce")
        away_p_all = pd.to_numeric(df["away_spread_vigfree"], errors="coerce")
    else:
        home_p_all = pd.to_numeric(df["home_spread_prob"], errors="coerce")
        away_p_all = pd.to_numeric(df["away_spread_prob"], errors="coerce")

    df["home_p_mkt"] = home_p_all
    df["away_p_mkt"] = away_p_all

    df["market_pick"] = (df["home_p_mkt"] >= df["away_p_mkt"]).astype(int)

    df["pred_prob_home_cover"] = pd.to_numeric(df["pred_prob_home_cover"], errors="coerce")
    df["model_pick_tau"] = (df["pred_prob_home_cover"] >= tau).astype(int)

    df["disagree_flag_tau"] = (df["model_pick_tau"] != df["market_pick"]).astype(int)

    def _fair_payout(p):
        if p <= 0 or p >= 1 or np.isnan(p):
            return np.nan
        return p / (1.0 - p)

    model_ev_list = []
    market_ev_list = []

    y_vals = df["y_true"].values
    model_side = df["model_pick_tau"].values
    market_side = df["market_pick"].values
    hp_vals = df["home_p_mkt"].values
    ap_vals = df["away_p_mkt"].values

    for mp, mkp, yt, hp, ap in zip(model_side, market_side, y_vals, hp_vals, ap_vals):
        if np.isnan(yt):
            model_ev_list.append(np.nan)
            market_ev_list.append(np.nan)
            continue

        if mp == 1:  
            payout_m = _fair_payout(hp)
            if np.isnan(payout_m):
                model_ev_list.append(np.nan)
            else:
                model_ev_list.append(payout_m if yt == 1 else -1.0)
        else:        
            payout_m = _fair_payout(ap)
            if np.isnan(payout_m):
                model_ev_list.append(np.nan)
            else:
                model_ev_list.append(payout_m if yt == 0 else -1.0)

        if mkp == 1:  
            payout_k = _fair_payout(hp)
            if np.isnan(payout_k):
                market_ev_list.append(np.nan)
            else:
                market_ev_list.append(payout_k if yt == 1 else -1.0)
        else:         
            payout_k = _fair_payout(ap)
            if np.isnan(payout_k):
                market_ev_list.append(np.nan)
            else:
                market_ev_list.append(payout_k if yt == 0 else -1.0)

    df["model_ev"] = model_ev_list
    df["market_ev"] = market_ev_list

    rows = []
    for (fs, season), grp in df.groupby(["feature_set", "season"]):
        grp = grp.copy()
        y = grp["y_true"].values
        mp = grp["model_pick_tau"].values
        mk = grp["market_pick"].values
        dis = grp["disagree_flag_tau"].values

        n_games = len(grp)

        model_acc = (mp == y).mean() if n_games > 0 else np.nan
        market_acc = (mk == y).mean() if n_games > 0 else np.nan
        disagree_rate = dis.mean() if n_games > 0 else np.nan

        disagree_mask = (dis == 1)
        if disagree_mask.any():
            disagree_acc = (mp[disagree_mask] == y[disagree_mask]).mean()
        else:
            disagree_acc = np.nan

        model_cum_ev = np.nansum(grp["model_ev"].values)
        market_cum_ev = np.nansum(grp["market_ev"].values)

        rows.append(
            {
                "feature_set": fs,
                "season": int(season),
                "n_games": int(n_games),
                "model_accuracy": model_acc,
                "market_accuracy": market_acc,
                "disagree_rate": disagree_rate,
                "disagree_accuracy": disagree_acc,
                "model_cum_ev": model_cum_ev,
                "market_cum_ev": market_cum_ev,
            }
        )

    perf_season_df = pd.DataFrame(rows)
    if perf_season_df.empty:
        raise RuntimeError("perf_season_df ended up empty after aggregation.")
    return perf_season_df


perf_season_df = build_perf_season_df()


def get_perf_season_slice(feature_set: str) -> pd.DataFrame:
    """
    Season-level slice for given feature_set (2022–2024), sorted by season.
    """
    df = perf_season_df.copy()
    df = df[df["feature_set"] == feature_set].copy()
    if df.empty:
        return df
    df = df.sort_values("season")
    return df


def draw_panel_3(feature_set: str):
    """
    Panel 3:
      - Season-level performance (2022–2024) for a given feature_set:
          * model_accuracy vs market_accuracy per season
          * model_cum_ev vs market_cum_ev per season
      - Season summary table with:
          * season
          * n_games
          * model_accuracy, market_accuracy
          * disagree_rate, disagree_accuracy
          * model_cum_ev, market_cum_ev
    """
    pf = get_perf_season_slice(feature_set)
    if pf.empty:
        print(f"[Panel 3] No season-level perf data for feature_set={feature_set} in 2022–2024.")
        print("perf_season_df columns:", list(perf_season_df.columns))
        return

    for c in [
        "model_accuracy",
        "market_accuracy",
        "disagree_rate",
        "disagree_accuracy",
        "model_cum_ev",
        "market_cum_ev",
    ]:
        pf[c] = pd.to_numeric(pf[c], errors="coerce")

    seasons = pf["season"].astype(int).tolist()
    idx = np.arange(len(seasons))
    width = 0.35

    fig1, ax1 = plt.subplots(figsize=(10, 4))

    ax1.bar(idx - width / 2, pf["model_accuracy"].values, width=width, label="model accuracy")
    ax1.bar(idx + width / 2, pf["market_accuracy"].values, width=width, label="market accuracy")

    ax1.set_xticks(idx)
    ax1.set_xticklabels(seasons)
    ax1.set_ylim(0.0, 1.0)
    ax1.set_ylabel("Accuracy")
    ax1.set_title(f"Model vs market accuracy by season — feature_set={feature_set}")
    ax1.grid(True, axis="y", alpha=0.3)
    ax1.legend(loc="best")

    plt.tight_layout()
    plt.show()

    fig2, ax2 = plt.subplots(figsize=(10, 4))

    ax2.bar(idx - width / 2, pf["model_cum_ev"].values, width=width, label="model cumulative EV")
    ax2.bar(idx + width / 2, pf["market_cum_ev"].values, width=width, label="market cumulative EV")

    ax2.set_xticks(idx)
    ax2.set_xticklabels(seasons)
    ax2.set_ylabel("Cumulative EV")
    ax2.set_title(f"Model vs market cumulative EV by season — feature_set={feature_set}")
    ax2.grid(True, axis="y", alpha=0.3)
    ax2.legend(loc="best")

    plt.tight_layout()
    plt.show()

    summary = pf.copy()
    summary["model_accuracy (%)"] = summary["model_accuracy"] * 100.0
    summary["market_accuracy (%)"] = summary["market_accuracy"] * 100.0
    summary["disagree_rate (%)"] = summary["disagree_rate"] * 100.0
    summary["disagree_accuracy (%)"] = summary["disagree_accuracy"] * 100.0

    cols = [
        "season",
        "feature_set",
        "n_games",
        "model_accuracy (%)",
        "market_accuracy (%)",
        "disagree_rate (%)",
        "disagree_accuracy (%)",
        "model_cum_ev",
        "market_cum_ev",
    ]

    print("[Panel 3] Season-level performance summary (2022–2024):")
    display(summary[cols])


feature_set_widget_p3 = widgets.Dropdown(
    options=feature_sets_available,
    value="baseline_noleak" if "baseline_noleak" in feature_sets_available else feature_sets_available[0],
    description="Feature set (P3):",
)

controls_p3 = widgets.HBox([feature_set_widget_p3])


def update_dashboard_panel3(feature_set):
    clear_output(wait=True)
    display(controls_p3)

    print("\n=== Panel 3 — Season performance (2022–2024) ===")
    draw_panel_3(feature_set)


out_panel3 = widgets.interactive_output(
    update_dashboard_panel3,
    {"feature_set": feature_set_widget_p3},
)

display(out_panel3)


Output()

## **Panel 2: Model Performance Week to Week + Threshold Overview** 

In [44]:
if "games_df_all" not in globals():
    raise RuntimeError("games_df_all must be defined before running this cell.")

games_df_all_orig = games_df_all

games_df_all = games_df_all_orig.copy()
games_df_all["season"] = pd.to_numeric(games_df_all["season"], errors="coerce").astype("Int64")
games_df_all["week"]   = pd.to_numeric(games_df_all["week"],   errors="coerce").astype("Int64")

games_df_all = games_df_all[
    (games_df_all["season"] >= 2022) & (games_df_all["season"] <= 2024)
].copy()

if games_df_all.empty:
    raise RuntimeError("No rows with season in [2022, 2024] in games_df_all.")

seasons_available = sorted(games_df_all["season"].dropna().unique().tolist())
seasons_available = [int(s) for s in seasons_available]

weeks_by_season = {
    s: sorted(
        games_df_all.loc[games_df_all["season"] == s, "week"]
        .dropna()
        .astype(int)
        .unique()
        .tolist()
    )
    for s in seasons_available
}

feature_sets_available = sorted(games_df_all["feature_set"].dropna().unique().tolist())

default_season = 2022 if 2022 in seasons_available else seasons_available[0]

season_widget = widgets.Dropdown(
    options=seasons_available,
    value=default_season,
    description="Season:",
)


def _initial_week_for_season(s):
    wks = weeks_by_season.get(s, [])
    return wks[0] if wks else None

week_widget = widgets.Dropdown(
    options=weeks_by_season[season_widget.value],
    value=_initial_week_for_season(season_widget.value),
    description="Week:",
)

def _on_season_change(change):
    if change["name"] != "value":
        return
    new_season = change["new"]
    valid_weeks = weeks_by_season.get(new_season, [])
    if not valid_weeks:
        week_widget.options = []
        week_widget.value = None
        return
    week_widget.options = valid_weeks
    week_widget.value = valid_weeks[-1]

season_widget.observe(_on_season_change, names="value")

threshold_widget = widgets.FloatSlider(
    value=0.50,
    min=0.01,
    max=0.99,
    step=0.01,
    description="Threshold:",
    readout_format=".2f",
)

feature_set_widget = widgets.Dropdown(
    options=feature_sets_available,
    value="baseline_noleak" if "baseline_noleak" in feature_sets_available else feature_sets_available[0],
    description="Feature set:",
)

objective_view_widget = widgets.Dropdown(
    options=["Weights", "Threshold"],
    value="Weights",
    description="Obj view:",
)

controls = widgets.HBox([
    season_widget,
    week_widget,
    threshold_widget,
    feature_set_widget,
    objective_view_widget, 
])

def get_games_slice(season: int, week: int, feature_set: str) -> pd.DataFrame:
    """
    Filter games_df_all by season, week, feature_set.
    (All are already restricted to 2022–2024 upstream.)
    """
    df = games_df_all
    mask = (
        (df["season"] == season) &
        (df["week"] == week) &
        (df["feature_set"] == feature_set)
    )
    return df.loc[mask].copy()

def compute_per_game_ev(df: pd.DataFrame, threshold: float) -> pd.DataFrame:
    """
    For a per-game slice (one season/week/feature_set) and a threshold, recompute:
      - model_pick_tau  (direction: home vs away)
      - disagree_flag_tau
      - is_bet_tau      (always 1; threshold never removes bets)
      - ev_trade_tau    (EV, given that we always bet one side)
    using vig-free probs if available, else raw spread probs.
    """
    df = df.copy()

    p_model = pd.to_numeric(df["pred_prob_home_cover"], errors="coerce").values
    df["model_pick_tau"] = (p_model >= threshold).astype(int)

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p = pd.to_numeric(df["home_spread_vigfree"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_vigfree"], errors="coerce").values
    else:
        home_p = pd.to_numeric(df["home_spread_prob"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_prob"], errors="coerce").values

    df["market_pick"] = (home_p >= away_p).astype(int)

    df["disagree_flag_tau"] = (df["model_pick_tau"].values != df["market_pick"].values).astype(int)

    df["is_bet_tau"] = 1

    if "y_true" not in df.columns:
        df["ev_trade_tau"] = np.nan
        return df

    y_true = pd.to_numeric(df["y_true"], errors="coerce").values
    ev_list = []

    for yp, yt, hp, ap in zip(
        df["model_pick_tau"].values,
        y_true,
        home_p,
        away_p,
    ):
        if np.isnan(yt):
            ev_list.append(np.nan)
            continue

        if yp == 1:  # bet home
            payout = fair_payout(hp)
            if np.isnan(payout):
                ev_list.append(np.nan)
            else:
                ev_list.append(payout if yt == 1 else -1.0)
        else:        # bet away
            payout = fair_payout(ap)
            if np.isnan(payout):
                ev_list.append(np.nan)
            else:
                ev_list.append(payout if yt == 0 else -1.0)

    df["ev_trade_tau"] = ev_list
    return df

def _draw_threshold_sweep(current_tau=None):
    """
    Line chart: x = threshold, y = combo_score and disagree_rate.
    Shows:
        • BEST τ (vertical dashed black)
        • CURRENT τ from slider (vertical dashed orange)
    """
    if "threshold_df" not in globals() or threshold_df is None or threshold_df.empty:
        print("[Panel 1] threshold_df not available; skipping threshold sweep chart.")
        return

    try:
        best_tau, df = get_best_tau_combo(threshold_df)
    except ValueError as e:
        print(f"[Panel 1] {e}")
        display(threshold_df.head())
        return

    if current_tau is None:
        try:
            current_tau = float(threshold_widget.value)
        except Exception:
            current_tau = None

    df = df.copy()
    df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")
    if "combo_score" in df.columns:
        df["combo_score"] = pd.to_numeric(df["combo_score"], errors="coerce")
    if "disagree_rate" in df.columns:
        df["disagree_rate"] = pd.to_numeric(df["disagree_rate"], errors="coerce")

    df = df.dropna(subset=["threshold"])
    df = df.sort_values("threshold")

    fig, ax = plt.subplots(figsize=(10, 4))

    ax.plot(df["threshold"], df["combo_score"], marker="o", label="combo_score")

    if "disagree_rate" in df.columns:
        ax.plot(
            df["threshold"], df["disagree_rate"],
            marker="x", linestyle="--", label="disagree_rate"
        )

    ax.axvline(best_tau, color="black", linestyle="--", linewidth=1.5,
               label=f"BEST τ = {best_tau:.3f}")

    if current_tau is not None:
        ax.axvline(current_tau, color="orange", linestyle="--", linewidth=1.5,
                   label=f"CURRENT τ = {current_tau:.3f}")

    ax.set_xlabel("Threshold τ")
    ax.set_ylabel("Metric value")
    ax.set_title("Model behaviour vs threshold")
    ax.set_ylim(0.0, 1.0)
    ax.grid(True, alpha=0.3)
    ax.legend(loc="best")

    plt.tight_layout()
    plt.show()

    print(f"[Panel 1] BEST τ = {best_tau:.3f}")
    if current_tau is not None:
        print(f"[Panel 1] CURRENT τ = {current_tau:.3f}")


def draw_panel_1(season: int, week: int, threshold: float, feature_set: str):
    g = get_games_slice(season, week, feature_set)
    if g.empty:
        print(f"[Panel 1] No games found for season={season}, week={week}, feature_set={feature_set}.")
        return

    g = compute_per_game_ev(g, threshold)

    home = g["home_team"].astype(str).fillna("")
    away = g["away_team"].astype(str).fillna("")
    g["matchup"] = away + " @ " + home

    g["model_prob_home_covers"] = pd.to_numeric(g["pred_prob_home_cover"], errors="coerce")
    if "home_spread_vigfree" in g.columns:
        g["market_prob_home_covers"] = pd.to_numeric(g["home_spread_vigfree"], errors="coerce")
    elif "home_spread_prob" in g.columns:
        g["market_prob_home_covers"] = pd.to_numeric(g["home_spread_prob"], errors="coerce")
    else:
        g["market_prob_home_covers"] = np.nan

    model_side_bin = g["model_pick_tau"].astype(float).values
    market_side_bin = g["market_pick"].astype(float).values

    home_arr = home.values
    away_arr = away.values

    g["model_pick"] = np.where(model_side_bin == 1, home_arr, away_arr)
    g["market_pick"] = np.where(market_side_bin == 1, home_arr, away_arr)

    if "y_true" in g.columns:
        y_true_num = pd.to_numeric(g["y_true"], errors="coerce").values
        team_that_covered = np.where(
            np.isnan(y_true_num),
            np.nan,
            np.where(y_true_num == 1, home_arr, away_arr),
        )
        g["team_that_covered"] = team_that_covered
    else:
        g["team_that_covered"] = np.nan

    ev_vals = g["ev_trade_tau"].astype(float).values
    n_games = len(g)

    x = np.arange(n_games)
    fig, ax = plt.subplots(figsize=(12, 4))

    colors = np.where(ev_vals >= 0, "C2", "C3")  
    ax.bar(x, ev_vals, color=colors)
    ax.axhline(0.0, linestyle="--", color="grey", alpha=0.6)

    ax.set_ylabel("EV per game (τ-bets)")
    ax.set_xlabel("Game")
    ax.set_title(f"Per-game EV — Season {season}, Week {week}, feature_set={feature_set}, τ={threshold:.2f}")
    ax.grid(True, axis="y", alpha=0.3)

    ax.set_xticks(x)
    ax.set_xticklabels(g["matchup"].tolist(), rotation=45, ha="right")

    plt.tight_layout()
    plt.show()

    _draw_threshold_sweep()

    if "y_true" in g.columns:
        y_true = pd.to_numeric(g["y_true"], errors="coerce")
        valid_mask = y_true.notna()

        if valid_mask.any():
            model_correct = (
                g.loc[valid_mask, "model_pick_tau"].astype(int).values ==
                y_true.loc[valid_mask].astype(int).values
            )
            accuracy = model_correct.mean()
        else:
            accuracy = np.nan

        disagree_rate = g["disagree_flag_tau"].mean()

        disagree_mask = (g["disagree_flag_tau"] == 1) & valid_mask
        if disagree_mask.any():
            disagree_correct = (
                g.loc[disagree_mask, "model_pick_tau"].astype(int).values ==
                y_true.loc[disagree_mask].astype(int).values
            )
            disagree_acc = disagree_correct.mean()
        else:
            disagree_acc = np.nan
    else:
        accuracy = np.nan
        disagree_rate = np.nan
        disagree_acc = np.nan

    total_ev = np.nansum(ev_vals) if np.isfinite(ev_vals).any() else np.nan

    summary_df = pd.DataFrame(
        {
            "games": [n_games],
            "accuracy (%)": [accuracy * 100 if pd.notna(accuracy) else np.nan],
            "total_ev": [total_ev],
            "disagree_rate (%)": [disagree_rate * 100 if pd.notna(disagree_rate) else np.nan],
            "disagree_accuracy (%)": [disagree_acc * 100 if pd.notna(disagree_acc) else np.nan],
        }
    )

    print("[Panel 1] Weekly summary (current τ):")
    display(summary_df)

    cols_table = [
        "home_team",
        "away_team",
        "matchup",
        "model_prob_home_covers",
        "market_prob_home_covers",
        "model_pick",
        "market_pick",
        "team_that_covered",
    ]

    display_df = g[cols_table].rename(columns={
        "model_prob_home_covers": "model probability home team covers",
        "market_prob_home_covers": "market probability home team covers",
        "model_pick": "model pick",
        "market_pick": "market pick",
        "team_that_covered": "team that covered",
    }).reset_index(drop=True)

    print("\n[Panel 1] Per-game decisions (model vs market vs actual):")
    display(display_df)


def update_dashboard(season, week, threshold, feature_set, objective_view):
    clear_output(wait=True)
    display(controls)

    print("\n=== Panel 1 — Per-game predictions, disagreements, EV ===")
    draw_panel_1(season, week, threshold, feature_set)
    
out = widgets.interactive_output(
    update_dashboard,
    {
        "season": season_widget,
        "week": week_widget,
        "threshold": threshold_widget,
        "feature_set": feature_set_widget,
        "objective_view": objective_view_widget,
    },
)

display(out)

Output()

## **Panel 3: Calibration and Feature Importance**

In [45]:
def get_calib_slice(feature_set: str) -> pd.DataFrame:
    """
    Filter calib_df by feature_set.
    Uses pred_mean on the x-axis (your choice B).
    """
    if "calib_df" not in globals() or calib_df is None or calib_df.empty:
        print("[Panel 2] calib_df not available.")
        return pd.DataFrame()
    df = calib_df.copy()
    return df[df["feature_set"] == feature_set].copy()


def get_feat_imp_slice(feature_set: str, top_n: int = 20) -> pd.DataFrame:
    """
    Filter feat_imp_df by feature_set and return top-N by importance/gain.
    """
    if "feat_imp_df" not in globals() or feat_imp_df is None or feat_imp_df.empty:
        print("[Panel 2] feat_imp_df not available.")
        return pd.DataFrame()
    df = feat_imp_df.copy()
    df = df[df["feature_set"] == feature_set].copy()
    if df.empty:
        return df

    importance_col = None
    if "importance" in df.columns:
        importance_col = "importance"
    elif "gain" in df.columns:
        importance_col = "gain"
    else:
        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        importance_col = num_cols[0] if num_cols else None

    if importance_col is None:
        print("[Panel 2] No numeric importance column found in feat_imp_df.")
        return pd.DataFrame()

    df[importance_col] = pd.to_numeric(df[importance_col], errors="coerce")
    df = df.dropna(subset=[importance_col])
    df = df.sort_values(importance_col, ascending=False).head(top_n)
    df = df.rename(columns={importance_col: "importance_val"})
    return df


def draw_panel_2(feature_set: str):
    """
    Panel 2:
      - Left: calibration curve (pred_mean vs empirical_rate)
      - Right: top-N feature importances (horizontal bar chart)
    """
    c = get_calib_slice(feature_set)
    fi = get_feat_imp_slice(feature_set, top_n=20)

    if c.empty:
        print(f"[Panel 2] No calibration data for feature_set={feature_set}.")
        return

    if fi.empty:
        print(f"[Panel 2] No feature-importance data for feature_set={feature_set}.")
        return

    if "pred_mean" in c.columns:
        x_col = "pred_mean"
    elif "bin_center" in c.columns:
        x_col = "bin_center"
        print("[Panel 2] pred_mean not found; using bin_center instead.")
    else:
        print("[Panel 2] Neither pred_mean nor bin_center in calib_df slice.")
        display(c.head())
        return

    c[x_col] = pd.to_numeric(c[x_col], errors="coerce")
    c["empirical_rate"] = pd.to_numeric(c["empirical_rate"], errors="coerce")
    c = c.dropna(subset=[x_col, "empirical_rate"])

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    ax1 = axes[0]
    x = c[x_col].values
    y = c["empirical_rate"].values

    ax1.plot(x, y, marker="o", linestyle="-", label="Empirical")

    min_xy = 0.0
    max_xy = 1.0
    ax1.plot([min_xy, max_xy], [min_xy, max_xy], linestyle="--", color="grey", alpha=0.7,
             label="Perfect calibration")

    ax1.set_xlabel("predicted mean P(home covers)")
    ax1.set_ylabel("empirical cover rate")
    ax1.set_title(f"Calibration curve — {feature_set}")
    ax1.set_xlim(0.0, 1.0)
    ax1.set_ylim(0.0, 1.0)
    ax1.grid(True, alpha=0.3)
    ax1.legend(loc="best")

    ax2 = axes[1]

    fi_plot = fi.sort_values("importance_val", ascending=True)
    ax2.barh(fi_plot["feature"], fi_plot["importance_val"])
    ax2.set_xlabel("importance")
    ax2.set_ylabel("feature")
    ax2.set_title(f"Top features — {feature_set}")
    ax2.grid(True, axis="x", alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("[Panel 2] Calibration bins (head):")
    display(c[[x_col, "empirical_rate", "count"]].head())

    print("\n[Panel 2] Top feature importances:")
    display(fi[["feature", "importance_val"]])

feature_set_widget_p2 = widgets.Dropdown(
    options=feature_sets_available,
    value="baseline_noleak" if "baseline_noleak" in feature_sets_available else feature_sets_available[0],
    description="Feature set (P2):",
)

controls_p2 = widgets.HBox([feature_set_widget_p2])

def update_dashboard_panel2(feature_set):
    clear_output(wait=True)
    display(controls_p2)

    print("\n=== Panel 2 — Calibration + Feature importance ===")
    draw_panel_2(feature_set)

out_panel2 = widgets.interactive_output(
    update_dashboard_panel2,
    {"feature_set": feature_set_widget_p2},
)

display(out_panel2)


Output()

## **Panel 4: Objective Tuning(weights vs threshold)**

In [46]:
def _ensure_numeric(df: pd.DataFrame, cols) -> pd.DataFrame:
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def draw_panel_4_weights(top_k: int = 5):
    """
    Panel 4 / Weights:
      - Take top_k weight combos by valid_score.
      - Figure 1: bar chart of train/valid/test scores + accuracies per combo.
      - Figure 2: bar chart of EV + AUC per combo.
      - Print top_k table.
    """
    if "weight_search_df" not in globals() or weight_search_df is None or weight_search_df.empty:
        print("[Panel 4 / Weights] weight_search_df is empty or not defined.")
        return

    df = weight_search_df.copy()

    df = _ensure_numeric(
        df,
        [
            "w_acc",
            "w_disagree_acc",
            "w_disagree_rate",
            "w_ev",
            "train_score", "valid_score", "test_score",
            "train_overall", "valid_overall", "test_overall",
            "train_ev", "valid_ev", "test_ev",
            "train_auc", "valid_auc", "test_auc",
        ],
    )

    if "valid_score" not in df.columns:
        print("[Panel 4 / Weights] weight_search_df must contain 'valid_score'.")
        display(df.head())
        return

    df = df.dropna(subset=["valid_score"]).copy()
    if df.empty:
        print("[Panel 4 / Weights] No valid rows after dropping NaNs in valid_score.")
        return

    df_top = df.sort_values("valid_score", ascending=False).head(top_k).copy()

    def make_label(row):
        parts = []
        if "w_ev" in row:
            parts.append(f"w_ev={row['w_ev']:.2f}")
        if "w_disagree_rate" in row:
            parts.append(f"w_dis_rate={row['w_disagree_rate']:.2f}")
        if "w_acc" in row:
            parts.append(f"w_acc={row['w_acc']:.2f}")
        return ", ".join(parts) if parts else f"idx={row.name}"

    df_top["label"] = df_top.apply(make_label, axis=1)

    labels = df_top["label"].tolist()
    n_combo = len(labels)
    x_idx = np.arange(n_combo)
    width = 0.12  

    score_cols = [c for c in ["train_score", "valid_score", "test_score"] if c in df_top.columns]
    acc_cols = [c for c in ["train_overall", "valid_overall", "test_overall"] if c in df_top.columns]

    fig1, ax1 = plt.subplots(figsize=(max(10, 2 * n_combo), 5))

    offset = - (len(score_cols) + len(acc_cols) - 1) * width / 2

    for col in score_cols:
        ax1.bar(x_idx + offset, df_top[col].values, width=width, label=col)
        offset += width

    for col in acc_cols:
        ax1.bar(x_idx + offset, df_top[col].values, width=width, label=col)
        offset += width

    ax1.set_xticks(x_idx)
    ax1.set_xticklabels(labels, rotation=45, ha="right")
    ax1.set_ylim(0.0, 1.0)
    ax1.set_ylabel("Score / Accuracy")
    ax1.set_title("Train / Valid / Test scores and accuracies by weight combo")
    ax1.grid(True, axis="y", alpha=0.3)
    ax1.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.show()

    ev_cols = [c for c in ["train_ev", "valid_ev", "test_ev"] if c in df_top.columns]
    auc_cols = [c for c in ["train_auc", "valid_auc", "test_auc"] if c in df_top.columns]

    if ev_cols or auc_cols:
        fig2, ax2 = plt.subplots(figsize=(max(10, 2 * n_combo), 5))

        offset = - (len(ev_cols) + len(auc_cols) - 1) * width / 2

        for col in ev_cols:
            ax2.bar(x_idx + offset, df_top[col].values, width=width, label=col)
            offset += width

        for col in auc_cols:
            ax2.bar(x_idx + offset, df_top[col].values, width=width, label=col)
            offset += width

        ax2.set_xticks(x_idx)
        ax2.set_xticklabels(labels, rotation=45, ha="right")
        ax2.set_ylabel("EV / AUC")
        ax2.set_title("EV and AUC by weight combo")
        ax2.grid(True, axis="y", alpha=0.3)
        ax2.legend(loc="best", ncol=2)

        plt.tight_layout()
        plt.show()
    else:
        print("[Panel 4 / Weights] No EV/AUC columns to plot.")

    cols_show = [c for c in [
        "w_acc", "w_disagree_acc", "w_disagree_rate", "w_ev",
        "train_score", "valid_score", "test_score",
        "train_overall", "valid_overall", "test_overall",
        "train_ev", "valid_ev", "test_ev",
        "train_auc", "valid_auc", "test_auc",
        "best_iter",
    ] if c in df_top.columns]

    print(f"[Panel 4 / Weights] Top {len(df_top)} weight combos by valid_score:")
    display(df_top[cols_show].reset_index(drop=True))


def draw_panel_4_threshold():
    if "threshold_df" not in globals() or threshold_df is None or threshold_df.empty:
        print("[Panel 4 / Threshold] threshold_df is empty or not defined.")
        return

    df = threshold_df.copy()

    if "threshold" not in df.columns or "overall_accuracy" not in df.columns:
        print("[Panel 4 / Threshold] threshold_df must contain 'threshold' and 'overall_accuracy'.")
        display(df.head())
        return

    df = _ensure_numeric(
        df,
        ["threshold", "overall_accuracy", "disagree_accuracy", "disagree_rate", "ev"],
    )

    if "combo_score" not in df.columns:
        if "disagree_accuracy" in df.columns:
            df["combo_score"] = (df["overall_accuracy"] + df["disagree_accuracy"]) / 2.0
        else:
            df["combo_score"] = df["overall_accuracy"]

    df = df.dropna(subset=["threshold", "combo_score"])
    if df.empty:
        print("[Panel 4 / Threshold] No valid rows after cleaning threshold_df.")
        return

    df = df.sort_values("threshold")

    best_idx = df["combo_score"].idxmax()
    best_tau = float(df.loc[best_idx, "threshold"])
    best_combo = float(df.loc[best_idx, "combo_score"])

    fig, ax = plt.subplots(figsize=(10, 4))

    ax.plot(
        df["threshold"],
        df["combo_score"],
        marker="o",
        label="combo_score",
    )

    ax.plot(
        df["threshold"],
        df["overall_accuracy"],
        marker="s",
        linestyle="--",
        label="overall_accuracy",
    )

    if "disagree_rate" in df.columns:
        ax.plot(
            df["threshold"],
            df["disagree_rate"],
            marker="x",
            linestyle=":",
            label="disagree_rate",
        )

    ax.axvline(
        best_tau,
        linestyle="--",
        color="black",
        alpha=0.9,
        label=f"BEST τ = {best_tau:.3f}",
    )

    ax.set_xlabel("Threshold τ")
    ax.set_ylabel("Metric value")
    ax.set_title("Threshold sweep — combo_score, accuracy, disagreement")
    ax.set_ylim(0.0, 1.0)
    ax.grid(True, alpha=0.3)
    ax.legend(loc="best")

    plt.tight_layout()
    plt.show()

    print(f"[Panel 4 / Threshold] BEST τ (by combo_score) = {best_tau:.3f}, combo_score = {best_combo:.3f}")

    cols_show = [
    c for c in
    ["threshold", "overall_accuracy", "disagree_accuracy",
     "disagree_rate", "combo_score", "ev"]
    if c in df.columns
    ]

    print("\n[Panel 4 / Threshold] threshold_df:")

    display(
        df[cols_show]
        .sort_values("overall_accuracy", ascending=False)
        .head(10)
    )



def draw_panel_4(objective_view: str):
    if objective_view == "Weights":
        draw_panel_4_weights()
    elif objective_view == "Threshold":
        draw_panel_4_threshold()
    else:
        print(f"[Panel 4] Unknown objective_view={objective_view!r}")


objective_view_widget_p4 = widgets.Dropdown(
    options=["Weights", "Threshold"],
    value="Weights",
    description="View (P4):",
)

controls_p4 = widgets.HBox([objective_view_widget_p4])


def update_dashboard_panel4(view):
    clear_output(wait=True)
    display(controls_p4)

    print("\n=== Panel 4 — Objective tuning (Weights / Threshold) ===")
    draw_panel_4(view)


out_panel4 = widgets.interactive_output(
    update_dashboard_panel4,
    {"view": objective_view_widget_p4},
)

display(out_panel4)

Output()

## **Panel 5:Cross feature set comparison, leakage check, feature set performance comparison**

In [47]:
if "feature_search_df" not in globals():
    raise RuntimeError("feature_search_df must be defined before Panel 5.")

def _ensure_numeric(df: pd.DataFrame, cols) -> pd.DataFrame:
    df = df.copy()
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def draw_panel_5(feature_set: str):
    """
    Panel 5:
      - Feature-set comparison (all games 2022–2024):
          * Bar chart of EV by feature_set (test_ev/valid_ev).
          * Line for all-games accuracy.
          * Table with n_games_all, accuracy_all, disagree_rate_all, EV.
      - Leakage comparison:
          * Bar chart of EV + accuracy (if available) from leakage_df.
    """

    tau_global = 0.50
    if "threshold_df" in globals() and threshold_df is not None and not threshold_df.empty:
        try:
            tau_global, _ = get_best_tau_combo(threshold_df) 
        except Exception:
            tau_global = 0.50

    agg_games = None
    if "games_df_all" in globals() and games_df_all is not None and not games_df_all.empty:
        g_all = games_df_all.copy()

        if "feature_set" not in g_all.columns:
            raise RuntimeError("games_df_all must contain 'feature_set' for Panel 5.")

        g_all["pred_prob_home_cover"] = pd.to_numeric(
            g_all["pred_prob_home_cover"], errors="coerce"
        )
        g_all["model_pick_all"] = (g_all["pred_prob_home_cover"] >= tau_global).astype(int)

        if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(g_all.columns):
            home_p = pd.to_numeric(g_all["home_spread_vigfree"], errors="coerce")
            away_p = pd.to_numeric(g_all["away_spread_vigfree"], errors="coerce")
        else:
            home_p = pd.to_numeric(g_all.get("home_spread_prob"), errors="coerce")
            away_p = pd.to_numeric(g_all.get("away_spread_prob"), errors="coerce")

        g_all["market_pick_all"] = (home_p >= away_p).astype(int)

        g_all["disagree_flag_all"] = (
            g_all["model_pick_all"].values != g_all["market_pick_all"].values
        ).astype(int)

        if "y_true" in g_all.columns:
            y = pd.to_numeric(g_all["y_true"], errors="coerce")
            mask = y.notna()

            g_sub = g_all.loc[mask].copy()
            if not g_sub.empty:
                correct = (g_sub["model_pick_all"].astype(int).values == y.loc[mask].astype(int).values)
                g_sub["correct_all"] = correct.astype(float)

                agg_games = (
                    g_sub.groupby("feature_set")
                    .agg(
                        n_games_all=("y_true", "size"),
                        accuracy_all=("correct_all", "mean"),
                        disagree_rate_all=("disagree_flag_all", "mean"),
                    )
                    .reset_index()
                )

    if "feature_search_df" not in globals() or feature_search_df is None or feature_search_df.empty:
        print("[Panel 5] feature_search_df is empty or not defined.")
        return

    fs = feature_search_df.copy()

    ev_col = None
    for cand in ["test_ev", "valid_ev"]:
        if cand in fs.columns:
            ev_col = cand
            break

    acc_cv_col = None
    for cand in ["test_overall", "valid_overall"]:
        if cand in fs.columns:
            acc_cv_col = cand
            break

    if ev_col is None:
        print("[Panel 5] No EV column (test_ev/valid_ev) in feature_search_df.")
        display(fs.head())
        return

    if "feature_set" not in fs.columns:
        print("[Panel 5] feature_search_df must contain 'feature_set'.")
        display(fs.head())
        return

    if agg_games is not None:
        fs = fs.merge(agg_games, on="feature_set", how="left")

    num_cols = [ev_col, acc_cv_col, "accuracy_all", "disagree_rate_all"]
    fs = _ensure_numeric(fs, [c for c in num_cols if c is not None])

    fs_sorted = fs.dropna(subset=[ev_col]).copy()
    fs_sorted = fs_sorted.sort_values(ev_col, ascending=False)

    feat_names = fs_sorted["feature_set"].astype(str).tolist()
    x = np.arange(len(feat_names))
    ev_vals = fs_sorted[ev_col].values

    colors = ["C1" if name == feature_set else "C0" for name in feat_names]

    fig, axes = plt.subplots(1, 2, figsize=(16, 4))

    ax1 = axes[0]
    ax1.bar(x, ev_vals, color=colors, label=ev_col)
    ax1.set_xticks(x)
    ax1.set_xticklabels(feat_names, rotation=45, ha="right")
    ax1.set_ylabel(ev_col)
    ax1.set_title("EV by feature_set (bars) + all-games accuracy (line)")
    ax1.grid(True, axis="y", alpha=0.3)

    if "accuracy_all" in fs_sorted.columns and fs_sorted["accuracy_all"].notna().any():
        acc_all_vals = fs_sorted["accuracy_all"].values
        ax1b = ax1.twinx()
        ax1b.plot(x, acc_all_vals, marker="o", linestyle="-", color="C2", label="accuracy_all")
        ax1b.set_ylabel("accuracy_all")

        if acc_cv_col is not None and acc_cv_col in fs_sorted.columns:
            acc_cv_vals = fs_sorted[acc_cv_col].values
            ax1b.plot(
                x,
                acc_cv_vals,
                marker="x",
                linestyle="--",
                color="C3",
                label=acc_cv_col,
            )

        lines1, labels1 = ax1.get_legend_handles_labels()
        lines2, labels2 = ax1b.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc="best")
    elif acc_cv_col is not None and acc_cv_col in fs_sorted.columns:

        acc_cv_vals = fs_sorted[acc_cv_col].values
        ax1b = ax1.twinx()
        ax1b.plot(x, acc_cv_vals, marker="o", linestyle="-", color="C2", label=acc_cv_col)
        ax1b.set_ylabel(acc_cv_col)
        lines1, labels1 = ax1.get_legend_handles_labels()
        lines2, labels2 = ax1b.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc="best")
    else:
        ax1.legend(loc="best")

    ax2 = axes[1]

    if "leakage_df" in globals() and leakage_df is not None and not leakage_df.empty:
        leak = leakage_df.copy()
        leak = _ensure_numeric(leak, [ev_col, acc_cv_col] if acc_cv_col is not None else [ev_col])

        if "feature_set" not in leak.columns:
            print("[Panel 5] leakage_df has no 'feature_set' column; cannot plot leakage comparison.")
            ax2.text(0.5, 0.5, "leakage_df missing feature_set", ha="center", va="center")
            ax2.axis("off")
        else:
            names = leak["feature_set"].astype(str).tolist()
            x2 = np.arange(len(names))
            width = 0.35

            ax2.bar(x2 - width / 2, leak[ev_col].values, width=width, label=ev_col)

            if acc_cv_col is not None and acc_cv_col in leak.columns:
                ax2.bar(x2 + width / 2, leak[acc_cv_col].values, width=width, label=acc_cv_col)

            ax2.set_xticks(x2)
            ax2.set_xticklabels(names, rotation=0)
            ax2.set_title("Leakage comparison (feature_set)")
            ax2.grid(True, axis="y", alpha=0.3)
            ax2.legend(loc="best")
    else:
        ax2.text(0.5, 0.5, "leakage_df not available", ha="center", va="center")
        ax2.axis("off")

    plt.tight_layout()
    plt.show()

    print("[Panel 5] feature_search_df + all-games aggregates (sorted by EV):")
    cols_fs = ["feature_set"]
    for c in [ev_col, "n_games_all", "accuracy_all", "disagree_rate_all", acc_cv_col]:
        if c is not None and c in fs_sorted.columns and c not in cols_fs:
            cols_fs.append(c)

    display(fs_sorted[cols_fs])

    if "leakage_df" in globals() and leakage_df is not None and not leakage_df.empty:
        print("\n[Panel 5] leakage_df:")
        cols_leak = [c for c in ["feature_set", ev_col, acc_cv_col] if c is not None and c in leakage_df.columns]
        display(leakage_df[cols_leak])

        
feature_sets_available_p5 = sorted(feature_search_df["feature_set"].dropna().unique().tolist())

feature_set_widget_p5 = widgets.Dropdown(
    options=feature_sets_available_p5,
    value="baseline_noleak" if "baseline_noleak" in feature_sets_available_p5 else feature_sets_available_p5[0],
    description="Feature set (P5):",
)

controls_p5 = widgets.HBox([feature_set_widget_p5])


def update_dashboard_panel5(feature_set):
    clear_output(wait=True)
    display(controls_p5)

    print("\n=== Panel 5 — Cross feature set comparison & leakage ===")
    draw_panel_5(feature_set)


out_panel5 = widgets.interactive_output(
    update_dashboard_panel5,
    {"feature_set": feature_set_widget_p5},
)

display(out_panel5)


Output()

# **Visualization Dashboard: 2025 Performance and Model Predictions**

In [None]:

def build_games_df_for_feature_set(
    X_matrix: pd.DataFrame,
    booster: xgb.Booster,
    feature_set_name: str,
) -> pd.DataFrame:
    """
    Build a games_df-style table for a given feature set:
      - predictions for ALL rows in X_matrix (including 2025)
      - y_true wherever y_out has a realized label (0/1), NaN otherwise
      - market probs, model/market picks, disagree_flag (baseline at τ = 0.50)
    """
    X = X_matrix.copy()
    y = y_out.copy()
    meta = meta_out.copy()

    common_index = X.index.intersection(y.index).intersection(meta.index)
    X = X.loc[common_index].copy()
    y = y.loc[common_index].copy()
    meta = meta.loc[common_index].copy()

    X_proc = preprocess_for_xgb(X)
    d_all = xgb.DMatrix(X_proc, enable_categorical=True)
    probs_all = booster.predict(d_all)

    df = meta.copy()
    df["season"] = pd.to_numeric(df["season"], errors="coerce").astype("Int64")
    if "week" in df.columns:
        df["week"] = pd.to_numeric(df["week"], errors="coerce").astype("Int64")

    y_raw = pd.to_numeric(y, errors="coerce")
    y_raw = y_raw.reindex(df.index)
    df["y_true"] = y_raw.astype("float64")

    df["pred_prob_home_cover"] = probs_all

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p = pd.to_numeric(df["home_spread_vigfree"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_vigfree"], errors="coerce").values
    else:
        home_p = pd.to_numeric(df["home_spread_prob"], errors="coerce").values
        away_p = pd.to_numeric(df["away_spread_prob"], errors="coerce").values

    df["market_pick"] = (home_p >= away_p).astype(int)

    default_tau = 0.50
    df["model_pick"] = (df["pred_prob_home_cover"] >= default_tau).astype(int)
    df["disagree_flag"] = (df["model_pick"] != df["market_pick"]).astype(int)

    df["feature_set"] = feature_set_name

    return df


games_df_list = []

if "games_df" not in globals():
    raise RuntimeError("games_df (baseline) must exist before building games_df_all.")
baseline_gdf = games_df.copy()
baseline_gdf = baseline_gdf.copy()
baseline_gdf["season"] = pd.to_numeric(baseline_gdf["season"], errors="coerce").astype("Int64")
baseline_gdf["week"]   = pd.to_numeric(baseline_gdf["week"],   errors="coerce").astype("Int64")
baseline_gdf["y_true"] = pd.to_numeric(baseline_gdf["y_true"], errors="coerce")
baseline_gdf["feature_set"] = "baseline_noleak"
games_df_list.append(baseline_gdf)

for name, X_mat in feature_sets.items():
    if name == "baseline_noleak":
        continue
    print(f"[GAMES] Building games_df for feature_set={name}")
    booster = feature_models[name]
    gdf_fs = build_games_df_for_feature_set(X_mat, booster, name)
    games_df_list.append(gdf_fs)

games_df_all = pd.concat(games_df_list, ignore_index=True)

print("[STEP 4F FIXED] games_df_all shape:", games_df_all.shape)
print("[STEP 4F FIXED] games_df_all seasons:", sorted(games_df_all["season"].dropna().unique().tolist()))


if "fair_payout" not in globals():
    def fair_payout(p):
        if p is None or np.isnan(p) or p <= 0 or p >= 1:
            return np.nan
        return p / (1.0 - p)

g_all = games_df_all.copy()
g_all["season"] = pd.to_numeric(g_all["season"], errors="coerce").astype("Int64")
g_all["week"]   = pd.to_numeric(g_all["week"],   errors="coerce").astype("Int64")

g_2025 = g_all[g_all["season"] == 2025].copy()
if g_2025.empty:
    raise RuntimeError("No 2025 rows found in games_df_all for Panel 6.")

if "game_id" in g_2025.columns:
    g_2025_base = g_2025.drop_duplicates(subset=["season", "week", "game_id"]).copy()
else:
    g_2025_base = g_2025.drop_duplicates(
        subset=["season", "week", "home_team", "away_team"]
    ).copy()

weeks_2025_all = sorted(g_2025_base["week"].dropna().astype(int).unique().tolist())
if not weeks_2025_all:
    raise RuntimeError("No week values found for 2025 in games_df_all.")

auto_current_week = weeks_2025_all[-1]
for w in weeks_2025_all:
    sub = g_2025_base[g_2025_base["week"] == w]
    if "y_true" not in sub.columns or sub["y_true"].isna().any():
        auto_current_week = w
        break

print("[Panel 6] Auto-detected current 2025 week:", auto_current_week)

feature_sets_2025 = sorted(g_2025["feature_set"].dropna().unique().tolist())


def _get_initial_weeks_allowed(current_week_val: int):
    return [wk for wk in weeks_2025_all if wk <= current_week_val]

season_options_p6 = sorted(g_2025["season"].dropna().astype(int).unique().tolist())
default_season_p6 = 2025 if 2025 in season_options_p6 else season_options_p6[0]

season_widget_p6 = widgets.Dropdown(
    options=season_options_p6,
    value=default_season_p6,
    description="Season:",
)

current_week_widget_p6 = widgets.Dropdown(
    options=weeks_2025_all,
    value=auto_current_week,
    description="Current week:",
)

weeks_allowed_initial = _get_initial_weeks_allowed(auto_current_week)

week_widget_p6 = widgets.Dropdown(
    options=weeks_allowed_initial,
    value=weeks_allowed_initial[-1],
    description="Week:",
)

feature_set_widget_p6 = widgets.Dropdown(
    options=feature_sets_2025,
    value="baseline_noleak" if "baseline_noleak" in feature_sets_2025 else feature_sets_2025[0],
    description="Feature set:",
)

def _on_current_week_change(change):
    new_current = int(change["new"])
    new_weeks_allowed = _get_initial_weeks_allowed(new_current)
    if not new_weeks_allowed:
        return
    week_widget_p6.options = new_weeks_allowed
    if week_widget_p6.value not in new_weeks_allowed:
        week_widget_p6.value = new_weeks_allowed[-1]

current_week_widget_p6.observe(_on_current_week_change, names="value")

controls_p6 = widgets.HBox([
    season_widget_p6,
    current_week_widget_p6,
    week_widget_p6,
    feature_set_widget_p6,
])


def _get_current_week():
    try:
        return int(current_week_widget_p6.value)
    except Exception:
        return auto_current_week

def _get_tau_global():
    """Global τ: best combo_score from threshold_df if available, else 0.50."""
    tau = 0.50
    if "threshold_df" in globals() and threshold_df is not None and not threshold_df.empty:
        try:
            tau, _ = get_best_tau_combo(threshold_df)  
        except Exception:
            tau = 0.50
    return float(tau)


def _compute_picks_and_ev(df: pd.DataFrame, tau: float) -> pd.DataFrame:
    """
    For 2025 slice with a given feature_set:
      - model_pick_2025, market_pick_2025
      - disagree_flag_2025
      - model_ev_2025, market_ev_2025 (only where y_true is present)
    """
    df = df.copy()

    df["pred_prob_home_cover"] = pd.to_numeric(df["pred_prob_home_cover"], errors="coerce")
    df["model_pick_2025"] = (df["pred_prob_home_cover"] >= tau).astype(int)

    if {"home_spread_vigfree", "away_spread_vigfree"}.issubset(df.columns):
        home_p = pd.to_numeric(df["home_spread_vigfree"], errors="coerce")
        away_p = pd.to_numeric(df["away_spread_vigfree"], errors="coerce")
    else:
        home_p = pd.to_numeric(df.get("home_spread_prob"), errors="coerce")
        away_p = pd.to_numeric(df.get("away_spread_prob"), errors="coerce")

    df["market_pick_2025"] = (home_p >= away_p).astype(int)
    df["disagree_flag_2025"] = (df["model_pick_2025"].values != df["market_pick_2025"].values).astype(int)

    if "y_true" not in df.columns:
        df["model_ev_2025"] = np.nan
        df["market_ev_2025"] = np.nan
        return df

    y = pd.to_numeric(df["y_true"], errors="coerce")
    df["model_ev_2025"] = np.nan
    df["market_ev_2025"] = np.nan

    for idx, row in df.iterrows():
        yt = row["y_true"]
        if pd.isna(yt):
            continue

        if row["model_pick_2025"] == 1:  
            p_home = home_p.loc[idx]
            pay = fair_payout(p_home)
            if not np.isnan(pay):
                df.at[idx, "model_ev_2025"] = pay if yt == 1 else -1.0
        else:  
            p_away = away_p.loc[idx]
            pay = fair_payout(p_away)
            if not np.isnan(pay):
                df.at[idx, "model_ev_2025"] = pay if yt == 0 else -1.0

        if row["market_pick_2025"] == 1:
            p_home_mkt = home_p.loc[idx]
            pay_mkt = fair_payout(p_home_mkt)
            if not np.isnan(pay_mkt):
                df.at[idx, "market_ev_2025"] = pay_mkt if yt == 1 else -1.0
        else:
            p_away_mkt = away_p.loc[idx]
            pay_mkt = fair_payout(p_away_mkt)
            if not np.isnan(pay_mkt):
                df.at[idx, "market_ev_2025"] = pay_mkt if yt == 0 else -1.0

    return df

def _get_2025_slice_for_feature_set(feature_set: str) -> pd.DataFrame:
    df = g_2025[g_2025["feature_set"] == feature_set].copy()
    if df.empty:
        return df
    cw = _get_current_week()
    df["week"] = pd.to_numeric(df["week"], errors="coerce").astype("Int64")
    mask_future = df["week"] > cw
    df.loc[mask_future, "y_true"] = np.nan

    tau = _get_tau_global()
    df = _compute_picks_and_ev(df, tau)
    return df



def _compute_2025_year_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Year overview (played weeks only: week < current_week):
      - n_games_total: total 2025 games in played weeks
      - n_games_labeled: games with y_true present
      - model_accuracy / market_accuracy / disagree_rate on labeled games only
    """
    cw = _get_current_week()
    df = df.copy()
    df["week"] = pd.to_numeric(df["week"], errors="coerce").astype("Int64")
    df = df[df["week"] < cw]  

    n_total = len(df)

    if "y_true" not in df.columns or n_total == 0:
        return pd.DataFrame([{
            "n_games_total": n_total,
            "n_games_labeled": 0,
            "model_accuracy": np.nan,
            "disagree_rate": np.nan,
            "market_accuracy": np.nan,
        }])

    y = pd.to_numeric(df["y_true"], errors="coerce")
    mask = y.notna()
    df_valid = df.loc[mask].copy()
    n_labeled = len(df_valid)

    if df_valid.empty:
        return pd.DataFrame([{
            "n_games_total": n_total,
            "n_games_labeled": 0,
            "model_accuracy": np.nan,
            "disagree_rate": np.nan,
            "market_accuracy": np.nan,
        }])

    model_correct = (
        df_valid["model_pick_2025"].astype(int).values ==
        y.loc[mask].astype(int).values
    )
    market_correct = (
        df_valid["market_pick_2025"].astype(int).values ==
        y.loc[mask].astype(int).values
    )

    model_acc = model_correct.mean()
    mkt_acc = market_correct.mean()
    disagree_rate = df_valid["disagree_flag_2025"].mean()

    return pd.DataFrame([{
        "n_games_total": n_total,
        "n_games_labeled": n_labeled,
        "model_accuracy": model_acc,
        "disagree_rate": disagree_rate,
        "market_accuracy": mkt_acc,
    }])


def _compute_weekly_perf_2025(df: pd.DataFrame) -> pd.DataFrame:
    """
    Weekly performance: model_accuracy, market_accuracy, disagree_rate per week.
    Uses only rows with y_true present, weeks < current_week.
    """
    if "y_true" not in df.columns:
        return pd.DataFrame(columns=["week", "model_accuracy", "market_accuracy", "disagree_rate", "n_games"])

    cw = _get_current_week()
    df = df.copy()
    df["week"] = pd.to_numeric(df["week"], errors="coerce").astype("Int64")
    df = df.dropna(subset=["week"])
    df = df[df["week"] < cw]  

    y = pd.to_numeric(df["y_true"], errors="coerce")
    df["label_notna"] = y.notna()
    df_valid = df[df["label_notna"]].copy()
    if df_valid.empty:
        return pd.DataFrame(columns=["week", "model_accuracy", "market_accuracy", "disagree_rate", "n_games"])

    def _agg_week(group):
        y_week = pd.to_numeric(group["y_true"], errors="coerce")
        model_correct = (group["model_pick_2025"].astype(int).values == y_week.astype(int).values)
        market_correct = (group["market_pick_2025"].astype(int).values == y_week.astype(int).values)
        return pd.Series({
            "n_games": len(group),
            "model_accuracy": model_correct.mean(),
            "market_accuracy": market_correct.mean(),
            "disagree_rate": group["disagree_flag_2025"].mean(),
        })

    res = df_valid.groupby("week").apply(_agg_week).reset_index()
    res = res.sort_values("week")
    return res


def draw_panel_6(season: int, week: int, feature_set: str):
    if season != 2025:
        print("[Panel 6] This panel is designed for 2025 only. Adjusting season to 2025.")
        season = 2025

    df_fs = _get_2025_slice_for_feature_set(feature_set)
    if df_fs.empty:
        print(f"[Panel 6] No 2025 data for feature_set={feature_set}.")
        return

    current_week = _get_current_week()
    is_prediction_week = (week == current_week)

    year_summary = _compute_2025_year_summary(df_fs)
    year_summary_display = year_summary.copy()
    for c in ["model_accuracy", "disagree_rate", "market_accuracy"]:
        if c in year_summary_display.columns:
            year_summary_display[c] = year_summary_display[c] * 100.0

    print(f"[Panel 6] 2025 Year Overview — feature_set={feature_set} (weeks < current_week={current_week})")
    display(year_summary_display.style.format({
        "model_accuracy": "{:.2f}",
        "disagree_rate": "{:.2f}",
        "market_accuracy": "{:.2f}",
    }))

    weekly = _compute_weekly_perf_2025(df_fs)
    if weekly.empty:
        print("\n[Panel 6] No completed weeks with labels yet for weekly performance.")
    else:
        x = weekly["week"].values
        fig, ax = plt.subplots(figsize=(10, 4))

        ax.plot(x, weekly["model_accuracy"].values, marker="o", label="model_accuracy")
        ax.plot(x, weekly["market_accuracy"].values, marker="s", linestyle="--", label="market_accuracy")
        ax.set_xlabel("Week")
        ax.set_ylabel("Accuracy")
        ax.set_title(f"2025 weekly performance — model vs market (feature_set={feature_set})")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, alpha=0.3)
        ax.legend(loc="best")

        plt.tight_layout()
        plt.show()

        print("[Panel 6] Weekly performance (completed weeks only):")
        display(weekly)

    df_week = df_fs[df_fs["week"] == week].copy()
    if df_week.empty:
        print(f"\n[Panel 6] No games found in 2025 for week={week}, feature_set={feature_set}.")
        return

    home = df_week["home_team"].astype(str).fillna("")
    away = df_week["away_team"].astype(str).fillna("")
    df_week["matchup"] = away + " @ " + home

    home_arr = home.values
    away_arr = away.values

    df_week["model_pick_team"] = np.where(df_week["model_pick_2025"] == 1, home_arr, away_arr)
    df_week["market_pick_team"] = np.where(df_week["market_pick_2025"] == 1, home_arr, away_arr)

    spread_col = None
    for cand in ["spread_line", "home_spread_line", "spread"]:
        if cand in df_week.columns:
            spread_col = cand
            break

    if spread_col is not None:
        spreads = pd.to_numeric(df_week[spread_col], errors="coerce").values
    else:
        spreads = np.full(len(df_week), np.nan)

    if not is_prediction_week:
        if "y_true" in df_week.columns:
            y_num = pd.to_numeric(df_week["y_true"], errors="coerce").values
            df_week["team_that_covered"] = np.where(
                np.isnan(y_num),
                np.nan,
                np.where(y_num == 1, home_arr, away_arr),
            )
        else:
            df_week["team_that_covered"] = np.nan

        cols_week = [
            "home_team",
            "away_team",
            "matchup",
            "pred_prob_home_cover",
            "home_spread_vigfree" if "home_spread_vigfree" in df_week.columns else "home_spread_prob",
            "model_pick_team",
            "market_pick_team",
            "disagree_flag_2025",
            "team_that_covered",
        ]
        cols_week = [c for c in cols_week if c in df_week.columns]

        print(f"\n[Panel 6] Week {week} overview — realized results (feature_set={feature_set}):")
        display(
            df_week[cols_week]
            .rename(columns={
                "pred_prob_home_cover": "model probability home covers",
                "home_spread_vigfree": "market probability home covers (vig-free)",
                "home_spread_prob": "market probability home covers (raw)",
                "model_pick_team": "model pick",
                "market_pick_team": "market pick",
                "disagree_flag_2025": "disagree_flag",
                "team_that_covered": "team that covered",
            })
            .reset_index(drop=True)
        )

    else:
        pred_texts = []
        for team, spr in zip(df_week["model_pick_team"].values, spreads):
            if np.isnan(spr):
                pred_texts.append(f"Predicts {team} will cover the spread")
            else:
                pred_texts.append(f"Predicts {team} will cover the spread of {spr}")

        df_week["prediction_text"] = pred_texts

        cols_week = [
            "home_team",
            "away_team",
            "matchup",
            "pred_prob_home_cover",
            "home_spread_vigfree" if "home_spread_vigfree" in df_week.columns else "home_spread_prob",
            "model_pick_team",
            "market_pick_team",
            "prediction_text",
        ]
        cols_week = [c for c in cols_week if c in df_week.columns]

        print(f"\n[Panel 6] Week {week} — PREDICTION MODE (current_week={current_week}, feature_set={feature_set}):")
        display(
            df_week[cols_week]
            .rename(columns={
                "pred_prob_home_cover": "model probability home covers",
                "home_spread_vigfree": "market probability home covers (vig-free)",
                "home_spread_prob": "market probability home covers (raw)",
                "model_pick_team": "model pick",
                "market_pick_team": "market pick",
                "prediction_text": "Prediction",
            })
            .reset_index(drop=True)
        )



def update_dashboard_panel6(season, week, feature_set):
    clear_output(wait=True)
    display(controls_p6)

    print("\n=== Panel 6 — 2025 Performance and Model Predictions ===")
    draw_panel_6(season, week, feature_set)

out_panel6 = widgets.interactive_output(
    update_dashboard_panel6,
    {
        "season": season_widget_p6,
        "week": week_widget_p6,
        "feature_set": feature_set_widget_p6,
    },
)

display(out_panel6)


[GAMES] Building games_df for feature_set=full_leak
[GAMES] Building games_df for feature_set=normalized
[GAMES] Building games_df for feature_set=corr_filtered
[GAMES] Building games_df for feature_set=pca
[STEP 4F FIXED] games_df_all shape: (23286, 335)
[STEP 4F FIXED] games_df_all seasons: [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
[Panel 6] Auto-detected current 2025 week: 13


Output()