# Week 2 — Build Actuals ➜ Grade Model Performance (All‑in‑One, v2) 🚀

This version is **robust to missing/renamed `side` columns** in your props.  
It will derive a normalized `side` (Over/Under/Yes/No) from any of:
- `side`, `bet_side`, `ou_side`, `yes_no_side`, or by parsing `bet` text.


In [2]:
# --- Week 2: props vs actuals grading (join via player_display_name) ---
from pathlib import Path
import pandas as pd, numpy as np, re, sys



SEASON, WEEK = 2025, 2
BASE = Path.cwd()
props_file = BASE / f"props_with_model_week{WEEK}.csv"
stats_file = BASE / "weekly_player_stats_2025.parquet"

# 1) Load props
dfp = pd.read_csv(props_file)
print("[props]", dfp.shape)

# 2) Expand explicit sides
OU = {
    "rush_yds","rushing_yards",
    "recv_yds","reception_yds","receiving_yards",
    "pass_yds","passing_yards",
    "receptions","rush_attempts","completions","attempts",
}
YESNO = {"anytime_td"}

rows = []
for _, r in dfp.iterrows():
    m = str(r["market_std"]).lower()
    if m in OU and pd.notna(r.get("point")):
        for side in ("over","under"):
            x = r.copy(); x["side"] = side; rows.append(x)
    elif m in YESNO:
        for side in ("yes","no"):
            x = r.copy(); x["side"] = side; rows.append(x)
    # skip exotics for grading
dfp = pd.DataFrame(rows).reset_index(drop=True)
print("[props expanded]", dfp.shape)

# 3) Load stats (parquet)
dfs = pd.read_parquet(stats_file)
dfs = dfs[(dfs["season"]==SEASON) & (dfs["week"]==WEEK)]
print("[stats filtered]", dfs.shape)

# 4) Name normalizer (try common_markets; fallback locally)
scripts_dir = BASE / "scripts"
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

def _fallback_std_name(s):
    if s is None or (isinstance(s, float) and pd.isna(s)): return ""
    s = str(s).lower().strip()
    s = re.sub(r"[^a-z0-9 ]+", "", s)          # keep letters/digits/spaces
    s = " ".join(t for t in s.split() if t not in {"jr","sr","ii","iii","iv","v"})
    return s.replace(" ", "")                  # collapse spaces

std_name = None
try:
    import common_markets as cm
    for cand in ("std_player_name","std_name","normalize_name","name_std","norm_name"):
        if hasattr(cm, cand):
            std_name = getattr(cm, cand); print(f"[join] using common_markets.{cand}")
            break
except Exception as e:
    print("[join] common_markets not available:", e)
if std_name is None:
    std_name = _fallback_std_name
    print("[join] using local fallback normalizer")

# 5) Build canonical name_std on BOTH sides
# props: prefer existing name_std if present; otherwise from 'name' or 'player'
source_col = "name_std" if "name_std" in dfp.columns else ("name" if "name" in dfp.columns else "player")
dfp["name_std"] = dfp[source_col].apply(std_name)

# stats: IMPORTANT — use player_display_name (full name), not player_name (abbr)
dfs["name_std"] = dfs["player_display_name"].apply(std_name)

# 6) Merge on canonical name
dfm = dfp.merge(dfs, on="name_std", how="left", suffixes=("","_stat"))
matched = dfm["player_id"].notna().sum() if "player_id" in dfm.columns else dfm["team"].notna().sum()
print(f"[merge] merged rows: {len(dfm)}  matched: {matched}")

# 7) Compute actual_value per market
MAP = {
    # Passing
    "pass_yds":"passing_yards","passing_yards":"passing_yards",
    "pass_tds":"passing_tds","passing_tds":"passing_tds",
    "pass_ints":"passing_interceptions","pass_interceptions":"passing_interceptions",
    "completions":"completions","pass_cmp":"completions",
    "attempts":"attempts","pass_att":"attempts",
    # Rushing
    "rush_yds":"rushing_yards","rushing_yards":"rushing_yards",
    "rush_attempts":"carries","carries":"carries",
    "rushing_tds":"rushing_tds",
    # Receiving
    "recv_yds":"receiving_yards","reception_yds":"receiving_yards","receiving_yards":"receiving_yards",
    "receptions":"receptions","receiving_tds":"receiving_tds",
}
def actual_value(row):
    m = str(row["market_std"]).lower()
    if m in YESNO:
        return float((row.get("rushing_tds",0) or 0) + (row.get("receiving_tds",0) or 0))
    col = MAP.get(m)
    return float(row[col]) if col and col in row and pd.notna(row[col]) else np.nan

dfm["actual_value"] = dfm.apply(actual_value, axis=1)

# 8) Decide result
def decide(side, actual, point):
    if pd.isna(actual): return np.nan
    s = str(side).lower()
    if s=="yes": return 1.0 if actual>0 else 0.0
    if s=="no":  return 1.0 if actual==0 else 0.0
    if pd.isna(point): return np.nan
    return 1.0 if (s=="over" and actual>=point) or (s=="under" and actual<=point) else 0.0

dfm["result"] = dfm.apply(lambda r: decide(r["side"], r["actual_value"], r.get("point")), axis=1)

# 9) Save actuals
##Zactuals = dfm.loc[dfm["actual_value"].notna(), ["player_key","name","market_std","side","point","actual_value","result"]]
#actuals.to_csv(actuals_out, index=False)
#print(f"[write] actuals -> {actuals_out} rows={len(actuals)}")

# 10) Metrics (overall + per-market)
p_model = "model_prob"
eps = 1e-9
mask = dfm["result"].notna() & dfm[p_model].notna()
graded = dfm.loc[mask].copy()

if len(graded)==0:
    print("[grade] no graded rows — check name join coverage above")
else:
    graded["brier"] = (graded[p_model] - graded["result"])**2
    p = np.clip(graded[p_model].astype(float), eps, 1-eps)
    graded["logloss"] = -(graded["result"]*np.log(p) + (1-graded["result"])*np.log(1-p))

    # Overall
    print("\n[Week2 overall]")
    print("  rows    :", len(graded))
    print("  hit rate:", graded["result"].mean())
    print("  brier   :", graded["brier"].mean())
    print("  logloss :", graded["logloss"].mean())

    # Per-market summary
    def safemean(s): return pd.to_numeric(s, errors="coerce").dropna().mean()
    market_perf = (graded
        .groupby("market_std", dropna=False)
        .agg(n=("result","count"),
             hit=("result","mean"),
             brier=("brier", safemean),
             logloss=("logloss", safemean))
        .reset_index()
        .sort_values(["n","hit"], ascending=[False,False])
    )

    # Write grades + per-market
    grades_out = BASE / f"data/eval/grades_week{WEEK}.csv"
    market_out = BASE / f"data/eval/market_perf_week{WEEK}.csv"
    grades_out.parent.mkdir(parents=True, exist_ok=True)
    graded.to_csv(grades_out, index=False)
    market_perf.to_csv(market_out, index=False)
    print(f"[write] grades  -> {grades_out} (rows={len(graded)})")
    print(f"[write] markets -> {market_out} (rows={len(market_perf)})")

    # Quick glance
    display(market_perf.head(12))


[props] (2673, 24)
[props expanded] (2974, 25)
[stats filtered] (970, 114)
[join] common_markets not available: No module named 'common_markets'
[join] using local fallback normalizer
[merge] merged rows: 2974  matched: 156

[Week2 overall]
  rows    : 156
  hit rate: 0.5
  brier   : 0.32772987664122855
  logloss : 0.8793457747230159
[write] grades  -> /Users/pwitt/fourth-and-value/notebooks/data/eval/grades_week2.csv (rows=156)
[write] markets -> /Users/pwitt/fourth-and-value/notebooks/data/eval/market_perf_week2.csv (rows=1)


Unnamed: 0,market_std,n,hit,brier,logloss
0,anytime_td,156,0.5,0.32773,0.879346


In [None]:
SEASON, WEEK = 2025, 2
BASE = Path.cwd()
props_file = BASE / f"props_with_model_week{WEEK}.csv"
stats_file = BASE / "weekly_player_stats_2025.parquet"

# 1) Load props
dfp = pd.read_csv(props_file)
print("[props]", dfp.shape)

In [None]:
dfs.player_id

In [None]:
dfp.columns

In [None]:
dfp.player_key

In [None]:
import numpy as np
import re

# --- 0) Minimal helpers (use your scripts/common_markets if you prefer) ---
def _std_name(x: str) -> str:
    if pd.isna(x): return ""
    x = x.lower()
    x = re.sub(r"[^a-z0-9\s]", "", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x

# Market aliases → a compact, opinionated set that matches nflverse columns below
ALIASES = {
    # yards
    "player_pass_yds": "pass_yds", "pass_yds": "pass_yds",
    "player_rush_yds": "rush_yds", "rush_yds": "rush_yds",
    "player_rec_yds": "rec_yds",   "rec_yds": "rec_yds", "reception_yds": "rec_yds",
    # counts
    "receptions": "receptions", "player_receptions": "receptions",
    "pass_attempts": "pass_attempts", "player_pass_attempts": "pass_attempts",
    "pass_completions": "pass_completions", "player_pass_completions": "pass_completions",
    "rush_attempts": "rush_attempts", "carries": "rush_attempts",
    # scoring / turnovers
    "pass_tds": "pass_tds", "player_pass_tds": "pass_tds",
    "pass_interceptions": "pass_ints", "interceptions": "pass_ints",
    "anytime_td": "anytime_td",
    # longest
    "rush_longest": "rush_long", "player_rush_longest": "rush_long",
    "reception_longest": "rec_long", "player_reception_longest": "rec_long",
}

def _std_market(x: str) -> str:
    if pd.isna(x): return ""
    y = x.strip().lower()
    return ALIASES.get(y, y)

def _first_existing(colnames, df):
    for c in colnames:
        if c in df.columns:
            return c
    return None

# --- 1) Normalize keys on both sides ---
dfp = dfp.copy()
dfs = dfs.copy()

dfp["name_std"] = dfp[_first_existing(["player","name","Player"], dfp)].map(_std_name)
dfp["market_std"] = dfp[_first_existing(["market","Market"], dfp)].map(_std_market)

dfs["name_std"] = dfs[_first_existing(["player","name","player_name"], dfs)].map(_std_name)

# --- 2) Choose line & side columns (robust to schema differences) ---
line_col = _first_existing(["line","point","line_disp","points","handicap","total"], dfp)
if line_col is None:
    dfp["line"] = np.nan
else:
    dfp["line"] = pd.to_numeric(dfp[line_col], errors="coerce")

# Side can be Over/Under or Yes/No; try to derive if missing
if "side" not in dfp.columns:
    bet_col = _first_existing(["bet","Bet","wager","selection"], dfp)
    if bet_col:
        dfp["side"] = dfp[bet_col].str.extract(r"^(Over|Under|Yes|No)", expand=False)
    else:
        dfp["side"] = np.nan

# --- 3) Map df markets -> dfs stat columns (DEFENSIVE) ---
def pick_col(df, *cands):
    for c in cands:
        if c and c in df.columns:
            return c
    return None

MARKET_TO_COL = {
    # yards
    "pass_yds":      pick_col(dfs, "passing_yards", "pass_yards", "pass_yds"),
    "rush_yds":      pick_col(dfs, "rushing_yards", "rush_yards", "rush_yds"),
    "rec_yds":       pick_col(dfs, "receiving_yards", "rec_yards", "rec_yds", "reception_yds"),
    # counts
    "receptions":    pick_col(dfs, "receptions", "recs"),
    "pass_attempts": pick_col(dfs, "passing_attempts", "pass_attempts", "attempts"),
    "pass_completions": pick_col(dfs, "passing_completions", "pass_completions", "completions"),
    "rush_attempts": pick_col(dfs, "rushing_attempts", "rush_attempts", "carries"),
    # scoring / turnovers
    "pass_tds":      pick_col(dfs, "passing_tds", "pass_tds"),
    "pass_ints":     pick_col(dfs, "interceptions", "passing_interceptions", "pass_ints", "ints"),
    # longest
    "rush_long":     pick_col(dfs, "rushing_long", "rush_long", "long_rush"),
    "rec_long":      pick_col(dfs, "receiving_long", "rec_long", "long_rec"),
    # anytime_td handled specially below
}

# TD columns for anytime TD
td_candidates = ["total_tds","rushing_tds","receiving_tds","kick_return_tds","punt_return_tds",
                 "defensive_tds","special_teams_tds"]
td_cols = [c for c in td_candidates if c in dfs.columns]

# Build the exact set of columns we will slice from dfs (only those that exist)
used_stat_cols = {"name_std"}
used_stat_cols.update([c for c in MARKET_TO_COL.values() if c is not None])
used_stat_cols.update(td_cols)

dfs_slice = dfs[list(used_stat_cols)].copy()

# Helpful diagnostics
missing_map = {m: col for m, col in MARKET_TO_COL.items() if col is None}
if missing_map:
    print("[warn] missing stat columns for these markets (no matching column in dfs):", missing_map)


# --- 4) Merge (player-level, same week) ---
merged = (
    dfp.merge(dfs_slice, on="name_std", how="left", suffixes=("","_s"))
)

print(f"[merge] matched player rows: {(~merged[dfs_slice.columns.difference(['name_std'])].isna()).any(axis=1).sum()} / {len(merged)}")

# --- 5) Compute actual_value per row (vectorized by market_std) ---
merged["actual_value"] = np.nan

for m, col in MARKET_TO_COL.items():
    if isinstance(col, str) and col in merged.columns:
        sel = merged["market_std"].eq(m)
        merged.loc[sel, "actual_value"] = pd.to_numeric(merged.loc[sel, col], errors="coerce")

# Anytime TD (binary)
if "anytime_td" in merged["market_std"].unique():
    # prefer total_tds if present; else sum rushing+receiving+returns
    if "total_tds" in merged.columns:
        td_any = (pd.to_numeric(merged["total_tds"], errors="coerce") > 0).astype(int)
    else:
        parts = [pd.to_numeric(merged.get(c, 0), errors="coerce").fillna(0) for c in td_cols]
        td_any = (np.sum(parts, axis=0) > 0).astype(int)
    merged.loc[merged["market_std"].eq("anytime_td"), "actual_value"] = td_any

# --- 6) Derive actual_side (handles O/U + Yes/No + Push) ---
OU_MARKETS = {"pass_yds","rush_yds","rec_yds","receptions",
              "pass_attempts","pass_completions","rush_attempts",
              "pass_tds","pass_ints","rush_long","rec_long"}

merged["actual_side"] = np.nan
is_ou = merged["market_std"].isin(OU_MARKETS) & merged["line"].notna() & merged["actual_value"].notna()
merged.loc[is_ou & (merged["actual_value"] > merged["line"]), "actual_side"] = "Over"
merged.loc[is_ou & (merged["actual_value"] < merged["line"]), "actual_side"] = "Under"
merged.loc[is_ou & (merged["actual_value"] == merged["line"]), "actual_side"] = "Push"  # keep pushes explicit

is_any = merged["market_std"].eq("anytime_td") & merged["actual_value"].notna()
merged.loc[is_any, "actual_side"] = np.where(merged.loc[is_any, "actual_value"].astype(int) > 0, "Yes", "No")

# --- 7) Quick coverage + sanity prints ---
def _c(n): return f"{n:,}"
print("[coverage]")
print("  rows total        :", _c(len(merged)))
print("  rows w/ actualval :", _c(merged["actual_value"].notna().sum()))
print("  rows w/ actualside:", _c(merged["actual_side"].notna().sum()))
print("  pushes (O/U)      :", _c((merged["actual_side"]=="Push").sum()))

# Unmapped markets you might want to add to ALIASES/MARKET_TO_COL
unmapped = (merged.loc[merged["actual_value"].isna(), "market_std"]
            .value_counts().head(15))
print("\n[top unmapped markets]")
print(unmapped)

# --- 8) Optional: compute hit flag (ignoring NaN & Push) ---
has_side = merged["side"].notna() & merged["actual_side"].notna()
not_push = merged["actual_side"] != "Push"
merged["hit"] = np.where(has_side & not_push & (merged["side"] == merged["actual_side"]), 1,
                  np.where(has_side & not_push, 0, np.nan))

print("\n[hit-rate quick check]")
mask = merged["hit"].notna()
if mask.any():
    print(f"  hit-rate: {merged.loc[mask,'hit'].mean():.3f} on {int(mask.sum())} graded")
else:
    print("  no graded rows yet (likely missing 'side' or mapping)")


In [1]:
import pandas as pd, numpy as np, re
from pathlib import Path

BASE = Path.cwd()
dfp = pd.read_csv(BASE/"props_with_model_week2.csv")   # projections
dfs = pd.read_parquet(BASE/"eekly_player_stats_2025.parquet")  # nflverse weekly; filter week=2 below
dfs = dfs.query("season==2025 and week==2").copy()

# --- normalizers (use your scripts/common_markets if available) ---
try:
    from scripts.common_markets import std_market as _std_market, std_name as _std_name
except Exception:
    def _std_name(x: str) -> str:
        if pd.isna(x): return ""
        x = x.lower()
        x = re.sub(r"[^a-z0-9\s]", "", x)
        return re.sub(r"\s+", " ", x).strip()

    ALIASES = {
        "player_pass_yds":"pass_yds","pass_yds":"pass_yds",
        "player_rush_yds":"rush_yds","rush_yds":"rush_yds",
        "player_rec_yds":"rec_yds","rec_yds":"rec_yds","reception_yds":"rec_yds",
        "receptions":"receptions","player_receptions":"receptions",
        "pass_attempts":"pass_attempts","player_pass_attempts":"pass_attempts",
        "pass_completions":"pass_completions","player_pass_completions":"pass_completions",
        "rush_attempts":"rush_attempts","carries":"rush_attempts",
        "pass_tds":"pass_tds","player_pass_tds":"pass_tds",
        "pass_interceptions":"pass_ints","interceptions":"pass_ints",
        "anytime_td":"anytime_td",
        "rush_longest":"rush_long","player_rush_longest":"rush_long",
        "reception_longest":"rec_long","player_reception_longest":"rec_long",
    }
    def _std_market(x: str) -> str:
        if pd.isna(x): return ""
        y = x.strip().lower()
        return ALIASES.get(y, y)

# --- keys ---
dfp = dfp.copy()
dfp["name_std"] = dfp[["player","name"]].ffill(axis=1).iloc[:,0].map(_std_name)
dfp["market_std"] = dfp[["market","Market"]].ffill(axis=1).iloc[:,0].map(_std_market)
# pick a line column & side column
line_col = next((c for c in ["line","point","line_disp","points","handicap","total"] if c in dfp.columns), None)
dfp["line"] = pd.to_numeric(dfp[line_col], errors="coerce") if line_col else np.nan
if "side" not in dfp.columns:
    guess = next((c for c in ["bet","Bet","selection"] if c in dfp.columns), None)
    dfp["side"] = dfp[guess].str.extract(r"^(Over|Under|Yes|No)", expand=False) if guess else np.nan

# dfs keys
dfs = dfs.copy()
name_col = next((c for c in ["player","player_name","name"] if c in dfs.columns), None)
dfs["name_std"] = dfs[name_col].map(_std_name)

# --- map markets to stat columns (defensive against schema diffs) ---
def pick(df, *cands):
    for c in cands:
        if c in df.columns: return c
    return None

M2C = {
    "pass_yds":      pick(dfs, "passing_yards","pass_yards"),
    "rush_yds":      pick(dfs, "rushing_yards","rush_yards"),
    "rec_yds":       pick(dfs, "receiving_yards","rec_yards"),
    "receptions":    pick(dfs, "receptions"),
    "pass_attempts": pick(dfs, "passing_attempts"),
    "pass_completions": pick(dfs, "passing_completions"),
    "rush_attempts": pick(dfs, "rush_attempts","rushing_attempts"),
    "pass_tds":      pick(dfs, "passing_tds"),
    "pass_ints":     pick(dfs, "interceptions","passing_interceptions"),
    "rush_long":     pick(dfs, "rushing_long"),
    "rec_long":      pick(dfs, "receiving_long"),
}

td_parts = [c for c in ["total_tds","rushing_tds","receiving_tds","kick_return_tds","punt_return_tds",
                        "defensive_tds","special_teams_tds"] if c in dfs.columns]

dfs_slice_cols = {"name_std"} | {c for c in M2C.values() if c} | set(td_parts)
dfs_slice = dfs[list(dfs_slice_cols)].copy()

# --- merge ---
merged = dfp.merge(dfs_slice, on="name_std", how="left", suffixes=("","_s"))
print(f"[merge] rows: {len(merged):,}")

# --- actual_value per market ---
merged["actual_value"] = np.nan
for m, col in M2C.items():
    if col:
        sel = merged["market_std"].eq(m)
        merged.loc[sel, "actual_value"] = pd.to_numeric(merged.loc[sel, col], errors="coerce")

# anytime TD (binary yes/no)
if "anytime_td" in merged["market_std"].unique():
    if "total_tds" in merged.columns:
        td_any = (pd.to_numeric(merged["total_tds"], errors="coerce") > 0).astype(int)
    else:
        parts = [pd.to_numeric(merged.get(c, 0), errors="coerce").fillna(0) for c in td_parts]
        td_any = (np.sum(parts, axis=0) > 0).astype(int)
    merged.loc[merged["market_std"].eq("anytime_td"), "actual_value"] = td_any

# --- actual_side (O/U & Yes/No), keep pushes explicit ---
OU = {"pass_yds","rush_yds","rec_yds","receptions","pass_attempts","pass_completions",
      "rush_attempts","pass_tds","pass_ints","rush_long","rec_long"}
merged["actual_side"] = np.nan
ou_mask = merged["market_std"].isin(OU) & merged["line"].notna() & merged["actual_value"].notna()
merged.loc[ou_mask & (merged["actual_value"] > merged["line"]), "actual_side"] = "Over"
merged.loc[ou_mask & (merged["actual_value"] < merged["line"]), "actual_side"] = "Under"
merged.loc[ou_mask & (merged["actual_value"] == merged["line"]), "actual_side"] = "Push"

any_mask = merged["market_std"].eq("anytime_td") & merged["actual_value"].notna()
merged.loc[any_mask, "actual_side"] = np.where(merged.loc[any_mask,"actual_value"].astype(int)>0, "Yes", "No")

# --- quick diagnostics ---
print("[coverage]")
print("  with actual_value :", int(merged["actual_value"].notna().sum()))
print("  with actual_side  :", int(merged["actual_side"].notna().sum()))
missing = (merged.loc[merged["actual_value"].isna(),"market_std"].value_counts().head(10))
if len(missing): print("[unmapped top]"); print(missing)

# --- grade (ignore pushes & rows without a declared side) ---
mask = merged["actual_side"].notna() & merged["side"].notna() & (merged["actual_side"]!="Push")
if mask.any():
    merged["hit"] = (merged.loc[mask,"actual_side"] == merged.loc[mask,"side"]).astype(int)
    hit = merged.loc[mask,"hit"].mean()
    print(f"[hit-rate] {hit:.3f} on {int(mask.sum())} graded rows")
else:
    print("[hit-rate] 0 graded rows (likely missing market mapping or side/line)")

# Optional: small by-market summary you can print or to_csv
by_mkt = (merged[mask]
          .groupby("market_std")["hit"]
          .agg(rows="count", hit_rate="mean")
          .sort_values(["hit_rate","rows"], ascending=[False, False]))
print("\n[by-market]")
print(by_mkt.head(10))


FileNotFoundError: [Errno 2] No such file or directory: '/Users/pwitt/fourth-and-value/notebooks/eekly_player_stats_2025.parquet'