# Pre‑Merge Diagnostics — Props vs Params (Weekly)

This notebook inspects the **source files before merging** so we can quickly diagnose coverage issues.

It will:
- Load **props feed** and **model params** (and optional **edges** for comparison).
- Show **columns, shapes, and top markets** in each.
- Normalize to **canonical keys** (using `common_markets.standardize_input` if available).
- Build **join keys** and report **unmatched** rows by market for two strategies:
  - **A)** Join on `(player_key, market_std)` (**recommended** when params are not line-specific),
  - **B)** Join on `(player_key, market_std, point_key)` (when params are line-specific).
- Surface **why** rows fail to model (missing side/point/price vs missing `mu/σ/λ`).

Run from the repo root so relative paths resolve.

## 0) Parameters

In [1]:
SEASON = 2025
WEEK   = 2

PATH_PROPS  = 'data/props/latest_all_props.csv'
PATH_PARAMS = f'data/props/params_week{WEEK}.csv'
PATH_EDGES  = f'data/props/props_with_model_week{WEEK}.csv'

print('Using:')
print('  props :', PATH_PROPS)
print('  params:', PATH_PARAMS)
print('  edges :', PATH_EDGES)


Using:
  props : data/props/latest_all_props.csv
  params: data/props/params_week2.csv
  edges : data/props/props_with_model_week2.csv


## 1) Imports & helpers

In [2]:
import math, re
from pathlib import Path
import numpy as np
import pandas as pd

try:
    from common_markets import standardize_input as _std_input
    HAVE_STD = True
except Exception as e:
    HAVE_STD = False
    def _std_input(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        if 'market_std' not in df.columns and 'market' in df.columns:
            df['market_std'] = df['market'].astype(str).str.lower().str.strip()
        elif 'market_std' not in df.columns:
            df['market_std'] = ''
        if 'name' not in df.columns and 'over_under' in df.columns:
            df['name'] = df['over_under']
        if 'point' not in df.columns and 'line' in df.columns:
            df['point'] = pd.to_numeric(df['line'], errors='coerce')
        if 'name_std' not in df.columns and 'player' in df.columns:
            s = df['player'].astype(str).str.lower().str.replace(r'[^a-z0-9\s]','',regex=True)
            df['name_std'] = s.str.replace(r'\s+',' ',regex=True).str.strip()
        if 'player_key' not in df.columns and 'name_std' in df.columns:
            df['player_key'] = df['name_std'].str.replace(' ','-')
        return df

def point_key(v):
    try:
        f = float(v)
        return str(int(f)) if math.isfinite(f) and float(f).is_integer() else str(f)
    except Exception:
        return '' if pd.isna(v) else str(v)

def american_to_prob(price) -> float:
    try:
        p = float(price)
    except Exception:
        return np.nan
    if p > 0:
        return 100.0 / (p + 100.0)
    elif p < 0:
        return (-p) / ((-p) + 100.0)
    return np.nan

def normalize_side_column(df: pd.DataFrame, col_hint: str = 'name') -> pd.Series:
    s = df.get(col_hint, pd.Series(index=df.index, dtype=object)).astype(str).str.strip().str.lower()
    for alt in ('selection', 'bet_selection', 'bet_side', 'over_under'):
        if alt in df.columns:
            empty = s.isna() | s.eq('') | s.isin(['nan','none','null'])
            s = s.mask(empty, df[alt].astype(str).str.strip().str.lower())
    s = s.mask(s.str.contains(r'\bover\b',  na=False), 'over')
    s = s.mask(s.str.contains(r'\bunder\b', na=False), 'under')
    s = s.mask(s.str.contains(r'\byes\b',   na=False), 'yes')
    s = s.mask(s.str.contains(r'\bno\b',    na=False), 'no')
    return s


## 2) Load source files (pre‑merge)

In [3]:
props  = pd.read_csv(PATH_PROPS, low_memory=False)
params = pd.read_csv(PATH_PARAMS, low_memory=False)
edges  = pd.read_csv(PATH_EDGES, low_memory=False) if Path(PATH_EDGES).exists() else None

print('props  :', props.shape, 'cols:', len(props.columns))
print('params :', params.shape, 'cols:', len(params.columns))
print('edges  :', None if edges is None else edges.shape)
props.head(3)


props  : (14739, 13) cols: 13
params : (1041, 14) cols: 14
edges  : (14739, 23)


Unnamed: 0,game_id,commence_time,home_team,away_team,game,bookmaker,bookmaker_title,market,market_std,player,name,price,point
0,1ee9ea2c8256bc6be5dd92e60f6c17de,2025-09-14T17:00:00Z,Detroit Lions,Chicago Bears,Chicago Bears @ Detroit Lions,draftkings,DraftKings,player_1st_td,player_1st_td,Jahmyr Gibbs,Yes,400,
1,1ee9ea2c8256bc6be5dd92e60f6c17de,2025-09-14T17:00:00Z,Detroit Lions,Chicago Bears,Chicago Bears @ Detroit Lions,draftkings,DraftKings,player_1st_td,player_1st_td,David Montgomery,Yes,550,
2,1ee9ea2c8256bc6be5dd92e60f6c17de,2025-09-14T17:00:00Z,Detroit Lions,Chicago Bears,Chicago Bears @ Detroit Lions,draftkings,DraftKings,player_1st_td,player_1st_td,Amon-Ra St. Brown,Yes,750,


In [4]:
params.head(3)


Unnamed: 0,season,week,player,player_key,name_std,market_std,market,dist,mu,sigma,lam,used_logs,name,point
0,2025,2,Jahmyr Gibbs,jahmyrgibbs,jahmyrgibbs,anytime_td,anytime_td,poisson,0.25,,0.25,0,,
1,2025,2,David Montgomery,davidmontgomery,davidmontgomery,anytime_td,anytime_td,poisson,0.25,,0.25,0,,
2,2025,2,Amon-Ra St. Brown,amonrastbrown,amonrastbrown,anytime_td,anytime_td,poisson,0.25,,0.25,0,,


## 3) Normalize copies (canonical keys)

In [5]:
p_props  = _std_input(props.copy())
p_params = _std_input(params.copy())

for df in (p_props, p_params):
    if 'point' not in df.columns: df['point'] = np.nan
    df['point_key'] = df['point'].map(point_key)
p_props['name']  = normalize_side_column(p_props, 'name')
if 'name' in p_params.columns:
    p_params['name'] = normalize_side_column(p_params, 'name')
print('normalized shapes:', p_props.shape, p_params.shape)
p_props[['player','market_std','name','point','point_key']].head(5)


normalized shapes: (14739, 16) (1041, 15)


Unnamed: 0,player,market_std,name,point,point_key
0,Jahmyr Gibbs,player_1st_td,yes,,
1,David Montgomery,player_1st_td,yes,,
2,Amon-Ra St. Brown,player_1st_td,yes,,
3,D'Andre Swift,player_1st_td,yes,,
4,Sam LaPorta,player_1st_td,yes,,


In [6]:
cols = ['player','market_std']
if 'mu' in p_params.columns: cols.append('mu')
if 'sigma' in p_params.columns: cols.append('sigma')
if 'lam' in p_params.columns: cols.append('lam')
p_params[[c for c in cols if c in p_params.columns]].head(5)


Unnamed: 0,player,market_std,mu,sigma,lam
0,Jahmyr Gibbs,anytime_td,0.25,,0.25
1,David Montgomery,anytime_td,0.25,,0.25
2,Amon-Ra St. Brown,anytime_td,0.25,,0.25
3,D'Andre Swift,anytime_td,0.25,,0.25
4,Sam LaPorta,anytime_td,0.25,,0.25


## 4) Market overview (counts)

In [18]:
print('Top markets in props:')
print(p_props['market_std'].value_counts().head(40).to_string())

print('\nTop markets in params:')
print(p_params['market_std'].value_counts().head(25).to_string())


Top markets in props:
market_std
player_anytime_td            2819
player_1st_td                2072
player_last_td               1889
player_reception_yds         1776
player_receptions            1692
player_reception_longest      946
player_rush_yds               888
player_rush_attempts          576
player_pass_tds               396
player_pass_yds               395
player_rush_longest           360
player_pass_attempts          318
player_pass_completions       318
player_pass_interceptions     294

Top markets in params:
market_std
anytime_td            462
recv_yds              156
receptions            145
rush_yds               71
rush_attempts          62
pass_attempts          29
pass_yds               29
pass_tds               29
pass_interceptions     29
pass_completions       29


## 5) Readiness checks (keys & required fields)

In [8]:
keys_props  = ['player_key','name_std','market_std','point','price']
keys_params = ['player_key','name_std','market_std','mu','sigma','lam']

def null_report(df, cols):
    d = {}
    for c in cols:
        d[c] = df[c].isna().mean() if c in df.columns else 1.0
    return pd.Series(d).sort_values(ascending=False)

print('props key null rates:')
print(null_report(p_props,  keys_props).round(3).to_string())

print('\nparams key null rates:')
print(null_report(p_params, keys_params).round(3).to_string())


props key null rates:
point         0.46
player_key    0.00
name_std      0.00
market_std    0.00
price         0.00

params key null rates:
lam           0.5
sigma         0.5
name_std      0.0
player_key    0.0
mu            0.0
market_std    0.0


## 6) Join diagnostics (pre‑merge)

In [9]:
def join_miss(left_df, right_df, on_cols):
    on_cols = [c for c in on_cols if c in left_df.columns and c in right_df.columns]
    if not on_cols:
        return None, None
    lk = left_df[on_cols].astype(str).agg('|'.join, axis=1)
    rk = set(right_df[on_cols].astype(str).agg('|'.join, axis=1))
    missing = ~lk.isin(rk)
    return missing, on_cols

miss_no_point, used_no_point = join_miss(p_props, p_params, ['player_key','market_std'])
miss_with_point, used_with_point = join_miss(p_props, p_params, ['player_key','market_std','point_key'])

print('Join A (no point): used cols ->', used_no_point,
      '| unmatched:', None if miss_no_point is None else f"{int(miss_no_point.sum())}/{len(p_props)} ({miss_no_point.mean():.1%})")
print('Join B (with point): used cols ->', used_with_point,
      '| unmatched:', None if miss_with_point is None else f"{int(miss_with_point.sum())}/{len(p_props)} ({miss_with_point.mean():.1%})")

import pandas as pd
if miss_no_point is not None:
    a = (p_props.assign(_miss=miss_no_point)
               .groupby('market_std', as_index=False)
               .agg(total=('market_std','size'),
                    unmatched=('_miss','sum'))
               .assign(unmatched_pct=lambda d:(d['unmatched']/d['total']).round(3))
               .sort_values(['unmatched_pct','total'], ascending=[False, False]))
    a.head(20)


Join A (no point): used cols -> ['player_key', 'market_std'] | unmatched: 14739/14739 (100.0%)
Join B (with point): used cols -> ['player_key', 'market_std', 'point_key'] | unmatched: 14739/14739 (100.0%)


In [10]:
if miss_with_point is not None:
    b = (p_props.assign(_miss=miss_with_point)
               .groupby('market_std', as_index=False)
               .agg(total=('market_std','size'),
                    unmatched=('_miss','sum'))
               .assign(unmatched_pct=lambda d:(d['unmatched']/d['total']).round(3))
               .sort_values(['unmatched_pct','total'], ascending=[False, False]))
    b.head(20)


## 7) Samples of unmatched (Join A, core markets)

In [11]:
core = {'recv_yds','receptions','rush_yds','rush_attempts','pass_yds','pass_attempts','pass_completions','pass_tds','interceptions','anytime_td'}
if miss_no_point is not None:
    samples = p_props.loc[miss_no_point & p_props['market_std'].isin(core),
                          ['player','name_std','player_key','market_std','name','point','price']].head(30)
    samples


## 8) (Optional) Compare to merged edges coverage

In [12]:
if Path(PATH_EDGES).exists():
    e = pd.read_csv(PATH_EDGES, low_memory=False)
    e['market_std'] = e['market_std'].astype(str).str.lower()
    modeled = e[['model_prob','mkt_prob']].notna().all(axis=1)
    cov = (e.assign(modeled=modeled)
             .groupby('market_std', as_index=False)
             .agg(total=('market_std','size'),
                  modeled=('modeled','sum'))
             .assign(modeled_pct=lambda d:(d['modeled']/d['total']).round(3))
             .sort_values(['modeled_pct','total'], ascending=[True, False]))
    cov.head(20)
else:
    print('No edges file present at', PATH_EDGES)


In [14]:
# Point the notebook to your repo's /scripts folder
import os, sys

# 1) If you know the absolute path, just use it:
# sys.path.insert(0, "/Users/pwitt/NFL-2025/scripts")

# 2) Or try to auto-find it from common locations:
candidates = [
    os.getcwd(),
    os.path.abspath(os.path.join(os.getcwd(), "NFL-2025")),
    os.path.abspath(os.path.join(os.getcwd(), "..")),
    os.path.abspath(os.path.join(os.getcwd(), "../NFL-2025")),
]
added = None
for base in candidates:
    p = os.path.join(base, "scripts", "common_markets.py")
    if os.path.exists(p):
        sys.path.insert(0, os.path.join(base, "scripts"))
        added = os.path.join(base, "scripts")
        break

print("Using scripts path:", added or "(not found)")


Using scripts path: /Users/pwitt/NFL-2025/scripts


In [15]:
# Re-normalize props with the updated std_market and check markets again
from common_markets import standardize_input
p_props2 = standardize_input(props.copy())
print("Top markets in props (after mapping):")
print(p_props2["market_std"].value_counts().head(15).to_string())

# Recompute join diagnostics WITHOUT point_key
lk = p_props2[["player_key","market_std"]].astype(str).agg("|".join, axis=1)
rk = set(p_params[["player_key","market_std"]].astype(str).agg("|".join, axis=1))
miss = ~lk.isin(rk)
print("\nUnmatched (no point):", int(miss.sum()), "/", len(p_props2), f"({miss.mean():.1%})")

# Per-market unmatched for core markets
core = {"recv_yds","receptions","rush_yds","rush_attempts","pass_yds","pass_attempts","pass_completions","pass_tds","interceptions","anytime_td"}
print("\nCore market unmatched rates (no point):")
print((p_props2.assign(_miss=miss)
        .loc[p_props2["market_std"].isin(core)]
        .groupby("market_std")["_miss"].mean()
        .sort_values(ascending=False)
        .round(3)
        .to_string()))


Top markets in props (after mapping):
market_std
anytime_td           2819
first_td             2072
last_td              1889
recv_yds             1776
receptions           1692
reception_longest     946
rush_yds              888
rush_attempts         576
pass_tds              396
pass_yds              395
rush_longest          360
pass_attempts         318
pass_completions      318
interceptions         294


KeyError: "['player_key'] not in index"

In [17]:
# Recompute join diagnostics WITHOUT point_key (auto-pick id col)

cols = [c for c in ["player","name_std","player_key","market_std","name","point","price"] if c in p_props2.columns]
p_props2.loc[miss & p_props2["market_std"].isin(core), cols].head(30)

id_col = None
for cand in ("player_key", "name_std"):
    if cand in p_props2.columns and cand in p_params.columns:
        id_col = cand
        break
if id_col is None:
    raise RuntimeError("Neither player_key nor name_std exist on BOTH props and params after normalization.")

print("Using ID column:", id_col)

lk = p_props2[[id_col, "market_std"]].astype(str).agg("|".join, axis=1)
rk = set(p_params[[id_col, "market_std"]].astype(str).agg("|".join, axis=1))
miss = ~lk.isin(rk)

print("Unmatched (no point):", int(miss.sum()), "/", len(p_props2), f"({miss.mean():.1%})")

core = {"recv_yds","receptions","rush_yds","rush_attempts","pass_yds","pass_attempts","pass_completions","pass_tds","interceptions","anytime_td"}
print("\nCore market unmatched rates:")
print((p_props2.assign(_miss=miss)
        .loc[p_props2["market_std"].isin(core)]
        .groupby("market_std")["_miss"].mean()
        .sort_values(ascending=False)
        .round(3)
        .to_string()))

# Sample rows from core markets that fail Join A
samples = p_props2.loc[miss & p_props2["market_std"].isin(core),
                       ["player","name_std","player_key","market_std","name","point","price"]].head(30)
samples


Using ID column: name_std
Unmatched (no point): 5561 / 14739 (37.7%)

Core market unmatched rates:
market_std
interceptions       1.0
anytime_td          0.0
pass_attempts       0.0
pass_completions    0.0
pass_tds            0.0
pass_yds            0.0
receptions          0.0
recv_yds            0.0
rush_attempts       0.0
rush_yds            0.0


KeyError: "['player_key'] not in index"

In [None]:
cols = [c for c in ["player","name_std","player_key","market_std","name","point","price"] if c in p_props2.columns]
p_props2.loc[miss & p_props2["market_std"].isin(core), cols].head(30)
