
# Fourth & Value — Minimal Join Prep Notebook

This notebook **only** loads your CSVs and prepares columns needed for joins.  
It keeps things **strict by default** and provides optional cells for name-based joins you can ignore.

**Steps:**  
1. Update the file paths below if needed.  
2. Run cells 1–6 to load & prep.  
3. Use the strict join cell if both frames have `player_key`.  
4. (Optional) Try name-based joins if you want to compare hit rates.  
5. Use the anti-join helper to inspect unmatched keys.


In [1]:

# --- File paths ---
PROPS_CSV  = "/Users/pwitt/fourth-and-value/data/props/latest_all_props.csv"
PARAMS_CSV = "/Users/pwitt/fourth-and-value/data/props/params_week4.csv"   # change week as needed

print(PROPS_CSV)
print(PARAMS_CSV)


/Users/pwitt/fourth-and-value/data/props/latest_all_props.csv
/Users/pwitt/fourth-and-value/data/props/params_week4.csv


In [2]:

import pandas as pd
import re

# --- Minimal canonical market mapper (small, surgical) ---
CANON_MAP = {
    "player_rushing_yards": "rush_yds",
    "player_reception_yds": "recv_yds",
    "player_receiving_yds": "recv_yds",
    "player_pass_yds": "pass_yds",
    "player_receptions": "receptions",
    "player_rush_attempts": "rush_attempts",
    "player_passing_attempts": "pass_attempts",
    "player_passing_completions": "pass_completions",
    "player_passing_tds": "pass_tds",
    "player_interceptions": "pass_interceptions",
    "player_anytime_td": "anytime_td",
    "player_1st_td": "first_td",
    "player_last_td": "last_td",
    "player_rush_longest": "rush_longest",
    "player_reception_longest": "reception_longest",
}
def canon_market(x: str) -> str:
    x = str(x or "").strip()
    return CANON_MAP.get(x, x)

def std_name(s: str) -> str:
    if s is None: return ""
    s = str(s).lower().strip()
    s = re.sub(r"[^\w\s]", "", s)     # drop punctuation
    s = re.sub(r"\s+", " ", s)        # collapse whitespace
    return s

def name_slug(s: str) -> str:
    s = std_name(s)
    if not s: return ""
    parts = s.split()
    if len(parts) == 1:
        return parts[0]
    return f"{parts[0][0]}{parts[-1]}"


In [3]:

# --- Load ---
props  = pd.read_csv(PROPS_CSV, low_memory=False)
params = pd.read_csv(PARAMS_CSV, low_memory=False)

print("[props] rows:", len(props), "cols:", len(props.columns))
print("[params] rows:", len(params), "cols:", len(params.columns))

# Peek
display(props.head(3))
display(params.head(3))


[props] rows: 17473 cols: 13
[params] rows: 1146 cols: 10


Unnamed: 0,game_id,commence_time,home_team,away_team,game,bookmaker,bookmaker_title,market,market_std,player,name,price,point
0,4dd631102a977fd398f7ea594ed208f2,2025-09-28T13:33:00Z,Pittsburgh Steelers,Minnesota Vikings,Minnesota Vikings @ Pittsburgh Steelers,draftkings,DraftKings,player_anytime_td,player_anytime_td,Jordan Mason,Yes,160,
1,4dd631102a977fd398f7ea594ed208f2,2025-09-28T13:33:00Z,Pittsburgh Steelers,Minnesota Vikings,Minnesota Vikings @ Pittsburgh Steelers,draftkings,DraftKings,player_anytime_td,player_anytime_td,Justin Jefferson,Yes,230,
2,4dd631102a977fd398f7ea594ed208f2,2025-09-28T13:33:00Z,Pittsburgh Steelers,Minnesota Vikings,Minnesota Vikings @ Pittsburgh Steelers,draftkings,DraftKings,player_anytime_td,player_anytime_td,Jordan Addison,Yes,450,


Unnamed: 0,player_display_name,name_std,market_std,n_games,mu,sigma,lam,season,week,built_at
0,Jordan Mason,jordan mason,anytime_td,3.0,,,0.666667,2025,4,2025-09-28T23:29:31Z
1,Justin Jefferson,justin jefferson,anytime_td,3.0,,,0.333333,2025,4,2025-09-28T23:29:31Z
2,Jordan Addison,jordan addison,anytime_td,,,,,2025,4,2025-09-28T23:29:31Z


In [4]:
params.rename(columns={'player_display_name':'player'},inplace=True)

In [None]:
CANON_MAP = {
    "player_rushing_yards": "rush_yds",
    "player_reception_yds": "recv_yds",
    "player_receiving_yds": "recv_yds",
    "player_pass_yds": "pass_yds",
    "player_receptions": "receptions",
    "player_rush_attempts": "rush_attempts",
    "player_passing_attempts": "pass_attempts",
    "player_passing_completions": "pass_completions",
    "player_passing_tds": "pass_tds",
    "player_interceptions": "pass_interceptions",
    "player_anytime_td": "anytime_td",
    "player_1st_td": "first_td",
    "player_last_td": "last_td",
    "player_rush_longest": "rush_longest",
    "player_reception_longest": "reception_longest",
}

In [7]:
props['market_std'] = props['market_std'].map(CANON_MAP)

In [11]:
t=params.merge(props,how="left",on=['player','market_std'])

In [12]:
t[t.market_std=='rush_attempts']['bookmaker'].unique()

array(['draftkings', 'bovada', 'fanduel', 'betonlineag', 'betrivers',
       'fanatics', 'betmgm'], dtype=object)

In [13]:
props.market_std.unique()

array(['anytime_td', nan, 'pass_yds', 'last_td', 'recv_yds', 'receptions',
       'rush_attempts', 'first_td', 'reception_longest', 'rush_longest'],
      dtype=object)

In [16]:
t[t.market_std=='rush_attempts'].to_csv('rush_attempts')

In [19]:
t[t.market_std=='rush_attempts']['Side']

KeyError: 'Side'

In [20]:
t.columns

Index(['player', 'name_std', 'market_std', 'n_games', 'mu', 'sigma', 'lam',
       'season', 'week', 'built_at', 'game_id', 'commence_time', 'home_team',
       'away_team', 'game', 'bookmaker', 'bookmaker_title', 'market', 'name',
       'price', 'point'],
      dtype='object')

In [None]:

# --- Normalize market_std on both frames ---
props["market_std"]  = props.get("market_std", props.get("market", "")).map(canon_market)
params["market_std"] = params.get("market_std", params.get("market", "")).map(canon_market)

# --- Prepare names ---
props_name_src  = props.get("name", props.get("player", ""))
params_name_src = params.get("name_std", params.get("player_display_name", ""))

props["name_std"]  = pd.Series(props_name_src, index=props.index).apply(std_name)
params["name_std"] = pd.Series(params_name_src, index=params.index).apply(std_name)

props["name_slug"]  = props["name_std"].apply(name_slug)
params["name_slug"] = params["name_std"].apply(name_slug)

# Keep light previews
print("[props] key cols present:", [c for c in ["player_key","name","player","name_std","name_slug","market_std"] if c in props.columns])
print("[params] key cols present:", [c for c in ["player_key","player_display_name","name_std","name_slug","market_std"] if c in params.columns])

display(props[["game_id","bookmaker_title","market","market_std","player","name","name_std","name_slug"]].head(5))
display(params[["player_display_name","name_std","name_slug","market_std","mu","sigma","lam"]].head(5))


In [None]:

# --- Key coverage & market overview ---
def nz(s): return int(s.notna().sum())

print("\n[player_key coverage]")
pk_props  = nz(props["player_key"])  if "player_key" in props.columns  else 0
pk_params = nz(params["player_key"]) if "player_key" in params.columns else 0
print(f"props player_key non-null:  {pk_props}/{len(props)}")
print(f"params player_key non-null: {pk_params}/{len(params)}")

print("\n[market counts]")
mc_props  = props["market_std"].value_counts(dropna=False).head(20)
mc_params = params["market_std"].value_counts(dropna=False).head(20)
display(mc_props.to_frame("props_rows"))
display(mc_params.to_frame("params_rows"))

# Save prepped copies (optional, for experimentation)
props_prep_path  = "data/qc/_props_prep.csv"
params_prep_path = "data/qc/_params_prep.csv"
import os
os.makedirs("data/qc", exist_ok=True)
props.to_csv(props_prep_path, index=False)
params.to_csv(params_prep_path, index=False)
print(f"[wrote] {props_prep_path}")
print(f"[wrote] {params_prep_path}")



## Strict Join (player_key + market_std)

Only run this if **both** `props` and `params` have a non-null `player_key` column.


In [None]:

if ("player_key" in props.columns) and ("player_key" in params.columns):
    left  = props.dropna(subset=["player_key","market_std"])
    right = params.dropna(subset=["player_key","market_std"])[["player_key","market_std","mu","sigma","lam","name_std","name_slug"]]
    merged_strict = left.merge(right, how="inner", on=["player_key","market_std"], suffixes=("", "_param"))
    hit = len(merged_strict)
    print(f"[strict] matched rows: {hit} / {len(props)} ({hit/len(props):.1%})")
    display(merged_strict.head(10))
else:
    print("[strict] skipped — player_key not present on both frames.")



## Optional: Name-based Joins (for diagnostics only)

If you want to compare coverage (not required if you are staying strict):  
- Name Std + Market  
- Name Slug + Market  


In [None]:

# Name Std + Market
left  = props.dropna(subset=["name_std","market_std"])
right = params.dropna(subset=["name_std","market_std"])[["name_std","market_std","mu","sigma","lam","player_key"]]
merged_name = left.merge(right, how="inner", on=["name_std","market_std"], suffixes=("", "_param"))
print(f"[name_std] matched rows: {len(merged_name)} / {len(props)} ({len(merged_name)/len(props):.1%})")
display(merged_name.head(10))

# Name Slug + Market
left  = props.dropna(subset=["name_slug","market_std"])
right = params.dropna(subset=["name_slug","market_std"])[["name_slug","market_std","mu","sigma","lam","player_key"]]
merged_slug = left.merge(right, how="inner", on=["name_slug","market_std"], suffixes=("", "_param"))
print(f"[name_slug] matched rows: {len(merged_slug)} / {len(props)} ({len(merged_slug)/len(props):.1%})")
display(merged_slug.head(10))



## Anti-Join Helper

Use this to inspect **unmatched** rows for a given key set.  
Change `KEYS` to one of:
- `["player_key","market_std"]` (strict)  
- `["name_std","market_std"]`  
- `["name_slug","market_std"]`


In [None]:

from typing import List

def anti_join_sample(left: pd.DataFrame, right: pd.DataFrame, keys: List[str], sample=20):
    lk = left.dropna(subset=keys)[keys].astype(str).drop_duplicates()
    rk = right.dropna(subset=keys)[keys].astype(str).drop_duplicates()
    aj = lk.merge(rk, how="left", on=keys, indicator=True)
    miss = aj[aj["_merge"]=="left_only"].drop(columns=["_merge"])
    print(f"[anti-join] unmatched key rows: {len(miss)}")
    display(miss.head(sample))

# Example: strict anti-join if player_key exists in both
keys = ["player_key","market_std"] if ("player_key" in props.columns and "player_key" in params.columns) else ["name_std","market_std"]
anti_join_sample(props, params, keys=keys, sample=25)



### Next steps
- If strict hit-rate is low, ensure `player_key` is present on **both** frames and derived the same way.
- If you want to **propagate** `player_key` into params (without fuzzy matching), merge a `(name_std, market_std) → player_key` map from props into params during params build.
- Keep joins modular: run strict, measure; then (optionally) compare name-based to diagnose coverage gaps.
