
# NFL — Build 2025 Weekly from PBP and Align to 2024 Weekly
**Purpose:**  
1) Load historical weekly data (e.g., 2024 file) and treat its columns/dtypes as the **target schema**.  
2) Load **2025 play-by-play (PBP)** and aggregate to weekly player-level stats that **align** with the target schema.  
3) Provide **checks** at each step (schema, dtypes, nulls, coverage) before concatenating and writing outputs.

> ⚠️ This notebook **does not** fetch from the internet. It expects files to already exist locally in your repo (e.g., `data/pbp/pbp_2025.csv`, `data/weekly/weekly_2024.csv`).  
> If your file names differ, just edit the **Config** cell below.


In [3]:
# --- Repo-root autodetect + Config ---
from pathlib import Path
import os, numpy as np, pandas as pd

# Find project root by walking up until we see markers
def find_repo_root(markers=("Makefile","data")):
    here = Path.cwd()
    for p in [here] + list(here.parents):
        if all((p / m).exists() for m in markers):
            return p
    return here  # fallback to CWD if markers not found

BASE = find_repo_root()
print(f"[repo root] {BASE}")

# Set paths RELATIVE to repo root
HIST_WEEKLY_PATH = BASE / "data/weekly_player_stats.csv"   # target schema
PBP_2025_PATH    = BASE / "data/play_by_play.csv"         # raw 2025 PBP export

# Outputs
OUT_2025_WEEKLY_FROM_PBP = BASE / "data/weekly/weekly_2025_from_pbp.csv"
OUT_MERGED               = BASE / "data/weekly/weekly_merged_thru_2025.csv"
QC_DIR                   = BASE / "data/qc"

# Make sure dirs exist
QC_DIR.mkdir(parents=True, exist_ok=True)
(OUT_2025_WEEKLY_FROM_PBP.parent).mkdir(parents=True, exist_ok=True)
(OUT_MERGED.parent).mkdir(parents=True, exist_ok=True)

pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)


[repo root] /Users/pwitt/NFL-2025


In [4]:

def read_csv_safe(path, **kwargs):
    if not os.path.exists(path):
        raise FileNotFoundError(f"File not found: {path}")
    df = pd.read_csv(path, **kwargs)
    print(f"[read_csv_safe] {path}: shape={df.shape}")
    return df

def pick_first_col(df, candidates):
    """Return the first column name from `candidates` that exists in df (or None)."""
    for c in candidates:
        if c in df.columns:
            return c
    return None

def show_schema(df, name, head=3):
    print(f"\n[{name}] shape={df.shape}")
    print(f"[{name}] columns ({len(df.columns)}):")
    print(list(df.columns))
    print(f"\n[{name}] dtypes:")
    print(df.dtypes.head(40))
    if head:
        print(f"\n[{name}] head:")
        display(df.head(head))

def null_report(df, name, top=30):
    na = df.isna().mean().sort_values(ascending=False)
    print(f"\n[{name}] null ratios (top {top}):")
    display(na.head(top).to_frame("null_ratio"))

def dtype_alignment(source, target):
    """Return {col: target_dtype} for common columns with mismatched dtypes."""
    plan = {}
    for c in source.columns.intersection(target.columns):
        sdt = source[c].dtype
        tdt = target[c].dtype
        if sdt != tdt:
            plan[c] = tdt
    return plan


## 1) Load historical weekly (target schema)

In [5]:

hist = read_csv_safe(HIST_WEEKLY_PATH)
show_schema(hist, "hist (target schema)")

TARGET_COLS = list(hist.columns)
print("\n[TARGET_COLS] captured", len(TARGET_COLS), "columns")


[read_csv_safe] /Users/pwitt/NFL-2025/data/weekly_player_stats.csv: shape=(32991, 53)

[hist (target schema)] shape=(32991, 53)
[hist (target schema)] columns (53):
['player_id', 'player_name', 'player_display_name', 'position', 'position_group', 'headshot_url', 'team', 'season', 'week', 'season_type', 'opponent_team', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr', 'target_share', 'a

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,team,season,week,season_type,opponent_team,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr
0,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2018,1,REG,HOU,26,39,277.0,3,1.0,2.0,10.0,0,0,320.0,93.0,16.0,8.182894,0,0.865625,0.15032,1,2.0,0,0.0,0.0,1.0,1.837627,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,,0.0,21.28,21.28
1,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2018,2,REG,JAX,24,35,234.0,2,0.0,2.0,14.0,1,1,213.0,136.0,12.0,4.945607,0,1.098592,0.128772,3,10.0,0,0.0,0.0,2.0,2.328974,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,,0.0,16.36,16.36
2,00-0019596,T.Brady,Tom Brady,QB,QB,https://static.www.nfl.com/image/private/f_aut...,NE,2018,3,REG,DET,14,26,133.0,1,1.0,2.0,13.0,0,0,333.0,48.0,4.0,-10.869923,0,0.399399,-0.008199,1,2.0,0,0.0,0.0,1.0,-0.245843,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,,0,,,,,0.0,7.52,7.52



[TARGET_COLS] captured 53 columns


## 2) Load PBP 2025

In [6]:

pbp25 = read_csv_safe(PBP_2025_PATH)
show_schema(pbp25, "pbp25", head=2)

# Basic coverage check
season_col = pick_first_col(pbp25, ["season","Season"])
week_col   = pick_first_col(pbp25, ["week","Week"])

if season_col is None or week_col is None:
    raise SystemExit("❌ Expected 'season' and 'week' columns in PBP. Please adjust candidates in the code.")

print("\n[pbp25] seasons present:", sorted(pbp25[season_col].dropna().unique().tolist()))
print("[pbp25] weeks present:",   sorted(pbp25[week_col].dropna().unique().tolist()))


[read_csv_safe] /Users/pwitt/NFL-2025/data/play_by_play.csv: shape=(176, 397)

[pbp25] shape=(176, 397)
[pbp25] columns (397):
['play_id', 'game_id', 'old_game_id_x', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_point_result', 'two_point_conv_result', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'timeout_team', 'td_team', 'td_player_name', 'td_player_id', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'total_home_score', 'total_awa

Unnamed: 0,play_id,game_id,old_game_id_x,home_team,away_team,season_type,week,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,drive,sp,qtr,down,goal_to_go,time,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,shotgun,no_huddle,qb_dropback,qb_kneel,qb_spike,qb_scramble,pass_length,pass_location,air_yards,yards_after_catch,run_location,run_gap,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,home_timeouts_remaining,away_timeouts_remaining,timeout,timeout_team,td_team,td_player_name,td_player_id,posteam_timeouts_remaining,defteam_timeouts_remaining,total_home_score,total_away_score,posteam_score,defteam_score,score_differential,posteam_score_post,defteam_score_post,score_differential_post,no_score_prob,opp_fg_prob,opp_safety_prob,opp_td_prob,fg_prob,safety_prob,td_prob,extra_point_prob,two_point_conversion_prob,ep,epa,total_home_epa,total_away_epa,total_home_rush_epa,total_away_rush_epa,total_home_pass_epa,total_away_pass_epa,air_epa,yac_epa,comp_air_epa,comp_yac_epa,total_home_comp_air_epa,total_away_comp_air_epa,total_home_comp_yac_epa,total_away_comp_yac_epa,total_home_raw_air_epa,total_away_raw_air_epa,total_home_raw_yac_epa,total_away_raw_yac_epa,wp,def_wp,home_wp,away_wp,wpa,vegas_wpa,vegas_home_wpa,home_wp_post,...,play_clock,play_deleted,play_type_nfl,special_teams_play,st_play_type,end_clock_time,end_yard_line,fixed_drive,fixed_drive_result,drive_real_start_time,drive_play_count,drive_time_of_possession,drive_first_downs,drive_inside20,drive_ended_with_score,drive_quarter_start,drive_quarter_end,drive_yards_penalized,drive_start_transition,drive_end_transition,drive_game_clock_start,drive_game_clock_end,drive_start_yard_line,drive_end_yard_line,drive_play_id_started,drive_play_id_ended,away_score,home_score,location,result,total,spread_line,total_line,div_game,roof,surface,temp,wind,home_coach,away_coach,stadium_id,game_stadium,aborted_play,success,passer,passer_jersey_number,rusher,rusher_jersey_number,receiver,receiver_jersey_number,pass,rush,first_down,special,play,passer_id,rusher_id,receiver_id,name,jersey_number,id,fantasy_player_name,fantasy_player_id,fantasy,fantasy_id,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe,nflverse_game_id,old_game_id_y,possession_team,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense,ngs_air_yards,time_to_throw,was_pressure,route,defense_man_zone_type,defense_coverage_type,offense_names,defense_names,offense_positions,defense_positions,offense_numbers,defense_numbers
0,1.0,2024_22_KC_PHI,2025020900,PHI,KC,POST,22,,,,,,2025-02-09,900.0,1800.0,3600.0,Half1,0.0,,0.0,1.0,,0,15:00,KC 35,0.0,,GAME,,,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,3.0,3.0,,,,,,,,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.008251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546262,0.453738,0.546262,0.453738,0.0,0.0,0.0,,...,0,0.0,GAME_START,0.0,,,,1.0,Punt,,,,,,,,,,,,,,,,,,22,40,Neutral,18,62,-1.5,48.5,0,dome,sportturf,,,Nick Sirianni,Andy Reid,NOR00,Mercedes-Benz Superdome,0.0,0.0,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,40.0,2024_22_KC_PHI,2025020900,PHI,KC,POST,22,PHI,home,KC,KC,35.0,2025-02-09,900.0,1800.0,3600.0,Half1,0.0,1.0,0.0,1.0,,0,15:00,KC 35,0.0,10.0,7-H.Butker kicks 65 yards from KC 35 to end zo...,kickoff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,65.0,,,3.0,3.0,0.0,,,,,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003473,0.128879,0.00227,0.272088,0.208195,0.00324,0.381854,0.0,0.0,1.008251,0.274956,0.274956,-0.274956,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546262,0.453738,0.546262,0.453738,0.006226,0.006062,0.006062,0.552488,...,0,0.0,KICK_OFF,1.0,,2025-02-09T23:41:28.100Z,,1.0,Punt,2025-02-09T23:41:24.460Z,5.0,3:39,1.0,0.0,0.0,1.0,1.0,-10.0,KICKOFF,PUNT,15:00,11:21,PHI 30,PHI 40,40.0,217.0,22,40,Neutral,18,62,-1.5,48.5,0,dome,sportturf,,,Nick Sirianni,Andy Reid,NOR00,Mercedes-Benz Superdome,0.0,1.0,,,,,,,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,0.0,1.0,0.274956,,,,,,,,2024_22_KC_PHI,2025021000.0,PHI,,"2 CB, 1 FB, 1 FS, 2 ILB, 1 OLB, 2 RB, 1 TE, 1 WR",0.0,"3 CB, 2 FS, 2 ILB, 1 K, 1 OLB, 1 TE, 1 WR",0.0,00-0038043;00-0037198;00-0039324;00-0034377;00...,00-0034377;00-0039140;00-0035099;00-0037461;00...,00-0038043;00-0037198;00-0039324;00-0037193;00...,11.0,11.0,,,False,,,,Avonte Maddox;Kelee Ringo;Khari Blasingame;Tri...,Joshua Williams;Nazeeh Johnson;Christian Rolan...,CB;CB;FB;FS;ILB;ILB;OLB;RB;RB;TE;WR;NA;NA,CB;CB;CB;FS;FS;ILB;ILB;K;OLB;TE;WR;NA;NA,29;22;48;36;43;54;58;14;28;84;89;NA;NA,2;13;30;6;27;44;50;7;54;88;84;NA;NA



[pbp25] seasons present: [2024]
[pbp25] weeks present: [22]


In [6]:

pbp25 = read_csv_safe(PBP_2025_PATH)
show_schema(pbp25, "pbp25", head=2)

# Basic coverage check
season_col = pick_first_col(pbp25, ["season","Season"])
week_col   = pick_first_col(pbp25, ["week","Week"])

if season_col is None or week_col is None:
    raise SystemExit("❌ Expected 'season' and 'week' columns in PBP. Please adjust candidates in the code.")

print("\n[pbp25] seasons present:", sorted(pbp25[season_col].dropna().unique().tolist()))
print("[pbp25] weeks present:",   sorted(pbp25[week_col].dropna().unique().tolist()))


[read_csv_safe] /Users/pwitt/NFL-2025/data/play_by_play.csv: shape=(176, 397)

[pbp25] shape=(176, 397)
[pbp25] columns (397):
['play_id', 'game_id', 'old_game_id_x', 'home_team', 'away_team', 'season_type', 'week', 'posteam', 'posteam_type', 'defteam', 'side_of_field', 'yardline_100', 'game_date', 'quarter_seconds_remaining', 'half_seconds_remaining', 'game_seconds_remaining', 'game_half', 'quarter_end', 'drive', 'sp', 'qtr', 'down', 'goal_to_go', 'time', 'yrdln', 'ydstogo', 'ydsnet', 'desc', 'play_type', 'yards_gained', 'shotgun', 'no_huddle', 'qb_dropback', 'qb_kneel', 'qb_spike', 'qb_scramble', 'pass_length', 'pass_location', 'air_yards', 'yards_after_catch', 'run_location', 'run_gap', 'field_goal_result', 'kick_distance', 'extra_point_result', 'two_point_conv_result', 'home_timeouts_remaining', 'away_timeouts_remaining', 'timeout', 'timeout_team', 'td_team', 'td_player_name', 'td_player_id', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining', 'total_home_score', 'total_awa

Unnamed: 0,play_id,game_id,old_game_id_x,home_team,away_team,season_type,week,posteam,posteam_type,defteam,side_of_field,yardline_100,game_date,quarter_seconds_remaining,half_seconds_remaining,game_seconds_remaining,game_half,quarter_end,drive,sp,qtr,down,goal_to_go,time,yrdln,ydstogo,ydsnet,desc,play_type,yards_gained,shotgun,no_huddle,qb_dropback,qb_kneel,qb_spike,qb_scramble,pass_length,pass_location,air_yards,yards_after_catch,run_location,run_gap,field_goal_result,kick_distance,extra_point_result,two_point_conv_result,home_timeouts_remaining,away_timeouts_remaining,timeout,timeout_team,td_team,td_player_name,td_player_id,posteam_timeouts_remaining,defteam_timeouts_remaining,total_home_score,total_away_score,posteam_score,defteam_score,score_differential,posteam_score_post,defteam_score_post,score_differential_post,no_score_prob,opp_fg_prob,opp_safety_prob,opp_td_prob,fg_prob,safety_prob,td_prob,extra_point_prob,two_point_conversion_prob,ep,epa,total_home_epa,total_away_epa,total_home_rush_epa,total_away_rush_epa,total_home_pass_epa,total_away_pass_epa,air_epa,yac_epa,comp_air_epa,comp_yac_epa,total_home_comp_air_epa,total_away_comp_air_epa,total_home_comp_yac_epa,total_away_comp_yac_epa,total_home_raw_air_epa,total_away_raw_air_epa,total_home_raw_yac_epa,total_away_raw_yac_epa,wp,def_wp,home_wp,away_wp,wpa,vegas_wpa,vegas_home_wpa,home_wp_post,...,play_clock,play_deleted,play_type_nfl,special_teams_play,st_play_type,end_clock_time,end_yard_line,fixed_drive,fixed_drive_result,drive_real_start_time,drive_play_count,drive_time_of_possession,drive_first_downs,drive_inside20,drive_ended_with_score,drive_quarter_start,drive_quarter_end,drive_yards_penalized,drive_start_transition,drive_end_transition,drive_game_clock_start,drive_game_clock_end,drive_start_yard_line,drive_end_yard_line,drive_play_id_started,drive_play_id_ended,away_score,home_score,location,result,total,spread_line,total_line,div_game,roof,surface,temp,wind,home_coach,away_coach,stadium_id,game_stadium,aborted_play,success,passer,passer_jersey_number,rusher,rusher_jersey_number,receiver,receiver_jersey_number,pass,rush,first_down,special,play,passer_id,rusher_id,receiver_id,name,jersey_number,id,fantasy_player_name,fantasy_player_id,fantasy,fantasy_id,out_of_bounds,home_opening_kickoff,qb_epa,xyac_epa,xyac_mean_yardage,xyac_median_yardage,xyac_success,xyac_fd,xpass,pass_oe,nflverse_game_id,old_game_id_y,possession_team,offense_formation,offense_personnel,defenders_in_box,defense_personnel,number_of_pass_rushers,players_on_play,offense_players,defense_players,n_offense,n_defense,ngs_air_yards,time_to_throw,was_pressure,route,defense_man_zone_type,defense_coverage_type,offense_names,defense_names,offense_positions,defense_positions,offense_numbers,defense_numbers
0,1.0,2024_22_KC_PHI,2025020900,PHI,KC,POST,22,,,,,,2025-02-09,900.0,1800.0,3600.0,Half1,0.0,,0.0,1.0,,0,15:00,KC 35,0.0,,GAME,,,0.0,0.0,,0.0,0.0,0.0,,,,,,,,,,,3.0,3.0,,,,,,,,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.008251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546262,0.453738,0.546262,0.453738,0.0,0.0,0.0,,...,0,0.0,GAME_START,0.0,,,,1.0,Punt,,,,,,,,,,,,,,,,,,22,40,Neutral,18,62,-1.5,48.5,0,dome,sportturf,,,Nick Sirianni,Andy Reid,NOR00,Mercedes-Benz Superdome,0.0,0.0,,,,,,,0.0,0.0,,0.0,0.0,,,,,,,,,,,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,40.0,2024_22_KC_PHI,2025020900,PHI,KC,POST,22,PHI,home,KC,KC,35.0,2025-02-09,900.0,1800.0,3600.0,Half1,0.0,1.0,0.0,1.0,,0,15:00,KC 35,0.0,10.0,7-H.Butker kicks 65 yards from KC 35 to end zo...,kickoff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,65.0,,,3.0,3.0,0.0,,,,,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003473,0.128879,0.00227,0.272088,0.208195,0.00324,0.381854,0.0,0.0,1.008251,0.274956,0.274956,-0.274956,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.546262,0.453738,0.546262,0.453738,0.006226,0.006062,0.006062,0.552488,...,0,0.0,KICK_OFF,1.0,,2025-02-09T23:41:28.100Z,,1.0,Punt,2025-02-09T23:41:24.460Z,5.0,3:39,1.0,0.0,0.0,1.0,1.0,-10.0,KICKOFF,PUNT,15:00,11:21,PHI 30,PHI 40,40.0,217.0,22,40,Neutral,18,62,-1.5,48.5,0,dome,sportturf,,,Nick Sirianni,Andy Reid,NOR00,Mercedes-Benz Superdome,0.0,1.0,,,,,,,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,0.0,1.0,0.274956,,,,,,,,2024_22_KC_PHI,2025021000.0,PHI,,"2 CB, 1 FB, 1 FS, 2 ILB, 1 OLB, 2 RB, 1 TE, 1 WR",0.0,"3 CB, 2 FS, 2 ILB, 1 K, 1 OLB, 1 TE, 1 WR",0.0,00-0038043;00-0037198;00-0039324;00-0034377;00...,00-0034377;00-0039140;00-0035099;00-0037461;00...,00-0038043;00-0037198;00-0039324;00-0037193;00...,11.0,11.0,,,False,,,,Avonte Maddox;Kelee Ringo;Khari Blasingame;Tri...,Joshua Williams;Nazeeh Johnson;Christian Rolan...,CB;CB;FB;FS;ILB;ILB;OLB;RB;RB;TE;WR;NA;NA,CB;CB;CB;FS;FS;ILB;ILB;K;OLB;TE;WR;NA;NA,29;22;48;36;43;54;58;14;28;84;89;NA;NA,2;13;30;6;27;44;50;7;54;88;84;NA;NA



[pbp25] seasons present: [2024]
[pbp25] weeks present: [22]



## 3) Aggregate PBP into weekly player stats (passing / rushing / receiving)
This block constructs per-role aggregates and then coalesces them into a single per-player-week table.

> The column names in nflverse PBP can vary slightly across versions; the code below tries multiple common candidates and will raise if it can't find what's needed.


In [None]:

# --- Candidate column names for IDs/names/teams ---
PASSER_ID_COL   = pick_first_col(pbp25, ["passer_player_id", "passer_id", "passer_player_gsis_id"])
PASSER_NAME_COL = pick_first_col(pbp25, ["passer_player_name", "passer", "passer_name"])
RUSHER_ID_COL   = pick_first_col(pbp25, ["rusher_player_id", "rusher_id", "rusher_player_gsis_id"])
RUSHER_NAME_COL = pick_first_col(pbp25, ["rusher_player_name", "rusher", "rusher_name"])
RECV_ID_COL     = pick_first_col(pbp25, ["receiver_player_id", "receiver_id", "receiver_player_gsis_id"])
RECV_NAME_COL   = pick_first_col(pbp25, ["receiver_player_name", "receiver", "receiver_name"])

POSTEAM_COL     = pick_first_col(pbp25, ["posteam", "pos_team", "offense_team"])
DEFTEAM_COL     = pick_first_col(pbp25, ["defteam", "def_team", "defense_team"])

if POSTEAM_COL is None:
    raise SystemExit("❌ Could not find posteam column (tried 'posteam', 'pos_team', 'offense_team').")

# --- Boolean/event columns ---
PASS_ATT_COL   = pick_first_col(pbp25, ["pass_attempt", "pass"])
COMPLETE_COL   = pick_first_col(pbp25, ["complete_pass", "complete"])
INT_COL        = pick_first_col(pbp25, ["interception"])
SACK_COL       = pick_first_col(pbp25, ["sack"])

RUSH_ATT_COL   = pick_first_col(pbp25, ["rush_attempt"])
RECEPTION_COL  = pick_first_col(pbp25, ["reception"])

# Touchdowns (more robust to use specific suffix columns if available)
PASS_TD_COL    = pick_first_col(pbp25, ["pass_touchdown"])
RUSH_TD_COL    = pick_first_col(pbp25, ["rush_touchdown"])
REC_TD_COL     = pick_first_col(pbp25, ["receive_touchdown"])

# Yardage columns
PASS_YDS_COL   = pick_first_col(pbp25, ["passing_yards"])
RUSH_YDS_COL   = pick_first_col(pbp25, ["rushing_yards"])
REC_YDS_COL    = pick_first_col(pbp25, ["receiving_yards"])

# Some datasets compute targets as rows with pass_attempt==1 & receiver present
# We'll derive a 'target' flag if not present.
TARGET_COL     = pick_first_col(pbp25, ["target","pass_target"])
if TARGET_COL is None:
    pbp25["__is_target"] = (pbp25[PASS_ATT_COL]==1) & pbp25[RECV_ID_COL].notna()
    TARGET_COL = "__is_target"

# Fill missing indicator columns with 0 for safe summation
for col in [PASS_ATT_COL, COMPLETE_COL, INT_COL, SACK_COL, RUSH_ATT_COL, RECEPTION_COL, PASS_TD_COL, RUSH_TD_COL, REC_TD_COL, TARGET_COL]:
    if col and col in pbp25.columns:
        pbp25[col] = pbp25[col].fillna(0).astype(int)

# Yardage defaults
for col in [PASS_YDS_COL, RUSH_YDS_COL, REC_YDS_COL]:
    if col and col in pbp25.columns:
        pbp25[col] = pbp25[col].fillna(0).astype(float)

# --- Aggregate passing ---
pass_df = None
if PASSER_ID_COL is not None:
    cols = [season_col, week_col, POSTEAM_COL, PASSER_ID_COL, PASSER_NAME_COL]
    cols = [c for c in cols if c is not None]
    g = pbp25[pbp25[PASS_ATT_COL]==1].groupby(cols, dropna=False, as_index=False).agg({
        PASS_ATT_COL: "sum",
        COMPLETE_COL: "sum",
        INT_COL: "sum",
        SACK_COL: "sum",
        PASS_YDS_COL: "sum" if PASS_YDS_COL in pbp25.columns else "sum"
    })
    # Rename to generic stat names
    pass_df = g.rename(columns={
        season_col: "season",
        week_col: "week",
        POSTEAM_COL: "team",
        PASSER_ID_COL: "player_id",
        PASSER_NAME_COL: "player_name",
        PASS_ATT_COL: "pass_attempts",
        COMPLETE_COL: "completions",
        INT_COL: "interceptions",
        SACK_COL: "sacks",
        PASS_YDS_COL: "passing_yards"
    })
    pass_df["role"] = "passer"

# --- Aggregate rushing ---
rush_df = None
if RUSHER_ID_COL is not None:
    cols = [season_col, week_col, POSTEAM_COL, RUSHER_ID_COL, RUSHER_NAME_COL]
    cols = [c for c in cols if c is not None]
    gg = pbp25[pbp25[RUSH_ATT_COL]==1].groupby(cols, dropna=False, as_index=False).agg({
        RUSH_ATT_COL: "sum",
        RUSH_YDS_COL: "sum" if RUSH_YDS_COL in pbp25.columns else "sum",
        RUSH_TD_COL: "sum" if RUSH_TD_COL in pbp25.columns else "sum"
    })
    rush_df = gg.rename(columns={
        season_col: "season",
        week_col: "week",
        POSTEAM_COL: "team",
        RUSHER_ID_COL: "player_id",
        RUSHER_NAME_COL: "player_name",
        RUSH_ATT_COL: "rush_attempts",
        RUSH_YDS_COL: "rushing_yards",
        RUSH_TD_COL: "rush_tds"
    })
    rush_df["role"] = "rusher"

# --- Aggregate receiving ---
recv_df = None
if RECV_ID_COL is not None:
    cols = [season_col, week_col, POSTEAM_COL, RECV_ID_COL, RECV_NAME_COL]
    cols = [c for c in cols if c is not None]
    rr = pbp25[pbp25[RECEPTION_COL].notna() | pbp25[TARGET_COL].notna()].copy()
    # ensure ints
    rr[RECEPTION_COL] = rr[RECEPTION_COL].fillna(0).astype(int) if RECEPTION_COL else 0
    rr[TARGET_COL]    = rr[TARGET_COL].fillna(0).astype(int)
    agg = rr.groupby(cols, dropna=False, as_index=False).agg({
        RECEPTION_COL: "sum" if RECEPTION_COL in rr.columns else "sum",
        TARGET_COL: "sum",
        REC_YDS_COL: "sum" if REC_YDS_COL in rr.columns else "sum",
        REC_TD_COL: "sum" if REC_TD_COL in rr.columns else "sum",
    })
    recv_df = agg.rename(columns={
        season_col: "season",
        week_col: "week",
        POSTEAM_COL: "team",
        RECV_ID_COL: "player_id",
        RECV_NAME_COL: "player_name",
        RECEPTION_COL: "receptions",
        TARGET_COL: "targets",
        REC_YDS_COL: "receiving_yards",
        REC_TD_COL: "rec_tds"
    })
    recv_df["role"] = "receiver"

components = [d for d in [pass_df, rush_df, recv_df] if d is not None]
if not components:
    raise SystemExit("❌ No role aggregates were created. Check ID/name column candidates and PBP structure.")

for i, d in enumerate(components, 1):
    print(f"[component {i}] shape={d.shape}, columns={list(d.columns)}")

weekly25 = pd.concat(components, ignore_index=True)
# Sum across roles to one row per player-week
key_cols = ["season","week","team","player_id","player_name"]
stat_cols = [c for c in weekly25.columns if c not in key_cols + ["role"]]
weekly25 = weekly25.groupby(key_cols, dropna=False, as_index=False)[stat_cols].sum()

# Optional: compute a rough 'position' guess (very rough; safe placeholder)
# You can replace with a roster join later.
def guess_pos(row):
    if row.get("pass_attempts", 0) > 3:
        return "QB"
    if row.get("rush_attempts", 0) >= 5 and row.get("receptions", 0) < 3:
        return "RB"
    if row.get("receptions", 0) >= 3:
        return "WR/TE"
    return np.nan

weekly25["position_guess"] = weekly25.apply(guess_pos, axis=1)

print("\n[weekly25] player-week table from PBP:", weekly25.shape)
display(weekly25.head(10))



## 4) Align 2025 weekly to the historical schema
- Add any **missing columns** (fill with `NaN` or safe defaults).
- Reorder columns to match the **target** ordering.
- Report schema differences and dtype mismatches.


In [None]:

# Columns present now
cur_cols = set(weekly25.columns)
target_cols = set(TARGET_COLS)

missing_in_2025 = [c for c in TARGET_COLS if c not in cur_cols]
extra_in_2025   = [c for c in weekly25.columns if c not in TARGET_COLS]

print("[schema] Missing in 2025-from-PBP (will be added as NaN/default):", missing_in_2025[:40], "..." if len(missing_in_2025)>40 else "")
print("[schema] Extra columns in 2025 (not in hist):", extra_in_2025[:40], "..." if len(extra_in_2025)>40 else "")

# Create aligned df with all target columns
aligned25 = weekly25.copy()
for c in missing_in_2025:
    # Fill with NaN by default; you can add more nuanced defaults per column here
    aligned25[c] = np.nan

# Reorder to target schema
aligned25 = aligned25[TARGET_COLS]

# Attempt dtype alignment (best effort)
plan = dtype_alignment(aligned25, hist)
if plan:
    print("\n[dtypes] Attempting to cast mismatched dtypes:")
    for c, tdt in plan.items():
        try:
            aligned25[c] = aligned25[c].astype(tdt)
            print(f"  - cast {c} -> {tdt}")
        except Exception as e:
            print(f"  ! could not cast {c} -> {tdt}: {e}")

show_schema(aligned25, "aligned25 (2025 weekly aligned)", head=5)

# Write 2025-only aligned
aligned25.to_csv(OUT_2025_WEEKLY_FROM_PBP, index=False)
print(f"\n✅ Wrote 2025 weekly (aligned) -> {OUT_2025_WEEKLY_FROM_PBP}")


## 5) QC checks (nulls, sanity, coverage)

In [None]:

# Null report
null_report(aligned25, "aligned25", top=40)

# Sanity checks for key relationships
def safe_sum(col):
    return float(aligned25[col].fillna(0).sum()) if col in aligned25.columns else np.nan

checks = {
    "pass_attempts >= completions": (
        safe_sum("pass_attempts"), safe_sum("completions")
    ),
    "receiving_yards >= 0": safe_sum("receiving_yards"),
    "rushing_yards >= 0": safe_sum("rushing_yards"),
    "targets >= receptions": (safe_sum("targets"), safe_sum("receptions")),
}

print("\n[Sanity checks] (aggregated)")
for k, v in checks.items():
    if isinstance(v, tuple):
        a, b = v
        print(f" - {k}: {a} vs {b}  -> {'OK' if (math.isnan(a) or math.isnan(b) or a>=b) else 'CHECK'}")
    else:
        x = v
        print(f" - {k}: total={x}  -> {'OK' if (math.isnan(x) or x>=0) else 'CHECK'}")

# Coverage summary
cov = aligned25.groupby(["season","week"]).size().rename("rows").reset_index()
print("\n[Coverage: rows per season/week]")
display(cov.head(30))
cov.to_csv(os.path.join(QC_DIR, "coverage_2025_weekly_from_pbp.csv"), index=False)

# Save schema diff for record
schema_diff = pd.DataFrame({
    "missing_in_2025": pd.Series(missing_in_2025, dtype="object"),
    "extra_in_2025": pd.Series(extra_in_2025, dtype="object")
})
schema_diff.to_csv(os.path.join(QC_DIR, "schema_diff_2025_vs_hist.csv"), index=False)
print("\n✅ Wrote QC files in", QC_DIR)



## 6) Concatenate historical + 2025-aligned (review before promoting to prod)
We'll do a simple `concat` and write a merged file. You can diff this against your current combined weekly to verify.


In [None]:

# Ensure same columns/order as hist
assert list(aligned25.columns) == list(hist.columns), "Aligned 2025 columns do not match historical schema order."

merged = pd.concat([hist, aligned25], ignore_index=True)
print("[merged] shape:", merged.shape)

# Quick spot-check
display(merged.sort_values(["season","week"]).tail(10))

merged.to_csv(OUT_MERGED, index=False)
print(f"\n✅ Wrote merged file -> {OUT_MERGED}")



## 7) Next steps
- If the PBP schema evolves or if you need additional stats (e.g., air yards, fantasy scoring), extend the per-role aggregates.
- Optionally join with a **roster** table to set a trustworthy `position` and stable `player_id` across weeks.
- Once you are satisfied with the checks above, wire this notebook's logic into your prod pipeline (as a script) and guard it with asserts and logging.


usage: ipykernel_launcher.py [-h] [--season SEASON] [--out-csv OUT_CSV]
                             [--out-parquet OUT_PARQUET]
                             [--out-w1-csv OUT_W1_CSV]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/pwitt/Library/Jupyter/runtime/kernel-7a032024-2e20-4c87-b4c1-0eb0df3df29c.json


SystemExit: 2

In [9]:
# === Pull 2025 PBP, write extracts, and set PBP_2025_PATH ===
from pathlib import Path
import os, sys, pandas as pd

# Find repo root (works no matter where the notebook lives)
def find_root():
    here = Path.cwd()
    for p in [here] + list(here.parents):
        if (p/"Makefile").exists() and (p/"data").exists():
            return p
    return here

BASE = find_root()
SEASON = 2025

# Try to import nfl_data_py; if missing, show a clear message
try:
    import nfl_data_py as nfl
except Exception as e:
    raise SystemExit("❌ nfl_data_py not installed. In this env run:\n"
                     "    pip install nfl_data_py\n"
                     "…then re-run this cell.") from e

print(f"[pull_pbp] fetching play-by-play for {SEASON} …")
try:
    df = nfl.import_pbp_data([SEASON])
except Exception as e:
    raise SystemExit(f"❌ import_pbp_data([{SEASON}]) failed: {type(e).__name__}: {e}")

if df is None or df.empty:
    raise SystemExit(f"❌ No data returned for {SEASON}. Upstream may not have published it yet.")

# Basic sanity
for col in ("season","week"):
    if col not in df.columns:
        raise SystemExit(f"❌ Expected column '{col}' not found. Got: {list(df.columns)[:20]}")

weeks_present = sorted(df.loc[df["season"]==SEASON, "week"].dropna().unique().tolist())
print(f"[pull_pbp] seasons_present: {sorted(df['season'].dropna().unique())}")
print(f"[pull_pbp] weeks_in_{SEASON}: {weeks_present or '(none)'}")
print(df.head(3))

# Write outputs
out_dir = BASE / "data" / "pbp"
out_dir.mkdir(parents=True, exist_ok=True)
out_csv     = out_dir / f"pbp_{SEASON}.csv"
out_parquet = out_dir / f"pbp_{SEASON}.parquet"
out_w1_csv  = out_dir / f"pbp_{SEASON}_w1.csv"

df.to_csv(out_csv, index=False)
print(f"✅ wrote {out_csv}  rows={len(df):,}")
try:
    df.to_parquet(out_parquet, index=False)
    print(f"✅ wrote {out_parquet}")
except Exception as e:
    print(f"ℹ️ parquet write skipped ({type(e).__name__}: {e})")

w1 = df[(df["season"]==SEASON) & (df["week"]==1)]
if not w1.empty:
    w1.to_csv(out_w1_csv, index=False)
    print(f"✅ wrote {out_w1_csv}  rows={len(w1):,}")
else:
    print("ℹ️ No Week 1 rows yet in the fetched data.")

# Point the rest of your notebook at the freshly written file:
PBP_2025_PATH = out_csv
print(f"[config] PBP_2025_PATH set to: {PBP_2025_PATH}")


[pull_pbp] fetching play-by-play for 2025 …
2025 done.
Downcasting floats.
[pull_pbp] seasons_present: [np.int64(2025)]
[pull_pbp] weeks_in_2025: [1, 2]
   play_id         game_id old_game_id home_team away_team season_type  week posteam posteam_type defteam side_of_field  yardline_100   game_date  \
0      1.0  2025_01_ARI_NO  2025090705        NO       ARI         REG     1    None         None    None          None           NaN  2025-09-07   
1     40.0  2025_01_ARI_NO  2025090705        NO       ARI         REG     1     ARI         away      NO            NO          35.0  2025-09-07   
2     63.0  2025_01_ARI_NO  2025090705        NO       ARI         REG     1     ARI         away      NO           ARI          78.0  2025-09-07   

   quarter_seconds_remaining  half_seconds_remaining  game_seconds_remaining game_half  quarter_end  drive   sp  qtr  down  goal_to_go   time   yrdln  \
0                      900.0                  1800.0                  3600.0     Half1          0

In [10]:
import pandas as pd, os, io, gzip, requests

season = 2025
base = "https://github.com/nflverse/nflverse-data/releases/download/player_stats/"
candidates = [
    f"{base}stats_player_week_{season}.parquet",
    f"{base}stats_player_week_{season}.csv.gz",
]

last_err = None
df = None
for url in candidates:
    try:
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        if url.endswith(".parquet"):
            with open("data/weekly_player_stats_2025.parquet","wb") as f: f.write(r.content)
            df = pd.read_parquet("data/weekly_player_stats_2025.parquet")
        else:
            data = gzip.decompress(r.content)
            df = pd.read_csv(io.BytesIO(data), low_memory=False)
            os.makedirs("data", exist_ok=True)
            df.to_csv("data/weekly_player_stats_2025.csv", index=False)
        print("[ok]", url)
        break
    except Exception as e:
        last_err = e

if df is None:
    raise SystemExit(f"Couldn't fetch 2025 weekly player stats. Last error: {last_err}")

print(df.shape)
print(sorted(df['week'].unique()))
df.head()




SystemExit: Couldn't fetch 2025 weekly player stats. Last error: 404 Client Error: Not Found for url: https://github.com/nflverse/nflverse-data/releases/download/player_stats/stats_player_week_2025.csv.gz

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [11]:
# === Fetch nflverse weekly player stats for 2025 and save under data/ ===
from pathlib import Path
import io, os, sys, gzip
import pandas as pd

# Find repo root (Makefile + data/) so paths work no matter where this notebook is opened
def find_root():
    here = Path.cwd()
    for p in [here] + list(here.parents):
        if (p/"Makefile").exists() and (p/"data").exists():
            return p
    return here

BASE = find_root()
SEASON = 2025
RELEASE_BASE = "https://github.com/nflverse/nflverse-data/releases/download/player_stats/"
CANDIDATES = [
    f"{RELEASE_BASE}stats_player_week_{SEASON}.parquet",
    f"{RELEASE_BASE}stats_player_week_{SEASON}.csv.gz",
]

# Make sure requests is available
try:
    import requests
except Exception:
    raise SystemExit("❌ The 'requests' package is not installed in this environment. Run:\n"
                     "    pip install requests\nand re-run this cell.")

# Try parquet first, then csv.gz
df = None
last_err = None
fetched_url = None
for url in CANDIDATES:
    try:
        print(f"[fetch] GET {url}")
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        if url.endswith(".parquet"):
            out_parquet = BASE / "data" / f"weekly_player_stats_{SEASON}.parquet"
            out_parquet.parent.mkdir(parents=True, exist_ok=True)
            out_parquet.write_bytes(r.content)
            df = pd.read_parquet(out_parquet)
            fetched_url = url
            print(f"✅ saved {out_parquet}")
            break
        else:  # csv.gz
            buf = gzip.decompress(r.content)
            df = pd.read_csv(io.BytesIO(buf), low_memory=False)
            out_csv = BASE / "data" / f"weekly_player_stats_{SEASON}.csv"
            out_csv.parent.mkdir(parents=True, exist_ok=True)
            df.to_csv(out_csv, index=False)
            fetched_url = url
            print(f"✅ saved {out_csv}")
            break
    except Exception as e:
        last_err = e
        print(f"  ↳ failed: {type(e).__name__}: {e}")

if df is None:
    raise SystemExit(f"❌ Could not fetch weekly player stats for {SEASON}. Last error: {last_err}")

# Also write the alternate format for convenience
try:
    out_parquet = BASE / "data" / f"weekly_player_stats_{SEASON}.parquet"
    out_csv     = BASE / "data" / f"weekly_player_stats_{SEASON}.csv"
    out_parquet.parent.mkdir(parents=True, exist_ok=True)
    if not out_parquet.exists():
        df.to_parquet(out_parquet, index=False)
        print(f"🗂  also wrote {out_parquet}")
    if not out_csv.exists():
        df.to_csv(out_csv, index=False)
        print(f"🗂  also wrote {out_csv}")
except Exception as e:
    print(f"ℹ️ secondary write skipped: {type(e).__name__}: {e}")

# Quick summary so you can inspect
needed = [c for c in ("season","week","team","player_id","player_name") if c in df.columns]
print(f"\n[summary] source: {fetched_url}")
print(f"[summary] shape: {df.shape}")
if "season" in df.columns:
    print(f"[summary] seasons: {sorted(df['season'].dropna().unique().tolist())}")
if "week" in df.columns:
    weeks = sorted(df.loc[df.get("season", pd.Series(SEASON)) == SEASON, "week"].dropna().unique().tolist())
    print(f"[summary] weeks in {SEASON}: {weeks}")

display(df[needed].head(10) if needed else df.head(10))

# Optional: write a copy under data/weekly/ to match your other paths
weekly_dir = BASE / "data" / "weekly"
weekly_dir.mkdir(parents=True, exist_ok=True)
alias_csv = weekly_dir / f"weekly_player_stats_{SEASON}.csv"
df.to_csv(alias_csv, index=False)
print(f"\n✅ wrote alias for convenience: {alias_csv}")


[fetch] GET https://github.com/nflverse/nflverse-data/releases/download/player_stats/stats_player_week_2025.parquet
  ↳ failed: HTTPError: 404 Client Error: Not Found for url: https://github.com/nflverse/nflverse-data/releases/download/player_stats/stats_player_week_2025.parquet
[fetch] GET https://github.com/nflverse/nflverse-data/releases/download/player_stats/stats_player_week_2025.csv.gz
  ↳ failed: HTTPError: 404 Client Error: Not Found for url: https://github.com/nflverse/nflverse-data/releases/download/player_stats/stats_player_week_2025.csv.gz


SystemExit: ❌ Could not fetch weekly player stats for 2025. Last error: 404 Client Error: Not Found for url: https://github.com/nflverse/nflverse-data/releases/download/player_stats/stats_player_week_2025.csv.gz

In [12]:
# === Quick test: get 2025 WEEKLY player stats (loader → fallback from local PBP) ===
from pathlib import Path
import pandas as pd
import numpy as np

SEASON = 2025

# Find repo root (so paths work anywhere)
def find_root():
    here = Path.cwd()
    for p in [here] + list(here.parents):
        if (p/"Makefile").exists() and (p/"data").exists():
            return p
    return here

BASE = find_root()
DATA = BASE / "data"
(DATA / "weekly").mkdir(parents=True, exist_ok=True)

def log(msg): print(f"[weekly_test] {msg}")

# 1) Try official loader (nflreadpy)
def try_loader():
    try:
        import nflreadpy  # noqa: F401
    except Exception:
        # install once if needed (safe to re-run)
        import sys
        print("[weekly_test] installing nflreadpy…")
        %pip -q install -U nflreadpy pyarrow
    from nflreadpy import load_player_stats
    try:
        log("attempting nflreadpy.load_player_stats(seasons=2025, summary_level='week')")
        df = load_player_stats(seasons=SEASON, summary_level="week", file_type="parquet")
        if df is not None and len(df):
            return df
    except Exception as e:
        log(f"loader failed: {type(e).__name__}: {e}")
    return None

# 2) Fallback: build weeklies from locally available PBP
def try_fallback_from_pbp():
    # Priorities for local PBP files
    pbp_cands = [
        DATA / "pbp" / f"pbp_{SEASON}.parquet",
        DATA / "pbp" / f"pbp_{SEASON}.csv",
        DATA / f"play_by_play_{SEASON}.parquet",
        DATA / f"play_by_play_{SEASON}.csv",
        DATA / "play_by_play.parquet",
        DATA / "play_by_play.csv",
    ]
    src = next((p for p in pbp_cands if p.exists()), None)
    if not src:
        log("no local PBP candidate found for fallback.")
        return None

    log(f"fallback from local PBP: {src}")
    pbp = pd.read_parquet(src) if src.suffix==".parquet" else pd.read_csv(src, low_memory=False)
    if "season" not in pbp.columns or "week" not in pbp.columns:
        log("PBP missing season/week columns")
        return None
    pbp = pbp[pbp["season"]==SEASON].copy()
    if pbp.empty:
        log("local PBP has no 2025 rows")
        return None

    # helpers
    def pick(df, opts):
        for c in opts:
            if c in df.columns: return c
        return None
    def sumsafe(df, cols):
        agg={}
        for c in cols:
            if c and c in df.columns:
                if df[c].dtype.kind in "biu": df[c]=df[c].fillna(0).astype(int)
                if df[c].dtype.kind in "f":   df[c]=df[c].fillna(0.0).astype(float)
                agg[c]="sum"
        return agg

    season, week = "season", "week"
    team = pick(pbp, ["posteam","team","pos_team","offense_team"]) or "posteam"

    # IDs/names by role
    PASSER_ID   = pick(pbp, ["passer_player_id","passer_id","passer_player_gsis_id"])
    PASSER_NAME = pick(pbp, ["passer_player_name","passer","passer_name"])
    RUSHER_ID   = pick(pbp, ["rusher_player_id","rusher_id","rusher_player_gsis_id"])
    RUSHER_NAME = pick(pbp, ["rusher_player_name","rusher","rusher_name"])
    RECV_ID     = pick(pbp, ["receiver_player_id","receiver_id","receiver_player_gsis_id"])
    RECV_NAME   = pick(pbp, ["receiver_player_name","receiver","receiver_name"])

    # events / yards
    PASS_ATT = pick(pbp, ["pass_attempt","pass"]); COMP=pick(pbp, ["complete_pass","complete"])
    PASS_YDS = pick(pbp, ["passing_yards"]); INTS=pick(pbp, ["interception"]); SACKS=pick(pbp, ["sack"])
    RUSH_ATT = pick(pbp, ["rush_attempt"]); RUSH_YDS=pick(pbp, ["rushing_yards"]); RUSH_TD=pick(pbp, ["rush_touchdown"])
    RECEPT   = pick(pbp, ["reception"]); REC_YDS=pick(pbp, ["receiving_yards"]); REC_TD=pick(pbp, ["receive_touchdown"])
    TARGET   = pick(pbp, ["target","pass_target"])
    if TARGET is None and PASS_ATT and RECV_ID:
        pbp["__is_target"] = (pbp[PASS_ATT]==1) & pbp[RECV_ID].notna()
        TARGET="__is_target"

    parts=[]

    # Passing
    if PASSER_ID or PASSER_NAME:
        cols=[season,week,team]+[x for x in [PASSER_ID,PASSER_NAME] if x]
        g = pbp[pbp[PASS_ATT]==1].groupby(cols, dropna=False, as_index=False).agg(
            sumsafe(pbp,[PASS_ATT,COMP,INTS,SACKS,PASS_YDS])
        ).rename(columns={
            season:"season", week:"week", team:"team",
            PASSER_ID:"player_id", PASSER_NAME:"player_name",
            PASS_ATT:"pass_attempts", COMP:"completions",
            INTS:"interceptions", SACKS:"sacks", PASS_YDS:"passing_yards"
        })
        parts.append(g)

    # Rushing
    if RUSHER_ID or RUSHER_NAME:
        cols=[season,week,team]+[x for x in [RUSHER_ID,RUSHER_NAME] if x]
        g = pbp[pbp[RUSH_ATT]==1].groupby(cols, dropna=False, as_index=False).agg(
            sumsafe(pbp,[RUSH_ATT,RUSH_YDS,RUSH_TD])
        ).rename(columns={
            season:"season", week:"week", team:"team",
            RUSHER_ID:"player_id", RUSHER_NAME:"player_name",
            RUSH_ATT:"rush_attempts", RUSH_YDS:"rushing_yards", RUSH_TD:"rush_tds"
        })
        parts.append(g)

    # Receiving
    if RECV_ID or RECV_NAME:
        cols=[season,week,team]+[x for x in [RECV_ID,RECV_NAME] if x]
        m = pbp.copy()
        if RECEPT and RECEPT in m.columns: m[RECEPT]=m[RECEPT].fillna(0).astype(int)
        if TARGET and TARGET in m.columns: m[TARGET]=m[TARGET].fillna(0).astype(int)
        g = m.groupby(cols, dropna=False, as_index=False).agg(
            sumsafe(m,[RECEPT,TARGET,REC_YDS,REC_TD])
        ).rename(columns={
            season:"season", week:"week", team:"team",
            RECV_ID:"player_id", RECV_NAME:"player_name",
            (RECEPT or "receptions"):"receptions",
            (TARGET or "targets"):"targets",
            REC_YDS:"receiving_yards", REC_TD:"rec_tds"
        })
        if "receptions" not in g.columns: g["receptions"]=0
        if "targets" not in g.columns: g["targets"]=0
        parts.append(g)

    if not parts:
        log("fallback failed to build any aggregates from PBP.")
        return None

    from functools import reduce
    key=["season","week","team","player_name","player_id"]
    weekly = reduce(lambda a,b: pd.merge(a,b,on=key,how="outer"), parts)
    for col in ["pass_attempts","completions","passing_yards","interceptions","sacks",
                "rush_attempts","rushing_yards","rush_tds",
                "receptions","targets","receiving_yards","rec_tds"]:
        if col in weekly.columns: weekly[col]=weekly[col].fillna(0)

    return weekly.sort_values(key).reset_index(drop=True)

# Run test
df = try_loader()
source = "nflreadpy (player_stats/week)" if df is not None and len(df) else None

if df is None or len(df)==0:
    df = try_fallback_from_pbp()
    source = "fallback from local PBP → weekly aggregate" if df is not None and len(df) else None

if df is None or len(df)==0:
    raise SystemExit("❌ Could not obtain 2025 weekly player stats by either method.")

weeks = sorted(df.loc[df["season"]==SEASON, "week"].dropna().unique().tolist()) if "week" in df.columns else []
log(f"SUCCESS via {source}; rows={len(df):,}, cols={df.shape[1]}, weeks={weeks}")

# Write result (and an alias the pipeline can use)
out = DATA / "weekly" / f"weekly_player_stats_{SEASON}.csv"
df.to_csv(out, index=False)
log(f"wrote {out}")

display(df.head(12)[[c for c in ("season","week","team","player_id","player_name",
                                 "pass_attempts","passing_yards","rush_attempts","rushing_yards",
                                 "receptions","receiving_yards") if c in df.columns]])


[weekly_test] installing nflreadpy…
Note: you may need to restart the kernel to use updated packages.
[weekly_test] attempting nflreadpy.load_player_stats(seasons=2025, summary_level='week')
[weekly_test] loader failed: TypeError: load_player_stats() got an unexpected keyword argument 'file_type'
[weekly_test] fallback from local PBP: /Users/pwitt/NFL-2025/data/pbp/pbp_2025.parquet
[weekly_test] SUCCESS via fallback from local PBP → weekly aggregate; rows=369, cols=16, weeks=[1, 2]
[weekly_test] wrote /Users/pwitt/NFL-2025/data/weekly/weekly_player_stats_2025.csv


Unnamed: 0,season,week,team,player_id,player_name,pass_attempts,passing_yards,rush_attempts,rushing_yards,receptions,receiving_yards
0,2025,1,ARI,00-0039041,E.Higgins,0.0,0.0,0.0,0.0,0.0,3.0
1,2025,1,ARI,00-0035500,G.Dortch,0.0,0.0,0.0,0.0,0.0,-2.0
2,2025,1,ARI,00-0033553,J.Conner,0.0,0.0,12.0,39.0,0.0,5.0
3,2025,1,ARI,00-0035228,K.Murray,34.0,163.0,7.0,38.0,0.0,0.0
4,2025,1,ARI,00-0039849,M.Harrison,0.0,0.0,0.0,0.0,0.0,71.0
5,2025,1,ARI,00-0038559,Mi.Wilson,0.0,0.0,0.0,0.0,0.0,5.0
6,2025,1,ARI,00-0039921,T.Benson,0.0,0.0,8.0,69.0,0.0,6.0
7,2025,1,ARI,00-0037744,T.McBride,0.0,0.0,0.0,0.0,0.0,61.0
8,2025,1,ARI,00-0039737,T.Reiman,0.0,0.0,0.0,0.0,0.0,10.0
9,2025,1,ARI,00-0033891,Z.Jones,0.0,0.0,0.0,0.0,0.0,4.0


In [13]:
# === Append 2025 (from PBP aggregates) to prod weekly_player_stats.csv ===
from pathlib import Path
import pandas as pd
import numpy as np

# -------- CONFIG
SEASON = 2025
WEEKS  = [1]                 # change to list of weeks or None for all available weeks in PBP
BASE_CSV = "data/weekly_player_stats.csv"  # your prod baseline
OUT_CSV  = "data/weekly/weekly_for_params_thru_2025w1.csv"  # new file to feed params step

# Preferred local PBP files (first existing wins)
PBP_CANDIDATES = [
    "data/pbp/pbp_2025.parquet", "data/pbp/pbp_2025.csv",
    "data/play_by_play_2025.parquet", "data/play_by_play_2025.csv",
    "data/play_by_play.parquet", "data/play_by_play.csv",
]

# -------- helpers
def pick(df, opts):
    for c in opts:
        if c in df.columns: return c
    return None

def sumsafe(df, cols):
    agg={}
    for c in cols:
        if c and c in df.columns:
            if df[c].dtype.kind in "biu": df[c]=df[c].fillna(0).astype(int)
            if df[c].dtype.kind in "f":   df[c]=df[c].fillna(0.0).astype(float)
            agg[c]="sum"
    return agg

def load_base(path):
    base = pd.read_csv(path, low_memory=False)
    print(f"[base] {path}  shape={base.shape}")
    return base

def load_pbp():
    src = next((p for p in PBP_CANDIDATES if Path(p).exists()), None)
    if not src:
        raise FileNotFoundError("❌ No local PBP found. Expected one of:\n" + "\n".join(PBP_CANDIDATES))
    df = pd.read_parquet(src) if src.endswith(".parquet") else pd.read_csv(src, low_memory=False)
    print(f"[pbp] {src}  shape={df.shape}")
    if "season" not in df.columns or "week" not in df.columns:
        raise SystemExit("❌ PBP missing 'season'/'week' columns.")
    df = df[df["season"]==SEASON].copy()
    if df.empty:
        raise SystemExit(f"❌ PBP contains no rows for season {SEASON}.")
    if WEEKS:
        df = df[df["week"].isin(WEEKS)].copy()
        if df.empty:
            raise SystemExit(f"❌ PBP has no rows for {SEASON} weeks {WEEKS}.")
    return df

def aggregate_weekly_from_pbp(pbp):
    season, week = "season", "week"
    team = pick(pbp, ["posteam","team","pos_team","offense_team"]) or "posteam"

    # Player IDs/names by role
    PASSER_ID   = pick(pbp, ["passer_player_id","passer_id","passer_player_gsis_id"])
    PASSER_NAME = pick(pbp, ["passer_player_name","passer","passer_name"])
    RUSHER_ID   = pick(pbp, ["rusher_player_id","rusher_id","rusher_player_gsis_id"])
    RUSHER_NAME = pick(pbp, ["rusher_player_name","rusher","rusher_name"])
    RECV_ID     = pick(pbp, ["receiver_player_id","receiver_id","receiver_player_gsis_id"])
    RECV_NAME   = pick(pbp, ["receiver_player_name","receiver","receiver_name"])

    # Events/yards
    PA   = pick(pbp, ["pass_attempt","pass"])
    CP   = pick(pbp, ["complete_pass","complete"])
    INC  = pick(pbp, ["incomplete_pass"])
    INTS = pick(pbp, ["interception"])
    SACK = pick(pbp, ["sack"])
    PY   = pick(pbp, ["passing_yards"])
    PTD  = pick(pbp, ["pass_touchdown"])

    RA   = pick(pbp, ["rush_attempt"])
    RY   = pick(pbp, ["rushing_yards"])
    RTD  = pick(pbp, ["rush_touchdown"])

    REC  = pick(pbp, ["reception"])
    RECY = pick(pbp, ["receiving_yards"])
    RECTD= pick(pbp, ["receive_touchdown"])

    # better targets: complete OR incomplete with a named receiver, excluding spikes/throwaways
    SPIKE = pick(pbp, ["qb_spike","spike"])
    TA    = pick(pbp, ["throw_away","throwaway"])
    pbp["_is_target"] = 0
    if PA:
        tgt_mask = (pbp[PA]==1)
        if INC: tgt_mask &= (pbp[INC].fillna(0).astype(int)==1) | (pbp[CP].fillna(0).astype(int)==1 if CP else False)
        if RECV_ID: tgt_mask &= pbp[RECV_ID].notna()
        if SPIKE:   tgt_mask &= pbp[SPIKE].fillna(0).astype(int)==0
        if TA:      tgt_mask &= pbp[TA].fillna(0).astype(int)==0
        pbp.loc[tgt_mask, "_is_target"] = 1
    TGT = "_is_target"

    parts=[]

    # Passing (incl. pass_tds)
    if PASSER_ID or PASSER_NAME:
        cols=[season,week,team]+[x for x in [PASSER_ID,PASSER_NAME] if x]
        g = pbp[pbp[PA]==1].groupby(cols, dropna=False, as_index=False).agg(
            sumsafe(pbp,[PA,CP,INTS,SACK,PY,PTD])
        ).rename(columns={
            season:"season", week:"week", team:"team",
            PASSER_ID:"player_id", PASSER_NAME:"player_name",
            PA:"pass_attempts", CP:"completions",
            INTS:"interceptions", SACK:"sacks", PY:"passing_yards", PTD:"pass_tds"
        })
        parts.append(g)

    # Rushing
    if RUSHER_ID or RUSHER_NAME:
        cols=[season,week,team]+[x for x in [RUSHER_ID,RUSHER_NAME] if x]
        g = pbp[pbp[RA]==1].groupby(cols, dropna=False, as_index=False).agg(
            sumsafe(pbp,[RA,RY,RTD])
        ).rename(columns={
            season:"season", week:"week", team:"team",
            RUSHER_ID:"player_id", RUSHER_NAME:"player_name",
            RA:"rush_attempts", RY:"rushing_yards", RTD:"rush_tds"
        })
        parts.append(g)

    # Receiving (targets from the derived flag)
    if RECV_ID or RECV_NAME:
        cols=[season,week,team]+[x for x in [RECV_ID,RECV_NAME] if x]
        g = pbp.groupby(cols, dropna=False, as_index=False).agg(
            sumsafe(pbp,[REC,RECY,RECTD,TGT])
        ).rename(columns={
            season:"season", week:"week", team:"team",
            RECV_ID:"player_id", RECV_NAME:"player_name",
            REC:"receptions", RECY:"receiving_yards", RECTD:"rec_tds", TGT:"targets"
        })
        # ensure ints where appropriate
        for c in ["receptions","targets"]: 
            if c in g.columns: g[c]=g[c].fillna(0).astype(int)
        parts.append(g)

    if not parts:
        raise SystemExit("❌ Could not build any aggregates from PBP (check column names).")

    # Outer-merge the role tables, then fill missing stats with 0
    from functools import reduce
    key=["season","week","team","player_name","player_id"]
    weekly = reduce(lambda a,b: pd.merge(a,b,on=key,how="outer"), parts)

    stat_cols = [c for c in weekly.columns if c not in key]
    for c in stat_cols:
        if pd.api.types.is_numeric_dtype(weekly[c]): weekly[c] = weekly[c].fillna(0)

    # (Optional) simple position guess to help joins if your base has 'position'
    def guess_pos(r):
        pa = r.get("pass_attempts", 0) or 0
        ra = r.get("rush_attempts", 0) or 0
        rec= r.get("receptions", 0) or 0
        if pa >= 5: return "QB"
        if ra >= 8 and rec < 3: return "RB"
        if rec >= 3: return "WR/TE"
        return np.nan
    weekly["position_guess"] = weekly.apply(guess_pos, axis=1)

    weeks = sorted(weekly["week"].dropna().unique().tolist())
    print(f"[agg] rows={len(weekly):,}, weeks={weeks}")
    return weekly

# -------- run
base = load_base(BASE_CSV)
pbp  = load_pbp()
agg  = aggregate_weekly_from_pbp(pbp)

# Keep only requested weeks (if WEEKS is set)
if WEEKS:
    agg = agg[agg["week"].isin(WEEKS)].copy()

# --- align to prod schema
# 1) add any missing prod columns (NaN), then reorder to prod
missing_in_agg = [c for c in base.columns if c not in agg.columns]
for c in missing_in_agg:
    # fill NaN for unknowns (we only zero-fill the computed stats we actually created)
    agg[c] = np.nan

# If prod has 'position' but our guess exists and 'position' is missing, copy it
if "position" in base.columns and "position" not in agg.columns and "position_guess" in agg.columns:
    agg["position"] = agg["position_guess"]

# Ensure column order matches prod
agg = agg[base.columns]

# 2) remove any existing rows for these weeks/seasons before appending (avoid dupes)
key_cols = [c for c in ["season","week","team","player_id","player_name"] if c in base.columns]
mask_keep = ~( (base["season"]==SEASON) & (base["week"].isin(WEEKS if WEEKS else base["week"].unique())) )
base_clean = base[mask_keep].copy()

# 3) append and final sanity
merged = pd.concat([base_clean, agg], ignore_index=True)
if key_cols:
    merged = merged.drop_duplicates(subset=key_cols, keep="last")

# write
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(OUT_CSV, index=False)
print(f"✅ wrote {OUT_CSV}  rows={len(merged):,}")

# quick checks
print("[checks] seasons:", sorted(merged["season"].unique())[:10])
print("[checks] weeks(2025):", sorted(merged.loc[merged["season"]==SEASON, "week"].unique()))
display(agg.sort_values(["team","player_name"]).head(12))


[base] data/weekly_player_stats.csv  shape=(32991, 53)
[pbp] data/pbp/pbp_2025.parquet  shape=(2917, 372)
[agg] rows=346, weeks=[1]
✅ wrote data/weekly/weekly_for_params_thru_2025w1.csv  rows=33,337
[checks] seasons: [np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023), np.int64(2024), np.int64(2025)]
[checks] weeks(2025): [np.int64(1)]


Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,team,season,week,season_type,opponent_team,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_yards,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,fantasy_points,fantasy_points_ppr
0,00-0039041,E.Higgins,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,2.0,3.0,,,,,,,,,,,,,,,
1,00-0035500,G.Dortch,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,1.0,-2.0,,,,,,,,,,,,,,,
2,00-0033553,J.Conner,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,39.0,,,,,,,,4.0,5.0,,,,,,,,,,,,,,,
3,00-0035228,K.Murray,,,,,ARI,2025,1,,,21.0,,163.0,,0.0,5.0,,,,,,,,,,,,38.0,,,,,,,,0.0,0.0,,,,,,,,,,,,,,,
4,00-0039849,M.Harrison,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,6.0,71.0,,,,,,,,,,,,,,,
5,00-0038559,Mi.Wilson,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,4.0,5.0,,,,,,,,,,,,,,,
6,00-0039921,T.Benson,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,69.0,,,,,,,,1.0,6.0,,,,,,,,,,,,,,,
7,00-0037744,T.McBride,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,9.0,61.0,,,,,,,,,,,,,,,
8,00-0039737,T.Reiman,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,1.0,10.0,,,,,,,,,,,,,,,
9,00-0033891,Z.Jones,,,,,ARI,2025,1,,,0.0,,0.0,,0.0,0.0,,,,,,,,,,,,0.0,,,,,,,,1.0,4.0,,,,,,,,,,,,,,,


In [14]:
merged.season.unique()

array([2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025])

In [21]:
merged[merged.season==2025].count()

player_id              313
player_name            313
player_display_name      0
position                 0
position_group           0
                      ... 
air_yards_share          0
wopr                     0
special_teams_tds        0
fantasy_points           0
fantasy_points_ppr       0
Length: 53, dtype: int64

In [22]:
merged[merged.season==2024].count()

player_id              20
player_name            20
player_display_name    20
position               20
position_group         20
                       ..
air_yards_share        15
wopr                   15
special_teams_tds      20
fantasy_points         20
fantasy_points_ppr     20
Length: 53, dtype: int64

In [23]:
import pandas as pd
import numpy as np

REQUIRED = [
    "season","week","team","player_id","player_name",
    "pass_attempts","completions","passing_yards","interceptions","sacks","pass_tds",
    "rush_attempts","rushing_yards","rush_tds",
    "targets","receptions","receiving_yards","rec_tds",
]

merged = pd.read_csv("data/weekly/weekly_for_params_thru_2025w1.csv", low_memory=False)

# Create any missing required columns
for c in REQUIRED:
    if c not in merged.columns:
        merged[c] = np.nan

# For 2025 rows specifically, fill required numeric stats with 0 (don’t touch 2024)
num_cols = [c for c in REQUIRED if c not in ["season","week","team","player_id","player_name"]]
m2025 = merged["season"] == 2025
for c in num_cols:
    merged.loc[m2025, c] = pd.to_numeric(merged.loc[m2025, c], errors="coerce").fillna(0)

# Ensure types are numeric where appropriate
for c in ["season","week"] + num_cols:
    if c in merged.columns:
        merged[c] = pd.to_numeric(merged[c], errors="ignore")

# Save back
merged.to_csv("data/weekly/weekly_for_params_thru_2025w1.csv", index=False)
print("✅ weekly_for_params_thru_2025w1.csv normalized for params")
print("2025 non-null counts on required cols:\n", merged.loc[m2025, REQUIRED].count())


  merged[c] = pd.to_numeric(merged[c], errors="ignore")


✅ weekly_for_params_thru_2025w1.csv normalized for params
2025 non-null counts on required cols:
 season             346
week               346
team               345
player_id          313
player_name        313
pass_attempts      346
completions        346
passing_yards      346
interceptions      346
sacks              346
pass_tds           346
rush_attempts      346
rushing_yards      346
rush_tds           346
targets            346
receptions         346
receiving_yards    346
rec_tds            346
dtype: int64


In [24]:
from pathlib import Path
import io, gzip, pandas as pd, requests

BASE = Path.cwd()
for p in [BASE] + list(BASE.parents):
    if (p/"Makefile").exists() and (p/"data").exists():
        BASE = p; break

urls = [
    "https://github.com/nflverse/nflverse-data/releases/download/stats_player/stats_player_week_2025.parquet",
    "https://github.com/nflverse/nflverse-data/releases/download/stats_player/stats_player_week_2025.csv.gz",
]

df = None; fetched = None
for u in urls:
    try:
        r = requests.get(u, timeout=60); r.raise_for_status()
        if u.endswith(".parquet"):
            out = BASE/"data/weekly_player_stats_2025.parquet"
            out.write_bytes(r.content)
            df = pd.read_parquet(out); fetched = u
            break
        else:
            buf = gzip.decompress(r.content)
            df = pd.read_csv(io.BytesIO(buf), low_memory=False); fetched = u
            out = BASE/"data/weekly_player_stats_2025.csv"; df.to_csv(out, index=False)
            break
    except Exception as e:
        print("failed:", u, type(e).__name__, e)

if df is None:
    raise SystemExit("Could not fetch weekly player stats for 2025 using the stats_player release.")

# also write a convenience alias under data/weekly/
alias = BASE/"data/weekly/weekly_player_stats_2025.csv"
df.to_csv(alias, index=False)
print("✅ fetched from:", fetched)
print("shape:", df.shape, "| weeks:", sorted(df["week"].dropna().unique().tolist())[:10])


✅ fetched from: https://github.com/nflverse/nflverse-data/releases/download/stats_player/stats_player_week_2025.parquet
shape: (1142, 114) | weeks: [1, 2]


In [25]:
df

Unnamed: 0,player_id,player_name,player_display_name,position,position_group,headshot_url,season,week,season_type,team,opponent_team,completions,attempts,passing_yards,passing_tds,passing_interceptions,sacks_suffered,sack_yards_lost,sack_fumbles,sack_fumbles_lost,passing_air_yards,passing_yards_after_catch,passing_first_downs,passing_epa,passing_cpoe,passing_2pt_conversions,pacr,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,rushing_2pt_conversions,receptions,targets,receiving_yards,receiving_tds,receiving_fumbles,receiving_fumbles_lost,receiving_air_yards,receiving_yards_after_catch,receiving_first_downs,receiving_epa,receiving_2pt_conversions,racr,target_share,air_yards_share,wopr,special_teams_tds,def_tackles_solo,def_tackles_with_assist,def_tackle_assists,def_tackles_for_loss,def_tackles_for_loss_yards,def_fumbles_forced,def_sacks,def_sack_yards,def_qb_hits,def_interceptions,def_interception_yards,def_pass_defended,def_tds,def_fumbles,def_safeties,misc_yards,fumble_recovery_own,fumble_recovery_yards_own,fumble_recovery_opp,fumble_recovery_yards_opp,fumble_recovery_tds,penalties,penalty_yards,punt_returns,punt_return_yards,kickoff_returns,kickoff_return_yards,fg_made,fg_att,fg_missed,fg_blocked,fg_long,fg_pct,fg_made_0_19,fg_made_20_29,fg_made_30_39,fg_made_40_49,fg_made_50_59,fg_made_60_,fg_missed_0_19,fg_missed_20_29,fg_missed_30_39,fg_missed_40_49,fg_missed_50_59,fg_missed_60_,fg_made_list,fg_missed_list,fg_blocked_list,fg_made_distance,fg_missed_distance,fg_blocked_distance,pat_made,pat_att,pat_missed,pat_blocked,pat_pct,gwfg_made,gwfg_att,gwfg_missed,gwfg_blocked,gwfg_distance,fantasy_points,fantasy_points_ppr
0,00-0023459,A.Rodgers,Aaron Rodgers,QB,QB,https://static.www.nfl.com/image/upload/f_auto...,2025,1,REG,PIT,NYJ,22,30,244,4,0,4,-26,0,0,139,173,14,10.204755,6.350381,0,1.755396,1,-1,0,0,0,0,-1.873902,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,25.66,25.66
1,00-0023853,M.Prater,Matt Prater,K,SPEC,https://static.www.nfl.com/image/upload/f_auto...,2025,1,REG,BUF,BAL,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,0,43.0,1.0,0,1,1,1,0,0,0,0,0,0,0,0,25;43;32,,,100,0,0,2,2,0,0,1.0,1,1,0,0,32,0.00,0.00
2,00-0025565,N.Folk,Nick Folk,K,SPEC,https://static.www.nfl.com/image/upload/f_auto...,2025,1,REG,NYJ,PIT,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,51.0,1.0,0,0,1,0,1,0,0,0,0,0,0,0,35;51,,,86,0,0,2,2,0,0,1.0,0,0,0,0,0,0.00,0.00
3,00-0026158,J.Flacco,Joe Flacco,QB,QB,https://static.www.nfl.com/image/upload/f_auto...,2025,1,REG,CLE,CIN,31,45,290,1,2,2,-12,0,0,273,173,13,-1.842128,2.374263,0,1.062271,2,6,0,0,0,1,0.134383,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,12.20,12.20
4,00-0026190,C.Campbell,Calais Campbell,DE,DL,https://static.www.nfl.com/image/upload/f_auto...,2025,1,REG,ARI,NO,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,3,0,2,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1137,00-0040242,J.Croskey-Merritt,Jacory Croskey-Merritt,RB,RB,https://static.www.nfl.com/image/upload/f_auto...,2025,2,REG,WAS,GB,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,4,17,0,0,0,1,0.492604,0,0,1,0,0,0,0,6,0,0,-0.821151,0,0.0,0.025000,0.021505,0.052554,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,1.70,1.70
1138,00-0040667,M.Golden,Matthew Golden,WR,WR,https://static.www.nfl.com/image/upload/f_auto...,2025,2,REG,GB,WAS,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,2,15,0,0,0,0,1.095628,0,0,2,0,0,0,0,81,0,0,-3.505815,0,0.0,0.074074,0.198529,0.250082,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,-2,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,1.50,1.50
1139,00-0040726,A.Belton,Anthony Belton,OT,OL,https://static.www.nfl.com/image/upload/f_auto...,2025,2,REG,GB,WAS,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,10,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,0.00,0.00
1140,00-0040731,T.Amos,Trey Amos,CB,DB,https://static.www.nfl.com/image/upload/f_auto...,2025,2,REG,WAS,GB,0,0,0,0,0,0,0,0,0,0,0,0,,,0,,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,0,,0,,0.000000,0.000000,0.000000,0,1,0,1,0,0,0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,0,0,0,0,0,0,,0,0,0,0,0,0.00,0.00


In [26]:
# DIAGNOSTIC: what's missing between props and params (Week 2)
import pandas as pd, numpy as np, re, os

SEASON=2025
WEEK=2
props  = pd.read_csv('data/props/latest_all_props.csv')
params = pd.read_csv(f'data/props/params_week{WEEK}.csv')

def name_std(s):
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9\s]", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Ensure name_std exists
if 'name_std' not in props.columns:
    name_col = 'player' if 'player' in props.columns else ('name' if 'name' in props.columns else None)
    props['name_std'] = props[name_col].map(name_std) if name_col else ''
if 'name_std' not in params.columns:
    pname = 'player' if 'player' in params.columns else ('player_name' if 'player_name' in params.columns else None)
    params['name_std'] = params[pname].map(name_std) if pname else ''

print("rows props/params:", len(props), "/", len(params))

# Primary join keys present in both
pkeys = [k for k in ['player_key','market_std','point_key'] if k in props.columns and k in params.columns]
print("primary keys used:", pkeys or "(none)")

# Primary-key coverage (raw)
if pkeys:
    left_key  = props[pkeys].astype(str).agg('|'.join, axis=1)
    right_key = set(params[pkeys].astype(str).agg('|'.join, axis=1))
    missing_raw = ~left_key.isin(right_key)
else:
    missing_raw = pd.Series(False, index=props.index)

print("unmatched on primary keys:", int(missing_raw.sum()), "/", len(props), f"({missing_raw.mean():.1%})")

# Top missing markets on primary
if 'market_std' in props.columns:
    print("\nTop missing markets (primary key):")
    print(props.loc[missing_raw, 'market_std'].value_counts().head(20))

# Does normalizing point_key help?
def norm_point(x):
    s = str(x)
    # normalize numeric like '65.0' -> '65'
    if re.fullmatch(r"-?\d+\.0", s): s = s[:-2]
    return s

if 'point_key' in props.columns and 'point_key' in params.columns:
    props['point_key_norm']  = props['point_key'].map(norm_point)
    params['point_key_norm'] = params['point_key'].map(norm_point)
    pkeys2 = [k for k in ['player_key','market_std','point_key_norm'] if k in props.columns and k in params.columns]
    if pkeys2:
        lk2 = props[pkeys2].astype(str).agg('|'.join, axis=1)
        rk2 = set(params[pkeys2].astype(str).agg('|'.join, axis=1))
        missing_norm = ~lk2.isin(rk2)
        print("\nAfter point_key normalization:")
        print("unmatched:", int(missing_norm.sum()), "/", len(props), f"({missing_norm.mean():.1%})",
              "| improvement:", int(missing_raw.sum()) - int(missing_norm.sum()))
    else:
        missing_norm = missing_raw.copy()
else:
    missing_norm = missing_raw.copy()

# If we fallback to name_std instead of player_key, how much would match?
fk = [k for k in ['name_std','market_std','point_key'] if k in props.columns]
rk = [k for k in ['name_std','market_std','point_key'] if k in params.columns]
if set(fk)==set(rk) and fk:
    lk3 = props[fk].astype(str).agg('|'.join, axis=1)
    rk3 = set(params[rk].astype(str).agg('|'.join, axis=1))
    missing_name = ~lk3.isin(rk3)
    base = missing_norm if 'missing_norm' in locals() else missing_raw
    print("\nIf joining by name_std (fallback):")
    print("unmatched:", int(missing_name.sum()), "/", len(props), f"({missing_name.mean():.1%})",
          "| improvement vs previous:", int(base.sum()) - int(missing_name.sum()))
else:
    missing_name = None

# Sample of currently unmatched (post-normalization baseline)
m = missing_norm if 'missing_norm' in locals() else missing_raw
sample_cols = [c for c in ['player','name_std','player_key','market_std','side','point_key','book','line','price'] if c in props.columns]
print("\nSample unmatched rows:")
print(props.loc[m, sample_cols].head(20).to_string(index=False))

# Market vocabulary differences (helps spot unsupported markets)
p_markets = set(props['market_std'].unique()) if 'market_std' in props.columns else set()
r_markets = set(params['market_std'].unique()) if 'market_std' in params.columns else set()
only_in_props   = sorted(p_markets - r_markets)[:25]
only_in_params  = sorted(r_markets - p_markets)[:25]
print("\nMarkets only in props (first 25):", only_in_props)
print("Markets only in params (first 25):", only_in_params)

# Key null/dtype sanity
def nn(df, col): return float(df[col].isna().mean()) if col in df.columns else np.nan
for k in ['player_key','name_std','market_std','point_key']:
    print(f"null rate {k}: props={nn(props,k):.3f} params={nn(params,k):.3f}")


rows props/params: 14731 / 1041
primary keys used: ['market_std']
unmatched on primary keys: 14731 / 14731 (100.0%)

Top missing markets (primary key):
market_std
player_anytime_td            2812
player_1st_td                2068
player_last_td               1889
player_reception_yds         1770
player_receptions            1694
player_reception_longest      950
player_rush_yds               888
player_rush_attempts          576
player_pass_tds               398
player_pass_yds               396
player_rush_longest           360
player_pass_attempts          318
player_pass_completions       318
player_pass_interceptions     294
Name: count, dtype: int64

If joining by name_std (fallback):
unmatched: 14731 / 14731 (100.0%) | improvement vs previous: 0

Sample unmatched rows:
            player          name_std    market_std  price
      Jahmyr Gibbs      jahmyr gibbs player_1st_td    400
  David Montgomery  david montgomery player_1st_td    550
 Amon-Ra St. Brown   amonra st brown p

In [27]:
# Re-normalize props with the updated std_market and check markets again
from common_markets import standardize_input
p_props2 = standardize_input(props.copy())
print("Top markets in props (after mapping):")
print(p_props2["market_std"].value_counts().head(15).to_string())

# Recompute join diagnostics WITHOUT point_key
lk = p_props2[["player_key","market_std"]].astype(str).agg("|".join, axis=1)
rk = set(p_params[["player_key","market_std"]].astype(str).agg("|".join, axis=1))
miss = ~lk.isin(rk)
print("\nUnmatched (no point):", int(miss.sum()), "/", len(p_props2), f"({miss.mean():.1%})")

# Per-market unmatched for core markets
core = {"recv_yds","receptions","rush_yds","rush_attempts","pass_yds","pass_attempts","pass_completions","pass_tds","interceptions","anytime_td"}
print("\nCore market unmatched rates (no point):")
print((p_props2.assign(_miss=miss)
        .loc[p_props2["market_std"].isin(core)]
        .groupby("market_std")["_miss"].mean()
        .sort_values(ascending=False)
        .round(3)
        .to_string()))


ModuleNotFoundError: No module named 'common_markets'

In [28]:
# Point the notebook to your repo's /scripts folder
import os, sys

# 1) If you know the absolute path, just use it:
# sys.path.insert(0, "/Users/pwitt/NFL-2025/scripts")

# 2) Or try to auto-find it from common locations:
candidates = [
    os.getcwd(),
    os.path.abspath(os.path.join(os.getcwd(), "NFL-2025")),
    os.path.abspath(os.path.join(os.getcwd(), "..")),
    os.path.abspath(os.path.join(os.getcwd(), "../NFL-2025")),
]
added = None
for base in candidates:
    p = os.path.join(base, "scripts", "common_markets.py")
    if os.path.exists(p):
        sys.path.insert(0, os.path.join(base, "scripts"))
        added = os.path.join(base, "scripts")
        break

print("Using scripts path:", added or "(not found)")


Using scripts path: /Users/pwitt/NFL-2025/scripts


In [29]:
# Re-normalize props with the updated std_market and check markets again
from common_markets import standardize_input
p_props2 = standardize_input(props.copy())
print("Top markets in props (after mapping):")
print(p_props2["market_std"].value_counts().head(15).to_string())

# Recompute join diagnostics WITHOUT point_key
lk = p_props2[["player_key","market_std"]].astype(str).agg("|".join, axis=1)
rk = set(p_params[["player_key","market_std"]].astype(str).agg("|".join, axis=1))
miss = ~lk.isin(rk)
print("\nUnmatched (no point):", int(miss.sum()), "/", len(p_props2), f"({miss.mean():.1%})")

# Per-market unmatched for core markets
core = {"recv_yds","receptions","rush_yds","rush_attempts","pass_yds","pass_attempts","pass_completions","pass_tds","interceptions","anytime_td"}
print("\nCore market unmatched rates (no point):")
print((p_props2.assign(_miss=miss)
        .loc[p_props2["market_std"].isin(core)]
        .groupby("market_std")["_miss"].mean()
        .sort_values(ascending=False)
        .round(3)
        .to_string()))


Top markets in props (after mapping):
market_std
anytime_td           2812
first_td             2068
last_td              1889
recv_yds             1770
receptions           1694
reception_longest     950
rush_yds              888
rush_attempts         576
pass_tds              398
pass_yds              396
rush_longest          360
pass_attempts         318
pass_completions      318
interceptions         294


KeyError: "['player_key'] not in index"

In [30]:
# Recompute join diagnostics WITHOUT point_key (auto-pick id col)
id_col = None
for cand in ("player_key", "name_std"):
    if cand in p_props2.columns and cand in p_params.columns:
        id_col = cand
        break
if id_col is None:
    raise RuntimeError("Neither player_key nor name_std exist on BOTH props and params after normalization.")

print("Using ID column:", id_col)

lk = p_props2[[id_col, "market_std"]].astype(str).agg("|".join, axis=1)
rk = set(p_params[[id_col, "market_std"]].astype(str).agg("|".join, axis=1))
miss = ~lk.isin(rk)

print("Unmatched (no point):", int(miss.sum()), "/", len(p_props2), f"({miss.mean():.1%})")

core = {"recv_yds","receptions","rush_yds","rush_attempts","pass_yds","pass_attempts","pass_completions","pass_tds","interceptions","anytime_td"}
print("\nCore market unmatched rates:")
print((p_props2.assign(_miss=miss)
        .loc[p_props2["market_std"].isin(core)]
        .groupby("market_std")["_miss"].mean()
        .sort_values(ascending=False)
        .round(3)
        .to_string()))

# Sample rows from core markets that fail Join A
samples = p_props2.loc[miss & p_props2["market_std"].isin(core),
                       ["player","name_std","player_key","market_std","name","point","price"]].head(30)
samples


NameError: name 'p_params' is not defined