Import play-by-play data for 2024 season, regular season only

In [58]:
from nfl_data_py import import_pbp_data

YEARS = list(range(2016, 2025))  # 2016–2024 inclusive
pbp = import_pbp_data(YEARS, downcast=True)
pbp = pbp[pbp['season_type'] == 'REG'].copy()


2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
Downcasting floats.


See all available columns in the pbp data

In [59]:
import nfl_data_py as nfl

cols = nfl.see_pbp_cols()
for c in cols:
    print(c)

play_id
game_id
old_game_id
home_team
away_team
season_type
week
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
td_player_name
td_player_id
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa


Choose relevant columns for MAB later

In [60]:
id_cols = ['season', 'week', 'game_id', 'game_date', 'play_id']
team_cols = ['posteam', 'defteam', 'home_team', 'away_team', 'posteam_type']
game_st_cols = ['qtr', 'game_seconds_remaining', 'half_seconds_remaining', 'quarter_seconds_remaining', 'down', 'ydstogo', 'yardline_100', 'score_differential']
timeouts_clock_cols = ['home_timeouts_remaining', 'away_timeouts_remaining', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining']
condition_cols = ['roof', 'surface', 'temp', 'wind']
arm_indicators_cols = ['play_type', 'punt_attempt', 'field_goal_attempt', 'rush_attempt', 'pass_attempt']
reward_signal_cols = ['epa', 'wpa', 'success', 'yards_gained', 'first_down', 'touchdown']
fg_cols = ['field_goal_result', 'kick_distance']
punt_cols = ['punt_inside_twenty', 'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch', 'return_yards']
misc_filter_cols = ['penalty', 'aborted_play', 'play_deleted', 'goal_to_go', 'timeout', 'timeout_team']

Convert expected numeric columns to numeric dtype

In [61]:
import pandas as pd 
import numpy as np

pbp['week'] = pd.to_numeric(pbp['week'], errors='coerce')


Calculate offensive and defensive EPA per play over the last 4 weeks for each team

In [62]:
off_weekly = (
    pbp
      .dropna(subset=['posteam', 'epa'])
      .groupby(['season', 'posteam', 'week'], as_index=False)['epa'].mean()
      .sort_values(['season', 'posteam', 'week'])
)

off_weekly['off_epa_4w'] = (
    off_weekly
      .groupby(['season', 'posteam'])['epa']
      .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())


In [63]:
def_weekly = (
    pbp
      .dropna(subset=['defteam', 'epa'])
      .groupby(['season', 'defteam', 'week'], as_index=False)['epa'].mean()
      .sort_values(['season', 'defteam', 'week'])
)

def_weekly['def_epa_4w'] = (
    def_weekly
      .groupby(['season', 'defteam'])['epa']
      .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())


Merge (left join) 4-week rolling averages back to original df

In [64]:
pbp = pbp.merge(
    off_weekly[['season','posteam','week','off_epa_4w']],
    on=['season','posteam','week'], how='left'
).merge(
    def_weekly[['season','defteam','week','def_epa_4w']],
    on=['season','defteam','week'], how='left'
)

for c in ['off_epa_4w', 'def_epa_4w']:
    pbp[c] = pbp[c].fillna(0.0)

Calculate seconds per play

In [65]:
pbp['prev_gsr'] = pbp.groupby('game_id')['game_seconds_remaining'].shift(1)
pbp['play_elapsed_s'] = (pbp['prev_gsr'] - pbp['game_seconds_remaining']).clip(lower=0)
pbp['play_elapsed_s'] = pbp['play_elapsed_s'].fillna(0)

Lagged Variables

In [66]:
pbp['def_team_prev'] = pbp.groupby('game_id')['defteam'].shift(1)

Aattribute each elapsed interval to the previous defensive team

In [67]:
pbp['elapsed_prev_for_def'] = np.where(
    pbp['def_team_prev'] == pbp['defteam'],
    pbp['play_elapsed_s'],
    0.0
)

Cumulative Time on Field up to Snap

In [68]:
pbp['def_time_on_field_cum'] = (
    pbp
      .groupby(['game_id','defteam'])['elapsed_prev_for_def']
      .cumsum()
)

Ensure they are numeric

In [69]:
pbp['game_time_elapsed'] = pd.to_numeric(
    pbp.groupby('game_id')['play_elapsed_s'].cumsum(), errors='coerce'
).astype(float)

pbp['def_time_on_field_cum'] = pd.to_numeric(
    pbp['def_time_on_field_cum'], errors='coerce'
).astype(float)

Share of time on field

In [70]:
den = pbp['game_time_elapsed'].to_numpy(dtype=float)
num = pbp['def_time_on_field_cum'].to_numpy(dtype=float)

pbp['def_time_on_field_share'] = np.divide(
    num, den, out=np.zeros_like(num, dtype=float), where=den > 0
)

In [71]:
pbp['def_time_on_field_share'] = pbp['def_time_on_field_share'].fillna(0)

Number of plays in the drive

In [72]:
pbp['plays_in_drive_so_far'] = (
    pbp.groupby(['game_id','drive']).cumcount() + 1
)

Are you in the 4th quarter (teams typically more tired)

In [73]:
pbp['is_q4_or_later'] = (pbp['qtr'] >= 4).astype(int)

Add fatigue and drive columns

In [74]:
drive_context_cols = ['plays_in_drive_so_far']
fatigue_cols = ['play_elapsed_s','game_time_elapsed',
                'def_time_on_field_cum','def_time_on_field_share','is_q4_or_later']


Merge columns into final df

In [75]:
final_cols = (
    id_cols + team_cols + game_st_cols + timeouts_clock_cols +
    condition_cols + arm_indicators_cols + reward_signal_cols +
    fg_cols + punt_cols + misc_filter_cols +
    ['off_epa_4w', 'def_epa_4w'] +
    drive_context_cols + fatigue_cols
)

df = pbp[final_cols].copy()

Ensure numeric

In [76]:
num_cols = [
    'qtr','game_seconds_remaining','half_seconds_remaining','quarter_seconds_remaining',
    'down','ydstogo','yardline_100','score_differential','temp','wind',
    'epa','wpa','yards_gained','kick_distance','return_yards',
    'off_epa_4w','def_epa_4w', 'def_time_on_field_cum','def_time_on_field_share', 'plays_in_drive_so_far'
]

for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

Check fatigue features (sanity check)

In [77]:
print("Fatigue feature NA rates:")
print(pbp[['def_time_on_field_cum','def_time_on_field_share','plays_in_drive_so_far']].isna().mean())

print("Preview fatigue cols:")
display(
    pbp[['season','week','game_id','qtr','game_seconds_remaining','posteam','defteam',
              'plays_in_drive_so_far','def_time_on_field_cum','def_time_on_field_share','is_q4_or_later']]
    .head(8)
)


Fatigue feature NA rates:
def_time_on_field_cum      0.053118
def_time_on_field_share    0.000000
plays_in_drive_so_far      0.011397
dtype: float64
Preview fatigue cols:


Unnamed: 0,season,week,game_id,qtr,game_seconds_remaining,posteam,defteam,plays_in_drive_so_far,def_time_on_field_cum,def_time_on_field_share,is_q4_or_later
0,2016,1,2016_01_BUF_BAL,1.0,3600.0,,,,,0.0,0
1,2016,1,2016_01_BUF_BAL,1.0,3600.0,BAL,BUF,1.0,0.0,0.0,0
2,2016,1,2016_01_BUF_BAL,1.0,3597.0,BAL,BUF,2.0,3.0,1.0,0
3,2016,1,2016_01_BUF_BAL,1.0,3572.0,BAL,BUF,3.0,28.0,1.0,0
4,2016,1,2016_01_BUF_BAL,1.0,3541.0,BAL,BUF,4.0,59.0,1.0,0
5,2016,1,2016_01_BUF_BAL,1.0,3515.0,BAL,BUF,5.0,85.0,1.0,0
6,2016,1,2016_01_BUF_BAL,1.0,3474.0,BAL,BUF,6.0,126.0,1.0,0
7,2016,1,2016_01_BUF_BAL,1.0,3425.0,BAL,BUF,7.0,175.0,1.0,0


Sample Table

In [78]:
print(f"Data shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

print("\nMissingness (key columns):")
missing_summary = (
    df[['down','ydstogo','yardline_100','play_type','epa','wpa']]
      .isna()
      .mean()
      .mul(100)
      .round(1)
      .rename("pct_missing")
      .to_frame()
)
print(missing_summary)

print("\nSample rows:")
display(df.head(3))


Data shape: 416,321 rows × 58 columns

Missingness (key columns):
              pct_missing
down                 15.8
ydstogo               0.0
yardline_100          7.0
play_type             2.9
epa                   1.1
wpa                   1.5

Sample rows:


Unnamed: 0,season,week,game_id,game_date,play_id,posteam,defteam,home_team,away_team,posteam_type,...,timeout,timeout_team,off_epa_4w,def_epa_4w,plays_in_drive_so_far,play_elapsed_s,game_time_elapsed,def_time_on_field_cum,def_time_on_field_share,is_q4_or_later
0,2016,1,2016_01_BUF_BAL,2016-09-11,1.0,,,BAL,BUF,,...,,,0.0,0.0,,0.0,0.0,,0.0,0
1,2016,1,2016_01_BUF_BAL,2016-09-11,36.0,BAL,BUF,BAL,BUF,home,...,0.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
2,2016,1,2016_01_BUF_BAL,2016-09-11,58.0,BAL,BUF,BAL,BUF,home,...,0.0,,0.0,0.0,2.0,3.0,3.0,3.0,1.0,0


Safely get column or default value

In [79]:
import numpy as np
import pandas as pd

def col_safe(frame, name, default=0):
    return frame[name] if name in frame.columns else pd.Series(default, index=frame.index)

Slice for only valid fourth-down plays

In [80]:
is_fourth = df['down'] == 4
valid_type = df['play_type'].isin(['run','pass','punt','field_goal'])

Enforce no penalties, no aborted plays, no kneels/spikes, and non-missing key fields

In [81]:
mask = (
    is_fourth &
    valid_type &
    col_safe(df, 'penalty').fillna(0).eq(0) &
    col_safe(df, 'aborted_play').fillna(0).eq(0) &
    col_safe(df, 'qb_kneel').fillna(0).eq(0) &
    col_safe(df, 'qb_spike').fillna(0).eq(0) &
    df['yardline_100'].notna() &
    df['ydstogo'].notna()
)

decisions_df = df.loc[mask].copy()

Get play type (decision)

In [82]:
decisions_df['action'] = np.select(
    [decisions_df['play_type'].isin(['run','pass']),
     decisions_df['play_type'] == 'punt',
     decisions_df['play_type'] == 'field_goal'],
    ['go','punt','fg']
)

Summary Stats

In [83]:
print("Number of 4th down decision plays:", len(decisions_df))
print("\nAction breakdown:")
print(decisions_df['action'].value_counts())

print("\nMissingness (% of rows) on key fields:")
print((decisions_df[['down','ydstogo','yardline_100','play_type','epa','wpa']]
        .isna().mean()*100).round(1))

print("\nPreview:")
display(
    decisions_df[['season','week','posteam','defteam','qtr','game_seconds_remaining',
                  'yardline_100','ydstogo','score_differential','action','epa','wpa']].head(10)
)

print("\nDistribution of ydstogo:")
print(decisions_df['ydstogo'].describe().round(2))

print("\nYardline bins × action:")
yard_bins = pd.cut(decisions_df['yardline_100'], bins=[0,20,40,60,80,100], include_lowest=True)
print(pd.crosstab(yard_bins, decisions_df['action']))

Number of 4th down decision plays: 31849

Action breakdown:
punt    17783
fg       8388
go       5678
Name: action, dtype: int64

Missingness (% of rows) on key fields:
down            0.0
ydstogo         0.0
yardline_100    0.0
play_type       0.0
epa             0.0
wpa             0.0
dtype: float64

Preview:


Unnamed: 0,season,week,posteam,defteam,qtr,game_seconds_remaining,yardline_100,ydstogo,score_differential,action,epa,wpa
8,2016,1,BAL,BUF,1.0,3412.0,71.0,6.0,0.0,punt,-0.405048,0.021992
12,2016,1,BUF,BAL,1.0,3282.0,76.0,18.0,0.0,punt,0.224181,0.014649
24,2016,1,BUF,BAL,1.0,3030.0,48.0,11.0,0.0,punt,0.447958,-0.007276
34,2016,1,BAL,BUF,1.0,2741.0,32.0,15.0,0.0,fg,1.797426,0.06181
43,2016,1,BUF,BAL,2.0,2587.0,64.0,10.0,-3.0,punt,0.480402,-0.004405
67,2016,1,BUF,BAL,2.0,1986.0,1.0,1.0,-10.0,go,2.859358,0.112149
93,2016,1,BUF,BAL,3.0,1423.0,31.0,15.0,-3.0,fg,-3.855208,-0.085883
97,2016,1,BAL,BUF,3.0,1363.0,53.0,2.0,3.0,punt,-1.136957,-0.033029
103,2016,1,BUF,BAL,3.0,1158.0,55.0,4.0,-3.0,punt,0.560674,-0.019397
111,2016,1,BAL,BUF,3.0,946.0,42.0,1.0,3.0,go,-3.14418,-0.099268



Distribution of ydstogo:
count    31849.00
mean         7.73
std          5.71
min          1.00
25%          3.00
50%          7.00
75%         10.00
max         46.00
Name: ydstogo, dtype: float64

Yardline bins × action:
action            fg    go  punt
yardline_100                    
(-0.001, 20.0]  4241  1417     0
(20.0, 40.0]    4098  1759   538
(40.0, 60.0]      49  1600  6035
(60.0, 80.0]       0   789  8644
(80.0, 100.0]      0   113  2566


Bin fieldgoals based on distance

In [84]:
fg = df.loc[df['field_goal_attempt'] == 1, 
            ['season','week','posteam','kick_distance','field_goal_result']].copy()

In [85]:
fg['fg_made'] = (fg['field_goal_result'] == 'made').astype(int)
fg['dist_bin'] = pd.cut(fg['kick_distance'], bins=[0,39,49,70],
                        labels=['short','mid','long'], include_lowest=True)

In [86]:
fg_week = (fg.groupby(['season','posteam','week','dist_bin'])['fg_made']
             .mean()
             .reset_index()
             .sort_values(['season','posteam','week']))

In [87]:
fg_week['fg_pct'] = (fg_week
    .groupby(['season','posteam','dist_bin'])['fg_made']
    .apply(lambda s: s.shift().rolling(16, min_periods=3).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(16, min_periods=3).mean())


In [88]:
fg_wide = (fg_week
           .pivot_table(index=['season','posteam','week'],
                        columns='dist_bin', values='fg_pct')
           .add_prefix('fg_pct_')
           .reset_index())

In [89]:
decisions_df = decisions_df.merge(fg_wide, on=['season','posteam','week'], how='left')

print("FG% columns added:",
      [c for c in decisions_df.columns if c.startswith('fg_pct_')])

FG% columns added: ['fg_pct_short', 'fg_pct_mid', 'fg_pct_long']


Rolling 4-week average for net punt average

In [90]:
punts = df.loc[df["punt_attempt"] == 1, ["game_id","play_id","season","week","posteam","defteam","yardline_100"]].copy()

In [91]:
punts = punts.sort_values(["game_id","play_id"])

In [92]:
punts["next_posteam"] = punts.groupby("game_id")["posteam"].shift(-1)
punts["next_yardline_100"] = punts.groupby("game_id")["yardline_100"].shift(-1)

In [93]:
mask_recv = (punts["next_posteam"] == punts["defteam"]) & (punts["next_yardline_100"].notna())
punts = punts.loc[mask_recv].copy()

In [94]:
S = punts["yardline_100"].astype(float)
N = punts["next_yardline_100"].astype(float)
punts["punt_net"] = S + N - 100.0

In [95]:
punts["punt_net"] = punts["punt_net"].clip(lower=0, upper=80)

In [96]:
p_week = (punts.groupby(["season","posteam","week"], as_index=False)["punt_net"]
                 .mean()
                 .sort_values(["season","posteam","week"]))

In [97]:
p_week["punt_net_4w"] = (p_week.groupby(["season","posteam"])["punt_net"]
                                .apply(lambda s: s.shift().rolling(4, min_periods=1).mean()))

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(4, min_periods=1).mean()))


In [98]:
decisions_df = decisions_df.merge(
    p_week[["season","posteam","week","punt_net_4w"]],
    on=["season","posteam","week"], how="left"
)

In [99]:
print("Punt net 4w added. Missing rate:", decisions_df["punt_net_4w"].isna().mean().round(3))

Punt net 4w added. Missing rate: 0.106


Fill missing FG% and punt net with overall means

In [100]:
for c in ['fg_pct_short','fg_pct_mid','fg_pct_long','punt_net_4w']:
    if c in decisions_df:
        decisions_df[c] = decisions_df.groupby('posteam')[c].transform(
            lambda s: s.fillna(s.mean())
        )

In [101]:
cols_peek = ['posteam','week','yardline_100','ydstogo','action',
             'off_epa_4w','def_epa_4w','fg_pct_short','fg_pct_mid','fg_pct_long','punt_net_4w']
display(decisions_df[cols_peek].head(8))

Unnamed: 0,posteam,week,yardline_100,ydstogo,action,off_epa_4w,def_epa_4w,fg_pct_short,fg_pct_mid,fg_pct_long,punt_net_4w
0,BAL,1,71.0,6.0,punt,0.0,0.0,0.997562,0.870584,0.704809,31.565077
1,BUF,1,76.0,18.0,punt,0.0,0.0,0.95038,0.73888,0.652809,30.212951
2,BUF,1,48.0,11.0,punt,0.0,0.0,0.95038,0.73888,0.652809,30.212951
3,BAL,1,32.0,15.0,fg,0.0,0.0,0.997562,0.870584,0.704809,31.565077
4,BUF,1,64.0,10.0,punt,0.0,0.0,0.95038,0.73888,0.652809,30.212951
5,BUF,1,1.0,1.0,go,0.0,0.0,0.95038,0.73888,0.652809,30.212951
6,BUF,1,31.0,15.0,fg,0.0,0.0,0.95038,0.73888,0.652809,30.212951
7,BAL,1,53.0,2.0,punt,0.0,0.0,0.997562,0.870584,0.704809,31.565077


In [102]:
import os

OUTDIR = "data"
os.makedirs(OUTDIR, exist_ok=True)

df.to_csv(os.path.join(OUTDIR, "pbp_clean_2016_2024.csv"), index=False)
decisions_df.to_csv(os.path.join(OUTDIR, "decisions_2016_2024.csv"), index=False)

print("Saved to:", OUTDIR)

Saved to: data
