Import play-by-play data for 2024 season, regular season only

In [None]:
from nfl_data_py import import_pbp_data

pbp_2024 = import_pbp_data([2024] , downcast=True)
pbp_2024 = pbp_2024[pbp_2024['season_type'] == 'REG'].copy()


2024 done.
Downcasting floats.


See all available columns in the pbp data

In [None]:
import nfl_data_py as nfl

cols = nfl.see_pbp_cols()
for c in cols:
    print(c)

play_id
game_id
old_game_id
home_team
away_team
season_type
week
posteam
posteam_type
defteam
side_of_field
yardline_100
game_date
quarter_seconds_remaining
half_seconds_remaining
game_seconds_remaining
game_half
quarter_end
drive
sp
qtr
down
goal_to_go
time
yrdln
ydstogo
ydsnet
desc
play_type
yards_gained
shotgun
no_huddle
qb_dropback
qb_kneel
qb_spike
qb_scramble
pass_length
pass_location
air_yards
yards_after_catch
run_location
run_gap
field_goal_result
kick_distance
extra_point_result
two_point_conv_result
home_timeouts_remaining
away_timeouts_remaining
timeout
timeout_team
td_team
td_player_name
td_player_id
posteam_timeouts_remaining
defteam_timeouts_remaining
total_home_score
total_away_score
posteam_score
defteam_score
score_differential
posteam_score_post
defteam_score_post
score_differential_post
no_score_prob
opp_fg_prob
opp_safety_prob
opp_td_prob
fg_prob
safety_prob
td_prob
extra_point_prob
two_point_conversion_prob
ep
epa
total_home_epa
total_away_epa
total_home_rush_epa


Choose relevant columns for MAB later

In [None]:
id_cols = ['season', 'week', 'game_id', 'game_date', 'play_id']
team_cols = ['posteam', 'defteam', 'home_team', 'away_team', 'posteam_type']
game_st_cols = ['qtr', 'game_seconds_remaining', 'half_seconds_remaining', 'quarter_seconds_remaining', 'down', 'ydstogo', 'yardline_100', 'score_differential']
timeouts_clock_cols = ['home_timeouts_remaining', 'away_timeouts_remaining', 'posteam_timeouts_remaining', 'defteam_timeouts_remaining']
condition_cols = ['roof', 'surface', 'temp', 'wind']
arm_indicators_cols = ['play_type', 'punt_attempt', 'field_goal_attempt', 'rush_attempt', 'pass_attempt']
reward_signal_cols = ['epa', 'wpa', 'success', 'yards_gained', 'first_down', 'touchdown']
fg_cols = ['field_goal_result', 'kick_distance']
punt_cols = ['punt_inside_twenty', 'punt_out_of_bounds', 'punt_downed', 'punt_fair_catch', 'return_yards']
misc_filter_cols = ['penalty', 'aborted_play', 'play_deleted', 'goal_to_go', 'timeout', 'timeout_team']

Convert expected numeric columns to numeric dtype

In [None]:
import pandas as pd 

pbp_2024['week'] = pd.to_numeric(pbp_2024['week'], errors='coerce')


Calculate offensive and defensive EPA per play over the last 4 weeks for each team

In [None]:
off_weekly = (
    pbp_2024
      .dropna(subset=['posteam', 'epa'])
      .groupby(['season', 'posteam', 'week'], as_index=False)['epa'].mean()
      .sort_values(['season', 'posteam', 'week'])
)

off_weekly['off_epa_4w'] = (
    off_weekly
      .groupby(['season', 'posteam'])['epa']
      .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())


In [131]:
def_weekly = (
    pbp_2024
      .dropna(subset=['defteam', 'epa'])
      .groupby(['season', 'defteam', 'week'], as_index=False)['epa'].mean()
      .sort_values(['season', 'defteam', 'week'])
)

def_weekly['def_epa_4w'] = (
    def_weekly
      .groupby(['season', 'defteam'])['epa']
      .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())


Merge (left join) 4-week rolling averages back to original df

In [None]:
pbp_2024 = pbp_2024.merge(
    off_weekly[['season','posteam','week','off_epa_4w']],
    on=['season','posteam','week'], how='left'
).merge(
    def_weekly[['season','defteam','week','def_epa_4w']],
    on=['season','defteam','week'], how='left'
)

for c in ['off_epa_4w', 'def_epa_4w']:
    pbp_2024[c] = pbp_2024[c].fillna(0.0)

Calculate seconds per play

In [None]:
pbp_2024['prev_gsr'] = pbp_2024.groupby('game_id')['game_seconds_remaining'].shift(1)
pbp_2024['play_elapsed_s'] = (pbp_2024['prev_gsr'] - pbp_2024['game_seconds_remaining']).clip(lower=0)
pbp_2024['play_elapsed_s'] = pbp_2024['play_elapsed_s'].fillna(0)

Lagged Variables

In [None]:
pbp_2024['def_team_prev'] = pbp_2024.groupby('game_id')['defteam'].shift(1)
pbp_2024['play_elapsed_s_prev'] = pbp_2024['play_elapsed_s']

Defensive Cumulative Time on Field

In [None]:
def_cum = (
    pbp_2024
      .assign(def_flag=lambda d: 1)
      .loc[:, ['game_id','def_team_prev','play_elapsed_s_prev']]
      .dropna(subset=['def_team_prev'])
      .rename(columns={'def_team_prev':'defteam'})
)

In [136]:
def_cum['def_time_on_field_cum'] = (
    def_cum
      .groupby(['game_id','defteam'])['play_elapsed_s_prev']
      .cumsum()
)

Merge defensive cumulative time on field back to original df

In [None]:
pbp_2024 = pbp_2024.merge(
    def_cum[['game_id','defteam','def_time_on_field_cum']].copy(),
    on=['game_id','defteam'],
    how='left'
)

In [138]:
pbp_2024['def_time_on_field_cum'] = pbp_2024['def_time_on_field_cum'].fillna(0)

Ensure they are numeric

In [139]:
pbp_2024['game_time_elapsed'] = pd.to_numeric(
    pbp_2024.groupby('game_id')['play_elapsed_s'].cumsum(), errors='coerce'
).astype(float)

pbp_2024['def_time_on_field_cum'] = pd.to_numeric(
    pbp_2024['def_time_on_field_cum'], errors='coerce'
).astype(float)

Share of time on field

In [None]:
den = pbp_2024['game_time_elapsed'].to_numpy(dtype=float)
num = pbp_2024['def_time_on_field_cum'].to_numpy(dtype=float)

pbp_2024['def_time_on_field_share'] = np.divide(
    num, den, out=np.zeros_like(num, dtype=float), where=den > 0
)

In [141]:
pbp_2024['def_time_on_field_share'] = pbp_2024['def_time_on_field_share'].fillna(0)

Number of play in the drive

In [142]:
pbp_2024['plays_in_drive_so_far'] = (
    pbp_2024.groupby(['game_id','drive']).cumcount() + 1
)

Are you in the 4th quarter (teams typically more tired)

In [143]:
pbp_2024['is_q4_or_later'] = (pbp_2024['qtr'] >= 4).astype(int)

Add fatigue and drive columns

In [144]:
drive_context_cols = ['plays_in_drive_so_far']
fatigue_cols = ['play_elapsed_s','game_time_elapsed',
                'def_time_on_field_cum','def_time_on_field_share','is_q4_or_later']


Merge columns into final df

In [145]:
final_cols = (
    id_cols + team_cols + game_st_cols + timeouts_clock_cols +
    condition_cols + arm_indicators_cols + reward_signal_cols +
    fg_cols + punt_cols + misc_filter_cols +
    ['off_epa_4w', 'def_epa_4w'] +
    drive_context_cols + fatigue_cols
)

df = pbp_2024[final_cols].copy()

Ensure numeric

In [146]:
num_cols = [
    'qtr','game_seconds_remaining','half_seconds_remaining','quarter_seconds_remaining',
    'down','ydstogo','yardline_100','score_differential','temp','wind',
    'epa','wpa','yards_gained','kick_distance','return_yards',
    'off_epa_4w','def_epa_4w', 'def_time_on_field_cum','def_time_on_field_share', 'plays_in_drive_so_far'
]

for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

Check fatigue features (sanity check)

In [147]:
print("Fatigue feature NA rates:")
print(pbp_2024[['def_time_on_field_cum','def_time_on_field_share','plays_in_drive_so_far']].isna().mean())

print("Preview fatigue cols:")
display(
    pbp_2024[['season','week','game_id','qtr','game_seconds_remaining','posteam','defteam',
              'plays_in_drive_so_far','def_time_on_field_cum','def_time_on_field_share','is_q4_or_later']]
    .head(8)
)


Fatigue feature NA rates:
def_time_on_field_cum      0.000000
def_time_on_field_share    0.000000
plays_in_drive_so_far      0.003044
dtype: float64
Preview fatigue cols:


Unnamed: 0,season,week,game_id,qtr,game_seconds_remaining,posteam,defteam,plays_in_drive_so_far,def_time_on_field_cum,def_time_on_field_share,is_q4_or_later
0,2024,1,2024_01_ARI_BUF,1.0,3600.0,,,,0.0,0.0,0
1,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,1.0,0.0,0.0,0
2,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,2.0,33.0,0.0,0
3,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,3.0,77.0,0.0,0
4,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,4.0,118.0,0.0,0
5,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,5.0,154.0,0.0,0
6,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,6.0,189.0,0.0,0
7,2024,1,2024_01_ARI_BUF,1.0,3600.0,ARI,BUF,7.0,232.0,0.0,0


Sample Table

In [167]:
print(f"Data shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

print("\nMissingness (key columns):")
missing_summary = (
    df[['down','ydstogo','yardline_100','play_type','epa','wpa']]
      .isna()
      .mean()
      .mul(100)
      .round(1)
      .rename("pct_missing")
      .to_frame()
)
print(missing_summary)

print("\nSample rows:")
display(df.head(3))


Data shape: 3,698,469 rows × 58 columns

Missingness (key columns):
              pct_missing
down                 11.3
ydstogo               0.0
yardline_100          1.9
play_type             1.9
epa                   0.6
wpa                   1.5

Sample rows:


Unnamed: 0,season,week,game_id,game_date,play_id,posteam,defteam,home_team,away_team,posteam_type,...,timeout,timeout_team,off_epa_4w,def_epa_4w,plays_in_drive_so_far,play_elapsed_s,game_time_elapsed,def_time_on_field_cum,def_time_on_field_share,is_q4_or_later
0,2024,1,2024_01_ARI_BUF,2024-09-08,1.0,,,BUF,ARI,,...,,,0.0,0.0,,0.0,0.0,0.0,0.0,0
1,2024,1,2024_01_ARI_BUF,2024-09-08,40.0,ARI,BUF,BUF,ARI,away,...,0.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0
2,2024,1,2024_01_ARI_BUF,2024-09-08,40.0,ARI,BUF,BUF,ARI,away,...,0.0,,0.0,0.0,2.0,0.0,0.0,33.0,0.0,0


Safely get column or default value

In [None]:
import numpy as np
import pandas as pd

def col_safe(frame, name, default=0):
    return frame[name] if name in frame.columns else pd.Series(default, index=frame.index)

Slice for only valid fourth-down plays

In [150]:
is_fourth = df['down'] == 4
valid_type = df['play_type'].isin(['run','pass','punt','field_goal'])

Enforce no penalties, no aborted plays, no kneels/spikes, and non-missing key fields

In [None]:
mask = (
    is_fourth &
    valid_type &
    col_safe(df, 'penalty').fillna(0).eq(0) &
    col_safe(df, 'aborted_play').fillna(0).eq(0) &
    col_safe(df, 'qb_kneel').fillna(0).eq(0) &
    col_safe(df, 'qb_spike').fillna(0).eq(0) &
    df['yardline_100'].notna() &
    df['ydstogo'].notna()
)

decisions_df = df.loc[mask].copy()

Get play type (decision)

In [152]:
decisions_df['action'] = np.select(
    [decisions_df['play_type'].isin(['run','pass']),
     decisions_df['play_type'] == 'punt',
     decisions_df['play_type'] == 'field_goal'],
    ['go','punt','fg']
)

Summary Stats

In [None]:
print("Number of 4th down decision plays:", len(decisions_df))
print("\nAction breakdown:")
print(decisions_df['action'].value_counts())

print("\nMissingness (% of rows) on key fields:")
print((decisions_df[['down','ydstogo','yardline_100','play_type','epa','wpa']]
        .isna().mean()*100).round(1))

print("\nPreview:")
display(
    decisions_df[['season','week','posteam','defteam','qtr','game_seconds_remaining',
                  'yardline_100','ydstogo','score_differential','action','epa','wpa']].head(10)
)

print("\nDistribution of ydstogo:")
print(decisions_df['ydstogo'].describe().round(2))

print("\nYardline bins × action:")
yard_bins = pd.cut(decisions_df['yardline_100'], bins=[0,20,40,60,80,100], include_lowest=True)
print(pd.crosstab(yard_bins, decisions_df['action']))

Number of 4th down decision plays: 299296

Action breakdown:
punt    152128
fg       84495
go       62673
Name: action, dtype: int64

Missingness (% of rows) on key fields:
down            0.0
ydstogo         0.0
yardline_100    0.0
play_type       0.0
epa             0.0
wpa             0.0
dtype: float64

Preview:


Unnamed: 0,season,week,posteam,defteam,qtr,game_seconds_remaining,yardline_100,ydstogo,score_differential,action,epa,wpa
2301,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2302,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2303,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2304,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2305,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2306,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2307,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2308,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2309,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172
2310,2024,1,ARI,BUF,2.0,2700.0,11.0,4.0,7.0,fg,0.135574,-0.002172



Distribution of ydstogo:
count    299296.00
mean          7.73
std           5.79
min           1.00
25%           3.00
50%           7.00
75%          10.00
max          40.00
Name: ydstogo, dtype: float64

Yardline bins × action:
action             fg     go   punt
yardline_100                       
(-0.001, 20.0]  39668  15221      0
(20.0, 40.0]    43947  18859   2469
(40.0, 60.0]      880  18554  51522
(60.0, 80.0]        0   9345  78179
(80.0, 100.0]       0    694  19958


Bin fieldgoals based on distance

In [154]:
fg = df.loc[df['field_goal_attempt'] == 1, 
            ['season','week','posteam','kick_distance','field_goal_result']].copy()

In [155]:
fg['fg_made'] = (fg['field_goal_result'] == 'made').astype(int)
fg['dist_bin'] = pd.cut(fg['kick_distance'], bins=[0,39,49,70],
                        labels=['short','mid','long'], include_lowest=True)

In [156]:
fg_week = (fg.groupby(['season','posteam','week','dist_bin'])['fg_made']
             .mean()
             .reset_index()
             .sort_values(['season','posteam','week']))

In [157]:
fg_week['fg_pct'] = (fg_week
    .groupby(['season','posteam','dist_bin'])['fg_made']
    .apply(lambda s: s.shift().rolling(16, min_periods=3).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(16, min_periods=3).mean())


In [158]:
fg_wide = (fg_week
           .pivot_table(index=['season','posteam','week'],
                        columns='dist_bin', values='fg_pct')
           .add_prefix('fg_pct_')
           .reset_index())

In [159]:
decisions_df = decisions_df.merge(fg_wide, on=['season','posteam','week'], how='left')

print("FG% columns added:",
      [c for c in decisions_df.columns if c.startswith('fg_pct_')])

FG% columns added: ['fg_pct_short', 'fg_pct_mid', 'fg_pct_long']


Rolling 4-week average for net punt average

In [160]:
p = df.loc[df['punt_attempt'] == 1,
           ['season','week','posteam','yards_gained','return_yards']].copy()

In [161]:
p['return_yards'] = p['return_yards'].fillna(0)
p['net_yards'] = p['yards_gained'] - p['return_yards']

In [162]:
p_week = (p.groupby(['season','posteam','week'])['net_yards']
            .mean()
            .reset_index()
            .sort_values(['season','posteam','week']))

In [163]:
p_week['punt_net_4w'] = (p_week
    .groupby(['season','posteam'])['net_yards']
    .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())
)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  .apply(lambda s: s.shift().rolling(4, min_periods=1).mean())


In [164]:
decisions_df = decisions_df.merge(
    p_week[['season','posteam','week','punt_net_4w']],
    on=['season','posteam','week'], how='left'
)

print("Added punt_net_4w. Missing rate:",
      decisions_df['punt_net_4w'].isna().mean().round(3))

Added punt_net_4w. Missing rate: 0.076


Fill missing FG% and punt net with overall means

In [None]:
for c in ['fg_pct_short','fg_pct_mid','fg_pct_long','punt_net_4w']:
    if c in decisions_df:
        decisions_df[c] = decisions_df[c].fillna(decisions_df[c].mean())

In [None]:
cols_peek = ['posteam','week','yardline_100','ydstogo','action',
             'off_epa_4w','def_epa_4w','fg_pct_short','fg_pct_mid','fg_pct_long','punt_net_4w']
display(decisions_df[cols_peek].head(8))

Unnamed: 0,posteam,week,yardline_100,ydstogo,action,off_epa_4w,def_epa_4w,fg_pct_short,fg_pct_mid,fg_pct_long,punt_net_4w
0,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
1,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
2,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
3,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
4,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
5,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
6,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
7,ARI,1,11.0,4.0,fg,0.0,0.0,0.958226,0.759344,0.674986,-4.264438
