# Historical QB pass rate analysis

We want to understand how QB tendencies affect pass rate; this hopefully helps smooth out noise from backup starts.

In [1]:
import pandas as pd
import os
import nfl_data_py as nfl

In [3]:
root_dir = os.getcwd()
df_games = pd.read_csv(os.path.join(root_dir,'data/games.csv'))
df_players = pd.read_csv(os.path.join(root_dir,'data/players.csv'))
df_play = pd.read_csv(os.path.join(root_dir,'data/plays.csv'))
df_player_play = pd.read_csv(os.path.join(root_dir,'data/player_play.csv'))
pfr_szn = nfl.import_seasonal_pfr('pass',[2021])
pfr_wk = nfl.import_weekly_pfr('pass',[2021])
isr = nfl.import_seasonal_rosters(years=[2021],columns=['player_id','pfr_id','gsis_it_id']).drop_duplicates()
snap_df = nfl.import_snap_counts([2021])


### Aggregate 2021 PFR QB data

Bringing in snap data from 2021, we get play-action and overall pass rates for each QB that year:

In [4]:
# get '21 nflverse snap data, merge into pfr pass info
snap_szn = snap_df.groupby('pfr_player_id')['offense_snaps'].sum().reset_index()
pfr_sub = pfr_szn[['pfr_id','pa_pass_att','pa_pass_yards','pass_attempts']].merge(snap_szn,how='left',left_on='pfr_id',right_on='pfr_player_id').drop(columns=['pfr_player_id'])

# calculate PA, overall pass rates for '21
pfr_sub['qb_pa_rate_pass'] = pfr_sub['pa_pass_att']/pfr_sub['pass_attempts']
pfr_sub['qb_pa_rate_ovr'] = pfr_sub['pa_pass_att']/pfr_sub['offense_snaps']
pfr_sub['qb_pass_rate'] = pfr_sub['pass_attempts']/pfr_sub['offense_snaps']
pfr_sub.head(1)

Unnamed: 0,pfr_id,pa_pass_att,pa_pass_yards,pass_attempts,offense_snaps,qb_pa_rate_pass,qb_pa_rate_ovr,qb_pass_rate
0,BradTo00,113.0,912.0,719.0,1284.0,0.157163,0.088006,0.559969


We then get ID's to match our '22 data (as long as they have a meaningful sample size)

In [13]:
# pa_pass att = arbitrary column, same amt of na's for all features
pfr_id_rect = isr.merge(pfr_sub,how='left').dropna(subset='pa_pass_att').drop(columns=['player_id','pfr_id'])
pfr_id_rect = pfr_id_rect[pfr_id_rect['pass_attempts'] >= 20]
pfr_id_rect.head(1)

Unnamed: 0,gsis_it_id,pa_pass_att,pa_pass_yards,pass_attempts,offense_snaps,qb_pa_rate_pass,qb_pa_rate_ovr,qb_pass_rate
0,25511,113.0,912.0,719.0,1284.0,0.157163,0.088006,0.559969


### Aggregate 2022 data

We now bring in '22 data, getting QB data—aggregated to the game level—for that year

In [14]:
# bring in position, PA and pass flags
df_pos = df_player_play[['gameId','playId','nflId']].merge(df_players[['nflId','position']],how='left')
df_comp = df_pos.merge(df_play[['gameId','playId','playAction','isDropback']])

# group QB passing data to game level, add explicit week info
qb_grp = df_comp[df_comp.position=='QB'].groupby(['gameId','nflId']).agg(pa_ct_game=('playAction','sum'),snap_ct=('playAction','count'),pass_ct=('isDropback','sum')).reset_index()
qb_trunc = qb_grp.merge(df_games[['gameId','week']].drop_duplicates(),how='left').sort_values(by=['nflId','week'])

We then get '22 pass ratios:

In [15]:
qb_trunc['qb_pass_rate'] = qb_trunc['pass_ct']/qb_trunc['snap_ct']
qb_trunc['qb_pa_rate_ovr'] = qb_trunc['pa_ct_game']/qb_trunc['snap_ct']
qb_trunc['qb_pa_rate_pass'] = qb_trunc['pa_ct_game']/qb_trunc['pass_ct']
qb_trunc['week'] = qb_trunc['week'].astype(int)

In [16]:
qb_trunc.head(1)

Unnamed: 0,gameId,nflId,pa_ct_game,snap_ct,pass_ct,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
33,2022091113,25511,8,58,26,1,0.448276,0.137931,0.307692


### Bring data together, get mean

Using '21 rates as week 1 data, we bring it together with '22 data.

In [17]:
# adjust ID typing
pfr_id_rect.rename(columns={'gsis_it_id':'nflId'},inplace=True)
pfr_id_rect['nflId'] = pfr_id_rect['nflId'].astype(str)
qb_trunc['nflId'] = qb_trunc['nflId'].astype(str)

# set '21 data as week 1, uptick other weeks
pfr_id_rect['week']=1
qb_trunc['week']+=1

# stack '21 (our proxy week 1) and '22 (week 2-9) data
reduce_cols = ['nflId','week','qb_pass_rate','qb_pa_rate_ovr','qb_pa_rate_pass']
qb_w_21 = pd.concat([pfr_id_rect[reduce_cols],qb_trunc[reduce_cols]],axis=0)

We now get the running average for each of our pass rates:

In [18]:
# get cumulative sum for each feature
mean_cols  = ['qb_pass_rate','qb_pa_rate_ovr','qb_pa_rate_pass']
qb_w_21.sort_values(by=['nflId','week'],inplace=True)
qb_full = pd.concat([qb_w_21[['nflId','week']],qb_w_21.groupby(['nflId'])[mean_cols].cumsum()],axis=1)
qb_full = pd.concat([qb_full,qb_full.groupby(['nflId']).agg(qb_week=('week','cumcount'))],axis=1)
qb_full['qb_week']+=1 # num. start of qb's season; default start ind 0 so bump by 1

# get average by dividing by week num.
for col in mean_cols: 
    qb_full[col] = qb_full[col].values/qb_full['qb_week'].values

qb_full.drop(columns=['qb_week'],inplace=True)

### Get pass rate EWM

Note we don't need to shift here b/c we already did implicitly via week uptick earlier

In [19]:
ewm_temp = qb_w_21.copy()
for col in mean_cols:
    ewm_temp[col+'_ewm'] = ewm_temp.groupby(['nflId'])[col].transform(lambda x: x.ewm(alpha=.1).mean())
    ewm_temp.drop(columns=[col],inplace=True)

Basically, we see a slight up-weighting of recent weeks for the EWM, which is its intent:

In [20]:
qb_w_21[qb_w_21['nflId'] == '54712']

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
167,54712,6,0.561404,0.087719,0.15625
197,54712,7,0.714286,0.238095,0.333333


In [21]:
ewm_temp[ewm_temp.nflId =='54712']

Unnamed: 0,nflId,week,qb_pass_rate_ewm,qb_pa_rate_ovr_ewm,qb_pa_rate_pass_ewm
167,54712,6,0.561404,0.087719,0.15625
197,54712,7,0.641868,0.166865,0.249452


In [22]:
qb_full[qb_full.nflId == '54712']

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
167,54712,6,0.561404,0.087719,0.15625
197,54712,7,0.637845,0.162907,0.244792


In [23]:
qb_full = qb_full.merge(ewm_temp,how='left',on=['nflId','week'])

# Fill missing data

For whatever reason, some QB's did not play in '21, so we have to figure out how to fill their data.

Since we're shifting a week, we must use the last prior record we have for a QB, or use default values

First we make it s.t. each QB-week pair has a value, using a cartesian product:

In [24]:
ci = pd.merge(qb_trunc['nflId'].drop_duplicates(), pd.Series(list(range(1,10))).rename('week'), how='cross',copy=False).sort_values(by=['nflId','week'])
qb_aw = ci.merge(qb_full,how='left')
qb_aw.tail(9)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass,qb_pass_rate_ewm,qb_pa_rate_ovr_ewm,qb_pa_rate_pass_ewm
549,54727,1,,,,,,
550,54727,2,,,,,,
551,54727,3,,,,,,
552,54727,4,,,,,,
553,54727,5,,,,,,
554,54727,6,0.0,0.0,,0.0,0.0,
555,54727,7,,,,,,
556,54727,8,0.5,0.0,0.0,0.526316,0.0,0.0
557,54727,9,,,,,,


Next, we forward-fill within groups for QB's, to the 'using last known value' point:

In [25]:
qb_ffill_pre = pd.concat([qb_aw['nflId'],qb_aw.groupby('nflId').ffill()],axis=1)

In [26]:
qb_ffill_pre.tail(9)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass,qb_pass_rate_ewm,qb_pa_rate_ovr_ewm,qb_pa_rate_pass_ewm
549,54727,1,,,,,,
550,54727,2,,,,,,
551,54727,3,,,,,,
552,54727,4,,,,,,
553,54727,5,,,,,,
554,54727,6,0.0,0.0,,0.0,0.0,
555,54727,7,0.0,0.0,,0.0,0.0,
556,54727,8,0.5,0.0,0.0,0.526316,0.0,0.0
557,54727,9,0.5,0.0,0.0,0.526316,0.0,0.0


As we can see, zero-pass-rates cause issues for % of passes which are PA (div 0), so we fix those

In [27]:
qb_ffill_pre.isna().sum()

nflId                   0
week                    0
qb_pass_rate           65
qb_pa_rate_ovr         65
qb_pa_rate_pass        87
qb_pass_rate_ewm       65
qb_pa_rate_ovr_ewm     65
qb_pa_rate_pass_ewm    87
dtype: int64

In [28]:
qb_ffill_pre.loc[qb_ffill_pre['qb_pass_rate'] == 0,'qb_pa_rate_ovr'] = 0
qb_ffill_pre.loc[qb_ffill_pre['qb_pass_rate'] == 0,'qb_pa_rate_pass'] = 0

Everything else left, fill with base values:

In [29]:
qb_ffill_pre['qb_pass_rate'] = qb_ffill_pre['qb_pass_rate'].fillna(.54)
qb_ffill_pre['qb_pa_rate_ovr'] = qb_ffill_pre['qb_pa_rate_ovr'].fillna(.13)
qb_ffill_pre['qb_pa_rate_pass'] = qb_ffill_pre['qb_pa_rate_pass'].fillna(.24)
qb_ffill_pre['qb_pass_rate_ewm'] = qb_ffill_pre['qb_pass_rate_ewm'].fillna(.54)
qb_ffill_pre['qb_pa_rate_ovr_ewm'] = qb_ffill_pre['qb_pa_rate_ovr_ewm'].fillna(.13)
qb_ffill_pre['qb_pa_rate_pass_ewm'] = qb_ffill_pre['qb_pa_rate_pass_ewm'].fillna(.24)

### Reintegrate data

We first get info for the qb's to actually play on each down:

In [30]:
qb_wk = df_comp[df_comp.position=='QB'].merge(df_games[['gameId','week']].drop_duplicates(),how='left')[['gameId','playId','nflId','week']]
qb_wk.head(5)

Unnamed: 0,gameId,playId,nflId,week
0,2022090800,56,46076,1
1,2022090800,80,46076,1
2,2022090800,101,46076,1
3,2022090800,122,46076,1
4,2022090800,167,46076,1


Then, we integrate our cleaned metric dataframe:

In [31]:
qb_ffill_pre['nflId'] = qb_ffill_pre['nflId'].astype(int)
rate_df = qb_wk.merge(qb_ffill_pre,how='left')
rate_df.head(1)

Unnamed: 0,gameId,playId,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass,qb_pass_rate_ewm,qb_pa_rate_ovr_ewm,qb_pa_rate_pass_ewm
0,2022090800,56,46076,1,0.505477,0.13928,0.275542,0.505477,0.13928,0.275542


### Dependent variable analysis

Now we finally reach our goal: seeing the relation between these features & team pass rates

In [33]:
train_data=pd.read_csv(os.path.join(root_dir, "data/train_data.csv"))
train_data = train_data.merge(rate_df.drop(columns=['nflId','week']),how='left')

Perhaps due to data sparsity (some qb spot starts?), we gain relatively little going from mean QB pass rates to EWM pass rates

In [36]:
train_data[list(rate_df.columns[-6:])+['pass']].corr()

Unnamed: 0,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass,qb_pass_rate_ewm,qb_pa_rate_ovr_ewm,qb_pa_rate_pass_ewm,pass
qb_pass_rate,1.0,0.044594,-0.306843,0.997696,0.055136,-0.335151,0.058176
qb_pa_rate_ovr,0.044594,1.0,0.892818,0.04465,0.998105,0.887017,-0.018358
qb_pa_rate_pass,-0.306843,0.892818,1.0,-0.311549,0.883682,0.993344,-0.059373
qb_pass_rate_ewm,0.997696,0.04465,-0.311549,1.0,0.055552,-0.34044,0.060595
qb_pa_rate_ovr_ewm,0.055136,0.998105,0.883682,0.055552,1.0,0.880633,-0.017015
qb_pa_rate_pass_ewm,-0.335151,0.887017,0.993344,-0.34044,0.880633,1.0,-0.057033
pass,0.058176,-0.018358,-0.059373,0.060595,-0.017015,-0.057033,1.0


In [38]:
train_data[['gameId','playId','qb_pa_rate_pass','qb_pass_rate_ewm']].to_csv(os.path.join(root_dir,'data/qb_historical.csv'))