# historical qb stats

pa rate, etc

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import chain
import nfl_data_py as nfl
from utility_db_25 import mark_columns, TrainML, build_catboost, optuna_call, test_ML, build_transformer, get_momentum_cols, create_momentum_index, get_final_features, get_motion_cols, motion_complexity_score

In [2]:
root_dir = os.getcwd()
df_games = pd.read_csv(os.path.join(root_dir,'data/games.csv'))
df_players = pd.read_csv(os.path.join(root_dir,'data/players.csv'))
df_play = pd.read_csv(os.path.join(root_dir,'data/plays.csv'))
df_player_play = pd.read_csv(os.path.join(root_dir,'data/player_play.csv'))
pfr_szn = nfl.import_seasonal_pfr('pass',[2021])
pfr_wk = nfl.import_weekly_pfr('pass',[2021])
isr = nfl.import_seasonal_rosters(years=[2021],columns=['player_id','pfr_id','gsis_it_id'])
snap_df = nfl.import_snap_counts([2021])


### Aggregate 2021 PFR QB data

Bringing in snap data from 2021, we get play-action and overall pass rates for each QB that year

In [3]:
# get snap data, merge into pfr pass info
snap_szn = snap_df.groupby('pfr_player_id')['offense_snaps'].sum().reset_index()
pfr_sub = pfr_szn[['pfr_id','pa_pass_att','pa_pass_yards','pass_attempts']].merge(snap_szn,how='left',left_on='pfr_id',right_on='pfr_player_id').drop(columns=['pfr_player_id'])

# calculate PA, overall pass rates
pfr_sub['qb_pa_rate_pass'] = pfr_sub['pa_pass_att']/pfr_sub['pass_attempts']
pfr_sub['qb_pa_rate_ovr'] = pfr_sub['pa_pass_att']/pfr_sub['offense_snaps']
pfr_sub['qb_pass_rate'] = pfr_sub['pass_attempts']/pfr_sub['offense_snaps']
pfr_sub.head(1)

Unnamed: 0,pfr_id,pa_pass_att,pa_pass_yards,pass_attempts,offense_snaps,qb_pa_rate_pass,qb_pa_rate_ovr,qb_pass_rate
0,BradTo00,113.0,912.0,719.0,1284.0,0.157163,0.088006,0.559969


We then merge to get ID's which match with our '22 data, with meaningful samples

In [4]:
pfr_id_rect = isr.merge(pfr_sub,how='left').dropna(subset='pa_pass_att').drop(columns=['player_id','pfr_id'])
pfr_id_rect = pfr_id_rect[pfr_id_rect['pass_attempts'] >= 20]
pfr_id_rect.head(1)

Unnamed: 0,gsis_it_id,pa_pass_att,pa_pass_yards,pass_attempts,offense_snaps,qb_pa_rate_pass,qb_pa_rate_ovr,qb_pass_rate
0,25511,113.0,912.0,719.0,1284.0,0.157163,0.088006,0.559969


### Aggregate 2022 data

We now bring in '22 data, getting game-QB data for that year

In [5]:
df_pos = df_player_play[['gameId','playId','nflId']].merge(df_players[['nflId','position']],how='left')
df_comp = df_pos.merge(df_play[['gameId','playId','playAction','isDropback']])
qb_grp = df_comp[df_comp.position=='QB'].groupby(['gameId','nflId']).agg(pa_ct_game=('playAction','sum'),snap_ct=('playAction','count'),pass_ct=('isDropback','sum')).reset_index()
qb_trunc = qb_grp.merge(df_games[['gameId','week']].drop_duplicates(),how='left').sort_values(by=['nflId','week'])

In [6]:
qb_grp.head(1)

Unnamed: 0,gameId,nflId,pa_ct_game,snap_ct,pass_ct
0,2022090800,34452,7,64,48


In [7]:
#qb_trunc.columns = ['gameId','nflId'] + ['qb_'+x for x in qb_trunc.columns[2:-1]]+['week']

### Bring data together, get mean

Using '21 rates as week 1 data, we bring it together with '22 data:

In [8]:
qb_trunc

Unnamed: 0,gameId,nflId,pa_ct_game,snap_ct,pass_ct,week
33,2022091113,25511,8,58,26,1
50,2022091804,25511,7,63,35,2
104,2022092512,25511,7,56,42,3
143,2022100213,25511,7,57,51,4
168,2022100908,25511,5,66,47,5
...,...,...,...,...,...,...
250,2022102400,54602,2,31,23,7
167,2022100907,54712,5,57,32,5
197,2022101604,54712,5,21,15,6
175,2022100910,54727,0,1,0,5


In [9]:
qb_trunc['qb_pass_rate'] = qb_trunc['pass_ct']/qb_trunc['snap_ct']
qb_trunc['qb_pa_rate_ovr'] = qb_trunc['pa_ct_game']/qb_trunc['snap_ct']
qb_trunc['qb_pa_rate_pass'] = qb_trunc['pa_ct_game']/qb_trunc['pass_ct']
qb_trunc['week'] = qb_trunc['week'].astype(int)

In [10]:
pfr_id_rect.rename(columns={'gsis_it_id':'nflId'},inplace=True)
pfr_id_rect['nflId'] = pfr_id_rect['nflId'].astype(str)
qb_trunc['nflId'] = qb_trunc['nflId'].astype(str)
pfr_id_rect['week']=1
reduce_cols = ['nflId','week','qb_pass_rate','qb_pa_rate_ovr','qb_pa_rate_pass']
qb_trunc['week']+=1
qb_w_21 = pd.concat([pfr_id_rect[reduce_cols],qb_trunc[reduce_cols]],axis=0)

In [11]:
mean_cols  = ['qb_pass_rate','qb_pa_rate_ovr','qb_pa_rate_pass']

In [12]:
qb_w_21.sort_values(by=['nflId','week'],inplace=True)

In [13]:
qb_full = pd.concat([qb_w_21[['nflId','week']],qb_w_21.groupby(['nflId'])[mean_cols].cumsum()],axis=1)

In [14]:
qb_w_21.head(9)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
0,25511,1,0.559969,0.088006,0.157163
33,25511,2,0.448276,0.137931,0.307692
50,25511,3,0.555556,0.111111,0.2
104,25511,4,0.75,0.125,0.166667
143,25511,5,0.894737,0.122807,0.137255
168,25511,6,0.712121,0.075758,0.106383
203,25511,7,0.615385,0.092308,0.15
222,25511,8,0.754098,0.196721,0.26087
251,25511,9,0.770492,0.065574,0.085106


In [15]:
for col in mean_cols: 
    qb_full[col] = qb_full[col].values/qb_full['week'].values

In [16]:
qb_full.head(11)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
0,25511,1,0.559969,0.088006,0.157163
33,25511,2,0.504122,0.112969,0.232428
50,25511,3,0.521267,0.112349,0.221618
104,25511,4,0.57845,0.115512,0.20788
143,25511,5,0.641707,0.116971,0.193755
168,25511,6,0.653443,0.110102,0.179193
203,25511,7,0.648006,0.10756,0.175023
222,25511,8,0.661268,0.118705,0.185754
251,25511,9,0.673404,0.112802,0.174571
307,25511,10,0.680388,0.108278,0.166204


In [17]:
qb_full

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
0,25511,1,0.559969,0.088006,0.157163
33,25511,2,0.504122,0.112969,0.232428
50,25511,3,0.521267,0.112349,0.221618
104,25511,4,0.578450,0.115512,0.207880
143,25511,5,0.641707,0.116971,0.193755
...,...,...,...,...,...
250,54602,8,0.261723,0.079493,0.164947
167,54712,6,0.093567,0.014620,0.026042
197,54712,7,0.182241,0.046545,0.069940
175,54727,6,0.000000,0.000000,


# todo: ffill after ci

now integrate cartesian, forward fill, blah blah seems sane

In [18]:
ci = pd.merge(qb_trunc['nflId'].drop_duplicates(), pd.Series(list(range(1,10))).rename('week'), how='cross',copy=False).sort_values(by=['nflId','week'])

In [19]:
qb_aw = ci.merge(qb_full,how='left')

In [20]:
qb_aw.tail(12)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
546,54712,7,0.182241,0.046545,0.06994
547,54712,8,,,
548,54712,9,,,
549,54727,1,,,
550,54727,2,,,
551,54727,3,,,
552,54727,4,,,
553,54727,5,,,
554,54727,6,0.0,0.0,
555,54727,7,,,


In [21]:
qb_ffill_pre = pd.concat([qb_aw['nflId'],qb_aw.groupby('nflId').ffill()],axis=1)

In [22]:
qb_ffill_pre.tail(5)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
553,54727,5,,,
554,54727,6,0.0,0.0,
555,54727,7,0.0,0.0,
556,54727,8,0.125,0.0,0.0
557,54727,9,0.125,0.0,0.0


As we can see, zero-pass-rates cause issues for % of passes which are PA (div 0), so we fix those

In [23]:
qb_ffill_pre.isna().sum()

nflId               0
week                0
qb_pass_rate       65
qb_pa_rate_ovr     65
qb_pa_rate_pass    87
dtype: int64

In [24]:
qb_ffill_pre.loc[qb_ffill_pre['qb_pass_rate'] == 0,'qb_pa_rate_ovr'] = 0
qb_ffill_pre.loc[qb_ffill_pre['qb_pass_rate'] == 0,'qb_pa_rate_pass'] = 0

In [25]:
qb_ffill_pre.tail(5)

Unnamed: 0,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
553,54727,5,,,
554,54727,6,0.0,0.0,0.0
555,54727,7,0.0,0.0,0.0
556,54727,8,0.125,0.0,0.0
557,54727,9,0.125,0.0,0.0


In [26]:
qb_ffill_pre.isna().sum()

nflId               0
week                0
qb_pass_rate       65
qb_pa_rate_ovr     65
qb_pa_rate_pass    65
dtype: int64

In [27]:
qb_aw.mean()

nflId                   inf
week               5.000000
qb_pass_rate       0.482693
qb_pa_rate_ovr     0.127952
qb_pa_rate_pass    0.238619
dtype: float64

In [28]:
qb_aw.median()

nflId              44869.000000
week                   5.000000
qb_pass_rate           0.530260
qb_pa_rate_ovr         0.127907
qb_pa_rate_pass        0.239538
dtype: float64

In [29]:
qb_ffill_pre.mean()

nflId                   inf
week               5.000000
qb_pass_rate       0.447831
qb_pa_rate_ovr     0.113008
qb_pa_rate_pass    0.215029
dtype: float64

Everything else left, fill with base values:

In [30]:
qb_ffill_pre['qb_pass_rate'] = qb_ffill_pre['qb_pass_rate'].fillna(.54)
qb_ffill_pre['qb_pa_rate_ovr'] = qb_ffill_pre['qb_pa_rate_ovr'].fillna(.13)
qb_ffill_pre['qb_pa_rate_pass'] = qb_ffill_pre['qb_pa_rate_pass'].fillna(.24)

In [31]:
qb_ffill_pre.isna().sum()

nflId              0
week               0
qb_pass_rate       0
qb_pa_rate_ovr     0
qb_pa_rate_pass    0
dtype: int64

TODO: reintegrate data

In [32]:
qb_wk =  df_comp[df_comp.position=='QB'].merge(df_games[['gameId','week']].drop_duplicates(),how='left')[['gameId','playId','nflId','week']]

In [33]:
qb_wk

Unnamed: 0,gameId,playId,nflId,week
0,2022090800,56,46076,1
1,2022090800,80,46076,1
2,2022090800,101,46076,1
3,2022090800,122,46076,1
4,2022090800,167,46076,1
...,...,...,...,...
16210,2022110700,3658,46101,9
16211,2022110700,3686,46101,9
16212,2022110700,3707,46101,9
16213,2022110700,3740,46101,9


In [34]:
qb_ffill_pre.dtypes

nflId               object
week                 int64
qb_pass_rate       float64
qb_pa_rate_ovr     float64
qb_pa_rate_pass    float64
dtype: object

In [35]:
qb_ffill_pre['nflId'] = qb_ffill_pre['nflId'].astype(int)

In [36]:
rate_df = qb_wk.merge(qb_ffill_pre,how='left')

In [37]:
rate_df

Unnamed: 0,gameId,playId,nflId,week,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass
0,2022090800,56,46076,1,0.505477,0.139280,0.275542
1,2022090800,80,46076,1,0.505477,0.139280,0.275542
2,2022090800,101,46076,1,0.505477,0.139280,0.275542
3,2022090800,122,46076,1,0.505477,0.139280,0.275542
4,2022090800,167,46076,1,0.505477,0.139280,0.275542
...,...,...,...,...,...,...,...
16210,2022110700,3658,46101,9,0.563199,0.231477,0.409221
16211,2022110700,3686,46101,9,0.563199,0.231477,0.409221
16212,2022110700,3707,46101,9,0.563199,0.231477,0.409221
16213,2022110700,3740,46101,9,0.563199,0.231477,0.409221


In [43]:
rate_df.columns[-3:]

Index(['qb_pass_rate', 'qb_pa_rate_ovr', 'qb_pa_rate_pass'], dtype='object')

explore

In [39]:
train_data=pd.read_csv(os.path.join(root_dir, "data/train_data.csv"))

In [40]:
train_data.head(3)

Unnamed: 0,gameId,playId,quarter,down,yardsToGo,score_differential,time_remaining,playNullifiedByPenalty,preSnapHomeTeamWinProbability,pass_rush_ratio,...,ybc_att,yac_att,qb_plays,play_id,old_game_id_x,xpass_situational,weight_all_sum,db_ct,mean_pairwise_dist,mean_offset_db_orientation_abs
0,2022090800,56,1,1,10,0,60.0,N,0.413347,2.72,...,3.086347,1.974908,2196.0,56,2022090800,0.515357,2639,5,10.823594,39.849639
1,2022090800,80,1,2,4,0,59.483333,N,0.413316,2.72,...,3.086347,1.974908,2196.0,80,2022090800,0.483545,2639,5,12.818117,27.345678
2,2022090800,101,1,1,10,0,58.9,N,0.399819,2.72,...,3.086347,1.974908,2196.0,101,2022090800,0.46302,2648,5,8.904863,26.583269


In [41]:
train_data = train_data.merge(rate_df.drop(columns=['nflId','week']),how='left')

In [46]:
train_data[list(rate_df.columns[-3:])+['pass']].corr()

Unnamed: 0,qb_pass_rate,qb_pa_rate_ovr,qb_pa_rate_pass,pass
qb_pass_rate,1.0,0.490073,0.293829,0.059814
qb_pa_rate_ovr,0.490073,1.0,0.940806,-0.005185
qb_pa_rate_pass,0.293829,0.940806,1.0,-0.036629
pass,0.059814,-0.005185,-0.036629,1.0
