# QBR Processing

We use ESPN's QBR and PFR data to see if QB efficiency/rushing tendencies tell us about team run rates

In [1]:
import os

import nfl_data_py as nfl
import pandas as pd

We use only the last three seasons (2019–2021), because we don't have PA/RPO tracking before then

In [2]:
# Seasons that feed each QB's pre-2022 baseline
qb_years = [2019, 2020, 2021]

# Player ID lookup table (used below to map ESPN ids to PFR ids)
id_df = nfl.import_ids()

# Season-level QBR for those seasons, restricted to regular-season rows
qb_v1 = nfl.import_qbr(qb_years).reset_index(drop=True)
qb_pre = qb_v1.loc[qb_v1['season_type'].eq('Regular')].copy()

## Add pfr data
We add PFR rushing data to further account for mobile QBs' effect on the run game

In [3]:
# Season-level PFR rushing data, filtered to QBs
pfr_rp = nfl.import_seasonal_pfr('rush',qb_years)

# One .loc selection (rows + columns together) and an explicit copy, so
# pfr_rush is an independent frame rather than a slice of pfr_rp
pfr_rush = pfr_rp.loc[
    pfr_rp['pos'] == 'QB',
    ['pfr_id','season','pos','ybc_att','yac_att'],
].copy()
pfr_rush.head(1)

Unnamed: 0,pfr_id,season,pos,ybc_att,yac_att
363,JackLa00,2019,QB,4.9,1.9


ID type housekeeping:

In [4]:
# Drop lookup rows without an ESPN id (per the original note, all NA's are non-QB's).
# This is done in place, so id_df itself is trimmed.
id_df.dropna(subset=['espn_id'],inplace=True)

# Cast the merge keys on both sides to plain strings so they compare equal.
# The int step is presumably there to strip a trailing '.0' from float-typed ids - TODO confirm.
qb_pre['player_id'] = qb_pre['player_id'].astype(int).astype(str)
id_df['espn_id'] = id_df['espn_id'].astype(int).astype(str)
# NOTE: astype(str) also stringifies missing values, so an absent pfr_id
# becomes the literal string 'nan' from here on.
id_df['pfr_id'] = id_df['pfr_id'].astype(str)

#### Merge in pfr ID's to QBR, subset to relevant columns

In [5]:
# Attach PFR ids to the QBR rows through the ESPN id; the left join keeps every QBR row
id_lookup = id_df[['espn_id','pfr_id']].drop_duplicates()
qb_id = qb_pre.merge(id_lookup,how='left',left_on='player_id',right_on='espn_id')

# Keep only the columns used downstream and exclude the 'John Franklin' rows
# (excluded by the original analysis; the reason is not recorded here).
# Selecting with a single .loc and copying gives an independent frame, so
# assigning into qb_id in later cells does not act on a chained-indexing slice.
qb_cols = ['player_id','pfr_id','season','qbr_total','qb_plays','pass','run','name_display']
qb_id = qb_id.loc[qb_id['name_display'] != 'John Franklin', qb_cols].copy()
qb_id.head(1).T

Unnamed: 0,0
player_id,3916387
pfr_id,JackLa00
season,2019
qbr_total,83.0
qb_plays,613
pass,55.0
run,39.1
name_display,Lamar Jackson


### Fix 'nan' string weirdness in qb_id

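Some QBs have their pfr_id set to the string 'nan' rather than a real missing value (a side effect of casting the ID column with `astype(str)` in the ID housekeeping step), so we fix it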

In [6]:
# A missing PFR id can appear in two forms after the left merge:
#   * the string 'nan' (id present in the lookup table but empty, then cast with astype(str))
#   * a real NaN (no matching row in the lookup table at all)
# Flag both so the manual fixes below apply either way.
nan_flag = qb_id['pfr_id'].isna() | (qb_id['pfr_id'] == 'nan')

Fix players who have 'nan' to their real ID's

In [7]:
# Display name -> PFR id for the players whose id came through as missing
pfr_id_fixes = {
    'Matt Moore': 'MoorMa01',
    'Chase Daniel': 'DaniCh00',
    'Brian Hoyer': 'HoyeBr00',
    'Tim Boyle': 'BoylTi00',
    'John Wolford': 'WolfJo00',
    'Andrew Beck': 'BeckAn01',
    'Joseph Charlton': 'CharJo01',
}

# Only rows already flagged as missing are touched, so a valid id is never overwritten
for qb_name, pfr_code in pfr_id_fixes.items():
    qb_id.loc[nan_flag & (qb_id['name_display'] == qb_name), 'pfr_id'] = pfr_code

In [8]:
# Quick look at the seasonal QB rushing frame before it is merged in
pfr_rush.head(1)

Unnamed: 0,pfr_id,season,pos,ybc_att,yac_att
363,JackLa00,2019,QB,4.9,1.9


### Add in pfr rushing data

We merge and drop na's, since all na's are non-qb's:

In [9]:
# Give the generically named QBR pass/run columns (maybe EPA?) explicit names
qb_id = qb_id.rename(columns={'pass':'pass_val','run':'run_val'})

# Left-merge the PFR rushing data, drop every row that has any NA, then sort
# ascending by season within each QB so the exponential mean sees the most
# recent season last. Helper columns are dropped at the end.
qb_pfr = (
    qb_id
    .merge(pfr_rush, how='left', on=['pfr_id','season'])
    .dropna()
    .sort_values(by=['pfr_id','season'], ascending=True)
    .drop(columns=['pos','player_id','name_display'])
)
qb_pfr.head(3)

Unnamed: 0,pfr_id,season,qbr_total,qb_plays,pass_val,run_val,ybc_att,yac_att
23,AlleJo02,2019,49.4,639,21.1,10.4,2.4,2.2
60,AlleJo02,2020,76.6,729,112.1,13.0,2.8,1.3
125,AlleJo02,2021,66.3,828,80.9,29.5,3.9,2.4


#### Calc exponentially weighted mean

We want to give recent seasons more weight, then do the same for recent games when we get there for '22

qb_plays is there to mark volume, can be a useful flag perhaps if a backup is in

In [10]:
# Every column except the key, the volume counter and the season is averaged
mean_cols = [col for col in qb_pfr.columns if col not in ('pfr_id','qb_plays','season')]


def _latest_ewm(series):
    """Return the last value of the alpha=0.1 exponentially weighted mean of ``series``."""
    return series.ewm(alpha=.1).mean().iloc[-1]


# Per-column aggregation: weighted mean for the stat columns, plain sum for play volume
agg_spec = {col: _latest_ewm for col in mean_cols}
agg_spec['qb_plays'] = 'sum'

# One row per QB; rows were sorted by season above, so the last value is the
# mean as of the most recent season
qb_21 = qb_pfr.groupby(['pfr_id']).agg(agg_spec).reset_index()

Inspect the aggregated frame (compare with the per-season rows shown above, before the means):

In [11]:
# One row per QB: weighted means of the stat columns plus summed qb_plays
qb_21.head(3)

Unnamed: 0,pfr_id,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays
0,AlleJo02,64.669373,73.387823,18.311439,3.086347,1.974908,2196
1,AlleKy00,54.036842,8.721053,1.752632,2.184211,1.378947,696
2,BeatC.00,92.3,0.7,0.0,2.0,0.0,4


# Get weekly 2022 data

We repeat the process on a weekly level for 2022 now, starting with QBR:

In [12]:
# Weekly QBR for 2022
qb_v1_22 = nfl.import_qbr([2022], frequency='weekly').reset_index(drop=True)

# Regular-season rows only, with player_id cast to a plain string for the id merge
qbr_22_pre = (
    qb_v1_22
    .loc[qb_v1_22['season_type'].eq('Regular')]
    .assign(player_id=lambda frame: frame['player_id'].astype(int).astype(str))
)
qbr_22_pre.head(1)

Unnamed: 0,season,season_type,game_id,game_week,week_text,team_abb,player_id,name_short,rank,qbr_total,...,name_last,name_display,headshot_href,team,opp_id,opp_abb,opp_team,opp_name,week_num,qualified
0,2022,Regular,401437653,1,Week 1,KC,3139477,P. Mahomes,1.0,94.9,...,Mahomes,Patrick Mahomes,https://a.espncdn.com/i/headshots/nfl/players/...,Chiefs,22,ARI,Arizona Cardinals,Cardinals,1,True


#### Get weekly '22 rush, pass PFR data

In [13]:
# Weekly PFR rushing data for 2022. No position filter is applied in this cell;
# the later left merge from the QB frame is what restricts it to QBs.
rp_22 = nfl.import_weekly_pfr('rush', [2022])

# Keep the id, the week and the two contact splits, renamed to match the
# seasonal frame's column names
pfr_rush_22 = rp_22[
    ['pfr_player_id','week','rushing_yards_before_contact_avg','rushing_yards_after_contact_avg']
].rename(columns={
    'rushing_yards_before_contact_avg':'ybc_att',
    'rushing_yards_after_contact_avg':'yac_att',
    'pfr_player_id':'pfr_id',
})
pfr_rush_22.head(2)

Unnamed: 0,pfr_id,week,ybc_att,yac_att
0,HendDa00,1,2.3,1.3
1,AkerCa00,1,0.0,0.0


#### Merge in '22 pfr Id's to qbr df, subset columns

In [14]:
# Attach PFR ids to the weekly 2022 QBR rows through the ESPN id (left join keeps every QBR row)
id_lookup_22 = id_df[['espn_id','pfr_id']].drop_duplicates()
qb_id_22 = qbr_22_pre.merge(id_lookup_22,how='left',left_on='player_id',right_on='espn_id')

# Keep only the columns used downstream and exclude the 'John Franklin' rows
# (excluded by the original analysis; the reason is not recorded here).
# Selecting with a single .loc and copying gives an independent frame, so
# assigning into qb_id_22 in later cells does not act on a chained-indexing slice.
qb_cols_22 = ['player_id','pfr_id','game_week','qbr_total','qb_plays','pass','run','name_display']
qb_id_22 = qb_id_22.loc[qb_id_22['name_display'] != 'John Franklin', qb_cols_22].copy()
qb_id_22.head(3)

Unnamed: 0,player_id,pfr_id,game_week,qbr_total,qb_plays,pass,run,name_display
0,3139477,MahoPa00,1,94.9,44,11.1,-1.4,Patrick Mahomes
1,3918298,AlleJo02,1,87.4,45,6.6,3.9,Josh Allen
2,4038941,HerbJu00,1,79.9,38,10.1,-1.4,Justin Herbert


### Merge in rush (weekly, '22)

Again we deal with 'nan' weirdness first:

In [15]:
# Flag missing PFR ids in both forms they can take after the left merge:
# the string 'nan' (empty id cast with astype(str)) and a real NaN (no lookup match)
nan_flag_22 = qb_id_22['pfr_id'].isna() | (qb_id_22['pfr_id'] == 'nan')

In [16]:
# Display name -> PFR id for the 2022 QBs whose id came through as missing
pfr_id_fixes_22 = {
    'Brett Rypien': 'RypiBr00',
    'John Wolford': 'WolfJo00',
    'Bryce Perkins': 'PerkBr02',
    'Chris Streveler': 'StreCh00',
}

# Only rows already flagged as missing are touched
for qb_name, pfr_code in pfr_id_fixes_22.items():
    qb_id_22.loc[nan_flag_22 & (qb_id_22['name_display'] == qb_name), 'pfr_id'] = pfr_code

We drop NA rows here: after the left merge, NAs are QB-weeks with no matching PFR rushing row, i.e. immobile QBs:

In [17]:
# Give the generically named QBR pass/run columns (maybe EPA?) explicit names
qb_id_22 = qb_id_22.rename(columns={'pass':'pass_val','run':'run_val'})

# Left-merge the weekly PFR rushing rows (QBR's game_week against PFR's week),
# drop every row that has any NA, sort ascending by week within each QB so the
# exponential mean sees the latest week last, then drop the helper columns
qb_pfr_22 = (
    qb_id_22
    .merge(pfr_rush_22, how='left', left_on=['pfr_id','game_week'], right_on=['pfr_id','week'])
    .dropna()
    .sort_values(by=['pfr_id','week'], ascending=True)
    .drop(columns=['player_id','name_display','game_week'])
)
qb_pfr_22.head(2)

Unnamed: 0,pfr_id,qbr_total,qb_plays,pass_val,run_val,week,ybc_att,yac_att
1,AlleJo02,87.4,45,6.6,3.9,1.0,3.6,2.0
34,AlleJo02,89.0,45,8.5,1.4,2.0,3.0,7.0


### Combine 2019-'21 aggregate with '22 weekly data, get rolling mean

We set week 0 as the start s.t. past data gets carried through

In [18]:
# Tag the 2019-21 aggregate as "week 0" so it sorts ahead of every 2022 week.
# qb_21 is modified in place, as before.
qb_21['week'] = 0

# Stack the week-0 rows on top of the weekly rows. Each input keeps its own
# integer index, so the stacked frame can contain repeated labels; resetting to
# a fresh 0..n-1 index after sorting keeps later index-aligned operations
# (e.g. a column-wise concat) unambiguous.
qb_pfr_full = (
    pd.concat([qb_21[qb_pfr_22.columns], qb_pfr_22], axis=0)
    .sort_values(['pfr_id','week'])
    .reset_index(drop=True)
)

# week is float in the weekly frame (it displays as 1.0, 2.0); make it an integer
qb_pfr_full['week'] = qb_pfr_full['week'].astype(int)
qb_pfr_full.head(3)

Unnamed: 0,pfr_id,qbr_total,qb_plays,pass_val,run_val,week,ybc_att,yac_att
0,AlleJo02,64.669373,2196,73.387823,18.311439,0,3.086347,1.974908
1,AlleJo02,87.4,45,6.6,3.9,1,3.6,2.0
34,AlleJo02,89.0,45,8.5,1.4,2,3.0,7.0


We perform weighted means now that we have the 2019-21 info to carry throughout our data:

In [19]:
# Stat columns to average: everything except the key, the volume counter and the week
mean_cols_full = [col for col in qb_pfr_full.columns if col not in ('pfr_id','qb_plays','week')]

# Running values within each QB (rows are already sorted by week):
#   * stat columns -> alpha=0.1 exponentially weighted mean up to and including each row
#   * qb_plays     -> cumulative sum
# NOTE(review): the week-0 pass_val/run_val values are on a much larger scale
# than the single-week values that follow (e.g. 73.4 vs 6.6 for the first QB
# shown) - confirm that averaging them together is intended.
by_qb = qb_pfr_full.groupby(['pfr_id'])
ewm_full = by_qb[mean_cols_full].transform(lambda series: series.ewm(alpha=.1).mean())
cs_full = by_qb['qb_plays'].cumsum()

# Put the keys, the running means and the running play count side by side
full_agged = pd.concat([qb_pfr_full[['pfr_id','week']], ewm_full, cs_full], axis=1)
full_agged.head(3)

Unnamed: 0,pfr_id,week,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays
0,AlleJo02,0,64.669373,73.387823,18.311439,3.086347,1.974908,2196
1,AlleJo02,1,76.632861,38.236337,10.726471,3.356691,1.988114,2241
34,AlleJo02,2,81.196381,27.263519,7.284969,3.22507,3.837519,2286


### Use cartesian product to make sure each qb has data for all weeks, then forward fill:

In [20]:
def cartesian_product(d):
    """Return a DataFrame holding every combination of the values in ``d``.

    ``d`` maps a column name to an iterable of values. The result has one
    column per key (in key order) and one row per combination, with the
    last key varying fastest.
    """
    column_names = list(d.keys())
    value_lists = list(d.values())
    grid = pd.MultiIndex.from_product(value_lists, names=column_names)
    return grid.to_frame(index=False)

In [21]:
# Build a (QB, week) grid for every QB seen in 2022 and weeks 0-9 (hard-coded),
# then left-merge the running stats onto it; QB-weeks without a row stay NA
qb_ids_22 = list(qb_pfr_22['pfr_id'].unique())
week_grid = list(range(10))
ci = cartesian_product({'pfr_id': qb_ids_22, 'week': week_grid})
ci_merged = ci.merge(full_agged, on=['pfr_id','week'], how='left')
ci.head(2)

Unnamed: 0,pfr_id,week
0,AlleJo02,0
1,AlleJo02,1


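Finally, we increment week by one, so that the stats accumulated through a given week become the features for the following week (week 0, i.e. the 2019-21 means, is used for week 1)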

In [22]:
# Forward fill within each QB so weeks without a row carry the last known values.
# groupby(...).ffill() leaves out the grouping column, so pfr_id is re-attached;
# it is taken from ci_merged itself (same rows, same index) rather than from a
# separate frame that is merely assumed to line up row for row.
filled = ci_merged.groupby('pfr_id').ffill()
df_op = pd.concat([ci_merged['pfr_id'], filled], axis=1).reset_index(drop=True)

# Shift by one week: the stats known after week N are the features for week N+1
df_op['week'] += 1
df_op.head(9)

Unnamed: 0,pfr_id,week,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays
0,AlleJo02,1,64.669373,73.387823,18.311439,3.086347,1.974908,2196.0
1,AlleJo02,2,76.632861,38.236337,10.726471,3.356691,1.988114,2241.0
2,AlleJo02,3,81.196381,27.263519,7.284969,3.22507,3.837519,2286.0
3,AlleJo02,4,73.986035,21.196779,6.446071,3.624876,3.099653,2362.0
4,AlleJo02,5,75.03948,16.679971,6.06853,3.301348,3.343932,2409.0
5,AlleJo02,6,77.890886,14.20857,5.072171,4.1334,2.886374,2447.0
6,AlleJo02,7,76.090842,12.769328,4.176609,3.705302,2.486458,2500.0
7,AlleJo02,8,76.090842,12.769328,4.176609,3.705302,2.486458,2500.0
8,AlleJo02,9,77.49711,10.966214,4.110483,4.336468,2.190345,2535.0


### Fix missing ID's

Some rookies are missing IDs, so we use a dict to locate and fill them:

In [23]:
# 2022 seasonal rosters; used below for its player_name, pfr_id and gsis_it_id columns
isr = nfl.import_seasonal_rosters([2022])

Define rookie name-pfr id mappings

In [24]:
# Roster display name -> PFR id for the players whose id is filled in by hand
qbd = {
    'Kyle Allen': 'AlleKy00',
    'David Blough': 'BlouDa00',
    'Sam Howell': 'HoweSa00',
    'Tyler Huntley': 'HuntTy01',
    'Bryce Perkins': 'PerkBr02',
    'Kenny Pickett': 'PickKe00',
    'Brock Purdy': 'PurdBr00',
    'Desmond Ridder': 'RiddDe00',
    'Cooper Rush': 'RushCo00',
    'Brett Rypien': 'RypiBr00',
    'Chris Streveler': 'StreCh00',
    'Skylar Thompson': 'ThomSk00',
    'Malik Willis': 'WillMa12',
    'John Wolford': 'WolfJo00',
    'Bailey Zappe': 'ZappBa00',
}

# The two parallel lists, kept under their original names
names_missing = list(qbd.keys())
ids_missing = list(qbd.values())

In [25]:
# Overwrite pfr_id in the roster table for every player named in qbd
for qb_name, pfr_code in qbd.items():
    isr.loc[isr['player_name'].eq(qb_name), 'pfr_id'] = pfr_code

In [26]:
# Distinct (pfr_id, gsis_it_id) pairs from the roster table, used to attach gsis_it_id to the QB frame
main_ids = isr[['pfr_id','gsis_it_id']].drop_duplicates()

In [27]:
# Attach gsis_it_id to every QB-week row; the left join keeps rows with no roster match
qbs_out = pd.merge(df_op, main_ids, on='pfr_id', how='left')
qbs_out.head(3)

Unnamed: 0,pfr_id,week,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays,gsis_it_id
0,AlleJo02,1,64.669373,73.387823,18.311439,3.086347,1.974908,2196.0,46076
1,AlleJo02,2,76.632861,38.236337,10.726471,3.356691,1.988114,2241.0,46076
2,AlleJo02,3,81.196381,27.263519,7.284969,3.22507,3.837519,2286.0,46076


# Fill rookie, other missing stats here

Rookies, and exceptions such as retreads like Trevor Siemian (no data in '19-'21), are filled with the median, and their career snaps are set to 0

In [28]:
# Median of each stat column over the 2019-21 per-QB aggregates, used to fill
# QBs with no history. Columns are picked by name (everything except the keys,
# the play counter and the gsis id), so the result does not depend on column
# position or on which helper columns qb_21 happens to carry.
stat_cols = [col for col in qbs_out.columns if col not in ('pfr_id','week','qb_plays','gsis_it_id')]
fnd = qb_21[stat_cols].median().to_dict()

# QBs that still have missing stats after the forward fill
qbs_out[qbs_out.ybc_att.isna()].pfr_id.unique()

array(['BlouDa00', 'DobbJo00', 'EhliSa00', 'HoweSa00', 'HuntTy01',
       'LancTr00', 'PerkBr02', 'PeteNa00', 'PickKe00', 'PurdBr00',
       'RiddDe00', 'RushCo00', 'RypiBr00', 'SiemTr00', 'SmitGe00',
       'StidJa00', 'StreCh00', 'ThomSk00', 'WalkPh00', 'WebbDa00',
       'WhitMi01', 'WillMa12', 'ZappBa00'], dtype=object)

In [29]:
# Fill missing stat columns with the medians in fnd and missing career snaps
# (qb_plays) with 0. fnd is unpacked last so that its values take precedence
# for any column it names.
qbs_out = qbs_out.fillna({'qb_plays': 0, **fnd})

In [30]:
# Cast the gsis id to a plain integer-looking string so it can be matched against nflId later.
# NOTE(review): the int cast will fail if any gsis_it_id is still missing
# (a QB with no roster match) - confirm none remain at this point.
qbs_out['gsis_it_id'] = qbs_out['gsis_it_id'].astype(int).astype(str)
qbs_out.head(2)

Unnamed: 0,pfr_id,week,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays,gsis_it_id
0,AlleJo02,1,64.669373,73.387823,18.311439,3.086347,1.974908,2196.0,46076
1,AlleJo02,2,76.632861,38.236337,10.726471,3.356691,1.988114,2241.0,46076


# Integrate to play level

In [31]:
# Read only the columns that are used, instead of loading each file in full
# and discarding the rest. The trailing [cols] keeps the original column order.
play_cols = ['gameId','playId','nflId']
df_player_play = pd.read_csv('data/player_play.csv', usecols=play_cols)[play_cols]

game_cols = ['gameId','week']
df_games = pd.read_csv('data/games.csv', usecols=game_cols)[game_cols]

# One row per (game, play, player) with the game's week attached
df_weeks = df_player_play.merge(df_games,how='left',on='gameId').drop_duplicates()

In [32]:
# Player-play rows with the week of their game attached
df_weeks.head(2)

Unnamed: 0,gameId,playId,nflId,week
0,2022090800,56,35472,1
1,2022090800,56,42392,1


Fix some ID typing issues:

In [33]:
# Cast the merge keys on both frames to strings so they match in the next merge:
# nflId is compared with gsis_it_id (already a string), and week with week.
df_weeks['nflId'] = df_weeks['nflId'].astype(int).astype(str)
df_weeks['week'] = df_weeks['week'].astype(int).astype(str)
qbs_out['week'] = qbs_out['week'].astype(int).astype(str)

In [34]:
# Attach each QB's weekly features to the plays that QB appears on, matching
# nflId to gsis_it_id within the same week. The inner join keeps only
# player-play rows that match a QB row.
# NOTE: this rebinds df_op, which earlier held the forward-filled per-QB weekly
# table; re-running an earlier cell that reads df_op after this one will see
# the play-level frame instead.
df_op= df_weeks.merge(qbs_out,how='inner',left_on=['nflId','week'],right_on=['gsis_it_id','week'])

NA's are non-QBs, so we inner merge to keep only QB rows; we also drop nflId further below, since the output is at game-play level

In [35]:
# Number of rows that repeat an earlier (gameId, playId) pair, i.e. plays with more than one matched QB
df_op[['gameId','playId']].duplicated().sum()

2

Malik Willis is used by Titans as a decoy for a couple of plays, so we excise them:

In [36]:
# Show every row involved in a repeated (gameId, playId) pair (keep=False marks all copies)
df_op[df_op[['gameId','playId']].duplicated(keep=False)]

Unnamed: 0,gameId,playId,nflId,week,pfr_id,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays,gsis_it_id
11759,2022102305,430,38538,7,TannRy00,44.739061,8.081103,2.190302,1.523148,0.568836,1814.0,38538
11790,2022102305,2879,38538,7,TannRy00,44.739061,8.081103,2.190302,1.523148,0.568836,1814.0,38538
11862,2022102305,430,54551,7,WillMa12,57.5,32.661255,3.157565,2.756089,0.7,0.0,54551
11864,2022102305,2879,54551,7,WillMa12,57.5,32.661255,3.157565,2.756089,0.7,0.0,54551


In [37]:
# Plays on which more than one QB row matched
multi_qb_play = df_op.duplicated(subset=['gameId','playId'], keep=False)

# Remove Malik Willis's row only on those multi-QB plays in game 2022102305.
# Filtering on game + player alone would also discard any play in that game
# where he is the only matched QB, which goes beyond the stated intent.
willis_decoy = multi_qb_play & (df_op['gameId'] == 2022102305) & (df_op['pfr_id'] == 'WillMa12')
df_out = df_op[~willis_decoy].drop(columns=['pfr_id','week','nflId','gsis_it_id'])

# The output is meant to be one row per play; fail loudly if that no longer holds
assert not df_out.duplicated(subset=['gameId','playId']).any(), "more than one QB row remains for some play"

In [38]:
# Final play-level frame: gameId, playId and the QB feature columns
df_out.head(3)

Unnamed: 0,gameId,playId,qbr_total,pass_val,run_val,ybc_att,yac_att,qb_plays
0,2022090800,56,64.669373,73.387823,18.311439,3.086347,1.974908,2196.0
1,2022090800,80,64.669373,73.387823,18.311439,3.086347,1.974908,2196.0
2,2022090800,101,64.669373,73.387823,18.311439,3.086347,1.974908,2196.0


In [39]:
# Base directory for the output file: the notebook's current working directory.
# os is imported with the other imports at the top of the notebook.
root_dir = os.getcwd()

In [41]:
# Write the play-level QB features to data/qbr_data.csv under root_dir, without the index column
df_out.to_csv(os.path.join(root_dir,'data/qbr_data.csv'),index=False)