In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_colwidth', None)

In [2]:
# Load every season's play-by-play file (2006 through 2021) and stack them
# into a single frame.
# ignore_index=True gives the combined frame one unique RangeIndex; without
# it each file contributes its own 0..n row labels, so labels repeat across
# seasons and any later label-based lookup or alignment can match rows from
# several seasons at once.
pbp = pd.concat(
    [
        pd.read_csv(f'pbp_data/play_by_play_{season}.csv.gz', low_memory=False)
        for season in range(2006, 2022)
    ],
    ignore_index=True,
)

In [3]:
# Keep only the rows whose `pass` flag equals 1. The copy makes the result an
# independent frame, so the columns added to it later are not written to a
# slice of the full play-by-play data.
pbp = pbp[pbp['pass'].eq(1)].copy()

In [4]:
# Alternative filter that would also include designed QB runs (I decided not to use it):
# rusher_pass_atts = pbp.groupby(['season','passer_id'], as_index=False)\
#     .agg({'pass_attempt':'sum'})\
#     .rename(columns={'passer_id':'rusher_id', 'pass_attempt':'rusher_passes'})
# pbp = pbp.loc[(pbp.play_type.isin(['pass','run','no_play']))&(pbp.qb_spike!=1)&(pbp.qb_kneel!=1)].copy()\
#     .merge(rusher_pass_atts,on=['season','rusher_id'], how='left')
# pbp = pbp.loc[(pbp['rusher_id'].isnull())|(pbp['rusher_passes']>10)]

In [5]:
# Per-play EPA components. Each column holds a play's EPA when the play falls
# in that category and 0 otherwise, so summing a column over a game gives the
# total EPA attributed to that category.
is_completion = pbp['complete_pass'] == 1
air_epa_filled = pbp['air_epa'].fillna(0)
xyac_epa_filled = pbp['xyac_epa'].fillna(0)
yac_epa_filled = pbp['yac_epa'].fillna(0)

# Outcomes credited with the play's full qb_epa.
pbp['scramble_epa'] = np.where(pbp['qb_scramble'] == 1, pbp['qb_epa'], 0)
pbp['sack_epa'] = np.where(pbp['sack'] == 1, pbp['qb_epa'], 0)
pbp['incompletion_epa'] = np.where(pbp['incomplete_pass'] == 1, pbp['qb_epa'], 0)
pbp['int_epa'] = np.where(pbp['interception'] == 1, pbp['qb_epa'], 0)

# Completions: expected value (air + expected YAC) and realised value.
# Note the realised value reads `epa`, not `qb_epa`.
pbp['completionx_epa'] = np.where(is_completion, air_epa_filled + xyac_epa_filled, 0)
pbp['completion_epa'] = np.where(is_completion, pbp['epa'], 0)

# YAC over expected on completions, excluding plays where fumble_lost is 1.
pbp['yacoe_epa'] = np.where(
    is_completion & (pbp['fumble_lost'] != 1),
    yac_epa_filled - xyac_epa_filled,
    0,
)

# Penalty plays get qb_epa only when every category column below is exactly 0
# for that play, so a play's EPA is not credited to a penalty as well.
non_penalty_columns = [
    'scramble_epa',
    'sack_epa',
    'incompletion_epa',
    'int_epa',
    'completion_epa',
    'yacoe_epa',
]
no_other_credit = (pbp[non_penalty_columns] == 0).all(axis=1)
pbp['penalty_epa'] = np.where(no_other_credit & (pbp['penalty'] == 1), pbp['qb_epa'], 0)

# Completion-only breakdown of air, expected-YAC and actual-YAC EPA.
pbp['completed_air_epa'] = np.where(is_completion, air_epa_filled, 0)
pbp['completed_xyac_epa'] = np.where(is_completion, xyac_epa_filled, 0)
pbp['completed_yac_epa'] = np.where(is_completion, yac_epa_filled, 0)

# air_epa is not NaN-filled here, so aggression is NaN wherever air_epa is
# missing (only xyac_epa falls back to 0).
pbp['aggression'] = pbp['air_epa'] + xyac_epa_filled

# Component prefixes; each has a matching `<prefix>_epa` column created above.
components = [
    'scramble',
    'sack',
    'incompletion',
    'int',
    'completionx',
    'completion',
    'completed_air',
    'completed_xyac',
    'completed_yac',
    'yacoe',
    'penalty',
]

def _most_common_id(ids):
    """Return one most-frequent non-null value of `ids`, or NaN if there is none.

    `Series.mode` returns every tied value, and an empty result for an
    all-null group. Used directly as an aggregator that can leave arrays in
    the `passer_id` column, which then cannot serve as a groupby key. Taking
    the first mode guarantees one scalar per group.
    """
    modes = ids.mode()
    return modes.iloc[0] if len(modes) else np.nan


# One passer_id per (season, team, name) combination.
passer_ids = pbp.groupby(['season','posteam','name'], as_index=False).agg({'passer_id': _most_common_id})

# Per-game aggregation for each (season, team, name, game): mean week and
# qb_epa, number of plays (count of play_id), first season_type, mean
# aggression, and the summed EPA of every component.
game_aggregations = {
    'week': 'mean',
    'qb_epa': 'mean',
    'play_id': 'count',
    'season_type': 'first',
    'aggression': 'mean',
    **{f'{component}_epa': 'sum' for component in components},
}

games = (
    pbp.groupby(['season','posteam','name','game_id'])
    .agg(game_aggregations)
    .query('play_id>10')  # keep only passer-games with more than 10 plays
    .reset_index()
    .merge(passer_ids, on=['season','posteam','name'])
    .rename(columns={'qb_epa':'epa'})
)

# week came out of a mean aggregation as a float; store it as an integer.
games['week'] = games['week'].astype(int)

# Number of qualifying games per passer_id across every season in the frame
# (the grouping does not include season).
games['games_played'] = games.groupby(['passer_id']).game_id.transform('count')

In [6]:
# For every component, derive four per-game columns:
#   <c>_value           component EPA per play
#   <c>_vaa             value minus the mean value over games by passers
#                       with at least 16 games in the table
#   weighted_<c>_value  value scaled back up by the game's play count
#   weighted_<c>_vaa    vaa scaled by the game's play count
baseline_games = games['games_played'] >= 16
plays_per_game = games['play_id']

for component in components:
    per_play_value = games[f'{component}_epa'] / plays_per_game
    baseline_value = per_play_value[baseline_games].mean()
    value_above_baseline = per_play_value - baseline_value

    games[f'{component}_value'] = per_play_value
    games[f'{component}_vaa'] = value_above_baseline
    games[f'weighted_{component}_value'] = per_play_value * plays_per_game
    games[f'weighted_{component}_vaa'] = value_above_baseline * plays_per_game

# Overall EPA per play scaled by play count, i.e. the game's total.
games['weighted_epa'] = games['epa'] * plays_per_game

In [7]:
# Write the per-game component table to disk. The frame's index is written
# too, as the first (unnamed) column.
games.to_csv(path_or_buf='component_vaa_games_no_rushes.csv')