In [16]:
import pandas as pd
import numpy as np
# Widen the notebook display limits so long listings and wide frames are not elided.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 300

In [89]:
# Read one gzip-compressed play-by-play CSV per season (1999 through 2019),
# tag every row with the season it came from, and combine the frames once.
# Concatenating a single time avoids re-copying all previously loaded rows on
# every loop iteration, and ignore_index=True yields the same fresh 0..n-1
# index that reset_index(drop=True) produced.
season_frames = [
    pd.read_csv(
        f'pbp_data/play_by_play_{year}.csv.gz',
        compression='gzip',
        low_memory=False,
    ).assign(year=year)
    for year in range(1999, 2020)
]
data = pd.concat(season_frames, ignore_index=True)

In [19]:
# Alphabetical listing of every column available in the combined frame.
sorted(data.columns)

['Unnamed: 0',
 'air_epa',
 'air_wpa',
 'air_yards',
 'assist_tackle',
 'assist_tackle_1_player_id',
 'assist_tackle_1_player_name',
 'assist_tackle_1_team',
 'assist_tackle_2_player_id',
 'assist_tackle_2_player_name',
 'assist_tackle_2_team',
 'assist_tackle_3_player_id',
 'assist_tackle_3_player_name',
 'assist_tackle_3_team',
 'assist_tackle_4_player_id',
 'assist_tackle_4_player_name',
 'assist_tackle_4_team',
 'away_coach',
 'away_score',
 'away_team',
 'away_timeouts_remaining',
 'away_wp',
 'away_wp_post',
 'blocked_player_id',
 'blocked_player_name',
 'comp_air_epa',
 'comp_air_wpa',
 'comp_yac_epa',
 'comp_yac_wpa',
 'complete_pass',
 'cp',
 'cpoe',
 'def_wp',
 'defensive_extra_point_attempt',
 'defensive_extra_point_conv',
 'defensive_two_point_attempt',
 'defensive_two_point_conv',
 'defteam',
 'defteam_score',
 'defteam_score_post',
 'defteam_timeouts_remaining',
 'desc',
 'div_game',
 'down',
 'drive',
 'drive_end_transition',
 'drive_end_yard_line',
 'drive_ended_with_sc

In [97]:
def base_pass_rate(year, team, plays=None):
    """Return the share of a team-season's qualifying plays flagged as passes.

    A play qualifies when all of the following hold:
      * ``year`` and ``posteam`` match the arguments,
      * ``wp`` lies in [0.4, 0.6] (presumably win probability -- confirm),
      * ``down`` is 1 or 2,
      * ``half_seconds_remaining`` is greater than 120,
      * ``play_type`` is ``'run'`` or ``'pass'``.

    Parameters
    ----------
    year : int
        Value matched against the ``year`` column.
    team : str
        Value matched against the ``posteam`` column.
    plays : pandas.DataFrame, optional
        Play-by-play frame to read. Defaults to the notebook-level ``data``.

    Returns
    -------
    float
        Qualifying plays with ``pass == 1`` divided by all qualifying plays,
        or NaN when no play qualifies (previously a ZeroDivisionError).
    """
    if plays is None:
        plays = data

    qualifies = (
        (plays.year == year)
        & (plays.posteam == team)
        & plays.wp.between(.4, .6)  # inclusive on both ends, like >= and <=
        & plays.down.isin([1, 2])
        & (plays.half_seconds_remaining > 120)
        & plays.play_type.isin(['run', 'pass'])
    )
    team_season = plays.loc[qualifies]

    if team_season.empty:
        # No qualifying plays: the rate is undefined rather than an error.
        return np.nan

    pass_count = int((team_season['pass'] == 1).sum())
    return pass_count / len(team_season)

def get_top_passer(year, team, plays=None):
    """Return the ``passer`` with the most recorded plays for a team-season.

    Plays are restricted to rows whose ``year`` and ``posteam`` match the
    arguments; for each ``passer`` the non-null ``pass_attempt`` entries are
    counted and the name with the largest count is returned.

    NOTE(review): ``'count'`` tallies rows, not the sum of ``pass_attempt``;
    if that column is a 0/1 flag, ``'sum'`` may be what was intended -- confirm.

    Parameters
    ----------
    year : int
        Value matched against the ``year`` column.
    team : str
        Value matched against the ``posteam`` column.
    plays : pandas.DataFrame, optional
        Play-by-play frame to read. Defaults to the notebook-level ``data``.

    Returns
    -------
    str
        The leading passer, or ``'--'`` when the team-season has no rows with
        a passer (the same placeholder ``get_coach`` and ``prev_yr_qb`` use;
        previously this case raised IndexError). Ties resolve to the
        alphabetically first name.
    """
    if plays is None:
        plays = data

    team_season = plays.loc[(plays.year == year) & (plays.posteam == team)]
    # groupby drops rows whose passer is missing, so only named passers remain.
    plays_per_passer = team_season.groupby('passer')['pass_attempt'].count()

    if plays_per_passer.empty:
        return '--'
    return plays_per_passer.idxmax()

def get_coach(year, team, plays=None):
    """Return the coach credited with the most plays for a team-season.

    Plays are restricted to rows whose ``year`` and ``posteam`` match the
    arguments. For each play the team's own coach is taken from
    ``away_coach`` when ``away_team`` equals ``team`` and from ``home_coach``
    otherwise. The earlier version always read ``home_coach``, which on road
    games tallies the opposing coach (assumes ``home_coach``/``away_coach``
    name the host/visiting side's coach -- confirm against the data dictionary).

    Parameters
    ----------
    year : int
        Value matched against the ``year`` column.
    team : str
        Value matched against the ``posteam`` column.
    plays : pandas.DataFrame, optional
        Play-by-play frame to read. Defaults to the notebook-level ``data``.

    Returns
    -------
    str
        The coach with the most non-null ``pass_attempt`` rows, or ``'--'``
        when the team-season has no plays or no coach is recorded. Ties
        resolve to the alphabetically first name.
    """
    if plays is None:
        plays = data

    team_season = plays.loc[(plays.year == year) & (plays.posteam == team)]
    if team_season.empty:
        return '--'

    # Keep home_coach for home games; swap in away_coach when the team visits.
    own_coach = team_season['home_coach'].where(
        team_season['away_team'] != team,
        team_season['away_coach'],
    )
    # Group positionally by the derived coach; missing coach names are dropped.
    plays_per_coach = team_season['pass_attempt'].groupby(own_coach.values).count()

    if plays_per_coach.empty:
        return '--'
    return plays_per_coach.idxmax()
    
def _prev_season_value(year, team, column, default, seasons=None):
    """Look up ``column`` for ``team`` in season ``year - 1``; ``default`` if absent."""
    if seasons is None:
        seasons = team_seasons
    prior_rows = seasons.loc[
        (seasons.year == (year - 1)) & (seasons.posteam == team),
        column,
    ]
    if prior_rows.empty:
        return default
    # Only the first matching row is used, as before.
    return prior_rows.iloc[0]


def prev_yr_pass_rate(year, team, seasons=None):
    """Return the team's ``pass_rate`` from the previous season.

    ``seasons`` defaults to the notebook-level ``team_seasons`` frame.
    Returns NaN when there is no row for ``(year - 1, team)``.
    """
    return _prev_season_value(year, team, 'pass_rate', np.nan, seasons)


def prev_yr_qb(year, team, seasons=None):
    """Return the team's ``qb`` from the previous season.

    ``seasons`` defaults to the notebook-level ``team_seasons`` frame.
    Returns ``'--'`` when there is no row for ``(year - 1, team)``.
    """
    return _prev_season_value(year, team, 'qb', '--', seasons)


def prev_yr_coach(year, team, seasons=None):
    """Return the team's ``coach`` from the previous season.

    ``seasons`` defaults to the notebook-level ``team_seasons`` frame.
    Returns ``'--'`` when there is no row for ``(year - 1, team)``.
    """
    return _prev_season_value(year, team, 'coach', '--', seasons)

In [98]:
# One row per (year, posteam) pair carrying the mean of `epa` over that pair's plays.
team_seasons = (
    data
    .groupby(['year', 'posteam'], as_index=False)[['epa']]
    .mean()
)

In [99]:
# Evaluate base_pass_rate once for every (year, posteam) row.
team_seasons['pass_rate'] = [
    base_pass_rate(season, club)
    for season, club in zip(team_seasons['year'], team_seasons['posteam'])
]

In [100]:
# Evaluate get_top_passer once for every (year, posteam) row.
team_seasons['qb'] = [
    get_top_passer(season, club)
    for season, club in zip(team_seasons['year'], team_seasons['posteam'])
]

In [103]:
# Evaluate get_coach once for every (year, posteam) row.
team_seasons['coach'] = [
    get_coach(season, club)
    for season, club in zip(team_seasons['year'], team_seasons['posteam'])
]

In [104]:
# Attach the previous season's pass rate, quarterback and coach to every row.
# Each lookup reads columns that already exist on team_seasons, so the three
# assignments are independent of one another.
season_team_pairs = list(zip(team_seasons['year'], team_seasons['posteam']))
team_seasons['prev_yr_pass_rate'] = [prev_yr_pass_rate(season, club) for season, club in season_team_pairs]
team_seasons['prev_yr_qb'] = [prev_yr_qb(season, club) for season, club in season_team_pairs]
team_seasons['prev_yr_coach'] = [prev_yr_coach(season, club) for season, club in season_team_pairs]

In [119]:
# 1 when the value differs from the previous season's, 0 when it is the same.
# Rows whose previous-season value is the '--' placeholder compare as changed.
team_seasons['qb_change'] = team_seasons['prev_yr_qb'].ne(team_seasons['qb']).astype('int64')
team_seasons['coach_change'] = team_seasons['prev_yr_coach'].ne(team_seasons['coach']).astype('int64')

In [105]:
# Signed year-over-year difference in pass rate, and its magnitude.
team_seasons['rate_diff'] = team_seasons['pass_rate'].sub(team_seasons['prev_yr_pass_rate'])
team_seasons['rate_change'] = team_seasons['rate_diff'].abs()

In [157]:
# Put the columns in reading order, then show the 20 team-seasons with the
# largest absolute change in pass rate.
column_order = [
    'year', 'posteam', 'epa',
    'prev_yr_qb', 'qb',
    'prev_yr_coach', 'coach',
    'qb_change', 'coach_change',
    'prev_yr_pass_rate', 'pass_rate',
    'rate_diff', 'rate_change',
]
team_seasons = team_seasons.reindex(columns=column_order)
(
    team_seasons
    .sort_values('rate_change', ascending=False)
    .head(20)
    [['year', 'posteam', 'prev_yr_qb', 'qb', 'prev_yr_coach', 'coach', 'prev_yr_pass_rate', 'pass_rate']]
)

Unnamed: 0,year,posteam,prev_yr_qb,qb,prev_yr_coach,coach,prev_yr_pass_rate,pass_rate
460,2013,KC,M.Cassel,A.Smith,Romeo Crennel,Andy Reid,0.320856,0.616114
422,2012,DEN,T.Tebow,P.Manning,John Fox,John Fox,0.327014,0.6
143,2003,LV,R.Gannon,R.Gannon,Bill Callahan,Bill Callahan,0.711111,0.459184
231,2006,DET,J.Harrington,J.Kitna,Steve Mariucci,Rod Marinelli,0.448718,0.651934
390,2011,DEN,K.Orton,T.Tebow,Josh McDaniels,John Fox,0.528662,0.327014
631,2018,PIT,B.Roethlisberger,B.Roethlisberger,Mike Tomlin,Mike Tomlin,0.5,0.689474
584,2017,GB,A.Rodgers,B.Hundley,Mike McCarthy,Mike McCarthy,0.65,0.461538
632,2018,SEA,R.Wilson,R.Wilson,Pete Carroll,Pete Carroll,0.544601,0.357895
394,2011,IND,P.Manning,C.Painter,Jim Caldwell,Jim Caldwell,0.584795,0.401099
480,2014,BUF,E.Manuel,K.Orton,Doug Marrone,Doug Marrone,0.358621,0.538889


In [158]:
# Number of quarterback changes among the 20 largest pass-rate swings.
team_seasons.sort_values('rate_change', ascending=False).head(20)['qb_change'].sum()

15

In [159]:
# Number of coaching changes among the 20 largest pass-rate swings.
team_seasons.sort_values('rate_change', ascending=False).head(20)['coach_change'].sum()

5