In [1]:
import pandas as pd

In [2]:
# Read every season's play-by-play file first and concatenate once at the end.
# Concatenating inside the loop re-copies all previously loaded rows on every
# iteration, which is quadratic in the total number of rows.
season_frames = [
    pd.read_csv(f'pbp_data/play_by_play_{year}.csv.gz', compression='gzip', low_memory=False)
    for year in range(1999, 2020)  # seasons 1999 through 2019 inclusive
]
pbp = pd.concat(season_frames)

In [3]:
def year_to_year_corr(columns, groupby, data):
    """Correlate each metric with the metrics of the following season.

    Rows are ordered by ``season`` within each ``groupby`` value and every row
    is paired with the next row of the same group. The last row of each group
    has no successor and is dropped.

    Parameters
    ----------
    columns : list of str
        The data columns for which to find correlations and year-over-year
        stability.
    groupby : str
        The (single) column to group by.
    data : pandas.DataFrame
        Must contain every column in ``columns`` as well as the ``groupby``
        column and a ``'season'`` column. It is not modified.

    Returns
    -------
    corr : pandas.DataFrame
        Pearson correlations among the following-season values of ``columns``
        (rows and columns are both labelled by ``columns``).
    pred : pandas.DataFrame
        Pearson correlations between previous-season values (rows, labelled
        ``prev_<col>``) and following-season values (columns, labelled
        ``<col>``).

    Notes
    -----
    Adjacent rows within a group are treated as consecutive seasons; a gap in
    a group's seasons is not detected.
    """
    columns = list(columns)
    prev_columns = [f'prev_{col}' for col in columns]

    # Sort explicitly so that shift(-1) really yields the *next* season for
    # each group instead of relying on the caller's row order.
    ordered = data[[groupby, 'season'] + columns].sort_values([groupby, 'season'])

    # Following-season values, aligned on the index of the current row.
    following = ordered.groupby(by=groupby)[['season'] + columns].shift(-1)

    # Current-row values become the "previous season" side of each pair. The
    # grouping column is left out on purpose: a string key makes .corr() raise
    # on recent pandas, and a numeric key would be correlated as if it were a
    # metric.
    previous = ordered[columns].rename(columns=dict(zip(columns, prev_columns)))

    paired = (
        pd.concat((previous, following), axis=1)
        .dropna(subset=['season'])  # rows without a following season
        .drop(columns=['season'])
    )
    tot_corr = paired.corr(method='pearson')

    # Select by label so the result does not depend on column positions.
    corr = tot_corr.loc[columns, columns]
    pred = tot_corr.loc[prev_columns, columns]
    return corr, pred

In [16]:
# Apply the regular-season / play == 1 filter once instead of re-scanning the
# full play-by-play frame for every down.
reg_plays = pbp.loc[(pbp.season_type == 'REG') & (pbp.play == 1)]


def _epa_by_team_season(plays):
    """Return mean ``epa`` and count of ``play_id`` per (season, posteam)."""
    return plays.groupby(['season', 'posteam']).agg({'epa': 'mean', 'play_id': 'count'})


first_down = _epa_by_team_season(reg_plays.loc[reg_plays.down == 1])
second_down = _epa_by_team_season(reg_plays.loc[reg_plays.down == 2])
third_down = _epa_by_team_season(reg_plays.loc[reg_plays.down == 3])
fourth_down = _epa_by_team_season(reg_plays.loc[reg_plays.down == 4])
overall = _epa_by_team_season(reg_plays)

# The first-down summary provides the (season, posteam) index; every other
# summary is aligned to it on assignment.
epa = first_down.rename(columns={'epa': 'first_down_epa', 'play_id': 'first_down_count'})
for label, summary in (
    ('second_down', second_down),
    ('third_down', third_down),
    ('fourth_down', fourth_down),
    ('overall', overall),
):
    # Assign Series (not one-column DataFrames) so alignment is unambiguous.
    epa[f'{label}_epa'] = summary['epa']
    epa[f'{label}_count'] = summary['play_id']
epa = epa.reset_index()

In [19]:
# Average number of plays per team-season, by down and overall.
mean_count_lines = [
    f'first down: {epa.first_down_count.mean()}',
    f'second down: {epa.second_down_count.mean()}',
    f'third down: {epa.third_down_count.mean()}',
    f'fourth down: {epa.fourth_down_count.mean()}',
    f'overall: {epa.overall_count.mean()}',
]
for line in mean_count_lines:
    print(line)

first down: 466.49775784753365
second down: 354.0732436472347
third down: 230.42600896860986
fourth down: 21.315396113602393
overall: 1075.3961136023916


In [18]:
# EPA metrics whose season-to-season relationship is examined.
epa_metrics = [
    'first_down_epa',
    'second_down_epa',
    'third_down_epa',
    'fourth_down_epa',
    'overall_epa',
]
corr, pred = year_to_year_corr(epa_metrics, 'posteam', epa)
# Squared Pearson correlation between previous-season and following-season values.
pred**2

Unnamed: 0,first_down_epa,second_down_epa,third_down_epa,fourth_down_epa,overall_epa
prev_first_down_epa,0.117478,0.086439,0.095032,0.006967,0.147397
prev_second_down_epa,0.077844,0.097381,0.102041,0.003631,0.134442
prev_third_down_epa,0.096377,0.114267,0.130856,0.003073,0.165028
prev_fourth_down_epa,0.008187,0.009038,0.018936,0.002688,0.017541
prev_overall_epa,0.142512,0.147629,0.165304,0.006618,0.222142
