In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
pbp = pd.DataFrame()
for year in range(2006, 2021):
    i_df = pd.read_csv(f'pbp_data/play_by_play_{year}.csv.gz', compression='gzip', low_memory=False)
    pbp = pd.concat([pbp, i_df])

In [3]:
plays = pbp.loc[(pd.notna(pbp.cp)) & (pd.notna(pbp.xyac_mean_yardage)) & (pd.notna(pbp.air_yards)) & (pbp.season_type=='REG')].copy()

In [4]:
plays['exp_air_yards'] = plays['air_yards']*plays['cp'] 
plays['exp_yac'] = plays['xyac_mean_yardage']*plays['cp']

In [5]:
plays['airYOE'] = np.where(plays['complete_pass']==1, plays['air_yards']-plays['exp_air_yards'], -1*plays['exp_air_yards'])
plays['yacOE'] = np.where(plays['complete_pass']==1, plays['yards_after_catch']-plays['exp_yac'], -1*plays['exp_yac'])
plays['recYOE'] = plays['airYOE']+plays['yacOE']

In [6]:
plays_2020 = plays.loc[plays.season==2020]

In [7]:
receivers = plays_2020.groupby(['posteam','receiver_id', 'receiver'], as_index=False)\
                      .agg({'airYOE':'sum', 'yacOE':'sum', 'recYOE':'sum', 'game_id':pd.Series.nunique})\
                      .drop(columns=['receiver_id'])
receivers['recYOE/G'] = receivers['recYOE']/receivers['game_id']
top_receivers = receivers.sort_values('recYOE', ascending=False)[:10]
top_receivers

Unnamed: 0,posteam,receiver,airYOE,yacOE,recYOE,game_id,recYOE/G
334,MIN,J.Jefferson,214.222996,98.705387,312.928383,16,19.558024
244,KC,T.Kelce,123.380133,182.090419,305.470552,15,20.364703
47,BUF,S.Diggs,233.685796,-8.218736,225.46706,16,14.091691
176,GB,D.Adams,62.296507,143.321778,205.618286,14,14.68702
449,SEA,D.Metcalf,163.676717,38.500307,202.177025,16,12.636064
198,HOU,W.Fuller,105.963819,81.676749,187.640568,10,18.764057
504,TEN,A.Brown,23.057764,164.215498,187.273262,14,13.376662
499,TEN,C.Davis,119.739853,63.083104,182.822958,14,13.058783
74,CAR,D.Moore,117.367158,56.421937,173.789094,15,11.58594
15,ATL,J.Jones,140.118125,33.033732,173.151858,9,19.239095


In [8]:
era_receivers = plays.groupby(['season', 'posteam','receiver_id', 'receiver'], as_index=False)\
                      .agg({'airYOE':'sum', 'yacOE':'sum', 'recYOE':'sum', 'game_id':pd.Series.nunique})\
                      .drop(columns=['receiver_id'])
era_receivers['recYOE/G'] = era_receivers['recYOE']/era_receivers['game_id']
era_receivers.sort_values('recYOE', ascending=False)[:10]

Unnamed: 0,season,posteam,receiver,airYOE,yacOE,recYOE,game_id,recYOE/G
2450,2011,GB,J.Nelson,308.055561,181.335867,489.391428,16,30.586964
3401,2013,DEN,D.Thomas,125.127149,298.096203,423.223352,16,26.45146
6513,2019,LAC,A.Ekeler,35.759144,379.22809,414.987233,16,25.936702
6186,2018,SF,G.Kittle,-1.361534,406.697683,405.336149,16,25.333509
2641,2011,NYG,V.Cruz,165.931694,238.322122,404.253816,16,25.265863
1478,2009,DAL,M.Austin,91.436119,306.812293,398.248412,16,24.890526
2605,2011,NE,W.Welker,178.77923,206.320059,385.099289,16,24.068706
2915,2012,DEN,D.Thomas,195.933069,177.220663,373.153732,16,23.322108
4221,2014,WAS,D.Jackson,217.389611,148.106855,365.496466,15,24.366431
2203,2010,PIT,M.Wallace,241.816537,122.707541,364.524078,16,22.782755


In [9]:
# columns: the data for which you want to find correlations and YoY stability
# groupby: the (single) variable by which you'd like to group by
# data: dataframe for which correlations/YoY stability are being found.
#       must contain all columns specified in first arg, as well as specified 
#       groupby col and 'season' col 
def year_to_year_corr(columns, groupby, data):
    cols = [groupby, 'season'] + columns 
    data = data[cols]
    ldata = data.groupby(by=groupby).shift(-1)
    data.columns = [groupby, 'prev_season'] + [f'prev_{col}' for col in columns]
    new_data = pd.concat((data, ldata), axis=1).dropna(subset=['season']).drop(columns=['prev_season','season'])
    tot_corr = new_data.corr(method='pearson')
    num_corr_cols = len(columns)
    corr = tot_corr.iloc[num_corr_cols:,num_corr_cols:]
    pred = tot_corr.iloc[0:num_corr_cols, num_corr_cols:]
    return corr,pred

In [10]:
all_receivers = plays.groupby(['season', 'receiver_id'], as_index=False).agg({'airYOE':'sum', 'yacOE':'sum', 'recYOE':'sum', 'play_id':'count'})
corr, pred = year_to_year_corr(['airYOE', 'yacOE', 'recYOE'], 'receiver_id', all_receivers.loc[all_receivers.play_id>=43])

In [11]:
pred

Unnamed: 0,airYOE,yacOE,recYOE
prev_airYOE,0.198463,0.007557,0.127332
prev_yacOE,0.03337,0.327055,0.231795
prev_recYOE,0.146074,0.216331,0.22983
