In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
import time
import glob

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

import helpers

In [2]:
# folder = 'pfr_data'

# for year in range(1972,1973):
#     url = f'https://www.pro-football-reference.com/years/{year}/index.htm'
#     page = requests.get(url)
#     tables = pd.read_html(page.text)
#     if len(tables)>=2:
#         afc = tables[0]
#         afc.to_csv(f'{folder}/standings/afc_standings_{year}.csv')
#         nfc = tables[1]
#         nfc.to_csv(f'{folder}/standings//nfc_standings_{year}.csv')
#         if len(tables) > 2:
#             for i in range(2,len(tables)):
#                 tables[i].to_csv(f'{folder}/table_{i}_{year}.csv')
#     soup = BeautifulSoup(page.text, 'html.parser')
#     comments = soup.find_all(string=lambda text:isinstance(text, Comment))
#     keywords = ['Team Offense Table', 'Passing Offense Table', 'Rushing Offense Table']
#     file_prefixes = ['offense', 'pass_offense', 'rush_offense']
#     for comment in comments:
#         for i in range(3):
#             if keywords[i] in comment:
#                 table = BeautifulSoup(comment).find('table')
#                 df = pd.read_html(str(table))[0]
#                 df.to_csv(f'{folder}/{file_prefixes[i]}/{file_prefixes[i]}_{year}.csv')
#     time.sleep(3)

In [3]:
def _read_season_csv(table, season):
    """Read ``pfr_data/<table>/<table>_<season>.csv`` and add a ``season`` column."""
    return pd.read_csv(f'pfr_data/{table}/{table}_{season}.csv').assign(season=season)


# Stack the per-season team-offense tables for 1935 through 2021. Some seasons
# carry an extra 'Unnamed: 27_level_0' column, which is discarded when present.
offense = pd.concat([
    _read_season_csv('offense', season).drop(columns=['Unnamed: 27_level_0'], errors='ignore')
    for season in range(1935, 2022)
])

# Flat names applied positionally to the scraped columns; 'del_1' is a throwaway.
off_cols = [
    'del_1', 'rank', 'team', 'games', 'points', 'yards', 'plays', 'ypp',
    'turnovers', 'fumbles_lost', 'first_downs',
    'pass_completions', 'pass_atts', 'pass_yds', 'pass_tds', 'pass_ints', 'pass_nya', 'pass_1D',
    'rush_atts', 'rush_yds', 'rush_tds', 'rush_ypa', 'rush_1D',
    'penalties', 'penalty_yds', 'penalty_1D',
    'drive_score_pct', 'drive_to_pct', 'season',
]
offense = (
    offense
    .set_axis(off_cols, axis=1)
    .drop(columns=['del_1'])
    .query("(rank != 'Rk') and (rank == rank)")  # filter out subheader and league averages
)

# Everything after rank/team is numeric; rank goes through float before int.
stat_cols = list(offense.columns[2:])
offense = (
    offense
    .astype({name: 'float' for name in stat_cols})
    .astype({'season': int})
    .astype({'rank': float})
    .astype({'rank': int})
)

# Derived rate stats: new column -> (numerator, denominator).
# 'ypp' already exists and is recomputed from yards / plays.
ratio_specs = {
    'to%': ('turnovers', 'plays'),
    'first_down_rate': ('first_downs', 'plays'),
    'ypp': ('yards', 'plays'),
    'ypg': ('yards', 'games'),
    'ppg': ('points', 'games'),
    'ppy': ('points', 'yards'),
}
for ratio_name, (numerator, denominator) in ratio_specs.items():
    offense[ratio_name] = offense[numerator] / offense[denominator]

# Bring in ANY/A from the passing tables; rows with a missing Rk are skipped.
pass_offense = pd.concat([
    _read_season_csv('pass_offense', season).rename(columns={'Tm': 'team'}).query('Rk == Rk')
    for season in range(1935, 2022)
])
offense = offense.merge(pass_offense[['season', 'team', 'ANY/A']], on=['season', 'team'])
del pass_offense

# Era-adjust each headline metric via the project helper, one metric at a time.
for metric in ['first_down_rate', 'ypp', 'pass_nya', 'ANY/A', 'ppg', 'ppy']:
    offense = helpers.season_era_adjust(offense, metric)

# Raw counting stats are no longer needed once the rates exist.
drop_columns = [
    'fumbles_lost', 'turnovers', 'first_downs', 'points', 'yards',
    'pass_atts', 'pass_yds', 'pass_tds', 'pass_ints', 'pass_1D',
    'rush_atts', 'rush_yds', 'rush_tds', 'rush_ypa', 'rush_1D',
    'penalties', 'pass_completions', 'penalty_yds', 'penalty_1D',
    'drive_score_pct', 'drive_to_pct',
]
offense = offense.drop(columns=drop_columns)

# Downcast to compact integer dtypes. A column containing NaN cannot be cast to
# an integer dtype directly, so the affected rows are reported and filled with 0.
type_conversions = {
    'rank': 'uint8',
    'games': 'uint8',
    'plays': 'uint16',
    'ypg': 'uint16',
    'season': 'uint16',
}
for col, target_dtype in type_conversions.items():
    try:
        offense[col] = offense[col].astype(target_dtype)
    except pd.errors.IntCastingNaNError:
        missing_rows = offense.loc[offense[col].isna(), ['season', 'team']]
        print(f'{col} has NaN value(s) for: ' + str([f'{row.season} {row.team}' for row in missing_rows.itertuples()]))
        offense[col] = offense[col].fillna(0).astype(target_dtype)

# Any column still stored as float64 is shrunk to float16.
wide_float_cols = [name for name, dtype in offense.dtypes.items() if str(dtype) == 'float64']
offense = offense.astype({name: 'float16' for name in wide_float_cols})

plays has NaN value(s) for: ['1943 Phi/Pit Eagles/Steelers']
ypg has NaN value(s) for: ['1943 Phi/Pit Eagles/Steelers']


In [4]:
# Ten team seasons with the highest era-adjusted ANY/A.
(
    offense
    .sort_values('ANY/A_stdevs_above_mean', ascending=False)
    .loc[:, ['season', 'team', 'ANY/A_stdevs_above_mean']]
    .head(10)
)

Unnamed: 0,season,team,ANY/A_stdevs_above_mean
794,1984,Miami Dolphins,3.628906
1379,2004,Indianapolis Colts,3.347656
1603,2011,Green Bay Packers,3.009766
934,1989,San Francisco 49ers,2.943359
1475,2007,New England Patriots,2.835938
1074,1994,San Francisco 49ers,2.792969
1763,2016,Atlanta Falcons,2.78125
738,1982,San Diego Chargers,2.689453
160,1951,Los Angeles Rams,2.681641
795,1984,San Francisco 49ers,2.619141


In [5]:
# Standings come as one NFL table per season for 1935-1969 and as separate
# AFC and NFC tables per season for 1970-2021.
standings_pre_merger = pd.concat([
    pd.read_csv(f'pfr_data/standings/nfl_standings_{season}.csv').assign(season=season)
    for season in range(1935, 1970)
])
standings_post_merger = pd.concat([
    pd.concat([
        pd.read_csv(f'pfr_data/standings/{conference}_standings_{season}.csv').assign(season=season)
        for conference in ('afc', 'nfc')
    ])
    for season in range(1970, 2022)
])
standings = pd.concat([standings_pre_merger, standings_post_merger]).drop(columns=['Unnamed: 0'])

# Team names may end in '*' / '+' characters; remove them so names match other tables.
standings['Tm'] = standings['Tm'].map(lambda name: name.rstrip('*+'))

# Rows whose team field is one of these division/conference labels are dropped.
div_names = [
    'East', 'West', 'American', 'National', 'Capitol', 'Century', 'Coastal', 'Central',
    'AFC East', 'AFC Central', 'AFC West', 'NFC East', 'NFC Central', 'NFC West',
    'AFC North', 'AFC South', 'NFC North', 'NFC South',
]
is_division_row = standings['Tm'].isin(div_names)
standings = standings.loc[~is_division_row].rename(columns={'Tm': 'team'})

# Every column after the team name becomes float16; season is then stored as uint16.
drop_columns = ['W', 'L', 'T', 'PF', 'PA', 'SoS']
standings = (
    standings
    .astype({name: 'float16' for name in standings.columns[1:]})
    .astype({'season': 'uint16'})
    .drop(columns=drop_columns)
)
standings = helpers.season_era_adjust(standings, 'MoV')

# Team-season table: offense joined with standings, then with franchise metadata.
# Left joins keep every offense row even when the other table has no match.
merge = offense.merge(standings, on=['season', 'team'], how='left')

# Read every franchise file once and concatenate in a single call; growing a
# DataFrame with pd.concat inside a loop re-copies all accumulated rows on each
# iteration. The file list is sorted because glob yields paths in an order that
# depends on the file system. If no files match, pd.concat raises ValueError.
franchise_cols = ['season', 'team', 'passer_name', 'passer_url', 'coach_name', 'coach_url', 'franchise_code']
franchise_df = pd.concat([
    pd.read_csv(filename)[franchise_cols].astype({'season': 'uint16'})  # match merge['season'] dtype
    for filename in sorted(glob.iglob('pfr_data/franchise_encyclopedias/*.csv'))
])
merge = merge.merge(franchise_df, on=['season', 'team'], how='left')

# --- Per-game quarterback logs -------------------------------------------------
# Names applied positionally to the first 23 scraped columns plus the qb / qb_tag
# identifier columns; 'del_1' is a throwaway column.
game_cols = ['del_1','rank','season','Date','G#','Week','Age','Tm','Home','Opp','Result','GS','Cmp','Pass_Att','Cmp%','Pass_Yds','Pass_TD','Int','Passer_Rating','Sk','Sack_Yds', 'Pass_Y/A', 'Pass_AY/A', 'qb', 'qb_tag']

# Collect the per-file frames and concatenate once; concatenating inside the loop
# re-copies every accumulated row on each iteration. Sorting makes the row order
# independent of the file system's listing order.
game_frames = []
for filename in sorted(glob.iglob('pfr_data/qb_games/reg_season/*.csv')):
    temp_games = pd.read_csv(filename)
    temp_games = pd.concat([temp_games.iloc[:, :23], temp_games[['qb', 'qb_tag']]], axis=1)
    try:
        temp_games = (
            temp_games
            .set_axis(game_cols, axis=1)
            .drop(columns=['del_1'])
            .query("(rank != 'Rk') and (rank == rank)")  # drop repeated header rows and rows with a NaN rank
        )
    except ValueError:
        # Show what the offending file looks like, then let the real error
        # propagate instead of masking it behind an unrelated exception type.
        display(temp_games.head(2))
        print(game_cols)
        raise
    game_frames.append(temp_games)
if not game_frames:
    raise FileNotFoundError('no game logs matched pfr_data/qb_games/reg_season/*.csv')
games = pd.concat(game_frames)

games['season'] = games['season'].astype(float).astype(int)
games['Home'] = np.where(games['Home'] == '@', 0, 1)  # '@' marks a road game

# Result looks like '<W|L|T> <points for>-<points against>'; win=1, loss=0, anything else=0.5.
result_letter = games['Result'].str[0]
games['Game_result'] = np.where(result_letter == 'W', 1, np.where(result_letter == 'L', 0, 0.5))
score_pairs = games['Result'].map(lambda result: result.split()[1].split('-'))
games['ppg'] = score_pairs.str[0].astype('int')
games['ppg_against'] = score_pairs.str[1].astype('int')

# Games in which the QB did not play carry one of these labels in the stat
# columns. Remember the label (read from 'Cmp') as Status, then blank it out
# everywhere so the stat columns can be converted to numbers.
nan_status_values = ['Injured Reserve', 'Did Not Play', 'Inactive', 'Suspended', 'COVID-19 List', 'Exempt List', 'Non-Football Injury']
status_col = np.where(games['Cmp'].isin(nan_status_values), games['Cmp'], 'Active')
games = games.replace(nan_status_values, np.nan)
games['Status'] = status_col

# Game_result is kept as a float: it holds 0.5 for non-win/non-loss results, and
# casting it to an unsigned integer would silently truncate those to 0.
int_cols = ['rank', 'G#', 'Week', 'Home', 'ppg', 'ppg_against']
float_cols = ['Game_result', 'Cmp', 'Pass_Att', 'Cmp%', 'Pass_Yds', 'Pass_TD', 'Int', 'Passer_Rating', 'Sk', 'Sack_Yds', 'Pass_Y/A', 'Pass_AY/A']
type_conversions = {
    **{val: 'uint8' for val in int_cols},
    **{val: 'float16' for val in float_cols},
}
for col, target_dtype in type_conversions.items():
    try:
        games[col] = games[col].astype(float).astype(target_dtype)
    except pd.errors.IntCastingNaNError:
        # Integer dtypes cannot hold NaN: report the affected games, then fill with 0.
        missing_rows = games.loc[games[col].isna(), ['qb', 'season', 'Tm', 'Opp']]
        print(f'{col} has NaN value(s) for: ' + str([f'{row.season} {row.qb} ({row.Tm} vs {row.Opp})' for row in missing_rows.itertuples()]))
        games[col] = games[col].fillna(0).astype(float).astype(target_dtype)

# Shrink any remaining float64 column to float16.
# NOTE(review): float16 carries roughly three significant decimal digits, so the
# per-game ANY/A computed below inherits that precision.
wide_float_cols = [name for name, dtype in games.dtypes.items() if str(dtype) == 'float64']
games = games.astype({name: 'float16' for name in wide_float_cols})

# Adjusted net yards per attempt; missing sack values count as zero.
games['ANY/A'] = (games['Pass_Yds'] - games['Sack_Yds'].fillna(0) + 20*games['Pass_TD'] - 45*games['Int'])/(games['Pass_Att']+games['Sk'].fillna(0))
# games['TANY/A'] = (games['Pass_Yds'] + games['Rush_Yds'] - games['Sack_Yds'] + 20*(games['Pass_TD']+games['Rush_TD']) - 45*games['Int'])/(games['Pass_Att']+games['Sk']+games['Rush_Att'])

# Era-adjust ANY/A first, then points scored, against the team-season table.
games = helpers.games_era_adjust(helpers.games_era_adjust(games, merge, 'ANY/A'), merge, 'ppg')
# Performance index: 80% era-adjusted ANY/A, 20% era-adjusted points scored.
games['performance_index'] = .8*games['ANY/A_stdevs_above_mean'] + .2*games['ppg_stdevs_above_mean']
# Dropbacks = pass attempts + sacks; nullable Int64 keeps games without attempts as <NA>.
games['dropbacks'] = (games['Pass_Att']+games['Sk'].fillna(0)).astype('Int64')

# NOTE: `offense` is deliberately not rebuilt in this cell. The cleaned,
# era-adjusted `offense` frame produced by the offense-loading cell is what
# `merge` (above) is derived from. Re-reading the raw per-season CSVs here would
# replace it with a frame that has no `*_stdevs_above_mean` columns, so
# re-running this cell or the ANY/A ranking cell afterwards would stop working.
# No later cell in this notebook reads `offense`.

In [6]:
# One row per (season, qb): per-game metrics averaged with dropbacks as weights.
group_keys = ['season', 'qb']
seasons = pd.DataFrame(helpers.weighted_avg(games, 'performance_index', 'dropbacks', list(group_keys)))
for metric in ('ANY/A_stdevs_above_mean', 'ppg_stdevs_above_mean'):
    seasons[metric] = helpers.weighted_avg(games, metric, 'dropbacks', list(group_keys))

# Season totals: dropbacks over all logged games, games played over 'Active' rows only.
dropback_totals = games.groupby(group_keys)['dropbacks'].sum()
active_game_counts = games.loc[games['Status'] == 'Active'].groupby(group_keys)['rank'].count()
seasons['dropbacks'] = dropback_totals
seasons['games_played'] = active_game_counts.astype('Int64')

# Groups missing from either total become 0 rather than <NA>.
for count_col in ('dropbacks', 'games_played'):
    seasons[count_col] = seasons[count_col].fillna(0).astype('Int64')
seasons = seasons.reset_index()

# Best seasons by performance index among QBs with at least 300 dropbacks.
(
    seasons
    .query('dropbacks >= 300')
    .sort_values('performance_index', ascending=False)
    .head(13)
    .loc[:, ['season', 'qb', 'performance_index']]
    .rename(columns={'performance_index': 'stdevs_above_era_mean'})
)

Unnamed: 0,season,qb,stdevs_above_era_mean
1096,1984,Dan Marino,3.431146
2093,2004,Peyton Manning,3.38039
1792,1998,Randall Cunningham,3.051606
2255,2007,Tom Brady,3.01002
1600,1994,Steve Young,2.969509
2401,2011,Aaron Rodgers,2.901149
282,1961,George Blanda,2.839175
2534,2013,Peyton Manning,2.812998
2681,2016,Matt Ryan,2.793773
1892,2000,Kurt Warner,2.719479


In [7]:
season_dropback_min = 300   # minimum dropbacks required in every season of a stretch
stretch_duration = 4        # number of consecutive seasons in a stretch

# Column prefixes for the start season and each following one: '', 'next_', 'next_next_', ...
season_prefixes = ['next_' * offset for offset in range(stretch_duration)]

# Put each QB's following seasons side by side with the starting season.
# groupby(...).shift omits the grouping column, so the shifted frame has exactly these columns.
shifted_cols = ['season', 'performance_index', 'ANY/A_stdevs_above_mean', 'ppg_stdevs_above_mean', 'dropbacks', 'games_played']
seasons_by_qb = seasons.groupby('qb')
future_frames = []
for offset in range(1, stretch_duration):
    future = seasons_by_qb.shift(-offset)
    future.columns = [season_prefixes[offset] + name for name in shifted_cols]
    future_frames.append(future)
stretches = pd.concat([seasons.copy(), *future_frames], axis=1)

# A row is a valid stretch only when its last season is exactly
# stretch_duration-1 years after its first; other rows are labelled '*'.
last_season_col = season_prefixes[-1] + 'season'
is_consecutive = (stretches[last_season_col] - stretches['season']) == (stretch_duration - 1)
stretch_label = (
    stretches['season'].astype('str')
    + '-'
    + (stretches['season'] + stretch_duration - 1).fillna(0).astype('int').astype('str')
)
stretches['stretch'] = np.where(is_consecutive, stretch_label, '*')

# Totals across the stretch: sum every column whose name contains the field.
for field in ['dropbacks', 'games_played']:
    matching_cols = [name for name in stretches.columns if field in name]
    stretches[f'tot_{field}'] = stretches[matching_cols].sum(axis=1)

# Replace each per-season metric with its dropback-weighted average over the stretch.
for metric in ['performance_index', 'ANY/A_stdevs_above_mean', 'ppg_stdevs_above_mean']:
    weighted_terms = pd.concat(
        [stretches[f'{prefix}{metric}'] * stretches[f'{prefix}dropbacks'] for prefix in season_prefixes],
        axis=1,
    )
    stretches[metric] = weighted_terms.sum(axis=1) / stretches['tot_dropbacks']

# Earliest season each QB appears in, used to spot career-opening stretches.
stretches['first_season'] = stretches.groupby('qb')['season'].transform('min')

# Keep complete, consecutive stretches in which every season meets the dropback minimum,
# then reduce to the stretch-level columns.
min_dropbacks_filter = '&'.join(f'({prefix}dropbacks>={season_dropback_min})' for prefix in season_prefixes)
per_season_cols = [name for name in stretches.columns if 'next' in name] + ['dropbacks', 'games_played']
stretches = (
    stretches
    .dropna()
    .query("(stretch != '*')&" + min_dropbacks_filter)
    .drop(columns=per_season_cols)
    .rename(columns={'tot_dropbacks': 'dropbacks', 'tot_games_played': 'games_played', 'season': 'start_season'})
    .reindex(['stretch', 'qb', 'performance_index', 'ANY/A_stdevs_above_mean', 'ppg_stdevs_above_mean', 'dropbacks', 'games_played', 'start_season', 'first_season'], axis=1)
    .astype({'dropbacks': int, 'games_played': int})
)

# Best stretch per quarterback (top ten), shaded on a scale shared by all stretches.
# The colormap is passed by name: Styler.background_gradient accepts a colormap
# name, and matplotlib.cm.get_cmap no longer exists in Matplotlib 3.9+.
(
    stretches
    .sort_values('performance_index', ascending=False)
    .drop_duplicates(subset=['qb'])  # rows are sorted, so this keeps each QB's best stretch
    .head(10)
    .reset_index(drop=True)
    .style.background_gradient(
        subset=['performance_index'],
        cmap='PRGn',
        vmin=stretches['performance_index'].min(),
        vmax=stretches['performance_index'].max())
)

Unnamed: 0,stretch,qb,performance_index,ANY/A_stdevs_above_mean,ppg_stdevs_above_mean,dropbacks,games_played,start_season,first_season
0,1992-1995,Steve Young,2.251361,2.214611,2.402387,1888,59,1992,1985
1,2003-2006,Peyton Manning,2.214968,2.313808,1.821117,2135,64,2003,1998
2,1983-1986,Dan Marino,2.143091,2.248414,1.723722,2108,59,1983,1983
3,2011-2014,Aaron Rodgers,2.046617,2.149286,1.638141,2000,56,2011,2005
4,1980-1983,Dan Fouts,2.005242,2.021391,1.942651,1945,51,1980,1973
5,2009-2012,Tom Brady,1.683009,1.647584,1.827581,2405,64,2009,2000
6,1967-1970,Daryle Lamonica,1.676651,1.667241,1.716143,1638,55,1967,1963
7,1984-1987,Joe Montana,1.665391,1.754977,1.308304,1722,52,1984,1979
8,2018-2021,Patrick Mahomes,1.660837,1.686915,1.558718,2403,62,2018,2017
9,2017-2020,Drew Brees,1.652448,1.637411,1.714307,1855,54,2017,2001


In [8]:
# best n year starts to a career ever
# (stretches whose first season is the quarterback's earliest season in the data)
# The colormap is passed by name: Styler.background_gradient accepts a colormap
# name, and matplotlib.cm.get_cmap no longer exists in Matplotlib 3.9+.
(
    stretches
    .loc[stretches.start_season == stretches.first_season]
    .sort_values('performance_index', ascending=False)
    .head(10)
    .reset_index(drop=True)
    .style.background_gradient(
        subset=['performance_index'],
        cmap='PRGn',
        vmin=stretches['performance_index'].min(),
        vmax=stretches['performance_index'].max())
)

Unnamed: 0,stretch,qb,performance_index,ANY/A_stdevs_above_mean,ppg_stdevs_above_mean,dropbacks,games_played,start_season,first_season
0,1983-1986,Dan Marino,2.143091,2.248414,1.723722,2108,59,1983,1983
1,1965-1968,Joe Namath,1.035032,1.152115,0.567597,1682,55,1965,1965
2,1999-2002,Jeff Garcia,1.027376,1.122034,0.6506,2050,61,1999,1999
3,2012-2015,Russell Wilson,0.934934,1.035274,0.534128,1899,64,2012,2012
4,1998-2001,Peyton Manning,0.852278,0.882557,0.732433,2311,64,1998,1998
5,2016-2019,Dak Prescott,0.58258,0.68346,0.179704,2207,64,2016,2016
6,2008-2011,Matt Ryan,0.527222,0.566841,0.369426,2107,62,2008,2008
7,1986-1989,Jim Kelly,0.459191,0.606881,-0.12994,1872,57,1986,1986
8,2012-2015,Andrew Luck,0.198377,0.132732,0.461597,2221,55,2012,2012
9,2008-2011,Joe Flacco,0.160935,0.132898,0.273128,2097,64,2008,2008


Validation: compare the performance index against EPA per play, using nflfastR-era seasons (1999 onward)

In [9]:
# nflfastr_games = pd.DataFrame()
# for filename in glob.iglob(f'pfr_data/nflfastr_qb_games/reg_season/*.csv'):
#     temp_games = pd.read_csv(filename)
#     temp_games = pd.concat([temp_games.iloc[:,:23], temp_games[['qb','qb_tag']]], axis=1)
#     game_cols = ['del_1','rank','season','Date','G#','Week','Age','Tm','Home','Opp','Result','GS','Cmp','Pass_Att','Cmp%','Pass_Yds','Pass_TD','Int','Passer_Rating','Sk','Sack_Yds', 'Pass_Y/A', 'Pass_AY/A', 'qb', 'qb_tag']#, 'Rush_Att', 'Rush_Yds', 'Rush_Y/A', 'Rush_TD', 'Rec_Tgt', 'Rec', 'Rec_Yds', 'Rec_Y/A', 'Rec_TD', 'Catch%', 'Y/Tgt', 'Scoring_TD', 'Scoring_Pts', 'Fmb', 'FL', 'FF', 'FR', 'Fmb_Yds', 'Fmb_TD', 'Off_snaps', 'Off_snaps_pct', 'Def_snaps', 'Def_snaps_pct', 'ST_snaps', 'ST_snaps_pct', 'Status', 'QB', 'QB_Tag']
#     try:
#         temp_games = temp_games.set_axis(game_cols, axis=1)\
#         .drop(columns=['del_1'])\
#         .query('(rank != \'Rk\') and (rank == rank)') # filter out subheader and league averages
#     except ValueError:
#         display(temp_games.head(2))
#         print(game_cols)
#         raise KeyboardInterrupt
#     nflfastr_games = pd.concat([nflfastr_games, temp_games])
# nflfastr_games['season'] = nflfastr_games['season'].astype(float).astype(int)
# nflfastr_games['Home'] = np.where(nflfastr_games['Home']=='@', 0, 1)
# nflfastr_games['Game_result'] = np.where(nflfastr_games['Result'].str[0]=='W', 1, np.where(nflfastr_games['Result'].str[0]=='L', 0, 0.5))
# nflfastr_games['ppg'] = nflfastr_games['Result'].map(lambda x:x.split()[1].split('-')[0]).astype('int')
# nflfastr_games['ppg_against'] = nflfastr_games['Result'].map(lambda x:x.split()[1].split('-')[1]).astype('int')

# # status_col = nflfastr_games['Status'].copy()

# nan_status_values = ['Injured Reserve', 'Did Not Play', 'Inactive', 'Suspended', 'COVID-19 List', 'Exempt List', 'Non-Football Injury']

# status_col = np.where(nflfastr_games['Cmp'].isin(nan_status_values), nflfastr_games['Cmp'], 'Active')
# for col in nan_status_values:
#     nflfastr_games.replace(col, np.nan, inplace=True)
# nflfastr_games['Status'] = status_col

# int_cols = ['rank','G#','Week','Home','Game_result','ppg','ppg_against']
# float_cols = ['Cmp','Pass_Att','Cmp%','Pass_Yds','Pass_TD','Int','Passer_Rating','Sk','Sack_Yds','Pass_Y/A','Pass_AY/A']#,'Rush_Att','Rush_Yds','Rush_Y/A','Rush_TD','Off_snaps','Off_snaps_pct']
# type_conversions = {
#     **{val:'uint8' for val in int_cols},
#     **{val:'float16' for val in float_cols}
# }
# for col in type_conversions.keys():
#     try:
#         nflfastr_games[col] = nflfastr_games[col].astype(float).astype(type_conversions[col])
#     except pd.errors.IntCastingNaNError:
#         print(f'{col} has NaN value(s) for: ' + str([f'{row.season} {row.team}' for row in nflfastr_games.loc[nflfastr_games[col].isna()][['QB','season','Tm','Opp']].itertuples()]))
#         nflfastr_games[col] = nflfastr_games[col].fillna(0).astype(float).astype(type_conversions[col])
# for col in nflfastr_games.columns:
#     if str(nflfastr_games[col].dtype)=='float64':
#         nflfastr_games[col] = nflfastr_games[col].astype('float16')
# nflfastr_games['ANY/A'] = (nflfastr_games['Pass_Yds'] - nflfastr_games['Sack_Yds'] + 20*nflfastr_games['Pass_TD'] - 45*nflfastr_games['Int'])/(nflfastr_games['Pass_Att']+nflfastr_games['Sk'])
# # nflfastr_games['TANY/A'] = (nflfastr_games['Pass_Yds'] + nflfastr_games['Rush_Yds'] - nflfastr_games['Sack_Yds'] + 20*(nflfastr_games['Pass_TD']+nflfastr_games['Rush_TD']) - 45*nflfastr_games['Int'])/(nflfastr_games['Pass_Att']+nflfastr_games['Sk']+nflfastr_games['Rush_Att'])
# nflfastr_games = helpers.games_era_adjust(helpers.games_era_adjust(nflfastr_games, merge, 'ANY/A'), merge, 'ppg')
# for anya_weight in [.5,.55,.6,.65,.75,.8,.85,.9,.95]:
#     nflfastr_games[f'{anya_weight}_performance_index'] = anya_weight*nflfastr_games['ANY/A_stdevs_above_mean'] + (1-anya_weight)*nflfastr_games['ppg_stdevs_above_mean']
# nflfastr_games['dropbacks'] = nflfastr_games['Pass_Att']+nflfastr_games['Sk'].fillna(0)

# roster = pd.concat([pd.read_csv(f'roster/roster_{season}.csv') for season in range(1999,2022)])
# roster.loc[roster.position=='QB'].columns

# nflfastr_seasons = nflfastr_games.groupby(['season','qb','qb_tag']).agg({'dropbacks':'sum'})
# for col in nflfastr_games.columns:
#     if 'performance_index' in col:
#         nflfastr_seasons[col] = helpers.weighted_avg(nflfastr_games, col, 'dropbacks', ['season','qb','qb_tag'])
# nflfastr_seasons['ANY/A_stdevs_above_mean'] = helpers.weighted_avg(nflfastr_games, 'ANY/A_stdevs_above_mean', 'dropbacks', ['season','qb','qb_tag'])
# nflfastr_seasons['ppg_stdevs_above_mean'] = helpers.weighted_avg(nflfastr_games, 'ppg_stdevs_above_mean', 'dropbacks', ['season','qb','qb_tag'])
# nflfastr_seasons['games_played'] = nflfastr_games.loc[nflfastr_games['Status']=='Active'].groupby(['season','qb','qb_tag']).agg({'rank':'count'})
# nflfastr_seasons['dropbacks'] = nflfastr_seasons['dropbacks'].fillna(0).astype(int)
# nflfastr_seasons['games_played'] = nflfastr_seasons['games_played'].fillna(0).astype(int)
# nflfastr_seasons = nflfastr_seasons.reset_index().query('season>=1999')

In [10]:
# pbp = pd.concat([pd.read_csv(f'pbp_data/play_by_play_{season}.csv.gz') for season in range(1999,2022)])

In [11]:
# pbp_seasons = pbp.groupby(['season','passer_id'], as_index=False).agg({'play_id':'count','epa':'mean'})
# nflfastr_seasons_merged = nflfastr_seasons\
#     .merge(roster[['season','pfr_id','gsis_id']].rename(columns={'pfr_id':'qb_tag','gsis_id':'passer_id'}), how='left', on=['season','qb_tag'])\
#     .merge(pbp_seasons, on=['season', 'passer_id'], indicator=True)

In [12]:
# _, corr, pred = helpers.year_to_year_corr(['performance_index','ANY/A_stdevs_above_mean','ppg_stdevs_above_mean','epa'],'qb',nflfastr_seasons_merged.loc[(nflfastr_seasons_merged.dropbacks>=300)])
# (corr**2).sort_values('epa', ascending=False).style.background_gradient()

In [13]:
# (pred**2).sort_values('epa', ascending=False).style.background_gradient()