In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
from datetime import date
from sklearn.model_selection import train_test_split

# Notebook display settings: show up to 100 rows and 100 columns, never
# truncate the text inside a cell, and render floats with three decimals.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda value: '%.3f' % value)

def p(s):
    """Print `s` to standard output (short alias for `print`); returns None."""
    print(s)
    
# Notebook shortcut: give DataFrame and Index a `.len()` method that prints
# the object's length (the method itself returns None).
pd.DataFrame.len = lambda obj: print(len(obj))
pd.Index.len = pd.DataFrame.len

In [2]:
# Read raw play-by-play data: use a local gzip cache if one exists, otherwise
# download one file per season, combine them, and write the cache.
# low_memory=False is passed on every read so cached and freshly downloaded
# data are parsed the same way (whole-column dtype inference, no mixed-type
# columns from chunked parsing).
if os.path.exists('data/data.csv.gz'):
    dfFull = pd.read_csv('data/data.csv.gz', compression='gzip', index_col=0,
                         low_memory=False)
elif os.path.exists('../data/data.csv.gz'):
    dfFull = pd.read_csv('../data/data.csv.gz', compression='gzip', index_col=0,
                         low_memory=False)
else:
    # np.arange excludes its stop value, so the current calendar year is not downloaded.
    YEARS = np.arange(1999, date.today().year)
    seasonFrames = []
    for i in YEARS:
        # Progress indicator: overwrite the same output line with the season being fetched.
        sys.stdout.write(f'{i} \r'); sys.stdout.flush()
        iData = pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                            'play_by_play_' + str(i) + '.csv.gz?raw=True',
                            compression='gzip', low_memory=False)
        seasonFrames.append(iData)

    # Concatenate once instead of calling DataFrame.append inside the loop:
    # append was removed in pandas 2.0 and recopied the growing frame on every
    # iteration. sort=True keeps the sorted column order the append call produced;
    # ignore_index=True replaces the separate reset_index(drop=True).
    if seasonFrames:
        dfFull = pd.concat(seasonFrames, sort=True, ignore_index=True)
    else:
        dfFull = pd.DataFrame()

    # Create the cache directory first so a long download is not lost to a
    # missing-folder error at write time.
    os.makedirs('../data', exist_ok=True)
    dfFull.to_csv('../data/data.csv.gz', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


# Flex

In [50]:
# Lookup table describing the raw-data features; only its first four columns are read.
featureInfo = pd.read_csv('features.csv').iloc[:, 0:4]

# Row masks over the lookup table: features flagged for use, and features typed as IDs.
inUse = featureInfo.Use == 1
isId = featureInfo.Type == 'ID'
notId = featureInfo.Type != 'ID'

# Identifier columns vs. value columns among the features in use.
idCols = featureInfo.loc[inUse & isId, 'Field'].tolist() + ['receiver_player_name']
valCols = featureInfo.loc[inUse & notId, 'Field'].tolist() + ['season', 'game_id']

# Raw columns that fall in neither list.
selectedCols = set(idCols + valCols)
notUsed = [col for col in dfFull.columns if col not in selectedCols]

# Working frame: season, every in-use feature, and the three player-name columns.
playerNameCols = ['receiver_player_name', 'rusher_player_name', 'passer_player_name']
df = dfFull[['season'] + featureInfo.loc[inUse, 'Field'].tolist() + playerNameCols]
                                                                                 
def lookup(s):
    """Return the rows of `featureInfo` whose `Field` value matches `s`.

    `s` is handed to `Series.str.contains`, so it is interpreted as a regular
    expression and may match anywhere inside the field name. Rows whose
    `Field` is missing never match.
    """
    # na=False: a missing Field would otherwise leave NaN in the mask, and
    # boolean indexing with a NaN-containing mask raises.
    return featureInfo[featureInfo.Field.str.contains(s, na=False)]

In [51]:
# Keep only plays that name a receiver or a rusher, then derive per-play columns:
#   player_name - the receiver when present, otherwise the rusher
#   rec_yards   - yards gained multiplied by the pass_attempt column
#   rush_yards  - yards gained multiplied by the rush_attempt column
# (A season filter, [df.season>=2011], was sketched here but is not applied.)
df = (
    df.dropna(subset=['receiver_player_name', 'rusher_player_name'], how='all')
      .assign(
          player_name=lambda plays: plays.receiver_player_name.fillna(plays.rusher_player_name),
          rec_yards=lambda plays: plays.pass_attempt * plays.yards_gained,
          rush_yards=lambda plays: plays.rush_attempt * plays.yards_gained,
      )
)

## Season total stats

In [5]:
# Sum all metrics per player per season.
# numeric_only=True states explicitly that only numeric columns are summed.
# Older pandas dropped non-numeric columns (such as the player names) silently;
# pandas 2.0+ no longer does, so without the flag the text columns would be
# pulled into the aggregation.
seasonTotals = (
    df.groupby(['season', 'player_name'])
      .sum(numeric_only=True)
      .drop(columns=['play_id', 'qb_scramble', 'sack', 'safety'])
)

seasonTotals.columns
# seasonTotals.describe().transpose()[['min','max','mean','std']] #df info

Index(['complete_pass', 'fumble', 'incomplete_pass', 'interception',
       'no_huddle', 'pass_attempt', 'pass_touchdown', 'qb_dropback', 'qb_hit',
       'rush_attempt', 'rush_touchdown', 'shotgun', 'tackled_for_loss',
       'touchdown', 'air_yards', 'yards_after_catch', 'yards_gained',
       'rec_yards', 'rush_yards'],
      dtype='object')

In [6]:
# Fantasy points per player-season:
#   1 point per 10 yards gained, 6 per touchdown, -2 per fumble,
#   plus `ppr` points per completed pass (0 here).
ppr = 0

# Reserve the first column for the score so it leads the table.
if 'fanPts' not in seasonTotals.columns:
    seasonTotals.insert(0, 'fanPts', 0)

seasonTotals['fanPts'] = (
    seasonTotals.yards_gained/10
    + seasonTotals.touchdown*6
    - 2*seasonTotals.fumble
    + seasonTotals.complete_pass*ppr
)

In [7]:
# Widen the season totals with each player's totals from the previous
# `lookback` seasons. For every offset k, the totals are relabelled as season+k
# and joined back on (season, player_name); their columns get the suffix "-k".
seasonHist = seasonTotals.copy()
currentYear = 2019
lookback = 3
for yearsBack in range(1, lookback + 1):
    shifted = (
        seasonTotals.reset_index()
        # Only keep seasons whose shifted label does not go past currentYear.
        .loc[lambda t: t.season.isin(range(1999, currentYear - yearsBack + 1))]
        .assign(season=lambda t: t.season + yearsBack)
        .set_index(['season', 'player_name'])
    )
    # Outer join: player-seasons that exist only in the shifted history are kept too.
    seasonHist = seasonHist.join(shifted, how='outer', rsuffix=f'-{yearsBack}')

# Optional: drop rows without current-season data (players no longer active)
# seasonHist = seasonHist.dropna(subset=['fanPts'])
seasonHist.to_csv('seasonHist.csv')

## Game average stats

In [8]:
# Sum all metrics per player per game, then average those game totals over
# each player's games within a season.
# numeric_only=True states explicitly that only numeric columns are summed.
# Older pandas dropped non-numeric columns (such as the player names) silently;
# pandas 2.0+ no longer does, so without the flag the text columns would be
# pulled into the aggregation.
gameTotals = (
    df.groupby(['season', 'player_name', 'game_id'])
      .sum(numeric_only=True)
      .drop(columns=['play_id', 'qb_scramble', 'sack', 'safety'])
)
gameAvgs = gameTotals.groupby(['season', 'player_name']).mean()

gameAvgs.columns
# gameAvgs.describe().transpose()[['min','max','mean','std']] #df info
# gameAvgs

Index(['complete_pass', 'fumble', 'incomplete_pass', 'interception',
       'no_huddle', 'pass_attempt', 'pass_touchdown', 'qb_dropback', 'qb_hit',
       'rush_attempt', 'rush_touchdown', 'shotgun', 'tackled_for_loss',
       'touchdown', 'air_yards', 'yards_after_catch', 'yards_gained',
       'rec_yards', 'rush_yards'],
      dtype='object')

In [9]:
# Widen the per-game averages with each player's averages from the previous
# `lookback` seasons. For every offset k, the averages are relabelled as
# season+k and joined back on (season, player_name); their columns get the suffix "-k".
gameAvgHist = gameAvgs.copy()
currentYear = 2019
lookback = 3
for yearsBack in range(1, lookback + 1):
    shifted = (
        gameAvgs.reset_index()
        # Only keep seasons whose shifted label does not go past currentYear.
        .loc[lambda t: t.season.isin(range(1999, currentYear - yearsBack + 1))]
        .assign(season=lambda t: t.season + yearsBack)
        .set_index(['season', 'player_name'])
    )
    # Outer join: player-seasons that exist only in the shifted history are kept too.
    gameAvgHist = gameAvgHist.join(shifted, how='outer', rsuffix=f'-{yearsBack}')

gameAvgHist.to_csv('gameAvgHist.csv')

In [13]:
# Combine season totals and per-game averages on their shared index. Column
# names present in both frames are told apart by the two suffixes.
suffixes = {'lsuffix': '_seasonTotal', 'rsuffix': '_avgPerGame'}
allStats = seasonHist.join(gameAvgHist, **suffixes)

# Export only the rows that have a current-season fantasy score.
hasFanPts = allStats['fanPts'].notna()
allStats.loc[hasFanPts].to_csv('allStats.csv')