In [354]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
import re
from datetime import date
from sklearn.model_selection import train_test_split

# Notebook-wide pandas display settings: show up to 100 rows and 100 columns,
# never truncate cell text, and render floats with three decimal places.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

def p(s):
    """Print ``s`` to standard output (one-letter shorthand for ``print``)."""
    return print(s)
    
def _print_len(obj):
    """Print the length of ``obj`` (rows of a DataFrame, entries of an Index)."""
    print(len(obj))

# Convenience monkeypatch: ``frame.len()`` / ``index.len()`` print the length.
pd.DataFrame.len = _print_len
pd.Index.len = _print_len

In [247]:
# Read raw play-by-play data. Use the local gzip cache when one exists
# (looked up in data/ first, then ../data/); otherwise download one file per
# season from the nflfastR-data repository, combine them, and write the cache.
cachePath = next(
    (path for path in ('data/data.csv.gz', '../data/data.csv.gz') if os.path.exists(path)),
    None,
)

if cachePath is not None:
    dfFull = pd.read_csv(cachePath, compression='gzip', index_col=0, low_memory=False)
else:
    # np.arange excludes its upper bound, so the current calendar year is not fetched.
    YEARS = np.arange(1999, date.today().year)
    seasonFrames = []
    for i in YEARS:
        sys.stdout.write(f'{i} \r'); sys.stdout.flush()
        seasonFrames.append(
            pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/'
                        f'play_by_play_{i}.csv.gz?raw=True',
                        compression='gzip', low_memory=False)
        )

    # Concatenate once: DataFrame.append inside the loop re-copied the growing
    # frame every season and no longer exists in pandas >= 2.0.
    # ignore_index=True gives the same fresh 0..n-1 index that reset_index(drop=True) did.
    dfFull = pd.concat(seasonFrames, sort=True, ignore_index=True)

    # The index is written as the first column; the cached branch reads it back
    # with index_col=0. `low_memory` is a read_csv-only option and made to_csv
    # raise TypeError, so it is not passed here.
    os.makedirs('../data', exist_ok=True)
    dfFull.to_csv('../data/data.csv.gz', compression='gzip')

In [438]:
# Lookup table with all features of the raw data.
# The last nine columns of features.csv are not loaded.
featureInfo = pd.read_csv('features.csv').iloc[:,0:-9]

# Rows flagged Use == 1 are the fields carried into the working frame.
usedFeatures = featureInfo[featureInfo.Use==1]

idCols = usedFeatures[usedFeatures.Type == 'ID'].Field.tolist() + ['receiver_player_name']
valCols = usedFeatures[usedFeatures.Type != 'ID'].Field.tolist() + ['season', 'game_id']

# Build the membership set once instead of concatenating the two lists for
# every column of dfFull.
selectedCols = set(idCols + valCols)
notUsed = [col for col in dfFull.columns if col not in selectedCols]

# .copy() makes df an independent frame: later cells assign columns into it
# with df.loc[...], which on a plain slice of dfFull raises SettingWithCopyWarning.
df = dfFull[['season'] + usedFeatures.Field.tolist()].copy()
                                                                                 
def lookup(s):
    """Return the featureInfo rows whose ``Field`` value contains ``s``.

    ``s`` is handed to ``Series.str.contains`` unchanged, so it is interpreted
    as a regular expression.
    """
    fieldMatches = featureInfo['Field'].str.contains(s)
    return featureInfo[fieldMatches]

In [249]:
# Update legacy player IDs when applicable
legacyIds = pd.read_csv('data/legacy_id_map.csv')

# With full_name dropped, the first remaining column is mapped to the second.
idPairs = legacyIds.drop(['full_name'],axis=1).values
keys = [pair[0] for pair in idPairs]
vals = [pair[1] for pair in idPairs]
legacyIdDict = dict(zip(keys, vals))

# Swap any id found in the map for its mapped value; leave every other id as is.
for idCol in ['passer_player_id', 'rusher_player_id', 'receiver_player_id']:
    df.loc[:,idCol] = [legacyIdDict.get(x, x) for x in df[idCol]]

# Calculate useful stats: yards gained on the play, weighted by the attempt flag
playYards = df.yards_gained
df.loc[:,'pass_yards'] = df.pass_attempt * playYards
df.loc[:,'rush_yards'] = df.rush_attempt * playYards
# NOTE(review): rec_yards uses the same formula as pass_yards - confirm this is intended.
df.loc[:,'rec_yards'] = df.pass_attempt * playYards

In [316]:
# Column lists per role: 'season' plus every field flagged for that role.
passStats = ['season'] + list(featureInfo.loc[featureInfo['Pass']==1, 'Field'])
rushStats = ['season'] + list(featureInfo.loc[featureInfo['Rush']==1, 'Field'])
recStats = ['season'] + list(featureInfo.loc[featureInfo['Rec']==1, 'Field'])

# One frame per role: keep plays where that role's player is named, select the
# role's columns, and expose the name under the shared 'player_name' column.
dfPass = (
    df.dropna(subset=['passer_player_name'])[passStats]
      .rename(columns={'passer_player_name':'player_name'})
)
dfRush = (
    df.dropna(subset=['rusher_player_name'])[rushStats]
      .rename(columns={'rusher_player_name':'player_name'})
)
dfRec = (
    df.dropna(subset=['receiver_player_name'])[recStats]
      .rename(columns={'receiver_player_name':'player_name'})
)

# Stack the three roles; columns a role does not have become NaN and are set to 0.
stacked = pd.concat([dfRush,dfRec,dfPass])
plays = stacked.fillna(0)

## Season total stats

In [317]:
# Sum all metrics per player per season.
# numeric_only=True states explicitly that only numeric columns are summed.
# Without it the result depends on pandas silently discarding non-numeric
# columns, which newer pandas versions no longer do.
seasonTotals = (
    plays.groupby(['season','player_name'])
         .sum(numeric_only=True)
         .drop(['play_id'],axis=1)  # a sum of play ids carries no meaning
)

seasonTotals.columns
# seasonTotals.describe().transpose()[['min','max','mean','std']] #df info

Index(['rush_attempt', 'rush_touchdown', 'tackled_for_loss', 'yards_gained',
       'fumble', 'touchdown', 'complete_pass', 'pass_attempt',
       'pass_touchdown', 'air_yards', 'yards_after_catch', 'qb_dropback',
       'qb_hit', 'qb_scramble', 'sack'],
      dtype='object')

In [319]:
# Calculate fantasy points
ppr = 0  # multiplier applied to complete_pass; 0 removes that term

# Make sure the fanPts column exists (as the first column) before filling it.
if 'fanPts' not in seasonTotals.columns:
    seasonTotals.insert(0,'fanPts',0)

# 1 point per 10 yards gained, 6 per touchdown, -2 per fumble, ppr per complete pass.
yardPts = seasonTotals.yards_gained/10
tdPts = seasonTotals.touchdown*6
fumblePts = 2*seasonTotals.fumble
seasonTotals['fanPts'] = yardPts + tdPts - fumblePts + seasonTotals.complete_pass*ppr

In [320]:
# Append data from N previous seasons as new columns
seasonHist = seasonTotals.copy()
currentYear = 2019
lookback = 3

# Flat copy of the totals with 'season' and 'player_name' as ordinary columns.
seasonRows = seasonTotals.reset_index()

for i in range(1,lookback+1):
    # Keep seasons 1999..currentYear-i and relabel each as the season i years
    # later, so the row joins onto that later season under a '-i' suffix.
    inWindow = seasonRows.season.isin(range(1999,currentYear-i+1))
    temp = seasonRows[inWindow].copy()
    temp['season'] = temp.season+i
    seasonHist = seasonHist.join(
        temp.set_index(['season','player_name']),
        how='outer',
        rsuffix=f'-{i}',
    )

# Remove no longer active players
# seasonHist = seasonHist.dropna(subset=['fanPts'])
seasonHist.to_csv('seasonHist.csv')

## Game average stats

In [321]:
# Sum all metrics per player per game, then average those game totals per
# player per season.
# numeric_only=True states explicitly that only numeric columns are summed.
# Without it the result depends on pandas silently discarding non-numeric
# columns, which newer pandas versions no longer do.
gameTotals = (
    plays.groupby(['season','player_name','game_id'])
         .sum(numeric_only=True)
         .drop(['play_id'],axis=1)  # a sum of play ids carries no meaning
)
gameAvgs = gameTotals.groupby(['season','player_name']).mean()

gameAvgs.columns
# gameAvgs.describe().transpose()[['min','max','mean','std']] #df info
# gameAvgs

Index(['rush_attempt', 'rush_touchdown', 'tackled_for_loss', 'yards_gained',
       'fumble', 'touchdown', 'complete_pass', 'pass_attempt',
       'pass_touchdown', 'air_yards', 'yards_after_catch', 'qb_dropback',
       'qb_hit', 'qb_scramble', 'sack'],
      dtype='object')

In [322]:
# Append data from N previous seasons as new columns
gameAvgHist = gameAvgs.copy()
currentYear = 2019
lookback = 3

# Flat copy of the averages with 'season' and 'player_name' as ordinary columns.
avgRows = gameAvgs.reset_index()

for i in range(1,lookback+1):
    # Keep seasons 1999..currentYear-i and relabel each as the season i years
    # later, so the row joins onto that later season under a '-i' suffix.
    inWindow = avgRows.season.isin(range(1999,currentYear-i+1))
    temp = avgRows[inWindow].copy()
    temp['season'] = temp.season+i
    gameAvgHist = gameAvgHist.join(
        temp.set_index(['season','player_name']),
        how='outer',
        rsuffix=f'-{i}',
    )

gameAvgHist.to_csv('gameAvgHist.csv')

In [323]:
# Join game avgs to season total df; column names present in both frames are
# told apart by the '_seasonTotal' / '_avgPerGame' suffixes.
joinedStats = seasonHist.join(gameAvgHist, lsuffix='_seasonTotal', rsuffix='_avgPerGame')
allStats = joinedStats.reset_index()

# Only rows that have a fanPts value are exported; allStats keeps every row.
hasFanPts = allStats['fanPts'].notna()
allStats[hasFanPts].to_csv('allStats.csv')

## Position tags

In [428]:
# Read in position data.
# The name-matched roster is cached in data/rosterNamed.csv. When that cache is
# missing it is rebuilt from the raw roster (local copy if present, otherwise
# downloaded) by tagging roster rows with the player names used in allStats.

def _split_player_name(name):
    """Split a player name from allStats into ``(first, last, label)``.

    ``first`` is the text before the first separator, ``last`` the text after
    it, and ``label`` the value stored in the roster's ``player_name`` column.
    Names are split on '.' when present, else on ' ', else on capital letters
    (first and last capitalised chunk). Returns ``None`` when a name has no
    separator and no capital letter, so the caller can skip it.
    """
    if '.' in name:
        parts = name.split('.')
        return parts[0], parts[1], parts[0]+'.'+parts[1]
    if ' ' in name:
        parts = name.split(' ')
        return parts[0], parts[1], parts[0]+' '+parts[1]
    chunks = re.findall('[A-Z][^A-Z]*', name)
    if not chunks:
        return None
    return chunks[0], chunks[-1], chunks[0]+chunks[-1]


# Check the same path that is read and written below. The previous check looked
# for 'rosterNamed.csv' in the working directory, so the cache was never found.
if os.path.exists('data/rosterNamed.csv'):
    roster = pd.read_csv('data/rosterNamed.csv')
else:
    if os.path.exists('data/roster.csv.gz'):
        roster = pd.read_csv('data/roster.csv.gz', compression='gzip', index_col=0)
    else:
        roster = pd.read_csv('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/roster-data/roster.csv', low_memory=False)
        roster.to_csv('data/roster.csv.gz', compression='gzip')

    # Keep only rows whose position group mentions QB, RB, WR or TE; rows with
    # a missing position group are dropped instead of raising.
    isSkillPos = roster['teamPlayers.positionGroup'].str.contains('QB|RB|WR|TE', na=False)
    roster = roster[isSkillPos].reset_index(drop=True)

    if 'player_name' not in roster.columns:
        roster.insert(0,'player_name','')

    firstNames = roster['teamPlayers.firstName']
    lastNames = roster['teamPlayers.lastName']

    uniqueNames = allStats['player_name'].unique()
    print(len(uniqueNames))
    for i, name in enumerate(uniqueNames, start=1):
        sys.stdout.write(f'{i} \r'); sys.stdout.flush()
        parsed = _split_player_name(name)
        if parsed is None:
            continue
        first, last, label = parsed
        # A roster row matches when its first name starts with `first` and its
        # last name starts with `last`. re.escape keeps characters in the name
        # from being read as regex syntax; na=False treats missing roster names
        # as non-matches so the mask stays boolean.
        isMatch = (firstNames.str.startswith(first, na=False)
                   & lastNames.str.match(re.escape(last), na=False))
        roster.loc[isMatch,'player_name'] = label

    # Write the cache once, after all names are processed (it used to be
    # rewritten on every loop iteration).
    roster.to_csv('data/rosterNamed.csv',index=False)

In [434]:
# Lookups: roster player_name -> position group. When a name occurs on several
# roster rows, the last row wins.
posMap = dict(zip(roster['player_name'], roster['teamPlayers.positionGroup']))

# Make sure the 'pos' column exists (third column) before filling it.
if 'pos' not in allStats.columns:
    allStats.insert(2,'pos','')

# Names without a roster match get NaN and are excluded from `labeled` below.
allStats.loc[:,'pos'] = [posMap.get(x, np.nan) for x in allStats.player_name]

labeled = allStats.dropna(subset=['pos'])

labeled.to_csv('data/labeled.csv',index=False)

### Split into separate files by position

In [443]:
# From here on only the features flagged Use == 1 are considered.
featureInfo = featureInfo.loc[featureInfo['Use']==1,:]

positions = ['QB','RB','WR','TE']

# featureInfo flag column that selects each position's stat fields
# (WR and TE share the same flag).
flagByPos = {'QB': 'Pass', 'RB': 'Rush', 'WR': 'Rec', 'TE': 'Rec'}

# For each position, keep the labeled columns whose name starts with one of the
# flagged field names (this also picks up their suffixed variants).
cols = {}
for pos, flag in flagByPos.items():
    fieldPrefixes = tuple(featureInfo.loc[featureInfo[flag] == 1, 'Field'].tolist())
    cols[pos] = [x for x in labeled.columns if x.startswith(fieldPrefixes)]

# Every column whose name contains 'fanPts' is added to each position's file.
fanPtsCols = list(labeled.columns[labeled.columns.str.contains('fanPts')])

allData = {}
for pos in positions:
    allData[pos] = labeled.loc[labeled['pos']==pos,cols[pos]+fanPtsCols]
    allData[pos].to_csv(f'{pos}.csv')