In [268]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
import re
from datetime import date
from sklearn.model_selection import train_test_split

# Notebook display preferences, one option per call:
# show up to 100 rows and 100 columns, never truncate cell text,
# and render floats with three decimal places.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda value: '%.3f' % value)

def p(s):
    """Shorthand for ``print``: write ``s`` to standard output.

    Returns ``None``.
    """
    print(s)
    
# Convenience shortcut for interactive use: `obj.len()` prints the length
# of a DataFrame or Index (it prints, it does not return the value).
def _print_len(obj):
    print(len(obj))

pd.DataFrame.len = _print_len
pd.Index.len = _print_len

In [269]:
# Read raw play-by-play data.
# A local gzip cache is used when present (checked in data/ and then ../data/);
# otherwise every season from 1999 up to, but not including, the current
# calendar year is downloaded and the combined frame is cached in ../data/.
if os.path.exists('data/data.csv.gz'):
    dfFull = pd.read_csv('data/data.csv.gz', compression='gzip', index_col=0, low_memory=False)
elif os.path.exists('../data/data.csv.gz'):
    dfFull = pd.read_csv('../data/data.csv.gz', compression='gzip', index_col=0, low_memory=False)
else:
    YEARS = np.arange(1999,date.today().year)

    # Collect the per-season frames and concatenate once: growing a frame
    # inside the loop is quadratic, and DataFrame.append no longer exists
    # in pandas 2.x.
    seasonFrames = []
    for i in YEARS:
        # Progress indicator, overwritten in place via the carriage return.
        sys.stdout.write(f'{i} \r'); sys.stdout.flush()
        seasonFrames.append(
            pd.read_csv('https://github.com/guga31bb/nflfastR-data/blob/master/data/' \
                        'play_by_play_' + str(i) + '.csv.gz?raw=True',
                        compression='gzip', low_memory=False)
        )

    dfFull = pd.concat(seasonFrames, sort=True).reset_index(drop=True)

    # Make sure the cache directory exists before writing into it.
    os.makedirs('../data', exist_ok=True)
    # `low_memory` is a read_csv option only; passing it to to_csv raises
    # TypeError, which previously prevented the cache from being written.
    dfFull.to_csv('../data/data.csv.gz', compression='gzip')

In [270]:
# Lookup table describing the raw-data fields. The last nine columns of
# features.csv are discarded and only rows flagged Use == 1 are kept.
featureInfo = (
    pd.read_csv('features.csv')
    .iloc[:, :-9]
    .loc[lambda table: table['Use'] == 1]
)

# Working frame: the season column plus every field marked for use.
df = dfFull[['season', *featureInfo['Field']]]
    
def lookup(s):
    """Return the rows of ``featureInfo`` whose ``Field`` value contains ``s``.

    ``s`` is passed unchanged to ``Series.str.contains``.
    """
    fieldMatches = featureInfo['Field'].str.contains(s)
    return featureInfo.loc[fieldMatches]

In [271]:
# # Update legacy player IDs when applicable
# legacyIds = pd.read_csv('data/legacy_id_map.csv')

# keys = [x[0] for x in legacyIds.drop(['full_name'],axis=1).values]
# vals = [x[1] for x in legacyIds.drop(['full_name'],axis=1).values]
# legacyIdDict = {keys[i]: vals[i] for i in range(len(keys))}

# df.loc[:,'passer_player_id'] = [legacyIdDict[x] if x in legacyIdDict.keys() else x for x in df['passer_player_id']]
# df.loc[:,'rusher_player_id'] = [legacyIdDict[x] if x in legacyIdDict.keys() else x for x in df['rusher_player_id']]
# df.loc[:,'receiver_player_id'] = [legacyIdDict[x] if x in legacyIdDict.keys() else x for x in df['receiver_player_id']]

In [272]:
def _roleFields(flag):
    """Season plus every feature-table field whose `flag` column equals 1."""
    return ['season'] + list(featureInfo[featureInfo[flag]==1].Field)


def _rolePlays(nameCol, idCol, fields):
    """Rows of `df` that have a value in `nameCol`, restricted to `fields`,
    with the role-specific name/id columns renamed to the generic ones."""
    withPlayer = df.dropna(subset=[nameCol])
    return withPlayer[fields].rename(columns={nameCol: 'player_name', idCol: 'player_id'})


# Column lists per player role.
passStats = _roleFields('Pass')
rushStats = _roleFields('Rush')
recStats = _roleFields('Rec')

# One frame per role, sharing the generic player_name / player_id columns.
dfPass = _rolePlays('passer_player_name', 'passer_player_id', passStats)
dfRush = _rolePlays('rusher_player_name', 'rusher_player_id', rushStats)
dfRec = _rolePlays('receiver_player_name', 'receiver_player_id', recStats)

# Stack the three roles; values missing after the stack become 0.
plays = pd.concat([dfPass, dfRec, dfRush]).fillna(0)

In [273]:
# Calculate useful stats: each yardage column is the play's yards_gained
# multiplied by an attempt column.
# NOTE(review): rec_yards is built from pass_attempt, exactly like pass_yards,
# so the two columns hold identical values — confirm this is intended.
for yardCol, attemptCol in (('pass_yards', 'pass_attempt'),
                            ('rush_yards', 'rush_attempt'),
                            ('rec_yards', 'pass_attempt')):
    plays.loc[:, yardCol] = plays[attemptCol] * plays['yards_gained']

# Position tags

In [274]:
# Read in position data.
# Preferred source is the already-labelled cache data/rosterNamed.csv.
# Otherwise the raw roster is loaded (local copy or download), restricted to
# QB/RB/WR/TE position groups, and each roster row is tagged with the
# abbreviated player_name found in the play-by-play data.
if os.path.exists('data/rosterNamed.csv'):
    # The existence check now looks at the same path that is read and written
    # (it used to test 'rosterNamed.csv', so the cache was never picked up).
    roster = pd.read_csv('data/rosterNamed.csv')
else:
    os.makedirs('data', exist_ok=True)

    if os.path.exists('data/roster.csv.gz'):
        roster = pd.read_csv('data/roster.csv.gz', compression='gzip', index_col=0)
    else:
        roster = pd.read_csv('https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/roster-data/roster.csv', low_memory=False)
        roster.to_csv('data/roster.csv.gz', compression='gzip')

    roster = roster[roster['teamPlayers.positionGroup'].apply(lambda x: any([y in x for y in ['QB','RB','WR','TE']]))].reset_index(drop=True)

    if not 'player_name' in roster.columns:
        roster.insert(0,'player_name','')

    # Names are taken from `plays`, which exists at this point in the
    # notebook. The previous source, `allStats`, is only created in a later
    # cell and itself depends on `roster`, so a fresh kernel failed here.
    playerNames = plays['player_name'].unique()
    print(len(playerNames))
    for i, name in enumerate(playerNames, start=1):
        # Progress counter, overwritten in place via the carriage return.
        sys.stdout.write(f'{i} \r'); sys.stdout.flush()

        # Split the abbreviated name into a first-name prefix and a last name;
        # `sep` is whatever separated them in the play-by-play spelling.
        if '.' in name:
            sep = '.'
            pieces = name.split('.')
            first, last = pieces[0], pieces[1]
        elif ' ' in name:
            sep = ' '
            pieces = name.split(' ')
            first, last = pieces[0], pieces[1]
        else:
            sep = ''
            pieces = re.findall('[A-Z][^A-Z]*', name)
            if not pieces:
                # No capitalised chunk to split on; nothing to match.
                continue
            first, last = pieces[0], pieces[-1]

        # `last` is used as a regular expression anchored at the start of the
        # roster last name, exactly as before.
        isPlayer = roster['teamPlayers.firstName'].str.startswith(first) & roster['teamPlayers.lastName'].str.match(last)
        roster.loc[isPlayer, 'player_name'] = first + sep + last

    # Write the labelled roster once, after all names are processed
    # (it used to be rewritten on every loop iteration).
    roster.to_csv('data/rosterNamed.csv',index=False)

In [275]:
# Lookups
# player_name -> position group, taken row by row from the roster; when a
# name occurs more than once the last roster row wins.
posMap = dict(zip(roster['player_name'], roster['teamPlayers.positionGroup']))

# Tag every play with the player's position on a copy of `plays`.
labeled = plays.copy()
if 'pos' not in labeled.columns:
    labeled.insert(1, 'pos', '')

labeled.loc[:, 'pos'] = [posMap.get(playerName, np.nan) for playerName in plays.player_name]

# Plays whose player has no known position are discarded.
labeled = labeled.dropna(subset=['pos'])

## Opponent data

In [353]:
# Season totals given up by each defense to each position group.
# numeric_only=True keeps the sum restricted to numeric columns; pandas 2.x
# no longer drops text columns (names, ids, teams) silently and would
# otherwise concatenate them or raise.
allowed = (
    labeled.groupby(['season','defteam','pos'])
    .sum(numeric_only=True)
    .drop('play_id', axis=1)
    .reset_index()
)

# Calculate fantasy points
if not 'fanPts' in allowed.columns:
    allowed.insert(3,'fanPts',0)

# Points per completed pass; 0 disables the bonus.
ppr = 0

# Default scoring, applied to every row first.
allowed.fanPts = allowed.yards_gained/10 \
                        + 6 * (allowed.pass_touchdown + allowed.rush_touchdown) \
                        - 2 * allowed.fumble \
                        + ppr * allowed.complete_pass

# QB rows are then overwritten with the quarterback scoring rule.
allowed.loc[allowed['pos']=='QB','fanPts'] = allowed.yards_gained/25 \
                        + 4 * allowed.pass_touchdown \
                        + 6 * allowed.rush_touchdown \
                        - 2 * allowed.fumble \
                        - 2 * allowed.interception

# Every (season, offense, defense) pairing that occurs in the data.
matchups = labeled.groupby(['season','posteam','defteam']).sum(numeric_only=True)

# For each offense and position: add up the season totals allowed by every
# defense that offense faced. The merge keys are spelled out so the join does
# not depend on which other columns the two frames happen to share.
allowed = (
    matchups.reset_index().iloc[:,:3]
    .merge(allowed, on=['season','defteam'])
    .groupby(['season','posteam','pos'])
    .sum(numeric_only=True)
)
allowed = allowed.loc[:,['fanPts']]#,'complete_pass','pass_touchdown','air_yards','yards_gained','fumble','interception','touchdown','yards_after_catch','rush_touchdown','pass_yards','rush_yards','rec_yards']]

## Season total stats

In [354]:
# Sum all metrics per player per season.
# numeric_only=True keeps the sum restricted to numeric columns; pandas 2.x
# no longer drops text columns (ids, opponent names) silently.
seasonTotals = (
    labeled.groupby(['season','player_name','posteam','pos'])
    .sum(numeric_only=True)
    .drop(['play_id'],axis=1)
    .reset_index()
)

# Calculate fantasy points
if not 'fanPts' in seasonTotals.columns:
    seasonTotals.insert(3,'fanPts',0)

# Points per completed pass; 0 disables the bonus.
ppr = 0

# Default scoring, applied to every row first.
seasonTotals.fanPts = seasonTotals.yards_gained/10 \
                        + 6 * (seasonTotals.pass_touchdown + seasonTotals.rush_touchdown) \
                        - 2 * seasonTotals.fumble \
                        + ppr * seasonTotals.complete_pass

# QB rows are then overwritten with the quarterback scoring rule.
seasonTotals.loc[seasonTotals['pos']=='QB','fanPts'] = seasonTotals.yards_gained/25 \
                        + 4 * seasonTotals.pass_touchdown \
                        + 6 * seasonTotals.rush_touchdown \
                        - 2 * seasonTotals.fumble \
                        - 2 * seasonTotals.interception

# Attach the opponents-allowed total for the player's team and position;
# the column arriving from `allowed` is suffixed with _oppAllowTotal.
seasonTotals = seasonTotals.merge(allowed.reset_index(), on=['season','posteam','pos'], suffixes=['','_oppAllowTotal'])

seasonTotals.columns
# seasonTotals.describe().transpose()[['min','max','mean','std']] #df info

Index(['season', 'player_name', 'posteam', 'fanPts', 'pos', 'complete_pass',
       'pass_attempt', 'pass_touchdown', 'qb_dropback', 'qb_hit',
       'qb_scramble', 'sack', 'tackled_for_loss', 'air_yards', 'yards_gained',
       'fumble', 'interception', 'touchdown', 'yards_after_catch',
       'rush_attempt', 'rush_touchdown', 'pass_yards', 'rush_yards',
       'rec_yards', 'fanPts_oppAllowTotal'],
      dtype='object')

In [355]:
# Append data from N previous seasons as new columns.
# For each lag i, the season totals are shifted forward by i seasons and
# outer-joined on (season, player_name, posteam, pos); the lagged columns
# carry the suffix "-i".
histKeys = ['season','player_name','posteam','pos']
seasonHist = seasonTotals.copy().set_index(histKeys)
# NOTE(review): hard-coded; lagged rows are only produced for seasons up to
# this year, so later seasons in the data get no history columns.
currentYear = 2019
lookback = 3
for i in range(1,lookback+1):
    # Copy the filtered rows so the season shift below writes to an
    # independent frame rather than to a slice of seasonTotals.
    temp = seasonTotals[seasonTotals.season.isin(range(1999,currentYear-i+1))].copy()
    temp['season'] = temp.season+i
    seasonHist = seasonHist.join(temp.set_index(histKeys),how='outer',rsuffix=f'-{i}')

# The identifying keys live in the index, so the index must be written;
# index=False produced a file without season/player/team/position.
os.makedirs('expanded', exist_ok=True)
seasonHist.to_csv('expanded/seasonHist.csv')

## Game average stats

In [365]:
# Sum all metrics per player per game, then average those game totals per
# player and season.
# numeric_only=True keeps the sum restricted to numeric columns; pandas 2.x
# no longer drops text columns silently.
gameTotals = (
    labeled.groupby(['season','player_name','posteam','game_id','pos'])
    .sum(numeric_only=True)
    .drop(['play_id'],axis=1)
)
gameAvgs = gameTotals.groupby(['season','player_name','posteam','pos']).mean()

gameAvgs.columns
# gameAvgs.describe().transpose()[['min','max','mean','std']] #df info
# gameAvgs

Index(['complete_pass', 'pass_attempt', 'pass_touchdown', 'qb_dropback',
       'qb_hit', 'qb_scramble', 'sack', 'tackled_for_loss', 'air_yards',
       'yards_gained', 'fumble', 'interception', 'touchdown',
       'yards_after_catch', 'rush_attempt', 'rush_touchdown', 'pass_yards',
       'rush_yards', 'rec_yards'],
      dtype='object')

In [366]:
# Append data from N previous seasons as new columns.
# For each lag i, the per-game averages are shifted forward by i seasons and
# outer-joined on (season, player_name, posteam, pos); the lagged columns
# carry the suffix "-i".
gameAvgHist = gameAvgs.copy()
# NOTE(review): hard-coded; lagged rows are only produced for seasons up to
# this year, so later seasons in the data get no history columns.
currentYear = 2019
lookback = 3
for i in range(1,lookback+1):
    temp = gameAvgs.copy().reset_index()
    # Copy the filtered rows so the season shift below writes to an
    # independent frame rather than to a slice.
    temp = temp[temp.season.isin(range(1999,currentYear-i+1))].copy()
    temp['season'] = temp.season+i
    gameAvgHist = gameAvgHist.join(temp.set_index(['season','player_name','posteam','pos']),how='outer',rsuffix=f'-{i}')

# The identifying keys live in the index, so the index must be written;
# index=False produced a file without season/player/team/position.
os.makedirs('expanded', exist_ok=True)
gameAvgHist.to_csv('expanded/gameAvgHist.csv')

In [367]:
# Combine season totals with per-game averages on their shared index.
# Columns present in both frames get the _seasonTotal / _avgPerGame suffix;
# rows without a fanPts value are dropped before the export.
allStats = (
    seasonHist
    .join(gameAvgHist, lsuffix='_seasonTotal', rsuffix='_avgPerGame')
    .reset_index()
    .dropna(subset=['fanPts'])
)
allStats.to_csv('expanded/allStats.csv', index=False)
# allStats

## Split into separate files for position

In [368]:
# Reload the feature table (last nine columns dropped, Use == 1 rows only).
featureInfo = pd.read_csv('features.csv').iloc[:,0:-9]
featureInfo = featureInfo.loc[featureInfo['Use']==1,:]

positions = ['QB','RB','WR','TE']

# Feature-table flag column that selects the fields for each position.
posFlag = {'QB': 'Pass', 'RB': 'Rush', 'WR': 'Rec', 'TE': 'Rec'}

# cols[pos]: every allStats column whose name starts with one of the
# position's feature fields. The prefixes are computed once per position
# instead of once per column.
cols = {}
for pos in positions:
    prefixes = tuple(featureInfo[featureInfo[posFlag[pos]] == 1].Field.tolist())
    cols[pos] = [x for x in allStats.columns if x.startswith(prefixes)]

# One CSV per position, containing all columns.
# NOTE(review): a column-filtered selection (cols[pos] plus the fanPts
# columns) used to be assigned here and was immediately overwritten by the
# all-column selection, so it never took effect. The effective behaviour is
# kept; confirm whether the filtered export was the intended one.
posData = {}
for pos in positions:
    posData[pos] = allStats.loc[allStats['pos']==pos,:]
    posData[pos].to_csv(f'expanded/{pos}.csv', index=False)