To Do:
* rush percentage as feature
* ewma feature instead of multiple fpts features
* L1 regularization
* Decision Tree regression
* sklearn time series split validation on training data, or implement this ourselves (Brian G.). Holdout data on a few future weeks. Report both the cross validation and holdout performance scores.
* look at master's theses that tackle fantasy-football point prediction
* tune number of games in moving averages and tune number of trees in random forest

In [45]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [93]:
# Legacy team abbreviations mapped onto the single code used for that team
# throughout the rest of the notebook.
TEAM_CODE_FIXES = {'SDG': 'LAC', 'STL': 'LAR'}

# Per-player game logs; after loading, the frame is indexed by player_id
# (non-unique: one row per player per game).
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv')
# delete weird LeGarrette Blount double game (row label 440917 of the raw file)
all_games = all_games.drop([440917])
all_games['team'] = all_games['team'].replace(TEAM_CODE_FIXES)
all_games = all_games.set_index('player_id')

# Player metadata, indexed by player_id.
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv', index_col='player_id')

# Per-team defensive game logs. 'Tm' is deliberately kept as a regular column:
# get_def_data groups on it by name. (The original cell called
# gamesDef.set_index('Tm') and discarded the result, which was a no-op.)
gamesDef = pd.read_csv('defData12-17.csv')
gamesDef['Tm'] = gamesDef['Tm'].replace(TEAM_CODE_FIXES)
gamesDef = gamesDef.drop(columns=["Rk", "Time", "LTime"])

In [94]:
# Half-PPR scoring: fantasy points awarded per unit of each box-score stat
# (keys are column names of the game-log table).
half_ppr = dict(
    rushing_yards=0.1,
    rushing_touchdowns=6,
    receiving_receptions=0.5,
    receiving_yards=0.1,
    receiving_touchdowns=6,
)

In [95]:
# Per-position usage filter consumed by get_players_thatweek:
#   position -> [stat columns to add up for a game, minimum total to keep the player]
# For RBs: rushing attempts + receiving targets in the game must be >= 5.
subset_position = {
    'RB': [
        ['rushing_attempts', 'receiving_targets'],
        5.0,
    ],
}

In [96]:
def get_players_thatweek(all_games, all_players, position, year, game_number, subset_position,
                         specific_players=None, undrafted_position=255):
    """Select the players of one position who saw enough usage in a given game.

    Parameters
    ----------
    all_games : DataFrame
        Game logs indexed by player_id (one row per player per game) with at
        least ``year``, ``game_number`` and the stat columns named in
        ``subset_position``.
    all_players : DataFrame
        Player metadata indexed by player_id with ``position``, ``name`` and
        ``draft_position`` columns.
    position : str
        Position code; used to filter ``all_players`` and as the key into
        ``subset_position``.
    year, game_number : int
        The game to inspect.
    subset_position : dict
        ``{position: [stat_columns, threshold]}``; a player is kept when the
        row-wise sum of ``stat_columns`` in that game is >= ``threshold``.
    specific_players : list-like of player ids, optional
        When given, restricts the candidates to these ids instead of everyone
        at ``position``. Lists, numpy arrays and pandas Index objects all work.
    undrafted_position : number, default 255
        Draft position assigned to players whose ``draft_position`` is missing.

    Returns
    -------
    DataFrame
        Indexed by player_id with columns ``name``, ``draft_position`` and
        ``log_draft_position``.
    """
    # `is not None` (rather than `!= None`): comparing an array/Index with
    # `!=` is element-wise and makes the `if` raise ValueError.
    if specific_players is not None:
        ids = specific_players
    else:
        ids = all_players[all_players.position == position].index

    games = all_games.loc[ids]
    stat_columns = subset_position[position][0]
    sum_threshold = subset_position[position][1]

    # keep only players whose summed usage stats in this game reach the threshold
    enough_usage = games[stat_columns].sum(axis=1) >= sum_threshold
    worth_predicting = games[(games.year == year) &
                             (games.game_number == game_number) &
                             enough_usage]

    # .copy() so the column assignments below act on an independent frame
    selected = all_players.loc[worth_predicting.index, ['name', 'draft_position']].copy()
    # players without a draft position get `undrafted_position`; only this
    # column is filled so a missing name is not overwritten with a number
    selected['draft_position'] = selected['draft_position'].fillna(undrafted_position)
    selected['log_draft_position'] = np.log(selected['draft_position'])
    return selected

In [97]:
# Example: running backs who met the usage threshold in game 1 of the 2016 season
a = get_players_thatweek(
    all_games,
    all_players,
    position='RB',
    year=2016,
    game_number=1,
    subset_position=subset_position,
)
a

Unnamed: 0_level_0,name,draft_position,log_draft_position
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17858,Bilal Powell,126.0,4.836282
10586,Carlos Hyde,57.0,4.043051
5943,Shaun Draughn,255.0,5.541264
11664,Matt Jones,95.0,4.553877
23267,Spencer Ware,194.0,5.267858
8152,Frank Gore,65.0,4.174387
16086,DeMarco Murray,71.0,4.26268
18654,Theo Riddick,199.0,5.293305
12749,Eddie Lacy,61.0,4.110874
21133,James Starks,193.0,5.26269


In [98]:
def get_team_stats(games, year, game_number):
    """Build one row of passing/rushing features per team as of a given game.

    Parameters
    ----------
    games : DataFrame
        Player game logs with columns ``team``, ``year``, ``game_number``,
        ``passing_attempts``, ``passing_rating`` and ``rushing_attempts``.
    year, game_number : int
        Only games from earlier years (game_number <= 16) and from ``year`` up
        to and including ``game_number`` are used.

    Returns
    -------
    DataFrame
        Indexed by ``team``; each row is that team's most recent game in the
        window, with columns ``passing_attempts``, ``passing_rating``,
        ``ewma_team_passing_rating``, ``rushing_attempts``,
        ``ewma_team_rush_attempts``, ``rush_percentage`` and
        ``ewma_team_rush_percentage``. Moving averages use ``ewm(span=16)``
        over the team's games in chronological order.
    """
    group_keys = ['team', 'year', 'game_number']

    def _in_window(frame):
        # earlier years: games 1..16 only; current year: up to `game_number`
        earlier = (frame['year'] < year) & (frame['game_number'] <= 16)
        current = (frame['year'] == year) & (frame['game_number'] <= game_number)
        return frame[earlier | current]

    def _ewma_by_team(series):
        # transform() returns a result aligned with its input on every pandas
        # version, so no positional droplevel() is needed afterwards
        return series.groupby(level='team').transform(lambda s: s.ewm(span=16).mean())

    # --- passing -----------------------------------------------------------
    passing = _in_window(games)[group_keys + ['passing_attempts', 'passing_rating']]
    # sum of pass attempts by anyone on the team in that game
    team_pass_attempts = passing.groupby(group_keys)['passing_attempts'].sum()
    # passing rating of the player with the most pass attempts in that game
    pass_rating = (passing
                   .sort_values('passing_attempts', ascending=False, kind='mergesort')
                   .drop_duplicates(subset=group_keys)
                   .set_index(group_keys)['passing_rating']
                   .sort_index())  # chronological within team for the ewma
    ewma_pass_rating = _ewma_by_team(pass_rating).rename('ewma_team_passing_rating')

    # --- rushing -----------------------------------------------------------
    rushing = _in_window(games[games['rushing_attempts'] > 0])
    rush_attempts = rushing.groupby(group_keys)['rushing_attempts'].sum()

    # --- combine: one row per (team, year, game_number) --------------------
    team_stats = (pd.DataFrame(team_pass_attempts)
                  .join(pass_rating)
                  .join(ewma_pass_rating)
                  .join(rush_attempts)
                  .sort_index())
    team_stats['ewma_team_rush_attempts'] = _ewma_by_team(team_stats['rushing_attempts'])
    team_stats['rush_percentage'] = team_stats['rushing_attempts'] / (
        team_stats['rushing_attempts'] + team_stats['passing_attempts'])
    team_stats['ewma_team_rush_percentage'] = _ewma_by_team(team_stats['rush_percentage'])

    # rows are sorted by (team, year, game_number), so the last row of each
    # team is its most recent game; drop year/game_number to index by team only
    latest = team_stats.groupby(level='team').tail(1)
    return latest.reset_index(level=['year', 'game_number'], drop=True)

In [99]:
def get_def_data(gamesDef, year, gameNumber, gamesBack = 5):
    """Average each team's defensive stats over its most recent games.

    Parameters
    ----------
    gamesDef : DataFrame
        One row per team per game with columns ``Tm``, ``Year``, ``Game``,
        ``DY/P`` and ``TO`` (other columns are ignored).
    year, gameNumber : int
        Games from season ``year - 1`` and from ``year`` up to and including
        ``gameNumber`` are eligible.
    gamesBack : int, default 5
        Number of most recent eligible games per team to average.

    Returns
    -------
    DataFrame
        Indexed by ``Tm`` with the mean ``DY/P`` and ``TO`` of each team's last
        ``gamesBack`` eligible games (fewer if the team has fewer).
    """
    prevSeasonDef = gamesDef[gamesDef['Year'] == (year - 1)]
    currentSeasonDef = gamesDef[(gamesDef['Year'] == year) & (gamesDef['Game'] <= gameNumber)]
    # newest games first, so head(gamesBack) per team takes the most recent ones
    recent_first = pd.concat((prevSeasonDef, currentSeasonDef)).sort_values(
        by=['Year', 'Game'], axis=0, ascending=False)
    last_games = recent_first.groupby('Tm', sort=False).head(gamesBack)
    # select the two stat columns *before* averaging: mean() over the whole
    # frame would also try to average unrelated (possibly non-numeric) columns
    def_stats = last_games.groupby('Tm', sort=False)[['DY/P', 'TO']].mean()
    return def_stats

In [100]:
# Example: defensive averages over each team's 15 most recent games
# as of game 1 of the 2016 season
df1 = get_def_data(gamesDef, year=2016, gameNumber=1, gamesBack=15)
df1

Unnamed: 0_level_0,DY/P,TO
Tm,Unnamed: 1_level_1,Unnamed: 2_level_1
OAK,5.350027,2.1
NOR,6.745927,1.7
IND,5.722467,2.444444
PIT,5.64632,2.153846
NYJ,5.12812,1.916667
DET,5.643,2.111111
WAS,6.248087,1.909091
ATL,5.58424,2.545455
LAC,6.014253,1.636364
DAL,5.727227,2.333333


In [101]:
def get_features_response(players, all_games, year, game_number, points_dict, defWeeksBack=5,
                          def_games=None):
    """Assemble the feature row and next-game response for each selected player.

    Parameters
    ----------
    players : DataFrame
        Indexed by player_id (e.g. the output of get_players_thatweek); its
        columns are carried through to the result.
    all_games : DataFrame
        Player game logs indexed by ``player_id`` with ``team``, ``date``,
        ``year``, ``game_number``, ``rushing_attempts``, ``receiving_targets``
        and every stat named in ``points_dict``.
    year, game_number : int
        Features use games up to and including this game; the response is the
        fantasy points of game ``game_number + 1`` of the same year.
    points_dict : dict
        ``{stat column: fantasy points per unit}``.
    defWeeksBack : int, default 5
        Passed to get_def_data as ``gamesBack``.
    def_games : DataFrame, optional
        Defensive game logs for get_def_data. When omitted, the notebook-level
        ``gamesDef`` table is used (the original behaviour).

    Returns
    -------
    DataFrame
        ``players`` joined with each player's latest-game individual features
        (``fpts``, ``log_num_games``, ``ewma_fpts``, ``ewma_rushing_attempts``,
        ``ewma_receiving_targets``), the team features from get_team_stats and
        get_def_data, and the response column ``next_fpts``.
    """
    if def_games is None:
        def_games = gamesDef  # fall back to the table loaded at the top of the notebook

    games = all_games.loc[players.index].copy()

    # compute fpts for each row
    games['fpts'] = games['game_number'] * 0
    for stat, value in points_dict.items():
        games['fpts'] = games['fpts'] + games[stat] * value

    # Response: fantasy points of the following game (indexed by player_id)
    next_game = games.loc[(games.year == year) &
                          (games.game_number == game_number + 1), 'fpts'].rename('next_fpts')

    # History window: games 1..16 of earlier years plus this year up to game_number
    in_window = ((games.year < year) & (games.game_number <= 16)) | (
        (games.year == year) & (games.game_number <= game_number))
    history_columns = ['team', 'date', 'year', 'game_number', 'fpts',
                       'rushing_attempts', 'receiving_targets']
    # player_id becomes a regular column so rows have a unique index while the
    # per-player running statistics are computed
    history = (games.loc[in_window, history_columns]
               .reset_index()
               .sort_values(by=['player_id', 'year', 'game_number'])
               .reset_index(drop=True))

    ## Individual Statistics
    history['num_games'] = history.groupby('player_id').cumcount() + 1
    history['log_num_games'] = np.log(history['num_games'])
    ewma_columns = {
        'ewma_rushing_attempts': 'rushing_attempts',
        'ewma_receiving_targets': 'receiving_targets',
        'ewma_fpts': 'fpts',
    }
    for target, source in ewma_columns.items():
        # transform() keeps the result aligned with `history` on any pandas version
        history[target] = history.groupby('player_id')[source].transform(
            lambda s: s.ewm(span=16).mean())

    # Keep each player's most recent game only (rows are chronological per player).
    # The per-game rushing_attempts / receiving_targets are dropped here so they
    # do not collide with the team-level `rushing_attempts` joined below;
    # log_num_games is kept because the model's feature list uses it.
    keep = ['player_id', 'team', 'date', 'year', 'game_number', 'fpts', 'log_num_games',
            'ewma_fpts', 'ewma_rushing_attempts', 'ewma_receiving_targets']
    latest = history.groupby('player_id').tail(1)[keep]

    ## Team Statistics
    # get rush_percentage, rushing_attempts, and passer rating of teams
    team_stats = get_team_stats(games=all_games, year=year, game_number=game_number)
    # get defensive stats and join them to the team stats by team code
    # NOTE(review): this attaches the defensive rows of the player's *own* team
    # code; confirm whether the upcoming opponent's defence was intended.
    def_stats = get_def_data(def_games, year, game_number, defWeeksBack)
    team_stats = team_stats.join(def_stats)
    # join team stats to the player rows by team
    latest = latest.join(team_stats, how='left', on=['team']).set_index('player_id')

    # join to players with name and draft info (row order follows `players`)
    df = players.join(latest)

    # Response variable is the fantasy points of the next game, NA should be zero
    # because they didn't play or didn't score
    df['next_fpts'] = next_game
    df['next_fpts'] = df['next_fpts'].fillna(0)
    return df

In [102]:
# Example: features and next-game response for the players selected above,
# as of game 1 of the 2016 season
get_features_response(a, all_games, year=2016, game_number=1, points_dict=half_ppr)

Unnamed: 0_level_0,name,draft_position,log_draft_position,team,date,year,game_number,fpts,ewma_fpts,ewma_rushing_attempts,...,passing_attempts,passing_rating,ewma_team_passing_rating,rushing_attempts,ewma_team_rush_attempts,rush_percentage,ewma_team_rush_percentage,DY/P,TO,next_fpts
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17858,Bilal Powell,126.0,4.836282,NYJ,2016-09-11,2016,1,5.8,8.912268,5.131337,...,19,77.0,86.385564,30,27.739056,0.612245,0.559673,5.31214,1.5,0.8
10586,Carlos Hyde,57.0,4.043051,SFO,2016-09-12,2016,1,22.3,9.707592,13.544751,...,22,84.2,85.188259,42,26.397092,0.65625,0.547583,4.84292,1.75,6.7
5943,Shaun Draughn,255.0,5.541264,SFO,2016-09-12,2016,1,10.6,6.088254,6.886776,...,22,84.2,85.188259,42,26.397092,0.65625,0.547583,4.84292,1.75,2.1
11664,Matt Jones,95.0,4.553877,WAS,2016-09-12,2016,1,3.8,7.504538,10.89595,...,30,72.7,106.715521,12,25.430036,0.285714,0.500759,6.60628,1.333333,13.0
23267,Spencer Ware,194.0,5.267858,KAN,2016-09-11,2016,1,29.4,10.262829,7.653109,...,34,97.8,95.640579,19,26.819619,0.358491,0.570549,4.89348,1.5,11.5
8152,Frank Gore,65.0,4.174387,IND,2016-09-11,2016,1,9.8,11.173305,16.3988,...,31,119.5,83.541845,19,24.331832,0.38,0.51227,5.3658,1.666667,13.8
16086,DeMarco Murray,71.0,4.26268,TEN,2016-09-11,2016,1,22.2,12.584521,13.195112,...,25,86.5,86.987722,22,22.094555,0.468085,0.501644,5.19956,2.4,18.0
18654,Theo Riddick,199.0,5.293305,DET,2016-09-11,2016,1,25.3,10.030749,3.317164,...,31,128.6,106.822328,24,23.567241,0.436364,0.472121,6.11116,1.0,8.5
12749,Eddie Lacy,61.0,4.110874,GNB,2016-09-11,2016,1,8.3,10.238753,13.600464,...,20,95.1,89.472135,25,27.440341,0.555556,0.555273,5.42282,2.666667,5.0
21133,James Starks,193.0,5.26269,GNB,2016-09-11,2016,1,1.8,7.821561,7.676184,...,20,95.1,89.472135,25,27.440341,0.555556,0.555273,5.42282,2.666667,2.4


In [17]:
## Pretend it's 2016, NFL week 5

# Get all features and responses for running backs from START_YEAR to YEAR before GAME_NUMBER
POSITION = 'RB'
START_YEAR = 2016
YEAR = 2016
GAME_NUMBER = 5
# features
FEATURES = ['log_draft_position', 'log_num_games', 'ewma_fpts',
            'ewma_team_rush_attempts', 'ewma_team_rush_percentage',
            'ewma_team_passing_rating', 'ewma_rushing_attempts', 'ewma_receiving_targets',
            'DY/P']
# response
RESPONSE = ['next_fpts']

# per-week training frames are collected here and concatenated once at the end
feature_list = []
response_list = []
# evaluation accumulators filled by the test cell
r_sqr_list = []
mean_abs_err_list = []
mean_test_fpts_list = []

for train_year in range(START_YEAR, YEAR + 1):
    if train_year == YEAR:
        # current year: stop before the week we pretend to be in
        week_limit = GAME_NUMBER - 1
    else:
        # previous years: regular season only
        # NOTE(review): for week 16 the response is "game 17" fantasy points,
        # which is filled with 0 when no such game exists; confirm this is wanted.
        week_limit = 16

    for train_week in range(1, week_limit + 1):
        players = get_players_thatweek(all_games, all_players, POSITION, train_year,
                                       train_week, subset_position)
        # keyword must match the function signature exactly: defWeeksBack
        train = get_features_response(players, all_games, train_year, train_week,
                                      points_dict=half_ppr, defWeeksBack=5)
        print('Got train data for year ' + str(train_year) + ' game number ' + str(train_week))
        feature_list.append(train[FEATURES])
        response_list.append(train[RESPONSE])

## After all feature and response training lists have been created
#  Concat lists into train dataframes; missing feature values are set to zero
train_x = pd.concat(feature_list).fillna(0)
train_y = pd.concat(response_list)
print('train data: ')
print(train_x.head())
print(train_x.tail())
print('train response: ')
print(train_y.head())
print(train_y.tail())

# Train a linear regression model to predict next-game fantasy points for RBs
lr = LinearRegression()
lr = lr.fit(train_x, train_y)
print('Model trained. Here are the results:')
print('Intercept:')
print(lr.intercept_)
# train_y is a one-column DataFrame, so coef_ has shape (1, n_features);
# flatten it so every feature is paired with its own coefficient
for feat, coef in zip(FEATURES, np.ravel(lr.coef_)):
    print(feat + ' ' + str(coef))


Got train data for year 2016 game number 1
Got train data for year 2016 game number 2
Got train data for year 2016 game number 3
Got train data for year 2016 game number 4
train data: 
           log_draft_position  log_num_games  ewma_fpts  \
player_id                                                 
17858                4.836282       4.077537   8.912268   
10586                4.043051       3.091042   9.707592   
5943                 5.541264       3.737670   6.088254   
11664                4.553877       2.639057   7.504538   
23267                5.267858       2.639057  10.262829   

           ewma_team_rush_attempts  ewma_team_rush_percentage  \
player_id                                                       
17858                    27.437597                   0.552663   
10586                    24.316705                   0.533094   
5943                     24.316705                   0.533094   
11664                    27.220708                   0.529432   
23267      

In [None]:
# test linear regression on the weeks following the training window
LAST_TEST_WEEK = 7  # inclusive; the original loop was range(GAME_NUMBER, 8)

# start from empty accumulators so re-running this cell does not double-count
r_sqr_list = []
mean_abs_err_list = []
mean_test_fpts_list = []

for test_week in range(GAME_NUMBER, LAST_TEST_WEEK + 1):
    # Get test_x and test_y
    players = get_players_thatweek(all_games, all_players, POSITION, YEAR, test_week, subset_position)
    test = get_features_response(players, all_games, YEAR, test_week, points_dict=half_ppr)
    # fill NaN in test_x with zeros (same treatment as the training features)
    test_x = test[FEATURES].fillna(0)
    test_y = test[RESPONSE]
    predictions = lr.predict(test_x)

    # Score gives R^2 of prediction of test_x wrt test_y
    # Note: R^2 is correlated to how many easy predictions
    #  (players with low fantasy points are easy to predict).
    #  R^2 decreases when predicting on fewer but better running backs.
    print('Year: ' + str(YEAR) + '  Game Number: ' + str(test_week))
    r_sqr = lr.score(test_x, test_y)
    r_sqr_list.append(r_sqr)
    mean_abs_err = round(mean_absolute_error(test_y, predictions), 2)
    mean_abs_err_list.append(mean_abs_err)
    # test_y is a one-column DataFrame; reduce over its values to get a plain float
    mean_test_fpts = float(test_y.values.mean())
    mean_test_fpts_list.append(mean_test_fpts)

    print('R^2: ' + str(round(r_sqr, 2)))
    print('Mean Abs Error: ' + str(mean_abs_err))

print('Mean R^2: ' + str(round(np.mean(r_sqr_list), 2)))
print('Overall mean abs err: ' + str(round(np.mean(mean_abs_err_list), 2)))
print('Overall mean test fpts:' + str(round(np.mean(mean_test_fpts_list), 2)))

In [None]:
# Number of (player, game) rows in the training set
n_train_rows = train_x.shape[0]
print('number of running back games: ' + str(n_train_rows))