**Models**

**Running Back**

In [152]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARMA
pd.set_option('display.max_columns', 50)

In [18]:
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv')
all_games.drop([440917], inplace=True) # delete weird Lagerrete Blount double game
all_games.loc[all_games['team'] == 'SDG', 'team'] = 'LAC'
all_games.loc[all_games['team'] == 'STL', 'team'] = 'LAR'
all_games.loc[all_games['opponent'] == 'SDG', 'opponent'] = 'LAC'
all_games.loc[all_games['opponent'] == 'STL', 'opponent'] = 'LAR'
all_games = all_games.set_index('player_id')
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv',index_col='player_id')

In [155]:
data = pd.read_csv('rbs_2014_2017.csv', index_col='player_id')
data.head()

Unnamed: 0_level_0,year,game_number,name,team,date,log_draft_position,log_num_games,ewma_fpts,ewma_team_rush_attempts,ewma_team_rush_percentage,ewma_team_passing_rating,ewma_rushing_attempts,ewma_receiving_targets,DY/P,TO,next_fpts,fpts
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10,2015,1,Ameer Abdullah,DET,2015-09-13,3.988984,0.0,17.4,23.809087,0.517256,84.592646,7.0,4.0,5.4125,1.4,2.3,17.4
10,2015,2,Ameer Abdullah,DET,2015-09-20,3.988984,0.693147,9.378125,22.890371,0.495618,84.005276,6.46875,2.40625,4.47098,1.75,11.2,2.3
10,2015,3,Ameer Abdullah,DET,2015-09-27,3.988984,1.098612,10.062809,22.43268,0.482016,82.887008,7.044213,3.381014,4.53456,1.666667,5.4,11.2
10,2015,4,Ameer Abdullah,DET,2015-10-05,3.988984,1.386294,8.670033,21.911188,0.475728,82.94736,8.8232,3.864604,5.4733,2.0,3.0,5.4
10,2015,5,Ameer Abdullah,DET,2015-10-11,3.988984,1.609438,7.236029,21.451048,0.455653,81.059436,8.109187,3.393028,5.46972,1.75,8.4,3.0


In [160]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Training on the 2017 season
MSEarma = []
MAEarma = []
MSElr = []
MAElr = []
MSErf = []
MAErf = []
MSEbase = []
MAEbase = []

WEEK = 1
POSITION = 'RB'
YEAR = 2017

features = ['log_draft_position','log_num_games','ewma_fpts','ewma_team_rush_attempts','ewma_team_rush_percentage','ewma_team_passing_rating','ewma_rushing_attempts','ewma_receiving_targets','DY/P','TO']
resp = ['next_fpts']
X_train = data.loc[data['year'] < 2017][features]
X_traincurr = data.loc[(data['year'] == 2017) & (data['game_number'] < WEEK)][features]
if len(X_traincurr)>0:
    X_train = X_train.append(X_traincurr)
y_train = data.loc[data['year'] < 2017][resp]
y_traincurr = data.loc[(data['year'] == 2017) & (data['game_number'] < WEEK)][resp]
if len(y_traincurr)>0:
    y_train = y_train.append(y_traincurr) 
X_test = data.loc[(data['year'] == 2017) & (data['game_number']==WEEK)][features]
y_test = data.loc[(data['year'] == 2017) & (data['game_number']==WEEK)][resp]

# Fitting models

# naive approach
preds = data.loc[(data['year'] == 2017) & (data['game_number']==WEEK)]['fpts']
MAEbase.append(mean_absolute_error(y_test, preds))
MSEbase.append(mean_squared_error(y_test, preds))

# ARMA
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, WEEK, subset_position)
X, resp = get_features_response_ARMA(players, all_games, YEAR, WEEK, half_ppr)

preds = []
for i in resp.index:
    if type(X.loc[i]['fpts']) != np.float64:
        past_fpts = X.loc[i]['fpts'].values
        model = ARMA(past_fpts, (0,1))
        model_fit = model.fit(disp=0)
        preds.append(model_fit.forecast()[0][0])
    else:
        resp.drop([i],inplace=True)
MAEarma.append(mean_absolute_error(resp, preds))
MSEarma.append(mean_squared_error(resp, preds))

# linear regression
XtrainS = StandardScaler().fit_transform(X_train)
XtestS = StandardScaler().fit_transform(X_test)
lr = LinearRegression()
lr.fit(XtrainS, y_train)
preds = lr.predict(XtestS)
MAElr.append(mean_absolute_error(y_test, preds))
MSElr.append(mean_squared_error(y_test, preds))

# random forest
rf = RandomForestRegressor(n_estimators = 350)
rf.fit(XtrainS, y_train)
preds = rf.predict(XtestS)
MAErf.append(mean_absolute_error(y_test, preds))
MSErf.append(mean_squared_error(y_test, preds))



In [161]:
print(MAErf, MSErf)
print(MAElr, MSElr)
print(MAEbase, MSEbase)
print(MAEarma, MSEarma)

[5.499047619047618] [44.872744505344976]
[5.3036770344002155] [43.78875939387488]
[5.980952380952382] [59.059047619047625]
[6.416494908112935] [72.29194970092794]


Helpful Functions

In [17]:
def get_players_thatweek(all_games, all_players, position, year, game_number, subset_position, specific_players=None):
    if specific_players != None:
        ids = specific_players
    else:
        ids = all_players[all_players.position == position].index
        
    # return pandas df with player_id as index and player name and log draft position columns
    games = all_games.loc[ids]
    stats = subset_position[position][0]
    sum_threshold = subset_position[position][1]
    # only take RBs with 5 rush attempts or receiving targets (>= sum_threshold)
    worth_predicting = games[(games.year == year) & 
                             (games.game_number == game_number) &
                             (np.sum(games[stats], axis=1) >= sum_threshold)]
    ids = worth_predicting.index
    for_df = all_players.loc[ids, ['name', 'draft_position']]
    # players that weren't drafted give them position of last pick of draft
    for_df.fillna(255, inplace=True)
    for_df['log_draft_position'] = np.log(for_df.draft_position)
    return pd.DataFrame(data=for_df)

In [20]:
half_ppr = {
    'rushing_yards': 0.1,
    'rushing_touchdowns': 6,
    'receiving_receptions': 0.5,
    'receiving_yards': 0.1,
    'receiving_touchdowns': 6
}

In [21]:
# subset RB position by choosing only RBs with at least 3 fantasy points in previous game
subset_position = {
    'RB': [['rushing_attempts', 'receiving_targets'], 5.0],
    'WR': [['rushing_attempts', 'receiving_targets'], 5.0]
}

In [43]:
def get_features_response_ARMA(players, all_games, year, gameNumber, points_dict):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value

    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number <= gameNumber)]
    next_game = games[(games.year == year) &
                     (games.game_number == gameNumber+1)]
    
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=False, inplace=True)
    # group dataframe by index
    df = df.groupby(df.index)
    
    past_16 = df.nth(list(range(0,16))).fpts
    weeks = df.nth(list(range(0,16))).game_number
    years = df.nth(list(range(0,16))).year
    
    # get response variable, the next game fantasy points 
    resp = next_game['fpts']

    for_return = pd.concat([past_16, years, weeks], axis=1)
    for_return.sort_values(by=['player_id', 'year', 'game_number'], axis=0, ascending=False, inplace=True)
    resp = resp.sort_index(ascending=False)
    return for_return, resp