In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)

In [2]:
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv')
all_games.drop([440917], inplace=True)
all_games = all_games.set_index('player_id')
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv',index_col='player_id')
gamesDef = pd.read_csv('defData12-17.csv',index_col='Tm')
gamesDef = gamesDef.drop(columns=["Rk","Time","LTime"])

In [3]:
# weird situation: player id 1890 (LeGarrette Blount) was on the Steelers in the 11th game of 2014, was 
# released from the Steelers and played for the Patriots in their 11th game of 2014. Since he didn't touch the ball
# with the Steelers, deleted that row b/c it causes errors later on
# all_games[(all_games['year'] == 2014) & (all_games['player_id'] == 1890)]

In [4]:
# dictionary that has fantasy value for each nfl stat
half_ppr = {
    'rushing_yards': 0.1,
    'rushing_touchdowns': 6,
    'receiving_receptions': 0.5,
    'receiving_yards': 0.1,
    'receiving_touchdowns': 6
}

In [5]:
# subset RB position by choosing only RBs with at least 5 rushing attempts
# or receiving targets in a given nfl week.
subset_position = {
    'RB': [['rushing_attempts', 'receiving_targets'], 5]
}

In [6]:
def get_players_thatweek(all_games, all_players, position, year, gamenumber, subset_position):
    # return pandas df with player_id as index and column 'name' as player name
    ids = all_players[all_players.position == position].index
    games = all_games.loc[ids]
    stats = subset_position[position][0]
    sum_threshold = subset_position[position][1]
    # only take RBs with 5 rush attempts or receiving targets (>= sum_threshold)
    worth_predicting = games[(games.year == year) & 
                             (games.game_number == gamenumber) &
                             (np.sum(games[stats], axis=1) >= sum_threshold)]
    ids = worth_predicting.index
    for_df = all_players.loc[ids, 'name']
    return pd.DataFrame(data=for_df)

In [7]:
# example usage
a = get_players_thatweek(all_games, all_players,'RB',2014,11,subset_position)
a.head()

Unnamed: 0_level_0,name
player_id,Unnamed: 1_level_1
10839,Steven Jackson
10586,Carlos Hyde
1457,Joique Bell
23980,Andre Williams
17587,Bernard Pierce


In [8]:
def get_def_data(gamesDef, opps, gameNumber, year):
    prevSeasonDef = gamesDef[gamesDef['Year']==(year-1)]
    currentSeasonDef = gamesDef[(gamesDef['Year']==year) & (gamesDef['Game']<=(gameNumber-1))]
    df = pd.concat((prevSeasonDef, currentSeasonDef))
    df.sort_values(by=['Year', 'Game'], axis=0, ascending=False, inplace=True)
    df = df.loc[opps]
    grouped = df.groupby('Tm', sort=False)
    last5_DYP = grouped.nth(list(range(0,5))).groupby('Tm', sort=False).mean()['DY/P']
    last5_TO = grouped.nth(list(range(0,5))).groupby('Tm', sort=False).mean()['TO']
    DYP = []
    TO = []
    for i in opps:
        DYP.append(last5_DYP[i])
        TO.append(last5_TO[i])
    return DYP, TO

In [9]:
def get_features_response(players, all_games, year, gameNumber, points_dict, def_data):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value

    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < gameNumber)]
    next_game = games[(games.year == year) &
                     (games.game_number == gameNumber)]
    opps = next_game['opponent']
    #get oppponents' defense stats
    last5_DYP, last5_TO = get_def_data(def_data, opps, gameNumber, year)
    
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=False, inplace=True)
    # group dataframe by index
    df = df.groupby(df.index)
    
    num_prev = df.fpts.agg('count').rename('num_prev')
    last = df.nth([0]).groupby('player_id').agg('mean').fpts.rename('last')
    next_3 = df.nth(list(range(1,4))).groupby('player_id').agg('mean').fpts.rename('next_3')
    next_15 = df.nth(list(range(4,19))).groupby('player_id').agg('mean').fpts.rename('next_15')
    to_debut = df.nth(list(range(19,300))).groupby('player_id').agg('mean').fpts.rename('to_debut')
    
    # get response variable, the next game fantasy points 
    resp = next_game['fpts']

    for_return = players.join([num_prev,last, next_3, next_15, to_debut], how='left')
    for_return.insert(6, "last5 DY/P", last5_DYP)
    for_return.insert(7, "last5 TO", last5_TO)
    for_return.insert(8, "resp", resp.values)
    return for_return

In [10]:
# example usage
players = get_players_thatweek(all_games, all_players,'RB',2014,11, subset_position)
get_features_response(players, all_games, 2014, 11, half_ppr, gamesDef)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0_level_0,name,num_prev,last,next_3,next_15,to_debut,last5 DY/P,last5 TO,resp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10839,Steven Jackson,153,5.7,10.5,10.373333,14.615672,4.67384,2.0,12.1
10586,Carlos Hyde,10,3.5,5.2,4.833333,,5.55198,2.2,7.6
1457,Joique Bell,49,13.0,9.966667,11.48,6.853333,5.68434,1.5,5.1
23980,Andre Williams,10,1.1,9.0,6.566667,,5.39738,1.6,11.2
17587,Bernard Pierce,39,3.1,5.833333,4.113333,4.86,5.78462,1.8,3.9
8152,Frank Gore,142,12.4,7.7,10.186667,14.371545,5.55198,2.2,3.6
13751,Marshawn Lynch,114,13.0,25.5,15.686667,13.013684,4.84612,1.2,9.7
19486,Bishop Sankey,10,11.0,7.5,5.5,,5.36146,3.25,6.1
2556,Bryce Brown,35,10.6,7.2,3.246667,5.78125,5.60266,4.0,2.0
16086,DeMarco Murray,47,16.1,18.266667,19.793333,12.253571,6.49304,2.2,15.3


In [285]:
POSITION = 'RB'
START_YEAR = 2013
YEAR = 2016
NFL_WEEK = 9
# features 
FEATURES = ['num_prev', 'last', 'next_3', 'next_15', 'to_debut','last5 DY/P','last5 TO']
# response
RESPONSE = ['resp']

# append features and response each week to these lists
feature_list = []
response_list = []

for year in range(START_YEAR, YEAR+1):   
    
    # if current year dont go past nfl week
    if year == YEAR:
        week_limit = NFL_WEEK-1
    else: # if previous year don't go past regular season (nfl week 16)
        week_limit = 16

    for week in range(1,week_limit+1):
        #print("week: ",week," year: ",year)
        players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
        train = get_features_response(players, all_games, year, week, half_ppr, gamesDef)
        feature = train[FEATURES]
        response = train[RESPONSE]
        feature_list.append(feature)
        response_list.append(response)

# At the end concatenate feature and response lists 
#  into train_x and train_y dataframes
train_x = pd.concat(feature_list)
train_y = pd.concat(response_list)

# fill NaN in train_x with zeros
train_x = train_x.fillna(0)

# Get test_x and test_y
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, NFL_WEEK, subset_position)
test = get_features_response(players, all_games, YEAR, NFL_WEEK, half_ppr, gamesDef)
test_x = test[FEATURES]
test_y = test[RESPONSE]

# fill NaN in test_x with zeros
test_x = test_x.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [286]:
# Naive model: predict last week's performance
preds = test_x['last']
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

sse = 0
for i in range(len(preds)):
    sse += np.abs(preds.values[i] - test_y.values[i][0])
print("MAE: ", sse/len(preds))

MSE:  94.25779661016949
MAE:  7.483050847457628


In [287]:
# Train a linear regression model to predict rest of season ppg for RBs
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr = lr.fit(train_x, train_y)
print('Intercept:', lr.intercept_)
print(FEATURES)
print(lr.coef_)
print('R^2: ', lr.score(test_x, test_y))

Intercept: [1.26403061]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut', 'last5 DY/P', 'last5 TO']
[[-0.01142514  0.17205766  0.22793364  0.15257873  0.06268435  0.7397996
   0.04957784]]
R^2:  0.23105169958034621


In [288]:
#Using statsmodels
import statsmodels.api as sm 
X_train = sm.add_constant(train_x)
# Fit OLS model 
model = sm.OLS(train_y, X_train).fit() 
# Print model summary 
model.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,90.3
Date:,"Thu, 05 Dec 2019",Prob (F-statistic):,7.81e-121
Time:,14:33:02,Log-Likelihood:,-10994.0
No. Observations:,3328,AIC:,22000.0
Df Residuals:,3320,BIC:,22050.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.2640,1.170,1.081,0.280,-1.029,3.558
num_prev,-0.0114,0.004,-2.635,0.008,-0.020,-0.003
last,0.1721,0.017,10.128,0.000,0.139,0.205
next_3,0.2279,0.025,9.150,0.000,0.179,0.277
next_15,0.1526,0.033,4.568,0.000,0.087,0.218
to_debut,0.0627,0.034,1.846,0.065,-0.004,0.129
last5 DY/P,0.7398,0.194,3.805,0.000,0.359,1.121
last5 TO,0.0496,0.192,0.258,0.796,-0.327,0.426

0,1,2,3
Omnibus:,584.524,Durbin-Watson:,2.033
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1065.72
Skew:,1.098,Prob(JB):,3.8199999999999996e-232
Kurtosis:,4.692,Cond. No.,653.0


In [289]:
X_test = sm.add_constant(test_x)
preds = model.predict(X_test) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

sse = 0
for i in range(len(preds)):
    sse += np.abs(preds.values[i] - test_y.values[i][0])
print("MAE: ", sse/len(preds))

MSE:  57.190588381861666
MAE:  5.71963260310083


In [290]:
# worse than before!
# trying it without TO
X_train2 = sm.add_constant(train_x.drop(columns=['last5 TO']))
model2 = sm.OLS(train_y, X_train2).fit() 
model2.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,105.4
Date:,"Thu, 05 Dec 2019",Prob (F-statistic):,7.540000000000001e-122
Time:,14:33:07,Log-Likelihood:,-10994.0
No. Observations:,3328,AIC:,22000.0
Df Residuals:,3321,BIC:,22050.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.3774,1.084,1.271,0.204,-0.748,3.503
num_prev,-0.0115,0.004,-2.645,0.008,-0.020,-0.003
last,0.1721,0.017,10.129,0.000,0.139,0.205
next_3,0.2280,0.025,9.152,0.000,0.179,0.277
next_15,0.1525,0.033,4.567,0.000,0.087,0.218
to_debut,0.0630,0.034,1.856,0.064,-0.004,0.130
last5 DY/P,0.7370,0.194,3.797,0.000,0.356,1.118

0,1,2,3
Omnibus:,584.508,Durbin-Watson:,2.033
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1065.658
Skew:,1.098,Prob(JB):,3.94e-232
Kurtosis:,4.692,Cond. No.,606.0


In [291]:
X_test2 = sm.add_constant(test_x.drop(columns=['last5 TO']))
preds2 = model2.predict(X_test2) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds2.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds2))

MSE:  56.99818482659592


In [292]:
#normalizing features
from sklearn.preprocessing import StandardScaler
X_train3 = sm.add_constant(StandardScaler().fit_transform(train_x.drop(columns=['last5 TO'])))
model3 = sm.OLS(train_y, X_train3).fit() 
model3.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,105.4
Date:,"Thu, 05 Dec 2019",Prob (F-statistic):,7.540000000000001e-122
Time:,14:33:14,Log-Likelihood:,-10994.0
No. Observations:,3328,AIC:,22000.0
Df Residuals:,3321,BIC:,22050.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,10.0346,0.114,87.833,0.000,9.811,10.259
x1,-0.4349,0.164,-2.645,0.008,-0.757,-0.113
x2,1.3145,0.130,10.129,0.000,1.060,1.569
x3,1.3521,0.148,9.152,0.000,1.062,1.642
x4,0.7953,0.174,4.567,0.000,0.454,1.137
x5,0.3572,0.192,1.856,0.064,-0.020,0.735
x6,0.4339,0.114,3.797,0.000,0.210,0.658

0,1,2,3
Omnibus:,584.508,Durbin-Watson:,2.033
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1065.658
Skew:,1.098,Prob(JB):,3.94e-232
Kurtosis:,4.692,Cond. No.,3.5


In [293]:
X_test3 = sm.add_constant(StandardScaler().fit_transform(test_x.drop(columns=['last5 TO'])))
preds3 = model3.predict(X_test3) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds3[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds3))

MSE:  56.97082445560976


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [294]:
# Without defense data?
X_train4 = sm.add_constant(train_x.drop(columns=['last5 TO','last5 DY/P']))
model4 = sm.OLS(train_y, X_train4).fit() 
model4.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.156
Model:,OLS,Adj. R-squared:,0.155
Method:,Least Squares,F-statistic:,123.1
Date:,"Thu, 05 Dec 2019",Prob (F-statistic):,8.19e-120
Time:,14:33:20,Log-Likelihood:,-11001.0
No. Observations:,3328,AIC:,22010.0
Df Residuals:,3322,BIC:,22050.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.3859,0.247,21.847,0.000,4.903,5.869
num_prev,-0.0116,0.004,-2.676,0.007,-0.020,-0.003
last,0.1710,0.017,10.050,0.000,0.138,0.204
next_3,0.2265,0.025,9.077,0.000,0.178,0.275
next_15,0.1552,0.033,4.639,0.000,0.090,0.221
to_debut,0.0632,0.034,1.859,0.063,-0.003,0.130

0,1,2,3
Omnibus:,585.224,Durbin-Watson:,2.031
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1065.221
Skew:,1.1,Prob(JB):,4.8999999999999996e-232
Kurtosis:,4.685,Cond. No.,135.0


In [295]:
X_test4 = sm.add_constant(test_x.drop(columns=['last5 TO','last5 DY/P']))
preds4 = model4.predict(X_test4) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds4.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds4))

MSE:  56.949822280692146


Using sklearn

In [296]:
# including last5 DY/P and scaled features
lr = LinearRegression()
lr.fit(X_train3, train_y)
print('Intercept:', lr.intercept_)
print(FEATURES)
print(lr.coef_)
print('R^2: ', lr.score(X_test3, test_y))
preds = lr.predict(X_test3)
print("MSE: ",sum((preds - test_y.values)**2)/len(preds))

Intercept: [10.03464543]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut', 'last5 DY/P', 'last5 TO']
[[ 0.         -0.4349254   1.31446508  1.35209679  0.79533407  0.35719891
   0.43394503]]
R^2:  0.2340065056483754
MSE:  [56.97082446]


In [297]:
# without defense data
lr = LinearRegression()
lr.fit(X_train4, train_y)
print('Intercept:', lr.intercept_)
print(FEATURES)
print(lr.coef_)
print('R^2: ', lr.score(X_test4, test_y))
preds = lr.predict(X_test4)
print("MSE: ",sum((preds - test_y.values)**2)/len(preds))

Intercept: [5.38590127]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut', 'last5 DY/P', 'last5 TO']
[[ 0.         -0.01161661  0.17103333  0.22651036  0.15518261  0.06321914]]
R^2:  0.23428888754310584
MSE:  [56.94982228]


Lasso

In [298]:
from sklearn.linear_model import Lasso
# including last5 DY/P and scaled features
las_model = Lasso(alpha = 0.01)
las_model.fit(X_train3, train_y)
print('R^2: ', las_model.score(X_test3, test_y))
preds = las_model.predict(X_test3)
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

R^2:  0.23335201417611418
MSE:  57.01950230868406


In [299]:
#without defense data
las_model = Lasso(alpha = 0.01)
las_model.fit(X_train4, train_y)
print('R^2: ', las_model.score(X_test4, test_y))
preds = las_model.predict(X_test4)
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

R^2:  0.23417123659952363
MSE:  56.958572578577574


Random forest?

In [302]:
# including last5 DY/P and scaled features
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=250, random_state = 35)
rf_model.fit(X_train3, train_y)
preds = rf_model.predict(X_test3)
print('R^2: ', rf_model.score(X_test3, test_y))
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

  after removing the cwd from sys.path.


R^2:  0.17492560555366454
MSE:  61.36497089783804


In [202]:
#without defense data
rf_model = RandomForestRegressor(n_estimators=350, random_state = 35)
rf_model.fit(X_train4, train_y)
preds = rf_model.predict(X_test4)
print('R^2: ', rf_model.score(X_test4, test_y))
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

  This is separate from the ipykernel package so we can avoid doing imports until


R^2:  0.1709542431631037
MSE:  61.6603412779661


Boosted Tree Regressor

In [203]:
# including last5 DY/P and scaled features
from sklearn.ensemble import GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators=350, random_state = 35)
gb_model.fit(X_train3, train_y)
preds = gb_model.predict(X_test3)
print('R^2: ', gb_model.score(X_test3, test_y))
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

  y = column_or_1d(y, warn=True)


R^2:  0.17840849009491544
MSE:  61.10593109493906


In [204]:
#without defense data
from sklearn.ensemble import GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators=350, random_state = 35)
gb_model.fit(X_train4, train_y)
preds = gb_model.predict(X_test4)
print('R^2: ', gb_model.score(X_test4, test_y))
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))
se = 0
for i in range(len(preds)):
    se += np.abs(preds[i] - test_y.values[i][0])
print("MAE: ", se/len(preds))

  y = column_or_1d(y, warn=True)


R^2:  0.16140505359557034
MSE:  62.37056297900781
MAE:  5.554143295392428


**An ARMA for comparison**

In [205]:
def get_features_response_ARMA(players, all_games, year, gameNumber, points_dict):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value

    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < gameNumber)]
    next_game = games[(games.year == year) &
                     (games.game_number == gameNumber)]
    
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=False, inplace=True)
    # group dataframe by index
    df = df.groupby(df.index)
    
    past_16 = df.nth(list(range(0,16))).fpts
    weeks = df.nth(list(range(0,16))).game_number
    years = df.nth(list(range(0,16))).year
    
    # get response variable, the next game fantasy points 
    resp = next_game['fpts']

    for_return = pd.concat([past_16, years, weeks], axis=1)
    for_return.sort_values(by=['player_id', 'year', 'game_number'], axis=0, ascending=False, inplace=True)
    resp = resp.sort_index(ascending=False)
    return for_return, resp

In [218]:
POSITION = 'RB'
YEAR = 2016
NFL_WEEK = 9
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, NFL_WEEK, subset_position)
X, resp = get_features_response_ARMA(players, all_games, YEAR, NFL_WEEK, half_ppr)

from statsmodels.tsa.arima_model import ARMA
preds = []
for i in range(len(resp)):
    id = resp.index[i]
    past_fpts = X.loc[id]['fpts'].values
    #print(past_fpts)
    model = ARMA(past_fpts, (0,1))
    model_fit = model.fit(disp=0)
    #residuals = pd.DataFrame(model_fit.resid)
    #residuals.plot()
    preds.append(model_fit.forecast()[0][0])
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - resp.values[i])**2
print("MSE: ", sse/len(preds))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


MSE:  67.02332980829279


**Parameter tuning** for Lasso and Random Forest on training data up to 2016 week 16

should scale features?

In [235]:
POSITION = 'RB'
START_YEAR = 2013
YEAR = 2017
NFL_WEEK = 1
# features 
FEATURES = ['num_prev', 'last', 'next_3', 'next_15', 'to_debut','last5 DY/P','last5 TO']
# response
RESPONSE = ['resp']

# append features and response each week to these lists
feature_list = []
response_list = []

for year in range(START_YEAR, YEAR+1):   
    
    # if current year dont go past nfl week
    if year == YEAR:
        week_limit = NFL_WEEK-1
    else: # if previous year don't go past regular season (nfl week 16)
        week_limit = 16

    for week in range(1,week_limit+1):
        #print("week: ",week," year: ",year)
        players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
        train = get_features_response(players, all_games, year, week, half_ppr, gamesDef)
        feature = train[FEATURES]
        response = train[RESPONSE]
        feature_list.append(feature)
        response_list.append(response)

# At the end concatenate feature and response lists 
#  into train_x and train_y dataframes
train_x = pd.concat(feature_list)
train_y = pd.concat(response_list)

# fill NaN in train_x with zeros
train_x = train_x.fillna(0)

# Get test_x and test_y
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, NFL_WEEK, subset_position)
test = get_features_response(players, all_games, YEAR, NFL_WEEK, half_ppr, gamesDef)
test_x = test[FEATURES]
test_y = test[RESPONSE]

# fill NaN in test_x with zeros
test_x = test_x.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [236]:
train_x = train_x.drop(columns=['last5 TO'])

In [243]:
trainX2016 = train_x
trainX2016.head()

Unnamed: 0_level_0,num_prev,last,next_3,next_15,to_debut,last5 DY/P
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
17858,16.0,7.8,8.166667,5.1,0.0,5.32722
22024,25.0,0.6,6.166667,5.853333,8.333333,5.56478
10839,131.0,13.2,15.533333,10.793333,15.285714,6.36146
10524,27.0,2.8,5.9,5.92,4.6375,5.09174
1457,24.0,3.5,9.866667,6.72,0.0,5.54738


In [244]:
trainy2016 = train_y
trainy2016.head()

Unnamed: 0_level_0,resp
player_id,Unnamed: 1_level_1
17858,8.4
22024,8.4
10839,14.7
10524,2.4
1457,23.7


In [283]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
kf = KFold(n_splits=5, shuffle=True, random_state=3)
# re index
trainX2016.reset_index(drop=True, inplace=True)
trainy2016.reset_index(drop=True, inplace=True)

range_est = [100,150,200,250,300,350,400]
mses = [0,0,0,0,0,0,0]

for train_index, test_index in kf.split(X):
    X_train, X_test = trainX2016.iloc[train_index], trainX2016.iloc[test_index]
    y_train, y_test = trainy2016.iloc[train_index], trainy2016.iloc[test_index]
    for j in range(len(range_est)):
        rf_model = RandomForestRegressor(n_estimators=range_est[j], random_state = 35)
        rf_model.fit(X_train, y_train)
        preds = rf_model.predict(X_test)
        #MSE
        sse = 0
        for i in range(len(preds)):
            sse += (preds[i] - y_test.values[i][0])**2
        mses[j] += sse/len(preds)
print(mses)  
# 350 is best

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()


  app.launch_new_instance()
  app.launch_new_instance()


[228.79173906483405, 228.43973373654032, 227.76831977353163, 227.65821038611156, 227.49382115338813, 227.34567807929471, 227.62469912363898]


In [281]:
kf = KFold(n_splits=5, shuffle=True, random_state=3)
# re index
trainX2016.reset_index(drop=True, inplace=True)
trainy2016.reset_index(drop=True, inplace=True)

range_a = [0.01,0.05,0.1,0.2,0.3,0.4,0.5]
mses = [0,0,0,0,0,0,0]

for train_index, test_index in kf.split(X):
    X_train, X_test = trainX2016.iloc[train_index], trainX2016.iloc[test_index]
    y_train, y_test = trainy2016.iloc[train_index], trainy2016.iloc[test_index]
    for j in range(len(range_a)):
        las_model = Lasso(alpha = range_a[j])
        las_model.fit(X_train, y_train)
        preds = las_model.predict(X_test)
        #MSE
        sse = 0
        for i in range(len(preds)):
            sse += (preds[i] - y_test.values[i][0])**2
        mses[j] += sse/len(preds)
print(mses)  
#alpha = 0.01 is best

[214.80744238540308, 214.83332130481583, 214.9371464030825, 215.3838614366919, 216.1489241206329, 217.14473443758254, 217.7563682815356]
