In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)

In [2]:
all_games = pd.read_csv('nfl-football-player-stats/games_1995.csv')
all_games.drop([440917], inplace=True)
all_games = all_games.set_index('player_id')
all_players = pd.read_csv('nfl-football-player-stats/players_1995.csv',index_col='player_id')
gamesDef = pd.read_csv('defData12-17.csv',index_col='Tm')
gamesDef = gamesDef.drop(columns=["Rk","Time","LTime"])

In [3]:
# weird situation: player id 1890 (LeGarrette Blount) was on the Steelers in the 11th game of 2014, was 
# released from the Steelers and played for the Patriots in their 11th game of 2014. Since he didn't touch the ball
# with the Steelers, deleted that row b/c it causes errors later on
# all_games[(all_games['year'] == 2014) & (all_games['player_id'] == 1890)]

In [4]:
# dictionary that has fantasy value for each nfl stat
half_ppr = {
    'rushing_yards': 0.1,
    'rushing_touchdowns': 6,
    'receiving_receptions': 0.5,
    'receiving_yards': 0.1,
    'receiving_touchdowns': 6
}

In [5]:
# subset RB position by choosing only RBs with at least 5 rushing attempts
# or receiving targets in a given nfl week.
subset_position = {
    'RB': [['rushing_attempts', 'receiving_targets'], 5]
}

In [6]:
def get_players_thatweek(all_games, all_players, position, year, gamenumber, subset_position):
    # return pandas df with player_id as index and column 'name' as player name
    ids = all_players[all_players.position == position].index
    games = all_games.loc[ids]
    stats = subset_position[position][0]
    sum_threshold = subset_position[position][1]
    # only take RBs with 5 rush attempts or receiving targets (>= sum_threshold)
    worth_predicting = games[(games.year == year) & 
                             (games.game_number == gamenumber) &
                             (np.sum(games[stats], axis=1) >= sum_threshold)]
    ids = worth_predicting.index
    for_df = all_players.loc[ids, 'name']
    return pd.DataFrame(data=for_df)

In [7]:
# example usage
a = get_players_thatweek(all_games, all_players,'RB',2014,11,subset_position)
a.head()

Unnamed: 0_level_0,name
player_id,Unnamed: 1_level_1
10839,Steven Jackson
10586,Carlos Hyde
1457,Joique Bell
23980,Andre Williams
17587,Bernard Pierce


In [8]:
def get_def_data(gamesDef, opps, gameNumber, year):
    prevSeasonDef = gamesDef[gamesDef['Year']==(year-1)]
    currentSeasonDef = gamesDef[(gamesDef['Year']==year) & (gamesDef['Game']<=(gameNumber-1))]
    df = pd.concat((prevSeasonDef, currentSeasonDef))
    df.sort_values(by=['Year', 'Game'], axis=0, ascending=False, inplace=True)
    df = df.loc[opps]
    grouped = df.groupby('Tm', sort=False)
    last5_DYP = grouped.nth(list(range(0,5))).groupby('Tm', sort=False).mean()['DY/P']
    last5_TO = grouped.nth(list(range(0,5))).groupby('Tm', sort=False).mean()['TO']
    DYP = []
    TO = []
    for i in opps:
        DYP.append(last5_DYP[i])
        TO.append(last5_TO[i])
    return DYP, TO

In [9]:
def get_features_response(players, all_games, year, gameNumber, points_dict, def_data):
    games = all_games.loc[players.index]
    
    # compute fpts for each row
    games['fpts'] = games['game_number']*0
    for stat, value in zip(points_dict.keys(), points_dict.values()):
        games['fpts'] = games['fpts'] + games[stat]*value

    prev_years = games[(games.year < year) &
                       (games.game_number <= 16)]
    current_year = games[(games.year == year) &
                         (games.game_number < gameNumber)]
    next_game = games[(games.year == year) &
                     (games.game_number == gameNumber)]
    opps = next_game['opponent']
    #get oppponents' defense stats
    last5_DYP, last5_TO = get_def_data(def_data, opps, gameNumber, year)
    
    df = pd.concat((prev_years, current_year))
    df.sort_values(by=['player_id', 'year', 'game_number'], axis=0,
                   ascending=False, inplace=True)
    # group dataframe by index
    df = df.groupby(df.index)
    
    num_prev = df.fpts.agg('count').rename('num_prev')
    last = df.nth([0]).groupby('player_id').agg('mean').fpts.rename('last')
    next_3 = df.nth(list(range(1,4))).groupby('player_id').agg('mean').fpts.rename('next_3')
    next_15 = df.nth(list(range(4,19))).groupby('player_id').agg('mean').fpts.rename('next_15')
    to_debut = df.nth(list(range(19,300))).groupby('player_id').agg('mean').fpts.rename('to_debut')
    
    # get response variable, the next game fantasy points 
    resp = next_game['fpts']

    for_return = players.join([num_prev,last, next_3, next_15, to_debut], how='left')
    for_return.insert(6, "last5 DY/P", last5_DYP)
    for_return.insert(7, "last5 TO", last5_TO)
    for_return.insert(8, "resp", resp.values)
    return for_return

In [34]:
# example usage
players = get_players_thatweek(all_games, all_players,'RB',2014,11, subset_position)
get_features_response(players, all_games, 2014, 11, half_ppr, gamesDef)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0_level_0,name,num_prev,last,next_3,next_15,to_debut,last5 DY/P,last5 TO,resp
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10839,Steven Jackson,153,5.7,10.5,10.373333,14.615672,4.67384,2.0,12.1
10586,Carlos Hyde,10,3.5,5.2,4.833333,,5.55198,2.2,7.6
1457,Joique Bell,49,13.0,9.966667,11.48,6.853333,5.68434,1.5,5.1
23980,Andre Williams,10,1.1,9.0,6.566667,,5.39738,1.6,11.2
17587,Bernard Pierce,39,3.1,5.833333,4.113333,4.86,5.78462,1.8,3.9
8152,Frank Gore,142,12.4,7.7,10.186667,14.371545,5.55198,2.2,3.6
13751,Marshawn Lynch,114,13.0,25.5,15.686667,13.013684,4.84612,1.2,9.7
19486,Bishop Sankey,10,11.0,7.5,5.5,,5.36146,3.25,6.1
2556,Bryce Brown,35,10.6,7.2,3.246667,5.78125,5.60266,4.0,2.0
16086,DeMarco Murray,47,16.1,18.266667,19.793333,12.253571,6.49304,2.2,15.3


In [35]:
POSITION = 'RB'
START_YEAR = 2013
YEAR = 2016
NFL_WEEK = 5
# features 
FEATURES = ['num_prev', 'last', 'next_3', 'next_15', 'to_debut','last5 DY/P','last5 TO']
# response
RESPONSE = ['resp']

# append features and response each week to these lists
feature_list = []
response_list = []

for year in range(START_YEAR, YEAR+1):   
    
    # if current year dont go past nfl week
    if year == YEAR:
        week_limit = NFL_WEEK-1
    else: # if previous year don't go past regular season (nfl week 16)
        week_limit = 16

    for week in range(1,week_limit+1):
        #print("week: ",week," year: ",year)
        players = get_players_thatweek(all_games, all_players, POSITION, year, week, subset_position)
        train = get_features_response(players, all_games, year, week, half_ppr, gamesDef)
        feature = train[FEATURES]
        response = train[RESPONSE]
        feature_list.append(feature)
        response_list.append(response)

# At the end concatenate feature and response lists 
#  into train_x and train_y dataframes
train_x = pd.concat(feature_list)
train_y = pd.concat(response_list)

# fill NaN in train_x with zeros
train_x = train_x.fillna(0)

# Get test_x and test_y
players = get_players_thatweek(all_games, all_players, POSITION, YEAR, NFL_WEEK, subset_position)
test = get_features_response(players, all_games, YEAR, NFL_WEEK, half_ppr, gamesDef)
test_x = test[FEATURES]
test_y = test[RESPONSE]

# fill NaN in test_x with zeros
test_x = test_x.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


In [36]:
# Train a linear regression model to predict rest of season ppg for RBs
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr = lr.fit(train_x, train_y)
print('Intercept:', lr.intercept_)
print(FEATURES)
print(lr.coef_)
print('R^2: ', lr.score(test_x, test_y))

Intercept: [1.34042061]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut', 'last5 DY/P', 'last5 TO']
[[-0.01080409  0.1740959   0.21513372  0.1582381   0.06468097  0.73462786
   0.00882845]]
R^2:  0.18890860570007872


In [37]:
#Using statsmodels
import statsmodels.api as sm 
X_train = sm.add_constant(train_x)
# Fit OLS model 
model = sm.OLS(train_y, X_train).fit() 
# Print model summary 
model.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,84.17
Date:,"Wed, 27 Nov 2019",Prob (F-statistic):,2.22e-112
Time:,00:28:22,Log-Likelihood:,-10237.0
No. Observations:,3106,AIC:,20490.0
Df Residuals:,3098,BIC:,20540.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.3404,1.195,1.122,0.262,-1.003,3.683
num_prev,-0.0108,0.004,-2.415,0.016,-0.020,-0.002
last,0.1741,0.018,9.911,0.000,0.140,0.209
next_3,0.2151,0.026,8.381,0.000,0.165,0.265
next_15,0.1582,0.034,4.592,0.000,0.091,0.226
to_debut,0.0647,0.035,1.852,0.064,-0.004,0.133
last5 DY/P,0.7346,0.200,3.672,0.000,0.342,1.127
last5 TO,0.0088,0.201,0.044,0.965,-0.385,0.403

0,1,2,3
Omnibus:,569.378,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1075.388
Skew:,1.122,Prob(JB):,3.0399999999999996e-234
Kurtosis:,4.809,Cond. No.,652.0


In [38]:
X_test = sm.add_constant(test_x)
preds = model.predict(X_test) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

MSE:  49.502233400558026


In [39]:
# worse than before!
# trying it without TO
X_train2 = sm.add_constant(train_x.drop(columns=['last5 TO']))
model2 = sm.OLS(train_y, X_train2).fit() 
model2.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,98.22
Date:,"Wed, 27 Nov 2019",Prob (F-statistic):,2.15e-113
Time:,00:28:23,Log-Likelihood:,-10237.0
No. Observations:,3106,AIC:,20490.0
Df Residuals:,3099,BIC:,20530.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.3594,1.114,1.221,0.222,-0.824,3.543
num_prev,-0.0108,0.004,-2.418,0.016,-0.020,-0.002
last,0.1741,0.018,9.913,0.000,0.140,0.209
next_3,0.2151,0.026,8.383,0.000,0.165,0.265
next_15,0.1582,0.034,4.592,0.000,0.091,0.226
to_debut,0.0647,0.035,1.854,0.064,-0.004,0.133
last5 DY/P,0.7344,0.200,3.673,0.000,0.342,1.126

0,1,2,3
Omnibus:,569.379,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1075.411
Skew:,1.122,Prob(JB):,3e-234
Kurtosis:,4.809,Cond. No.,608.0


In [40]:
X_test2 = sm.add_constant(test_x.drop(columns=['last5 TO']))
preds2 = model2.predict(X_test2) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds2.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds2))

MSE:  49.513578585021804


In [41]:
#normalizing features
from sklearn.preprocessing import StandardScaler
X_train3 = sm.add_constant(StandardScaler().fit_transform(train_x.drop(columns=['last5 TO'])))
model3 = sm.OLS(train_y, X_train3).fit() 
model3.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.16
Model:,OLS,Adj. R-squared:,0.158
Method:,Least Squares,F-statistic:,98.22
Date:,"Wed, 27 Nov 2019",Prob (F-statistic):,2.15e-113
Time:,00:28:23,Log-Likelihood:,-10237.0
No. Observations:,3106,AIC:,20490.0
Df Residuals:,3099,BIC:,20530.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.9730,0.117,84.979,0.000,9.743,10.203
x1,-0.4097,0.169,-2.418,0.016,-0.742,-0.077
x2,1.3228,0.133,9.913,0.000,1.061,1.584
x3,1.2730,0.152,8.383,0.000,0.975,1.571
x4,0.8257,0.180,4.592,0.000,0.473,1.178
x5,0.3687,0.199,1.854,0.064,-0.021,0.759
x6,0.4312,0.117,3.673,0.000,0.201,0.661

0,1,2,3
Omnibus:,569.379,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1075.411
Skew:,1.122,Prob(JB):,3e-234
Kurtosis:,4.809,Cond. No.,3.53


In [42]:
X_test3 = sm.add_constant(StandardScaler().fit_transform(test_x.drop(columns=['last5 TO'])))
preds3 = model3.predict(X_test3) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds3[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds3))

MSE:  49.29470020205491


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [46]:
# Without defense data?
X_train4 = sm.add_constant(train_x.drop(columns=['last5 TO','last5 DY/P']))
model4 = sm.OLS(train_y, X_train4).fit() 
model4.summary()

0,1,2,3
Dep. Variable:,resp,R-squared:,0.156
Model:,OLS,Adj. R-squared:,0.155
Method:,Least Squares,F-statistic:,114.7
Date:,"Wed, 27 Nov 2019",Prob (F-statistic):,1.52e-111
Time:,00:29:56,Log-Likelihood:,-10244.0
No. Observations:,3106,AIC:,20500.0
Df Residuals:,3100,BIC:,20540.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.3431,0.253,21.123,0.000,4.847,5.839
num_prev,-0.0109,0.004,-2.437,0.015,-0.020,-0.002
last,0.1732,0.018,9.844,0.000,0.139,0.208
next_3,0.2137,0.026,8.309,0.000,0.163,0.264
next_15,0.1608,0.035,4.658,0.000,0.093,0.228
to_debut,0.0648,0.035,1.853,0.064,-0.004,0.133

0,1,2,3
Omnibus:,569.293,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1072.689
Skew:,1.123,Prob(JB):,1.17e-233
Kurtosis:,4.8,Cond. No.,136.0


In [47]:
X_test4 = sm.add_constant(test_x.drop(columns=['last5 TO','last5 DY/P']))
preds4 = model4.predict(X_test4) 
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds4.values[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds4))

MSE:  48.79041319742892


Using sklearn

In [73]:
# including last5 DY/P and scaled features
lr = LinearRegression()
lr.fit(X_train3, train_y)
print('Intercept:', lr.intercept_)
print(FEATURES)
print(lr.coef_)
print('R^2: ', lr.score(X_test3, test_y))
preds = lr.predict(X_test3)
print("MSE: ",sum((preds - test_y.values)**2)/len(preds))

Intercept: [9.97301996]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut', 'last5 DY/P', 'last5 TO']
[[ 0.         -0.40965582  1.32279173  1.27300699  0.82570868  0.36871887
   0.43116449]]
R^2:  0.19230902583820453
MSE:  [49.2947002]


In [72]:
# without defense data
lr = LinearRegression()
lr.fit(X_train4, train_y)
print('Intercept:', lr.intercept_)
print(FEATURES)
print(lr.coef_)
print('R^2: ', lr.score(X_test4, test_y))
preds = lr.predict(X_test4)
print("MSE: ",sum((preds - test_y.values)**2)/len(preds))

Intercept: [5.34310396]
['num_prev', 'last', 'next_3', 'next_15', 'to_debut', 'last5 DY/P', 'last5 TO']
[[ 0.         -0.01091709  0.17321945  0.21365332  0.16079936  0.06480533]]
R^2:  0.20057174090400287
MSE:  [48.7904132]


Lasso

In [71]:
from sklearn.linear_model import Lasso
# including last5 DY/P and scaled features
las_model = Lasso()
las_model.fit(X_train3, train_y)
print('R^2: ', las_model.score(X_test3, test_y))
preds = las_model.predict(X_test3)
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

R^2:  0.15103945151350318
MSE:  51.81344977198023


In [70]:
#without defense data
las_model = Lasso()
las_model.fit(X_train4, train_y)
print('R^2: ', las_model.score(X_test4, test_y))
preds = las_model.predict(X_test4)
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

R^2:  0.19216288202291143
MSE:  49.30361959795226


Random forest?

In [84]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=400)
rf_model.fit(X_train3, train_y)
preds = rf_model.predict(X_test3)
print('R^2: ', rf_model.score(X_test3, test_y))
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

  This is separate from the ipykernel package so we can avoid doing imports until


R^2:  0.19071478629844218
MSE:  49.39199924671046


In [89]:
rf_model = RandomForestRegressor(n_estimators=400)
rf_model.fit(X_train4, train_y)
preds = rf_model.predict(X_test4)
print('R^2: ', rf_model.score(X_test4, test_y))
#MSE
sse = 0
for i in range(len(preds)):
    sse += (preds[i] - test_y.values[i][0])**2
print("MSE: ", sse/len(preds))

  


R^2:  0.15700236412482527
MSE:  51.44952346983079
