# XGBoost: First Model and Feature Importance

In this notebook, we'll build the first of two baseline models: an XGBoost model for each player trained on 2010 and 2011, with 2012 as the test set.

We'll then construct Shapley values to get a more intuitive sense of the most important features. Because of the high dimensionality of the data, we'll want our model to be very conservative.

Note that I'm running the xgboost package on SageMaker because I had issues installing it locally; most users should be able to run this section locally instead.

Use kernel: conda_python3

In [None]:
!conda install -y -c conda-forge xgboost

Solving environment: - 

In [None]:
!conda install -y -c conda-forge shap

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

# Apply the 'ggplot' matplotlib style to every figure drawn in this notebook.
plt.style.use('ggplot')

## Tom Brady

In [None]:
brady = pd.read_csv('../data/data_final/final/features_raw/brady.csv')

In [None]:
split_date = dt.date(2012,5,1)
brady['date'] = pd.to_datetime(brady['date'])

train, test = brady.loc[brady['date']<split_date, brady.columns!='date'], brady.loc[brady['date']>split_date, brady.columns!='date']
X_train, y_train = train.iloc[:,1:], train.iloc[:,0]
X_test, y_test = test.iloc[:,1:], test.iloc[:,0]
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 3, alpha = 10, n_estimators = 30)

In [None]:
xg_reg.fit(X_train,y_train)

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
print("RMSE: {:.2f}\n MAE: {:.2f}".format(rmse, mae))

In [None]:
# Overlay actual and predicted fantasy points, one x-position per test game.
plt.rcParams['figure.figsize'] = (10, 5)
fig, ax = plt.subplots()
ax.plot(y_test.values, alpha=0.8, label='actual')
ax.plot(preds, alpha=0.8, label='predicted')
ax.legend()
ax.set_xlabel('Week (2012)')
ax.set_ylabel('Fantasy Points')
ax.set_title('Tom Brady Fantasy Points 2012, Predicted vs Actual')
# Render explicitly so the cell shows the figure only, not the title's Text repr.
plt.show()

In [None]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.25,'learning_rate': 0.4,
                'max_depth': 4, 'alpha': 10}

cv_results_rmse = xgb.cv(dtrain=dtrain, params=params, nfold=5,
                    num_boost_round=100,early_stopping_rounds=20, metrics="rmse", as_pandas=True, seed=123)
cv_results_mae = xgb.cv(dtrain=dtrain, params=params, nfold=5,
                    num_boost_round=100,early_stopping_rounds=20, metrics="mae", as_pandas=True, seed=123)

In [None]:
print(cv_results_rmse.tail(1))
print(cv_results_mae.tail(1))

In [None]:
xg_reg2 = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

In [None]:
preds2 = xg_reg2.predict(dtest)
rmse = np.sqrt(mean_squared_error(y_test, preds2))
mae = mean_absolute_error(y_test, preds2)
print("RMSE: {:.2f}\n MAE: {:.2f}".format(rmse, mae))

In [None]:
# Overlay actual and Model 2 predicted fantasy points, one x-position per test game.
plt.rcParams['figure.figsize'] = (10, 5)
fig, ax = plt.subplots()
ax.plot(y_test.values, alpha=0.8, label='actual')
ax.plot(preds2, alpha=0.8, label='predicted')
ax.legend()
ax.set_xlabel('Week (2012)')
ax.set_ylabel('Fantasy Points')
ax.set_title('Tom Brady Fantasy Points 2012, Predicted vs Actual (Model 2)')
# Render explicitly so the cell shows the figure only, not the title's Text repr.
plt.show()

In [None]:
# Global feature importance: SHAP values of the baseline model (xg_reg) on the
# test games, showing at most 15 features.
summary_explainer = shap.TreeExplainer(xg_reg)
summary_shap_values = summary_explainer.shap_values(X_test)
shap.summary_plot(
    summary_shap_values,
    features=X_test,
    feature_names=X_train.columns,
    max_display=15,
)

The similarity of shap values for the defensive stats makes me think PCA would be really good here. It suggests that there may be a good amount of overlap between those features.

In [None]:
# Local explanation for one held-out game: the test row at positional index 7.
shap.initjs()  # set up the JavaScript used by the interactive force plot
explainer = shap.TreeExplainer(xg_reg)
shap_values = explainer.shap_values(X_test)
shap.force_plot(explainer.expected_value, shap_values[7], X_test.iloc[7])

Finally, try iteratively retraining the model after each game to see whether it improves prediction accuracy. Use xg_reg as a baseline.

In [None]:
X_train_iter = X_train
y_train_iter = y_train
preds = []

for game_idx in np.arange(0, len(X_test)):
    xg_reg.fit(X_train_iter, y_train_iter)
    preds.append(xg_reg.predict(X_test)[game_idx])
    X_train_iter = X_train_iter.append(pd.Series(X_test.iloc[game_idx]))
    y_train_iter = y_train_iter.append(pd.Series(y_test.iloc[game_idx]))


In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
print("RMSE: {:.2f}\n MAE: {:.2f}".format(rmse, mae))

In [None]:
# Overlay actual and walk-forward predicted fantasy points, one x-position per test game.
plt.rcParams['figure.figsize'] = (10, 5)
fig, ax = plt.subplots()
ax.plot(y_test.values, alpha=0.8, label='actual')
ax.plot(preds, alpha=0.8, label='predicted')
ax.legend()
ax.set_xlabel('Week (2012)')
ax.set_ylabel('Fantasy Points')
ax.set_title('Tom Brady Fantasy Points 2012, Predicted vs Actual (Model 3 - Iterative Training)')
# Render explicitly so the cell shows the figure only, not the title's Text repr.
plt.show()

## LeSean McCoy

In [None]:
mccoy = pd.read_csv('../data/data_final/final/features_raw/mccoy.csv')
mccoy = mccoy.loc[mccoy['player_played']==1].copy() # we can drop the 'played' flag because we don't want ot predict these.

In [None]:
split_date = dt.date(2012,5,1)
mccoy['date'] = pd.to_datetime(mccoy['date'])

train, test = mccoy.loc[mccoy['date']<split_date, mccoy.columns!='date'], mccoy.loc[mccoy['date']>split_date, mccoy.columns!='date']
X_train, y_train = train.iloc[:,1:], train.iloc[:,0]
X_test, y_test = test.iloc[:,1:], test.iloc[:,0]
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 3, alpha = 10, n_estimators = 30)

In [None]:
xg_reg.fit(X_train,y_train)

In [None]:
preds = xg_reg.predict(X_test)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
print("RMSE: {:.2f}\n MAE: {:.2f}".format(rmse, mae))

In [None]:
preds

In [None]:
# Overlay actual and predicted fantasy points, one x-position per test game.
plt.rcParams['figure.figsize'] = (10, 5)
fig, ax = plt.subplots()
ax.plot(y_test.values, alpha=0.8, label='actual')
ax.plot(preds, alpha=0.8, label='predicted')
ax.legend()
ax.set_xlabel('Week (2012)')
ax.set_ylabel('Fantasy Points')
ax.set_title('LeSean McCoy Fantasy Points 2012, Predicted vs Actual')
# Render explicitly so the cell shows the figure only, not the title's Text repr.
plt.show()

In [None]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.25,'learning_rate': 0.6,
                'max_depth': 2, 'alpha': 50}

cv_results_rmse = xgb.cv(dtrain=dtrain, params=params, nfold=5,
                    num_boost_round=100,early_stopping_rounds=20, metrics="rmse", as_pandas=True, seed=123)
cv_results_mae = xgb.cv(dtrain=dtrain, params=params, nfold=5,
                    num_boost_round=100,early_stopping_rounds=20, metrics="mae", as_pandas=True, seed=123)

In [None]:
print(cv_results_rmse.tail(1))
print(cv_results_mae.tail(1))

In [None]:
xg_reg2 = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)

In [None]:
preds2 = xg_reg2.predict(dtest)
rmse = np.sqrt(mean_squared_error(y_test, preds2))
mae = mean_absolute_error(y_test, preds2)
print("RMSE: {:.2f}\n MAE: {:.2f}".format(rmse, mae))

In [None]:
# Overlay actual and Model 2 predicted fantasy points, one x-position per test game.
plt.rcParams['figure.figsize'] = (10, 5)
fig, ax = plt.subplots()
ax.plot(y_test.values, alpha=0.8, label='actual')
ax.plot(preds2, alpha=0.8, label='predicted')
ax.legend()
ax.set_xlabel('Week (2012)')
ax.set_ylabel('Fantasy Points')
ax.set_title('LeSean McCoy Fantasy Points 2012, Predicted vs Actual (Model 2)')
# Render explicitly so the cell shows the figure only, not the title's Text repr.
plt.show()

In [None]:
# Global feature importance: SHAP values of the native-API model (xg_reg2) on
# the test games, showing at most 15 features.
summary_explainer = shap.TreeExplainer(xg_reg2)
summary_shap_values = summary_explainer.shap_values(X_test)
shap.summary_plot(
    summary_shap_values,
    features=X_test,
    feature_names=X_train.columns,
    max_display=15,
)

In [None]:
# Local explanation for one held-out game: the test row at positional index 3.
shap.initjs()  # set up the JavaScript used by the interactive force plot
explainer = shap.TreeExplainer(xg_reg2)
shap_values = explainer.shap_values(X_test)
shap.force_plot(explainer.expected_value, shap_values[3], X_test.iloc[3])

In [None]:
X_train_iter = X_train
y_train_iter = y_train
preds = []

for game_idx in np.arange(0, len(X_test)):
    xg_reg.fit(X_train_iter, y_train_iter)
    preds.append(xg_reg.predict(X_test)[game_idx])
    X_train_iter = X_train_iter.append(pd.Series(X_test.iloc[game_idx]))
    y_train_iter = y_train_iter.append(pd.Series(y_test.iloc[game_idx]))


In [None]:
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
print("RMSE: {:.2f}\n MAE: {:.2f}".format(rmse, mae))

In [None]:
# Actual vs. walk-forward predicted fantasy points, one x-position per test game.
plt.rcParams['figure.figsize'] = (10, 5)
fig, ax = plt.subplots()
plot1 = ax.plot(y_test.values, label='actual', alpha=0.8)
plot1 = ax.plot(preds, label='predicted', alpha=0.8)
ax.legend()
# Label through the Axes object; it is the current axes, so the result matches plt.xlabel & co.
ax.set_xlabel('Week (2012)')
ax.set_ylabel('Fantasy Points')
ax.set_title('LeSean McCoy Fantasy Points 2012, Predicted vs Actual (Model 3 - Iterative Training)')
plt.show()