Import Libraries & Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.metrics import (auc, classification_report, roc_auc_score, accuracy_score,
                             f1_score, log_loss, roc_curve, confusion_matrix,
                             precision_score, recall_score, plot_confusion_matrix,
                             make_scorer)
from sklearn.metrics import get_scorer

import xgboost as xgb
import statsmodels.formula.api as smf

Test Train Split

XGBoost Training

In [None]:
# Candidate model whose generalisation error is estimated below.
# The class is referenced through the `xgb` alias because the notebook only
# does `import xgboost as xgb`; a bare `XGBClassifier` is not in scope.
model_t = xgb.XGBClassifier(n_estimators=70, max_depth=4, eta=.05, subsample=.9, colsample_bytree=.8)

# Repeated k-fold cross validation: 10 folds repeated 8 times (80 fits) with a
# fixed seed so the fold assignment is reproducible. This only *estimates*
# out-of-sample log loss; it does not change the model.
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=1)
scores = cross_val_score(model_t, X_train, y_train, cv=cv, scoring='neg_log_loss',
                         n_jobs=-1, error_score='raise')

# 'neg_log_loss' returns negated losses (higher is better); flip the sign so the
# report shows the conventional positive log loss.
scores = abs(scores)
print('Mean Log Loss: %.3f (%.3f)' % (scores.mean(), scores.std()))

In [None]:
# Build the classifier. The hyperparameters are gathered into one mapping so the
# full configuration can be read at a glance.
xgb_model = xgb.XGBClassifier(
    **{
        "objective": "binary:logistic",
        "random_state": 42,
        "eta": .04,
        "max_depth": 6,
        "min_child_weight": 3,
        "n_estimators": 100,
        "gamma": .6,
        "reg_lambda": .2,
        "subsample": 1,
        "colsample_bytree": .99,
    }
)

# Fit on (X, Y) while tracking log loss on the evaluation pair; training stops
# once that metric has not improved for 5 consecutive rounds.
# NOTE(review): eval_set uses lowercase `x, y`, which are distinct names from the
# training `X, Y` and are not defined in the visible cells -- confirm they hold
# the intended validation split.
xgb_model.fit(
    X,
    Y,
    eval_set=[(x, y)],
    eval_metric='logloss',
    early_stopping_rounds=5,
)

In [None]:
# Refit a smaller model chosen after the cross-validation runs, trading a little
# performance for less overfitting.
# The class is referenced through the `xgb` alias because the notebook only
# does `import xgboost as xgb`; a bare `XGBClassifier` is not in scope.
model_t = xgb.XGBClassifier(n_estimators=40, max_depth=4, eta=.05, subsample=.8, colsample_bytree=.9)

# Header for the per-round evaluation log printed by fit(); the column order
# matches the eval_set order below (test first, then train).
print("                 Test Set                        Train Set")

# Evaluating on both splits each round exposes overfitting: train loss keeps
# falling while test loss stalls or rises.
model_t.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)])

Hyperparameter Tuning

In [None]:
# Hyperparameter grid for the search. Every entry currently holds a single
# value, so the "grid" is one combination; add more values to a list to widen
# the search.
param_grid = {'eta': [.0035],
              'objective': ['binary:logistic'],
              'max_depth': [6],
              'min_child_weight': [1],
              'n_estimators': [8],
              'gamma': [.44],
              'reg_lambda': [.55],
              'subsample': [1],
              'colsample_bytree': [.5]}

# Use scikit-learn's built-in negated log-loss scorer instead of hand-building
# one with make_scorer(..., needs_proba=True): it scores predicted probabilities
# with the sign flipped so that "greater is better", and it does not depend on
# the `needs_proba` argument, which newer scikit-learn releases deprecate.
LogLoss = get_scorer('neg_log_loss')

# 3-fold grid search over clones of xgb_model, run on 4 worker processes.
optimal_params = GridSearchCV(xgb_model, param_grid, n_jobs=4, scoring=LogLoss, verbose=0, cv=3)

optimal_params.fit(X, Y)

Model Metrics

In [None]:
# Classification metrics for the predictions stored on `test`.
# A notebook cell only displays its final expression, so the scores are
# collected into one mapping; evaluating five bare calls in a row would show
# only the last one and silently discard the rest.
model_metrics = {
    'accuracy': accuracy_score(test['team1_win'], test['prediction']),
    'precision': precision_score(test['team1_win'], test['prediction']),
    'recall': recall_score(test['team1_win'], test['prediction']),
    'f1': f1_score(test['team1_win'], test['prediction']),
    # Log loss is computed from the predicted probabilities, not the hard labels;
    # `labels` is passed explicitly so it works even if a class is absent.
    'log_loss': log_loss(test['team1_win'].values, test['prob'].values, labels=[0, 1]),
}
model_metrics

Feature Importance

In [None]:
# Set the figure size (and seaborn theme) BEFORE plotting: rcParams only apply
# to figures created afterwards, so setting them after .plot() leaves this
# chart at the default size on a fresh kernel.
sns.set(rc={'figure.figsize': (14, 14)})

# Pair each importance score from the fitted model with its column name.
feat_importances = pd.Series(xgb_model.feature_importances_, index=X.columns)

# Horizontal bar chart of the (up to) 54 highest-scoring features.
feat_importances.nlargest(54).plot(kind='barh')
plt.title("Feature Importance of XGBoost Model", size=14);