# Bayesian optimization

The point of this notebook is to use the [Bayesian optimization package](https://github.com/fmfn/BayesianOptimization) to do an intelligent hyperparameter search for XGB. In this notebook we'll run hyperparameter tuning on the XGB model. I'm following both the documentation on the package github page, along with [this Kaggle tutorial](https://www.kaggle.com/tilii7/bayesian-optimization-of-xgboost-parameters).

In [2]:
import xgboost as xgb
from bayes_opt import BayesianOptimization

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, classification_report

## Data loading

In [3]:
df = pd.read_csv('../data/start_to_finish.csv')  #Pct Diff Columns Only (Gives Highest Accuracy)
#df = pd.read_csv('../data/Final Data/diff-mlb-games.csv')    #Diff columns only
#df = pd.read_csv('../data/Final Data/full-diff-mlb-games.csv')    #All columns

cols = ['team_ops_pct_diff', 'obp_diff', 'team_obp_pct_diff',
       'home_Rank_offset1year', 'away_WHIP_offset1year', 'team_ERA_pct_diff',
       'home_win_diff_bayes', 'home_RD', 'team_bayes_pct_diff',
       'away_win_diff_bayes', 'team_RA_pct_diff', 'team_slg_pct_diff',
       'team_W-L_pct_diff', 'away_ERA_offset1year', 'away_win_pct',
       'away_pitcher_IP_avg_162games', 'team_Rank_pct_diff',
       'home_pitcher_IP_avg_162games', 'away_RD',
       'home_pitcher_WHIP_avg_162games', 'team_WHIP_pct_diff',
       'home_bayes_win', 'home_win_diff', 'pitcher_IP_pct_diff',
       'away_Rank_offset1year', 'home_total_R', 'home_pythag_expect',
       'home_obp', 'team_R_pct_diff', 'home_avg',
       'away_pitcher_WPA_avg_162games', 'home_ops', 'away_pitcher_season_game',
       'avg_diff', 'team_avg_pct_diff', 'home_W-L-pct_offset1year',
       'home_ERA_offset1year', 'away_R_offset1year', 'home_win_pct',
       'away_elo']

train_df = df[df['Y'] <= 2015][cols + ['home_win']] 
test_df = df[df['Y'] > 2015][cols + ['home_win']]

X_train = train_df.drop('home_win', axis=1)
y_train = train_df.home_win

X_test = test_df.drop('home_win', axis=1)
y_test = test_df.home_win

## Loss function

In [4]:
log_file = open('AUC-5fold-XGB.log', 'a')
AUCbest = -1.0
ITERbest = 0

In [5]:
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

In [6]:
def xgb_cv(max_depth, min_child_weight, eta, subsample, colsample_bytree, gamma):
    global AUCbest
    global ITERbest
    
    params = {'max_depth': int(max_depth),
              'min_child_weight': min_child_weight,
              'eta': eta,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'gamma': gamma,
              'seed': 0,
              'nthread': 4,
              'objective': 'binary:logistic',
              'eval_metric': 'auc'}
    
    folds = 5
    cv_score = 0
    
    print("\n Search parameters (%d-fold validation):\n %s" % (folds, params), file=log_file)
    log_file.flush()

    xgbc = xgb.cv(
                    params,
                    dtrain,
                    num_boost_round = 20000,
                    stratified = True,
                    nfold = folds,
                    early_stopping_rounds = 100,
                    metrics = 'auc',
                    show_stdv = True
               )
    
    val_score = xgbc['test-auc-mean'].iloc[-1]
    train_score = xgbc['train-auc-mean'].iloc[-1]
    print('Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' % ( len(xgbc), train_score, val_score, (train_score - val_score), (train_score*2-1),
(val_score*2-1)))
    if val_score > AUCbest:
        AUCbest = val_score
        ITERbest = len(xgbc)

    return (val_score*2) - 1

## Hyperparameter tuning

In [7]:
params = {'max_depth': (3, 20),
          'min_child_weight': (0.001, 10),
          'eta': (0.001, 1.0),
          'subsample': (0.6, 1.0),
          'colsample_bytree': (0.6, 1.0),
          'gamma': (0.001, 10)}

In [8]:
XGB_BO = BayesianOptimization(xgb_cv, params)

In [9]:
XGB_BO.maximize(init_points=2, n_iter=3)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
Stopped after 1 iterations with train-auc = 0.686580 val-auc = 0.634804 ( diff = 0.051777 ) train-gini = 0.373160 val-gini = 0.269607
| [0m 1       [0m | [0m 0.2696  [0m | [0m 0.8728  [0m | [0m 0.8267  [0m | [0m 8.736   [0m | [0m 9.238   [0m | [0m 3.199   [0m | [0m 0.6695  [0m |
Stopped after 3 iterations with train-auc = 0.696259 val-auc = 0.654165 ( diff = 0.042094 ) train-gini = 0.392518 val-gini = 0.308330
| [95m 2       [0m | [95m 0.3083  [0m | [95m 0.9681  [0m | [95m 0.6908  [0m | [95m 6.27    [0m | [95m 6.52    [0m | [95m 7.212   [0m | [95m 0.8479  [0m |
Stopped after 7 iterations with train-auc = 0.899019 val-auc = 0.620986 ( diff = 0.278033 ) train-gini = 0.798039 val-gini = 0.241972
| [0m 3       [0m | [0m 0.242   [0m | [0m 0.616   [0m | [0m 0.4

In [10]:
best_params = {'max_depth': int(XGB_BO.max['params']['max_depth']),
              'min_child_weight': XGB_BO.max['params']['min_child_weight'],
              'eta': XGB_BO.max['params']['eta'],
              'subsample': XGB_BO.max['params']['subsample'],
              'colsample_bytree': XGB_BO.max['params']['colsample_bytree'],
              'gamma': XGB_BO.max['params']['gamma'],
              'seed': 0,
              'nthread': 4,
              'objective': 'binary:logistic',
              'eval_metric': 'auc'}

xgb_best = xgb.train(best_params, dtrain, 10)

In [11]:
best_params

{'max_depth': 3,
 'min_child_weight': 6.292249181210905,
 'eta': 1.0,
 'subsample': 1.0,
 'colsample_bytree': 1.0,
 'gamma': 6.4258711159875235,
 'seed': 0,
 'nthread': 4,
 'objective': 'binary:logistic',
 'eval_metric': 'auc'}

In [12]:
test_preds_proba = xgb_best.predict(dtest)

In [13]:
test_preds = np.round(test_preds_proba, 0)

In [14]:
print(classification_report(y_test, test_preds))

              precision    recall  f1-score   support

         0.0       0.61      0.55      0.58      4656
         1.0       0.63      0.69      0.66      5282

    accuracy                           0.62      9938
   macro avg       0.62      0.62      0.62      9938
weighted avg       0.62      0.62      0.62      9938



In [15]:
accuracy_score(y_test, test_preds)

0.622358623465486