In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/bri-data-hackathon-pa/sample_submission.csv
/kaggle/input/bri-data-hackathon-pa/data_description.csv
/kaggle/input/bri-data-hackathon-pa/train.csv
/kaggle/input/bri-data-hackathon-pa/test.csv


Hello everyone!

**This notebook presents a straightforward code to tune hyperparameter of LGBM, CAT, and XGB with Bayesian Optimization. It is like GridSearchCV and RandomizedSearchCV.**

GridSearchCV searches for all combinations of parameters, and it could take a very long time. Not very efficient. RandomizedSearchCV searches the combination randomly. Somehow the algorithm can skip the optimal parameter, especially if the search grid is enormous. Bayesian Optimization is a smarter method to tune the hyperparameter. I won't discuss the theory behind it in this notebook as it is straightforward.

If you have any questions regarding the code, please comment below. I will update the notebook accordingly.

**Please do upvote the notebook if this notebook helps you as it will be a benchmark for me to do more work in the future. Thank you :)**

**Note: I do not do the feature engineering here, so the result may sub-optimal**

In [2]:
# Read train and test set
train = pd.read_csv("/kaggle/input/bri-data-hackathon-pa/train.csv")
test = pd.read_csv("/kaggle/input/bri-data-hackathon-pa/test.csv")

In [3]:
# Split train set into dependent variables and independent variable
y = train['Best Performance']
X = train.drop('Best Performance', axis=1)

In [4]:
# Convert to dummy variables
X = pd.get_dummies(X)
test = pd.get_dummies(test)

In [5]:
# Extract the common features between train and test set and use it to filter the train and test set
common = list(set(X.columns).intersection(set(test.columns)))
X = X[common]
test = test[common]

### XGBoost - Cross Validation Score

In [7]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
# fit model to training data
model = xgb.XGBRFClassifier(n_estimators=1000, random_state=1245)
# cross validation score
score = cross_val_score(model, X, y, cv=5, scoring="roc_auc", n_jobs=-1)
print("XGB ROC-AUC Mean Score: ", np.mean(score))

XGB ROC-AUC Mean Score:  0.5709700832656107


### Tuning with Bayesian Optimization

Now we will use Bayesian Optimization to tune the hyperparameter. Our goal is to maximize AUC.

You can also adjust what parameter you want to tune and the range of hyperparameter.

In [8]:
from bayes_opt import BayesianOptimization
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Convert to special data format
# https://xgboost.readthedocs.io/en/latest/python/python_intro.html
dtrain = xgb.DMatrix(X, y, feature_names=X.columns.values)

def hyp_xgb(max_depth, subsample, colsample_bytree,min_child_weight, gamma, learning_rate):
    params = {
    'objective': 'binary:logistic',
    'eval_metric':'auc',
    'nthread':-1
     }
    
    params['max_depth'] = int(round(max_depth))
    params['subsample'] = max(min(subsample, 1), 0)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['min_child_weight'] = int(min_child_weight)
    params['gamma'] = max(gamma, 0)
    params['learning_rate'] = learning_rate
    scores = xgb.cv(params, dtrain, num_boost_round=500,verbose_eval=False, 
                    early_stopping_rounds=10, nfold=5)
    return scores['test-auc-mean'].iloc[-1]

In [10]:
pds ={
  'min_child_weight':(3, 20),
  'gamma':(0, 10),
  'subsample':(0.5, 1),
  'colsample_bytree':(0.1, 1),
  'max_depth': (2, 15),
  'learning_rate': (0.01, 0.5)
}

In [11]:
optimizer = BayesianOptimization(hyp_xgb, pds, random_state=1)
optimizer.maximize(init_points=4, n_iter=25)

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5665  [0m | [0m 0.4753  [0m | [0m 7.203   [0m | [0m 0.01006 [0m | [0m 5.93    [0m | [0m 5.495   [0m | [0m 0.5462  [0m |
| [95m 2       [0m | [95m 0.5762  [0m | [95m 0.2676  [0m | [95m 3.456   [0m | [95m 0.2044  [0m | [95m 9.005   [0m | [95m 10.13   [0m | [95m 0.8426  [0m |
| [0m 3       [0m | [0m 0.5689  [0m | [0m 0.284   [0m | [0m 8.781   [0m | [0m 0.02342 [0m | [0m 10.72   [0m | [0m 10.09   [0m | [0m 0.7793  [0m |
| [0m 4       [0m | [0m 0.5639  [0m | [0m 0.2263  [0m | [0m 1.981   [0m | [0m 0.4024  [0m | [0m 14.59   [0m | [0m 8.328   [0m | [0m 0.8462  [0m |
| [0m 5       [0m | [0m 0.566   [0m | [0m 0.2908  [0m | [0m 3.604   [0m | [0m 0.4254  [0m | [0m 8.613   [0m | [0m 10.07   [0m | [0m 0

In [12]:
optimizer.max['params']

{'colsample_bytree': 0.8485245380480577,
 'gamma': 0.07362378227392719,
 'learning_rate': 0.02919903835311307,
 'max_depth': 12.501698562782558,
 'min_child_weight': 4.536601691335176,
 'subsample': 0.8782155860634835}

### Instantiate with new hyperparameters

In [15]:
# Copied from above
# Some params need to be an integer
params = {
    'colsample_bytree': 0.8485245380480577,
    'gamma': 0.07362378227392719,
    'learning_rate': 0.02919903835311307,
    'max_depth': 13,
    'min_child_weight': 5,
    'subsample': 0.8782155860634835,
    'objective': 'binary:logistic',
    'eval_metric':'auc',
    'n_jobs':-1
}

In [16]:
xgbr =  xgb.XGBClassifier(**params, random_state=12345, nthread=-1)
xgbr.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8485245380480577,
              eval_metric='auc', gamma=0.07362378227392719, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02919903835311307, max_delta_step=0, max_depth=13,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, nthread=-1, num_parallel_tree=1,
              random_state=12345, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8782155860634835, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [17]:
# Predict the probability using predict_proba
y_pred = xgbr.predict_proba(test)[:,1]

In [18]:
submission = pd.read_csv("../input/bri-data-hackathon-pa/sample_submission.csv")
submission.head()

Unnamed: 0,index,Best Performance
0,0,0.131028
1,1,0.379354
2,2,0.031798
3,3,0.28522
4,4,0.848732


In [19]:
submission['Best Performance'] = y_pred
submission.to_csv("submission.csv", index=False)