In [84]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [85]:
wine_df = pd.read_csv('../data/wine/winequality-red.csv', sep=";")
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [86]:
wine_df['ID'] = np.arange(1, len(wine_df) + 1)
wine_df.quality = wine_df.quality.apply(lambda x: 1 if x >=6 else 0)
wine_df.quality.value_counts()

quality
1    855
0    744
Name: count, dtype: int64

In [87]:
y = wine_df.quality.copy()
features = [x for x in wine_df.columns if x not in ['quality','ID']]
X = wine_df[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24)

In [88]:
base_model = XGBClassifier(objective = 'binary:logistic',
                               eval_metric='auc',
                               booster='gbtree',                               
                               nthread=-1,
                               n_estimators=500,
                               learning_rate=0.1,
                               gamma = 0,
                               max_depth=5,
                               min_child_weight=1,
                               max_delta_step=1,
                               subsample=0.8,
                               colsample_bytree=0.8,
                               seed=24
                               )
print(base_model.get_params()['n_estimators'])

500


In [89]:
def model_cv(bst,X,y,nfold = 5, early_stopping_rounds = 30):
    params = bst.get_xgb_params()
    full_params = bst.get_params()
    extra_params = {k : full_params[k] for k in full_params if k not in params}
    # print(params)
    # print('-------------------')
    # print(bst.get_params())
    print(extra_params)
    dtrain = xgb.DMatrix(X,label=y)
    cv_result = xgb.cv(params,
                       dtrain,
                       num_boost_round= bst.get_params()['n_estimators'],
                       nfold=nfold,
                       early_stopping_rounds=early_stopping_rounds,)
    print('best round of iteration/n_estimator:', cv_result.shape[0])
    print('base_score:',cv_result.iloc[len(cv_result)-1,:])
    return cv_result

In [90]:
def model_fit(bst,X_train,y_train,X_test,y_test,cv_result):
    bst.set_params(n_estimators=cv_result.shape[0])
    bst.fit(X_train,y_train)
    train_pred_prob = bst.predict_proba(X_train)
    train_score = metrics.roc_auc_score(y_train,train_pred_prob[:,1])
    print(f'train score: {train_score}')
    test_pred_prob = bst.predict_proba(X_test)
    test_score = metrics.roc_auc_score(y_test,test_pred_prob[:,1])
    print(f'test score: {test_score}')

In [91]:
cv_results = model_cv(base_model,X_train,y_train)
model_fit(base_model,X_train,y_train,X_test,y_test,cv_results)

{'callbacks': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'feature_types': None, 'importance_type': None, 'missing': nan, 'n_estimators': 500}
best round of iteration/n_estimator: 157
base_score: train-auc-mean    0.999919
train-auc-std     0.000042
test-auc-mean     0.880227
test-auc-std      0.036848
Name: 156, dtype: float64
train score: 0.9994967102032799
test score: 0.828768472906404


In [92]:
print(cv_results)

     train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
0          0.834198       0.007410       0.752015      0.031718
1          0.871699       0.005346       0.795939      0.030709
2          0.879701       0.006901       0.801498      0.035865
3          0.890815       0.005964       0.810467      0.037818
4          0.894213       0.004330       0.817049      0.040078
..              ...            ...            ...           ...
152        0.999886       0.000048       0.880019      0.036558
153        0.999898       0.000049       0.879925      0.036469
154        0.999904       0.000053       0.879992      0.036603
155        0.999911       0.000052       0.879843      0.036558
156        0.999919       0.000042       0.880227      0.036848

[157 rows x 4 columns]


In [93]:
param1 = {
    'max_depth': range(3,10,2),
    'min_child_weight': range(1,6,2),
}
# custom_scorer = metrics.make_scorer(metrics.roc_auc_score, needs_proba=True)
bst2 = XGBClassifier(objective = 'binary:logistic',
                               eval_metric='auc',
                               booster='gbtree',                               
                               nthread=-1,
                               n_estimators=138,
                               learning_rate=0.1,
                               gamma = 0,
                               max_depth=5,
                               min_child_weight=1,
                               max_delta_step=1,
                               subsample=0.8,
                               colsample_bytree=0.8,
                               seed=24
                               )

grid_search1 = GridSearchCV(bst2,param1,cv=5,scoring='roc_auc')
grid_search1.fit(X_train,y_train)

  _data = np.array(data, dtype=dtype, copy=copy,


In [94]:
print(f'Best params: {grid_search1.best_params_} and best aoc score: {grid_search1.best_score_}')

Best params: {'max_depth': 9, 'min_child_weight': 1} and best aoc score: 0.8880023479980228


In [95]:
param2 = {
    'max_depth': [7,8,9],
    'min_child_weight': [1,2],
}
grid_search2 = GridSearchCV(bst2,param2,cv=5,scoring='roc_auc')
grid_search2.fit(X_train,y_train)
print(f'Best params: {grid_search2.best_params_} and best aoc score: {grid_search2.best_score_}')

Best params: {'max_depth': 9, 'min_child_weight': 1} and best aoc score: 0.8880023479980228


In [107]:
param3 = {
    'gamma': [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}
bst3 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=138,
                     learning_rate=0.1,
                     max_depth = 9,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=24)
grid_search3 = GridSearchCV(bst3,param3,cv=5,scoring='roc_auc')
grid_search3.fit(X_train,y_train)
print(f'Best params: {grid_search3.best_params_} and best aoc score: {grid_search3.best_score_}')

Best params: {'gamma': 0.4} and best aoc score: 0.8901669344208271


In [127]:
model2 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=100,
                     learning_rate=0.1,
                     gamma = 0.4,
                     max_depth = 9,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=24)
cv_results = model_cv(model2,X_train,y_train)
model_fit(model2,X_train,y_train,X_test,y_test,cv_results)

{'callbacks': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'feature_types': None, 'importance_type': None, 'missing': nan, 'n_estimators': 100}
best round of iteration/n_estimator: 100
base_score: train-auc-mean    0.999996
train-auc-std     0.000004
test-auc-mean     0.884944
test-auc-std      0.031054
Name: 99, dtype: float64
train score: 1.0
test score: 0.8401576354679802


In [110]:
param4 = {
    'subsample' : [ x/10 for x in range(6,11)],
    'colsample_bytree': [x/10 for x in range(6,11)]
}
bst4 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=138,
                     learning_rate=0.1,
                     max_depth = 9,
                     gamma = 0.4,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=24)
grid_search4 = GridSearchCV(bst4,param4,cv=5,scoring='roc_auc')
grid_search4.fit(X_train,y_train)
print(f'Best params: {grid_search4.best_params_} and best aoc score: {grid_search4.best_score_}')

Best params: {'colsample_bytree': 0.8, 'subsample': 0.8} and best aoc score: 0.8901669344208271


In [121]:
param5 = {
    'subsample' : [ x/100 for x in range(75,90,5)],
    'colsample_bytree': [x/100 for x in range(75,90,5)]
}
bst5 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=153,
                     learning_rate=0.1,
                     gamma = 0.4,
                     max_depth = 9,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=24)
grid_search4 = GridSearchCV(bst4,param4,cv=5,scoring='roc_auc')
grid_search4.fit(X_train,y_train)
print(f'Best params: {grid_search4.best_params_} and best aoc score: {grid_search4.best_score_}')

Best params: {'colsample_bytree': 0.8, 'subsample': 0.8} and best aoc score: 0.8901669344208271


In [123]:
param6 = {
    'reg_alpha' :[0, 1e-5, 1e-2, 0.1, 1]
}

bst6 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=153,
                     learning_rate=0.1,
                     gamma = 0.4,
                     max_depth = 9,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=24)
grid_search4 = GridSearchCV(bst6,param6,cv=5,scoring='roc_auc')
grid_search4.fit(X_train,y_train)
print(f'Best params: {grid_search4.best_params_} and best aoc score: {grid_search4.best_score_}')

Best params: {'reg_alpha': 0} and best aoc score: 0.889812675070028


In [124]:
param7 = {
    'reg_alpha' :[0, 1e-08, 1e-07, 1e-06]
}
bst7 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=153,
                     learning_rate=0.1,
                     gamma = 0.4,
                     max_depth = 9,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=24)
grid_search4 = GridSearchCV(bst7,param7,cv=5,scoring='roc_auc')
grid_search4.fit(X_train,y_train)
print(f'Best params: {grid_search4.best_params_} and best aoc score: {grid_search4.best_score_}')

Best params: {'reg_alpha': 0} and best aoc score: 0.889812675070028


In [135]:
model3 = XGBClassifier(objective = 'binary:logistic',
                     eval_metric='auc',
                     booster='gbtree', 
                     nthread=-1,
                     n_estimators=1000,
                     learning_rate=0.08,
                     gamma = 0.4,
                     max_depth = 9,
                     min_child_weight=1,
                     max_delta_step=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     reg_alpha=0.0,
                     seed=24)
cv_results = model_cv(model3,X_train,y_train)
model_fit(model2,X_train,y_train,X_test,y_test,cv_results)

{'callbacks': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'feature_types': None, 'importance_type': None, 'missing': nan, 'n_estimators': 1000}
best round of iteration/n_estimator: 123
base_score: train-auc-mean    0.999993
train-auc-std     0.000007
test-auc-mean     0.886947
test-auc-std      0.032249
Name: 122, dtype: float64
train score: 1.0
test score: 0.841576354679803


In [103]:
base_cv_results = model_cv(base_model,X_train,y_train)
model_fit(base_model,X_train,y_train,X_test,y_test,base_cv_results)

{'callbacks': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'feature_types': None, 'importance_type': None, 'missing': nan, 'n_estimators': 157}
best round of iteration/n_estimator: 157
base_score: train-auc-mean    0.999919
train-auc-std     0.000042
test-auc-mean     0.880227
test-auc-std      0.036848
Name: 156, dtype: float64
train score: 0.9994967102032799
test score: 0.828768472906404


In [138]:
from bayes_opt import BayesianOptimization

In [158]:
dtrain = xgb.DMatrix(X_train,label=y_train)
def xbg_optimize(learning_rate,num_boost_round,min_child_weight,colsample_bytree,max_depth,subsample,gamma,alpha):
    params = {}
    params['learning_rate'] = float(learning_rate)
    num_boost_round = int(num_boost_round)
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = float(colsample_bytree)
    params['gamma'] = float(gamma)
    params['subsample'] = float(subsample)
    params['max_depth'] = int(max_depth)
    params['alpha'] = float(alpha)
    params['objective'] = 'binary:logistic'
    
    cv_results = xgb.cv(params,dtrain,nfold=5,seed=24,num_boost_round=num_boost_round, early_stopping_rounds=30,metrics={'auc'})
    return cv_results['test-auc-mean'].iloc[-1]

In [159]:
param8 = {
    'learning_rate' : (0.08,0.1),
    'num_boost_round' : (50,500),
    'min_child_weight' : (1,10),
    'colsample_bytree' : (0.5,1),
    'max_depth' : (4,10),
    'subsample' : (0.5,1),
    'gamma' : (0,10),
    'alpha' : (0,10),
}
xgb_opt = BayesianOptimization(xbg_optimize,param8)
xgb_opt.maximize(init_points=5, n_iter = 30)

|   iter    |  target   |   alpha   | colsam... |   gamma   | learni... | max_depth | min_ch... | num_bo... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.8216   [39m | [39m8.914    [39m | [39m0.5429   [39m | [39m6.685    [39m | [39m0.0989   [39m | [39m6.913    [39m | [39m5.382    [39m | [39m219.9    [39m | [39m0.5981   [39m |
| [35m2        [39m | [35m0.8358   [39m | [35m2.149    [39m | [35m0.8098   [39m | [35m8.682    [39m | [35m0.08348  [39m | [35m6.954    [39m | [35m6.895    [39m | [35m338.8    [39m | [35m0.824    [39m |
| [39m3        [39m | [39m0.8248   [39m | [39m7.977    [39m | [39m0.6807   [39m | [39m6.907    [39m | [39m0.09465  [39m | [39m7.993    [39m | [39m5.035    [39m | [39m466.1    [39m | [39m0.6168   [39m |
| [39m4        [39m | [39m0.8313   [39m | [39m5.182    [39m | [39m0.5388   [39m | 

In [164]:
print(xgb_opt.max)

{'target': np.float64(0.8838913526719191), 'params': {'alpha': np.float64(0.0), 'colsample_bytree': np.float64(0.5), 'gamma': np.float64(0.0), 'learning_rate': np.float64(0.1), 'max_depth': np.float64(10.0), 'min_child_weight': np.float64(1.0), 'num_boost_round': np.float64(131.95641895514302), 'subsample': np.float64(1.0)}}


In [169]:
bayesian_params = xgb_opt.max['params']
bayesian_params['max_depth'] = int(bayesian_params['max_depth'])
bayesian_params['min_child_weight'] = int(bayesian_params['min_child_weight'])
bayesian_params['num_boost_round'] = int(bayesian_params['num_boost_round'])
print(bayesian_params)

{'alpha': np.float64(0.0), 'colsample_bytree': np.float64(0.5), 'gamma': np.float64(0.0), 'learning_rate': np.float64(0.1), 'max_depth': 10, 'min_child_weight': 1, 'num_boost_round': 131, 'subsample': np.float64(1.0)}


In [166]:
bayesian_bst = xgb.XGBClassifier(
    learning_rate=bayesian_params['learning_rate'],
    min_child_weight=bayesian_params['min_child_weight'],
    colsample_bytree=bayesian_params['colsample_bytree'],
    max_depth=bayesian_params['max_depth'],
    subsample=bayesian_params['subsample'],
    gamma=bayesian_params['gamma'],
    alpha=bayesian_params['alpha'],
    objective='binary:logistic',
    n_estimators=bayesian_params['num_boost_round']  # Use this for XGBClassifier (fit)
)


In [167]:
base_cv_results = model_cv(base_model,X_train,y_train)
model_fit(base_model,X_train,y_train,X_test,y_test,base_cv_results)

{'callbacks': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'feature_types': None, 'importance_type': None, 'missing': nan, 'n_estimators': 157}
best round of iteration/n_estimator: 157
base_score: train-auc-mean    0.999919
train-auc-std     0.000042
test-auc-mean     0.880227
test-auc-std      0.036848
Name: 156, dtype: float64
train score: 0.9994967102032799
test score: 0.828768472906404


In [170]:
base_cv_results = model_cv(bayesian_bst,X_train,y_train)
model_fit(bayesian_bst,X_train,y_train,X_test,y_test,base_cv_results)

{'callbacks': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'feature_types': None, 'importance_type': None, 'missing': nan, 'n_estimators': 131}
best round of iteration/n_estimator: 74
base_score: train-logloss-mean    0.080807
train-logloss-std     0.001318
test-logloss-mean     0.425676
test-logloss-std      0.072083
Name: 73, dtype: float64
train score: 1.0
test score: 0.8381871921182265
