In [23]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
!head wine.data

--2017-06-20 18:08:27--  https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
Resolving archive.ics.uci.edu... 128.195.10.249
Connecting to archive.ics.uci.edu|128.195.10.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10782 (11K) [text/plain]
Saving to: ‘wine.data’


2017-06-20 18:08:28 (41.8 MB/s) - ‘wine.data’ saved [10782/10782]

1,14.23,1.71,2.43,15.6,127,2.8,3.06,.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,.26,1.28,4.38,1.05,3.4,1050
1,13.16,2.36,2.67,18.6,101,2.8,3.24,.3,2.81,5.68,1.03,3.17,1185
1,14.37,1.95,2.5,16.8,113,3.85,3.49,.24,2.18,7.8,.86,3.45,1480
1,13.24,2.59,2.87,21,118,2.8,2.69,.39,1.82,4.32,1.04,2.93,735
1,14.2,1.76,2.45,15.2,112,3.27,3.39,.34,1.97,6.75,1.05,2.85,1450
1,14.39,1.87,2.45,14.6,96,2.5,2.52,.3,1.98,5.25,1.02,3.58,1290
1,14.06,2.15,2.61,17.6,121,2.6,2.51,.31,1.25,5.05,1.06,3.58,1295
1,14.83,1.64,2.17,14,97,2.8,2.98,.29,1.98,5.2,1.08,2.85,1045
1,13.86,1.35,2.27,16,98,2.98,3.15,.22,1.85

In [211]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimizer
from sklearn.gaussian_process.kernels import RBF, Matern
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [115]:
data = pd.read_csv('wine.data', header=None, encoding='utf-8', delimiter=',')
targets = data.iloc[:,0] - 1  # reindex from 0 for XGBoost
features = data.iloc[:,1:]

In [158]:
def cross_validate_xgb(features, targets, params, n_splits=2):
    k_fold = KFold(n_splits=n_splits, shuffle=True, random_state=23)
    param_names = list(params.keys())
    results = []
    for train, test in k_fold.split(features, targets):
        train_data = xgb.DMatrix(data=features.iloc[train].values,
                                 label=targets.iloc[train].values)
        test_data = xgb.DMatrix(data=features.iloc[test].values,
                                label=targets.iloc[test].values)
        linear_tree_booster = xgb.train(
            params={
                'booster': 'gbtree',
                'objective': 'multi:softmax',
                'num_class': 3,
                **params
            },
            dtrain=train_data,
            num_boost_round=25
        )
        pred = linear_tree_booster.predict(test_data)
        results.append(f1_score(test_data.get_label(), pred, average='weighted'))
        
    return np.mean(results)

In [None]:
bo = BayesianOptimizer({
    'max_depth': (2, 10),
    'n_estimators': (50, 150),
    'eta': (.01, .9)
},
    kernel=Matern(nu=2)
)
optimizing = True
scores = []
for run in range(40):
    params = bo.suggest(return_dict=True)
    score = cross_validate_xgb(features, targets, params)
    print(params)
    print(score)
    bo.update(list(params.values()), score)
    scores.append(score)
    
plt.plot(scores)
plt.show()

{'max_depth': 9, 'eta': 0.09947842899817726, 'n_estimators': 83}
0.926496445653


  / np.sqrt(D.sum(2))[:, :, np.newaxis]


{'max_depth': 2, 'eta': 0.7341421004725803, 'n_estimators': 110}
0.926354175043
{'max_depth': 9, 'eta': 0.9, 'n_estimators': 115}
0.932029433911
{'max_depth': 5, 'eta': 0.0578477243650077, 'n_estimators': 147}
0.926795587721
{'max_depth': 9, 'eta': 0.3355233036136794, 'n_estimators': 109}
0.943369726061
{'max_depth': 10, 'eta': 0.0399163218607963, 'n_estimators': 109}
0.921918965723
