In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

import pandas as pd
import numpy as np
from scipy import stats

from xgboost import XGBClassifier

In [15]:
# Two independent encoders: one for the target column `y`, one for the
# feature column `x7`. They are kept as module-level objects so the fitted
# mappings remain available after the data-loading cell has run.
y_encoder, x7_encoder = LabelEncoder(), LabelEncoder()

In [16]:
# Read the training CSV, discard the 'Unnamed: 0' column (presumably a saved
# row index -- confirm against the file) and the 'x12' feature, then keep only
# rows without missing values.
training_data = (
    pd.read_csv('TrainOnMe_orig.csv')
    .drop(columns=['Unnamed: 0', 'x12'])
    .dropna()
)

# Replace the raw `y` and `x7` values with the integer codes produced by their
# respective LabelEncoders (fitted here, in this order: y first, then x7).
training_data = training_data.assign(
    y=lambda frame: y_encoder.fit_transform(frame['y']),
    x7=lambda frame: x7_encoder.fit_transform(frame['x7']),
)

# Target vector and feature matrix used by the cross-validation helper.
y = training_data['y']
X = training_data.drop(columns=['y'])

In [21]:
# --- cross-validation ------------------------------------------------------
# Scoring name passed to cross_val_score inside get_score.
scoring = 'neg_log_loss'

# --- XGBoost settings --------------------------------------------------------
# These two values are only forwarded to XGBClassifier when `default` is False.
objective = 'multi:softmax'
eval_metric = 'mlogloss'  # 'merror' is the noted alternative

# True  -> get_model passes only the tuned hyperparameters to XGBClassifier.
# False -> get_model additionally passes eval_metric, objective and num_class.
default = True

# Slot for the best model; nothing in this cell fills it.
xgb = None

# --- surrogate model -------------------------------------------------------
# Gaussian-process regressor with a scaled Matern (nu=2.5) kernel.
kernel = 1.0 * Matern(length_scale=1.0, nu=2.5)
gaussian_process = GaussianProcessRegressor(kernel=kernel)

In [22]:
def get_score(model):
    """Return the mean 10-fold cross-validation score of ``model``.

    The estimator is evaluated on the module-level ``X`` / ``y`` using the
    module-level ``scoring`` name, with all available cores (``n_jobs=-1``).
    """
    fold_scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring=scoring,
        cv=10,
        n_jobs=-1,
    )
    return fold_scores.mean()

def get_model(n_estimators: int, learning_rate: float, max_depth: int):
    """Build an unfitted ``XGBClassifier`` for the given hyperparameters.

    ``n_estimators`` and ``max_depth`` are coerced with ``int()`` (which
    truncates floats), ``learning_rate`` is passed through unchanged.

    When the module-level flag ``default`` is true only these three settings
    are supplied; otherwise the module-level ``eval_metric`` and ``objective``
    plus ``num_class=3`` are supplied as well.
    """
    hyperparams = dict(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
    )

    if not default:
        # Explicit multi-class configuration on top of the tuned values.
        hyperparams.update(
            eval_metric=eval_metric,
            objective=objective,
            num_class=3,
        )

    return XGBClassifier(**hyperparams)

def objective_function(params):
    """Return the negated mean cross-validation score for ``params``.

    ``params`` is a mapping whose keys match the keyword arguments of
    ``get_model`` (``n_estimators``, ``learning_rate``, ``max_depth``).

    NOTE(review): with ``scoring = 'neg_log_loss'`` the negation yields a
    positive log-loss, i.e. a quantity where lower is better, while
    ``expected_improvement`` is written in maximisation form. Confirm that
    the optimisation loop accounts for this sign convention.
    """
    return -get_score(get_model(**params))

def expected_improvement(X, X_sample, y_sample, gausian_process, xi=0.01):
    """Expected-improvement acquisition values (maximisation form).

    Parameters
    ----------
    X : array-like
        Candidate points; passed to ``gausian_process.predict``.
    X_sample : array-like
        Points already evaluated. The surrogate's largest predicted mean at
        these points is used as the incumbent best value.
    y_sample : array-like
        Not used by the computation; kept so existing call sites keep working.
    gausian_process : object
        Fitted surrogate exposing ``predict(X, return_std=False)``.
    xi : float, default 0.01
        Margin subtracted from the improvement before it is scored.

    Returns
    -------
    numpy.ndarray of shape (n_candidates, 1)
        Expected improvement per candidate; exactly 0.0 wherever the
        predictive standard deviation is 0.
    """
    mu, sigma = gausian_process.predict(X, return_std=True)
    mu_sample = gausian_process.predict(X_sample)

    # ``predict`` may hand back the mean as a flat vector or as a column.
    # Force both mean and std into columns: otherwise a flat ``mu`` divided by
    # a column ``sigma`` broadcasts to an (n, n) matrix and the masking below
    # fails with an IndexError.
    mu = np.asarray(mu).reshape(-1, 1)
    sigma = np.asarray(sigma).reshape(-1, 1)
    mu_sample_opt = np.max(mu_sample)

    # sigma == 0 produces x/0 ("divide") or 0/0 ("invalid"); both are
    # expected here and overwritten with 0.0 afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        imp = mu - mu_sample_opt - xi
        Z = imp / sigma
        ei = imp * stats.norm.cdf(Z) + sigma * stats.norm.pdf(Z)
        ei[sigma == 0.0] = 0.0

    return ei

In [None]:
# Search range per tuned hyperparameter; keys match get_model's arguments.
# Each pair is presumably (lower, upper) -- confirm against the optimiser.
pbounds = dict(
    n_estimators=(100, 1000),
    learning_rate=(0.01, 0.1),
    max_depth=(5, 20),
)