### Load XGBoost libraries

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import explained_variance_score, r2_score
from matplotlib import pyplot as plt
from cxutils import tableau20
%matplotlib inline

### Set up hyperparameters for selection

In [None]:
# Hyperparameter grid: every estimator count is combined with every tree depth.
estimator_cnts = [1, 10, 500, 1000]
max_depths = [2, 5, 10]
# Plot marker used for each tree depth.
max_depths_marker = {2: 'o', 5: '^', 10: 'x'}

# One not-yet-fitted regressor per grid point, keyed by 'est:<count>|md:<depth>'.
models = {}
for estimator_cnt in estimator_cnts:
    for max_depth in max_depths:
        descriptor = 'est:{}|md:{}'.format(estimator_cnt, max_depth)
        models[descriptor] = XGBRegressor(
            n_estimators=estimator_cnt,
            max_depth=max_depth,
        )

### Load the randomly selected train/test sets

In [None]:
def load_data_sets(filename):
    """Load the four pickled parts of a train/test split.

    Args:
        filename: Common path prefix of the split. The files
            ``<filename>_train_data.pickle``,
            ``<filename>_train_target.pickle``,
            ``<filename>_test_data.pickle`` and
            ``<filename>_test_target.pickle`` are read.

    Returns:
        A list ``[train_data, train_target, test_data, test_target]`` with the
        unpickled objects in that order.
    """
    parts = ('train_data', 'train_target', 'test_data', 'test_target')
    # SECURITY: unpickling can execute arbitrary code, so only point this at
    # files that come from a trusted source.
    return [pd.read_pickle('{}_{}.pickle'.format(filename, part))
            for part in parts]


# Unpack the four parts of the split stored under this file-name prefix.
(train_data,
 train_target,
 test_data,
 test_target) = load_data_sets('usfd_additional_onehot_random_20')

### For every hyperparameter combination, fit a model to the training data and predict on the training and test sets

In [None]:
# Convert the inputs once up front: the arrays are the same for every model,
# so repeating np.asarray inside the loop is wasted work.
train_features = np.asarray(train_data)
train_labels = np.asarray(train_target).ravel()  # flattened to 1-D for fit()
test_features = np.asarray(test_data)

# descriptor -> predictions of that model on the test / training inputs.
predictions_test = {}
predictions_train = {}
for descriptor, model in models.items():
    model.fit(train_features, train_labels)
    predictions_test[descriptor] = model.predict(test_features)
    predictions_train[descriptor] = model.predict(train_features)



### Show prediction result on the training set (this ought to be decent)

In [None]:
# R2 on the training set, computed once per model and reused below.
r2_train = {descriptor: r2_score(train_target, predictions_train[descriptor])
            for descriptor in models}
# Pick the best-scoring hyperparameter set. max() avoids sorting the whole
# list just to read its first element, and keeps the same tie-breaking
# (the first descriptor in `models` order wins).
best_set = max(r2_train, key=r2_train.get)
# Scatter of true target (x) against the best model's prediction (y).
plt.scatter(x=train_target, y=predictions_train[best_set], marker='o', s=1, color=tableau20[1])
print('Set {} : R2 -> {}'
      .format(best_set, r2_train[best_set]))

### Show prediction results on test set (this is the relevant quantity)

In [None]:
# R2 on the test set, computed once per model and reused below.
r2_test = {descriptor: r2_score(test_target, predictions_test[descriptor])
           for descriptor in models}
# Pick the best-scoring hyperparameter set. max() avoids sorting the whole
# list just to read its first element, and keeps the same tie-breaking
# (the first descriptor in `models` order wins).
# NOTE(review): the winner is chosen by its score on the test set itself, so
# the R2 printed below is an optimistic estimate; a separate validation split
# would be needed for an unbiased figure.
best_set = max(r2_test, key=r2_test.get)
# Scatter of true target (x) against the best model's prediction (y).
plt.scatter(x=test_target, y=predictions_test[best_set], marker='o', s=1, color=tableau20[2])
print('Set {} : R2 -> {}'
      .format(best_set, r2_test[best_set]))

### Compare hyperparameter selection

In [None]:
# One curve per tree depth: test-set R2 as a function of the number of
# estimators, on a logarithmic x axis. Drawing each depth as a single labelled
# series gives it one colour and a legend entry, instead of one unlabelled
# artist per grid point.
for max_depth in max_depths:
    depth_scores = [
        r2_score(test_target,
                 predictions_test['est:{}|md:{}'.format(estimator_cnt, max_depth)])
        for estimator_cnt in estimator_cnts
    ]
    plt.semilogx(estimator_cnts, depth_scores,
                 marker=max_depths_marker[max_depth],
                 label='max_depth={}'.format(max_depth))
plt.xlabel('n_estimators')
plt.ylabel('R2 (test set)')
plt.legend()
plt.show()
            