### Load random forest regressor libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, r2_score
from matplotlib import pyplot as plt
from cxutils import tableau20
%matplotlib inline

### Set up hyperparameters for selection

In [None]:
# Hyperparameter grid: every combination of the three lists below gets its
# own RandomForestRegressor.
estimator_cnts = [1, 10, 500, 1000]
min_samples_leaves = [1, 5, 10]
max_depths = [2, 5, 10]

# Plot styling lookups keyed by hyperparameter value: one colour per
# min_samples_leaf setting and one marker per max_depth setting.
min_samples_leaves_colors = dict(zip(min_samples_leaves, (tableau20[1], tableau20[2], tableau20[0])))
max_depths_marker = dict(zip(max_depths, ('o', '^', 'x')))

# Unfitted models, keyed by a descriptor string that encodes their settings.
models = {}
for estimator_cnt in estimator_cnts:
    for min_samples_leaf in min_samples_leaves:
        for max_depth in max_depths:
            forest = RandomForestRegressor(
                n_estimators=estimator_cnt,
                min_samples_leaf=min_samples_leaf,
                max_depth=max_depth,
            )
            descriptor = 'est:{}|msl:{}|md:{}'.format(estimator_cnt, min_samples_leaf, max_depth)
            models[descriptor] = forest

### Load randomly selected train/test set 

In [None]:
def load_data_sets(filename):
    """Load the four pickled parts of a train/test split.

    ``filename`` is the common path prefix. The files read are
    ``<filename>_train_data.pickle``, ``<filename>_train_target.pickle``,
    ``<filename>_test_data.pickle`` and ``<filename>_test_target.pickle``.

    Returns a list holding the four unpickled objects in that order.

    Note: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    loaded = []
    for part in ('train_data', 'train_target', 'test_data', 'test_target'):
        path = '{}_{}.pickle'.format(filename, part)
        loaded.append(pd.read_pickle(path))
    return loaded


# Unpack the four parts of the split, in the order load_data_sets returns
# them: training features, training target, test features, test target.
train_data, train_target, test_data, test_target = load_data_sets('usfd_additional_onehot_by_time')

### For all hyperparameters, fit model to data and predict on training and test set

In [None]:
# Fit every model of the grid and keep its predictions on both splits,
# keyed by the model's descriptor.
#
# The array conversions are done once, outside the loop: they do not depend
# on the model, and using the same plain-array form for fit and predict keeps
# the inputs consistent (fitting on an array but predicting on a DataFrame
# makes recent scikit-learn versions warn about mismatched feature names).
train_features = np.asarray(train_data)
test_features = np.asarray(test_data)
train_labels = np.asarray(train_target).ravel()  # flattened to 1-D for fit

predictions_test = dict()
predictions_train = dict()
for descriptor, model in models.items():
    model.fit(train_features, train_labels)
    predictions_test[descriptor] = model.predict(test_features)
    predictions_train[descriptor] = model.predict(train_features)



### Show prediction result on the training set (this ought to be decent)

In [None]:
# Select the hyperparameter set with the highest R2 on the training data and
# plot its predictions against the true training targets.
best_set = max(models.keys(), key=lambda name: r2_score(train_target, predictions_train[name]))
best_train_prediction = predictions_train[best_set]
plt.scatter(x=train_target, y=best_train_prediction, marker='o', s=1, color=tableau20[1])
# Report the selected set itself. The previous version printed `descriptor`,
# a leftover loop variable naming the last model of the grid, so the label
# did not match the scores shown next to it.
print('Set {} : R2 -> {}, AS -> {}'
      .format(best_set,
              r2_score(train_target, best_train_prediction),
              explained_variance_score(train_target, best_train_prediction)))

### Show prediction results on test set (this is the relevant quantity)

In [None]:
# Select the hyperparameter set with the highest R2 on the test data and plot
# its predictions against the true test targets.
best_set = max(models.keys(), key=lambda name: r2_score(test_target, predictions_test[name]))
best_test_prediction = predictions_test[best_set]
plt.scatter(x=test_target, y=best_test_prediction, marker='o', s=1, color=tableau20[2])
# Report the selected set itself (not the leftover loop variable
# `descriptor`), and compute the explained variance score the same way the
# training-set report does instead of printing a hard-coded 0.
print('Set {} : R2 -> {}, AS -> {}'
      .format(best_set,
              r2_score(test_target, best_test_prediction),
              explained_variance_score(test_target, best_test_prediction)))

### Compare hyperparameter selection

In [None]:
# One point per hyperparameter combination: x is the number of estimators
# (log scale), y is the R2 score on the test set. Colour encodes
# min_samples_leaf and marker shape encodes max_depth.
for estimator_cnt in estimator_cnts:
    for min_samples_leaf in min_samples_leaves:
        point_color = min_samples_leaves_colors[min_samples_leaf]
        for max_depth in max_depths:
            descriptor = 'est:{}|msl:{}|md:{}'.format(estimator_cnt, min_samples_leaf, max_depth)
            test_r2 = r2_score(test_target, predictions_test[descriptor])
            plt.semilogx(
                estimator_cnt,
                test_r2,
                marker=max_depths_marker[max_depth],
                color=point_color,
            )
            

In [None]:
# Rank the features of the model stored under best_set by importance,
# highest first, as (column name, importance) pairs.
feature_importances = zip(test_data.columns, models[best_set].feature_importances_)
sorted(feature_importances, key=lambda pair: -pair[1])
