In [56]:
#!/usr/bin/env python

import time

import sys
sys.path.insert(0, '../../../Utilities/')

from plotting import newfig, savefig

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
import numpy as np

import operator
import itertools

from sklearn import metrics
from sklearn.metrics import *
from sklearn import preprocessing

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score

from sklearn import kernel_ridge
from sklearn.kernel_ridge import KernelRidge

from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer

n_jobs = 1
trial  = 1

# Import database
dataset=np.loadtxt("../../data/dataset_lite.csv", delimiter=",")
x=dataset[:,0:2]
y=dataset[:,2] # 0: X, 1: T, 2: shear, 3: bulk, 4: conductivity

# Plot dataset
#plt.scatter(x[:,1], dataset[:,2], s=0.5)
#plt.title('Shear viscosity')
#plt.xlabel('T [K]')
#plt.ylabel(r'$\eta$')
#plt.show()

y=np.reshape(y, (-1,1))
sc_x = StandardScaler()
sc_y = StandardScaler()
X = sc_x.fit_transform(x)
Y = sc_y.fit_transform(y)

# The data is then split into training and test data
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25, random_state=42)

print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (3300, 2)
Training Labels Shape: (3300, 1)
Testing Features Shape: (1100, 2)
Testing Labels Shape: (1100, 1)


In [57]:
hyper_params = [{'n_estimators': (1,10,100,1000,),
                 'min_weight_fraction_leaf': (0.0, 0.25, 0.5),
                 'max_features': ('sqrt','log2',None),
}]

# https://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics

# The scorers can be either be one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'EV': 'explained_variance',
           'ME': 'max_error',
           'NMAE': 'neg_mean_absolute_error',
           'NMSE': 'neg_mean_squared_error',
           'NRMSE': 'neg_root_mean_squared_error',
#          'NMSLE':'neg_mean_squared_log_error',
           'NMAE': 'neg_median_absolute_error',
           'R2':'r2',
#          'NMPD': 'neg_mean_poisson_deviance',
#          'NMGD':'neg_mean_gamma_deviance'
          }

#est=ensemble.RandomForestRegressor()
#grid_clf = GridSearchCV(est, cv=5, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, scoring='r2')
#grid_clf = GridSearchCV(est, cv=5, param_grid=hyper_params, verbose=2, n_jobs=n_jobs, 
#                        scoring=scoring, refit='r2', return_train_score=True)

grid_clf = GridSearchCV(RandomForestRegressor(random_state=42),
                  #param_grid={'min_samples_split': range(2, 403, 10)},
                  param_grid={'n_estimators': range(1,1000, 100)},
                  scoring=scoring, refit='R2', return_train_score=True,
                  verbose=2, n_jobs=n_jobs)

t0 = time.time()
grid_clf.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("RF complexity and bandwidth selected and model fitted in %.3f s" % runtime)

results = grid_clf.cv_results_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=1 ..................................................
[CV] ................................... n_estimators=1, total=   0.0s
[CV] n_estimators=1 ..................................................
[CV] ................................... n_estimators=1, total=   0.0s
[CV] n_estimators=1 ..................................................
[CV] ................................... n_estimators=1, total=   0.1s
[CV] n_estimators=1 ..................................................
[CV] ................................... n_estimators=1, total=   0.0s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] n_estimators=1 ..................................................
[CV] ................................... n_estimators=1, total=   0.0s
[CV] n_estimators=101 ................................................
[CV] ................................. n_estimators=101, total=   1.4s
[CV] n_estimators=101 ................................................
[CV] ................................. n_estimators=101, total=   1.1s
[CV] n_estimators=101 ................................................
[CV] ................................. n_estimators=101, total=   1.4s
[CV] n_estimators=101 ................................................
[CV] ................................. n_estimators=101, total=   1.4s
[CV] n_estimators=101 ................................................
[CV] ................................. n_estimators=101, total=   1.3s
[CV] n_estimators=201 ................................................
[CV] ................................. n_estimators=201, total=   2.6s
[CV] n

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  4.6min finished


RF complexity and bandwidth selected and model fitted in 286.063 s


In [76]:

print(grid_clf.cv_results_[train_score_mae])

# https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html

plt.figure(figsize=(13, 13))
plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
          fontsize=16)

#plt.xlabel("min samples split")
plt.xlabel("n estimators")
plt.ylabel("Score")

ax = plt.gca()
#ax.set_xlim(0, 402)
#ax.set_ylim(0.73, 1)

# Get the regular numpy array from the MaskedArray
#X_axis = np.array(results['param_min_samples_split'].data, dtype=float)
X_axis = np.array(results['param_n_estimators'].data, dtype=float)

for scorer, color in zip(sorted(scoring), ['g', 'k', 'r', 'b', 'y', 'm']):
    for sample, style in (('train', '--'), ('test', '-')):
        sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
        sample_score_std = results['std_%s_%s' % (sample, scorer)]
        ax.fill_between(X_axis, sample_score_mean - sample_score_std,
                        sample_score_mean + sample_score_std,
                        alpha=0.1 if sample == 'test' else 0, color=color)
        ax.plot(X_axis, sample_score_mean, style, color=color,
                alpha=1 if sample == 'test' else 0.7,
                label="%s (%s)" % (scorer, sample))

    best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
    best_score = results['mean_test_%s' % scorer][best_index]

    # Plot a dotted vertical line at the best score for that scorer marked by x
    ax.plot([X_axis[best_index], ] * 2, [0, best_score], 
            linestyle='-.', color=color, marker='x', markeredgewidth=3, ms=8)

    # Annotate the best score for that scorer
    ax.annotate("%0.2f" % best_score, (X_axis[best_index], best_score + 0.005))

plt.legend(loc="best")
plt.grid(False)
plt.show()

KeyError: 1.233585602316465e-07

In [75]:
train_score_mse  = mean_squared_error(sc_y.inverse_transform(y_train), sc_y.inverse_transform(grid_clf.predict(x_train)))
train_score_mae  = mean_absolute_error(sc_y.inverse_transform(y_train),sc_y.inverse_transform(grid_clf.predict(x_train)))
train_score_evs  = explained_variance_score(sc_y.inverse_transform(y_train), sc_y.inverse_transform(grid_clf.predict(x_train)))
train_score_me   = max_error(sc_y.inverse_transform(y_train), sc_y.inverse_transform(grid_clf.predict(x_train)))

test_score_mse  = mean_squared_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(grid_clf.predict(x_test)))
test_score_mae  = mean_absolute_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(grid_clf.predict(x_test)))
test_score_evs  = explained_variance_score(sc_y.inverse_transform(y_test), sc_y.inverse_transform(grid_clf.predict(x_test)))
test_score_me   = max_error(sc_y.inverse_transform(y_test), sc_y.inverse_transform(grid_clf.predict(x_test)))

print(test_score_mae)

3.4062383273762887e-07


In [None]:
sorted_grid_params = sorted(grid_clf.best_params_.items(), key=operator.itemgetter(0))

out_text = '\t'.join(['random-forest',
                      str(trial),
                      str(sorted_grid_params).replace('\n',','),
                      str(train_score_mse),
                      str(train_score_mae),
                      str(train_score_evs),
                      str(train_score_me),
                      str(test_score_mse),
                      str(test_score_mae),
                      str(test_score_evs),
                      str(test_score_me),
                      str(runtime)])
print(out_text)
sys.stdout.flush()

best_n_estimators = grid_clf.best_params_['n_estimators']
best_min_weight_fraction_leaf = grid_clf.best_params_['min_weight_fraction_leaf']
best_max_features = grid_clf.best_params_['max_features']

# open a (new) file to write
outF = open("output.txt", "w")
print('best_n_estimators = ', best_n_estimators, file=outF)
print('best_min_weight_fraction_leaf = ', best_min_weight_fraction_leaf, file=outF)
print('best_max_features = ', best_max_features, file=outF)
outF.close()

In [None]:
rf = RandomForestRegressor(n_estimators=best_n_estimators,
                           min_weight_fraction_leaf=best_min_weight_fraction_leaf,
                           max_features=best_max_features)

t0 = time.time()
rf.fit(x_train, y_train.ravel())
rf_fit = time.time() - t0
print("RF complexity and bandwidth selected and model fitted in %.3f s" % rf_fit)

t0 = time.time()
y_rf = rf.predict(x_test)
rf_predict = time.time() - t0
print("KR prediction for %d inputs in %.3f s" % (x_test.shape[0], rf_predict))

# open a file to append
outF = open("output.txt", "a")
print("RF complexity and bandwidth selected and model fitted in %.3f s" % rf_fit, file=outF)
print("RF prediction for %d inputs in %.3f s" % (x_test.shape[0], rf_predict),file=outF)
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_rf), file=outF)
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_rf), file=outF)
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_rf)), file=outF)
outF.close()

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_rf))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_rf))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_rf)))

x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_rf_dim   = sc_y.inverse_transform(y_rf)

plt.scatter(x_test_dim[:,1], y_test_dim[:], s=5, c='red',  marker='o', label='KAPPA')
plt.scatter(x_test_dim[:,1], y_rf_dim[:],   s=2, c='cyan', marker='*', label='Random Forest')
plt.title('Shear viscosity regression with RF')
plt.ylabel(r'$\eta$ [Pa·s]')
plt.xlabel('T [K] ')
plt.legend()
plt.tight_layout()
plt.savefig("eta_RF.pdf", dpi=150, crop='false')
plt.show()