In [1]:
from copy import deepcopy
import numpy as np
%matplotlib inline

import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_validate
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.early_stop import no_progress_loss
from sklearn.model_selection import KFold, cross_validate
import joblib

import warnings 
warnings.filterwarnings("ignore")

import function

In [2]:
# Load the modelling table; the CSV column 'Unnamed: 0' becomes the row index.
df2_backup = pd.read_csv("data_model.csv", index_col='Unnamed: 0')
# Work on a deep copy so df2_backup stays available as an untouched reference.
df2 = deepcopy(df2_backup)

# Every column except the last is treated as a feature; the last column is the target.
X, y = df2.iloc[:, :-1], df2.iloc[:, -1]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = tts(X, y, test_size=0.3, random_state=1)

# Renumber all four pieces 0..n-1. Features and targets are reindexed together
# so that any label-aligned pandas operation (concat, arithmetic, boolean masks)
# on an X/y pair keeps rows matched up.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True) for part in (Xtrain, Xtest, Ytrain, Ytest)
)

In [3]:
# Baseline model: a random forest with library-default hyperparameters,
# seeded so repeated runs build the same forest.
rfr1 = RandomForestRegressor(random_state=1)
rfr1.fit(Xtrain, Ytrain)

# Project helper; in this notebook it reports R^2, MSE and RMSE for the
# training and test sets.
function.eval_regressor(
    rfr1,
    Xtrain, Ytrain,
    Xtest, Ytest,
)

r2_train: 0.9965129922754085
r2_test: 0.9755928055993865
mse_train: 0.00020178941951421993
mse_test: 0.001372431142430183
rmse_train: 0.014205260276187126
rmse_test: 0.03704633777352605


In [5]:
def hyperopt_objective(params):
    """Score one hyperparameter candidate for the hyperopt search.

    Parameters
    ----------
    params : dict
        Must provide "n_estimators", "max_depth" and "max_features". Each
        value is cast to ``int`` before being passed to the forest.

    Returns
    -------
    float
        Mean of the absolute per-fold "neg_root_mean_squared_error" scores
        from 5-fold cross-validation on the module-level Xtrain / Ytrain,
        i.e. the average RMSE. Lower is better.
    """
    # Cast the sampled values to int in the same order the forest receives them.
    forest_kwargs = {
        name: int(params[name])
        for name in ("n_estimators", "max_depth", "max_features")
    }
    model = RandomForestRegressor(
        random_state=1,
        verbose=False,
        n_jobs=6,
        **forest_kwargs,
    )

    # Shuffled, seeded folds so every candidate is scored on identical splits.
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    cv_result = cross_validate(
        model,
        Xtrain, Ytrain,
        scoring="neg_root_mean_squared_error",
        cv=folds,
        verbose=False,
        n_jobs=6,
        error_score='raise',  # surface fitting failures instead of scoring them
    )

    # The scorer yields negated RMSE; take magnitudes so fmin minimises RMSE.
    fold_scores = cv_result["test_score"]
    return np.mean(abs(fold_scores))

In [6]:
# Search space handed to hyperopt: one quantised-uniform range per forest
# hyperparameter, written as hp.quniform(label, low, high, step).
param_grid_simple = dict(
    n_estimators=hp.quniform("n_estimators", 140, 180, 2),
    max_depth=hp.quniform("max_depth", 18, 25, 1),
    max_features=hp.quniform("max_features", 1, 8, 1),
)

In [7]:
def param_hyperopt(max_evals=100):
    """Minimise ``hyperopt_objective`` over ``param_grid_simple`` with TPE.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of objective evaluations passed to ``fmin``.

    Returns
    -------
    tuple
        ``(params_best, trials)``: the best parameter assignment returned by
        ``fmin`` and the ``Trials`` object that recorded the search.
    """
    history = Trials()

    # Early-stopping callback built with a window of 100 iterations
    # (see hyperopt's no_progress_loss for the exact stopping rule).
    stop_when_stalled = no_progress_loss(100)

    best = fmin(
        hyperopt_objective,
        space=param_grid_simple,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=history,
        early_stop_fn=stop_when_stalled,
    )

    print("\n", "\n", "best params: ", best, "\n")
    return best, history

In [None]:
# Launch the hyperparameter search with a budget of 240 evaluations, keeping
# both the best parameter assignment and the trial history it returns.
params_best, trials = param_hyperopt(240)

In [4]:
# Refit on the full training set with hand-entered hyperparameters
# (presumably taken from the search result - confirm against params_best).
rfr3 = RandomForestRegressor(
    random_state=1,
    n_estimators=160,
    max_depth=24,
    max_features=4,
)
rfr3.fit(Xtrain, Ytrain)

# Same project helper as used for the baseline, so the metrics are comparable.
function.eval_regressor(
    rfr3,
    Xtrain, Ytrain,
    Xtest, Ytest,
)

r2_train: 0.9969692544765298
r2_test: 0.977200204225548
mse_train: 0.00017538601235763554
mse_test: 0.001282046156075997
rmse_train: 0.013243338414374056
rmse_test: 0.03580567212155075


In [None]:
# Persist the fitted rfr3 model to "rf.dat" (relative to the working directory)
# so it can be reloaded later with joblib.load.
joblib.dump(rfr3, "rf.dat")