In [1]:
from copy import deepcopy
import numpy as np
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import KFold, cross_validate 
from sklearn.ensemble import GradientBoostingRegressor
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.early_stop import no_progress_loss
import joblib 
import warnings 
warnings.filterwarnings("ignore") 
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import function

In [2]:
# Load the modelling table, using the CSV's 'Unnamed: 0' column as the index.
# df2_backup stays untouched; later cells work on an independent deep copy.
df2_backup = pd.read_csv("data_model.csv", index_col='Unnamed: 0')
df2 = df2_backup.copy(deep=True)

In [3]:
# Features are every column except the last; the target is the last column.
X, y = df2.iloc[:, :-1], df2.iloc[:, -1]
Xtrain, Xtest, Ytrain, Ytest = tts(X, y, test_size=0.3, random_state=10)

# Renumber all four splits from 0 so each X/y pair shares the same index.
# Renumbering only the feature frames would leave the targets carrying the
# original shuffled row labels, and any label-aligned pandas operation between
# features and targets would then silently pair the wrong rows.
Xtrain, Xtest, Ytrain, Ytest = (
    part.reset_index(drop=True) for part in (Xtrain, Xtest, Ytrain, Ytest)
)

In [5]:
# Baseline GBDT: library defaults apart from the fixed seed.
gb1 = GradientBoostingRegressor(random_state=1)
gb1.fit(Xtrain, Ytrain)

# Evaluate on the train and test splits with the project helper.
function.eval_regressor(gb1, Xtrain, Ytrain, Xtest, Ytest)

r2_train: 0.9787866814706709
r2_test: 0.9642061608970356
mse_train: 0.0012175233906805774
mse_test: 0.0020507949680181603
rmse_train: 0.03489302782334284
rmse_test: 0.04528570379289871


In [6]:
def param_hyperopt(max_evals=100, patience=100, rstate=None):
    """Run a TPE search that minimises ``hyperopt_objective`` over ``param_grid_simple``.

    Both of those names are resolved from the notebook's global scope when this
    function is called, so the cells defining them must have been run first.

    Parameters
    ----------
    max_evals : int, default 100
        Upper bound on the number of trials ``fmin`` evaluates.
    patience : int, default 100
        Number of trials without an improvement of the best loss after which
        the search stops early (passed to ``no_progress_loss``). A patience at
        or above ``max_evals`` effectively disables early stopping, because
        the trial budget runs out first.
    rstate : optional, default None
        Random state forwarded to ``fmin`` to make the search repeatable.
        ``None`` keeps hyperopt's default (unseeded) behaviour. The accepted
        type (numpy ``Generator`` vs ``RandomState``) depends on the installed
        hyperopt version -- check its documentation.

    Returns
    -------
    params_best : dict
        The raw result of ``fmin``. Entries are keyed by the hyperopt *label*
        of each dimension (e.g. ``'learning_rate'``, not the space key
        ``'lr'``), and ``hp.choice`` dimensions hold the index of the chosen
        option rather than the option itself.
    trials : hyperopt.Trials
        The trial history collected during the search.
    """
    trials = Trials()

    # Stop once `patience` trials in a row fail to improve the best loss.
    early_stop_fn = no_progress_loss(patience)

    params_best = fmin(hyperopt_objective,
                       space=param_grid_simple,
                       algo=tpe.suggest,
                       max_evals=max_evals,
                       verbose=True,
                       trials=trials,
                       early_stop_fn=early_stop_fn,
                       rstate=rstate
                      )

    print("\n","\n","best params: ", params_best,
          "\n")
    return params_best, trials

In [7]:
def hyperopt_objective(params):
    """Return the loss for one hyperparameter sample.

    Builds a ``GradientBoostingRegressor`` from ``params`` (keys
    "n_estimators", "lr", "criterion", "loss", "max_features", "subsample"),
    runs shuffled 5-fold cross-validation on the global ``Xtrain`` / ``Ytrain``
    scored with negative RMSE, and returns the mean of the absolute fold
    scores, so lower is better.
    """
    # The count-like settings are cast to int before reaching the estimator.
    model_kwargs = {
        "n_estimators": int(params["n_estimators"]),
        "learning_rate": params["lr"],
        "criterion": params["criterion"],
        "loss": params["loss"],
        "max_features": int(params["max_features"]),
        "subsample": params["subsample"],
        "random_state": 1,
        "verbose": False,
    }
    estimator = GradientBoostingRegressor(**model_kwargs)

    # Fixed seed keeps the folds identical for every sample being compared.
    folds = KFold(n_splits=5, shuffle=True, random_state=1)
    cv_result = cross_validate(
        estimator,
        Xtrain,
        Ytrain,
        scoring="neg_root_mean_squared_error",
        cv=folds,
        verbose=False,
        n_jobs=6,
        error_score='raise',
    )

    # Scores are negated RMSE values; flip the sign before averaging.
    fold_rmse = np.abs(cv_result["test_score"])
    return np.mean(fold_rmse)

In [8]:
# Search space handed to fmin. The keyword names are the keys that
# hyperopt_objective reads; the string given to each hp.* call is the label
# under which fmin reports that dimension (note "lr" is labelled
# "learning_rate").
param_grid_simple = dict(
    n_estimators=hp.quniform("n_estimators", 5, 200, 5),
    lr=hp.quniform("learning_rate", 0.05, 1, 0.05),
    criterion=hp.choice("criterion", ["friedman_mse", "squared_error"]),
    loss=hp.choice("loss", ["squared_error", "huber", "quantile"]),
    subsample=hp.quniform("subsample", 0.1, 0.8, 0.1),
    max_features=hp.quniform("max_features", 1, 25, 1),
)

In [9]:
# Run the search with a budget of 140 trials.
params_best, trials = param_hyperopt(max_evals=140)

100%|██████████| 140/140 [00:25<00:00,  5.51trial/s, best loss: 0.03295534054678871]

 
 best params:  {'criterion': 1, 'learning_rate': 0.2, 'loss': 0, 'max_features': 18.0, 'n_estimators': 190.0, 'subsample': 0.7000000000000001} 



In [10]:
# Final model with hand-entered hyperparameters.
# NOTE(review): these values do not match the search output recorded in this
# notebook, and n_estimators=215 lies outside the 5-200 range of
# param_grid_simple -- presumably they come from a different search run;
# confirm their provenance.
reg3 = GradientBoostingRegressor(
    loss='squared_error',
    criterion='squared_error',
    n_estimators=215,
    learning_rate=0.18049172029558583,
    subsample=0.752290959272556,
    max_features=5,
    random_state=1,
)
reg3.fit(Xtrain, Ytrain)

# Evaluate on the train and test splits with the project helper.
function.eval_regressor(reg3, Xtrain, Ytrain, Xtest, Ytest)

r2_train: 0.992167453526862
r2_test: 0.9822736240042149
mse_train: 0.00044954345669460016
mse_test: 0.0010156262531320149
rmse_train: 0.02120243987598126
rmse_test: 0.031868891620701445


In [8]:
# Persist the fitted model to "gbdt.dat" in the working directory.
joblib.dump(value=reg3, filename="gbdt.dat")

['gbdt.dat']