In [1]:
# load libs and modules
import numpy as np
import pandas as pd
import get_data
import preprocess_data
import pickle5

# import sklearn models and functions
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
import xgboost as xgb
from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [34]:
# unpickle files
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
# Whole-split frames.
train_df, test_df, validate_df = (
    pd.read_pickle('../stuart/jar2/train_df.pkl'),
    pd.read_pickle('../stuart/jar2/test_df.pkl'),
    pd.read_pickle('../stuart/jar2/validate_df.pkl'),
)
# X_* / y_* pairs for each split (presumably features / targets -- confirm against preprocessing).
X_train, y_train = (
    pd.read_pickle('../stuart/jar2/X_train.pkl'),
    pd.read_pickle('../stuart/jar2/y_train.pkl'),
)
X_test, y_test = (
    pd.read_pickle('../stuart/jar2/X_test.pkl'),
    pd.read_pickle('../stuart/jar2/y_test.pkl'),
)
X_val, y_val = (
    pd.read_pickle('../stuart/jar2/X_val.pkl'),
    pd.read_pickle('../stuart/jar2/y_val.pkl'),
)

#combine test and validation
# X_test = pd.concat([X_test,X_val],ignore_index=False).reset_index(drop=True)
# y_test = pd.concat([y_test,y_val],ignore_index=False).reset_index(drop=True)

In [None]:
# XGB parameters
# NOTE(review): this cell has no execution count, and the following cell rebinds
# xgb_reg_params / xgb_fit_params / xgb_para with a wider search space.
#
# Hyperopt search space for the XGBRegressor constructor arguments.
# Every hp.* label is spelled exactly like its dict key, so the `best` dict
# returned by fmin (and the values stored in Trials) are keyed by real XGBoost
# parameter names. The reg_alpha label used to be misspelled 'reg_alpah'.
# Per the hyperopt docs, fmin reports hp.choice parameters as the *index* of
# the chosen option, not the option's value.
xgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 1, 0.1)),
    'subsample':        hp.uniform('subsample', 0.5, 1),
    'gamma':            hp.uniform('gamma', 1, 15),
    'reg_alpha':        hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda':       hp.uniform('reg_lambda', 0, 1),
    'n_estimators':     hp.choice('n_estimators',     np.arange(300, 3000, 100, dtype=int))
}

# Keyword arguments that HPOpt.train_reg forwards to reg.fit().
xgb_fit_params = {
    'eval_metric': "rmse",
    'early_stopping_rounds': 10,
    'verbose': 2
}

# Bundle passed to fmin as `space`; the objective reads all three keys.
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    # Loss minimised by hyperopt: RMSE between targets and predictions.
    # Alternative objective (negated R^2):
    #   lambda y, pred: (-1.0*r2_score(y, pred))
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}

In [31]:
# XGB parameters
# Hyperopt search space for the XGBRegressor constructor arguments.
# Every hp.* label is spelled exactly like its dict key, so the `best` dict
# returned by fmin (and the values stored in Trials) are keyed by real XGBoost
# parameter names. The reg_alpha label used to be misspelled 'reg_alpah'.
# Per the hyperopt docs, fmin reports hp.choice parameters as the *index* of
# the chosen option, not the option's value.
xgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.001, 0.5, 0.01)),
    'max_depth':        hp.choice('max_depth',        np.arange(3, 25, 1, dtype=int)),
    # Currently excluded from the search:
    # 'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    # 'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 1, 0.1)),
    # 'subsample':        hp.uniform('subsample', 0.5, 1),
    'gamma':            hp.uniform('gamma', 1, 100),
    'reg_alpha':        hp.quniform('reg_alpha', 1, 200, 0.5),
    'reg_lambda':       hp.uniform('reg_lambda', 0, 100),
    'n_estimators':     hp.choice('n_estimators',     np.arange(100, 5000, 50, dtype=int))
}

# Keyword arguments that HPOpt.train_reg forwards to reg.fit().
xgb_fit_params = {
    'eval_metric': "rmse",
    'early_stopping_rounds': 10,
    'verbose': 2
}

# Bundle passed to fmin as `space`; the objective reads all three keys.
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    # Loss minimised by hyperopt: RMSE between targets and predictions.
    # Alternative objective (negated R^2):
    #   lambda y, pred: (-1.0*r2_score(y, pred))
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}

In [32]:
class HPOpt:
    """Hyperopt driver that fits a regressor on one split and scores it on another.

    The ``x_test`` / ``y_test`` pair is used both as the second ``eval_set``
    entry while fitting and as the data the trial loss is computed on.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Keep the fitting split and the evaluation split on the instance."""
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run ``fmin`` using the objective method called ``fn_name``.

        Args:
            fn_name: name of a method on this object, used as the objective.
            space: search space handed to ``fmin`` and, per trial, to the objective.
            trials: trials object handed to ``fmin`` and returned on success.
            algo: search algorithm handed to ``fmin``.
            max_evals: evaluation budget handed to ``fmin``.

        Returns:
            ``(best, trials)`` when ``fmin`` completes. If ``fmin`` raises, a dict
            ``{'status': STATUS_FAIL, 'exception': <message>}`` is returned
            instead, so callers have to handle both shapes.
        """
        objective = getattr(self, fn_name)
        try:
            best = fmin(fn=objective, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as err:
            failure = {'status': STATUS_FAIL, 'exception': str(err)}
            return failure
        else:
            return best, trials

    def xgb_reg(self, para):
        """Objective: build an ``XGBRegressor`` from ``para['reg_params']`` and score it."""
        model = xgb.XGBRegressor(**para['reg_params'])
        return self.train_reg(model, para)

    def train_reg(self, reg, para):
        """Fit ``reg``, predict on the evaluation split and return the trial result.

        ``para['fit_params']`` is forwarded to ``reg.fit`` and
        ``para['loss_func']`` is called as ``loss_func(y_test, predictions)``.
        """
        # Evaluation sets: the fitting split first, the evaluation split second.
        watchlist = [(self.x_train, self.y_train), (self.x_test, self.y_test)]
        reg.fit(self.x_train, self.y_train, eval_set=watchlist, **para['fit_params'])
        predictions = reg.predict(self.x_test)
        return {
            'loss': para['loss_func'](self.y_test, predictions),
            'status': STATUS_OK,
        }

In [35]:
# Fit each trial on the training split and score it on the validation split.
obj = HPOpt(x_train=X_train, x_test=X_val, y_train=y_train, y_test=y_val)

# xgb_opt is (best, trials) on success, or a {'status', 'exception'} dict if fmin raised.
xgb_opt = obj.process(
    fn_name='xgb_reg',
    space=xgb_para,
    trials=Trials(),
    algo=tpe.suggest,
    max_evals=100,
)

[0]	validation_0-rmse:39.13844	validation_1-rmse:44.13301
[2]	validation_0-rmse:14.05742	validation_1-rmse:19.04369
[4]	validation_0-rmse:7.25518	validation_1-rmse:11.76939
[6]	validation_0-rmse:5.65468	validation_1-rmse:9.66126
[8]	validation_0-rmse:5.24244	validation_1-rmse:8.96207
[10]	validation_0-rmse:5.08785	validation_1-rmse:8.68911
[12]	validation_0-rmse:5.02254	validation_1-rmse:8.57391
[14]	validation_0-rmse:4.97821	validation_1-rmse:8.52646
[16]	validation_0-rmse:4.95053	validation_1-rmse:8.49905
[18]	validation_0-rmse:4.93222	validation_1-rmse:8.47649
[20]	validation_0-rmse:4.91909	validation_1-rmse:8.47032
[22]	validation_0-rmse:4.90731	validation_1-rmse:8.46711
[24]	validation_0-rmse:4.89844	validation_1-rmse:8.46237
[26]	validation_0-rmse:4.89077	validation_1-rmse:8.45976
[28]	validation_0-rmse:4.88758	validation_1-rmse:8.46038
[30]	validation_0-rmse:4.88218	validation_1-rmse:8.46164
[32]	validation_0-rmse:4.87751	validation_1-rmse:8.46346
[34]	validation_0-rmse:4.87161	