In [4]:
from hyperopt import hp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import catboost as ctb
import random
import copy
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
# Single seed shared by every random source used in the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [2]:
### LOAD DATA
# Read the raw table, then keep only the rows whose Datetime sorts before the cutoff.
# NOTE(review): Datetime is not parsed on read, so this is presumably a string
# comparison against '2020-12-08' - confirm the column really holds ISO-formatted text.
data = pd.read_csv('data.csv')
data = data.loc[data['Datetime'] < '2020-12-08']
data.head()

Unnamed: 0,Datetime,temp,templow,icon,baro,hum,rain,fog,thunder,snow,...,desc_No weather data available,desc_Overcast.,desc_Partly cloudy.,desc_Partly sunny.,desc_Passing clouds.,desc_Rain. Fog.,desc_Scattered clouds.,desc_Sunny.,desc_Thunderstorms. Fog.,confinement
0,2014-01-01 00:00:00,7.0,6.0,17.0,1011.0,92.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2014-01-01 01:00:00,6.0,6.0,17.0,1011.0,92.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2014-01-01 02:00:00,6.0,6.0,17.0,1011.0,92.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2014-01-01 03:00:00,6.0,6.0,17.0,1011.0,92.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2014-01-01 04:00:00,6.0,6.0,17.0,1011.0,92.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Columns removed from the working frame altogether.
drop = ['etat_barre_ce', 'etat_barre_lc', 'etat_barre_pv', 'Année', 'Mois', 'Jour', 'Heure', 'Jour semaine']
# The two target columns (q and k) of each of the three sites.
target_ce = ['q_ce', 'k_ce']
target_lc = ['q_lc', 'k_lc']
target_pv = ['q_pv', 'k_pv']
# Every column that is neither dropped nor a target counts as a feature.
# NOTE(review): the 'Unnamed: 0' column shown by data.head() is not excluded here,
# so it stays in `features` - confirm that this is intended.
all_ = [*drop, *target_ce, *target_lc, *target_pv]
features = [column for column in data.columns.tolist() if column not in all_]
# Work on an independent copy so `data` itself is never modified.
df = data.copy(deep=True).drop(columns=drop)
# One independent frame per site: shared features plus that site's two targets.
df_ce = df[features + target_ce].copy(deep=True)
df_lc = df[features + target_lc].copy(deep=True)
df_pv = df[features + target_pv].copy(deep=True)

q

CE : 95/123 à partir du mois de mai 2020 (sans k & q past week)

LC : 53 à partir du mois de mars 2020 (only with q)

SP : 50 à partir du mois de janvier 2020 (only with q)

k

CE : 3.75 septembre 2020 (mais pas bon sur l'avant-dernière semaine sans k & q past week)

LC : 3  janvier 2020 (avec k past week)

SP : 1.26 juin 2019 (++) (avec k & q past week)


# Utils

In [6]:
def rolling_custom_q(d, df, label, lag=168):
    """Return the value of column ``label`` at index label ``d - lag``.

    Parameters
    ----------
    d : index label
        Label of the current row; ``lag`` is subtracted from it.
    df : pandas.DataFrame
        Frame to look the lagged value up in (via ``df.loc``).
    label : str
        Name of the column to read.
    lag : int, default 168
        Offset subtracted from ``d``. The default keeps the previously
        hard-coded value (168 = 7 * 24, presumably one week of hourly rows -
        confirm against the data).

    Returns
    -------
    The looked-up value, or ``nan`` when the lookup raises ``KeyError``.
    A missing ``label`` column raises ``KeyError`` as well, so it also
    yields ``nan`` rather than an error.
    """
    try:
        return df.loc[d - lag, label]
    except KeyError:
        # No row (or column) for the lagged position: report a missing value.
        return float('nan')
#s_q = pd.Series([rolling_custom_q(d) for d in df_pv.index])

In [7]:
def create_train_test(df, date_min, date_max, label, prefix, start, dropna=True):
    """Split ``df`` into a training window and a test window on ``Datetime``.

    The training window keeps rows with ``start <= Datetime <= date_min``; the
    test window keeps rows with ``date_min < Datetime <= date_max``. Bounds are
    compared with the ``Datetime`` column as it is, so if that column holds
    strings the comparison is lexicographic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Datetime``, ``q_<prefix>`` and ``k_<prefix>`` columns.
    date_min
        Last ``Datetime`` value that still belongs to the training window.
    date_max
        Last ``Datetime`` value that still belongs to the test window.
    label : str
        Selects the target column, which is ``<label>_<prefix>``.
    prefix : str
        Site suffix used in the target column names.
    start
        First ``Datetime`` value of the training window.
    dropna : bool, default True
        When true, rows whose target is null are removed from both windows.
        When false, those rows are kept.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``. The feature frames exclude
        ``Datetime``, ``q_<prefix>`` and ``k_<prefix>``.
    """
    target = label + '_' + prefix
    in_train = (df.Datetime >= start) & (df.Datetime <= date_min)
    in_test = (df.Datetime > date_min) & (df.Datetime <= date_max)
    if dropna:
        # Only rows with a known target can be used for fitting or scoring.
        has_target = df[target].notnull()
        in_train = in_train & has_target
        in_test = in_test & has_target
    train, test = df[in_train], df[in_test]
    # Neither target of this site, nor the timestamp, may leak into the features.
    non_features = ['Datetime', 'q_' + prefix, 'k_' + prefix]
    x_train, y_train = train.drop(non_features, axis=1), train[target]
    x_test, y_test = test.drop(non_features, axis=1), test[target]
    return x_train, y_train, x_test, y_test

In [16]:
class HPOpt:
    """Hold one train/evaluation split and expose hyperopt objectives for it.

    Each ``*_reg`` method is an objective: it builds a regressor from
    ``para['reg_params']``, fits it and returns the loss dict expected by
    ``fmin``. ``process`` runs the search for the objective of a given name.
    """

    def __init__(self, x_train, x_test, y_train, y_test):
        """Store the split; note the argument order (both x first, then both y)."""
        self.x_train = x_train
        self.x_test  = x_test
        self.y_train = y_train
        self.y_test  = y_test

    def process(self, fn_name, space, trials, algo, max_evals):
        """Run ``fmin`` over ``space`` with the objective method named ``fn_name``.

        Returns
        -------
        tuple or dict
            ``(result, trials)`` when the search completes. If ``fmin`` raises,
            the exception is not propagated: a dict with the keys ``'status'``
            (``STATUS_FAIL``) and ``'exception'`` (the message) is returned
            instead, so callers must check the type before unpacking.
        """
        fn = getattr(self, fn_name)
        try:
            result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
        except Exception as e:
            return {'status': STATUS_FAIL,
                    'exception': str(e)}
        return result, trials

    def xgb_reg(self, para):
        """Objective for XGBoost: build the regressor and evaluate it."""
        reg = xgb.XGBRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def lgb_reg(self, para):
        """Objective for LightGBM: build the regressor and evaluate it."""
        reg = lgb.LGBMRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def ctb_reg(self, para):
        """Objective for CatBoost: build the regressor and evaluate it."""
        reg = ctb.CatBoostRegressor(**para['reg_params'])
        return self.train_reg(reg, para)

    def train_reg(self, reg, para):
        """Fit ``reg`` on the training split and score it on the evaluation split.

        The evaluation split is handed to ``fit`` through ``eval_set`` (after the
        training split) together with ``para['fit_params']``, and the same split
        is then used to compute the returned loss with ``para['loss_func']``.

        Returns
        -------
        dict
            ``{'loss': <loss>, 'status': STATUS_OK}``.
        """
        reg.fit(self.x_train, self.y_train,
                eval_set=[(self.x_train, self.y_train), (self.x_test, self.y_test)],
                **para['fit_params'])
        pred = reg.predict(self.x_test)
        loss = para['loss_func'](self.y_test, pred)
        return {'loss': loss, 'status': STATUS_OK}

# Our hyperparameter search (hyperopt)

In [39]:
# XGB parameters
# Search space for the regressor constructor: fixed learning rate and tree count,
# sampled depth, child weight and column / row subsampling.
xgb_reg_params = {
    'learning_rate': 0.08,
    'max_depth': hp.choice('max_depth', np.arange(5, 16, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    'subsample': hp.uniform('subsample', 0.8, 1),
    'n_estimators': 200,
}
# Keyword arguments forwarded to fit().
# NOTE(review): whether fit() still accepts these keywords depends on the
# installed xgboost version - confirm.
xgb_fit_params = dict(eval_metric='rmse', early_stopping_rounds=30, verbose=False)
# Bundle handed to the objective: constructor space, fit kwargs and an RMSE loss.
xgb_para = {
    'reg_params': xgb_reg_params,
    'fit_params': xgb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}


# LightGBM parameters
# Search space for the regressor constructor.
lgb_reg_params = {
    'learning_rate':    hp.choice('learning_rate',    np.arange(0.05, 0.31, 0.05)),
    'max_depth':        hp.choice('max_depth',        np.arange(5, 16, 1, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
    'colsample_bytree': hp.choice('colsample_bytree', np.arange(0.3, 0.8, 0.1)),
    'subsample':        hp.uniform('subsample', 0.8, 1),
    # LightGBM only applies `subsample` when bagging is enabled through a positive
    # `subsample_freq`; with the default of 0 the sampled value above had no effect.
    'subsample_freq':   1,
    'n_estimators':     200
}
# Keyword arguments forwarded to fit().
# NOTE(review): whether fit() still accepts `early_stopping_rounds` / `verbose`
# depends on the installed LightGBM version - confirm.
lgb_fit_params = {
    'eval_metric': 'l2',
    'early_stopping_rounds': 30,
    'verbose': False
}
# Bundle handed to the objective: constructor space, fit kwargs and an RMSE loss.
lgb_para = dict()
lgb_para['reg_params'] = lgb_reg_params
lgb_para['fit_params'] = lgb_fit_params
lgb_para['loss_func' ] = lambda y, pred: np.sqrt(mean_squared_error(y, pred))


# CatBoost parameters
# Search space for the regressor constructor: fixed learning rate, tree count and
# metric, sampled depth and per-level column sampling.
ctb_reg_params = dict(
    learning_rate=0.08,
    max_depth=hp.choice('max_depth', np.arange(5, 16, dtype=int)),
    colsample_bylevel=hp.choice('colsample_bylevel', np.arange(0.3, 0.8, 0.1)),
    n_estimators=200,
    eval_metric='RMSE',
)
# Keyword arguments forwarded to fit().
ctb_fit_params = dict(early_stopping_rounds=30, verbose=False)
# Bundle handed to the objective: constructor space, fit kwargs and an RMSE loss.
ctb_para = {
    'reg_params': ctb_reg_params,
    'fit_params': ctb_fit_params,
    'loss_func': lambda y, pred: np.sqrt(mean_squared_error(y, pred)),
}

# Champs Elysées

## q 

In [36]:
# Two splits of the CE frame for target q, both training from '2020-05-01':
# the first is evaluated on ('2020-11-26', '2020-12-02'], the second on
# ('2020-12-02', '2020-12-08'].
X_train, y_train, X_test, y_test = create_train_test(
    df_ce, '2020-11-26', '2020-12-02', 'q', 'ce', '2020-05-01')
X_train1, y_train1, X_test1, y_test1 = create_train_test(
    df_ce, '2020-12-02', '2020-12-08', 'q', 'ce', '2020-05-01')
# Pack both splits into two-element lists (first split, then second split).
x_train, x_test = [X_train, X_train1], [X_test, X_test1]
# Careful: from here on `y_train` / `y_test` are lists, no longer the Series
# returned by the first call above.
y_train, y_test = [y_train, y_train1], [y_test, y_test1]

In [40]:
# Optimiser bound to the second split (the *1 variables).
obj = HPOpt(x_train=X_train1, x_test=X_test1, y_train=y_train1, y_test=y_test1)
#xgb_opt = obj.process(fn_name='xgb_reg', space=xgb_para, trials=Trials(), algo=tpe.suggest, max_evals=100)