In [12]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import random
import copy
np.random.seed(42)
random.seed(42)
RANDOM_SEED = 42

In [100]:
data = pd.read_csv('data.csv')
data = data[data.Datetime < '2020-12-08']
drop = ['etat_barre_ce', 'etat_barre_lc', 'templow', 'baro', 'etat_barre_pv', 'Année', 'Mois', 'Jour', 'Heure', 'Jour semaine']
target_ce = ['q_ce', 'k_ce']
target_lc = ['q_lc', 'k_lc']
target_pv = ['q_pv', 'k_pv']
all_ = drop + target_ce + target_lc + target_pv
features = [x for x in data.columns.tolist() if x not in all_]
df = copy.deepcopy(data)
df = df.drop(drop, axis=1)
df_ce = copy.deepcopy(df[features + target_ce])
df_lc = copy.deepcopy(df[features + target_lc])
df_pv = copy.deepcopy(df[features + target_pv])

In [23]:
def rolling_custom(d, df, label):
    try:
        return df.loc[d - 168, label]
    except KeyError:
        return float('nan')
#s_q = pd.Series([rolling_custom_q(d) for d in df_pv.index])

def generate_index_cv(X, date_0, date_1, date_2, date_3):
    CViterator = []
    trainindices1, trainindices2 = X[(X.Datetime <= date_1) & (X.Datetime >= date_0)].index.values.astype(int), \
                                   X[(X.Datetime <= date_2) & (X.Datetime >= date_0)].index.values.astype(int)
    testindices1, testindices2 = X[(X.Datetime <= date_2) & (X.Datetime >= date_1)].index.values.astype(int), \
                                 X[(X.Datetime <= date_3) & (X.Datetime >= date_2)].index.values.astype(int)
    CViterator.append((trainindices1, testindices1))
    CViterator.append((trainindices2, testindices2))
    return CViterator

def generate_X_y(df, label, prefix, dropna=True):
    if dropna:
        drop = df[label + '_' + prefix].notnull()
    X = df[drop].drop(['q_' + prefix, 'k_' + prefix], axis=1).reset_index(drop=True)
    y = df[drop][label + '_' + prefix].reset_index(drop=True)
    return X, y

def gridsearchCV(estimator, grid, X, y, cviterator, x_test, y_test):
    fit_params={"early_stopping_rounds":42, 
            "eval_metric" : "rmse",
               "eval_set": [(x_test, y_test)]}
    gridsearch = GridSearchCV(estimator, grid, verbose=1, cv=cviterator, refit=False, scoring='neg_mean_squared_error')
    gridsearch.fit(X, y, **fit_params, verbose=False)
    return gridsearch  

# Champs elysées
## GridSearch for Q

In [24]:
start = '2020-05-01'
date_1, date_2, date_3 = '2020-11-26', '2020-12-02', '2020-12-08'
df_ce = df_ce[df_ce.Datetime >= start]
X, y = generate_X_y(df_ce, 'q', 'ce')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [25]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   25.0s finished


In [28]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 6,
 'min_child_weight': 1,
 'subsample': 0.8}

In [33]:
### XGBOOST ####
params = {
        'booster' : ['gbtree', 'dart'],
        'min_child_weight': [1],
        'colsample_bytree': [0.5, 1.0],
        'max_depth': [6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3, subsample=0.8)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    9.9s finished


In [34]:
grid.best_params_

{'booster': 'gbtree',
 'colsample_bytree': 1.0,
 'max_depth': 6,
 'min_child_weight': 1}

In [35]:
### Lightgbm
params = {
        'boosting_type' : ['gbdt', 'dart'],
        'num_leaves': [15, 25],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.15, subsample=0.8)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:   40.6s finished


In [39]:
grid.best_params_

{'boosting_type': 'gbdt',
 'colsample_bytree': 1.0,
 'max_depth': 6,
 'num_leaves': 25,
 'subsample_freq': 0}

Xgboost : subsample=0.8

LGBM : lr = 0.15, subsample=0.8

max(xgb, lgbm) la journée
min(xgb, lgbm) la nuit

## GridSearch K

In [50]:
start = '2020-09-01'
date_1, date_2, date_3 = '2020-11-26', '2020-12-02', '2020-12-08'
df_ce = df_ce[df_ce.Datetime >= start]
X, y = generate_X_y(df_ce, 'k', 'ce')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [51]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    9.8s finished


In [54]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 6,
 'min_child_weight': 1,
 'subsample': 1.0}

In [55]:
### XGBOOST ####
params = {
        'booster' : ['gbtree', 'dart'],
        'colsample_bytree': [0.5, 0.8],
        'max_depth' : [6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3, subsample=1)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    4.5s finished


In [59]:
grid.best_params_

{'booster': 'gbtree', 'colsample_bytree': 0.8, 'max_depth': 6}

In [62]:
### Lightgbm
params = {
        'subsample': [0.8, 1],
        'num_leaves': [25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:   10.5s finished


In [63]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 6,
 'num_leaves': 25,
 'subsample': 0.8,
 'subsample_freq': 0}

XGBOOST : colsample_bytree = 0.8

LGBM : subsample = 0.8, num_leaves=25

(lgbm + xgboost / 2) ??

# SaintPere
## GridSearch for Q

In [88]:
start = '2020-01-01'
date_1, date_2, date_3 = '2020-11-26', '2020-12-02', '2020-12-08'
df_pv['back_q'] = pd.Series([rolling_custom(d, df_pv, 'q_pv') for d in df_pv.index])
df_pv = df_pv[df_pv.Datetime >= start]
X, y = generate_X_y(df_pv, 'q', 'pv')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [89]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  1.3min finished


In [90]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 4,
 'min_child_weight': 5,
 'subsample': 0.6}

In [91]:
### XGBOOST ####
params = {
        'min_child_weight': [4, 6],
        'colsample_bytree': [0.5, 0.8],
        'max_depth': [3, 4],
        'subsample': [0.4, 0.6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:   29.8s finished


In [92]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 4,
 'min_child_weight': 6,
 'subsample': 0.6}

In [95]:
### Lightgbm
params = {
        'num_leaves': [31, 35],
        'colsample_bytree': [0.7, 0.8],
        'max_depth': [-1, 8],
        'subsample': [0.7, 0.8]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    5.8s finished


In [96]:
grid.best_params_

{'colsample_bytree': 0.8, 'max_depth': -1, 'num_leaves': 31, 'subsample': 0.7}

Xgboost : subsample=0.6, colsample_bytree=0.8, max_depth=4, min_child_weight=5

LGBM : colsample_bytree=0.7, subsample=0.7

max(xgb, lgbm) la journée
min(xgb, lgbm) la nuit  ou la moyenne des deux à voir !

## GridSearch K

In [101]:
start = '2019-06-01'
date_1, date_2, date_3 = '2020-11-26', '2020-12-02', '2020-12-08'
df_pv['back_q'] = pd.Series([rolling_custom(d, df_pv, 'q_pv') for d in df_pv.index])
df_pv['back_k'] = pd.Series([rolling_custom(d, df_pv, 'k_pv') for d in df_pv.index])
df_pv = df_pv[df_pv.Datetime >= start]
X, y = generate_X_y(df_pv, 'k', 'pv')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [102]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  1.5min finished


In [103]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 6,
 'min_child_weight': 5,
 'subsample': 0.6}

In [104]:
### Lightgbm
params = {
        'subsample': [0.8, 1],
        'num_leaves': [25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:   32.3s finished


In [105]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 8,
 'num_leaves': 31,
 'subsample': 0.8,
 'subsample_freq': 0}

XGBOOST : max_depth=6, min_child_weight=5, subsample=0.6

LGBM : colsample_bytree=0.8, subsample=0.8

(lgbm + xgboost / 2) ??

# Convention
## GridSearch for Q

In [67]:
start = '2020-01-01'
date_1, date_2, date_3 = '2020-11-26', '2020-12-02', '2020-12-08'
df_lc['back_q'] = pd.Series([rolling_custom(d, df_lc, 'q_lc') for d in df_lc.index])
df_lc = df_lc[df_lc.Datetime >= start]
X, y = generate_X_y(df_lc, 'q', 'lc')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [68]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   35.3s finished


In [69]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 4,
 'min_child_weight': 1,
 'subsample': 1.0}

In [71]:
### XGBOOST ####
params = {
        'booster' : ['gbtree', 'dart'],
        'max_depth': [3, 4]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.8s finished


In [72]:
grid.best_params_

{'booster': 'gbtree', 'max_depth': 4}

In [73]:
### Lightgbm
params = {
        'num_leaves': [25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
        'subsample': [0.8, 1]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:   19.9s finished


In [74]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 6,
 'num_leaves': 25,
 'subsample': 0.8,
 'subsample_freq': 2}

Xgboost : max_depth = 4

LGBM : subsample=0.8, num_leaves = 20

max(xgb, lgbm) la journée
min(xgb, lgbm) la nuit ou moyenne des deux à voir ...

## GridSearch K

In [81]:
start = '2020-01-01'
date_1, date_2, date_3 = '2020-11-26', '2020-12-02', '2020-12-08'
df_lc['back_k'] = pd.Series([rolling_custom(d, df_lc, 'k_lc') for d in df_lc.index])
#df_lc = df_lc.drop(['back_q'], axis=1)
df_lc = df_lc[df_lc.Datetime >= start]
X, y = generate_X_y(df_lc, 'k', 'lc')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [82]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   41.1s finished


In [83]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 4,
 'min_child_weight': 1,
 'subsample': 1.0}

In [84]:
### XGBOOST ####
params = {
        'max_depth' : [3, 4]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.1s finished


In [85]:
grid.best_params_

{'max_depth': 4}

In [86]:
### Lightgbm
params = {
        'subsample': [0.8, 1],
        'num_leaves': [25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:   19.4s finished


In [87]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 8,
 'num_leaves': 31,
 'subsample': 0.8,
 'subsample_freq': 0}

XGBOOST : colsample_bytree = 0.8

LGBM : subsample = 0.8, max_depth = 8

(lgbm + xgboost / 2) ?? (lgbm a l'air de bien faire l'affaire)