In [1]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import random
import copy
np.random.seed(42)
random.seed(42)
RANDOM_SEED = 42

In [2]:
data = pd.read_csv('data.csv')
#data = data[data.Datetime < '2020-12-08']
drop = ['etat_barre_ce', 'etat_barre_lc', 'templow', 'baro', 'etat_barre_pv', 'Année', 'Mois', 'Jour', 'Heure', 'Jour semaine']
target_ce = ['q_ce', 'k_ce']
target_lc = ['q_lc', 'k_lc']
target_pv = ['q_pv', 'k_pv']
all_ = drop + target_ce + target_lc + target_pv
features = [x for x in data.columns.tolist() if x not in all_]
df = copy.deepcopy(data)
df = df.drop(drop, axis=1)
df_ce = copy.deepcopy(df[features + target_ce])
df_lc = copy.deepcopy(df[features + target_lc])
df_pv = copy.deepcopy(df[features + target_pv])

In [5]:
df_ce = df_ce.drop(['temp',
 'icon',
 'hum',
 'rain',
 'fog',
 'thunder',
 'snow','desc_Broken clouds.', 'desc_Chilly.', 'desc_Clear.',
       'desc_Cloudy.', 'desc_Cool.', 'desc_Dense fog.',
       'desc_Drizzle. Broken clouds.', 'desc_Drizzle. Fog.',
       'desc_Drizzle. Low clouds.', 'desc_Drizzle. Mostly cloudy.',
       'desc_Fog.', 'desc_Haze.', 'desc_Ice fog.', 'desc_Light fog.',
       'desc_Light rain. Broken clouds.', 'desc_Light rain. Clear.',
       'desc_Light rain. Cloudy.', 'desc_Light rain. Fog.',
       'desc_Light rain. Low clouds.',
       'desc_Light rain. More clouds than sun.',
       'desc_Light rain. Mostly cloudy.', 'desc_Light rain. Overcast.',
       'desc_Light rain. Partly cloudy.',
       'desc_Light rain. Partly sunny.',
       'desc_Light rain. Passing clouds.', 'desc_Light snow. Ice fog.',
       'desc_Low clouds.', 'desc_Mild.', 'desc_More clouds than sun.',
       'desc_Mostly cloudy.', 'desc_No weather data available',
       'desc_Overcast.', 'desc_Partly cloudy.', 'desc_Partly sunny.',
       'desc_Passing clouds.', 'desc_Rain. Fog.',
       'desc_Scattered clouds.', 'desc_Sprinkles. Mostly cloudy.',
       'desc_Sunny.', 'desc_Thunderstorms. Fog.'], axis=1)

In [3]:
def rolling_custom(d, df, label):
    try:
        return df.loc[d - 168, label]
    except KeyError:
        return float('nan')
#s_q = pd.Series([rolling_custom_q(d) for d in df_pv.index])

def generate_index_cv(X, date_0, date_1, date_2, date_3):
    CViterator = []
    trainindices1, trainindices2 = X[(X.Datetime <= date_1) & (X.Datetime >= date_0)].index.values.astype(int), \
                                   X[(X.Datetime <= date_2) & (X.Datetime >= date_0)].index.values.astype(int)
    testindices1, testindices2 = X[(X.Datetime <= date_2) & (X.Datetime >= date_1)].index.values.astype(int), \
                                 X[(X.Datetime <= date_3) & (X.Datetime >= date_2)].index.values.astype(int)
    CViterator.append((trainindices1, testindices1))
    CViterator.append((trainindices2, testindices2))
    return CViterator

def generate_X_y(df, label, prefix, dropna=True):
    if dropna:
        drop = df[label + '_' + prefix].notnull()
    X = df[drop].drop(['q_' + prefix, 'k_' + prefix], axis=1).reset_index(drop=True)
    y = df[drop][label + '_' + prefix].reset_index(drop=True)
    return X, y

def gridsearchCV(estimator, grid, X, y, cviterator, x_test, y_test):
    fit_params={"early_stopping_rounds":42, 
            "eval_metric" : "rmse",
               "eval_set": [(x_test, y_test)]}
    gridsearch = GridSearchCV(estimator, grid, verbose=1, cv=cviterator, refit=False, scoring='neg_mean_squared_error')
    gridsearch.fit(X, y, **fit_params, verbose=False)
    return gridsearch  

# Champs elysées
## GridSearch for Q

In [6]:
start = '2020-10-01'
date_1, date_2, date_3 = '2020-11-29', '2020-12-05', '2020-12-11'
df_ce = df_ce[df_ce.Datetime >= start]
X, y = generate_X_y(df_ce, 'q', 'ce')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [7]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   14.2s finished


In [8]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 6,
 'min_child_weight': 1,
 'subsample': 0.8}

In [9]:
### Lightgbm
params = {
        'num_leaves': [15, 25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
    'subsample' : [0.8, 1.0]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.15, subsample=0.8)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 108 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:   10.9s finished


In [10]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 8,
 'num_leaves': 25,
 'subsample': 0.8,
 'subsample_freq': 0}

Xgboost : n_estimators=300, subsample=0.8, random_state=27

LGBM : n_estimators=300, subsample=0.8, num_leaves=25, learning_rate=0.15, random_state=27

max(xgb, lgbm) la journée
min(xgb, lgbm) la nuit

## GridSearch K

In [11]:
start = '2020-08-01'
date_1, date_2, date_3 = '2020-11-29', '2020-12-05', '2020-12-11'
df_ce = df_ce[df_ce.Datetime >= start]
X, y = generate_X_y(df_ce, 'k', 'ce')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [12]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   12.9s finished


In [13]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 4,
 'min_child_weight': 5,
 'subsample': 0.6}

In [14]:
### Lightgbm
params = {
        'num_leaves': [15, 25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
    'subsample' : [0.8, 1.0]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 108 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:   15.6s finished


In [15]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 8,
 'num_leaves': 31,
 'subsample': 0.8,
 'subsample_freq': 2}

XGBOOST : n_estimators=300, subsample = 0.6, min_child_weight=5,  max_depth=4, random_state=27

LGBM : learning_rate = 0.15, subsample_freq=2, subsample=0.8, num_leaves=25, n_estimators=300, random_state=27

(lgbm + xgboost / 2) ??

# SaintPere
## GridSearch for Q

In [26]:
start = '2020-01-01'
date_1, date_2, date_3 = '2020-11-29', '2020-12-05', '2020-12-11'
df_pv['back_q'] = pd.Series([rolling_custom(d, df_pv, 'q_pv') for d in df_pv.index])
df_pv = df_pv[df_pv.Datetime >= start]
X, y = generate_X_y(df_pv, 'q', 'pv')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [27]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.7min finished


In [28]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 8,
 'min_child_weight': 5,
 'subsample': 1.0}

In [29]:
### Lightgbm
params = {
        'num_leaves': [15, 25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
        'subsample' : [0.8, 1.0]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 108 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:   34.0s finished


In [30]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 8,
 'num_leaves': 25,
 'subsample': 0.8,
 'subsample_freq': 1}

Xgboost : random_state = 27, max_depth=8, min_child_weight=5, n_estimators=300

LGBM : colsample_bytree=0.8, subsample=0.8, num_leaves=25, n_estimators=300,  subsample_freq=1, random_state=27

max(xgb, lgbm) la journée
min(xgb, lgbm) la nuit  ou la moyenne des deux à voir !

## GridSearch K

In [31]:
start = '2020-01-01'
date_1, date_2, date_3 = '2020-11-29', '2020-12-05', '2020-12-11'
df_pv['back_q'] = pd.Series([rolling_custom(d, df_pv, 'q_pv') for d in df_pv.index])
df_pv['back_k'] = pd.Series([rolling_custom(d, df_pv, 'k_pv') for d in df_pv.index])
df_pv = df_pv[df_pv.Datetime >= start]
X, y = generate_X_y(df_pv, 'k', 'pv')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [32]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.6min finished


In [33]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 6,
 'min_child_weight': 1,
 'subsample': 0.6}

In [34]:
### Lightgbm
params = {
        'num_leaves': [15, 25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
        'subsample' : [0.8, 1.0]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 108 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:   35.1s finished


In [35]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 8,
 'num_leaves': 31,
 'subsample': 0.8,
 'subsample_freq': 1}

XGBOOST : max_depth=6, min_child_weight=5, subsample=0.6

LGBM : colsample_bytree=0.8, subsample=0.8

(lgbm + xgboost / 2) ??

# Convention
## GridSearch for Q

In [16]:
start = '2020-02-01'
date_1, date_2, date_3 = '2020-11-29', '2020-12-05', '2020-12-11'
df_lc['back_q'] = pd.Series([rolling_custom(d, df_lc, 'q_lc') for d in df_lc.index])
df_lc = df_lc[df_lc.Datetime >= start]
X, y = generate_X_y(df_lc, 'q', 'lc')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [17]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.3min finished


In [18]:
grid.best_params_

{'colsample_bytree': 1.0,
 'max_depth': 4,
 'min_child_weight': 5,
 'subsample': 0.6}

In [19]:
### Lightgbm
params = {
        'num_leaves': [15, 25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
        'subsample' : [0.8, 1.0]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 108 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:   30.4s finished


In [20]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 4,
 'num_leaves': 15,
 'subsample': 0.8,
 'subsample_freq': 2}

Xgboost : n_estimators=300, max_depth=4, min_child_weight=5, subsample=0.6, random_state=27

LGBM : n_estimators=300, subsample=0.8, max_depth=4, colsample_bytree=0.8, 
                        subsample_freq=2, num_leaves=15, random_state=27

max(xgb, lgbm) la journée
min(xgb, lgbm) la nuit ou moyenne des deux à voir ...

## GridSearch K

In [21]:
start = '2020-02-01'
date_1, date_2, date_3 = '2020-11-29', '2020-12-05', '2020-12-11'
df_lc['back_k'] = pd.Series([rolling_custom(d, df_lc, 'k_lc') for d in df_lc.index])
df_lc = df_lc.drop(['back_q'], axis=1)
df_lc = df_lc[df_lc.Datetime >= start]
X, y = generate_X_y(df_lc, 'k', 'lc')
cviterator = generate_index_cv(X, start, date_1, date_2, date_3)
X = X.drop(['Datetime'], axis=1)
X_test, y_test = X.iloc[cviterator[1][1]], y.iloc[cviterator[1][1]]

In [22]:
### XGBOOST ####
params = {
        'min_child_weight': [1, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0],
        'max_depth': [4, 6, 8]
        }
xgb = XGBRegressor(n_estimators=300, random_state=27, learning_rate=0.3)
grid = gridsearchCV(xgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.0min finished


In [23]:
grid.best_params_

{'colsample_bytree': 0.7,
 'max_depth': 4,
 'min_child_weight': 5,
 'subsample': 0.8}

In [24]:
### Lightgbm
params = {
        'num_leaves': [15, 25, 31],
        'subsample_freq' : [0, 1, 2],
        'colsample_bytree': [0.8, 1.0],
        'max_depth': [4, 6, 8],
        'subsample' : [0.8, 1.0]
        }
lgb = LGBMRegressor(n_estimators=300, random_state=27, learning_rate=0.1)
grid = gridsearchCV(lgb, params, X.values, y.values, cviterator, X_test.values, y_test.values)

Fitting 2 folds for each of 108 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:   16.9s finished


In [25]:
grid.best_params_

{'colsample_bytree': 0.8,
 'max_depth': 8,
 'num_leaves': 15,
 'subsample': 0.8,
 'subsample_freq': 2}

XGBOOST : random_state = 27, colsample_bytree=0.7, max_depth=4, min_child_weight=5, subsample=0.8,
                      n_estimators=300

LGBM : subsample=0.8, subsample_freq=2, colsample_bytree=0.8, num_leaves=15, n_estimators=300, random_state=27

(lgbm + xgboost / 2) ?? (lgbm a l'air de bien faire l'affaire)