In [1]:
# Demonstration data used in this excercise is already preprocessed and split into
# training, validation, and test sets

# Use wget to download the data stored in csv format.
import itertools

# Define what files to download; download all of the preprocessed data
# Note that the data are already split into Train, Validation, and Test sets.
# The predictor data are denoted with 'X', the target by 'y'
data_download = {}
data_download["window_size"] = [5, 15]
data_download["data_type"] = ["train", "valid", "test"]
data_download["predictor_or_target"] = ["X", "y"]

# Prepare the combinations of the window sizes and the data types
keys, values = zip(*data_download.items())
data_download_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
display('The kind of data to be downloaded:', data_download_combinations)

print("Downloading started...")
for data_download_param in data_download_combinations:
  file_to_download = "https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/window_size_{0}_time_encoding_True/{1}_{2}_window_size_{0}_time_encoding_True.csv"\
                     .format(data_download_param["window_size"], data_download_param["predictor_or_target"], data_download_param["data_type"])

  # the actual downloading
  !wget "$file_to_download"

print("Downloading has finished")


'The kind of data to be downloaded:'

[{'data_type': 'train', 'predictor_or_target': 'X', 'window_size': 5},
 {'data_type': 'train', 'predictor_or_target': 'y', 'window_size': 5},
 {'data_type': 'valid', 'predictor_or_target': 'X', 'window_size': 5},
 {'data_type': 'valid', 'predictor_or_target': 'y', 'window_size': 5},
 {'data_type': 'test', 'predictor_or_target': 'X', 'window_size': 5},
 {'data_type': 'test', 'predictor_or_target': 'y', 'window_size': 5},
 {'data_type': 'train', 'predictor_or_target': 'X', 'window_size': 15},
 {'data_type': 'train', 'predictor_or_target': 'y', 'window_size': 15},
 {'data_type': 'valid', 'predictor_or_target': 'X', 'window_size': 15},
 {'data_type': 'valid', 'predictor_or_target': 'y', 'window_size': 15},
 {'data_type': 'test', 'predictor_or_target': 'X', 'window_size': 15},
 {'data_type': 'test', 'predictor_or_target': 'y', 'window_size': 15}]

Downloading started...
--2021-11-10 15:23:11--  https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/window_size_5_time_encoding_True/X_train_window_size_5_time_encoding_True.csv
Resolving frankfurt-school-dataset.s3.eu-central-1.amazonaws.com (frankfurt-school-dataset.s3.eu-central-1.amazonaws.com)... 52.219.169.30
Connecting to frankfurt-school-dataset.s3.eu-central-1.amazonaws.com (frankfurt-school-dataset.s3.eu-central-1.amazonaws.com)|52.219.169.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53997719 (51M) [text/csv]
Saving to: ‘X_train_window_size_5_time_encoding_True.csv’


2021-11-10 15:23:14 (18.0 MB/s) - ‘X_train_window_size_5_time_encoding_True.csv’ saved [53997719/53997719]

--2021-11-10 15:23:14--  https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/window_size_5_time_encoding_True/y_train_window_size_5_time_encoding_True.csv
Resolving frankfurt-school-dataset.s3.eu-central-1.amazonaws.com (frankfurt-scho

In [2]:
import pandas as pd 
import numpy as np

In [3]:
# Install the seglearn package into the running environment via a shell escape.
# NOTE(review): no cell visible in this part of the notebook imports seglearn —
# confirm the install is still needed; the recorded run resolved to version 1.2.3.
!pip install seglearn

Collecting seglearn
  Downloading seglearn-1.2.3-py3-none-any.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 4.9 MB/s 
Installing collected packages: seglearn
Successfully installed seglearn-1.2.3


In [4]:
def load_data(num):
    """Read the training and validation CSV files for one window size.

    The files are looked up by relative path, i.e. in the current working
    directory, under the names produced by the download cell.

    Parameters
    ----------
    num : value substituted into the file names as the window size
        (the notebook uses 5 and 15).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)`` as pandas DataFrames, read
        in exactly that order.
    """
    file_templates = (
        "X_train_window_size_{}_time_encoding_True.csv",
        "y_train_window_size_{}_time_encoding_True.csv",
        "X_valid_window_size_{}_time_encoding_True.csv",
        "y_valid_window_size_{}_time_encoding_True.csv",
    )
    X_train, y_train, X_valid, y_valid = (
        pd.read_csv(template.format(num)) for template in file_templates
    )
    return X_train, y_train, X_valid, y_valid

In [5]:
def evaluate_model(model, X_valid, y_valid):
    """Score a fitted model on a validation set.

    Parameters
    ----------
    model : any object exposing ``predict(X)``.
    X_valid : predictor data, handed unchanged to ``model.predict``.
    y_valid : true target values, passed as the first argument of both metrics.

    Returns
    -------
    tuple
        ``(mse, mae)``: mean squared error and mean absolute error between
        ``y_valid`` and the model's predictions.

    Notes
    -----
    No inverse transform is applied here, so both errors are expressed on
    whatever scale ``y_valid`` is stored in.
    """
    # Kept as a function-scope import so the cell only needs scikit-learn
    # when it is actually called.
    from sklearn.metrics import mean_squared_error, mean_absolute_error

    predictions = model.predict(X_valid)
    mse = mean_squared_error(y_valid, predictions)
    mae = mean_absolute_error(y_valid, predictions)
    return mse, mae

In [6]:
def display_history(history, name="None"):
    """Plot training and validation loss per epoch, save the figure, and show it.

    Parameters
    ----------
    history : object whose ``history`` attribute is a mapping holding the
        sequences ``'loss'`` and ``'val_loss'``
        (presumably a Keras ``History`` object; confirm against the caller).
    name : str, default ``"None"``
        Label inserted into the plot title and the output file name.

    Returns
    -------
    None

    Side effects
    ------------
    Writes ``Loss_for_<name>.png`` to the current working directory and then
    displays the figure.
    """
    import matplotlib.pyplot as plt

    title = "Loss_for_{}".format(name)

    # Draw on the explicit Axes instead of the implicit pyplot "current axes".
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history.history['loss'])
    ax.plot(history.history['val_loss'])
    ax.set_title(title)
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid'], loc='upper left')

    # Save before showing, as the original cell did.
    fig.savefig(title + '.png')
    plt.show()

In [None]:
# First XGBoost grid search, window size 5:
#   max_depth     in [2, 5, 12]
#   n_estimators  in [10, 50, 100, 150]
#   learning_rate in [0.1, 0.01, 0.05]
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 5, 12],
    n_estimators=[10, 50, 100, 150],
    learning_rate=[0.1, 0.01, 0.05],
)

X_train, y_train, X_valid, y_valid = load_data(5)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=2)
# Runs sequentially; n_jobs=-1 was left switched off for this search.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   1.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   0.8s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   1.4s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   2.9s
[CV] learning_rate=0.1, max_depth=2, n_estimators=100 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=100, total=   2.6s
[CV] learning_rate=0.1, max_depth=2, n_estimators=100 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=100, total=   5.6s
[CV] learning_rate=0.1, max_depth=2, n_estimators=150 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=150, total=   3.9s
[CV] learning_rate=0.1, max_depth=2, n_estimators=150 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=150, total=   8.3s
[CV] learning_rate=0.1, max_depth=5, n_estimators=10 .................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 12.3min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
           

In [None]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)
# Best MAE for window size 5.

-0.006784124257238178 {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}


(26.07753730646325, 2.5847180467478985)

In [7]:
# First XGBoost grid search, window size 15:
#   max_depth     in [2, 5, 12]
#   n_estimators  in [10, 50, 100, 150]
#   learning_rate in [0.1, 0.01, 0.05]
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 5, 12],
    n_estimators=[10, 50, 100, 150],
    learning_rate=[0.1, 0.01, 0.05],
)

X_train, y_train, X_valid, y_valid = load_data(15)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=2)
# Runs sequentially; n_jobs=-1 was left switched off for this search.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   1.0s
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   0.6s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   1.0s
[CV] learning_rate=0.1, max_depth=2, n_estimators=50 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=50, total=   2.0s
[CV] learning_rate=0.1, max_depth=2, n_estimators=100 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=100, total=   2.0s
[CV] learning_rate=0.1, max_depth=2, n_estimators=100 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=100, total=   3.9s
[CV] learning_rate=0.1, max_depth=2, n_estimators=150 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=150, total=   3.0s
[CV] learning_rate=0.1, max_depth=2, n_estimators=150 ................
[CV] . learning_rate=0.1, max_depth=2, n_estimators=150, total=   5.8s
[CV] learning_rate=0.1, max_depth=5, n_estimators=10 .................
[CV] .

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  9.5min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
           

In [8]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)
# The best values of learning rate, max_depth and n_estimators are the same for both window sizes.
# NOTE(review): the recorded outputs show learning_rate=0.01 here but 0.1 for window size 5;
# only max_depth=2 and n_estimators=10 coincide — confirm which run this remark refers to.
# The error values are close to those of the dummy regressor.
# Best MSE for window size 15.

-0.008547227883546449 {'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 10}


(50.01383215746997, 3.527062184506935)

In [7]:
# Second XGBoost grid search, window size 5.
# The first search picked its highest learning rate (0.1) together with the
# smallest max_depth and n_estimators, so this search tries learning rates
# [0.1, 0.2, 0.3, 0.4], shallow trees and small n_estimators values.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    n_estimators=[10, 20, 30, 40],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
)

X_train, y_train, X_valid, y_valid = load_data(5)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=2)
# Runs sequentially; n_jobs=-1 was left switched off for this search.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 2 folds for each of 80 candidates, totalling 160 fits
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   1.1s
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   0.8s
[CV] learning_rate=0.1, max_depth=2, n_estimators=20 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=20, total=   0.6s
[CV] learning_rate=0.1, max_depth=2, n_estimators=20 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=20, total=   1.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=30 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=30, total=   0.8s
[CV] learning_rate=0.1, max_depth=2, n_estimators=30 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=30, total=   1.7s
[CV] learning_rate=0.1, max_depth=2, n_estimators=40 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=40, total=   1.1s
[CV] learning_rate=0.1, max_depth=2, n_estimators=40 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=40, total=   2.2s
[CV] learning_rate=0.1, max_depth=3, n_estimators=10 .................
[CV] .

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  5.4min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4],
        

In [8]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# Lower n_estimators and higher learning rates were tried, yet the best model
# is still the same as in the previous search.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)
# The results match those of grid search 1, so there is no real improvement.

-0.006784124257238178 {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}


(26.07753730646325, 2.5847180467478985)

In [7]:
# Second XGBoost grid search, window size 15: the same grid as the second
# window-size-5 search (learning rates [0.1, 0.2, 0.3, 0.4], shallow trees,
# small n_estimators values).
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    n_estimators=[10, 20, 30, 40],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
)

X_train, y_train, X_valid, y_valid = load_data(15)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=2)
# Runs sequentially; n_jobs=-1 was left switched off for this search.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 2 folds for each of 80 candidates, totalling 160 fits
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   0.5s
[CV] learning_rate=0.1, max_depth=2, n_estimators=10 .................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] .. learning_rate=0.1, max_depth=2, n_estimators=10, total=   0.6s
[CV] learning_rate=0.1, max_depth=2, n_estimators=20 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=20, total=   0.5s
[CV] learning_rate=0.1, max_depth=2, n_estimators=20 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=20, total=   1.0s
[CV] learning_rate=0.1, max_depth=2, n_estimators=30 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=30, total=   0.7s
[CV] learning_rate=0.1, max_depth=2, n_estimators=30 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=30, total=   1.3s
[CV] learning_rate=0.1, max_depth=2, n_estimators=40 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=40, total=   0.9s
[CV] learning_rate=0.1, max_depth=2, n_estimators=40 .................
[CV] .. learning_rate=0.1, max_depth=2, n_estimators=40, total=   1.7s
[CV] learning_rate=0.1, max_depth=3, n_estimators=10 .................
[CV] .

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  4.0min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4],
        

In [12]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# Same best parameters as the first grid search.
# NOTE(review): the recorded output of the first window-size-15 search shows
# learning_rate=0.01, which is not part of this grid; only max_depth=2 and
# n_estimators=10 coincide — confirm which run this remark refers to.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)
# Best MAE for window size 15.

-0.044547374048633404 {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}


(52.6000696096425, 3.5082906674664693)

In [8]:
# Window size 5 again, this time changing the number of splits used by
# TimeSeriesSplit from 2 to 3 and searching a smaller grid.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 3, 4],
    n_estimators=[10, 20],
    learning_rate=[0.1, 0.2, 0.3],
)

X_train, y_train, X_valid, y_valid = load_data(5)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=3)
# Unlike the other searches in this notebook, this one passes n_jobs=-1.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
    n_jobs=-1,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   34.6s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   50.5s finished




GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.2, 0.3],
               

In [10]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# With a different number of time-series splits the model performs slightly
# worse: both the mean squared error and the mean absolute error increase.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)

-0.013092404537533095 {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 10}


(26.403812970998516, 2.5873915224981574)

In [None]:
# Multiple combinations were tried: learning rates [0.01, 0.05, 0.1, 0.2, 0.3, 0.4], max depths [2, 3, 4, 5, 6, 12]
# and n_estimators [10, 20, 30, 40, 50, 100, 150]; in addition, time-series splits of 2 and 3 were tried.
# Even so, the best MAE is about 2.58 for window size 5 and about 3.51 for window size 15; hence, the XGBoost model has not brought much improvement so far.

In [6]:
# Final XGBoost grid search, window size 5: a low number of estimators and
# low max depths, since those performed best in the earlier searches.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    n_estimators=[2, 4, 5, 6, 7, 8, 9],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
)

X_train, y_train, X_valid, y_valid = load_data(5)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=2)
# Runs sequentially; n_jobs=-1 was left switched off for this search.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 2 folds for each of 140 candidates, totalling 280 fits
[CV] learning_rate=0.1, max_depth=2, n_estimators=2 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... learning_rate=0.1, max_depth=2, n_estimators=2, total=   0.9s
[CV] learning_rate=0.1, max_depth=2, n_estimators=2 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ... learning_rate=0.1, max_depth=2, n_estimators=2, total=   0.3s
[CV] learning_rate=0.1, max_depth=2, n_estimators=4 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=4, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=4 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=4, total=   0.4s
[CV] learning_rate=0.1, max_depth=2, n_estimators=5 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=5, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=5 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=5, total=   0.5s
[CV] learning_rate=0.1, max_depth=2, n_estimators=6 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=6, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=6 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=6, total=   0.5s
[CV] learning_rate=0.1, max_depth=2, n_estimators=7 ..................
[CV] .

[Parallel(n_jobs=1)]: Done 280 out of 280 | elapsed:  2.7min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4],
        

In [7]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# This model has the best MSE value of all the grid searches that were run.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)
# Best MSE for window size 5.

-0.0053682373960097785 {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 6}


(25.693117844952575, 2.5944702403704256)

In [8]:
# Final XGBoost grid search, window size 15: the same low-estimator,
# low-depth grid as the final window-size-5 search.
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

xgbreg = xgb.XGBRegressor()
param_grid = dict(
    max_depth=[2, 3, 4, 5, 6],
    n_estimators=[2, 4, 5, 6, 7, 8, 9],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
)

X_train, y_train, X_valid, y_valid = load_data(15)
# Flatten the target frame to a 1-D array with one entry per row (the reshape
# only succeeds when the frame holds exactly one value per row).
y_train = y_train.values.reshape((len(y_train),))

my_cv = TimeSeriesSplit(n_splits=2)
# Runs sequentially; n_jobs=-1 was left switched off for this search.
CV_xgbmodel = GridSearchCV(
    estimator=xgbreg,
    param_grid=param_grid,
    cv=my_cv,
    verbose=2,
)
# Last expression of the cell: the fitted search object is echoed as output.
CV_xgbmodel.fit(X_train, y_train)

Fitting 2 folds for each of 140 candidates, totalling 280 fits
[CV] learning_rate=0.1, max_depth=2, n_estimators=2 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=2, total=   0.1s
[CV] learning_rate=0.1, max_depth=2, n_estimators=2 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ... learning_rate=0.1, max_depth=2, n_estimators=2, total=   0.3s
[CV] learning_rate=0.1, max_depth=2, n_estimators=4 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=4, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=4 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=4, total=   0.3s
[CV] learning_rate=0.1, max_depth=2, n_estimators=5 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=5, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=5 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=5, total=   0.3s
[CV] learning_rate=0.1, max_depth=2, n_estimators=6 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=6, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, n_estimators=6 ..................
[CV] ... learning_rate=0.1, max_depth=2, n_estimators=6, total=   0.4s
[CV] learning_rate=0.1, max_depth=2, n_estimators=7 ..................
[CV] .

[Parallel(n_jobs=1)]: Done 280 out of 280 | elapsed:  1.9min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.2, 0.3, 0.4],
        

In [9]:
# Print the grid search's best cross-validation score together with the
# hyper-parameter combination that achieved it.
# NOTE(review): the searches in this notebook pass no `scoring` argument, so this is
# presumably the estimator's default score (R^2 for a regressor), not MSE/MAE — confirm.
print(CV_xgbmodel.best_score_, CV_xgbmodel.best_params_)

# evaluate_model returns (mse, mae) on the validation split; as the last
# expression of the cell, the tuple is shown as the cell output.
evaluate_model(CV_xgbmodel, X_valid, y_valid)

-0.007342293850444492 {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 2}


(50.430546946291095, 3.523351474567169)

In [None]:
# There is a clear trend in the XGBoost results: the best-performing models have a low max_depth and a low n_estimators, and they use the smallest number of time-series splits that was tried.