In [None]:
# Demonstration data used in this exercise is already preprocessed and split into
# training, validation, and test sets.

# Download the data (stored in csv format) with the standard library only, so the
# cell does not depend on an external `wget` binary being installed.
import itertools
import os
import shutil
import urllib.request

# Define what files to download; download all of the preprocessed data.
# Note that the data are already split into Train, Validation, and Test sets.
# The predictor data are denoted with 'X', the target by 'y'.
data_download = {}
data_download["window_size"] = [5, 15]
data_download["data_type"] = ["train", "valid", "test"]
data_download["predictor_or_target"] = ["X", "y"]

# Prepare the combinations of the window sizes and the data types
keys, values = zip(*data_download.items())
data_download_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
display('The kind of data to be downloaded:', data_download_combinations)

# {0} = window size, {1} = 'X' or 'y', {2} = 'train' / 'valid' / 'test'
url_template = (
    "https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/"
    "window_size_{0}_time_encoding_True/{1}_{2}_window_size_{0}_time_encoding_True.csv"
)

print("Downloading started...")
for data_download_param in data_download_combinations:
  file_to_download = url_template.format(
      data_download_param["window_size"],
      data_download_param["predictor_or_target"],
      data_download_param["data_type"],
  )

  # Files are stored in the working directory under the last URL path component.
  local_name = file_to_download.rsplit("/", 1)[-1]

  # Re-running the cell must not fetch (or duplicate) files that are already there.
  if os.path.exists(local_name):
    print("Already present, skipping:", local_name)
    continue

  # the actual downloading: write to a temporary name first so that an interrupted
  # transfer never leaves a truncated file under the final name.
  partial_name = local_name + ".part"
  with urllib.request.urlopen(file_to_download) as response, open(partial_name, "wb") as out_file:
    shutil.copyfileobj(response, out_file)
  os.replace(partial_name, local_name)
  print("Saved:", local_name)

print("Downloading has finished")


'The kind of data to be downloaded:'

[{'data_type': 'train', 'predictor_or_target': 'X', 'window_size': 5},
 {'data_type': 'train', 'predictor_or_target': 'y', 'window_size': 5},
 {'data_type': 'valid', 'predictor_or_target': 'X', 'window_size': 5},
 {'data_type': 'valid', 'predictor_or_target': 'y', 'window_size': 5},
 {'data_type': 'test', 'predictor_or_target': 'X', 'window_size': 5},
 {'data_type': 'test', 'predictor_or_target': 'y', 'window_size': 5},
 {'data_type': 'train', 'predictor_or_target': 'X', 'window_size': 15},
 {'data_type': 'train', 'predictor_or_target': 'y', 'window_size': 15},
 {'data_type': 'valid', 'predictor_or_target': 'X', 'window_size': 15},
 {'data_type': 'valid', 'predictor_or_target': 'y', 'window_size': 15},
 {'data_type': 'test', 'predictor_or_target': 'X', 'window_size': 15},
 {'data_type': 'test', 'predictor_or_target': 'y', 'window_size': 15}]

Downloading started...
--2021-11-09 16:17:48--  https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/window_size_5_time_encoding_True/X_train_window_size_5_time_encoding_True.csv
Resolving frankfurt-school-dataset.s3.eu-central-1.amazonaws.com (frankfurt-school-dataset.s3.eu-central-1.amazonaws.com)... 52.219.75.84
Connecting to frankfurt-school-dataset.s3.eu-central-1.amazonaws.com (frankfurt-school-dataset.s3.eu-central-1.amazonaws.com)|52.219.75.84|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53997719 (51M) [text/csv]
Saving to: ‘X_train_window_size_5_time_encoding_True.csv’


2021-11-09 16:17:52 (18.4 MB/s) - ‘X_train_window_size_5_time_encoding_True.csv’ saved [53997719/53997719]

--2021-11-09 16:17:52--  https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/window_size_5_time_encoding_True/y_train_window_size_5_time_encoding_True.csv
Resolving frankfurt-school-dataset.s3.eu-central-1.amazonaws.com (frankfurt-school

In [None]:
import pandas as pd 
import numpy as np

In [None]:
!pip install seglearn

Collecting seglearn
  Downloading seglearn-1.2.3-py3-none-any.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 4.3 MB/s 
Installing collected packages: seglearn
Successfully installed seglearn-1.2.3


In [None]:
def load_data(num):
    """Read the training and validation CSV files for one window size.

    Parameters
    ----------
    num
        Value inserted into the ``window_size_<num>`` part of the file names.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)``, each the result of
        ``pd.read_csv`` on the matching file in the current working directory.
    """
    # All four files share the same ending; only the leading stem differs.
    shared_suffix = "_window_size_{}_time_encoding_True.csv".format(num)
    stems = ("X_train", "y_train", "X_valid", "y_valid")
    return tuple(pd.read_csv(stem + shared_suffix) for stem in stems)

In [None]:
def evaluate_model(model, X_valid, y_valid):
    """Compute validation errors for a fitted model.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X_valid
        Predictors handed to ``model.predict``.
    y_valid
        Ground-truth targets compared against the predictions.

    Returns
    -------
    tuple
        ``(mse, mae)``: mean squared error and mean absolute error of the
        predictions, in the units of ``y_valid`` as passed in.
    """
    # Imported here so the function does not depend on an earlier import cell.
    from sklearn.metrics import mean_squared_error, mean_absolute_error

    predictions = model.predict(X_valid)
    mse = mean_squared_error(y_valid, predictions)
    mae = mean_absolute_error(y_valid, predictions)
    return mse, mae

In [None]:
def display_history(history, name="None"):
    """Plot training and validation loss per epoch, save the figure, and show it.

    Parameters
    ----------
    history
        Object whose ``history`` attribute is a mapping with the keys
        ``'loss'`` and ``'val_loss'``, each a sequence with one value per epoch.
    name : str
        Label used in the plot title and in the output file name.

    Side effects
    ------------
    Writes ``Loss_for_<name>.png`` to the current working directory and
    displays the figure.
    """
    import matplotlib.pyplot as plt

    title = "Loss_for_{}".format(name)

    # Draw on the explicit Axes so the plot and the saved figure are the same object.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(history.history['loss'])
    ax.plot(history.history['val_loss'])
    ax.set_title(title)
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid'], loc='upper left')

    # Save before show(): some backends release the figure once it is shown.
    fig.savefig(title + '.png')
    plt.show()

In [None]:
# Hyperparameter search for a random-forest regressor on the window-size-15 data.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

# Fixed seed so repeated runs build the same forests.
RForregCV = RandomForestRegressor(random_state=10)

# 4 x 4 x 2 = 32 candidate settings.
# max_features=1.0 means "consider all features at every split", which is what
# 'auto' meant for regressors; 'auto' is deprecated/removed in newer scikit-learn,
# while the float form is accepted by old and new releases alike.
param_grid = {'n_estimators': [100, 200, 300, 400],
              'max_depth': [2, 4, 8, 16],
              'max_features': [1.0, 'log2']
              }

X_train, y_train, X_valid, y_valid = load_data(15)
# Flatten the target to 1-D, one entry per row of the loaded y_train frame.
y_train = y_train.values.reshape(y_train.shape[0],)

# TimeSeriesSplit is used as the CV splitter, here with two splits.
my_cv = TimeSeriesSplit(n_splits=2)
CV_rfmodel = GridSearchCV(estimator=RForregCV, param_grid=param_grid, cv=my_cv,
                          scoring='neg_mean_squared_error', verbose=3)
CV_rfmodel.fit(X_train, y_train)

Fitting 2 folds for each of 32 candidates, totalling 64 fits
[CV] max_depth=2, max_features=auto, n_estimators=100 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=2, max_features=auto, n_estimators=100, score=-24.191, total=   7.0s
[CV] max_depth=2, max_features=auto, n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  max_depth=2, max_features=auto, n_estimators=100, score=-18.700, total=  13.7s
[CV] max_depth=2, max_features=auto, n_estimators=200 ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   20.7s remaining:    0.0s


[CV]  max_depth=2, max_features=auto, n_estimators=200, score=-23.770, total=  13.7s
[CV] max_depth=2, max_features=auto, n_estimators=200 ................
[CV]  max_depth=2, max_features=auto, n_estimators=200, score=-18.685, total=  27.2s
[CV] max_depth=2, max_features=auto, n_estimators=300 ................
[CV]  max_depth=2, max_features=auto, n_estimators=300, score=-23.829, total=  20.6s
[CV] max_depth=2, max_features=auto, n_estimators=300 ................
[CV]  max_depth=2, max_features=auto, n_estimators=300, score=-18.671, total=  41.0s
[CV] max_depth=2, max_features=auto, n_estimators=400 ................
[CV]  max_depth=2, max_features=auto, n_estimators=400, score=-23.763, total=  27.5s
[CV] max_depth=2, max_features=auto, n_estimators=400 ................
[CV]  max_depth=2, max_features=auto, n_estimators=400, score=-18.688, total=  54.7s
[CV] max_depth=2, max_features=log2, n_estimators=100 ................
[CV]  max_depth=2, max_features=log2, n_estimators=100, score=-2

[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed: 53.1min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=2),
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=10,
                                             verbose

In [None]:
# Score the fitted grid search on the validation split; evaluate_model returns (mse, mae).
evaluate_model(CV_rfmodel, X_valid, y_valid)

(50.265457989235244, 3.482849689906972)

In [None]:
# Show the parameter combination the grid search selected as best.
CV_rfmodel.best_params_

{'max_depth': 2, 'max_features': 'log2', 'n_estimators': 200}