In [20]:
import pandas as pd 
import numpy as np

In [None]:
!pip install seglearn

In [None]:
# Demonstration data used in this excercise is already preprocessed and split into
# training, validation, and test sets

# Use wget to download the data stored in csv format.
import itertools

# Define what files to download; download all of the preprocessed data
# Note that the data are already split into Train, Validation, and Test sets.
# The predictor data are denoted with 'X', the target by 'y'
data_download = {}
data_download["window_size"] = [5, 15]
data_download["data_type"] = ["train", "valid", "test"]
data_download["predictor_or_target"] = ["X", "y"]

# Prepare the combinations of the window sizes and the data types
keys, values = zip(*data_download.items())
data_download_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]
display('The kind of data to be downloaded:', data_download_combinations)

print("Downloading started...")
for data_download_param in data_download_combinations:
  file_to_download = "https://frankfurt-school-dataset.s3.eu-central-1.amazonaws.com/Sept2021/window_size_{0}_time_encoding_True/{1}_{2}_window_size_{0}_time_encoding_True.csv"\
                     .format(data_download_param["window_size"], data_download_param["predictor_or_target"], data_download_param["data_type"])

  # the actual downloading
  !wget "$file_to_download"

print("Downloading has finished")


In [21]:
def load_data(num, data_dir="."):
    """Load the training and validation CSV files for one window size.

    Parameters
    ----------
    num : int
        Window size that appears in the file names
        (``..._window_size_<num>_time_encoding_True.csv``).
    data_dir : str or path-like, default "."
        Directory that holds the CSV files. The default is the current
        working directory, which is where the original implementation
        always looked.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_valid, y_valid)``, in that order.

    Raises
    ------
    FileNotFoundError
        If one of the four expected files is missing from ``data_dir``.
    """
    import os

    def _read(kind, split):
        # kind is "X" (predictors) or "y" (target); split is "train" or "valid".
        file_name = "{}_{}_window_size_{}_time_encoding_True.csv".format(kind, split, num)
        return pd.read_csv(os.path.join(data_dir, file_name))

    X_train = _read("X", "train")
    y_train = _read("y", "train")

    X_valid = _read("X", "valid")
    y_valid = _read("y", "valid")

    return X_train, y_train, X_valid, y_valid

In [22]:
def evaluate_model(model, X_valid, y_valid):
    """Score a fitted model on the validation data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_valid : array-like
        Validation predictors, passed unchanged to ``model.predict``.
    y_valid : array-like
        True target values the predictions are compared against.

    Returns
    -------
    tuple
        ``(mse, mae)``: the mean squared error and the mean absolute error
        of the predictions, as computed by ``sklearn.metrics``.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    predictions = model.predict(X_valid)
    mse = mean_squared_error(y_valid, predictions)
    mae = mean_absolute_error(y_valid, predictions)
    return mse, mae

In [23]:
def display_history(history, name="None"):
    """Plot training and validation loss and save the figure as a PNG.

    Parameters
    ----------
    history : object
        Object whose ``history`` attribute is a mapping with ``'loss'`` and
        ``'val_loss'`` sequences. Each sequence is plotted against its index,
        which is labelled as the epoch.
        NOTE(review): presumably a Keras ``History`` object; confirm with callers.
    name : str, default "None"
        Label used in the plot title and in the output file name.

    Side effects
    ------------
    Writes ``Loss_for_<name>.png`` to the current working directory and then
    shows the figure.
    """
    import matplotlib.pyplot as plt

    title = "Loss_for_{}".format(name)

    # Draw on the explicit Axes object instead of the implicit pyplot state so
    # the curves are guaranteed to land on the figure that is saved below.
    fig, ax = plt.subplots(ncols=1, figsize=(10, 6))
    ax.plot(history.history['loss'])
    ax.plot(history.history['val_loss'])
    ax.set_title(title)
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train', 'valid'], loc='upper left')

    fig.savefig(title + '.png')
    plt.show()

In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor



def get_models(random_state=None):
    """Build the linear regressors compared in this notebook.

    HuberRegressor, Lars and LassoLars are imported above but are currently
    left out of the comparison.

    Parameters
    ----------
    random_state : int or None, default None
        Seed forwarded to PassiveAggressiveRegressor, RANSACRegressor and
        SGDRegressor. Pass an integer to make repeated runs comparable; the
        default ``None`` leaves the estimators unseeded, as before.

    Returns
    -------
    dict
        Maps a display name to a newly constructed, unfitted estimator.
        Insertion order is the order in which the models are evaluated.
    """
    return {
        'LinearRegression': LinearRegression(),
        'LassoRegression': Lasso(),
        'RidgeRegression': Ridge(),
        'ElasticNet': ElasticNet(),
        'PassiveAggresive_Reg': PassiveAggressiveRegressor(
            max_iter=1000, tol=1e-3, random_state=random_state
        ),
        'RANSACRegressor': RANSACRegressor(random_state=random_state),
        'StochasticGD': SGDRegressor(
            loss='epsilon_insensitive',
            max_iter=1000,
            tol=1e-20,
            shuffle=False,
            learning_rate="adaptive",
            random_state=random_state,
        ),
    }

In [74]:
# Instantiate the estimators once; the training cell iterates over this dict.
models = get_models()

In [77]:
# Fit every model on each window size and collect its validation errors.
# The estimator instances in `models` are reused, so each one is refit for
# every window size in turn.
time_window = [5,15]
report_rows = []
for i in time_window:
    X_train, y_train, X_valid, y_valid = load_data(i)
    print("This is for time_window{}".format(i))
    for name, model in models.items():
        # The target frame is reshaped to a 1-D array of length n_samples.
        model.fit(X_train.values, y_train.values.reshape(y_train.shape[0],))
        # Predict on a plain array as well: the model was fitted on .values,
        # and predicting on a DataFrame makes scikit-learn warn that X has
        # feature names the model was fitted without.
        mse_error, mae_error = evaluate_model(model, X_valid.values, y_valid)
        report_rows.append([i, name, mse_error, mae_error])
        print("MSE on valid for {}: {}".format(name,mse_error))
        print("MAE on valid for {}: {}".format(name,mae_error))

# Build the report in one go instead of growing a DataFrame row by row.
report = pd.DataFrame(report_rows, columns = ['Time_window','model','Val_ms_error', 'Val_ma_error'])

This is for time_window5
MSE on valid for lnearRegression: 25.869144008632595
MAE on valid for lnearRegression: 2.561283916269671
MSE on valid for LassoRegression: 25.70524861259394
MAE on valid for LassoRegression: 2.5579560701403414
MSE on valid for RidgeRegression: 25.869148246942526
MAE on valid for RidgeRegression: 2.5612839624336843
MSE on valid for ElasticNet: 25.70736870336591
MAE on valid for ElasticNet: 2.5547878254146643
MSE on valid for PassiveAggresive_Reg: 56.22499921193684
MAE on valid for PassiveAggresive_Reg: 6.022573349498216
MSE on valid for RansacRegressor: 46.322024255706566
MAE on valid for RansacRegressor: 4.114385447761471
MSE on valid for StochasticGD: 35.273249454683175
MAE on valid for StochasticGD: 3.8795094018998144
This is for time_window15
MSE on valid for lnearRegression: 52.1561138002538
MAE on valid for lnearRegression: 3.5514248473560954
MSE on valid for LassoRegression: 49.921083381083086
MAE on valid for LassoRegression: 3.470904162530835
MSE on val

In [78]:
# Display the validation-error table built by the training loop.
report

Unnamed: 0,Time_window,model,Val_ms_error,Val_ma_error
0,5,lnearRegression,25.869144,2.561284
1,5,LassoRegression,25.705249,2.557956
2,5,RidgeRegression,25.869148,2.561284
3,5,ElasticNet,25.707369,2.554788
4,5,PassiveAggresive_Reg,56.224999,6.022573
5,5,RansacRegressor,46.322024,4.114385
6,5,StochasticGD,35.273249,3.879509
7,15,lnearRegression,52.156114,3.551425
8,15,LassoRegression,49.921083,3.470904
9,15,RidgeRegression,52.155991,3.551419
