In [53]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from model.loss_functions.RMSELoss import RMSELoss
from utils.dataset_utils import DatasetUtils

In [54]:
# Notebook-wide constants.
# Seed forwarded to train_test_split when the test slice is carved out.
RANDOM_STATE = 42
# Date window handed to DatasetUtils.build_arpa_dataset when the ARPA data is loaded.
START_DATE_BOARD = '2022-11-03'
END_DATE_BOARD = '2023-06-15'

## Benchmark Loss
The goal of this notebook is to compute the same loss functions used for the three models (LSTM, ANN, VQR) on the same portion of the test set. The result is the benchmark that each model has to beat.

### LSTM model
Best hyperparams:
- optimizer: ADAM
- criterion: L1
- learning rate: 0.001
- num epochs: 450
- hidden size: 600
- T: 3
- train size: 70%

Final loss on test set: 2.7362

### ANN model
Best hyperparams:
- optimizer: SGD
- criterion: L1
- learning rate: 0.0001
- num epochs: 200
- hidden size: 60
- hidden size layer 2: ?
- hidden size layer 3: ?
- train size: 70%

Final loss on test set: 5.3008

### VQR non-linear model
Best hyperparams:
- optimizer: 
- criterion: 
- learning rate: 
- num epochs: 
- num layers: 
- train size: 70%

Final loss on test set:

In [58]:
# Benchmark configuration: the split fraction and the three criteria evaluated below.
# NOTE(review): the model summaries above list a 70% train size while this cell uses 0.75,
# so the test slice here may differ from the one the models were scored on -- confirm.
TRAIN_SIZE = 0.75
CRITERION_L1 = nn.L1Loss()
CRITERION_MSE = nn.MSELoss()
CRITERION_RMSE = RMSELoss()  # project-defined loss imported from model.loss_functions.RMSELoss

#### Build the same dataset

In [59]:
# Sensor time series: parse the timestamps and move them one hour forward.
df_sensors = pd.read_csv('../resources/dataset/unique_timeseries_by_median_hours.csv')
df_sensors['timestamp'] = pd.to_datetime(df_sensors['timestamp']) + pd.Timedelta(hours=1)

# ARPA data built by the project helper from two CSV exports, limited to the board's date window.
df_arpa = DatasetUtils.build_arpa_dataset(
    '../resources/dataset/arpa/Dati PM10_PM2.5_2020-2022.csv',
    '../resources/dataset/arpa/Torino-Rubino_Polveri-sottili_2023-01-01_2023-06-30.csv',
    START_DATE_BOARD,
    END_DATE_BOARD,
)

# Inner-join the two sources on the timestamp and give the columns short names:
# "data" becomes x and "pm25" becomes y.
df = (
    df_sensors
    .merge(df_arpa, on='timestamp')
    .rename(columns={"data": "x", "pm25": "y"})
)

# Slide ARPA data 1 hour plus (the helper also receives the first x value).
df['y'] = DatasetUtils.slide_plus_1hours(df['y'], df.loc[0, 'x'])

X = df['x'].to_numpy()
y = df['y'].to_numpy()

# shuffle=False keeps the rows in their existing order, so the test arrays are the
# trailing (1 - TRAIN_SIZE) part of the data; scikit-learn does not use random_state
# when shuffling is disabled.
_, X_test, _, y_test = train_test_split(
    X,
    y,
    train_size=TRAIN_SIZE,
    shuffle=False,
    random_state=RANDOM_STATE,
)

#### Compute the loss

In [60]:
# L1 loss of X_test against y_test, both cast to float32 tensors first.
CRITERION_L1(
    *(torch.from_numpy(values.astype(np.float32)) for values in (X_test, y_test))
).item()

4.701448917388916

In [61]:
# RMSELoss (project-defined criterion) of X_test against y_test, both cast to float32 tensors first.
CRITERION_RMSE(
    *(torch.from_numpy(values.astype(np.float32)) for values in (X_test, y_test))
).item()

5.823546409606934

In [62]:
# MSE loss of X_test against y_test, both cast to float32 tensors first.
CRITERION_MSE(
    *(torch.from_numpy(values.astype(np.float32)) for values in (X_test, y_test))
).item()

33.913692474365234