# Sequence-to-Sequence Model Architecture

This Seq2Seq model has two main approaches for time series forecasting:

1. **RNN/LSTM/GRU Decoder + Linear**:
   - Traditional seq2seq approach
   - Decoder processes predictions sequentially
   - Each prediction can influence the next one
   - Uses teacher forcing during training
   ```python
   decoder_output, h = decoder(input, h)  # RNN/LSTM/GRU processing
   prediction = linear(decoder_output)     # Final linear layer for prediction
   ```

2. **FC (Fully Connected) Decoder**:
   - Simplified approach that predicts all future values at once
   - Two modes:
       - `use_all_hidden=True`: Uses all encoder hidden states
       - `use_all_hidden=False`: Uses only the last hidden state
   ```python
   # Option A: Use all hidden states
   predictions = linear(flatten(all_hidden_states))

   # Option B: Use only last hidden state
   predictions = linear(last_hidden_state)
   ```

**Key Differences**:
- RNN Decoder: Generates predictions sequentially (one step at a time), which is better for capturing temporal dependencies between forecast steps
- FC Decoder: Faster; predicts the whole horizon in a single forward pass. Simpler, but might miss some temporal patterns

**Architecture Flow**:
```
Input sequence -> Encoder -> Hidden States -> Decoder (RNN or FC) -> Predictions
```

The model allows flexibility in choosing the encoder (RNN/LSTM/GRU) and decoder type based on the specific requirements of the forecasting task.

In [1]:
%cd ..

/Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python


In [2]:
import os
import shutil
import joblib

import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import asdict

import torch
from src.dl.dataloaders import TimeSeriesDataModule
from src.dl.models import SingleStepRNNConfig, SingleStepRNNModel
from src.dl.models import RNNConfig, Seq2SeqConfig, Seq2SeqModel
import pytorch_lightning as pl

# For reproducibility, set a random seed
pl.seed_everything(42)

from src.utils import plotting_utils
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

from src.forecasting.ml_forecasting import (
    MissingValueConfig,
    calculate_metrics,
)

%load_ext autoreload
%autoreload 2

np.random.seed(42)

from tqdm.autonotebook import tqdm
tqdm.pandas()

Global seed set to 42


In [3]:
from itertools import cycle
import plotly.graph_objects as go

def plot_forecast(pred_df, forecast_columns, forecast_display_names=None):
    """Plot actual energy consumption together with one or more forecasts.

    Only rows where the first forecast column is non-null are drawn, so the
    figure is restricted to the period that actually has predictions.

    Args:
        pred_df: DataFrame holding an ``energy_consumption`` column and the
            forecast columns; its index is used as the x-axis.
        forecast_columns: Names of the forecast columns to draw.
        forecast_display_names: Legend labels, paired positionally with
            ``forecast_columns``. Falls back to the column names when
            omitted or empty.

    Returns:
        A ``plotly.graph_objects.Figure`` with one solid trace for the
        actuals and one dotted trace per forecast.
    """
    mask = ~pred_df[forecast_columns[0]].isnull()
    plot_df = pred_df[mask]

    # "<alpha>" is a placeholder that is substituted per trace below.
    colors = [
        f"rgba({','.join(map(str, plotting_utils.hex_to_rgb(c)))},<alpha>)"
        for c in px.colors.qualitative.Plotly
    ]

    fig = go.Figure()
    # Add actual values
    fig.add_trace(go.Scatter(
        x=plot_df.index,
        y=plot_df.energy_consumption,
        mode="lines",
        line=dict(color=colors[0].replace("<alpha>", "0.9")),
        name="Actual Consumption",
    ))

    # Build the colour cycle once, outside the loop. Creating a new cycle on
    # every iteration would always yield its first element and give every
    # forecast the same colour.
    forecast_colors = cycle(colors[1:])

    # Add forecasts
    for col, name in zip(forecast_columns, forecast_display_names or forecast_columns):
        fig.add_trace(go.Scatter(
            x=plot_df.index,
            y=plot_df[col],
            mode="lines",
            line=dict(dash="dot", color=next(forecast_colors).replace("<alpha>", "1")),
            name=name,
        ))
    return fig

def highlight_abs_min(s, props=''):
    """Return per-element styles marking the value(s) closest to zero.

    Intended as a column-wise function for ``DataFrame.style.apply``.

    Args:
        s: pandas Series of numeric values (NaNs are ignored when looking
            for the minimum).
        props: CSS string assigned to the element(s) whose absolute value
            is the smallest; every other element gets ``''``.

    Returns:
        numpy array of strings with the same length as ``s``.
    """
    abs_values = np.abs(s.values)
    if abs_values.size == 0:
        # np.nanmin raises on empty input; there is nothing to highlight.
        return np.array([], dtype=str)
    # Compare absolute values on both sides so that a negative value closest
    # to zero (e.g. a small negative bias) is highlighted as well.
    return np.where(abs_values == np.nanmin(abs_values), props, '')

def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's standard layout to a Plotly figure.

    Args:
        fig: Figure to format; it is modified in place.
        legends: Optional trace names, assigned to the traces in order and
            repeated from the start if there are more traces than names.
        xlabel: X-axis title.
        ylabel: Y-axis title.
        title: Figure title, centred at the top.
        font_size: Font size for the legend, axis titles and tick labels.

    Returns:
        The same ``fig`` object, to allow chaining.
    """
    if legends:
        # Create the cycle once; a fresh cycle per trace would assign the
        # first legend name to every trace.
        legend_names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(legend_names)))

    fig.update_layout(
        width=900, height=500,
        title=dict(text=title, x=0.5, xanchor="center", yanchor="top", font_size=20),
        legend=dict(orientation="h", y=0.98, x=1, xanchor="right", yanchor="bottom",
                    font_size=font_size),
        # `titlefont` is deprecated in Plotly in favour of `title.font`.
        xaxis=dict(title=dict(text=xlabel, font=dict(size=font_size)), tickfont_size=font_size),
        yaxis=dict(title=dict(text=ylabel, font=dict(size=font_size)), tickfont_size=font_size),
    )
    return fig

In [4]:
# NOTE(review): intended to force a simpler progress bar format; confirm that the
# installed PyTorch / Lightning version actually reads this variable.
os.environ['PYTORCH_DISABLE_RICH'] = '1'  # Force simpler progress bar format

# Relative paths: they resolve against the working directory set by `%cd ..`.
preprocessed = Path("data/london_smart_meters/preprocessed")
output = Path("data/london_smart_meters/output")

In [5]:
# Reading the missing-value-imputed, feature-engineered training split
train_df = pd.read_parquet(preprocessed/"selected_blocks_train_missing_imputed_feature_engg.parquet")

# The validation split is loaded under the name test_df because it is the set we predict on
test_df = pd.read_parquet(preprocessed/"selected_blocks_val_missing_imputed_feature_engg.parquet")

In [6]:
# Column to forecast and the (household id, time) pair that identifies a row
target = "energy_consumption"
index_cols = ["LCLid", "timestamp"]

# Setting the indices; drop=False keeps LCLid and timestamp available as regular columns too
train_df.set_index(index_cols, inplace=True, drop=False)
test_df.set_index(index_cols, inplace=True, drop=False)

## Selecting the sample data and metrics

In [7]:
# Cross-section for a single household: select LCLid "MAC000193" and drop that index level
sample_train_df = train_df.xs("MAC000193")
sample_test_df = test_df.xs("MAC000193")

# Creating a pred_df with actuals of training and test
# NOTE: pred_df is reassigned further down from the saved single-step RNN predictions.
pred_df = pd.concat([sample_train_df[[target]], sample_test_df[[target]]])

In [8]:
# Hold out December 2013 of the training data as the validation set.
# Explicit copies are taken because each frame is a slice of a larger one;
# assigning a new column onto such a slice triggers pandas'
# SettingWithCopyWarning.
sample_val_df = sample_train_df.loc["2013-12"].copy()
sample_train_df = sample_train_df.loc[:"2013-11"].copy()
sample_test_df = sample_test_df.copy()

# Tag every row with the split it belongs to
sample_train_df['type'] = "train"
sample_val_df['type'] = "val"
sample_test_df['type'] = "test"

# One frame with the target and its split label, in chronological order
sample_df = pd.concat([sample_train_df[[target, "type"]], sample_val_df[[target, "type"]], sample_test_df[[target, "type"]]])
sample_df.head()

Unnamed: 0_level_0,energy_consumption,type
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01 00:00:00,0.368,train
2012-01-01 00:30:00,0.386,train
2012-01-01 01:00:00,0.17,train
2012-01-01 01:30:00,0.021,train
2012-01-01 02:00:00,0.038,train


### Loading the Forecast and metrics from Single Step RNN

In [9]:

# Forecasts and metric records for the same household, presumably saved by the
# single-step RNN notebook; results from this notebook are appended to both.
# NOTE: both files are pickles -- only load artifacts from a trusted source.
pred_df = pd.read_pickle(output/"dl_single_step_prediction_val_df_MAC000193.pkl")
metric_record = joblib.load(output/"dl_single_step_metrics_val_df_MAC000193.pkl")

### Creating the datamodule which splits and formats the data into windows

In [10]:
HORIZON = 1  # forecast one step ahead
WINDOW = 48  # input history length; 48 half-hourly readings = one day

In [11]:
# Windowed dataset over the target column only (univariate); the last n_val
# and n_test rows are reserved for validation and test.
datamodule = TimeSeriesDataModule(
        data = sample_df[[target]],
        n_val = sample_val_df.shape[0],
        n_test = sample_test_df.shape[0],
        window = WINDOW,            # giving enough memory to capture daily seasonality
        horizon = HORIZON,          # single step
        normalize = "global",       # normalizing the data
        batch_size = 32,
        # Optional loader settings, currently disabled (presumably forwarded
        # to the DataLoader -- confirm against TimeSeriesDataModule):
        # num_workers=0,
        # prefetch_factor=None,
        # persistent_workers=True
)

datamodule.setup()

## One-Step Prediction

### LSTM-FC Seq2Seq

In [12]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + fully connected decoder that uses only the last hidden state
rnn2fc_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="FC",
    encoder_params=encoder_config_dict,
    decoder_params={"window_size": WINDOW, "horizon":HORIZON},
    decoder_use_all_hidden=False,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2fc_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)
# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | Linear  | 257   
2 | loss    | MSELoss | 0     
------------------------------------
924 K     Trainable params
0         Non-trainable params
924 K     Total params
3.700     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

#### Prediction

In [13]:
# Column / metric label for this model variant (here "LSTM_FC_last_hidden")
tag = f"{rnn2fc_config.encoder_type}_{rnn2fc_config.decoder_type}_{'all_hidden' if rnn2fc_config.decoder_use_all_hidden else 'last_hidden'}"
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
# NOTE(review): pred[0] is a full forecast series only if the de-normalisation
# above broadcast pred to 2-D (i.e. train.std / train.mean carry a leading
# axis). If they are scalars, pred[0] is a single value repeated over the
# whole index -- confirm against TimeSeriesDataModule.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)

# Arguments: actuals, forecast, label, and the train+val history
# (presumably used to scale MASE -- confirm in calculate_metrics)
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[tag],
    tag,
    pd.concat([sample_train_df[target],sample_val_df[target]])
)
metric_record.append(metrics)

  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [14]:
# Render all metric records collected so far with four-decimal formatting
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", "MSE": "{:.4f}", "MASE": "{:.4f}"})

# Highlight the lowest (best) value in each metric column
formatted.highlight_min(color='green', subset=["MAE", "MSE", "MASE"])

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431


In [15]:
# Plot the latest forecast against the actuals, with its metrics in the title
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])
fig = format_plot(fig, title=f"MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}")
# Zoom into the first week of January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

### LSTM-FC Seq2Seq use all hidden

In [16]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + fully connected decoder that uses all encoder hidden states
rnn2fc_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="FC",
    encoder_params=encoder_config_dict,
    decoder_params={"window_size": WINDOW, "horizon":HORIZON},
    decoder_use_all_hidden=True,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2fc_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)
# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | Linear  | 12.3 K
2 | loss    | MSELoss | 0     
------------------------------------
936 K     Trainable params
0         Non-trainable params
936 K     Total params
3.748     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

#### Prediction

In [17]:
# Column / metric label for this model variant (here "LSTM_FC_all_hidden")
tag = f"{rnn2fc_config.encoder_type}_{rnn2fc_config.decoder_type}_{'all_hidden' if rnn2fc_config.decoder_use_all_hidden else 'last_hidden'}"
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
# NOTE(review): pred[0] is a full forecast series only if the de-normalisation
# above broadcast pred to 2-D (i.e. train.std / train.mean carry a leading
# axis). If they are scalars, pred[0] is a single value repeated over the
# whole index -- confirm against TimeSeriesDataModule.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)

# Arguments: actuals, forecast, label, and the train+val history
# (presumably used to scale MASE -- confirm in calculate_metrics)
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[tag],
    tag,
    pd.concat([sample_train_df[target],sample_val_df[target]])
)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [18]:
# Render all metric records collected so far with four-decimal formatting
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", "MSE": "{:.4f}", "MASE": "{:.4f}"})

# Highlight the lowest (best) value in each metric column
formatted.highlight_min(color='green', subset=["MAE", "MSE", "MASE"])

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121


In [19]:
# Plot the latest forecast against the actuals, with its metrics in the title
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])
fig = format_plot(fig, title=f"MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}")
# Zoom into the first week of January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

### LSTM-LSTM Seq2Seq

In [20]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + LSTM decoder; the decoder reuses the encoder's settings.
# teacher_forcing_ratio is not passed, so the Seq2SeqConfig default applies.
rnn2rnn_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="LSTM",
    encoder_params=encoder_config_dict,
    decoder_params=encoder_config_dict,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2rnn_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)

# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | LSTM    | 924 K 
2 | fc      | Linear  | 257   
3 | loss    | MSELoss | 0     
------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.398     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

#### Prediction

In [21]:
# Column / metric label for this model variant (here "LSTM_LSTM")
tag = f"{rnn2rnn_config.encoder_type}_{rnn2rnn_config.decoder_type}"
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
# NOTE(review): pred[0] is a full forecast series only if the de-normalisation
# above broadcast pred to 2-D (i.e. train.std / train.mean carry a leading
# axis). If they are scalars, pred[0] is a single value repeated over the
# whole index -- confirm against TimeSeriesDataModule.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)

# Arguments: actuals, forecast, label, and the train+val history
# (presumably used to scale MASE -- confirm in calculate_metrics)
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[tag],
    tag,
    pd.concat([sample_train_df[target],sample_val_df[target]])
)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [22]:
# Render all metric records collected so far with four-decimal formatting
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", "MSE": "{:.4f}", "MASE": "{:.4f}"})

# Highlight the lowest (best) value in each metric column
formatted.highlight_min(color='green', subset=["MAE", "MSE", "MASE"])

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121
8,LSTM_LSTM,0.1622,0.0778,1.2642


In [23]:
# Plot the latest forecast against the actuals, with its metrics in the title
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])
fig = format_plot(fig, title=f"MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}")
# Zoom into the first week of January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

#### Plot of all three models, zoomed in on a single day

In [24]:
# Overlay the three single-step Seq2Seq forecasts on the actuals
seq2seq_columns = ["LSTM_FC_last_hidden", "LSTM_FC_all_hidden", "LSTM_LSTM"]
fig = plot_forecast(
    pred_df,
    forecast_columns=seq2seq_columns,
    forecast_display_names=list(seq2seq_columns),
)
fig = format_plot(fig, title="Single Step Seq2Seq Models (One Day)")
# Zoom into a single day
fig.update_xaxes(type="date", range=["2014-01-03", "2014-01-04"])
# Give two of the forecasts their own dash pattern so the lines can be told apart
for trace_name, dash_style in (("LSTM_FC_all_hidden", "dash"), ("LSTM_LSTM", "dashdot")):
    fig.update_traces(line=dict(dash=dash_style), selector=dict(name=trace_name))
fig.show()

## Multi-Step Prediction

### Creating the datamodule which splits and formats the data into windows

In [25]:
HORIZON = 48  # forecast 48 half-hourly steps (one day) ahead
WINDOW = 48 * 2  # two days of history as input

In [26]:
# Rebuild the windowed dataset for the multi-step setting: same univariate
# series and splits as before, with the new WINDOW / HORIZON values.
datamodule = TimeSeriesDataModule(
    data = sample_df[[target]],
    n_val = sample_val_df.shape[0],
    n_test = sample_test_df.shape[0],
    window = WINDOW, 
    horizon = HORIZON,
    normalize = "global", # normalizing the data
    batch_size = 32,
    num_workers = 0
)

datamodule.setup()

### LSTM-FC Seq2Seq Last Hidden

In [27]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + fully connected decoder that uses only the last hidden state;
# the decoder emits all HORIZON steps at once.
rnn2fc_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="FC",
    encoder_params=encoder_config_dict,
    decoder_params={"window_size": WINDOW, "horizon":HORIZON},
    decoder_use_all_hidden=False,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2fc_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)
# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | Linear  | 12.3 K
2 | loss    | MSELoss | 0     
------------------------------------
937 K     Trainable params
0         Non-trainable params
937 K     Total params
3.748     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [28]:

# Column / metric label for this model variant (here "MultiStep LSTM_FC_last_hidden")
tag = f"MultiStep {rnn2fc_config.encoder_type}_{rnn2fc_config.decoder_type}_{'all_hidden' if rnn2fc_config.decoder_use_all_hidden else 'last_hidden'}"
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Selecting forward predictions of HORIZON timesteps, every HORIZON timesteps and flattening it.
# The stride uses HORIZON rather than a literal 48 so it stays in step with the datamodule.
pred = pred[0::HORIZON].ravel()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
# NOTE(review): pred[0] is a full forecast series only if the de-normalisation
# above broadcast pred to 2-D (i.e. train.std / train.mean carry a leading
# axis). If they are scalars, pred[0] is a single value repeated over the
# whole index -- confirm against TimeSeriesDataModule.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)
# Arguments: actuals, forecast, label, and the train+val history
# (presumably used to scale MASE -- confirm in calculate_metrics)
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[tag],
    tag,
    pd.concat([sample_train_df[target], sample_val_df[target]])
)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [29]:
# Render all metric records collected so far with four-decimal formatting
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", "MSE": "{:.4f}", "MASE": "{:.4f}"})

# Highlight the lowest (best) value in each metric column
formatted.highlight_min(color='green', subset=["MAE", "MSE", "MASE"])

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121
8,LSTM_LSTM,0.1622,0.0778,1.2642
9,MultiStep LSTM_FC_last_hidden,0.2055,0.1193,1.6014


In [30]:
# Plot the latest forecast against the actuals, with its metrics in the title
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])
fig = format_plot(fig, title=f"MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}")
# Zoom into the first week of January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

### LSTM-FC Seq2Seq All Hidden

In [31]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + fully connected decoder that uses all encoder hidden states;
# the decoder emits all HORIZON steps at once.
rnn2fc_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="FC",
    encoder_params=encoder_config_dict,
    decoder_params={"window_size": WINDOW, "horizon":HORIZON},
    decoder_use_all_hidden=True,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2fc_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)

# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | Linear  | 1.2 M 
2 | loss    | MSELoss | 0     
------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.417     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Prediction

In [32]:
# Column / metric label for this model variant (here "MultiStep LSTM_FC_all_hidden")
tag = f"MultiStep {rnn2fc_config.encoder_type}_{rnn2fc_config.decoder_type}_{'all_hidden' if rnn2fc_config.decoder_use_all_hidden else 'last_hidden'}"
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Selecting forward predictions of HORIZON timesteps, every HORIZON timesteps and flattening it.
# The stride uses HORIZON rather than a literal 48 so it stays in step with the datamodule.
pred = pred[0::HORIZON].ravel()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
# NOTE(review): pred[0] is a full forecast series only if the de-normalisation
# above broadcast pred to 2-D (i.e. train.std / train.mean carry a leading
# axis). If they are scalars, pred[0] is a single value repeated over the
# whole index -- confirm against TimeSeriesDataModule.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)
# Arguments: actuals, forecast, label, and the train+val history
# (presumably used to scale MASE -- confirm in calculate_metrics)
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[tag],
    tag,
    pd.concat([sample_train_df[target], sample_val_df[target]])
)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [33]:
# Render all metric records collected so far with four-decimal formatting
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", "MSE": "{:.4f}", "MASE": "{:.4f}"})

# Highlight the lowest (best) value in each metric column
formatted.highlight_min(color='green', subset=["MAE", "MSE", "MASE"])

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121
8,LSTM_LSTM,0.1622,0.0778,1.2642
9,MultiStep LSTM_FC_last_hidden,0.2055,0.1193,1.6014


In [34]:
# Plot the latest forecast against the actuals, with its metrics in the title
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])
fig = format_plot(fig, title=f"MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}")
# Zoom into the first week of January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

### LSTM-LSTM Seq2Seq Without Teacher Forcing

In [35]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + LSTM decoder (same settings), with teacher forcing switched off
rnn2rnn_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="LSTM",
    encoder_params=encoder_config_dict,
    decoder_params=encoder_config_dict,
    teacher_forcing_ratio=0.0,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2rnn_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)
# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | LSTM    | 924 K 
2 | fc      | Linear  | 257   
3 | loss    | MSELoss | 0     
------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.398     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Prediction

In [36]:
# Column / metric label for this model variant; includes the teacher forcing ratio
tag = f"MultiStep {rnn2rnn_config.encoder_type}_{rnn2rnn_config.decoder_type}_teacher_forcing_{rnn2rnn_config.teacher_forcing_ratio}"
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Selecting forward predictions of HORIZON timesteps, every HORIZON timesteps and flattening it.
# The stride uses HORIZON rather than a literal 48 so it stays in step with the datamodule.
pred = pred[0::HORIZON].ravel()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
# NOTE(review): pred[0] is a full forecast series only if the de-normalisation
# above broadcast pred to 2-D (i.e. train.std / train.mean carry a leading
# axis). If they are scalars, pred[0] is a single value repeated over the
# whole index -- confirm against TimeSeriesDataModule.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)
# Arguments: actuals, forecast, label, and the train+val history
# (presumably used to scale MASE -- confirm in calculate_metrics)
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[tag],
    tag,
    pd.concat([sample_train_df[target], sample_val_df[target]])
)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [37]:
# Render all metric records collected so far with four-decimal formatting
formatted = pd.DataFrame(metric_record).style.format({"MAE": "{:.4f}", "MSE": "{:.4f}", "MASE": "{:.4f}"})

# Highlight the lowest (best) value in each metric column
formatted.highlight_min(color='green', subset=["MAE", "MSE", "MASE"])

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121
8,LSTM_LSTM,0.1622,0.0778,1.2642
9,MultiStep LSTM_FC_last_hidden,0.2055,0.1193,1.6014


In [38]:
# Plot the latest forecast against the actuals, with its metrics in the title
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])
fig = format_plot(fig, title=f"MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}")
# Zoom into the first week of January 2014
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

### LSTM-LSTM Seq2Seq With Stochastic Teacher Forcing

In [39]:
# Encoder: 3-layer bidirectional recurrent network over a univariate input
encoder_config = RNNConfig(
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
)

# Seq2SeqConfig receives the encoder settings as a plain dict
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + LSTM decoder (same settings), with a teacher forcing ratio of 0.5
rnn2rnn_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="LSTM",
    encoder_params=encoder_config_dict,
    decoder_params=encoder_config_dict,
    teacher_forcing_ratio=0.5,
    learning_rate=1e-3,
)

model = Seq2SeqModel(rnn2rnn_config)

# CPU training for at most 100 epochs; early stopping ends the run once
# "valid_loss" has not improved for 3 consecutive validation checks.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)
trainer.fit(model, datamodule)
# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | LSTM    | 924 K 
2 | fc      | Linear  | 257   
3 | loss    | MSELoss | 0     
------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.398     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Prediction

In [40]:
# Column / algorithm name under which this model's forecast is stored.
tag = (
    f"MultiStep {rnn2rnn_config.encoder_type}_{rnn2rnn_config.decoder_type}"
    f"_teacher_forcing_{rnn2rnn_config.teacher_forcing_ratio}"
)

# trainer.predict returns a list of outputs, one per batch; concatenate them
# and convert to a NumPy array.
batch_outputs = trainer.predict(model, datamodule.test_dataloader())
stacked = torch.cat(batch_outputs).squeeze().detach().numpy()

# Keep one HORIZON-step forecast every HORIZON timesteps (stride hard-coded to
# 48) and flatten the kept forecasts into a single sequence.
every_horizon = stacked[::48].ravel()

# Undo the global normalization using the training-set statistics.
pred = every_horizon * datamodule.train.std + datamodule.train.mean

# NOTE(review): `pred[0]` selects the first element along axis 0. This is only
# the full forecast if the de-normalization above broadcasts to a leading axis
# of length 1; if `std`/`mean` are scalars it is a single value repeated over
# the whole index -- confirm the shape of datamodule.train.std / .mean.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)

# Score against the test target; the train + validation history is passed as
# the fourth argument to calculate_metrics.
history = pd.concat([sample_train_df[target], sample_val_df[target]])
metrics = calculate_metrics(sample_test_df[target], pred_df_[tag], tag, history)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [41]:
# Tabulate every metric recorded so far, one row per entry in metric_record,
# and render the three error metrics with four decimal places.
metric_columns = ["MAE", "MSE", "MASE"]
metrics_table = pd.DataFrame(metric_record)
formatted = metrics_table.style.format(
    {column: "{:.4f}" for column in metric_columns}
)

# Shade the smallest value in each metric column; as the last expression of the
# cell, the styled table is what the notebook displays.
formatted.highlight_min(color="green", subset=metric_columns)

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121
8,LSTM_LSTM,0.1622,0.0778,1.2642
9,MultiStep LSTM_FC_last_hidden,0.2055,0.1193,1.6014


In [42]:
# Plot the forecast column named by `tag`; `tag` and `metrics` are whatever the
# most recently executed prediction cell left in them.
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])

# Put the headline error metrics in the figure title.
metrics_title = " | ".join(
    f"{name}: {metrics[name]:.4f}" for name in ("MAE", "MSE", "MASE")
)
fig = format_plot(fig, title=metrics_title)

# Zoom the x-axis to a one-week window.
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

### LSTM-RNN Seq2Seq With Complete Teacher Forcing

In [43]:
# RNN hyperparameters; the same settings dict is handed to both the encoder
# and the decoder below.
encoder_config = RNNConfig(
    input_size=1, hidden_size=128, num_layers=3, bidirectional=True
)
encoder_config_dict = asdict(encoder_config)

# LSTM encoder + LSTM decoder with teacher_forcing_ratio=1.
# NOTE(review): presumably a ratio of 1 means the decoder is always fed the
# ground truth during training -- confirm against Seq2SeqModel.
rnn2rnn_config = Seq2SeqConfig(
    encoder_type="LSTM",
    decoder_type="LSTM",
    encoder_params=encoder_config_dict,
    decoder_params=encoder_config_dict,
    teacher_forcing_ratio=1,
    learning_rate=1e-3,
)
model = Seq2SeqModel(rnn2rnn_config)

# Train on CPU for at most 100 epochs, stopping early on the "valid_loss"
# metric with a patience of 3.
early_stopping = pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)
trainer = pl.Trainer(
    accelerator="cpu",
    devices=1,
    min_epochs=1,
    max_epochs=100,
    callbacks=[early_stopping],
)
trainer.fit(model, datamodule)

# Remove the artifacts created during training.
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

MPS available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='mps', devices=1)`.


  | Name    | Type    | Params
------------------------------------
0 | encoder | LSTM    | 924 K 
1 | decoder | LSTM    | 924 K 
2 | fc      | Linear  | 257   
3 | loss    | MSELoss | 0     
------------------------------------
1.8 M     Trainable params
0         Non-trainable params
1.8 M     Total params
7.398     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]


The dataloader, val_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Prediction

In [44]:
# Column / algorithm name under which this model's forecast is stored.
tag = (
    f"MultiStep {rnn2rnn_config.encoder_type}_{rnn2rnn_config.decoder_type}"
    f"_teacher_forcing_{rnn2rnn_config.teacher_forcing_ratio}"
)

# trainer.predict returns a list of outputs, one per batch; concatenate them
# and convert to a NumPy array.
batch_outputs = trainer.predict(model, datamodule.test_dataloader())
stacked = torch.cat(batch_outputs).squeeze().detach().numpy()

# Keep one HORIZON-step forecast every HORIZON timesteps (stride hard-coded to
# 48) and flatten the kept forecasts into a single sequence.
every_horizon = stacked[::48].ravel()

# Undo the global normalization using the training-set statistics.
pred = every_horizon * datamodule.train.std + datamodule.train.mean

# NOTE(review): `pred[0]` selects the first element along axis 0. This is only
# the full forecast if the de-normalization above broadcasts to a leading axis
# of length 1; if `std`/`mean` are scalars it is a single value repeated over
# the whole index -- confirm the shape of datamodule.train.std / .mean.
pred_df_ = pd.DataFrame({tag: pred[0]}, index=sample_test_df.index)
pred_df = pred_df.join(pred_df_)

# Score against the test target; the train + validation history is passed as
# the fourth argument to calculate_metrics.
history = pd.concat([sample_train_df[target], sample_val_df[target]])
metrics = calculate_metrics(sample_test_df[target], pred_df_[tag], tag, history)
metric_record.append(metrics)


The dataloader, predict_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Predicting: 0it [00:00, ?it/s]

In [45]:
# Tabulate every metric recorded so far, one row per entry in metric_record,
# and render the three error metrics with four decimal places.
metric_columns = ["MAE", "MSE", "MASE"]
metrics_table = pd.DataFrame(metric_record)
formatted = metrics_table.style.format(
    {column: "{:.4f}" for column in metric_columns}
)

# Shade the smallest value in each metric column; as the last expression of the
# cell, the styled table is what the notebook displays.
formatted.highlight_min(color="green", subset=metric_columns)

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418
6,LSTM_FC_last_hidden,0.1595,0.0806,1.2431
7,LSTM_FC_all_hidden,0.1684,0.0785,1.3121
8,LSTM_LSTM,0.1622,0.0778,1.2642
9,MultiStep LSTM_FC_last_hidden,0.2055,0.1193,1.6014


In [46]:
# Plot the forecast column named by `tag`; `tag` and `metrics` are whatever the
# most recently executed prediction cell left in them.
fig = plot_forecast(pred_df, forecast_columns=[tag], forecast_display_names=[tag])

# Put the headline error metrics in the figure title.
metrics_title = " | ".join(
    f"{name}: {metrics[name]:.4f}" for name in ("MAE", "MSE", "MASE")
)
fig = format_plot(fig, title=metrics_title)

# Zoom the x-axis to a one-week window.
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.show()

#### Plot of the five models, zoomed into a day

In [47]:
# Forecast columns to overlay; each is displayed under its own column name.
multi_step_models = [
    "MultiStep LSTM_FC_last_hidden",
    "MultiStep LSTM_FC_all_hidden",
    "MultiStep LSTM_LSTM_teacher_forcing_0.0",
    "MultiStep LSTM_LSTM_teacher_forcing_0.5",
    "MultiStep LSTM_LSTM_teacher_forcing_1",
]
fig = plot_forecast(
    pred_df,
    forecast_columns=multi_step_models,
    # Pass a separate list object, as the two arguments are independent lists.
    forecast_display_names=list(multi_step_models),
)
fig = format_plot(fig, title="Multi-Step Seq2Seq Models (One Day)")

# Zoom the x-axis to a single day.
fig.update_xaxes(type="date", range=["2014-01-11", "2014-01-12"])

# Give each model a distinct dash pattern so overlapping lines can be told
# apart; the first model gets no override and keeps its default line style.
dash_by_trace = {
    "MultiStep LSTM_FC_all_hidden": "dash",
    "MultiStep LSTM_LSTM_teacher_forcing_0.0": "dashdot",
    "MultiStep LSTM_LSTM_teacher_forcing_0.5": "longdash",
    "MultiStep LSTM_LSTM_teacher_forcing_1": "longdashdot",
}
for trace_name, dash_style in dash_by_trace.items():
    fig.update_traces(line=dict(dash=dash_style), selector=dict(name=trace_name))

# Anchor the legend in the top-left corner of the plot area.
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
fig.write_image("imgs/chapter_12/multi_step_seq2seq.png")
fig.show()

In [48]:
# Remove any artifacts created during training/prediction.
# The training cells already delete "lightning_logs" themselves, so the
# directory may be absent here; ignore_errors=True keeps this cleanup
# idempotent instead of raising FileNotFoundError.
shutil.rmtree("lightning_logs", ignore_errors=True)

In [49]:
# Persist the accumulated forecasts and metric records under `output`.
predictions_path = output / "dl_seq_2_seq_prediction_val_df_MAC000193.pkl"
metrics_path = output / "dl_seq_2_seq_metrics_val_df_MAC000193.pkl"

pred_df.to_pickle(predictions_path)
# Kept as the last expression so the notebook displays joblib.dump's return value.
joblib.dump(metric_record, metrics_path)

['data/london_smart_meters/output/dl_seq_2_seq_metrics_val_df_MAC000193.pkl']