In [1]:
%cd ../..

/Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python


In [2]:
import os
import shutil
import joblib


import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

from pathlib import Path

from src.forecasting.ml_forecasting import (
    MissingValueConfig,
    calculate_metrics,
)
from src.utils import plotting_utils
from tqdm.autonotebook import tqdm
from src.forecasting.ml_forecasting import calculate_metrics
from src.utils import ts_utils
from IPython.display import display, HTML
# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
tqdm.pandas()


  from tqdm.autonotebook import tqdm


In [3]:
# This cell runs before the `import torch` / `import pytorch_lightning` cell below.
# NOTE(review): nothing in this notebook reads PYTORCH_DISABLE_RICH; confirm that the installed
# library versions actually honour it, otherwise this line has no effect.
os.environ['PYTORCH_DISABLE_RICH'] = '1'  # Force simpler progress bar format

In [4]:
# Environment report: library versions and whether the Apple Metal (MPS) backend can be used.
import torch
import pytorch_lightning as pl
print(f'PyTorch version: {torch.__version__}')
print(f'PyTorch Lightning version: {pl.__version__}')
print(f'MPS (Metal) available: {torch.backends.mps.is_available()}')
# NOTE(review): `is_built()` describes the installed PyTorch build rather than the host machine,
# so the "Using M1 Mac" label is looser than what is actually being checked.
print(f'Using M1 Mac: {torch.backends.mps.is_built()}')
# Device that would be picked: 'mps' when available, otherwise 'cpu'.
print(f"Current Device: {torch.device('mps' if torch.backends.mps.is_available() else 'cpu')}")

PyTorch version: 2.1.0
PyTorch Lightning version: 2.1.0
MPS (Metal) available: True
Using M1 Mac: True
Current Device: mps


In [5]:
# Later cells write their figures to imgs/chapter_12, so make sure the folder exists.
os.makedirs("imgs/chapter_12", exist_ok=True)

# Preprocessed inputs and saved outputs live side by side under one dataset root.
data_root = Path("data/london_smart_meters")
preprocessed = data_root / "preprocessed"
output = data_root / "output"

## Utility Functions

In [6]:
def format_plot(fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15):
    """Apply the notebook's standard layout to a Plotly figure and return it.

    Args:
        fig: Figure to format. It is modified in place via ``for_each_trace`` and
            ``update_layout``.
        legends: Optional sequence of trace names. Names are assigned to the traces in
            order and repeat from the start if there are more traces than names.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title, centred horizontally.
        font_size: Font size used for the legend, the axis titles and the tick labels.

    Returns:
        The same ``fig`` object, so calls can be chained.
    """
    # Imported here so the function does not rely on a later notebook cell having been run.
    from itertools import cycle

    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # `title.font` / `<axis>.title.font` replace the deprecated `titlefont` attributes.
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        # Horizontal legend, right-aligned just above the plotting area.
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(size=font_size)),
            tickfont=dict(size=font_size),
        ),
    )
    return fig

In [8]:
from itertools import cycle


def plot_forecast(pred_df, forecast_columns, forecast_display_names=None, target_col="energy_consumption"):
    """Plot the actual series together with one or more forecast columns.

    Only the rows where the FIRST forecast column is non-null are drawn; the same row
    selection is applied to the actuals and to every other forecast column.

    Args:
        pred_df: DataFrame holding the actuals and the forecasts; its index is used as
            the x axis.
        forecast_columns: Non-empty list of column names in ``pred_df`` to plot as forecasts.
        forecast_display_names: Optional legend names, one per forecast column. Defaults
            to the column names.
        target_col: Column in ``pred_df`` holding the actual values.

    Returns:
        A ``plotly.graph_objects.Figure`` with a solid trace for the actuals and one
        dotted trace per forecast column.

    Raises:
        ValueError: If ``forecast_columns`` is empty.
        AssertionError: If the number of display names differs from the number of columns.
    """
    if len(forecast_columns) == 0:
        raise ValueError("forecast_columns must contain at least one column name")
    if forecast_display_names is None:
        forecast_display_names = forecast_columns
    else:
        assert len(forecast_columns) == len(forecast_display_names), (
            "forecast_columns and forecast_display_names must have the same length"
        )
    mask = ~pred_df[forecast_columns[0]].isnull()
    # Build "rgba(r,g,b,<alpha>)" templates from the qualitative Plotly palette; the
    # "<alpha>" placeholder is filled in per trace below.
    colors = [
        "rgba(" + ",".join(str(channel) for channel in plotting_utils.hex_to_rgb(hex_color)) + ",<alpha>)"
        for hex_color in px.colors.qualitative.Plotly
    ]
    # First palette colour is reserved for the actuals; forecasts cycle through the rest.
    act_color = colors[0]
    colors = cycle(colors[1:])
    x_values = pred_df[mask].index
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=pred_df.loc[mask, target_col],
            mode="lines",
            line=dict(color=act_color.replace("<alpha>", "0.9")),
            name="Actual Consumption",
        )
    )
    for col, display_col in zip(forecast_columns, forecast_display_names):
        fig.add_trace(
            go.Scatter(
                x=x_values,
                y=pred_df.loc[mask, col],
                mode="lines",
                line=dict(dash="dot", color=next(colors).replace("<alpha>", "1")),
                name=display_col,
            )
        )
    return fig

def highlight_abs_min(s, props=''):
    """Return per-element CSS for ``Styler.apply``: highlight the value closest to zero.

    Args:
        s: Numeric Series (or array-like) of values for one column/row.
        props: CSS string to return for the element(s) with the smallest absolute value.

    Returns:
        A NumPy array of strings with the same length as ``s``: ``props`` where the
        absolute value equals the smallest absolute value (NaNs ignored), ``''`` elsewhere.
        An empty or all-NaN input yields only empty strings.
    """
    abs_values = np.abs(np.asarray(s, dtype=float))
    # np.nanmin raises on empty input and warns on all-NaN input; nothing to highlight then.
    if abs_values.size == 0 or np.isnan(abs_values).all():
        return np.full(abs_values.shape, '')
    # Compare absolute values on BOTH sides so a negative entry with the smallest
    # magnitude is highlighted as well.
    return np.where(abs_values == np.nanmin(abs_values), props, '')

## Reading the data

In [9]:
# Missing-value-imputed, feature-engineered parquet files for the train split ...
train_path = preprocessed / "selected_blocks_train_missing_imputed_feature_engg.parquet"
# ... and for the validation split. The validation data is loaded under the name `test_df`
# because it is the set we predict on in this notebook.
# (Alternative input that was left disabled: "block_0-7_test_missing_imputed_feature_engg.parquet".)
val_path = preprocessed / "selected_blocks_val_missing_imputed_feature_engg.parquet"

try:
    train_df = pd.read_parquet(train_path)
    test_df = pd.read_parquet(val_path)
except FileNotFoundError:
    # Only a warning banner is shown; the frames stay undefined if a file is missing.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Feature Engineering.ipynb in Chapter06
    </div>
    """))

In [10]:
# Name of the column that holds the series being forecast.
target = "energy_consumption"
# Columns used as the DataFrame index of the train and validation frames.
index_cols = ["LCLid", "timestamp"]

In [11]:
# Index both frames by the index columns; drop=False keeps those columns available as
# regular columns as well.
train_df = train_df.set_index(index_cols, drop=False)
test_df = test_df.set_index(index_cols, drop=False)

### Loading the Single Step ML Forecast

In [12]:
# Saved single-step ML results: predictions, per-household metrics and aggregate metrics.
# These are pickle files; only load pickles that were produced by a trusted run.
ml_prediction_path = output / "ml_single_step_prediction_val_df.pkl"
ml_metrics_path = output / "ml_single_step_metrics_val_df.pkl"
ml_agg_metrics_path = output / "ml_single_step_aggregate_metrics_val.pkl"

try:
    single_step_ahead_ml_fc_df = pd.read_pickle(ml_prediction_path)
    single_step_ahead_ml_metrics_df = pd.read_pickle(ml_metrics_path)
    single_step_ahead_ml_agg_metrics_df = pd.read_pickle(ml_agg_metrics_path)
except FileNotFoundError:
    # Only a warning banner is shown; the variables stay undefined if a file is missing.
    display(HTML("""
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 01-Forecasting with ML in Chapter08
    </div>
    """))

# Running the RNN on a Sample Household

## Selecting the sample data and metrics

In [13]:
# Cross-section of a single household from each frame.
sample_train_df = train_df.xs("MAC000193")
sample_test_df = test_df.xs("MAC000193")

# pred_df starts out with the actuals only: the target column of both splits stacked in order.
pred_df = pd.concat(
    [split.loc[:, [target]] for split in (sample_train_df, sample_test_df)]
)

Split the training data into train and validation sets, then combine train, validation, and test into a single DataFrame.

In [14]:
# Hold out December 2013 as the validation set and keep everything up to November 2013 for
# training. `.assign` returns a new frame, so the `type` label is never written onto a slice
# of another DataFrame (which is what triggers pandas' SettingWithCopyWarning).
# Note: this cell shortens `sample_train_df`, so re-run the previous cell before re-running it.
sample_val_df = sample_train_df.loc["2013-12"].assign(type="val")
sample_train_df = sample_train_df.loc[:"2013-11"].assign(type="train")
sample_test_df = sample_test_df.assign(type="test")

# Stack the three splits, keeping only the target and the split label.
sample_df = pd.concat(
    [split[[target, "type"]] for split in (sample_train_df, sample_val_df, sample_test_df)]
)
sample_df.head()

Unnamed: 0_level_0,energy_consumption,type
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01 00:00:00,0.368,train
2012-01-01 00:30:00,0.386,train
2012-01-01 01:00:00,0.17,train
2012-01-01 01:30:00,0.021,train
2012-01-01 02:00:00,0.038,train


In [15]:
# Seed the running list of metric records with the saved ML results for the sample household;
# each record is a dict without the household id column.
household_rows = single_step_ahead_ml_metrics_df["LCLid"] == "MAC000193"
metric_record = list(
    single_step_ahead_ml_metrics_df[household_rows]
    .drop(columns="LCLid")
    .to_dict(orient="records")
)
metric_record

[{'Algorithm': 'Lasso Regression',
  'MAE': 0.15977797844798053,
  'MSE': 0.07431120430346767,
  'MASE': 1.2451775076984242},
 {'Algorithm': 'XGB Random Forest',
  'MAE': 0.16423608362674713,
  'MSE': 0.08156435191631317,
  'MASE': 1.279920220375061},
 {'Algorithm': 'LightGBM',
  'MAE': 0.14890492743376219,
  'MSE': 0.06918339763995401,
  'MASE': 1.1604419346584254}]

## Loading the necessary classes

In [16]:
from src.dl.dataloaders import TimeSeriesDataModule
from src.dl.models import SingleStepRNNConfig, SingleStepRNNModel
import pytorch_lightning as pl
import torch

# For reproducibility, set a random seed
pl.seed_everything(42)

Seed set to 42


42

### Creating the datamodule which splits and formats the data into windows

In [17]:
# Build the windowed datasets from the target column only. The last `n_test` rows form the
# test portion and the `n_val` rows before them the validation portion, matching the order in
# which the splits were concatenated into `sample_df`.
datamodule = TimeSeriesDataModule(
    data=sample_df[[target]],
    n_val=len(sample_val_df),
    n_test=len(sample_test_df),
    window=48,           # 48 half-hourly steps = one day of history (daily seasonality)
    horizon=1,           # single-step forecast
    normalize="global",  # normalise the data; predictions are de-normalised later
    batch_size=32,
    num_workers=0,
)
datamodule.setup()

### Setting the config for the RNN and initializing the model

In [18]:
# Vanilla RNN: univariate input, three stacked bidirectional layers of 128 hidden units.
rnn_config = SingleStepRNNConfig(
    rnn_type="RNN", input_size=1, hidden_size=128, num_layers=3, bidirectional=True, learning_rate=1e-3
)

# Freshly initialised (untrained) model built from the config above.
model = SingleStepRNNModel(rnn_config)

### Manual Inspection

In [19]:
# Take just the first batch of the training dataloader and unpack it into inputs and targets.
batch = next(iter(datamodule.train_dataloader()))
x, y = batch

print("Shape of x: ", x.shape)
print("Shape of y: ", y.shape)

Shape of x:  torch.Size([32, 48, 1])
Shape of y:  torch.Size([32, 1, 1])


In [20]:
# Run the inspected batch through the (still untrained) model.
# The model returns two outputs: the forecast first, then the target it should be compared with.
# Note that `y` is rebound here to the target returned by the model, replacing the `y` that was
# unpacked from the batch in the previous cell.
y_hat, y = model(batch)
print("Shape of y_hat: ", y_hat.shape)
print("Shape of y: ", y.shape)

Shape of y_hat:  torch.Size([32, 48, 1])
Shape of y:  torch.Size([32, 48, 1])


In [21]:
# Compute the model's loss on the forecast/target pair from the forward pass above and print it.
# This is only a sanity check before training: the model has not been fitted yet.
l = model.loss(y_hat, y)
print(l)

tensor(0.9786, grad_fn=<MseLossBackward0>)


### Full Training

**Uncomment below cell if you need to monitor training using TensorBoard**

In [21]:
# # Load the TensorBoard notebook extension
# %load_ext tensorboard
# os.makedirs("lightning_logs", exist_ok=True)
# %tensorboard --logdir lightning_logs/

In [23]:
# Train with early stopping on the validation loss: at least 5 and at most 100 epochs, stopping
# once `valid_loss` has not improved for 3 consecutive checks.
trainer = pl.Trainer(
    # Use Apple's Metal backend when PyTorch reports it as available; otherwise let Lightning
    # pick a device ("auto") so the notebook also runs on machines without MPS.
    accelerator="mps" if torch.backends.mps.is_available() else "auto",
    devices=1,
    min_epochs=5,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
    logger=True,
    log_every_n_steps=1,
)

trainer.fit(model, datamodule)

# Removing artifacts created during training
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python/lightning_logs

  | Name | Type    | Params
---------------------------------
0 | rnn  | RNN     | 231 K 
1 | fc   | Linear  | 257   
2 | loss | MSELoss | 0     
---------------------------------
231 K     Trainable params
0         Non-trainable params
231 K     Total params
0.926     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

### Prediction

In [35]:
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch; flatten it into a single NumPy vector.
# `.cpu()` is a no-op for CPU tensors and keeps `.numpy()` valid if outputs live on another device.
pred = torch.cat(pred).squeeze().detach().cpu().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
pred_df_ = pd.DataFrame({rnn_config.rnn_type: pred}, index=sample_test_df.index)
# Drop a column left behind by an earlier run of this cell: joining a column that already
# exists raises "columns overlap but no suffix specified".
pred_df = pred_df.drop(columns=[rnn_config.rnn_type], errors="ignore").join(pred_df_)

metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[rnn_config.rnn_type],
    rnn_config.rnn_type,
    # Train + validation history passed alongside the test actuals.
    pd.concat([sample_train_df[target], sample_val_df[target]]),
)
# Replace any earlier record for this algorithm so re-running the cell does not add duplicates.
metric_record = [rec for rec in metric_record if rec.get("Algorithm") != rnn_config.rnn_type]
metric_record.append(metrics)

/Users/mlevydaniel/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting: |                                                                                       | 0/? [00:…

In [36]:
# One row per algorithm, with the three error metrics shown to four decimals.
formatted = pd.DataFrame(metric_record).style.format(
    {metric: "{:.4f}" for metric in ("MAE", "MSE", "MASE")}
)
# Shade the lowest value in each metric column, then apply the absolute-minimum rule to MAE.
(
    formatted
    .highlight_min(color='lightgreen', subset=["MAE", "MSE", "MASE"])
    .apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['MAE'])
)

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083


In [38]:
# Title carries the model name followed by its three metrics, four decimals each.
metric_summary = " | ".join(f"{name}: {metrics[name]:.4f}" for name in ("MAE", "MSE", "MASE"))
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[rnn_config.rnn_type],
        forecast_display_names=[rnn_config.rnn_type],
    ),
    title=f"{rnn_config.rnn_type}: {metric_summary}",
)
# Zoom in on the first week of January 2014.
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_12/rnn.png")
fig.show()

# Running LSTMs and GRUs on a Sample Household

## LSTM

In [39]:
# Same architecture as before, with LSTM cells instead of vanilla RNN cells.
rnn_config = SingleStepRNNConfig(
    rnn_type="LSTM", input_size=1, hidden_size=128, num_layers=3, bidirectional=True, learning_rate=1e-3
)
model = SingleStepRNNModel(rnn_config)

# Stop once the validation loss has not improved for three consecutive checks.
early_stopping = pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)
trainer = pl.Trainer(
    accelerator='cpu',
    max_epochs=100,
    callbacks=[early_stopping],
    logger=True,
    log_every_n_steps=1,
)
trainer.fit(model, datamodule)

# Clean up the log folder written during training.
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.


  | Name | Type    | Params
---------------------------------
0 | rnn  | LSTM    | 924 K 
1 | fc   | Linear  | 257   
2 | loss | MSELoss | 0     
---------------------------------
924 K     Trainable params
0         Non-trainable params
924 K     Total params
3.700     Total estimated model params size (MB)


Sanity Checking: |                                                                                  | 0/? [00:…


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



Training: |                                                                                         | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

In [None]:
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch; flatten it into a single NumPy vector.
# `.cpu()` is a no-op for CPU tensors and keeps `.numpy()` valid if outputs live on another device.
pred = torch.cat(pred).squeeze().detach().cpu().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean
pred_df_ = pd.DataFrame({rnn_config.rnn_type: pred}, index=sample_test_df.index)
# Drop a column left behind by an earlier run of this cell: joining a column that already
# exists raises "columns overlap but no suffix specified".
pred_df = pred_df.drop(columns=[rnn_config.rnn_type], errors="ignore").join(pred_df_)

metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[rnn_config.rnn_type],
    rnn_config.rnn_type,
    # Train + validation history passed alongside the test actuals.
    pd.concat([sample_train_df[target], sample_val_df[target]]),
)
# Replace any earlier record for this algorithm so re-running the cell does not add duplicates.
metric_record = [rec for rec in metric_record if rec.get("Algorithm") != rnn_config.rnn_type]
metric_record.append(metrics)

In [41]:
# One row per algorithm, with the three error metrics shown to four decimals.
# (A "Forecast Bias" column with a "{:.2f}%" format is not part of this table.)
metric_formats = dict.fromkeys(("MAE", "MSE", "MASE"), "{:.4f}")
formatted = pd.DataFrame(metric_record).style.format(metric_formats)
# Shade the lowest value in each metric column, then apply the absolute-minimum rule to MAE.
(
    formatted
    .highlight_min(color='lightgreen', subset=["MAE", "MSE", "MASE"])
    .apply(highlight_abs_min, props='color:black;background-color:lightgreen', subset=['MAE'], axis=0)
)

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149


In [43]:
# Title carries the model name followed by its three metrics, four decimals each.
metric_summary = " | ".join(f"{name}: {metrics[name]:.4f}" for name in ("MAE", "MSE", "MASE"))
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[rnn_config.rnn_type],
        forecast_display_names=[rnn_config.rnn_type],
    ),
    title=f"{rnn_config.rnn_type}: {metric_summary}",
)
# Zoom in on the first week of January 2014.
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_12/lstm.png")
fig.show()

## GRU

In [46]:
# Same architecture once more, this time with GRU cells.
rnn_config = SingleStepRNNConfig(
    rnn_type="GRU", input_size=1, hidden_size=128, num_layers=3, bidirectional=True, learning_rate=1e-3
)
model = SingleStepRNNModel(rnn_config)

# Stop once the validation loss has not improved for three consecutive checks.
early_stopping = pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)
trainer = pl.Trainer(accelerator='cpu', max_epochs=100, callbacks=[early_stopping])
trainer.fit(model, datamodule)

# Clean up the log folder written during training.
shutil.rmtree("lightning_logs")

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

Missing logger folder: /Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python/lightning_logs

  | Name | Type    | Params
---------------------------------
0 | rnn  | GRU     | 693 K 
1 | fc   | Linear  | 257   
2 | loss | MSELoss | 0     
---------------------------------
693 K     Trainable params
0         Non-trainable params
693 K     Total params
2.775     Total estimated model params size (MB)


Sanity Checking: |                                                                                  | 0/? [00:…

Training: |                                                                                         | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

Validation: |                                                                                       | 0/? [00:…

In [47]:
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch; flatten it into a single NumPy vector.
# `.cpu()` is a no-op for CPU tensors and keeps `.numpy()` valid if outputs live on another device.
pred = torch.cat(pred).squeeze().detach().cpu().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean

pred_df_ = pd.DataFrame({rnn_config.rnn_type: pred}, index=sample_test_df.index)
# Drop a column left behind by an earlier run of this cell: joining a column that already
# exists raises "columns overlap but no suffix specified".
pred_df = pred_df.drop(columns=[rnn_config.rnn_type], errors="ignore").join(pred_df_)

metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[rnn_config.rnn_type],
    rnn_config.rnn_type,
    # Train + validation history passed alongside the test actuals.
    pd.concat([sample_train_df[target], sample_val_df[target]]),
)
# Replace any earlier record for this algorithm so re-running the cell does not add duplicates.
metric_record = [rec for rec in metric_record if rec.get("Algorithm") != rnn_config.rnn_type]
metric_record.append(metrics)


The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.



Predicting: |                                                                                       | 0/? [00:…

In [48]:
# One row per algorithm, with the three error metrics shown to four decimals.
# (A "Forecast Bias" column with a "{:.2f}%" format is not part of this table.)
metric_columns = ["MAE", "MSE", "MASE"]
formatted = pd.DataFrame(metric_record).style.format(
    {column: "{:.4f}" for column in metric_columns}
)
# Shade the lowest value in each metric column, then apply the absolute-minimum rule to MAE.
(
    formatted
    .highlight_min(color='lightgreen', subset=metric_columns)
    .apply(highlight_abs_min, props='color:black;background-color:lightgreen', axis=0, subset=['MAE'])
)

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1679,0.0869,1.3083
4,LSTM,0.1687,0.0885,1.3149
5,GRU,0.1722,0.0831,1.3418


In [49]:
# Title carries the model name followed by its three metrics, four decimals each.
metric_summary = " | ".join(f"{name}: {metrics[name]:.4f}" for name in ("MAE", "MSE", "MASE"))
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=[rnn_config.rnn_type],
        forecast_display_names=[rnn_config.rnn_type],
    ),
    title=f"{rnn_config.rnn_type}: {metric_summary}",
)
# Zoom in on the first week of January 2014.
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
fig.write_image("imgs/chapter_12/gru.png")
fig.show()

In [50]:
# Overlay the LSTM and GRU forecasts on the actuals in one figure.
fig = format_plot(
    plot_forecast(
        pred_df,
        forecast_columns=["LSTM", "GRU"],
        forecast_display_names=["LSTM", "GRU"],
    ),
    title="LSTM, and GRU",
)
# Zoom in on the first week of January 2014.
fig.update_xaxes(type="date", range=["2014-01-01", "2014-01-08"])
# Draw the GRU trace dashed so it can be told apart from the dotted LSTM trace.
fig.update_traces(line=dict(dash="dash"), selector=dict(name="GRU"))
fig.write_image("imgs/chapter_12/lstm_gru.png")
fig.show()

In [51]:
# Persist the results for the sample household.
# `pred_df` (actuals plus one column per model joined in above) is written as a pandas pickle.
pred_df.to_pickle(output/"dl_single_step_prediction_val_df_MAC000193.pkl")
# `metric_record` is a plain list of metric records, not a DataFrame, despite the "_df" in the
# file name; it is written with joblib. As the last expression of the cell, the value returned
# by `joblib.dump` is displayed as the cell output.
joblib.dump(metric_record, output/"dl_single_step_metrics_val_df_MAC000193.pkl")

['data/london_smart_meters/output/dl_single_step_metrics_val_df_MAC000193.pkl']