In [1]:
# Move up one directory so the relative paths used below (src/, data/, imgs/) resolve.
# NOTE(review): this is not idempotent - re-running the cell moves up one more level.
%cd ..

/Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python


In [2]:
# Standard library
import os
import shutil

# Numerical / tabular stack
import numpy as np
import pandas as pd
from pathlib import Path

# Deep-learning stack and the project's data/model wrappers
import torch
from src.dl.dataloaders import TimeSeriesDataModule
from src.dl.models import SingleStepRNNConfig, SingleStepRNNModel
import pytorch_lightning as pl

# For reproducibility, set a random seed before any model or data loader is created
pl.seed_everything(42)

# Plotting; every figure in this notebook uses the "plotly_white" template by default
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

# Project helpers used for scoring the forecasts
from src.forecasting.ml_forecasting import (
    MissingValueConfig,
    calculate_metrics,
)

# Reload edited project modules automatically instead of requiring a kernel restart
%load_ext autoreload
%autoreload 2

# Seed NumPy's global random generator explicitly as well
np.random.seed(42)

# Progress bars; tqdm.pandas() enables the progress_* methods on pandas objects
from tqdm.autonotebook import tqdm
tqdm.pandas()

Seed set to 42


In [3]:
# Environment flag intended to force a simpler progress bar format.
os.environ["PYTORCH_DISABLE_RICH"] = "1"

# Folder for the figures of this chapter; nothing happens if it already exists.
Path("imgs/chapter_12").mkdir(parents=True, exist_ok=True)

# Input and output folders of the dataset, relative to the working directory.
preprocessed = Path("data/london_smart_meters/preprocessed")
output = Path("data/london_smart_meters/output")

In [4]:
# Training split: missing values imputed, features engineered
train_df = pd.read_parquet(
    preprocessed.joinpath("selected_blocks_train_missing_imputed_feature_engg.parquet")
)

# The validation split is loaded as test_df because it is the set we predict on
test_df = pd.read_parquet(
    preprocessed.joinpath("selected_blocks_val_missing_imputed_feature_engg.parquet")
)

In [5]:
target = "energy_consumption"
index_cols = ["LCLid", "timestamp"]

# Index both frames by (LCLid, timestamp); drop=False keeps the two fields
# available as regular columns as well.
train_df = train_df.set_index(index_cols, drop=False)
test_df = test_df.set_index(index_cols, drop=False)

## Selecting the sample data and metrics

In [6]:
# Restrict both splits to a single meter (first index level == "MAC000193")
sample_train_df, sample_test_df = train_df.xs("MAC000193"), test_df.xs("MAC000193")

# Actuals of the training and test periods stacked into one frame;
# the forecasts are joined onto pred_df further down.
pred_df = pd.concat([frame[[target]] for frame in (sample_train_df, sample_test_df)])

In [7]:
# Re-derive the meter's full training history from train_df so that this cell gives
# the same result every time it is run. Slicing sample_train_df itself would fail on
# a second run, because December 2013 has already been cut off from it.
household_train_df = train_df.xs("MAC000193")  # same meter as sample_train_df / sample_test_df

# December 2013 is held out for validation; everything up to the end of
# November 2013 is used for training.
# .assign returns new frames, so no column is written into a slice of another
# frame (avoids pandas' SettingWithCopyWarning).
sample_val_df = household_train_df.loc["2013-12"].assign(type="val")
sample_train_df = household_train_df.loc[:"2013-11"].assign(type="train")
sample_test_df = sample_test_df.assign(type="test")

# One long frame with the target and the split label, in chronological order
sample_df = pd.concat(
    [frame[[target, "type"]] for frame in (sample_train_df, sample_val_df, sample_test_df)]
)
sample_df.head()

Unnamed: 0_level_0,energy_consumption,type
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-01-01 00:00:00,0.368,train
2012-01-01 00:30:00,0.386,train
2012-01-01 01:00:00,0.17,train
2012-01-01 01:30:00,0.021,train
2012-01-01 02:00:00,0.038,train


## Loading the necessary classes

### Creating the datamodule which splits and formats the data into windows

In [8]:
# Build the windowed datasets for the RNN. The validation and test lengths are
# taken from the frames that were split off earlier.
datamodule = TimeSeriesDataModule(
    data=sample_df[[target]],
    n_val=len(sample_val_df),
    n_test=len(sample_test_df),
    window=48,           # 48 half-hourly steps: enough memory to capture daily seasonality
    horizon=1,           # single-step forecast
    normalize="global",  # normalise the data; predictions must be un-scaled afterwards
    batch_size=32,
)

datamodule.setup()

In [9]:
# Model definition: a 3-layer bidirectional vanilla RNN fed with a single feature
rnn_config = SingleStepRNNConfig(
    rnn_type="RNN",
    input_size=1,
    hidden_size=128,
    num_layers=3,
    bidirectional=True,
    learning_rate=1e-3,
)

model = SingleStepRNNModel(rnn_config)

# Train on one CPU device for 5 to 100 epochs; early stopping monitors
# "valid_loss" with a patience of 3.
trainer = pl.Trainer(
    accelerator='cpu',
    devices=1,
    min_epochs=5,
    max_epochs=100,
    callbacks=[pl.callbacks.EarlyStopping(monitor="valid_loss", patience=3)],
)

trainer.fit(model, datamodule)

# Removing artifacts created during training. This is best-effort clean-up:
# ignore_errors=True keeps a missing folder from raising after a long fit.
shutil.rmtree("lightning_logs", ignore_errors=True)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/mlevydaniel/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Missing logger folder: /Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python/lightning_logs

  | Name | Type    | Params
---------------------------------
0 | rnn  | RNN     | 231 K 
1 | fc   | Linear  | 257   
2 | loss | MSELoss | 0     
---------------------------------
231 K     Trainable params
0         Non-trainable params
231 K     Total params
0.926     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/mlevydaniel/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
/Users/mlevydaniel/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

### Prediction

In [10]:
# Validation metrics of the single-step ML baselines, stored by an earlier notebook.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
single_step_ahead_ml_metrics_df = pd.read_pickle(output/"ml_single_step_metrics_val_df.pkl")

# Keep the rows of the sample meter only, as a list of one dict per algorithm
metric_record = (
    single_step_ahead_ml_metrics_df
    .loc[lambda frame: frame["LCLid"] == "MAC000193"]
    .drop(columns="LCLid")
    .to_dict(orient="records")
)

metric_record

[{'Algorithm': 'Lasso Regression',
  'MAE': 0.15977797844798053,
  'MSE': 0.07431120430346767,
  'MASE': 1.2451775076984242},
 {'Algorithm': 'XGB Random Forest',
  'MAE': 0.16423608362674713,
  'MSE': 0.08156435191631317,
  'MASE': 1.279920220375061},
 {'Algorithm': 'LightGBM',
  'MAE': 0.14890492743376219,
  'MSE': 0.06918339763995401,
  'MASE': 1.1604419346584254}]

In [11]:
pred = trainer.predict(model, datamodule.test_dataloader())

# pred is a list of outputs, one for each batch
pred = torch.cat(pred).squeeze().detach().numpy()

# Apply reverse transformation because we applied global normalization
pred = pred * datamodule.train.std + datamodule.train.mean

# Flatten to one value per test timestamp. Depending on the shape of the stored
# mean/std, the un-scaled array is either (N,) or (1, N); indexing with pred[0]
# is only right in the second case and would silently broadcast a single scalar
# over the whole index in the first. np.ravel handles both.
pred = np.ravel(pred)
assert pred.shape[0] == len(sample_test_df), (
    f"expected {len(sample_test_df)} predictions, got {pred.shape[0]}"
)

pred_df_ = pd.DataFrame({rnn_config.rnn_type: pred}, index=sample_test_df.index)

# Drop a forecast column left over from an earlier run of this cell; joining a
# column that already exists would raise.
pred_df = pred_df.drop(columns=rnn_config.rnn_type, errors="ignore").join(pred_df_)

# Score against the test actuals; the training + validation history is passed as
# the last argument (presumably the in-sample series needed for MASE - confirm
# against calculate_metrics).
metrics = calculate_metrics(
    sample_test_df[target],
    pred_df_[rnn_config.rnn_type],
    rnn_config.rnn_type,
    pd.concat([sample_train_df[target], sample_val_df[target]])
)

# Replace rather than duplicate this model's entry when the cell is re-run
metric_record = [m for m in metric_record if m.get("Algorithm") != rnn_config.rnn_type]
metric_record.append(metrics)

/Users/mlevydaniel/miniforge3/envs/pytorch_m1/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [12]:
# Show every metric with four decimals and highlight the best (lowest) value per column
metric_cols = ["MAE", "MSE", "MASE"]
formatted = pd.DataFrame(metric_record).style.format({col: "{:.4f}" for col in metric_cols})

formatted.highlight_min(color="green", subset=metric_cols)

Unnamed: 0,Algorithm,MAE,MSE,MASE
0,Lasso Regression,0.1598,0.0743,1.2452
1,XGB Random Forest,0.1642,0.0816,1.2799
2,LightGBM,0.1489,0.0692,1.1604
3,RNN,0.1652,0.0905,1.2872


In [13]:
def create_forecast_plot(
    pred_df,
    rnn_config,
    metrics,
    target_col="energy_consumption",
    x_range=("2014-01-01", "2014-01-08"),
):
    """Plot the actual series against the forecast of the configured model.

    Only rows that have a forecast (non-null values in the model's column) are
    drawn, so both lines cover the same timestamps.

    Args:
        pred_df: DataFrame indexed by time, holding the actuals in ``target_col``
            and the forecast in a column named ``rnn_config.rnn_type``.
        rnn_config: Model config; its ``rnn_type`` attribute is used as the
            forecast column name, the legend entry and the title prefix.
        metrics: Mapping with the keys ``"MAE"``, ``"MSE"`` and ``"MASE"``,
            shown in the figure title.
        target_col: Name of the column with the actual values.
        x_range: Pair ``(start, end)`` limiting the initially visible part of
            the x axis. Pass ``None`` to let Plotly choose the range.

    Returns:
        plotly.graph_objects.Figure: the assembled figure (not yet shown).
    """
    forecast_col = rnn_config.rnn_type

    # First two colours of the qualitative palette: actuals and forecast
    actual_color, forecast_color = px.colors.qualitative.Plotly[:2]

    # Rows for which a forecast exists
    has_forecast = pred_df[forecast_col].notnull()
    plot_df = pred_df.loc[has_forecast]

    fig = go.Figure()

    # Actual values
    fig.add_trace(
        go.Scatter(
            x=plot_df.index,
            y=plot_df[target_col],
            mode="lines",
            line=dict(color=actual_color, width=2),
            name="Actual Consumption",
        )
    )

    # Forecast
    fig.add_trace(
        go.Scatter(
            x=plot_df.index,
            y=plot_df[forecast_col],
            mode="lines",
            line=dict(color=forecast_color, width=2, dash="dot"),
            name=forecast_col,
        )
    )

    # Axis titles use title=dict(text=..., font=...); the older `titlefont`
    # attribute is deprecated and rejected by newer Plotly releases.
    xaxis = dict(
        title=dict(text="Time", font=dict(size=15)),
        tickfont=dict(size=15),
        type="date",
    )
    if x_range is not None:
        xaxis["range"] = list(x_range)

    yaxis = dict(
        title=dict(text="Value", font=dict(size=15)),
        tickfont=dict(size=15),
    )

    fig.update_layout(
        # Size and title
        autosize=False,
        width=900,
        height=500,
        title=dict(
            text=f"{rnn_config.rnn_type}: MAE: {metrics['MAE']:.4f} | MSE: {metrics['MSE']:.4f} | MASE: {metrics['MASE']:.4f}",
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        # Legend: horizontal, just above the plotting area, right-aligned
        showlegend=True,
        legend=dict(
            font=dict(size=15),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        # Axes
        xaxis=xaxis,
        yaxis=yaxis,
        # Template and margins
        template="plotly_white",
        margin=dict(l=80, r=80, t=100, b=80),
    )

    return fig

# Usage: build the actual-vs-forecast figure for the trained model and render it
fig = create_forecast_plot(pred_df, rnn_config, metrics)
fig.show()