In [18]:
import pandas as pd
import numpy as np
import pystan as stan
import torch
#import matplotlib as plt
# from sklearn.linear_model import LogisticRegression
from datetime import datetime

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [4]:
# CA hourly electricty consumption in MWh, 2015-2022
ca_hourly_ec_data = pd.read_csv('data/ca_hourly_demand.csv')
ca_hourly_ec_data.head()

Unnamed: 0,date,demand
0,20221128T19Z,29029
1,20221128T18Z,29878
2,20221128T17Z,30452
3,20221128T16Z,28969
4,20221128T15Z,26456


In [81]:
# add time index
for i in range(len(ca_hourly_ec_data)):
    ca_hourly_ec_data.loc[:("time_idx", i)]= int(ca_hourly_ec_data["date"][i].replace("T","").replace("Z",""))
ca_hourly_ec_data.sample(10, random_state=521)

InvalidIndexError: ('time_idx', 0)

In [76]:
max_prediction_length = 1
max_encoder_length = 24
training_cutoff = ca_hourly_ec_data["time_idx"].max() - max_prediction_length

print(ca_hourly_ec_data["time_idx"])

training = TimeSeriesDataSet(
    data=ca_hourly_ec_data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    #static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    #time_varying_known_categoricals=["special_days", "month"],
    #variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    #time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    # time_varying_unknown_categoricals=[],
    # time_varying_unknown_reals=[
    #     "volume",
    #     "log_volume",
    #     "industry_volume",
    #     "soda_volume",
    #     "avg_max_temp",
    #     "avg_volume_by_agency",
    #     "avg_volume_by_sku",
    # ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], transformation="softplus"
    ),  # use softplus and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# create validation set (predict=True) which means to predict the last max_prediction_length points in time
# for each series
validation = TimeSeriesDataSet.from_dataset(training, ca_hourly_ec_data, predict=True, stop_randomization=True)

# create dataloaders for model
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)

0        7042711
1        7042710
2        7042709
3        7042708
4        7042707
          ...   
64975          4
64976          3
64977          2
64978          1
64979          0
Name: time_idx, Length: 64980, dtype: object


AssertionError: Timeseries index should be of type integer

In [None]:
# calculate baseline mean absolute error, i.e. predict next value as the last available value from the history
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
baseline_predictions = Baseline().predict(val_dataloader)
(actuals - baseline_predictions).abs().mean().item()

NameError: name 'val_dataloader' is not defined

In [9]:
# CA daily weather conditions, 2020-17
ca_hourly_ec_data = pd.read_csv('data/ca_hourly_demand.csv')