# Info

Training and fine-tuning of Transfromer model.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-dark-palette')

import torch

import lightning.pytorch as pl
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import LearningRateMonitor
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.metrics import QuantileLoss, MAE, RMSE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import pickle

In [2]:
with open('./data/input_data.pkl', 'rb') as f:
  input_data = pickle.load(f)

merged_df, training, train_dataloader, validation, val_dataloader = input_data

In [3]:
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=200,
    accelerator="cpu",
    enable_model_summary=True,
    gradient_clip_val=0.02,
    #limit_train_batches=50,  # coment in for training, running valiation every 30 batches
    # fast_dev_run=True,  # comment in to check that networkor dataset has no serious bugs
    callbacks=[lr_logger, early_stop_callback]
)

tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate= 0.001,
    hidden_size=32,
    attention_head_size=2,
    dropout=0.15,
    hidden_continuous_size=1,
    lstm_layers = 2,
    loss=QuantileLoss(),
    optimizer="Ranger",
    reduce_on_plateau_patience=4,
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Number of parameters in network: 142.3k


In [4]:
optuna_train_dataloader = training.to_dataloader(train=True, batch_size=128, shuffle=True)
optuna_val_dataloader = validation.to_dataloader(train=False, batch_size=512, shuffle=True)

In [5]:
# create study
study = optimize_hyperparameters(
    optuna_train_dataloader,
    val_dataloader,
    model_path="optuna_test",
    n_trials=25,
    max_epochs=5,
    gradient_clip_val_range=(0.3, 0.8),
    hidden_size_range=(96, 128),
    hidden_continuous_size_range=(8, 16),
    attention_head_size_range=(2, 4),
    learning_rate_range=(0.012, 0.015),
    dropout_range=(0.15, 0.25),
    trainer_kwargs=dict(limit_train_batches=30),
    reduce_on_plateau_patience=4,
    use_learning_rate_finder=False,
)

# save study results - also we can resume tuning at a later point in time
with open("test_study.pkl", "wb") as fout:
    pickle.dump(study, fout)

# show best hyperparameters
print(study.best_trial.params)

[I 2023-10-01 10:49:50,653] A new study created in memory with name: no-name-a69f7bcb-359f-4177-ad0d-49b023aace26
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=5` reached.
[I 2023-10-01 11:50:35,616] Trial 0 finished with value: 63.335384368896484 and parameters: {'gradient_clip_val': 0.46880744899588866, 'hidden_size': 121, 'dropout': 0.18457728196359122, 'hidden_continuous_size': 12, 'attention_head_size': 2, 'learning_rate': 0.012471631006971363}. Best is trial 0 with value: 63.335384368896484.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=5` reached.
[I 2023-10-01 12:44:14,522] Trial 1 finished with value: 63.371158599853516 and parameters: {'gradient_clip_val': 0.37293205349987274, 'hidden_size': 108, 'dropout': 0

{'gradient_clip_val': 0.5954558734996294, 'hidden_size': 123, 'dropout': 0.16398943569323365, 'hidden_continuous_size': 8, 'attention_head_size': 4, 'learning_rate': 0.013179919059645011}


In [6]:
trainer.fit(
        tft,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

Missing logger folder: d:\Programming\DP\TFT_model_data\lightning_logs

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 13    
3  | prescalers                         | ModuleDict                      | 176   
4  | static_variable_selection          | VariableSelectionNetwork        | 936   
5  | encoder_variable_selection         | VariableSelectionNetwork        | 36.5 K
6  | decoder_variable_selection         | VariableSelectionNetwork        | 34.5 K
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | stat

Epoch 14: 100%|██████████| 671/671 [55:44<00:00,  4.98s/it, v_num=0, train_loss_step=52.50, val_loss=71.20, train_loss_epoch=55.40] 


In [7]:
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

torch.save(best_tft, "tft_model")

In [7]:
best_tft = torch.load("tft_model")

In [73]:
best_tft.eval()
raw_prediction = best_tft.predict(
    validation,
    mode="raw",
    return_x=True,
    return_y=True,
    batch_size = 1,
)

In [None]:
predictions = best_tft.predict(validation, batch_size=1, return_y=True)

In [60]:
MAE()(predictions.output, predictions.y[0].view(-1,8))

tensor(122.9977)

In [61]:
RMSE()(predictions.output, predictions.y[0].view(-1,8))

tensor(170.0242)

In [80]:
predictions.output[0]

tensor([  72.5813,   40.2114,    7.7808,  -11.0756,    2.5228,  -68.9974,
        -102.8922,  -97.0845])

In [78]:
raw_prediction.output.prediction[0]

tensor([[-1.3946e+02, -5.1067e+01,  1.6953e+01,  7.2581e+01,  1.2900e+02,
          1.7845e+02,  2.7298e+02],
        [-1.6032e+02, -6.7371e+01, -6.5494e+00,  4.0211e+01,  9.6146e+01,
          1.3674e+02,  2.0958e+02],
        [-2.3153e+02, -1.2108e+02, -5.5369e+01,  7.7808e+00,  7.4741e+01,
          1.2882e+02,  2.1492e+02],
        [-2.1438e+02, -1.3139e+02, -7.1129e+01, -1.1076e+01,  4.4728e+01,
          8.7666e+01,  1.6946e+02],
        [-2.1937e+02, -1.2536e+02, -5.8693e+01,  2.5228e+00,  7.0254e+01,
          1.2052e+02,  2.0638e+02],
        [-2.9513e+02, -1.8984e+02, -1.3053e+02, -6.8997e+01, -1.7653e+01,
          2.2576e+01,  9.1551e+01],
        [-3.8635e+02, -2.6618e+02, -1.9679e+02, -1.0289e+02, -3.5783e+01,
          2.1523e+01,  1.0396e+02],
        [-3.2419e+02, -2.2131e+02, -1.6210e+02, -9.7085e+01, -4.4661e+01,
         -1.6698e-02,  7.1332e+01]])

In [77]:
raw_prediction.y[0].shape

torch.Size([1, 111560])