In [1]:
# Install PyTorch, PyTorch Lightning, and PyTorch Forecasting
!pip install pytorch-lightning
!pip install pytorch-forecasting
!pip install torch


Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.5.1-py3-none-any.whl (890 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.6/890.6 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.8 pytorch-lightning-2.4.0 torchmetrics-1.5.1
Collecting pytorch-for

In [2]:
import torch
import pandas as pd
import numpy as np

In [3]:
# Read the preprocessed training table from the mounted Drive folder.
data_train = pd.read_csv(
    "/content/drive/MyDrive/DSB/2A HEC/Time series/time_series_project/data/train_preprocessed.csv"
)

In [4]:
# Per-column count of missing values (`isnull` is the pandas alias of `isna`).
data_train.isnull().sum()

Unnamed: 0,0
id,0
valeur_NO2,0
valeur_CO,0
valeur_O3,0
valeur_PM10,0
valeur_PM25,0
is_holiday,0
is_jour_ferie,0
precipitation,0
wind_speed,0


In [5]:
# Replace any remaining missing value with the sentinel -1.
# Rebinding the name (instead of `inplace=True`) yields the same frame
# contents without mutating an object in place across cells.
data_train = data_train.fillna(-1)

In [6]:
# Parse the timestamp column so datetime arithmetic is available below.
data_train["id"] = pd.to_datetime(data_train["id"])

# Column names used later as covariates of the forecasting datasets.
features = [
    'id', 'is_holiday', 'car_flow', 'precipitation',
    'wind_speed', 'temperature', 'humidity', 'pressure', 'visibility',
    'global_solar_radiation', 'Year', 'is_weekend', 'DayOfYear',
    'HourOfDay', 'DayOfYear_sin', 'DayOfYear_cos', 'HourOfDay_sin',
    'HourOfDay_cos', 'Weekday_sin', 'Weekday_cos',
]

# Every row gets the same group label.
data_train["location"] = "Montsouris"

# Whole hours elapsed since the earliest timestamp, stored as an integer index.
elapsed = data_train['id'] - data_train['id'].min()
data_train['time_idx'] = (elapsed.dt.total_seconds() // 3600).astype("int")

data_train.head()

Unnamed: 0,id,valeur_NO2,valeur_CO,valeur_O3,valeur_PM10,valeur_PM25,is_holiday,is_jour_ferie,precipitation,wind_speed,...,HourOfDay,Weekday,DayOfYear_sin,DayOfYear_cos,HourOfDay_sin,HourOfDay_cos,Weekday_sin,Weekday_cos,location,time_idx
0,2020-01-01 00:00:00,42.9,0.718,15.7,73.1,64.4,1,0,0.0,1.5,...,0,2,0.017213,0.999852,0.0,1.0,0.974928,-0.222521,Montsouris,0
1,2020-01-01 01:00:00,33.6,0.587,10.1,74.8,66.0,1,0,0.0,2.6,...,1,2,0.017213,0.999852,0.258819,0.965926,0.974928,-0.222521,Montsouris,1
2,2020-01-01 02:00:00,29.3,0.400655,5.1,51.0,44.9,1,0,0.0,1.9,...,2,2,0.017213,0.999852,0.5,0.866025,0.974928,-0.222521,Montsouris,2
3,2020-01-01 03:00:00,30.5,0.246,7.2,27.7,25.1,1,0,0.0,1.8,...,3,2,0.017213,0.999852,0.707107,0.707107,0.974928,-0.222521,Montsouris,3
4,2020-01-01 04:00:00,29.3,0.204,8.3,15.3,13.6,1,0,0.0,2.2,...,4,2,0.017213,0.999852,0.866025,0.5,0.974928,-0.222521,Montsouris,4


In [7]:
# Encoder window: one week of hourly steps (24 hours x 7 days).
max_encoder_length = 24 * 7
# Decoder window: forecast 502 hours into the future.
max_prediction_length = 502

In [8]:
from pytorch_forecasting import TimeSeriesDataSet

In [9]:
# Step 1: Define the split point (first 90% of the time index is for training)
split_idx = int(data_train['time_idx'].max() * 0.9)

# Step 2: Split chronologically into training and validation sets
train_data = data_train[data_train["time_idx"] <= split_idx]
test_data = data_train[data_train["time_idx"] > split_idx]

# Drop "car_flow" from the known covariates. Rebuilding the list (instead of
# `features.remove(...)`) keeps this cell re-runnable: a second execution is a
# no-op rather than a ValueError.
features = [name for name in features if name != "car_flow"]

# The five columns that are forecast jointly.
target_columns = ["valeur_NO2", "valeur_CO", "valeur_O3", "valeur_PM10", "valeur_PM25"]

# Step 3: Create the training TimeSeriesDataSet
train_dataset = TimeSeriesDataSet(
    train_data,
    time_idx="time_idx",
    target=target_columns,
    group_ids=["location"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=list(target_columns),
    time_varying_known_reals=features,
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# Step 4: Derive the validation set from the training set so that it reuses
# the exact same variable layout and the scalers/normalizers fitted on the
# training data. Building it independently would refit them on the validation
# rows and (as before) could list a different set of input variables than the
# model was configured with.
val_dataset = TimeSeriesDataSet.from_dataset(
    train_dataset,
    test_data,
    stop_randomization=True,
)

In [12]:
# Batch size and worker count shared by both loaders.
loader_options = {"batch_size": 64, "num_workers": 8}

train_dataloader = train_dataset.to_dataloader(train=True, **loader_options)
val_dataloader = val_dataset.to_dataloader(train=False, **loader_options)

In [13]:
from pytorch_forecasting.models.temporal_fusion_transformer import TemporalFusionTransformer
from pytorch_forecasting.metrics import MAE

# Hyper-parameters of the Temporal Fusion Transformer.
tft_settings = dict(
    learning_rate=0.03,            # may be tuned
    hidden_size=16,                # size of the network layers
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=[1] * 5,           # one output per target column
    loss=MAE(),
    log_interval=10,
    reduce_on_plateau_patience=4,
)

# Build the model from the training dataset plus the settings above.
tft = TemporalFusionTransformer.from_dataset(train_dataset, **tft_settings)
print(f"Number of parameters in model: {tft.size()/1e3:.1f}k")


Number of parameters in model: 53.1k


In [16]:
from lightning.pytorch import loggers as pl_loggers

# TensorBoard logger writing its event files under the given Drive directory.
tb_logger = pl_loggers.TensorBoardLogger(
    save_dir="/content/drive/MyDrive/lighting_logs",
)


In [18]:
from lightning.pytorch import Trainer

import warnings
# Suppress warnings whose text matches the message below.
warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names, but StandardScaler was fitted with feature names",
)

# Train for five epochs, logging through the TensorBoard logger.
trainer = Trainer(logger=tb_logger, max_epochs=5)
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
   | Name                               | Type                            | Params | Mode 
------------------------------------------------------------------------------------------------
0  | loss                               | MultiLoss                       | 0      | train
1  | logging_metrics                    | ModuleList                      | 0      | train
2  | input_embeddings                   | MultiEmbedding                  | 0      | train
3  | prescalers       

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=5` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [None]:
# Run the model on the validation dataloader; as the last expression of the
# cell, the returned predictions are displayed as the cell output.
tft.predict(val_dataloader)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [None]:
# Step 1: Load the test CSV and add the same helper columns as for training
new_data = pd.read_csv("/content/drive/MyDrive/DSB/2A HEC/Time series/time_series_project/data/test_preprocessed.csv")
new_data["id"] = pd.to_datetime(new_data["id"])
new_data["location"] = "Montsouris"
new_data['time_idx'] = ((new_data['id'] - new_data['id'].min()).dt.total_seconds() // 3600).astype("int")

# Step 2: Create a TimeSeriesDataSet for the new data.
# Deriving it from `train_dataset` reuses the training configuration (targets,
# group ids, window lengths, covariates) and, crucially, the scalers and
# target normalizers fitted on the training data. Building a fresh dataset
# would rescale the inputs with statistics of the new data, which is not what
# the model was trained on.
# NOTE(review): this requires `new_data` to contain every column used by
# `train_dataset`, including the target columns — confirm for this file.
new_data_dataset = TimeSeriesDataSet.from_dataset(
    train_dataset,
    new_data,
    stop_randomization=True,
)

# Step 3: Create a DataLoader for the new data
new_data_loader = new_data_dataset.to_dataloader(train=False, batch_size=64, num_workers=4)

# Step 4: Use the model to predict
predictions = tft.predict(new_data_loader)
