More recent models, such as TSMixer, TFT and NHITS achieve better accuracy than LSTM in most settings.

In [1]:
!pip install neuralforecast ray[tune] pytorch-lightning utilsforecast matplotlib pandas mlforecast window_ops torch

Collecting neuralforecast
  Downloading neuralforecast-3.0.0-py3-none-any.whl.metadata (14 kB)
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (20 kB)
Collecting utilsforecast
  Downloading utilsforecast-0.2.12-py3-none-any.whl.metadata (7.6 kB)
Collecting mlforecast
  Downloading mlforecast-1.0.2-py3-none-any.whl.metadata (13 kB)
Collecting window_ops
  Downloading window_ops-0.0.15-py3-none-any.whl.metadata (6.8 kB)
Collecting ray[tune]
  Downloading ray-2.44.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting coreforecast>=0.0.6 (from neuralforecast)
  Downloading coreforecast-0.0.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting optuna (from neuralforecast)
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lig

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Set file paths
csv_path = '/content/drive/MyDrive/airkaz/selected_sensors2_cleaned.csv'
pipeline_path = '/content/drive/MyDrive/airkaz/MLForecastPipeline.py'

# Import pipeline module
import sys
sys.path.append('/content/drive/MyDrive/airkaz/')
from MLForecastPipeline import *

# Load CSV
import pandas as pd
selected_sensors_df = pd.read_csv(csv_path, index_col=0)


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt

class AttentiveDilatedRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size, dilation=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dilation = dilation

        self.rnn_cell = nn.GRUCell(input_size, hidden_size)
        self.attn = nn.Linear(hidden_size + input_size, 1)

    def forward(self, x_t, hidden_state):
        if hidden_state is None:
            hidden_state = torch.zeros(x_t.size(0), self.hidden_size, device=x_t.device)
        h_t = self.rnn_cell(x_t, hidden_state)
        attn_input = torch.cat([x_t, h_t], dim=-1)
        alpha = torch.sigmoid(self.attn(attn_input))
        h_t_attn = alpha * h_t + (1 - alpha) * hidden_state
        return h_t_attn

class StackedADRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dilations=(2, 4, 7)):
        super().__init__()
        self.cells = nn.ModuleList([
            AttentiveDilatedRNNCell(input_size if i == 0 else hidden_size, hidden_size, d)
            for i, d in enumerate(dilations)
        ])
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        h = None
        for cell in self.cells:
            outputs = []
            h = None
            for t in range(seq_len):
                h = cell(x[:, t, :], h)
                outputs.append(h.unsqueeze(1))
            x = torch.cat(outputs, dim=1)
        out = self.fc(x[:, -1, :])
        return out

In [None]:
# Load and prepare data
df = pd.read_csv('/content/drive/MyDrive/airkaz/selected_sensors2_cleaned.csv', index_col=0)

df = df.rename(columns={'full_date': 'ds', '2': 'y'})
df['ds'] = pd.to_datetime(df['ds'])
df['unique_id'] = 'sensor_2'

# Normalize
scaler = StandardScaler()
df['y_scaled'] = scaler.fit_transform(df[['y']])

# Windowing for univariate
input_size = 30
horizon = 1
X, y = [], []
for i in range(len(df) - input_size - horizon):
  X.append(df['y_scaled'].values[i:i+input_size])
  y.append(df['y_scaled'].values[i+input_size + horizon - 1])

X = torch.tensor(np.array(X), dtype=torch.float32).unsqueeze(-1)  # [samples, time, 1]
y = torch.tensor(np.array(y), dtype=torch.float32).unsqueeze(-1)  # [samples, 1]

# Train-test split
split = int(0.75 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Model and training setup
model = StackedADRNN(input_size=1, hidden_size=64, output_size=1)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.SmoothL1Loss()

# Training loop
train_metrics = []
test_metrics = []

for epoch in range(500):
  model.train()
  optimizer.zero_grad()
  y_pred = model(X_train)
  loss = criterion(y_pred, y_train)
  loss.backward()
  optimizer.step()
  train_metrics.append(loss.item())

  model.eval()
  with torch.no_grad():
    y_pred_test = model(X_test).squeeze().numpy()
    y_true = y_test.squeeze().numpy()
    y_pred_inv = scaler.inverse_transform(y_pred_test.reshape(-1, 1)).squeeze()
    y_true_inv = scaler.inverse_transform(y_true.reshape(-1, 1)).squeeze()
  test_metrics.append(mean_absolute_percentage_error(y_true_inv, y_pred_inv))

  if (epoch+1) % 10 == 0:
    print(f"Epoch {epoch+1}: Train Loss = {loss.item():.4f}, Test MAPE = {test_metrics[-1]:.2%}")

# Evaluation
model.eval()
with torch.no_grad():
  y_pred = model(X_test).squeeze().numpy()
  y_true = y_test.squeeze().numpy()
  y_pred_inv = scaler.inverse_transform(y_pred.reshape(-1, 1)).squeeze()
  y_true_inv = scaler.inverse_transform(y_true.reshape(-1, 1)).squeeze()

mape = mean_absolute_percentage_error(y_true_inv, y_pred_inv)
print(f"Test MAPE: {mape:.2%}")

# Plot
plt.figure(figsize=(12, 6))
plt.plot(y_true_inv, label='Actual')
plt.plot(y_pred_inv, label='Forecast')
plt.title(f"adRNN Forecast (MAPE: {mape:.2%})")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


Epoch 10: Train Loss = 0.5100, Test MAPE = 36.41%
Epoch 20: Train Loss = 0.3719, Test MAPE = 41.62%
Epoch 30: Train Loss = 0.3355, Test MAPE = 32.42%
Epoch 40: Train Loss = 0.3228, Test MAPE = 34.51%
Epoch 50: Train Loss = 0.3200, Test MAPE = 33.41%
Epoch 60: Train Loss = 0.3182, Test MAPE = 33.10%
Epoch 70: Train Loss = 0.3157, Test MAPE = 33.42%
Epoch 80: Train Loss = 0.3136, Test MAPE = 33.05%
Epoch 90: Train Loss = 0.3117, Test MAPE = 32.71%
Epoch 100: Train Loss = 0.3095, Test MAPE = 32.48%
Epoch 110: Train Loss = 0.3073, Test MAPE = 31.95%
Epoch 120: Train Loss = 0.3046, Test MAPE = 31.84%
Epoch 130: Train Loss = 0.3019, Test MAPE = 32.90%
Epoch 140: Train Loss = 0.2983, Test MAPE = 33.03%
Epoch 150: Train Loss = 0.2953, Test MAPE = 32.24%
Epoch 160: Train Loss = 0.2914, Test MAPE = 32.09%
Epoch 170: Train Loss = 0.2872, Test MAPE = 32.72%
Epoch 180: Train Loss = 0.2820, Test MAPE = 31.83%
Epoch 190: Train Loss = 0.2907, Test MAPE = 44.28%
Epoch 200: Train Loss = 0.3038, Test MAP

KeyboardInterrupt: 

In [13]:
df.shape[0] * 0.6

777.6

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, random_split
import optuna
import os

class AttentiveDilatedRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size, dilation=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dilation = dilation

        self.rnn_cell = nn.GRUCell(input_size, hidden_size)
        self.attn = nn.Linear(hidden_size + input_size, 1)

    def forward(self, x_t, hidden_state):
        if hidden_state is None:
            hidden_state = torch.zeros(x_t.size(0), self.hidden_size, device=x_t.device)
        h_t = self.rnn_cell(x_t, hidden_state)
        attn_input = torch.cat([x_t, h_t], dim=-1)
        alpha = torch.sigmoid(self.attn(attn_input))
        h_t_attn = alpha * h_t + (1 - alpha) * hidden_state
        return h_t_attn

class StackedADRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dilations):
        super().__init__()
        self.cells = nn.ModuleList([
            AttentiveDilatedRNNCell(input_size if i == 0 else hidden_size, hidden_size, d)
            for i, d in enumerate(dilations)
        ])
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        h = None
        for cell in self.cells:
            outputs = []
            h = None
            for t in range(seq_len):
                h = cell(x[:, t, :], h)
                outputs.append(h.unsqueeze(1))
            x = torch.cat(outputs, dim=1)
        out = self.fc(x[:, -1, :])
        return out

def prepare_dataset(df, input_size, horizon):
    X, y = [], []
    for i in range(len(df) - input_size - horizon):
        X.append(df['y_scaled'].values[i:i+input_size])
        y.append(df['y_scaled'].values[i+input_size:i+input_size+horizon])
    X = torch.tensor(np.array(X), dtype=torch.float32).unsqueeze(-1)
    y = torch.tensor(np.array(y), dtype=torch.float32)
    return TensorDataset(X, y)

def forecast_direct(model, series, input_size, horizon):
    model.eval()
    preds = []
    i = 0
    while i + input_size + horizon <= len(series):
        window = series[i:i+input_size]
        window_tensor = torch.tensor(np.array(window), dtype=torch.float32).unsqueeze(0).unsqueeze(-1)
        with torch.no_grad():
            forecast = model(window_tensor).squeeze().cpu().numpy()
        preds.extend(forecast)
        i += horizon
    return np.array(preds)

if __name__ == "__main__":
    df = pd.read_csv('/content/drive/MyDrive/airkaz/selected_sensors2_cleaned.csv', index_col=0)
    df = df.rename(columns={'full_date': 'ds', '2': 'y'})
    df['ds'] = pd.to_datetime(df['ds'])
    df['unique_id'] = 'sensor_2'
    NUM_TRIALS = 10
    TRAIN_TEST_SPLIT = 0.6

    scaler = StandardScaler()
    df['y_scaled'] = scaler.fit_transform(df[['y']])
    horizons = [7, 14, 30, 60, 90, 180, 365]
    total_trials = len(horizons) * NUM_TRIALS  # 10 trials per horizon
    print(f"Total combinations to explore: {total_trials}\n")

    results_summary = {}

    for horizon in horizons:
        print(f"=== Horizon: {horizon} ===")

        def objective(trial):
            input_size = trial.suggest_int("input_size", min(30, horizon), min(365, horizon * 2))
            hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128, 256])
            learning_rate = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
            dilations = trial.suggest_categorical("dilations", [(1, 2, 4), (2, 4, 7), (4, 8, 16)])

            dataset = prepare_dataset(df, input_size, horizon)
            train_size = int(TRAIN_TEST_SPLIT * len(dataset))
            val_size = len(dataset) - train_size
            train_ds, val_ds = random_split(dataset, [train_size, val_size])
            train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
            val_loader = DataLoader(val_ds, batch_size=64)

            model = StackedADRNN(input_size=1, hidden_size=hidden_size, output_size=horizon, dilations=dilations)
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
            criterion = nn.SmoothL1Loss()

            best_val_loss = float('inf')
            patience = 8
            trigger_times = 0

            for epoch in range(75):
                model.train()
                for xb, yb in train_loader:
                    optimizer.zero_grad()
                    preds = model(xb)
                    loss = criterion(preds, yb)
                    loss.backward()
                    optimizer.step()

                model.eval()
                val_losses = []
                with torch.no_grad():
                    for xb, yb in val_loader:
                        preds = model(xb)
                        loss = criterion(preds, yb)
                        val_losses.append(loss.item())
                avg_val_loss = np.mean(val_losses)

                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    trigger_times = 0
                else:
                    trigger_times += 1
                    if trigger_times >= patience:
                        break

            return best_val_loss

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=NUM_TRIALS)

        best_params = study.best_trial.params
        results_summary[horizon] = {
            "loss": study.best_value,
            "params": best_params
        }
        print(f"Best Loss for Horizon {horizon}: {study.best_value:.4f}")
        print(f"Best Params: {best_params}\n")

    print("=== Summary of Best Results per Horizon ===")
    all_results = []
    for h, res in results_summary.items():
        print(f"Horizon {h} → Loss: {res['loss']:.4f}, Params: {res['params']}")

        # Forecast with best model
        best_params = res['params']
        input_size = best_params['input_size']
        hidden_size = best_params['hidden_size']
        dilations = best_params['dilations']
        lr = best_params['lr']

        model = StackedADRNN(input_size=1, hidden_size=hidden_size, output_size=h, dilations=dilations)
        dataset = prepare_dataset(df, input_size, h)
        train_size = int(0.8 * len(dataset))
        test_ds = list(dataset)[train_size:]
        train_loader = DataLoader(dataset[:train_size], batch_size=64, shuffle=True)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.SmoothL1Loss()

        for epoch in range(30):
            model.train()
            for xb, yb in train_loader:
                optimizer.zero_grad()
                loss = criterion(model(xb), yb)
                loss.backward()
                optimizer.step()

        # Rolling forecast on test data
        full_series = df['y_scaled'].values
        test_series = full_series[-(input_size + h + 720):]  # make sure we have enough for evaluation
        preds_scaled = forecast_direct(model, test_series, input_size, h)
        preds = scaler.inverse_transform(preds_scaled.reshape(-1, 1)).squeeze()
        actual = df['y'].iloc[-len(preds):].values

        # Evaluate for different test lengths
        max_test_length = len(preds)
        test_lengths = list(range(30, 181, 30)) + [240, 300, 360, 480, 600, 720, max_test_length]
        test_lengths = [t for t in test_lengths if t <= max_test_length]

        error_dict = {}
        for l in test_lengths:
            error_dict[f"MAPE_{l}d"] = mean_absolute_percentage_error(actual[:l], preds[:l])

        all_results.append({
            "Horizon": h,
            "input_size": input_size,
            "hidden_size": hidden_size,
            "lr": lr,
            "dilations": str(dilations),
            **error_dict
        })

    # Save to CSV
    results_df = pd.DataFrame(all_results)
    results_df.to_csv("forecast_eval_by_test_length.csv", index=False)
    print("\nEvaluation results saved to forecast_eval_by_test_length.csv")



[I 2025-03-25 08:39:55,740] A new study created in memory with name: no-name-0af81df2-4505-481d-89df-117cbdacad25


Total combinations to explore: 70

=== Horizon: 7 ===


[I 2025-03-25 08:40:20,085] Trial 0 finished with value: 0.13477065414190292 and parameters: {'input_size': 14, 'hidden_size': 64, 'lr': 0.0019160187045090251, 'dilations': (1, 2, 4)}. Best is trial 0 with value: 0.13477065414190292.
[W 2025-03-25 08:41:21,988] Trial 1 failed with parameters: {'input_size': 14, 'hidden_size': 128, 'lr': 0.007817144445076493, 'dilations': (4, 8, 16)} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-8-9562150485ec>", line 121, in objective
    loss.backward()
  File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 626, in backward
    torch.autograd.backward(
  File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 347, in backward
    _engine_run_backward(
  File "/usr/local/lib/python3.

KeyboardInterrupt: 