# 1. Load Libraries

In [148]:
import pandas as pd
import numpy as np
import pytorch_forecasting as pf
from tabulate import tabulate
from tqdm import tqdm
from torch.utils.data import random_split, TensorDataset
import torch
import matplotlib.pyplot as plt
from torch.utils.data import (ConcatDataset, DataLoader, Dataset, Subset,
                              random_split)
from torchvision import datasets, transforms
import seaborn as sns
import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.tuner import Tuner
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
import pytorch_forecasting as pf
from pytorch_forecasting.metrics import QuantileLoss


In [149]:
path = r"C:\Users\User\Documents\Semester 07\EE4750 - Data Analytics in Power Systems\WindPowerForecastingData TASK.xlsx\WindPowerForecastingData TASK.xlsx"
data = pd.read_excel(path, engine='openpyxl')
print(tabulate(data.head(), headers='keys', tablefmt='psql'))

+----+---------------+-------------+---------+-----------+---------+-----------+
|    | TIMESTAMP     |   TARGETVAR |     U10 |       V10 |    U100 |      V100 |
|----+---------------+-------------+---------+-----------+---------+-----------|
|  0 | 20120101 1:00 |   0         | 2.1246  | -2.68197  | 2.86428 | -3.66608  |
|  1 | 20120101 2:00 |   0.0548791 | 2.52169 | -1.79696  | 3.34486 | -2.46476  |
|  2 | 20120101 3:00 |   0.110234  | 2.67221 | -0.822516 | 3.50845 | -1.21409  |
|  3 | 20120101 4:00 |   0.165116  | 2.4575  | -0.143642 | 3.21523 | -0.355546 |
|  4 | 20120101 5:00 |   0.15694   | 2.2459  |  0.389576 | 2.95768 |  0.332701 |
+----+---------------+-------------+---------+-----------+---------+-----------+


In [150]:
data['TIMESTAMP'] = pd.to_datetime(data['TIMESTAMP'])
data = data.set_index('TIMESTAMP')
# Add incremental ID
data['ID'] = range(1, len(data)+1)
print(tabulate(data.head(), headers='keys', tablefmt='psql'))

+---------------------+-------------+---------+-----------+---------+-----------+------+
| TIMESTAMP           |   TARGETVAR |     U10 |       V10 |    U100 |      V100 |   ID |
|---------------------+-------------+---------+-----------+---------+-----------+------|
| 2012-01-01 01:00:00 |   0         | 2.1246  | -2.68197  | 2.86428 | -3.66608  |    1 |
| 2012-01-01 02:00:00 |   0.0548791 | 2.52169 | -1.79696  | 3.34486 | -2.46476  |    2 |
| 2012-01-01 03:00:00 |   0.110234  | 2.67221 | -0.822516 | 3.50845 | -1.21409  |    3 |
| 2012-01-01 04:00:00 |   0.165116  | 2.4575  | -0.143642 | 3.21523 | -0.355546 |    4 |
| 2012-01-01 05:00:00 |   0.15694   | 2.2459  |  0.389576 | 2.95768 |  0.332701 |    5 |
+---------------------+-------------+---------+-----------+---------+-----------+------+


In [151]:
from sklearn.calibration import LabelEncoder
from sklearn.discriminant_analysis import StandardScaler


def fit_preprocessing(train, real_columns, categorical_columns):
    real_scalers = StandardScaler().fit(train[real_columns].values)

    categorical_scalers = {}
    num_classes = []
    for col in categorical_columns:
        srs = train[col].apply(str) 
        categorical_scalers[col] = LabelEncoder().fit(srs.values)
        num_classes.append(srs.nunique())

    return real_scalers, categorical_scalers


def transform_inputs(df, real_scalers, categorical_scalers, real_columns, categorical_columns):
    out = df.copy()
    out[real_columns] = real_scalers.transform(df[real_columns].values)

    for col in categorical_columns:
        string_df = df[col].apply(str)
        out[col] = categorical_scalers[col].transform(string_df)

    return out

real_columns = ['TARGETVAR', 'U10', 'V10', 'U100', 'V100']
categorical_columns = []   # add any categorical column names if you have them

# Fit on the training data
real_scalers, categorical_scalers = fit_preprocessing(
    data, real_columns, categorical_columns
)

In [152]:
# 1. Preprocess as DataFrames
real_scalers, categorical_scalers = fit_preprocessing(
    data, real_columns, categorical_columns
)
scaled_df = transform_inputs(data, real_scalers, categorical_scalers,
                             real_columns, categorical_columns)

# 2. Split the *scaled* DataFrame
train_df, val_df, test_df = np.split(
    scaled_df.sample(frac=1, random_state=42),
    [int(.6*len(scaled_df)), int(.8*len(scaled_df))]
)

# 3. Convert to tensors
train_dataset = TensorDataset(torch.tensor(train_df.values, dtype=torch.float32))
val_dataset   = TensorDataset(torch.tensor(val_df.values,   dtype=torch.float32))
test_dataset  = TensorDataset(torch.tensor(test_df.values,  dtype=torch.float32))


  return bound(*args, **kwds)


In [153]:
# Global variables

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
NUM_EPOCHS = 2
DROPOUT = 0.3
LEARNING_RATE = 0.001
ENCODER_STEPS = 175
DECODER_STEPS = 180
HIDDEN_LAYER_SIZE = 80
EMBEDDING_DIMENSION = 8
NUM_LSTM_LAYERS = 1
NUM_ATTENTION_HEADS = 2
QUANTILES = [0.1, 0.5, 0.9]



In [154]:
# Dataset variables
input_columns = ['U10', 'V10', 'U100', 'V100']
target_column = 'TARGETVAR'
time_column = 'TIMESTAMP'
col_to_idx = {col: idx for idx, col in enumerate(input_columns)}

In [155]:
params = {
    "quantiles": QUANTILES,
    "batch_size": BATCH_SIZE,
    "dropout": DROPOUT,
    "device": DEVICE,
    "hidden_layer_size": HIDDEN_LAYER_SIZE,
    "num_lstm_layers": NUM_LSTM_LAYERS,
    "embedding_dim": EMBEDDING_DIMENSION,
    "encoder_steps": ENCODER_STEPS,
    "num_attention_heads": NUM_ATTENTION_HEADS,
    "col_to_idx": col_to_idx,
    "time_dependent_continuous": input_columns,
    "known_time_dependent": input_columns,
    "observed_time_dependent": target_column
}

In [156]:
import numpy as np
from torch.utils.data import Dataset

class TFT_Dataset(Dataset):
    def __init__(self, data, entity_column, time_column, target_column, 
                 input_columns, encoder_steps, decoder_steps):
        """
        data (pd.DataFrame): dataframe containing raw data
        entity_column (str): name of column containing entity data
        time_column (str): name of column containing date data
        target_column (str): name of column we need to predict
        input_columns (list): list of string names of columns used as input
        encoder_steps (int): number of known past time steps used for forecast
        decoder_steps (int): number of input time steps used for each forecast date
        """
        
        self.encoder_steps = encoder_steps
        self.decoder_steps = decoder_steps
        inputs, outputs, entity, time = [], [], [], []

        for e_val in data[entity_column].unique():
            entity_group = data[data[entity_column] == e_val]
            data_time_steps = len(entity_group)

            if data_time_steps >= decoder_steps:
                x = entity_group[input_columns].values.astype(np.float32)
                inputs.append(
                    np.stack([x[i:data_time_steps - (decoder_steps - 1) + i, :] for i in range(decoder_steps)], axis=1)
                )

                y = entity_group[[target_column]].values.astype(np.float32)
                outputs.append(
                    np.stack([y[i:data_time_steps - (decoder_steps - 1) + i, :] for i in range(decoder_steps)], axis=1)
                )

                e_arr = entity_group[[entity_column]].values.astype(np.float32)
                entity.append(
                    np.stack([e_arr[i:data_time_steps - (decoder_steps - 1) + i, :] for i in range(decoder_steps)], axis=1)
                )

                t = entity_group[[time_column]].values.astype(np.int64)
                time.append(
                    np.stack([t[i:data_time_steps - (decoder_steps - 1) + i, :] for i in range(decoder_steps)], axis=1)
                )

        # Concatenate all entities
        self.inputs = np.concatenate(inputs, axis=0)
        self.outputs = np.concatenate(outputs, axis=0)[:, encoder_steps:, :]
        self.entity = np.concatenate(entity, axis=0)
        self.time = np.concatenate(time, axis=0)
        self.active_inputs = np.ones_like(self.outputs)

    def __getitem__(self, index):
        return {
            'inputs': self.inputs[index],
            'outputs': self.outputs[index],
            'active_entries': self.active_inputs[index],
            'time': self.time[index],
            'identifier': self.entity[index]
        }

    def __len__(self):
        return self.inputs.shape[0]


In [157]:
data['TIMESTAMP'] = pd.to_datetime(data['TIMESTAMP'])
data["ID"] = 0
data = data.set_index('TIMESTAMP')

training_data = TFT_Dataset(train_df.reset_index(), 
                            entity_column="ID",   # <--- add this
                            time_column=time_column, 
                            target_column=target_column, 
                            input_columns=input_columns, 
                            encoder_steps=ENCODER_STEPS, 
                            decoder_steps=DECODER_STEPS)

validation_data = TFT_Dataset(val_df.reset_index(), 
                              entity_column="ID", 
                              time_column=time_column, 
                              target_column=target_column, 
                              input_columns=input_columns, 
                              encoder_steps=ENCODER_STEPS, 
                              decoder_steps=DECODER_STEPS)

testing_data = TFT_Dataset(test_df.reset_index(), 
                           entity_column="ID", 
                           time_column=time_column, 
                           target_column=target_column, 
                           input_columns=input_columns, 
                           encoder_steps=ENCODER_STEPS, 
                           decoder_steps=DECODER_STEPS)


KeyError: 'TIMESTAMP'