In [1]:
import numpy as np
import pandas as pd
import torch

def compute_metrics(all_predictions, all_actuals, scaler, Normalization=False):
    """Compute error metrics between predicted and actual tensors.

    MAE and RMSE are first computed directly on the tensors and printed
    (labelled "scaled space"). The returned metrics are computed after the
    optional inverse scaling.

    Args:
        all_predictions: torch.Tensor of model outputs.
        all_actuals: torch.Tensor of ground-truth values, same shape as
            ``all_predictions``.
        scaler: object exposing ``inverse_transform``; only used when
            ``Normalization`` is True. Expected to be already fitted by the
            caller.
        Normalization: when True, both arrays are passed through
            ``scaler.inverse_transform`` before the returned metrics are
            computed.

    Returns:
        pandas.DataFrame with a single row and the columns
        ``MAE``, ``RMSE``, ``NMAE`` and ``NRMSE``. ``NMAE`` and ``NRMSE`` are
        the errors divided by ``max(actuals) - min(actuals)``; they are NaN
        when that range is not strictly positive (e.g. constant actuals).
    """
    mae = torch.mean(torch.abs(all_predictions - all_actuals)).item()
    rmse = torch.sqrt(torch.mean((all_predictions - all_actuals) ** 2)).item()
    print(f"MAE in scaled space: {mae:.4f}")
    print(f"RMSE in scaled space: {rmse:.4f}")

    # Move to host memory before converting: Tensor.numpy() only works on CPU
    # tensors, and evaluation results may still live on an accelerator.
    all_predictions_np = all_predictions.detach().cpu().numpy()
    all_actuals_np = all_actuals.detach().cpu().numpy()

    if Normalization:
        all_predictions_original = scaler.inverse_transform(all_predictions_np)
        all_actuals_original = scaler.inverse_transform(all_actuals_np)
    else:
        all_predictions_original = all_predictions_np
        all_actuals_original = all_actuals_np

    # Metrics in the (possibly inverse-transformed) output scale.
    errors = all_predictions_original - all_actuals_original
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))

    range_actuals = np.max(all_actuals_original) - np.min(all_actuals_original)
    if range_actuals > 0:
        nmae = mae / range_actuals
        nrmse = rmse / range_actuals
    else:
        # A zero (or NaN) range makes the normalised metrics undefined; report
        # NaN instead of dividing by zero and emitting inf plus a warning.
        nmae = np.nan
        nrmse = np.nan

    metrics = {'MAE': [mae], 'RMSE': [rmse], 'NMAE': [nmae], 'NRMSE': [nrmse]}
    return pd.DataFrame(metrics)

In [2]:
from utils.sequentialdataset import SequentialDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader


def split_data(df, config_model):
    """Build train/validation/test data loaders from a two-column time series.

    Args:
        df: DataFrame with exactly two columns, in the order
            (timestamp column, target column).
        config_model: dict providing ``test_size`` (fraction of rows held out
            for testing), ``input_seq_length``, ``output_seq_length`` and
            ``batch_size``.

    Returns:
        Tuple ``(train_loader, test_loader, val_loader, test_dataset, scaler)``.
        Note the order: the test loader comes before the validation loader.
        ``scaler`` is the MinMaxScaler fitted on the non-test rows.
    """
    ds, target = df.columns
    test_size = config_model['test_size']
    input_seq_length = config_model['input_seq_length']
    output_seq_length = config_model['output_seq_length']
    batch_size = config_model['batch_size']

    # Chronological split: the trailing `test_size` fraction of rows is the
    # test set; everything before it is shared by training and validation.
    train_data, test_data = train_test_split(df, test_size=test_size, shuffle=False)

    train_data_values = train_data[target].values.reshape(-1, 1)
    test_data_values = test_data[target].values.reshape(-1, 1)

    train_timestamps = train_data[ds].tolist()
    test_timestamps = test_data[ds].tolist()

    # Fit the scaler on the train+validation rows only, then reuse it for the
    # test rows so no test statistics leak into the scaling.
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_data_scaled = scaler.fit_transform(train_data_values)
    test_data_scaled = scaler.transform(test_data_values)

    # Convert back to flat lists for dataset compatibility.
    train_data_scaled = train_data_scaled.flatten().tolist()
    test_data_scaled = test_data_scaled.flatten().tolist()

    train_val_dataset = SequentialDataset(train_data_scaled, train_timestamps, input_window=input_seq_length, output_window=output_seq_length)
    test_dataset = SequentialDataset(test_data_scaled, test_timestamps, input_window=input_seq_length, output_window=output_seq_length)

    # Training gets the first (1 - 2 * test_size) share, validation the rest.
    # The boundary is derived from the raw row count but is used as an index
    # into the windowed dataset, which may contain fewer items than rows
    # (NOTE(review): confirm SequentialDataset's length semantics). Clamp it
    # so Subset never receives out-of-range or negative indices.
    train_size = int(df.shape[0] * (1-2*test_size))
    train_size = max(0, min(train_size, len(train_val_dataset)))
    train_dataset = torch.utils.data.Subset(train_val_dataset, range(0, train_size))
    val_dataset = torch.utils.data.Subset(train_val_dataset, range(train_size, len(train_val_dataset)))

    # Order matters for sequence data, so none of the loaders shuffle.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, val_loader, test_dataset, scaler

In [3]:
from utils.plot_helper import plot_sequence_predictions
from datetime import date
import os

def save_seq_predictions(all_predictions, all_actuals, all_inputs, test_dataset, scaler, output_length, result_dir, target, normalization=True):
    """Write one CSV per test sequence and plot every 14th sequence.

    Args:
        all_predictions, all_actuals, all_inputs: torch.Tensors indexed by
            sequence number along their first axis.
        test_dataset: dataset providing ``__len__`` and
            ``get_datetime_sequences(k)``, which returns the input and output
            timestamps of sequence ``k``.
        scaler: object exposing ``inverse_transform``; used only when
            ``normalization`` is True.
        output_length: accepted for interface compatibility; not used here.
        result_dir: root directory under which images and CSVs are written.
        target: name used as the last path component and passed to the plot
            helper.
        normalization: when True, all three arrays are inverse-transformed
            before being written.

    Side effects:
        Creates ``{result_dir}/images/TSTransformer/{today}/{target}`` and
        ``{result_dir}/data/{today}_TSTransformer/{target}``, writes
        ``NNN.csv`` files into the latter and calls
        ``plot_sequence_predictions`` with the former.
    """
    run_date = date.today()

    image_dir = f'{result_dir}/images/TSTransformer/{run_date}/{target}'
    os.makedirs(image_dir, exist_ok=True)

    csv_dir = f'{result_dir}/data/{run_date}_TSTransformer/{target}'
    os.makedirs(csv_dir, exist_ok=True)

    def _to_host_array(tensor):
        # Detach from autograd and copy to CPU before the NumPy conversion.
        return tensor.detach().cpu().numpy()

    pred_arr = _to_host_array(all_predictions)
    actual_arr = _to_host_array(all_actuals)
    input_arr = _to_host_array(all_inputs)

    if normalization:
        pred_arr, actual_arr, input_arr = (
            scaler.inverse_transform(arr) for arr in (pred_arr, actual_arr, input_arr)
        )

    for idx in range(len(test_dataset)):
        history = input_arr[idx]
        truth = actual_arr[idx]
        forecast = pred_arr[idx]

        x_stamps, y_stamps = test_dataset.get_datetime_sequences(idx)
        x_index = pd.to_datetime(x_stamps)
        y_index = pd.to_datetime(y_stamps)
        # Input timestamps followed by output timestamps, reduced to dates.
        timestamps = x_index.append(y_index).date

        # Both value columns start with the input window so the two series
        # share the same history and differ only over the forecast horizon.
        frame = pd.DataFrame({
            'timestamps': timestamps,
            'actual': np.round(np.append(history, truth), 3),
            'predict': np.round(np.append(history, forecast), 3),
        })
        frame.to_csv(f"{csv_dir}/{idx:03d}.csv", index=False)

        # Only every 14th sequence is plotted.
        if idx % 14 != 0:
            continue
        plot_sequence_predictions(history, truth, forecast, timestamps, target, image_dir)

In [4]:
from scripts.pvtw_tsf_transformer import PVTWTimeSeriesTransformer
def window_analysis(df, config_model, feature, result_data_path):
    """Train and evaluate one model per forecast window and save the metrics.

    For each window in 7, 14, 21, 28, 35 the model is trained with
    ``input_seq_length = 2 * window`` and ``output_seq_length = window``,
    saved under ``./results/models/`` and evaluated on the test loader.

    Args:
        df: two-column DataFrame accepted by ``split_data``.
        config_model: base model configuration. It is not modified; each
            window works on its own copy.
        feature: name used in the model and CSV file names.
        result_data_path: directory that receives
            ``{feature}_TST_WindowAnalysis.csv``.

    Returns:
        None. The metrics table (one row per window, indexed by window size)
        is written to CSV.
    """
    windows = list(range(7, 42, 7))
    all_metrics = []

    for window in windows:
        # Copy before overriding the sequence lengths: writing into the
        # caller's dict would leak the last window's lengths into whatever
        # the caller trains next with the same config.
        window_config = dict(config_model)
        window_config['input_seq_length'] = 2 * window
        window_config['output_seq_length'] = window

        train_loader, test_loader, val_loader, _test_dataset, scaler = split_data(df, window_config)

        model_path = f'./results/models/TSTransformer_{feature}_{window:03d}.pth'
        pvtw_model = PVTWTimeSeriesTransformer(window_config)
        pvtw_model.fit(train_loader, val_loader)
        pvtw_model.save_model(model_path)

        all_predictions, all_actuals, _all_inputs = pvtw_model.evaluate(test_loader)
        metrics = compute_metrics(all_predictions, all_actuals, scaler, Normalization=True)
        all_metrics.append(metrics)

    metrics_df = pd.concat(all_metrics, ignore_index=True)
    metrics_df.index = windows
    metrics_df.round(6).to_csv(f'{result_data_path}/{feature}_TST_WindowAnalysis.csv')

In [None]:
import pandas as pd
import os
from datetime import date
from scripts.pvtw_tsf_transformer import PVTWTimeSeriesTransformer

# --- Paths -----------------------------------------------------------------
data_path = './data/processed/'
result_dir = './results/'
today = date.today()
result_data_path = f'{result_dir}/data/{today}_TSTransformer/'
os.makedirs(result_data_path, exist_ok=True)

# --- Experiment settings ---------------------------------------------------
gra = 'daily'                        # granularity tag used in file names
features = ['fatalities', 'event']   # one model is trained per feature
output_length = 28                   # forecast horizon; input window is twice this
all_metrics = []

config_model = {'input_seq_length': 2*output_length, 'output_seq_length': output_length,
                'batch_size': 32, 'epochs': 50,
                "input_dim": 1, "output_dim": 1,
                "model_dim": 64, "num_heads": 4, "ff_dim": 128,
                "num_encoder_layers": 3, "num_decoder_layers": 3,
                "threshold": 0, "dropout": 0.1, "test_size": 0.15
                }

# --- Train, evaluate and export results for each feature --------------------
for feature in features:

    filename = f"{data_path}ts_pvtw_{gra}_{feature}.csv"

    df = pd.read_csv(filename)
    ds, target = df.columns
    df[ds] = pd.to_datetime(df[ds])

    # Per-feature threshold: 75th percentile of the target column.
    threshold = df[target].quantile(0.75)
    config_model['threshold'] = threshold
    # The original code wrote only the misspelled key below, so the
    # "threshold" entry stayed at 0. It is still written in case code outside
    # this notebook reads it. NOTE(review): drop once confirmed unused.
    config_model['threhold'] = threshold

    train_loader, test_loader, val_loader, test_dataset, scaler = split_data(df, config_model)

    model_path = f'./results/models/TSTransformer_{feature}.pth'
    pvtw_model = PVTWTimeSeriesTransformer(config_model)
    pvtw_model.fit(train_loader, val_loader)
    pvtw_model.save_model(model_path)

    all_predictions, all_actuals, all_inputs = pvtw_model.evaluate(test_loader)
    metrics = compute_metrics(all_predictions, all_actuals, scaler, Normalization=True)
    all_metrics.append(metrics)

    save_seq_predictions(all_predictions, all_actuals, all_inputs, test_dataset, scaler, output_length, result_dir, feature, normalization=True)

    # Hand over a copy: the window sweep overrides the sequence lengths, and
    # those overrides must not carry over to the next feature's main model.
    window_analysis(df, dict(config_model), feature, result_data_path)

# --- Summary table: one row per feature -------------------------------------
metrics_df = pd.concat(all_metrics, ignore_index=True)
metrics_df.index = [f"{gra}--{feature}" for feature in features]
metrics_df.round(6).to_csv(f'{result_data_path}/metrics_TSTransformer.csv')