In [1]:
print("Starting script...")

from modelling import *
from modelling import GRU

from pipeline import normalise_linear

import os
from pathlib import Path
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset

Starting script...

Running __init__.py for data pipeline...

Running __init__.py for data pipeline...
Pipeline initialized

Modelling package initialized



In [2]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device: ", device)

Device:  cuda


In [3]:
HABROK = bool(0)                  # set to True if using HABROK; it will print
                                  # all stdout to a .txt file to log progress
CITY_NAME = "Amsterdam"
BASE_DIR = Path.cwd()
MODEL_PATH = BASE_DIR / "results" / "models"
MINMAX_PATH = BASE_DIR.parent / "data" / "data_combined" / CITY_NAME.lower() / "contaminant_minmax.csv"

print("BASE_DIR: ", BASE_DIR)
print("MODEL_PATH: ", MODEL_PATH)
print("MINMAX_PATH: ", MINMAX_PATH)

torch.manual_seed(34)             # set seed for reproducibility

N_HOURS_U = 72                    # number of hours to use for input
N_HOURS_Y = 24                    # number of hours to predict
N_HOURS_STEP = 24                 # "sampling rate" in hours of the data; e.g. 24 
                                  # means sample an I/O-pair every 24 hours
                                  # the contaminants and meteorological vars
CONTAMINANTS = ['NO2', 'O3'] # 'PM10', 'PM25']
COMPONENTS = ['NO2', 'O3', 'PM10', 'PM25', 'SQ', 'WD', 'Wvh', 'dewP', 'p', 'temp']
WEATHER_COMP = ['SQ', 'WD', 'Wvh', 'dewP', 'p', 'temp']

BASE_DIR:  /home/nick/bachelor-project/forecasting_smog_DL_GNN/src
MODEL_PATH:  /home/nick/bachelor-project/forecasting_smog_DL_GNN/src/results/models
MINMAX_PATH:  /home/nick/bachelor-project/forecasting_smog_DL_GNN/data/data_combined/amsterdam/contaminant_minmax.csv


In [4]:
hp_gru = {
    'n_hours_u' : N_HOURS_U,
    'n_hours_y' : N_HOURS_Y,

    'model_class' : GRU, # changed to GRU
    'input_units' : 8, #train_dataset.__n_features_in__(),
    'hidden_layers' : 6,
    'hidden_units' : 128,
    # 'branches' : 2,  # predicting only no2 and o3
    'output_units' : 2, #train_dataset.__n_features_out__(),

    'Optimizer' : torch.optim.Adam,
    'lr_shared' : 1e-3,
    'scheduler' : torch.optim.lr_scheduler.ReduceLROnPlateau,
    'scheduler_kwargs' : {'mode' : 'min',
                          'factor' : 0.1,
                          'patience' : 3,
                          'cooldown' : 8,
                          'verbose' : True},
    'w_decay' : 1e-5,
    'loss_fn' : torch.nn.MSELoss(),

    'epochs' : 5000,
    'early_stopper' : EarlyStopper,
    'patience' : 15,
    'batch_sz' : 16,
    'k_folds' : 5,
}

## Utrecht model and evaluating at Amsterdam

In [5]:
with PrintManager('.', 'a', HABROK):
    print("\nPrinting model:")
    model_utrecht = GRU(hp_gru['n_hours_u'],
                 hp_gru['n_hours_y'],
                 hp_gru['input_units'],
                 hp_gru['hidden_layers'],
                 hp_gru['hidden_units'], 
                #  hp['branches'],
                 hp_gru['output_units'])
    print(model_utrecht)


Printing model:
GRU(
  (gru): GRU(8, 128, num_layers=6, batch_first=True)
  (dense): Linear(in_features=128, out_features=2, bias=True)
)


In [6]:
model_utrecht.load_state_dict(torch.load(f"{MODEL_PATH}/model_GRU_utrecht.pth"))

<All keys matched successfully>

In [7]:
def denormalize_then_normalize_with_utrecht(
    df, rotterdam_minmax_path, utrecht_minmax_path, contaminants=["NO2", "O3"]
):
    """
    Denormalizes Rotterdam data and then normalizes it with Utrecht parameters
    
    :param df: DataFrame with normalized Rotterdam data
    :param rotterdam_minmax_path: Path to Rotterdam's min/max values
    :param utrecht_minmax_path: Path to Utrecht's min/max values
    :param contaminants: List of contaminants to process
    :return: DataFrame normalized with Utrecht parameters
    """
    # Get min/max values
    rotterdam_params = retrieve_min_max(rotterdam_minmax_path, conts=contaminants)
    utrecht_params = retrieve_min_max(utrecht_minmax_path, conts=contaminants)
    
    # Create a copy to avoid modifying original
    df_copy = df.copy()
    
    # Process each contaminant
    for cont in contaminants:
        if cont in df_copy.columns:
            # Step 1: Denormalize using Rotterdam parameters
            r_min = rotterdam_params[f"{cont}_min"]
            r_max = rotterdam_params[f"{cont}_max"]
            denormalized = df_copy[cont] * (r_max - r_min) + r_min

            # Step 2: Normalize using Utrecht parameters
            u_min = utrecht_params[f"{cont}_min"]
            u_max = utrecht_params[f"{cont}_max"]
            df_copy[cont] = (denormalized - u_min) / (u_max - u_min)
            print("Normalized with Utrecht parameters")
    
    return df_copy

In [8]:
def normalize_dataset_for_cross_city(city_name, calc_new_params=False):
    """
    Normalizes a city's dataset using Utrecht's normalization parameters
    
    :param city_name: Name of the city whose data needs to be normalized
    :return: Tuple of normalized input and output frames
    """
    # Define paths
    city_minmax = BASE_DIR.parent / "data" / "data_combined" / city_name.lower() / "contaminant_minmax.csv"
    utrecht_minmax = Path(MINMAX_PATH)
    
    # Get all dataframes
    train_input_frames = get_dataframes('train', 'u', city_name)
    train_output_frames = get_dataframes('train', 'y', city_name)
    val_input_frames = get_dataframes('val', 'u', city_name)
    val_output_frames = get_dataframes('val', 'y', city_name)
    test_input_frames = get_dataframes('test', 'u', city_name)
    test_output_frames = get_dataframes('test', 'y', city_name)

    # Combine frames into list structure
    input_frames = [train_input_frames, val_input_frames, test_input_frames]
    output_frames = [train_output_frames, val_output_frames, test_output_frames]

    # Transform each nested frame to Utrecht's normalization space
    re_normalized_input_frames = []
    for frame_list in input_frames:
        normalized_list = []
        for frame in frame_list:
            normalized_frame = denormalize_then_normalize_with_target(frame, city_minmax, utrecht_minmax, no_target=calc_new_params)
            normalized_list.append(normalized_frame)
        re_normalized_input_frames.append(normalized_list)

    re_normalized_output_frames = []
    for frame_list in output_frames:
        normalized_list = []
        for frame in frame_list:
            normalized_frame = denormalize_then_normalize_with_target(frame, city_minmax, utrecht_minmax, no_target=calc_new_params)
            normalized_list.append(normalized_frame)
        re_normalized_output_frames.append(normalized_list)
    
    return re_normalized_input_frames, re_normalized_output_frames

In [9]:
re_normalized_input_frames, re_normalized_output_frames = normalize_dataset_for_cross_city('Amsterdam')
train_input_frames_ams, val_input_frames_ams, test_input_frames_ams = re_normalized_input_frames
train_output_frames_ams, val_output_frames_ams, test_output_frames_ams = re_normalized_output_frames


In [10]:
train_ams_dataset = TimeSeriesDataset(
    train_input_frames_ams,
    train_output_frames_ams,
    5,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)


val_ams_dataset = TimeSeriesDataset(
    val_input_frames_ams,
    val_output_frames_ams,
    3,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)

test_ams_dataset = TimeSeriesDataset(
    test_input_frames_ams,
    test_output_frames_ams,
    3,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)

In [11]:
full_dataset = ConcatDataset([train_ams_dataset, val_ams_dataset, test_ams_dataset])
test_loader = DataLoader(full_dataset, batch_size=16, shuffle=False)
loss_fn = nn.MSELoss()  # Instantiate the loss function
test_error = test(model_utrecht, loss_fn, test_loader)
test_error

0.007195948333180738

In [12]:
print("\nRMSE Test set:")
print_dict_vertically_root(
    test_separately(model_utrecht, nn.MSELoss(), test_loader, True, MINMAX_PATH, components=["NO2", "O3"])
)


RMSE Test set:
NO2: 15.922462916585815
O3 : 9.596654612042402


## Utrecht model and evaluating Rotterdam

In [13]:
re_normalized_input_frames, re_normalized_output_frames = normalize_dataset_for_cross_city('Rotterdam')
train_input_frames_rot, val_input_frames_rot, test_input_frames_rot = re_normalized_input_frames
train_output_frames_rot, val_output_frames_rot, test_output_frames_rot = re_normalized_output_frames


In [14]:
train_rot_dataset = TimeSeriesDataset(
    train_input_frames_rot,
    train_output_frames_rot,
    5,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)


val_rot_dataset = TimeSeriesDataset(
    val_input_frames_rot,
    val_output_frames_rot,
    3,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)

test_rot_dataset = TimeSeriesDataset(
    test_input_frames_rot,
    test_output_frames_rot,
    3,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)

In [15]:
full_rot_dataset = ConcatDataset([train_rot_dataset, val_rot_dataset, test_rot_dataset])

test_loader = DataLoader(full_rot_dataset, batch_size=16, shuffle=False)
loss_fn = nn.MSELoss()  # Instantiate the loss function
test_error = test(model_utrecht, loss_fn, test_loader)
test_error

0.007277254849883183

In [16]:
print("\nRMSE Test set:")
print_dict_vertically_root(
    test_separately(model_utrecht, nn.MSELoss(), test_loader, True, MINMAX_PATH, components=["NO2", "O3"])
)


RMSE Test set:
NO2: 15.282571441015362
O3 : 11.516198351493946


In [17]:
test_input_frames = get_dataframes('test', 'u', 'Utrecht')
test_output_frames = get_dataframes('test', 'y', 'Utrecht')

In [18]:
test_utrecht_dataset = TimeSeriesDataset(
    test_input_frames,
    test_output_frames,
    3,
    N_HOURS_U,
    N_HOURS_Y,
    N_HOURS_STEP,
)


In [19]:
utrecht_test_loader = DataLoader(test_utrecht_dataset, batch_size=16, shuffle=False)
one_model_dataset = ConcatDataset([full_dataset, full_rot_dataset])