In [1]:
import os
# Move the working directory two levels up so that the relative paths used in
# later cells ("data/processed/...", "reports/output/...") resolve.
# NOTE(review): the target is relative to the current working directory, so
# re-running this cell in the same kernel moves a further two levels up.
# Presumably "../../" is the project root when the kernel starts in the
# notebook's own folder — confirm.
os.chdir("../../")


import os
import shutil
import numpy as np
import pandas as pd
from pathlib import Path
from itertools import cycle
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from tqdm.autonotebook import tqdm
from src.utils import plotting_utils
from src.transforms.target_transformations import AutoStationaryTransformer
import plotly.io as pio
import warnings
import logging

# Description: This notebook contains the code for the second experiment in the project:
# forecasting COVID-19 occupied mechanical-ventilation beds (covidOccupiedMVBeds) using
# various RNN models, with hyperparameter tuning via Simulated Annealing.

# Single source of truth for every random seed used in this notebook.
SEED = 42

# Set seeds for reproducibility (Lightning, torch CPU, NumPy and, when present, all CUDA devices)
pl.seed_everything(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
torch.set_float32_matmul_precision('high')

# Set default plotly template
pio.templates.default = "plotly_white"

# Ignore warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and pandas warnings.
warnings.filterwarnings("ignore")

# Set logging configuration (timestamp - level - message, INFO and above)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

Global seed set to 42


In [2]:

# Load and Prepare Data
data_path = Path("data/processed/merged_nhs_covid_data.csv")
data = pd.read_csv(data_path).drop("Unnamed: 0", axis=1)
data["date"] = pd.to_datetime(data["date"])

# Area whose rows are kept for the rest of the notebook
selected_area = "East of England"

# Columns removed before feature engineering
columns_to_drop = [
    "areaName",
    "areaCode",
    "cumAdmissions",
    "cumulative_confirmed",
    "cumulative_deceased",
    "population",
    "latitude",
    "longitude",
    "epi_week",
]

# Data Processing
# Take an explicit copy of the filtered rows so the steps below never operate
# on a slice of `data` (the pattern behind pandas' SettingWithCopyWarning), and
# build the result in one chain so re-running the cell always starts from `data`.
# "date" is already datetime64 from the conversion above, so it is not converted again.
data_filtered = (
    data.loc[data["areaName"] == selected_area]
    .copy()
    .sort_values(by=["date", "areaName"])
    .drop(columns=columns_to_drop)
)

In [3]:
def add_rolling_features(df, window_size, columns, agg_funcs=None):
    """Return a copy of ``df`` extended with rolling-window aggregate columns.

    For every column in ``columns`` and every aggregation in ``agg_funcs`` a
    new column named ``"{column}_rolling_{window_size}_{func}"`` is added.
    Rows that contain a missing value in *any* column (including the leading
    rows where the rolling window is not yet full) are then dropped.

    The input frame is not modified; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data containing every name listed in ``columns``.
    window_size : int or offset
        Window passed to ``Series.rolling``.
    columns : iterable of str
        Columns to aggregate.
    agg_funcs : list of str, optional
        Aggregation names passed to ``Rolling.agg``. Defaults to ``["mean"]``.

    Returns
    -------
    tuple
        ``(frame, added_features)`` where ``added_features`` maps each source
        column to the list of rolling column names created for it.
    """
    if agg_funcs is None:
        agg_funcs = ["mean"]

    # Work on a copy so the caller's frame is never mutated.
    result = df.copy()
    added_features = {}
    for column in columns:
        # The rolling view is the same for every aggregation of this column.
        roller = result[column].rolling(window_size)
        for func in agg_funcs:
            roll_col_name = f"{column}_rolling_{window_size}_{func}"
            result[roll_col_name] = roller.agg(func)
            added_features.setdefault(column, []).append(roll_col_name)

    return result.dropna(), added_features

# Rolling-window settings: window length, source columns and aggregations
window_size = 7
columns_to_roll = [
    "hospitalCases",
    "newAdmissions",
    "new_confirmed",
    "new_deceased",
]
agg_funcs = ["mean", "std"]

# Add one rolling column per (source column, aggregation) pair
data_filtered, added_features = add_rolling_features(
    df=data_filtered,
    window_size=window_size,
    columns=columns_to_roll,
    agg_funcs=agg_funcs,
)

# Report the generated column names, grouped by their source column
for column, features in added_features.items():
    print(f"{column}: {', '.join(features)}")

hospitalCases: hospitalCases_rolling_7_mean, hospitalCases_rolling_7_std
newAdmissions: newAdmissions_rolling_7_mean, newAdmissions_rolling_7_std
new_confirmed: new_confirmed_rolling_7_mean, new_confirmed_rolling_7_std
new_deceased: new_deceased_rolling_7_mean, new_deceased_rolling_7_std


In [4]:

def add_lags(data, lags, features):
    """Return a copy of ``data`` extended with lagged versions of ``features``.

    For every feature and every lag a new column ``"{feature}_lag_{lag}"`` is
    added, holding the feature shifted down by ``lag`` rows. The first ``lag``
    rows of each new column are therefore missing; they are NOT dropped here.

    The input frame is not modified; use the returned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data containing every name listed in ``features``.
    lags : iterable of int
        Row offsets passed to ``Series.shift``.
    features : iterable of str
        Columns to lag.

    Returns
    -------
    tuple
        ``(frame, added_features)`` where ``added_features`` is the list of
        new column names, ordered by feature and then by lag.
    """
    # Work on a copy so the caller's frame is never mutated.
    lagged = data.copy()
    added_features = []
    for feature in features:
        for lag in lags:
            new_feature = f"{feature}_lag_{lag}"
            lagged[new_feature] = lagged[feature].shift(lag)
            added_features.append(new_feature)
    return lagged, added_features

# Row offsets used to build the lagged target columns
lags = [1, 2, 3, 5, 7, 14, 21]
data_filtered, added_features = add_lags(
    data=data_filtered,
    lags=lags,
    features=["covidOccupiedMVBeds"],
)
# Shifting leaves missing values at the start of every lag column;
# remove each row that still has a missing value in any column.
data_filtered.dropna(axis=0, how="any", inplace=True)

In [5]:
def create_temporal_features(df, date_column):
    """Return a copy of ``df`` with calendar features derived from ``date_column``.

    Adds three integer columns: ``month`` (1-12), ``day`` (day of the month)
    and ``day_of_week`` (Monday=0 ... Sunday=6).

    The input frame is not modified; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    date_column : str
        Name of a datetime-typed column in ``df`` (it must support ``.dt``).

    Returns
    -------
    pandas.DataFrame
        New frame with the three columns added (or overwritten if present).
    """
    dates = df[date_column].dt
    # ``assign`` builds a new frame, so the caller's frame is left untouched.
    return df.assign(
        month=dates.month,
        day=dates.day,
        day_of_week=dates.dayofweek,
    )

# Derive month / day / day_of_week from "date", then promote "date" to the index
data_filtered = create_temporal_features(df=data_filtered, date_column="date")
data_filtered.set_index(keys="date", inplace=True)

In [6]:
# Read the SEIRD data file for the selected area and index it by parsed date
seird_data = pd.read_csv(f"reports/output/pinn_{selected_area}_output.csv")
seird_data = seird_data.assign(date=pd.to_datetime(seird_data["date"])).set_index("date")

# Inner-join the two frames on their date index (only dates present in both
# survive), then discard every row that still has a missing value
merged_data = data_filtered.merge(
    seird_data,
    left_index=True,
    right_index=True,
    how="inner",
).dropna()
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 396 entries, 2020-05-01 to 2021-05-31
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   covidOccupiedMVBeds           396 non-null    float64
 1   hospitalCases                 396 non-null    float64
 2   newAdmissions                 396 non-null    int64  
 3   new_confirmed                 396 non-null    float64
 4   new_deceased                  396 non-null    float64
 5   hospitalCases_rolling_7_mean  396 non-null    float64
 6   hospitalCases_rolling_7_std   396 non-null    float64
 7   newAdmissions_rolling_7_mean  396 non-null    float64
 8   newAdmissions_rolling_7_std   396 non-null    float64
 9   new_confirmed_rolling_7_mean  396 non-null    float64
 10  new_confirmed_rolling_7_std   396 non-null    float64
 11  new_deceased_rolling_7_mean   396 non-null    float64
 12  new_deceased_rolling_7_std    396 non-null   

In [7]:
# Set the target variable and make it stationary
target = "covidOccupiedMVBeds"
# Seasonal period handed to the transformer; with freq="D" below this is
# presumably a weekly cycle — confirm against AutoStationaryTransformer.
seasonal_period = 7
auto_stationary = AutoStationaryTransformer(seasonal_period=seasonal_period)
# NOTE(review): the transformer is fitted on all of merged_data, i.e. before
# the train/validation/test split made in a later cell. If it estimates any
# parameters from the data, validation/test information reaches the fit —
# confirm whether that is acceptable for this experiment.
data_stat = auto_stationary.fit_transform(merged_data[[target]], freq="D")
# Overwrite the target column with the transformed values. `.values` bypasses
# index alignment, so this assumes fit_transform returns the same number of
# rows in the same order as merged_data — TODO confirm.
merged_data[target] = data_stat.values

# Ensure the index is a DateTimeIndex
merged_data.index = pd.to_datetime(merged_data.index)

In [10]:
# Filter data between the specified dates (both ends inclusive)
start_date = "2020-05-01"
end_date = "2021-05-31"
merged_data = merged_data.loc[start_date:end_date]
if merged_data.empty:
    raise ValueError(f"No rows between {start_date} and {end_date}; nothing to split")

# Fractions of the calendar span given to training and validation;
# the test set receives whatever remains.
TRAIN_FRAC = 0.70
VAL_FRAC = 0.20

min_date = merged_data.index.min()
max_date = merged_data.index.max()

# Calculate the range of dates
date_range = max_date - min_date
logging.info(f"Data ranges from {min_date} to {max_date} ({date_range.days} days)")

# Calculate split points (cut-offs are calendar dates, not row counts)
total_days = date_range.days
train_end = min_date + pd.Timedelta(days=int(total_days * TRAIN_FRAC))
val_end = train_end + pd.Timedelta(days=int(total_days * VAL_FRAC))

# Split the data chronologically into training, validation, and testing sets
train = merged_data[merged_data.index <= train_end]
val = merged_data[(merged_data.index > train_end) & (merged_data.index <= val_end)]
test = merged_data[merged_data.index > val_end]

# The three splits must partition the data and none may be empty; otherwise
# the percentages and min/max dates below would silently report 0% / NaT.
assert len(train) + len(val) + len(test) == len(merged_data), "splits do not partition merged_data"
assert min(len(train), len(val), len(test)) > 0, "at least one split is empty"

# Calculate the percentage of rows in each dataset
total_sample = len(merged_data)
train_sample = len(train) / total_sample * 100
val_sample = len(val) / total_sample * 100
test_sample = len(test) / total_sample * 100

print(f"Train: {train_sample:.2f}%, Validation: {val_sample:.2f}%, Test: {test_sample:.2f}%")
print(f"Train: {len(train)} samples, Validation: {len(val)} samples, Test: {len(test)} samples")
print(f"Max date in train: {train.index.max()}, Min date in validation: {val.index.min()}, Max date in test: {test.index.max()}")


2024-05-22 13:13:51,408 - INFO - Data ranges from 2020-05-01 00:00:00 to 2021-05-31 00:00:00 (395 days)


Train: 69.95%, Validation: 19.95%, Test: 10.10%
Train: 277 samples, Validation: 79 samples, Test: 40 samples
Max date in train: 2021-02-01 00:00:00, Min date in validation: 2021-02-02 00:00:00, Max date in test: 2021-05-31 00:00:00


In [9]:
# (first, last) timestamp covered by each split, in train / val / test order
train_dates, val_dates, test_dates = (
    (split.index.min(), split.index.max()) for split in (train, val, test)
)

print(f"Train dates: {train_dates}, Val dates: {val_dates}, Test dates: {test_dates}")

Train dates: (Timestamp('2020-05-01 00:00:00'), Timestamp('2021-02-01 00:00:00')), Val dates: (Timestamp('2021-02-02 00:00:00'), Timestamp('2021-04-21 00:00:00')), Test dates: (Timestamp('2021-04-22 00:00:00'), Timestamp('2021-05-31 00:00:00'))
