In [1]:
# Move the working directory two levels up so that the `src.*` imports and the
# relative `data/...` path used in later cells resolve.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel climbs two more directories and breaks those imports.
%cd ../../

c:\Users\ajaoo\Desktop\Projects\Multivate-forecasting


In [2]:
# Imports for handling data
import os
import numpy as np
import pandas as pd
from pathlib import Path
from itertools import cycle
from sklearn.model_selection import TimeSeriesSplit, train_test_split

# Imports for machine learning
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr

# Imports for visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Progress bar
from tqdm.autonotebook import tqdm
# Enable progress apply for pandas
tqdm.pandas()


# Local imports for data loaders and models
from src.utils import plotting_utils
from src.dl.dataloaders import TimeSeriesDataModule
from src.dl.multivariate_models import SingleStepRNNConfig, SingleStepRNNModel
from src.transforms.target_transformations import AutoStationaryTransformer


# Single source of truth for the random seed so every RNG below stays in sync.
SEED = 42

# Lightning's helper seeds the RNGs it manages; the explicit torch / NumPy calls
# are kept so reproducibility does not depend on that helper's coverage.
pl.seed_everything(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    # Seed every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(SEED)

# Use the white plotly theme for every figure created in this notebook
import plotly.io as pio
pio.templates.default = "plotly_white"

# Silence all Python warnings for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# assigning into a filtered slice) — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

Seed set to 42


In [3]:
def format_plot(
    fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15
):
    """Apply the notebook's standard layout to a plotly figure.

    Args:
        fig: Figure-like object exposing ``for_each_trace`` and ``update_layout``.
        legends: Optional sequence of trace names. Names are assigned to traces
            in order and recycled if there are more traces than names.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title, centred at the top.
        font_size: Font size used for the legend, axis titles and tick labels.

    Returns:
        The same ``fig`` object, updated in place.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))

    # Fonts are set through the nested ``title.font`` attribute: the flat
    # ``titlefont`` shortcut is deprecated and rejected by newer plotly releases.
    axis_title_font = dict(size=font_size)
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=axis_title_font),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=axis_title_font),
            tickfont=dict(size=font_size),
        ),
    )
    return fig


def mase(actual, predicted, insample_actual):
    """Mean absolute scaled error.

    The out-of-sample mean absolute error is divided by the mean absolute
    first difference of ``insample_actual`` (the one-step naive forecast error).
    """
    naive_errors = np.abs(np.diff(insample_actual))
    forecast_errors = np.abs(actual - predicted)
    return np.mean(forecast_errors) / np.mean(naive_errors)


def forecast_bias(actual, predicted):
    """Mean signed error of the forecast; positive values mean over-forecasting."""
    signed_errors = predicted - actual
    return np.mean(signed_errors)


def plot_forecast(
    pred_df,
    forecast_columns,
    forecast_display_names=None,
    save_path=None,
    target_col="covidOccupiedMVBeds",
):
    """Plot the actual series together with one or more forecast columns.

    Only rows where the first forecast column is non-null are drawn, so the
    actual line covers the same period as the forecasts.

    Args:
        pred_df: DataFrame holding the actual and forecast columns; its index
            is used for the x axis.
        forecast_columns: Names of the forecast columns to draw.
        forecast_display_names: Optional legend names, one per forecast column.
            Defaults to the column names.
        save_path: Currently unused; accepted so existing callers keep working.
        target_col: Name of the column holding the actual values. Defaults to
            ``"covidOccupiedMVBeds"``, the column this notebook forecasts.

    Returns:
        A ``go.Figure`` with the actual trace first, then one trace per forecast.

    Raises:
        AssertionError: If the display names and forecast columns differ in length.
    """
    if forecast_display_names is None:
        forecast_display_names = forecast_columns
    else:
        assert len(forecast_columns) == len(forecast_display_names), (
            "forecast_display_names must have one entry per forecast column"
        )

    # Restrict every trace to the rows where the first forecast is available.
    mask = pred_df[forecast_columns[0]].notnull()
    visible = pred_df.loc[mask]

    palette = px.colors.qualitative.Set2
    act_color = palette[0]
    # The remaining colours are recycled across the forecast traces.
    forecast_colors = cycle(palette[1:])

    fig = go.Figure()

    # Actual data plot
    fig.add_trace(
        go.Scatter(
            x=visible.index,
            y=visible[target_col],
            mode="lines",
            marker=dict(size=6, opacity=0.5),
            line=dict(color=act_color, width=2),
            name="Actual COVID-19 MVBeds trends",
        )
    )

    # Predicted data plot
    for col, display_col in zip(forecast_columns, forecast_display_names):
        fig.add_trace(
            go.Scatter(
                x=visible.index,
                y=visible[col],
                mode="lines+markers",
                marker=dict(size=4),
                line=dict(color=next(forecast_colors), width=2),
                name=display_col,
            )
        )

    return fig


def highlight_abs_min(s, props=""):
    """Return per-element style strings marking the smallest absolute value.

    Args:
        s: pandas Series of numeric values (as passed by ``Styler.apply``).
        props: CSS string applied to the element(s) with the minimum absolute
            value; every other element gets an empty string.

    Returns:
        NumPy array of style strings with the same length as ``s``.
    """
    # Compare absolute values on both sides: comparing the signed values to the
    # minimum magnitude would never match when the best value is negative.
    magnitudes = np.abs(s.values)
    return np.where(magnitudes == np.nanmin(magnitudes), props, "")

In [4]:
# Read the merged NHS / COVID table, discard the "Unnamed: 0" column and parse
# the date column into datetimes.
data_path = Path("data/processed/merged_nhs_covid_data.csv")
data = (
    pd.read_csv(data_path)
    .drop(columns="Unnamed: 0")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
)

In [5]:
# List the distinct regions available in the areaName column
pd.unique(data["areaName"])

array(['East of England', 'London', 'Midlands',
       'North East and Yorkshire', 'North West', 'South East',
       'South West'], dtype=object)

In [6]:
# Region to model. Must be one of the values present in data['areaName']:
# "East of England", "London", "Midlands", "North East and Yorkshire",
# "North West", "South East", "South West"
selected_area = "London"

# Build the per-region frame as a new object (assign / sort_values return
# copies) instead of assigning into and sorting a filtered slice in place.
data_filtered = (
    data.loc[data["areaName"] == selected_area]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by=["date", "areaName"])
)

# An unknown region name would otherwise silently produce an empty frame.
if data_filtered.empty:
    raise ValueError(f"No rows found for areaName={selected_area!r}")

In [7]:
def add_rolling_features(df, window_size, columns, agg_funcs=None):
    """Add rolling-window aggregates of the given columns to ``df``.

    For every column / aggregation pair a new column named
    ``"{column}_rolling_{window_size}_{func}"`` is added. Rows in which any of
    the new rolling columns is NaN (the warm-up rows of the window) are dropped.

    Args:
        df: DataFrame to extend. It is modified in place and also returned.
        window_size: Window passed to ``Series.rolling``.
        columns: Names of the columns to aggregate.
        agg_funcs: Aggregation names passed to ``.agg``; defaults to ``['mean']``.

    Returns:
        Tuple ``(df, added_features)`` where ``added_features`` maps each source
        column to the list of rolling column names created from it.
    """
    if agg_funcs is None:
        agg_funcs = ['mean']
    added_features = {}
    for column in columns:
        for func in agg_funcs:
            roll_col_name = f"{column}_rolling_{window_size}_{func}"
            df[roll_col_name] = df[column].rolling(window_size).agg(func)
            added_features.setdefault(column, []).append(roll_col_name)

    # Only drop rows made incomplete by the rolling window; rows with missing
    # values in unrelated columns are left for the caller to handle.
    new_columns = [name for names in added_features.values() for name in names]
    if new_columns:
        df.dropna(subset=new_columns, inplace=True)
    return df, added_features

# Configuration for add_rolling_features: window length, source columns and
# the aggregations to compute per window.
window_size = 7
columns_to_roll = ["hospitalCases", "newAdmissions", "new_confirmed", "new_deceased"]
agg_funcs = ['mean', 'std']

# NOTE(review): rolling features are currently disabled — the call below is
# commented out, so the settings above are not used by any visible cell.
# # Apply rolling features for each column
# data_filtered, added_features = add_rolling_features(data_filtered, window_size, columns_to_roll, agg_funcs)

# # Print added features for each column
# for column, features in added_features.items():
#     print(f"{column}: {', '.join(features)}")

In [8]:
# Time-lagged copies of selected columns
def add_lags(data, lags, features):
    """Append shifted versions of ``features`` to ``data``.

    For every feature and every lag a column ``"{feature}_lag_{lag}"`` holding
    ``data[feature].shift(lag)`` is added. ``data`` is modified in place.

    Returns:
        Tuple ``(data, created)`` where ``created`` lists the new column names
        in creation order (feature-major, then lag).
    """
    created = []
    pairs = [(feature, lag) for feature in features for lag in lags]
    for feature, lag in pairs:
        lag_name = feature + f"_lag_{lag}"
        data[lag_name] = data[feature].shift(lag)
        created.append(lag_name)
    return data, created


# Lag offsets (in rows) applied to the target column
lags = [1, 2, 3, 5, 7, 14, 21]

data_filtered, added_features = add_lags(
    data_filtered, lags, ["covidOccupiedMVBeds"]
)
# Shifting leaves NaNs at the start of each lag column; drop incomplete rows.
data_filtered = data_filtered.dropna()

In [9]:
def create_temporal_features(df, date_column):
    """Add ``month``, ``day`` and ``day_of_week`` columns derived from a date column.

    ``df`` is modified in place and returned. ``day_of_week`` uses the pandas
    ``dt.dayofweek`` numbering.
    """
    calendar_parts = (
        ("month", "month"),
        ("day", "day"),
        ("day_of_week", "dayofweek"),
    )
    for new_col, accessor_attr in calendar_parts:
        df[new_col] = getattr(df[date_column].dt, accessor_attr)
    return df


# Attach the calendar columns and preview the result
data_filtered = data_filtered.pipe(create_temporal_features, "date")
data_filtered.head()

Unnamed: 0,areaCode,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,...,covidOccupiedMVBeds_lag_1,covidOccupiedMVBeds_lag_2,covidOccupiedMVBeds_lag_3,covidOccupiedMVBeds_lag_5,covidOccupiedMVBeds_lag_7,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,month,day,day_of_week
1767,E40000003,London,2020-04-22,927.0,18181,3768.0,225,399.0,110.0,24734.0,...,972.0,1019.0,991.0,982.0,1023.0,941.0,0.0,4,22,2
1766,E40000003,London,2020-04-23,897.0,18447,3510.0,266,520.0,80.0,25254.0,...,927.0,972.0,1019.0,992.0,1012.0,849.0,673.0,4,23,3
1765,E40000003,London,2020-04-24,836.0,18618,3281.0,171,375.0,75.0,25629.0,...,897.0,927.0,972.0,991.0,982.0,1057.0,759.0,4,24,4
1764,E40000003,London,2020-04-25,868.0,18755,3198.0,137,332.0,57.0,25961.0,...,836.0,897.0,927.0,1019.0,992.0,1035.0,824.0,4,25,5
1763,E40000003,London,2020-04-26,831.0,18942,3115.0,187,310.0,62.0,26271.0,...,868.0,836.0,897.0,972.0,991.0,1035.0,800.0,4,26,6


In [11]:
# Promote the parsed dates to the index so the frame can be sliced by date
data_filtered = data_filtered.assign(
    date=pd.to_datetime(data_filtered["date"])
).set_index("date")
data_filtered.head()

Unnamed: 0_level_0,areaCode,areaName,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,...,covidOccupiedMVBeds_lag_1,covidOccupiedMVBeds_lag_2,covidOccupiedMVBeds_lag_3,covidOccupiedMVBeds_lag_5,covidOccupiedMVBeds_lag_7,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,month,day,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-22,E40000003,London,927.0,18181,3768.0,225,399.0,110.0,24734.0,4983.0,...,972.0,1019.0,991.0,982.0,1023.0,941.0,0.0,4,22,2
2020-04-23,E40000003,London,897.0,18447,3510.0,266,520.0,80.0,25254.0,5063.0,...,927.0,972.0,1019.0,992.0,1012.0,849.0,673.0,4,23,3
2020-04-24,E40000003,London,836.0,18618,3281.0,171,375.0,75.0,25629.0,5138.0,...,897.0,927.0,972.0,991.0,982.0,1057.0,759.0,4,24,4
2020-04-25,E40000003,London,868.0,18755,3198.0,137,332.0,57.0,25961.0,5195.0,...,836.0,897.0,927.0,1019.0,992.0,1035.0,824.0,4,25,5
2020-04-26,E40000003,London,831.0,18942,3115.0,187,310.0,62.0,26271.0,5257.0,...,868.0,836.0,897.0,972.0,991.0,1035.0,800.0,4,26,6


In [12]:
# Set the target variable
target = 'covidOccupiedMVBeds'

# Seasonal period handed to the transformer; with freq="D" below this is
# presumably a weekly cycle — confirm against AutoStationaryTransformer.
seasonal_period = 7
auto_stationary = AutoStationaryTransformer(seasonal_period=seasonal_period)

# Fit and transform the target column to make it stationary
# NOTE(review): the transformer is fitted on the full series, before the
# train/validation/test split performed in a later cell — confirm this does not
# leak information from the evaluation period.
data_stat = auto_stationary.fit_transform(data_filtered[[target]], freq="D")

# Replace the original target values with the transformed stationary values
# NOTE(review): this overwrites the raw target, so re-running the cell
# transforms already-transformed values. The lag columns were created from the
# raw target in an earlier cell and therefore stay on the original scale.
data_filtered[target] = data_stat.values

# Print the transformed data to check
data_filtered.head()

Unnamed: 0_level_0,areaCode,areaName,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,...,covidOccupiedMVBeds_lag_1,covidOccupiedMVBeds_lag_2,covidOccupiedMVBeds_lag_3,covidOccupiedMVBeds_lag_5,covidOccupiedMVBeds_lag_7,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,month,day,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-22,E40000003,London,26.618059,18181,3768.0,225,399.0,110.0,24734.0,4983.0,...,972.0,1019.0,991.0,982.0,1023.0,941.0,0.0,4,22,2
2020-04-23,E40000003,London,26.279578,18447,3510.0,266,520.0,80.0,25254.0,5063.0,...,927.0,972.0,1019.0,992.0,1012.0,849.0,673.0,4,23,3
2020-04-24,E40000003,London,25.563608,18618,3281.0,171,375.0,75.0,25629.0,5138.0,...,897.0,927.0,972.0,991.0,982.0,1057.0,759.0,4,24,4
2020-04-25,E40000003,London,25.948043,18755,3198.0,137,332.0,57.0,25961.0,5195.0,...,836.0,897.0,927.0,1019.0,992.0,1035.0,824.0,4,25,5
2020-04-26,E40000003,London,25.509239,18942,3115.0,187,310.0,62.0,26271.0,5257.0,...,868.0,836.0,897.0,972.0,991.0,1035.0,800.0,4,26,6


In [13]:
# Column dtypes and non-null counts of the engineered frame
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 873 entries, 2020-04-22 to 2022-09-12
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   areaCode                    873 non-null    object 
 1   areaName                    873 non-null    object 
 2   covidOccupiedMVBeds         873 non-null    float64
 3   cumAdmissions               873 non-null    int64  
 4   hospitalCases               873 non-null    float64
 5   newAdmissions               873 non-null    int64  
 6   new_confirmed               873 non-null    float64
 7   new_deceased                873 non-null    float64
 8   cumulative_confirmed        873 non-null    float64
 9   cumulative_deceased         873 non-null    float64
 10  population                  873 non-null    int64  
 11  latitude                    873 non-null    float64
 12  longitude                   873 non-null    float64
 13  epi_week        

In [14]:
# First and last dates in the index, and the span between them
min_date, max_date = data_filtered.index.min(), data_filtered.index.max()
date_range = max_date - min_date
print(f"Data ranges from {min_date} to {max_date} ({date_range.days} days)")

Data ranges from 2020-04-22 00:00:00 to 2022-09-12 00:00:00 (873 days)


In [17]:
# Keep only the rows whose index date lies in the modelling window (both ends inclusive)
start_date, end_date = "2020-05-01", "2020-12-30"
data_filtered = data_filtered.loc[start_date:end_date]

In [18]:
# Chronological split: the first TRAIN_MONTHS months for training, the next
# VAL_MONTHS months for validation and everything after that for testing.
# The boundaries are anchored on the first date of the frame being split, not on
# a date computed before the date-window filter, so that the offsets cannot
# overshoot the data and leave the validation / test sets empty.
TRAIN_MONTHS = 6
VAL_MONTHS = 1

split_start = data_filtered.index.min()
train_end = split_start + pd.DateOffset(months=TRAIN_MONTHS)
val_end = train_end + pd.DateOffset(months=VAL_MONTHS)
test_end = data_filtered.index.max()

# Split the data into training, validation, and testing sets
train = data_filtered[data_filtered.index <= train_end]
val = data_filtered[(data_filtered.index > train_end) & (data_filtered.index <= val_end)]
test = data_filtered[data_filtered.index > val_end]

# Guard against a silently degenerate split.
assert len(train) + len(val) + len(test) == len(data_filtered)
assert min(len(train), len(val), len(test)) > 0, (
    "Empty split: adjust TRAIN_MONTHS / VAL_MONTHS to fit the filtered date window"
)

# Calculate the percentage of dates in each dataset
total_sample = len(data_filtered)
train_sample = len(train) / total_sample * 100
val_sample = len(val) / total_sample * 100
test_sample = len(test) / total_sample * 100

print(f"Train: {train_sample:.2f}%, Validation: {val_sample:.2f}%, Test: {test_sample:.2f}%")
print(f"Train: {len(train)} samples, Validation: {len(val)} samples, Test: {len(test)} samples")
print(f"Max date in train: {train.index.max()}, Min date in validation: {val.index.min()}, Max date in test: {test.index.max()}")

NameError: name 'midlands_data' is not defined