In [1]:
# path to src code
%cd ../../

/home/olarinoyem/Research/Multivate-forecasting


In [2]:
# Standard Library Imports
import os
import random
import shutil
import warnings
from itertools import cycle
from pathlib import Path

# Third-Party Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm

# PyTorch Imports
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

# Plotly Imports
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# Local Application/Library Specific Imports
from src.dl.dataloaders import TimeSeriesDataModule
from src.dl.multivariate_models import SingleStepRNNConfig, SingleStepRNNModel
from src.utils import plotting_utils

# Configuration and Setup
pio.templates.default = "plotly_white"
pl.seed_everything(42)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
warnings.filterwarnings("ignore")
tqdm.pandas()
torch.set_float32_matmul_precision("high")


  from tqdm.autonotebook import tqdm
Global seed set to 42


In [3]:
def format_plot(
    fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15
):
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        title_text=title,
        title={"x": 0.5, "xanchor": "center", "yanchor": "top"},
        titlefont={"size": 20},
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title_text=ylabel,
            titlefont=dict(size=font_size),
            tickfont=dict(size=font_size),
        ),
        xaxis=dict(
            title_text=xlabel,
            titlefont=dict(size=font_size),
            tickfont=dict(size=font_size),
        ),
    )
    return fig


def mase(actual, predicted, insample_actual):
    mae_insample = np.mean(np.abs(np.diff(insample_actual)))
    mae_outsample = np.mean(np.abs(actual - predicted))
    return mae_outsample / mae_insample


# def forecast_bias(actual, predicted):
#     return np.mean(predicted - actual)
def forecast_bias(actual, predicted):
    total_predicted = np.nansum(predicted)
    total_actual = np.nansum(actual)
    if total_actual == 0:
        return float('inf')  # or some large number, to indicate infinite bias if actual sum is zero
    return 100 * (total_predicted - total_actual) / total_actual



def plot_forecast(
    pred_df, forecast_columns, forecast_display_names=None, save_path=None
):
    if forecast_display_names is None:
        forecast_display_names = forecast_columns
    else:
        assert len(forecast_columns) == len(forecast_display_names)

    mask = ~pred_df[forecast_columns[0]].isnull()
    colors = px.colors.qualitative.Set2  # Using a different color palette
    act_color = colors[0]
    colors = cycle(colors[1:])

    fig = go.Figure()

    # Actual data plot
    fig.add_trace(
        go.Scatter(
            x=pred_df[mask].index,
            y=pred_df[mask].covidOccupiedMVBeds,
            mode="lines",
            marker=dict(size=6, opacity=0.5),
            line=dict(color=act_color, width=2),
            name="Actual COVID-19 MVBeds trends",
        )
    )

    # Predicted data plot
    for col, display_col in zip(forecast_columns, forecast_display_names):
        fig.add_trace(
            go.Scatter(
                x=pred_df[mask].index,
                y=pred_df.loc[mask, col],
                mode="lines+markers",
                marker=dict(size=4),
                line=dict(color=next(colors), width=2),
                name=display_col,
            )
        )

    return fig


def highlight_abs_min(s, props=""):
    return np.where(s == np.nanmin(np.abs(s.values)), props, "")

In [4]:
if not os.path.exists("../images"):
    os.mkdir("../images")

In [5]:
# load the data
data = pd.read_csv("data/processed/merged_nhs_covid_data.csv").drop("Unnamed: 0", axis=1)
data.head()

Unnamed: 0,areaCode,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,population,latitude,longitude,epi_week
0,E40000007,East of England,2022-09-12,9.0,84162,418.0,47,505.0,2.0,2177579.0,19129.0,6235410,52.24,0.41,202237
1,E40000007,East of England,2022-09-11,8.0,84115,421.0,46,429.0,3.0,2177074.0,19127.0,6235410,52.24,0.41,202237
2,E40000007,East of England,2022-09-10,8.0,84069,419.0,34,296.0,0.0,2176645.0,19124.0,6235410,52.24,0.41,202236
3,E40000007,East of England,2022-09-09,9.0,84035,411.0,34,308.0,2.0,2176349.0,19124.0,6235410,52.24,0.41,202236
4,E40000007,East of England,2022-09-08,9.0,84001,421.0,51,335.0,3.0,2176041.0,19122.0,6235410,52.24,0.41,202236


In [6]:
data['date'] = pd.to_datetime(data['date'])
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8054 entries, 0 to 8053
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   areaCode              8054 non-null   object        
 1   areaName              8054 non-null   object        
 2   date                  8054 non-null   datetime64[ns]
 3   covidOccupiedMVBeds   8054 non-null   float64       
 4   cumAdmissions         8054 non-null   int64         
 5   hospitalCases         8054 non-null   float64       
 6   newAdmissions         8054 non-null   int64         
 7   new_confirmed         8054 non-null   float64       
 8   new_deceased          8054 non-null   float64       
 9   cumulative_confirmed  8054 non-null   float64       
 10  cumulative_deceased   8054 non-null   float64       
 11  population            8054 non-null   int64         
 12  latitude              8054 non-null   float64       
 13  longitude         

In [7]:
from collections import namedtuple

FeatureConfig = namedtuple(
    "FeatureConfig",
    [
        "target",
        "index_col"
        "time_varying_known_categoricals",
        "time_varying_known",
        "time_varying_unknown",
    ],
)

In [8]:
data = data.sort_values(by=["date", "areaName"]).reset_index(drop=True)
data.head()

Unnamed: 0,areaCode,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,population,latitude,longitude,epi_week
0,E40000007,East of England,2020-04-01,0.0,1400,833.0,167,334.0,75.0,2938.0,455.0,6235410,52.24,0.41,202014
1,E40000003,London,2020-04-01,0.0,7651,4219.0,767,977.0,189.0,11086.0,1566.0,8982256,51.5,-0.083333,202014
2,E40000008,Midlands,2020-04-01,0.0,5104,2192.0,776,301.0,44.0,2294.0,259.0,4811065,52.98,-0.75,202014
3,E40000008,Midlands,2020-04-01,0.0,5104,2192.0,776,411.0,108.0,4234.0,721.0,5907872,52.47,-2.29,202014
4,E40000009,North East and Yorkshire,2020-04-01,0.0,2765,1293.0,507,307.0,32.0,1478.0,152.0,2656980,55.0,-1.866667,202014


In [9]:
# # Filter data between the specified dates
# data_filtered = data[(data["date"] >= "2020-01-01") & (data["date"] < "2021-12-31")]

data.drop(["areaCode", "cumAdmissions","cumulative_confirmed", "cumulative_deceased", "population", "latitude", "longitude", "epi_week"], axis=1, inplace=True)

In [10]:
class LogTime:
    def __enter__(self):
        self.start = pd.Timestamp.now()

    def __exit__(self, *args):
        print(f"Duration: {pd.Timestamp.now() - self.start}")

def add_rolling_features(df, window_size, columns, agg_funcs=None):
    if agg_funcs is None:
        agg_funcs = ['mean']
    added_features = {}
    for column in columns:
        for func in agg_funcs:
            roll_col_name = f"{column}_rolling_{window_size}_{func}"
            df[roll_col_name] = df.groupby("areaName")[column].rolling(window_size, min_periods=1).agg(func).reset_index(level=0, drop=True)
            if column not in added_features:
                added_features[column] = []
            added_features[column].append(roll_col_name)
    return df, added_features

In [11]:

# Configuration for rolling calculation
window_size = 7
columns_to_roll = ["hospitalCases", "newAdmissions", "new_confirmed", "new_deceased"]
agg_funcs = ['mean', 'std']

# Using LogTime to measure the duration of operation
with LogTime():
    data, feature_names = add_rolling_features(data, window_size, columns_to_roll, agg_funcs)

print("Added features:", feature_names)
data.head()

Duration: 0 days 00:00:00.042860
Added features: {'hospitalCases': ['hospitalCases_rolling_7_mean', 'hospitalCases_rolling_7_std'], 'newAdmissions': ['newAdmissions_rolling_7_mean', 'newAdmissions_rolling_7_std'], 'new_confirmed': ['new_confirmed_rolling_7_mean', 'new_confirmed_rolling_7_std'], 'new_deceased': ['new_deceased_rolling_7_mean', 'new_deceased_rolling_7_std']}


Unnamed: 0,areaName,date,covidOccupiedMVBeds,hospitalCases,newAdmissions,new_confirmed,new_deceased,hospitalCases_rolling_7_mean,hospitalCases_rolling_7_std,newAdmissions_rolling_7_mean,newAdmissions_rolling_7_std,new_confirmed_rolling_7_mean,new_confirmed_rolling_7_std,new_deceased_rolling_7_mean,new_deceased_rolling_7_std
0,East of England,2020-04-01,0.0,833.0,167,334.0,75.0,833.0,,167.0,,334.0,,75.0,
1,London,2020-04-01,0.0,4219.0,767,977.0,189.0,4219.0,,767.0,,977.0,,189.0,
2,Midlands,2020-04-01,0.0,2192.0,776,301.0,44.0,2192.0,,776.0,,301.0,,44.0,
3,Midlands,2020-04-01,0.0,2192.0,776,411.0,108.0,2192.0,0.0,776.0,0.0,356.0,77.781746,76.0,45.254834
4,North East and Yorkshire,2020-04-01,0.0,1293.0,507,307.0,32.0,1293.0,,507.0,,307.0,,32.0,


In [12]:
def add_lags(data, lags, features):
    added_features = []
    # Apply lagging within each group defined by 'areaName'
    for feature in features:
        for lag in lags:
            new_feature = f"{feature}_lag_{lag}"
            data[new_feature] = data.groupby("areaName")[feature].shift(lag)
            added_features.append(new_feature)
    return data, added_features

In [13]:
# Specify the lags and features
lags = [1, 2, 3, 5, 6, 7, 10, 11, 12, 13, 14, 21, 22, 23, 25, 27, ]
features_to_lag = ['covidOccupiedMVBeds']  

# Apply the function using LogTime to measure the performance
with LogTime():
    data, added_features = add_lags(data, lags, features_to_lag)
    data.dropna(inplace=True)  # Drop rows with NaN values that result from shifting

# Showing some of the output for verification
print("Added features:", added_features)
data.head()

Duration: 0 days 00:00:00.033221
Added features: ['covidOccupiedMVBeds_lag_1', 'covidOccupiedMVBeds_lag_2', 'covidOccupiedMVBeds_lag_3', 'covidOccupiedMVBeds_lag_5', 'covidOccupiedMVBeds_lag_6', 'covidOccupiedMVBeds_lag_7', 'covidOccupiedMVBeds_lag_10', 'covidOccupiedMVBeds_lag_11', 'covidOccupiedMVBeds_lag_12', 'covidOccupiedMVBeds_lag_13', 'covidOccupiedMVBeds_lag_14', 'covidOccupiedMVBeds_lag_21', 'covidOccupiedMVBeds_lag_22', 'covidOccupiedMVBeds_lag_23', 'covidOccupiedMVBeds_lag_25', 'covidOccupiedMVBeds_lag_27']


Unnamed: 0,areaName,date,covidOccupiedMVBeds,hospitalCases,newAdmissions,new_confirmed,new_deceased,hospitalCases_rolling_7_mean,hospitalCases_rolling_7_std,newAdmissions_rolling_7_mean,...,covidOccupiedMVBeds_lag_10,covidOccupiedMVBeds_lag_11,covidOccupiedMVBeds_lag_12,covidOccupiedMVBeds_lag_13,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,covidOccupiedMVBeds_lag_22,covidOccupiedMVBeds_lag_23,covidOccupiedMVBeds_lag_25,covidOccupiedMVBeds_lag_27
120,Midlands,2020-04-14,467.0,3345.0,359,330.0,82.0,3368.285714,49.895605,332.142857,...,465.0,465.0,450.0,450.0,445.0,360.0,357.0,357.0,301.0,0.0
122,North East and Yorkshire,2020-04-14,299.0,2545.0,306,316.0,63.0,2549.428571,20.68701,304.285714,...,289.0,289.0,260.0,260.0,230.0,148.0,116.0,116.0,83.0,0.0
128,Midlands,2020-04-15,472.0,3101.0,242,216.0,67.0,3326.142857,110.437657,318.857143,...,464.0,465.0,465.0,450.0,450.0,360.0,360.0,357.0,301.0,0.0
129,Midlands,2020-04-15,472.0,3101.0,242,394.0,108.0,3279.142857,127.553164,310.142857,...,464.0,464.0,465.0,465.0,450.0,406.0,360.0,360.0,357.0,301.0
130,North East and Yorkshire,2020-04-15,291.0,2500.0,254,286.0,31.0,2546.571429,26.10145,299.142857,...,292.0,289.0,289.0,260.0,260.0,148.0,148.0,116.0,83.0,0.0


In [14]:
def create_temporal_features(df, date_column, group_column=None):
    if group_column:
        # Process each group separately and concatenate the results
        grouped = df.groupby(group_column)
        processed_groups = [create_temporal_features_for_group(group, date_column) for _, group in grouped]
        return pd.concat(processed_groups).reset_index(drop=True)
    else:
        # Apply date transformations to the entire DataFrame
        return create_temporal_features_for_group(df, date_column)

def create_temporal_features_for_group(df, date_column):
    df["month"] = df[date_column].dt.month
    df["day"] = df[date_column].dt.day
    df["day_of_week"] = df[date_column].dt.dayofweek
    return df

In [15]:
# Apply the function to the entire DataFrame
data = create_temporal_features(data, 'date', group_column='areaName')

data = data.copy()

In [16]:
from src.utils.general import LogTime

from src.feature_engineering.autoregressive_features import add_ewma

with LogTime():
    data, added_features = add_ewma(data,
                                    spans=[3, 7, 14, 21, 30],
                         column="covidOccupiedMVBeds",
                                    ts_id="areaName",
                                    use_32_bit=True,)
    
print(f"Features Created: {','.join(added_features)}")

Time Elapsed: 0 microseconds
Features Created: covidOccupiedMVBeds_ewma_span_3,covidOccupiedMVBeds_ewma_span_7,covidOccupiedMVBeds_ewma_span_14,covidOccupiedMVBeds_ewma_span_21,covidOccupiedMVBeds_ewma_span_30


In [17]:
data = data.dropna(True)

In [18]:
from src.transforms.target_transformations import AutoStationaryTransformer

def make_stationary(df, target, seasonal_period):
    # Ensure the index of the DataFrame is a datetime index
    df = df.set_index(pd.to_datetime(df['date']))
    # Initialize the AutoStationaryTransformer with a seasonality period
    auto_stationary = AutoStationaryTransformer(seasonal_period=seasonal_period)
    
    # Fit and transform the target column to make it stationary
    data_stat = auto_stationary.fit_transform(df[[target]], freq="D")
    
    # Replace the original target values with the transformed stationary values
    df[target] = data_stat.values
    return df

In [19]:
target = 'covidOccupiedMVBeds'
seasonal_period = 7 

# Group by 'areaName' and apply the transformation
transformed_data = data.groupby('areaName').apply(make_stationary, target=target, seasonal_period=seasonal_period).reset_index(drop=True)

# Print the transformed data to check
transformed_data.head()

Unnamed: 0,areaName,date,covidOccupiedMVBeds,hospitalCases,newAdmissions,new_confirmed,new_deceased,hospitalCases_rolling_7_mean,hospitalCases_rolling_7_std,newAdmissions_rolling_7_mean,...,covidOccupiedMVBeds_lag_13,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,covidOccupiedMVBeds_lag_22,covidOccupiedMVBeds_lag_23,covidOccupiedMVBeds_lag_25,covidOccupiedMVBeds_lag_27,month,day,day_of_week
0,East of England,2020-04-28,72.925546,1389.0,184,463.0,68.0,1427.857143,46.759771,139.428571,...,308.0,303.0,255.0,204.0,219.0,162.0,0.0,4,28,1
1,East of England,2020-04-29,69.557847,1319.0,157,476.0,50.0,1400.428571,46.16946,144.714286,...,308.0,308.0,276.0,255.0,204.0,171.0,119.0,4,29,2
2,East of England,2020-04-30,67.465631,1253.0,198,425.0,65.0,1369.142857,61.306412,154.714286,...,320.0,308.0,265.0,276.0,255.0,219.0,162.0,4,30,3
3,East of England,2020-05-01,65.083592,1211.0,127,492.0,72.0,1341.571429,82.687881,152.714286,...,312.0,320.0,280.0,265.0,276.0,204.0,171.0,5,1,4
4,East of England,2020-05-02,62.400511,1190.0,111,303.0,38.0,1313.857143,97.314659,150.285714,...,306.0,312.0,283.0,280.0,265.0,255.0,219.0,5,2,5


In [20]:
transformed_data['date'] = pd.to_datetime(transformed_data['date'])
transformed_data = transformed_data.set_index('date')
transformed_data.head()

Unnamed: 0_level_0,areaName,covidOccupiedMVBeds,hospitalCases,newAdmissions,new_confirmed,new_deceased,hospitalCases_rolling_7_mean,hospitalCases_rolling_7_std,newAdmissions_rolling_7_mean,newAdmissions_rolling_7_std,...,covidOccupiedMVBeds_lag_13,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,covidOccupiedMVBeds_lag_22,covidOccupiedMVBeds_lag_23,covidOccupiedMVBeds_lag_25,covidOccupiedMVBeds_lag_27,month,day,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-28,East of England,72.925546,1389.0,184,463.0,68.0,1427.857143,46.759771,139.428571,21.816333,...,308.0,303.0,255.0,204.0,219.0,162.0,0.0,4,28,1
2020-04-29,East of England,69.557847,1319.0,157,476.0,50.0,1400.428571,46.16946,144.714286,20.782318,...,308.0,308.0,276.0,255.0,204.0,171.0,119.0,4,29,2
2020-04-30,East of England,67.465631,1253.0,198,425.0,65.0,1369.142857,61.306412,154.714286,27.237929,...,320.0,308.0,265.0,276.0,255.0,219.0,162.0,4,30,3
2020-05-01,East of England,65.083592,1211.0,127,492.0,72.0,1341.571429,82.687881,152.714286,28.877409,...,312.0,320.0,280.0,265.0,276.0,204.0,171.0,5,1,4
2020-05-02,East of England,62.400511,1190.0,111,303.0,38.0,1313.857143,97.314659,150.285714,31.862801,...,306.0,312.0,283.0,280.0,265.0,255.0,219.0,5,2,5


In [21]:
transformed_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7865 entries, 2020-04-28 to 2022-09-12
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   areaName                      7865 non-null   object 
 1   covidOccupiedMVBeds           7865 non-null   float64
 2   hospitalCases                 7865 non-null   float64
 3   newAdmissions                 7865 non-null   int64  
 4   new_confirmed                 7865 non-null   float64
 5   new_deceased                  7865 non-null   float64
 6   hospitalCases_rolling_7_mean  7865 non-null   float64
 7   hospitalCases_rolling_7_std   7865 non-null   float64
 8   newAdmissions_rolling_7_mean  7865 non-null   float64
 9   newAdmissions_rolling_7_std   7865 non-null   float64
 10  new_confirmed_rolling_7_mean  7865 non-null   float64
 11  new_confirmed_rolling_7_std   7865 non-null   float64
 12  new_deceased_rolling_7_mean   7865 non-null 

In [22]:
# Get the minimum and maximum date from the data
min_date = transformed_data.index.min()
max_date = transformed_data.index.max()
# Calculate the range of dates
date_range = max_date - min_date
print(f"Data ranges from {min_date} to {max_date} ({date_range.days} days)")

Data ranges from 2020-04-14 00:00:00 to 2022-09-12 00:00:00 (881 days)


In [23]:
# select the midelands in the areaName
midlands_data = transformed_data[transformed_data['areaName'] == 'Midlands']
midlands_data.head()

Unnamed: 0_level_0,areaName,covidOccupiedMVBeds,hospitalCases,newAdmissions,new_confirmed,new_deceased,hospitalCases_rolling_7_mean,hospitalCases_rolling_7_std,newAdmissions_rolling_7_mean,newAdmissions_rolling_7_std,...,covidOccupiedMVBeds_lag_13,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,covidOccupiedMVBeds_lag_22,covidOccupiedMVBeds_lag_23,covidOccupiedMVBeds_lag_25,covidOccupiedMVBeds_lag_27,month,day,day_of_week
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-14,Midlands,32.840992,3345.0,359,330.0,82.0,3368.285714,49.895605,332.142857,22.915996,...,450.0,445.0,360.0,357.0,357.0,301.0,0.0,4,14,1
2020-04-15,Midlands,33.018215,3101.0,242,216.0,67.0,3326.142857,110.437657,318.857143,40.891843,...,450.0,450.0,360.0,360.0,357.0,301.0,0.0,4,15,2
2020-04-15,Midlands,33.018215,3101.0,242,394.0,108.0,3279.142857,127.553164,310.142857,50.260749,...,465.0,450.0,406.0,360.0,360.0,357.0,301.0,4,15,2
2020-04-16,Midlands,33.090707,3095.0,283,266.0,31.0,3231.285714,124.32312,307.285714,51.292346,...,465.0,465.0,406.0,406.0,360.0,357.0,301.0,4,16,3
2020-04-16,Midlands,33.090707,3095.0,283,514.0,89.0,3199.714286,127.251345,300.142857,50.591266,...,464.0,465.0,419.0,406.0,406.0,360.0,357.0,4,16,3


In [24]:
# Starting and ending dates from the index
start_date = transformed_data.index.min()
end_date = transformed_data.index.max()

# Print start and end date to confirm
print("Start date:", start_date)
print("End date:", end_date)

Start date: 2020-04-14 00:00:00
End date: 2022-09-12 00:00:00


In [25]:
# Calculate split dates
train_end_date = start_date + pd.DateOffset(years=1)
validation_end_date = train_end_date + pd.DateOffset(months=2)
test_end_date = validation_end_date + pd.DateOffset(months=3)

# Check if the test_end_date exceeds the available data
if test_end_date > end_date:
    print(f"Requested test end date {test_end_date} exceeds the range of available data ending on {end_date}. Adjusting to end of data.")
    test_end_date = end_date

print("Training end date:", train_end_date)
print("Validation end date:", validation_end_date)
print("Test end date:", test_end_date)


Training end date: 2021-04-14 00:00:00
Validation end date: 2021-06-14 00:00:00
Test end date: 2021-09-14 00:00:00


In [28]:
# Splitting the data
train_data = midlands_data[start_date:train_end_date]
validation_data = midlands_data[train_end_date + pd.DateOffset(days=1):validation_end_date]
test_data = midlands_data[validation_end_date + pd.DateOffset(days=1):test_end_date]

# Calculate the percentage of dates in each dataset
total_sample = len(midlands_data)


# Print data ranges to verify
print(f"Training Data Range: {train_data.index.min()} to {train_data.index.max()}")
print(f"Validation Data Range: {validation_data.index.min()} to {validation_data.index.max()}")
print(f"Test Data Range: {test_data.index.min()} to {test_data.index.max()}")


Training Data Range: 2020-04-14 00:00:00 to 2021-04-14 00:00:00
Validation Data Range: 2021-04-15 00:00:00 to 2021-06-14 00:00:00
Test Data Range: 2021-06-15 00:00:00 to 2021-09-14 00:00:00


In [None]:
# Split the data into training, validation, and testing sets
train = midlands_data[transformed_data.index <= train_end]
val = midlands_data[(midlands_data.index > train_end) & (midlands_data.index <= val_end)]
test = midlands_data[midlands_data.index > val_end]

# Calculate the percentage of dates in each dataset
total_sample = len(midlands_data)
train_sample = len(train) / total_sample * 100
val_sample = len(val) / total_sample * 100
test_sample = len(test) / total_sample * 100

print(f"Train: {train_sample:.2f}%, Validation: {val_sample:.2f}%, Test: {test_sample:.2f}%")
print(f"Train: {len(train)} samples, Validation: {len(val)} samples, Test: {len(test)} samples")
print(f"Max date in train: {train.index.max()}, Min date in validation: {val.index.min()}, Max date in test: {test.index.max()}")