In [1]:
%cd ../..

c:\Users\ajaoo\Desktop\Projects\Multivate-forecasting


In [2]:
import os
import time

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"

import warnings
from pathlib import Path
from itertools import cycle
import humanize
from darts.metrics import mae, mase, mse
from sklearn.preprocessing import StandardScaler
from src.forecasting.ml_forecasting import (
    FeatureConfig,
    calculate_metrics,
)
from src.utils import plotting_utils
from src.utils.general import LogTime
from src.utils.ts_utils import darts_metrics_adapter, forecast_bias
from tqdm.autonotebook import tqdm
from IPython.display import display, HTML

# %load_ext autoreload
# %autoreload 2
np.random.seed(42)
tqdm.pandas()

  from tqdm.autonotebook import tqdm


In [3]:
# Project-relative directories: processed input data and the figure output folder.
preprocessed_data_dir = Path("data/processed")
output = Path("reports/figures/ml_forecasting")
# Create the figure folder (and any missing parents); a no-op if it already exists.
output.mkdir(parents=True, exist_ok=True)

In [4]:
def format_plot(
    fig, legends=None, xlabel="Time", ylabel="Value", title="", font_size=15
):
    """Apply the notebook's standard layout to a Plotly figure and return it.

    Args:
        fig: Figure-like object exposing ``for_each_trace`` and ``update_layout``.
        legends: Optional sequence of trace names. Names are assigned to the
            traces in order and are cycled if there are more traces than names.
        xlabel: Title of the x axis.
        ylabel: Title of the y axis.
        title: Figure title, centred horizontally.
        font_size: Font size used for the legend, axis titles and tick labels.

    Returns:
        The same ``fig`` object, updated in place.
    """
    if legends:
        names = cycle(legends)
        fig.for_each_trace(lambda t: t.update(name=next(names)))

    text_font = dict(size=font_size)
    fig.update_layout(
        autosize=False,
        width=900,
        height=500,
        # All title settings live in a single dict; the font goes under
        # ``title.font`` rather than the deprecated ``titlefont`` attribute.
        title=dict(
            text=title,
            x=0.5,
            xanchor="center",
            yanchor="top",
            font=dict(size=20),
        ),
        legend_title=None,
        legend=dict(
            font=dict(size=font_size),
            orientation="h",
            yanchor="bottom",
            y=0.98,
            xanchor="right",
            x=1,
        ),
        yaxis=dict(
            title=dict(text=ylabel, font=dict(text_font)),
            tickfont=dict(text_font),
        ),
        xaxis=dict(
            title=dict(text=xlabel, font=dict(text_font)),
            tickfont=dict(text_font),
        ),
    )
    return fig


def mase(actual, predicted, insample_actual, seasonality=1):
    """Mean Absolute Scaled Error of a forecast.

    The out-of-sample mean absolute error is divided by the in-sample mean
    absolute error of a (seasonal) naive forecast.

    Note: this definition rebinds the name ``mase`` that the import cell
    brings in from ``darts.metrics``.

    Args:
        actual: Observed values over the forecast horizon (array-like).
        predicted: Forecast values, same length as ``actual`` (array-like).
        insample_actual: Training-period observations used for scaling.
        seasonality: Lag of the naive forecast used for scaling. The default
            of 1 reproduces the one-step difference scaling.

    Returns:
        The scaled error as a float. Follows NumPy semantics when the scale
        is zero or undefined (``inf``/``nan`` with a runtime warning).

    Raises:
        ValueError: If ``seasonality`` is smaller than 1.
    """
    if seasonality < 1:
        raise ValueError("seasonality must be a positive integer")

    # Coerce to flat float arrays so that lists, Series and (n, 1) columns are
    # all compared position by position instead of failing or broadcasting.
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    insample = np.asarray(insample_actual, dtype=float).ravel()

    naive_errors = insample[seasonality:] - insample[:-seasonality]
    mae_insample = np.mean(np.abs(naive_errors))
    mae_outsample = np.mean(np.abs(actual - predicted))
    return mae_outsample / mae_insample


def forecast_bias(actual, predicted):
    """Mean signed error of a forecast (``predicted - actual``).

    Positive values mean the forecast is, on average, above the observations.

    Note: this definition rebinds the name ``forecast_bias`` that the import
    cell brings in from ``src.utils.ts_utils``.

    Args:
        actual: Observed values (array-like).
        predicted: Forecast values, same length as ``actual`` (array-like).

    Returns:
        The mean of the element-wise differences.
    """
    # Flatten to float arrays so lists work and values are paired by position.
    actual = np.asarray(actual, dtype=float).ravel()
    predicted = np.asarray(predicted, dtype=float).ravel()
    return np.mean(predicted - actual)


def plot_forecast(
    pred_df,
    forecast_columns,
    forecast_display_names=None,
    save_path=None,
    target_col="covidOccupiedMVBeds",
):
    """Plot the actual series against one or more forecast columns.

    Only rows where the first forecast column is non-null are drawn.

    Args:
        pred_df: DataFrame holding the actual values and the forecast columns;
            its index is used for the x axis.
        forecast_columns: Names of the forecast columns to draw.
        forecast_display_names: Optional legend labels, one per forecast
            column. Defaults to the column names.
        save_path: Optional file path. When given, the figure is written to
            it: as HTML for a ``.html``/``.htm`` suffix, otherwise through
            ``fig.write_image`` (static export; needs an image export engine
            such as kaleido to be installed).
        target_col: Column of ``pred_df`` holding the actual values.

    Returns:
        The ``plotly.graph_objects.Figure`` that was built.
    """
    if forecast_display_names is None:
        forecast_display_names = forecast_columns
    else:
        assert len(forecast_columns) == len(forecast_display_names), (
            "forecast_columns and forecast_display_names must have the same length"
        )

    # Restrict every trace to the rows that have a forecast.
    mask = ~pred_df[forecast_columns[0]].isnull()
    plot_df = pred_df[mask]

    palette = px.colors.qualitative.Set2
    act_color = palette[0]
    forecast_colors = cycle(palette[1:])

    fig = go.Figure()

    # Actual data
    fig.add_trace(
        go.Scatter(
            x=plot_df.index,
            y=plot_df[target_col],
            mode="lines",
            marker=dict(size=6, opacity=0.5),
            line=dict(color=act_color, width=2),
            name="Actual COVID-19 MVBeds trends",
        )
    )

    # One trace per forecast column
    for col, display_col in zip(forecast_columns, forecast_display_names):
        fig.add_trace(
            go.Scatter(
                x=plot_df.index,
                y=plot_df[col],
                mode="lines+markers",
                marker=dict(size=4),
                line=dict(color=next(forecast_colors), width=2),
                name=display_col,
            )
        )

    # Previously ``save_path`` was accepted but silently ignored.
    if save_path is not None:
        target_file = str(save_path)
        if target_file.lower().endswith((".html", ".htm")):
            fig.write_html(target_file)
        else:
            fig.write_image(target_file)

    return fig


def highlight_abs_min(s, props=""):
    """Return ``props`` for the entries of ``s`` with the smallest absolute value.

    Shaped for use with ``Styler.apply``: the result has one CSS string per
    element of ``s`` (``props`` for the winner(s), ``""`` elsewhere).

    Args:
        s: pandas Series of numeric values; NaNs are ignored when searching
            for the minimum.
        props: CSS properties to apply to the selected cell(s).

    Returns:
        A NumPy array of strings with the same length as ``s``.
    """
    magnitudes = np.abs(s.values)
    # Compare magnitudes with the minimum magnitude. Comparing the signed
    # values would miss a negative entry that is closest to zero.
    return np.where(magnitudes == np.nanmin(magnitudes), props, "")

In [5]:
# Load the merged NHS/COVID table with `date` parsed as datetimes, then discard
# the "Unnamed: 0" column (presumably an index written by an earlier CSV export).
data = pd.read_csv(
    preprocessed_data_dir / "merged_nhs_covid_data.csv",
    parse_dates=["date"],
).drop(columns=["Unnamed: 0"])
data.head()

Unnamed: 0,areaCode,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,population,latitude,longitude,epi_week
0,E40000007,East of England,2022-09-12,9.0,84162,418.0,47,505.0,2.0,2177579.0,19129.0,6235410,52.24,0.41,202237
1,E40000007,East of England,2022-09-11,8.0,84115,421.0,46,429.0,3.0,2177074.0,19127.0,6235410,52.24,0.41,202237
2,E40000007,East of England,2022-09-10,8.0,84069,419.0,34,296.0,0.0,2176645.0,19124.0,6235410,52.24,0.41,202236
3,E40000007,East of England,2022-09-09,9.0,84035,411.0,34,308.0,2.0,2176349.0,19124.0,6235410,52.24,0.41,202236
4,E40000007,East of England,2022-09-08,9.0,84001,421.0,51,335.0,3.0,2176041.0,19122.0,6235410,52.24,0.41,202236


In [6]:
len(data.areaName.unique())

7

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8054 entries, 0 to 8053
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   areaCode              8054 non-null   object        
 1   areaName              8054 non-null   object        
 2   date                  8054 non-null   datetime64[ns]
 3   covidOccupiedMVBeds   8054 non-null   float64       
 4   cumAdmissions         8054 non-null   int64         
 5   hospitalCases         8054 non-null   float64       
 6   newAdmissions         8054 non-null   int64         
 7   new_confirmed         8054 non-null   float64       
 8   new_deceased          8054 non-null   float64       
 9   cumulative_confirmed  8054 non-null   float64       
 10  cumulative_deceased   8054 non-null   float64       
 11  population            8054 non-null   int64         
 12  latitude              8054 non-null   float64       
 13  longitude         

In [8]:
# Order rows by area and then by date, and drop the area code, coordinates and
# epi-week columns in the same chain.
data = data.sort_values(["areaName", "date"]).drop(
    columns=["areaCode", "latitude", "longitude", "epi_week"]
)
data.head()

Unnamed: 0,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,population
894,East of England,2020-04-01,0.0,1400,833.0,167,334.0,75.0,2938.0,455.0,6235410
893,East of England,2020-04-02,119.0,1584,841.0,184,372.0,71.0,3310.0,526.0,6235410
892,East of England,2020-04-03,162.0,1776,914.0,192,350.0,85.0,3660.0,611.0,6235410
891,East of England,2020-04-04,171.0,1939,988.0,163,268.0,70.0,3928.0,681.0,6235410
890,East of England,2020-04-05,219.0,2159,1230.0,220,281.0,91.0,4209.0,772.0,6235410


In [9]:
# # select a single areaName
# area = "East of England"
# data = data[data.areaName == area]
# data = data.drop(columns=["areaName", "areaCode", "latitude", "longitude", "population", "epi_week"])
# data.head()

In [10]:
# from src.transforms.target_transformations import AutoStationaryTransformer
# # Autostatianry
# transfomer_pipeline = {}
# for _id in tqdm(data["areaName"].unique()):

#     auto_stationary = AutoStationaryTransformer(seasonal_period=7)
#     y = data[data["areaName"] == _id]["date","covidOccupiedMVBeds"].set_index("date")

#     y_stat = auto_stationary.fit_transform(y, freq="D")

#     data

In [11]:
from src.feature_engineering.autoregressive_features import add_lags

# Lag offsets passed to `add_lags`; features are built per `areaName` series.
# NOTE(review): this cell rebinds `data`, so re-running it adds the lag columns again.
lags = [3, 7, 14, 21, 28]

with LogTime():
    data, added_features = add_lags(
        data,
        column="covidOccupiedMVBeds",
        lags=lags,
        ts_id="areaName",
        use_32_bit=True,
    )
print("Features Created: " + ",".join(added_features))

Time Elapsed: 0 microseconds
Features Created: covidOccupiedMVBeds_lag_3,covidOccupiedMVBeds_lag_7,covidOccupiedMVBeds_lag_14,covidOccupiedMVBeds_lag_21,covidOccupiedMVBeds_lag_28


In [12]:
# class LogTime:
#     from time import time

#     def __enter__(self):
#         self.start_time = self.time()
#         print("Starting operation...")

#     def __exit__(self, type, value, traceback):
#         elapsed_time = self.time() - self.start_time
#         print(f"Operation completed in {elapsed_time} seconds.")


# def add_lags(df, lags, column):
#     added_features = []
#     for lag in lags:
#         lag_col_name = f"{column}_lag_{lag}"
#         df[lag_col_name] = df[column].shift(lag)
#         added_features.append(lag_col_name)
#     # Drop rows with NaN values
#     df.dropna(inplace=True)
#     return df, added_features


# lags = [1, 7, 14]

# data_with_lags, added_features = add_lags(data, lags, "covidOccupiedMVBeds")

In [13]:
# data = data_with_lags.copy()

## Rolling


In [14]:
from src.feature_engineering.autoregressive_features import add_rolling_features

# Rolling mean and standard deviation of the target over 7/14/21-row windows,
# computed per `areaName` series by `add_rolling_features`.
# NOTE(review): this cell rebinds `data`, so re-running it adds the columns again.
with LogTime():
    data, added_features = add_rolling_features(
        data,
        column="covidOccupiedMVBeds",
        rolls=[7, 14, 21],
        agg_funcs=["mean", "std"],
        ts_id="areaName",
        use_32_bit=True,
    )
print("Features Created: " + ",".join(added_features))

Time Elapsed: 0 microseconds
Features Created: covidOccupiedMVBeds_rolling_7_mean,covidOccupiedMVBeds_rolling_7_std,covidOccupiedMVBeds_rolling_14_mean,covidOccupiedMVBeds_rolling_14_std,covidOccupiedMVBeds_rolling_21_mean,covidOccupiedMVBeds_rolling_21_std


In [15]:
# from src.feature_engineering.autoregressive_features import add_ewma

# with LogTime():
#     data, added_features = add_ewma(
#         data,

#         column="covidOccupiedMVBeds",
#         ts_id="areaName",
#         use_32_bit=True,
#     )
# print(f"Features Created: {','.join(added_features)}")

In [16]:
# Add calendar features (month, day of month, day of week) derived from a datetime column.


def create_temporal_features(df, date_column):
    """Add ``month``, ``day`` and ``day_of_week`` columns from a datetime column.

    Args:
        df: DataFrame containing ``date_column`` with a datetime dtype.
        date_column: Name of the datetime column to derive the features from.

    Returns:
        The same ``df`` object; the three columns are added in place.
    """
    date_parts = df[date_column].dt
    # (new column name, datetime accessor attribute)
    feature_sources = (
        ("month", "month"),
        ("day", "day"),
        ("day_of_week", "dayofweek"),
    )
    for feature_name, accessor_attr in feature_sources:
        df[feature_name] = getattr(date_parts, accessor_attr)
    return df


data = create_temporal_features(data, "date")

In [17]:
nc = data.isnull().sum()
nc[nc > 0]

covidOccupiedMVBeds_lag_3               21
covidOccupiedMVBeds_lag_7               49
covidOccupiedMVBeds_lag_14              98
covidOccupiedMVBeds_lag_21             147
covidOccupiedMVBeds_lag_28             196
covidOccupiedMVBeds_rolling_7_mean      49
covidOccupiedMVBeds_rolling_7_std       49
covidOccupiedMVBeds_rolling_14_mean     98
covidOccupiedMVBeds_rolling_14_std      98
covidOccupiedMVBeds_rolling_21_mean    147
covidOccupiedMVBeds_rolling_21_std     147
dtype: int64

In [18]:
# Backward-fill the NaNs introduced by the lag and rolling features.
# `DataFrame.bfill()` is the supported spelling; `fillna(method="bfill")` is
# deprecated in pandas and emits a FutureWarning.
# NOTE(review): the fill is applied over the whole frame, not per `areaName`;
# it stays within an area only while the NaNs sit at the start of each area's
# rows and are followed by valid values from that same area — confirm.
data = data.bfill()
nc = data.isnull().sum()
nc[nc > 0]

  data = data.fillna(method="bfill")


Series([], dtype: int64)

In [19]:
data.tail()

Unnamed: 0,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,...,covidOccupiedMVBeds_lag_28,covidOccupiedMVBeds_rolling_7_mean,covidOccupiedMVBeds_rolling_7_std,covidOccupiedMVBeds_rolling_14_mean,covidOccupiedMVBeds_rolling_14_std,covidOccupiedMVBeds_rolling_21_mean,covidOccupiedMVBeds_rolling_21_std,month,day,day_of_week
7163,South West,2022-09-08,2.0,70521,308.0,57,349.0,7.0,1881291.0,11968.0,...,9.0,4.0,2.309401,4.928571,2.055547,4.904762,1.70014,9,8,3
7162,South West,2022-09-09,1.0,70559,308.0,38,299.0,2.0,1881590.0,11970.0,...,7.0,3.428571,2.225394,4.714286,2.198901,4.761905,1.813967,9,9,4
7161,South West,2022-09-10,0.0,70609,291.0,50,359.0,4.0,1881949.0,11974.0,...,8.0,2.571429,1.718249,4.5,2.409915,4.571429,1.989257,9,10,5
7160,South West,2022-09-11,0.0,70663,309.0,54,401.0,3.0,1882350.0,11977.0,...,8.0,1.857143,1.573592,4.142857,2.684919,4.333333,2.221111,9,11,6
7159,South West,2022-09-12,0.0,70709,322.0,46,516.0,0.0,1882866.0,11977.0,...,8.0,1.285714,1.380131,3.714286,2.840059,4.142857,2.414243,9,12,0


In [20]:
data["areaName"].nunique()

7

In [21]:
# data["date"] = pd.to_datetime(data["date"])
# data = data.set_index("date")

# data['day'] = data.index.day
# data['month'] = data.index.month

In [22]:
# the second wave of the pandemic, the initial data 4 months of 2021, the test data covers september to november 2021
# data = data[(data.date >= "2021-04-01") & (data.date <= "2021-11-30")]

# Chronological 75% / 10% / 15% train / validation / test split, done per area.
# Each split is pickled into `preprocessed_data_dir` as "<area>_<split>.pkl".
# NOTE: `train`, `val` and `test` are rebound on every iteration, so after the
# loop they hold the splits of the LAST area only.
for area in data["areaName"].unique():
    area_data = data[data["areaName"] == area]
    min_data = area_data["date"].min()
    max_data = area_data["date"].max()

    print(f"Area: {area} | Min Date: {min_data} | Max Date: {max_data}")

    # Calculate the date ranges for train, val, and test sets
    date_range = max_data - min_data
    train_end = min_data + pd.Timedelta(days=date_range.days * 0.75)
    val_end = train_end + pd.Timedelta(days=date_range.days * 0.10)

    # Split by date. `.copy()` gives independent frames so that later cells can
    # modify them without writing into a slice of `data`.
    train = area_data[area_data["date"] < train_end].copy()
    val = area_data[
        (area_data["date"] >= train_end) & (area_data["date"] < val_end)
    ].copy()
    test = area_data[area_data["date"] >= val_end].copy()

    # Calculate the percentage of rows in each dataset
    total_samples = len(area_data)
    train_percentage = len(train) / total_samples * 100
    val_percentage = len(val) / total_samples * 100
    test_percentage = len(test) / total_samples * 100

    # Save the splits next to the other processed data
    train.to_pickle(preprocessed_data_dir / f"{area}_train.pkl")
    val.to_pickle(preprocessed_data_dir / f"{area}_val.pkl")
    test.to_pickle(preprocessed_data_dir / f"{area}_test.pkl")

    print(
        f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}"
    )
    print(
        f"Percentage of Dates in Train: {train_percentage:.2f}% | Percentage of Dates in Validation: {val_percentage:.2f}% | Percentage of Dates in Test: {test_percentage:.2f}%"
    )
    print(
        f"Max Date in Train: {train.date.max()} | Min Date in Validation: {val.date.min()} | Min Date in Test: {test.date.min()}"
    )

Area: East of England | Min Date: 2020-04-01 00:00:00 | Max Date: 2022-09-12 00:00:00
# of Training samples: 671 | # of Validation samples: 89 | # of Test samples: 135
Percentage of Dates in Train: 74.97% | Percentage of Dates in Validation: 9.94% | Percentage of Dates in Test: 15.08%
Max Date in Train: 2022-01-31 00:00:00 | Min Date in Validation: 2022-02-01 00:00:00 | Min Date in Test: 2022-05-01 00:00:00
Area: London | Min Date: 2020-04-01 00:00:00 | Max Date: 2022-09-12 00:00:00
# of Training samples: 671 | # of Validation samples: 89 | # of Test samples: 134
Percentage of Dates in Train: 75.06% | Percentage of Dates in Validation: 9.96% | Percentage of Dates in Test: 14.99%
Max Date in Train: 2022-01-31 00:00:00 | Min Date in Validation: 2022-02-01 00:00:00 | Min Date in Test: 2022-05-01 00:00:00
Area: Midlands | Min Date: 2020-04-01 00:00:00 | Max Date: 2022-09-12 00:00:00
# of Training samples: 1342 | # of Validation samples: 178 | # of Test samples: 270
Percentage of Dates in T

In [23]:
from src.transforms.target_transformations import AutoStationaryTransformer

# Fitted transformer per area, kept so the transformation can be inverted later.
transformer_pipeline = {}

# The target column of `train` is overwritten below; work on an explicit copy
# so the assignment never lands on a slice of another frame.
train = train.copy()

# Iterate over the areas that are actually present in `train`. Looping over
# every area in `data` would fit the transformer on an empty frame for each
# area that `train` does not contain.
for _id in tqdm(train["areaName"].unique()):
    area_rows = train["areaName"] == _id
    auto_stationary = AutoStationaryTransformer(seasonal_period=7)
    y = train.loc[area_rows, ["date", "covidOccupiedMVBeds"]].set_index("date")

    y_stat = auto_stationary.fit_transform(y, freq="D")

    # NOTE(review): assumes `y_stat` has one value per row of `y`, in the same
    # order — confirm against AutoStationaryTransformer.
    train.loc[area_rows, "covidOccupiedMVBeds"] = y_stat.values

    transformer_pipeline[_id] = auto_stationary

  0%|          | 0/7 [00:00<?, ?it/s]

In [24]:
# correlation matrix for the features in the data for one area
# Pairwise correlation of the remaining columns for a single area,
# shown as a heatmap.
area = "East of England"
area_data = train[train["areaName"] == area].drop(
    columns=["areaName", "date", "population"]
)
corr = area_data.corr()

fig = px.imshow(corr, labels=dict(x="Features", y="Features", color="Correlation"))
fig.update_layout(
    autosize=False,
    width=900,
    height=900,
    # Title font goes under `title.font`; the `titlefont` attribute is deprecated.
    title=dict(
        text=f"Correlation Matrix for Features in {area}",
        x=0.5,
        xanchor="center",
        yanchor="top",
        font=dict(size=20),
    ),
    legend_title=None,
    legend=dict(
        font=dict(size=15),
        orientation="h",
        yanchor="bottom",
        y=0.98,
        xanchor="right",
        x=1,
    ),
    yaxis=dict(
        title=dict(text="Features", font=dict(size=15)),
        tickfont=dict(size=15),
    ),
    xaxis=dict(
        title=dict(text="Features", font=dict(size=15)),
        tickfont=dict(size=15),
    ),
)
fig.show()

## Detecting seasonality


In [25]:
data.columns

Index(['areaName', 'date', 'covidOccupiedMVBeds', 'cumAdmissions',
       'hospitalCases', 'newAdmissions', 'new_confirmed', 'new_deceased',
       'cumulative_confirmed', 'cumulative_deceased', 'population',
       'covidOccupiedMVBeds_lag_3', 'covidOccupiedMVBeds_lag_7',
       'covidOccupiedMVBeds_lag_14', 'covidOccupiedMVBeds_lag_21',
       'covidOccupiedMVBeds_lag_28', 'covidOccupiedMVBeds_rolling_7_mean',
       'covidOccupiedMVBeds_rolling_7_std',
       'covidOccupiedMVBeds_rolling_14_mean',
       'covidOccupiedMVBeds_rolling_14_std',
       'covidOccupiedMVBeds_rolling_21_mean',
       'covidOccupiedMVBeds_rolling_21_std', 'month', 'day', 'day_of_week'],
      dtype='object')

In [26]:
# Overall date span across all areas
min_date, max_date = data["date"].min(), data["date"].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

# Cut-off dates: the first 75% of the span is train, the next 10% is validation
date_range = max_date - min_date
train_end = min_date + pd.Timedelta(days=date_range.days * 0.75)
val_end = train_end + pd.Timedelta(days=date_range.days * 0.10)

# Chronological split of the full (all-area) frame.
# NOTE: this rebinds `train`, `val` and `test`.
train = data.loc[data["date"] < train_end]
val = data.loc[(data["date"] >= train_end) & (data["date"] < val_end)]
test = data.loc[data["date"] >= val_end]

# Share of rows that landed in each split
total_samples = len(data)
train_percentage = len(train) / total_samples * 100
val_percentage = len(val) / total_samples * 100
test_percentage = len(test) / total_samples * 100

print(
    f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}"
)
print(
    f"Percentage of Dates in Train: {train_percentage:.2f}% | Percentage of Dates in Validation: {val_percentage:.2f}% | Percentage of Dates in Test: {test_percentage:.2f}%"
)
print(
    f"Max Date in Train: {train.date.max()} | Min Date in Validation: {val.date.min()} | Min Date in Test: {test.date.min()}"
)

Minimum Date: 2020-04-01 00:00:00
Maximum Date: 2022-09-12 00:00:00
# of Training samples: 6039 | # of Validation samples: 801 | # of Test samples: 1214
Percentage of Dates in Train: 74.98% | Percentage of Dates in Validation: 9.95% | Percentage of Dates in Test: 15.07%
Max Date in Train: 2022-01-31 00:00:00 | Min Date in Validation: 2022-02-01 00:00:00 | Min Date in Test: 2022-05-01 00:00:00


In [27]:
# feat_config = FeatureConfig(
#     continuous_features=[
#         "covidOccupiedMVBeds_lag_1",
#         "covidOccupiedMVBeds_lag_3",
#         "covidOccupiedMVBeds_lag_7",
#         "covidOccupiedMVBeds_lag_14",
#         "covidOccupiedMVBeds_lag_21",
#         "covidOccupiedMVBeds_lag_28",
#         "covidOccupiedMVBeds_roll_7_mean",
#         "covidOccupiedMVBeds_roll_7_std",
#         "covidOccupiedMVBeds_roll_14_mean",
#         "covidOccupiedMVBeds_roll_14_std",
#         "covidOccupiedMVBeds_roll_21_mean",
#         "covidOccupiedMVBeds_roll_21_std",
#         "covidOccupiedMVBeds_ewma_7",
#         "covidOccupiedMVBeds_ewma_14",
#         "covidOccupiedMVBeds_ewma_21",
#     ],
#     categorical_features=["day", "month"],
#     target="covidOccupiedMVBeds",
#     date="date",
# )

In [28]:
len(train), len(val), len(test)

(6039, 801, 1214)

In [29]:
nc = train.isnull().sum()
nc[nc > 0]

Series([], dtype: int64)

In [30]:
nc = val.isnull().sum()
nc[nc > 0]

Series([], dtype: int64)

In [31]:
target = "covidOccupiedMVBeds"
index_cols = ["date", "areaName"]

In [32]:
train = train.set_index(index_cols)
val = val.set_index(index_cols)
pred_df = pd.concat([train, val], axis=0)

In [33]:
train

Unnamed: 0_level_0,Unnamed: 1_level_0,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,population,covidOccupiedMVBeds_lag_3,...,covidOccupiedMVBeds_lag_28,covidOccupiedMVBeds_rolling_7_mean,covidOccupiedMVBeds_rolling_7_std,covidOccupiedMVBeds_rolling_14_mean,covidOccupiedMVBeds_rolling_14_std,covidOccupiedMVBeds_rolling_21_mean,covidOccupiedMVBeds_rolling_21_std,month,day,day_of_week
date,areaName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-04-01,East of England,0.0,1400,833.0,167,334.0,75.0,2938.0,455.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.322510,248.523804,79.266396,4,1,2
2020-04-02,East of England,119.0,1584,841.0,184,372.0,71.0,3310.0,526.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.322510,248.523804,79.266396,4,2,3
2020-04-03,East of England,162.0,1776,914.0,192,350.0,85.0,3660.0,611.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.322510,248.523804,79.266396,4,3,4
2020-04-04,East of England,171.0,1939,988.0,163,268.0,70.0,3928.0,681.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.322510,248.523804,79.266396,4,4,5
2020-04-05,East of England,219.0,2159,1230.0,220,281.0,91.0,4209.0,772.0,6235410,119.0,...,0.0,161.428574,83.500214,220.857147,84.322510,248.523804,79.266396,4,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-27,South West,14.0,43023,835.0,127,9121.0,22.0,1226282.0,8805.0,5616381,16.0,...,56.0,15.428572,1.812654,20.071428,6.673303,28.571428,13.991323,1,27,3
2022-01-28,South West,15.0,43137,830.0,114,8135.0,20.0,1234417.0,8825.0,5616381,14.0,...,58.0,15.428572,1.812654,18.714285,5.703286,26.714285,13.149688,1,28,4
2022-01-29,South West,17.0,43240,798.0,103,7472.0,14.0,1241889.0,8839.0,5616381,16.0,...,55.0,15.714286,1.496027,17.571428,4.535573,24.857143,11.786797,1,29,5
2022-01-30,South West,17.0,43349,831.0,109,8758.0,24.0,1250647.0,8863.0,5616381,14.0,...,53.0,15.571428,1.272418,16.857143,3.634390,23.238094,10.251365,1,30,6


In [34]:
sample_train_df = train[train.index.get_level_values("areaName") == "East of England"]
sample_val = val[val.index.get_level_values("areaName") == "East of England"]


In [35]:
pred_df = pd.concat([sample_train_df, sample_val], axis=0)
pred_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,population,covidOccupiedMVBeds_lag_3,...,covidOccupiedMVBeds_lag_28,covidOccupiedMVBeds_rolling_7_mean,covidOccupiedMVBeds_rolling_7_std,covidOccupiedMVBeds_rolling_14_mean,covidOccupiedMVBeds_rolling_14_std,covidOccupiedMVBeds_rolling_21_mean,covidOccupiedMVBeds_rolling_21_std,month,day,day_of_week
date,areaName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-04-01,East of England,0.0,1400,833.0,167,334.0,75.0,2938.0,455.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,1,2
2020-04-02,East of England,119.0,1584,841.0,184,372.0,71.0,3310.0,526.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,2,3
2020-04-03,East of England,162.0,1776,914.0,192,350.0,85.0,3660.0,611.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,3,4
2020-04-04,East of England,171.0,1939,988.0,163,268.0,70.0,3928.0,681.0,6235410,0.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,4,5
2020-04-05,East of England,219.0,2159,1230.0,220,281.0,91.0,4209.0,772.0,6235410,119.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,5,6


In [36]:
# split the sample_train_df into train and val based on date
sample_val_df = sample_train_df[sample_train_df.index.get_level_values("date") >= "2021-09-01"]
sample_train_df = sample_train_df[sample_train_df.index.get_level_values("date") < "2021-09-01"]

sample_test_df = sample_val.copy()

In [37]:
# Label each split. `.assign` returns a new frame, so the label is not written
# into a slice of another DataFrame (which triggers SettingWithCopyWarning).
sample_train_df = sample_train_df.assign(type="train")
sample_val_df = sample_val_df.assign(type="val")
sample_test_df = sample_test_df.assign(type="test")

# Stack target + label of the three splits into one long frame.
sample_df = pd.concat(
    [
        split_df[[target, "type"]]
        for split_df in (sample_train_df, sample_val_df, sample_test_df)
    ],
    axis=0,
)
sample_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,covidOccupiedMVBeds,type
date,areaName,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-04-01,East of England,0.0,train
2020-04-02,East of England,119.0,train
2020-04-03,East of England,162.0,train
2020-04-04,East of England,171.0,train
2020-04-05,East of England,219.0,train


In [38]:
# Missing_Value_Config = MissingValueConfig(
#     bfill_columns=[
#         "covidOccupiedMVBeds_lag_1",
#         "covidOccupiedMVBeds_lag_3",
#         "covidOccupiedMVBeds_lag_7",
#         "covidOccupiedMVBeds_lag_14",
#         "covidOccupiedMVBeds_lag_21",
#         "covidOccupiedMVBeds_lag_28",
#         "covidOccupiedMVBeds_roll_7_mean",
#         "covidOccupiedMVBeds_roll_7_std",
#         "covidOccupiedMVBeds_roll_14_mean",
#         "covidOccupiedMVBeds_roll_14_std",
#         "covidOccupiedMVBeds_roll_21_mean",
#         "covidOccupiedMVBeds_roll_21_std",
#         "covidOccupiedMVBeds_ewma_7",
#         "covidOccupiedMVBeds_ewma_14",
#         "covidOccupiedMVBeds_ewma_21",
#         "Vax_index",

#     ],
#     ffill_columns=[],
#     zero_fill_columns=[],
# )

In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics, linear_model, tree, discriminant_analysis,\
                    ensemble, neural_network, inspection
import matplotlib.pyplot as plt
from pdpbox import pdp
from pycebox.ice import ice, ice_plot

In [40]:
# Reload the per-area splits pickled by the split loop earlier in this
# notebook, using the shared `preprocessed_data_dir` rather than a
# hard-coded directory.
# NOTE: `pd.read_pickle` must only be used on trusted files.
area = "East of England"
train = pd.read_pickle(preprocessed_data_dir / f"{area}_train.pkl")
val = pd.read_pickle(preprocessed_data_dir / f"{area}_val.pkl")
test = pd.read_pickle(preprocessed_data_dir / f"{area}_test.pkl")

In [41]:
# train = train.drop(columns=["areaName", "population", 'date'])
# test = test.drop(columns=["areaName", "population", 'date'])
# val = val.drop(columns=["areaName", "population", 'date'])

In [42]:
train.shape

(671, 25)

In [43]:
train.head()

Unnamed: 0,areaName,date,covidOccupiedMVBeds,cumAdmissions,hospitalCases,newAdmissions,new_confirmed,new_deceased,cumulative_confirmed,cumulative_deceased,...,covidOccupiedMVBeds_lag_28,covidOccupiedMVBeds_rolling_7_mean,covidOccupiedMVBeds_rolling_7_std,covidOccupiedMVBeds_rolling_14_mean,covidOccupiedMVBeds_rolling_14_std,covidOccupiedMVBeds_rolling_21_mean,covidOccupiedMVBeds_rolling_21_std,month,day,day_of_week
894,East of England,2020-04-01,0.0,1400,833.0,167,334.0,75.0,2938.0,455.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,1,2
893,East of England,2020-04-02,119.0,1584,841.0,184,372.0,71.0,3310.0,526.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,2,3
892,East of England,2020-04-03,162.0,1776,914.0,192,350.0,85.0,3660.0,611.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,3,4
891,East of England,2020-04-04,171.0,1939,988.0,163,268.0,70.0,3928.0,681.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,4,5
890,East of England,2020-04-05,219.0,2159,1230.0,220,281.0,91.0,4209.0,772.0,...,0.0,161.428574,83.500214,220.857147,84.32251,248.523804,79.266396,4,5,6


In [44]:
from collections import namedtuple

# Lightweight record describing which columns play which role for a model.
# NOTE: this rebinds `FeatureConfig`, shadowing the class of the same name
# imported from `src.forecasting.ml_forecasting` in the import cell.
FeatureConfig = namedtuple(
    "FeatureConfig",
    (
        "target",
        "index_cols",
        "static_categoricals",
        "static_reals",
        "time_varying_known_categoricals",
        "time_varying_known_reals",
        "time_varying_unknown_reals",
        "group_ids",
    ),
)