# Packages

In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

from preprocessing import filter_country, colnames_generator, Create_TrainTest
from prediction import evaluate, onestep_prediction, multistep_prediction, plot
from prediction import train_lasso, train_ridge

import keras
import keras_tuner as kt

# Data 

In [2]:
data = pd.read_csv('../Data/Covid19_Europe_20210710_preprocessed.csv', index_col=0, parse_dates=['Date'])
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28407 entries, 0 to 28406
Data columns (total 39 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   CountryName                           28407 non-null  object        
 1   Date                                  28407 non-null  datetime64[ns]
 2   ConfirmedCases                        28407 non-null  int64         
 3   ConfirmedDeaths                       28407 non-null  int64         
 4   C1_School closing                     28407 non-null  int64         
 5   C2_Workplace closing                  28407 non-null  int64         
 6   C3_Cancel public events               28407 non-null  int64         
 7   C4_Restrictions on gatherings         28407 non-null  int64         
 8   C5_Close public transport             28407 non-null  int64         
 9   C6_Stay at home requirements          28407 non-null  int64         
 10

# Implementation

__Variants Emergence__

* Alpha $\rightarrow$ Sep-2020 
* Beta $\rightarrow$ May-2020 
* Gamma $\rightarrow$ Nov-2020 
* Delta $\rightarrow$ Oct-2020
* Lambda $\rightarrow$ Dec-2020
* Mu $\rightarrow$ Jan-2021

### 1. Extrcating data for each country

In [40]:
# Please add the necesary columns here

COLS = ['C1_School closing', 'C2_Workplace closing', 'C3_Cancel public events', 'C4_Restrictions on gatherings', 
        'C6_Stay at home requirements', 'H2_Testing policy', 'H3_Contact tracing', 'H6_Facial Coverings', 
        'H7_Vaccination policy'] 

In [41]:
T = 21       # days in past to include in training data
n_test = 30   # days to include for future

country_list = data.CountryName.sort_values().unique()

X_all_train_dict = {}
y_all_train_dict = {}
X_all_test_dict = {}
y_all_test_dict = {}


for country in country_list:
    
    X, y, indx = filter_country(data, 
                                country=country, 
                                COLS=COLS,
                                start_time='2019-01-01',   # Starting time
                                end_time='2020-10-01', 
                                outcome='PredictionRatio' )
    
    
    X_train, X_test, y_train, y_test, train_indx, test_indx = Create_TrainTest(X = X, 
                                                                               y = y,
                                                                               method='single', 
                                                                               indx=indx, 
                                                                               T=T, 
                                                                               n_test=n_test)
    
    X_all_train_dict[country] = X_train
    y_all_train_dict[country] = y_train
    
    X_all_test_dict[country] = X_test
    y_all_test_dict[country] = y_test

In [49]:
X_all_train_dict.keys()

dict_keys(['Albania', 'Andorra', 'Austria', 'Azerbaijan', 'Belarus', 'Belgium', 'Bosnia and Herzegovina', 'Bulgaria', 'Croatia', 'Cyprus', 'Czech Republic', 'Denmark', 'England', 'Estonia', 'Faeroe Islands', 'Finland', 'France', 'Georgia', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Kazakhstan', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malta', 'Moldova', 'Monaco', 'Netherlands', 'Northern Ireland', 'Norway', 'Poland', 'Portugal', 'Romania', 'Russia', 'San Marino', 'Scotland', 'Serbia', 'Slovak Republic', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Turkey', 'Ukraine', 'Wales'])

In [52]:
train_indx[-1]

Timestamp('2020-08-25 00:00:00')

In [53]:
test_indx[-1]

Timestamp('2020-09-01 00:00:00')

### 2. Merging all countries

In [43]:
n_features = X.shape[1]

X_all_train = np.array([]).reshape(-1, n_features*T)
y_all_train = np.array([]).reshape(-1, 1)

X_all_test = np.array([]).reshape(-1, n_features*T)
y_all_test = np.array([]).reshape(-1, 1)


for country in country_list:
    X_all_train = np.concatenate((X_all_train, X_all_train_dict[country]))
    y_all_train = np.concatenate((y_all_train, y_all_train_dict[country]))
    
    X_all_test = np.concatenate((X_all_test, X_all_test_dict[country]))
    y_all_test= np.concatenate((y_all_test, y_all_test_dict[country]))

In [17]:
X_all_train.shape

(11067, 231)

# 3. Lasso 

In [8]:
lasso_trained_1step = train_lasso(X_all_train, 
                                  y_all_train, 
                                  scoring='neg_root_mean_squared_error', 
                                  n_cv=5)

### 3.1. Overall Performance 

In [9]:
lasso_pred_1step = lasso_trained_1step.predict(X_all_test)

pd.Series(evaluate(y_true=y_all_test, 
          y_pred=lasso_pred_1step), 
          name='all')

RMSE     1.367878e-01
MAE      6.946787e-02
MAPE     2.405800e+13
sMAPE    7.899660e-02
R2       3.134684e-02
Name: all, dtype: float64

### 3.2. By Country 

In [10]:
lasso_1step_df = pd.DataFrame()

for country in country_list:
    temp = pd.Series(evaluate(y_true=y_all_test_dict[country], 
                              y_pred=lasso_trained_1step.predict(X_all_test_dict[country])), 
                     name=country)
    lasso_1step_df = pd.concat([lasso_1step_df, temp], axis='columns')

In [11]:
lasso_1step_df

Unnamed: 0,Albania,Andorra,Austria,Azerbaijan,Belarus,Belgium,Bosnia and Herzegovina,Bulgaria,Croatia,Cyprus,...,Scotland,Serbia,Slovak Republic,Slovenia,Spain,Sweden,Switzerland,Turkey,Ukraine,Wales
RMSE,0.049083,0.249566,0.044838,0.021816,0.048869,0.049068,0.06443,0.073107,0.036705,0.217614,...,0.081049,0.054939,0.060267,0.046877,0.037902,0.173071,0.022472,0.013485,0.01884,0.061415
MAE,0.043588,0.155318,0.035811,0.019662,0.044908,0.036423,0.047072,0.057114,0.031505,0.176378,...,0.070424,0.042553,0.052862,0.03875,0.027947,0.140226,0.019226,0.010883,0.015274,0.052383
MAPE,0.045806,0.11195,0.036347,0.019911,0.043865,0.038181,0.04554,0.05938,0.031169,0.239705,...,0.064871,0.045706,0.053323,0.038647,0.026342,0.143943,0.018861,0.010743,0.015217,0.047318
sMAPE,0.046294,0.120762,0.03515,0.017151,0.045496,0.035901,0.040815,0.05892,0.033196,0.198972,...,0.076085,0.044293,0.050688,0.039327,0.025176,0.137185,0.018078,0.011096,0.014698,0.047155
R2,-1.954934,-0.387225,-0.869741,-0.72446,-0.388472,-0.839028,-0.131206,-0.223059,0.014582,-1.350153,...,-0.034392,-1.965576,-0.358957,-0.03161,-0.904518,-0.12957,-0.524218,-0.510814,-0.108974,-0.94538


# 4. Ridge 

In [12]:
ridge_trained_1step = train_ridge(X_all_train, 
                                  y_all_train, 
                                  scoring='neg_root_mean_squared_error', 
                                  n_cv=5)

### 4.1. Overall Performance

In [13]:
ridge_pred_1step = ridge_trained_1step.predict(X_all_test)

pd.Series(evaluate(y_true=y_all_test, 
          y_pred=ridge_pred_1step), 
          name='all')

RMSE     1.362174e-01
MAE      7.000318e-02
MAPE     2.354443e+13
sMAPE    7.379070e-02
R2       3.940929e-02
Name: all, dtype: float64

### 4.2. By Country

In [14]:
ridge_1step_df = pd.DataFrame()

for country in country_list:
    temp = pd.Series(evaluate(y_true=y_all_test_dict[country], 
                              y_pred=ridge_trained_1step.predict(X_all_test_dict[country])), 
                     name=country)
    ridge_1step_df = pd.concat([ridge_1step_df, temp], axis='columns')

In [15]:
ridge_1step_df

Unnamed: 0,Albania,Andorra,Austria,Azerbaijan,Belarus,Belgium,Bosnia and Herzegovina,Bulgaria,Croatia,Cyprus,...,Scotland,Serbia,Slovak Republic,Slovenia,Spain,Sweden,Switzerland,Turkey,Ukraine,Wales
RMSE,0.0489,0.253145,0.045854,0.025109,0.050892,0.054239,0.067476,0.069194,0.038727,0.219665,...,0.076358,0.050783,0.060774,0.047299,0.0375,0.174233,0.023193,0.010062,0.019205,0.061985
MAE,0.043139,0.163501,0.036626,0.021047,0.047435,0.043077,0.051864,0.054047,0.033393,0.176307,...,0.065112,0.037734,0.05344,0.039037,0.028885,0.140797,0.019519,0.00882,0.015246,0.053793
MAPE,0.045354,0.11956,0.037189,0.021371,0.046386,0.04483,0.05038,0.056156,0.033162,0.240293,...,0.060256,0.040644,0.054011,0.038991,0.027359,0.144663,0.019166,0.008681,0.015209,0.048683
sMAPE,0.044055,0.134986,0.036166,0.021065,0.045429,0.043511,0.051258,0.054202,0.032648,0.198877,...,0.061702,0.039167,0.052767,0.038542,0.027875,0.142047,0.018945,0.00865,0.015091,0.050177
R2,-1.932969,-0.427298,-0.955431,-1.284477,-0.505822,-1.247109,-0.240697,-0.095649,-0.097002,-1.39467,...,0.081892,-1.533954,-0.381914,-0.050269,-0.864341,-0.144786,-0.623647,0.158795,-0.152346,-0.981618


# 5. ANN

### 5.1. Tuning Model

In [5]:
def model_builder(hp):
    
    # Initialize sequential API and start building model.
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=(210)))
    
    # Tune the number of hidden layers and units in each.
    # Number of hidden layers: 1 - 5
    # Number of Units: 32 - 512 with stepsize of 32
    for i in range(1, hp.Int("num_layers", 1, 4)):
        model.add(
            keras.layers.Dense(
                units=hp.Int("units_" + str(i), min_value=20, max_value=200, step=20),
                activation="relu")
            )
        
        # Tune dropout layer with values from 0 - 0.3 with stepsize of 0.1.
        model.add(keras.layers.Dropout(hp.Float("dropout_" + str(i), 0, 0.3, step=0.1)))
    
    # Add output layer.
    model.add(keras.layers.Dense(1))
    
    # Tune learning rate for Adam optimizer with values from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
    
    # Define optimizer, loss, and metrics
    model.compile(optimizer=keras.optimizers.adam_v2.Adam(learning_rate=hp_learning_rate),
                  loss='mse',
                  metrics=["mae", "mse"])
    
    return model

In [9]:
tuner = kt.Hyperband(hypermodel=model_builder,
                     objective = kt.Objective('val_mse', direction="min"),
                     max_epochs=100,
                     factor=3,
                     directory='MLP_hyperparameter',
                     project_name='Covid19')

stop_early = keras.callbacks.EarlyStopping(monitor='val_mse', patience=5)

INFO:tensorflow:Reloading Oracle from existing project MLP_hyperparameter\Covid19\oracle.json
INFO:tensorflow:Reloading Tuner from MLP_hyperparameter\Covid19\tuner0.json


In [63]:
tuner.search(X_all_train, 
             y_all_train, 
             epochs=100, 
             validation_data=(X_all_test, y_all_test), 
             callbacks=[stop_early], 
             verbose=2)

Trial 238 Complete [00h 00m 12s]
val_mse: 0.01901964843273163

Best val_mse So Far: 0.01832188107073307
Total elapsed time: 00h 28m 49s

Search: Running Trial #239

Hyperparameter    |Value             |Best Value So Far 
num_layers        |2                 |3                 
learning_rate     |0.0001            |0.001             
units_1           |100               |80                
dropout_1         |0.2               |0                 
units_2           |60                |60                
dropout_2         |0.2               |0                 
units_3           |40                |180               
dropout_3         |0.2               |0.2               
tuner/epochs      |34                |34                
tuner/initial_e...|0                 |12                
tuner/bracket     |1                 |2                 
tuner/round       |0                 |1                 

Epoch 1/34
346/346 - 2s - loss: 0.3632 - mae: 0.4391 - mse: 0.3632 - val_loss: 0.0431 - val_m

UnknownError: Failed to rename: MLP_hyperparameter\Covid19\trial_face235c583c8a23bbb56ca43018870f\checkpoints\epoch_0\checkpoint_temp/part-00000-of-00001.data-00000-of-00001 to: MLP_hyperparameter\Covid19\trial_face235c583c8a23bbb56ca43018870f\checkpoints\epoch_0\checkpoint.data-00000-of-00001 : Access is denied.
; Input/output error [Op:MergeV2Checkpoints]

In [69]:
# Refit using the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters()[0]
h_model = tuner.hypermodel.build(best_hps)

h_model.summary()

h_model.fit(X_all_train, 
            y_all_train, 
            epochs=100, 
            validation_data=(X_all_test, y_all_test), 
            callbacks=[stop_early], 
            verbose=2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 80)                16880     
_________________________________________________________________
dropout_3 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 60)                4860      
_________________________________________________________________
dropout_4 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 61        
Total params: 21,801
Trainable params: 21,801
Non-trainable params: 0
_________________________________________________________________


### 3.1. Overall Performance

In [71]:
mlp_pred_1step = h_model.predict(X_all_test)

pd.Series(evaluate(y_true=y_all_test, 
          y_pred=mlp_pred_1step), 
          name='all')

RMSE     1.540566e-01
MAE      1.027930e-01
MAPE     2.613917e+13
sMAPE    1.041758e-01
R2      -2.286672e-01
Name: all, dtype: float64

### By Country 

In [74]:
mlp_1step_df = pd.DataFrame()

for country in country_list:
    temp = pd.Series(evaluate(y_true=y_all_test_dict[country], 
                              y_pred=h_model.predict(X_all_test_dict[country])), 
                     name=country)
    mlp_1step_df = pd.concat([mlp_1step_df, temp], axis='columns')

mlp_1step_df

Unnamed: 0,Albania,Andorra,Austria,Azerbaijan,Belarus,Belgium,Bosnia and Herzegovina,Bulgaria,Croatia,Cyprus,...,Scotland,Serbia,Slovak Republic,Slovenia,Spain,Sweden,Switzerland,Turkey,Ukraine,Wales
RMSE,0.116353,0.216892,0.091278,0.083966,0.052046,0.111105,0.103859,0.109345,0.081978,0.280608,...,0.067821,0.104545,0.107897,0.090265,0.065979,0.182884,0.06816,0.026422,0.046822,0.044124
MAE,0.113312,0.166325,0.085309,0.082187,0.048052,0.104453,0.098955,0.095963,0.072924,0.244713,...,0.054564,0.099311,0.096051,0.081548,0.059406,0.165786,0.065453,0.025203,0.043011,0.042559
MAPE,0.118177,0.129777,0.085753,0.082883,0.047014,0.107502,0.100555,0.100216,0.072893,0.324358,...,0.053249,0.10496,0.098364,0.082892,0.057995,0.176131,0.063999,0.024784,0.0428,0.039524
sMAPE,0.111188,0.1378,0.081678,0.079431,0.046028,0.101235,0.095458,0.093889,0.069645,0.261454,...,0.051876,0.099093,0.09255,0.07884,0.05598,0.166871,0.061841,0.02445,0.041731,0.039448
R2,-15.605339,-0.047762,-6.748544,-24.546101,-0.574874,-8.429089,-1.93938,-1.736112,-3.915533,-2.907724,...,0.27571,-9.738826,-3.355849,-2.825039,-4.771396,-0.261294,-13.022763,-4.800535,-5.849714,-0.004156


# LSTM

In [44]:
X_all_train_lstm = X_all_train.reshape(-1, T, n_features)
X_all_test_lstm = X_all_test.reshape(-1, T, n_features) 

In [34]:
X_all_train_lstm.shape

(11067, 21, 11)

In [45]:
def model_builder_lstm(hp):
    
    # Initialize sequential API and start building model.
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=(T, n_features)))
    
    # Tune the number of hidden layers and units in each.
    # Number of hidden layers: 1 - 5
    # Number of Units: 32 - 512 with stepsize of 32
    for i in range(1, hp.Int("num_layers", 1, 4)):
        model.add(
            keras.layers.LSTM(
                units=hp.Int("units_" + str(i), min_value=5, max_value=205, step=20),
                activation="tanh", return_sequences=True)
            )
        
        # Tune dropout layer with values from 0 - 0.3 with stepsize of 0.1.
        model.add(keras.layers.Dropout(hp.Float("dropout_" + str(i), 0, 0.5, step=0.1)))
    
    # Aggregating Layer
    model.add(keras.layers.GlobalMaxPool1D())
    
    # Add output layer.
    model.add(keras.layers.Dense(1))
    
    # Tune learning rate for Adam optimizer with values from 0.01, 0.001, or 0.0001
    hp_learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
    
    # Define optimizer, loss, and metrics
    model.compile(optimizer=keras.optimizers.adam_v2.Adam(learning_rate=hp_learning_rate),
                  loss='mse',
                  metrics=["mae", "mse"])
    
    return model

In [46]:
tuner_lstm = kt.Hyperband(hypermodel=model_builder_lstm,
                         objective = kt.Objective('val_mse', direction="min"),
                         max_epochs=100,
                         factor=3,
                         directory='LSTM_hyperparameter',
                         project_name='Covid19')

stop_early = keras.callbacks.EarlyStopping(monitor='val_mse', patience=5)

INFO:tensorflow:Reloading Oracle from existing project LSTM_hyperparameter\Covid19\oracle.json
INFO:tensorflow:Reloading Tuner from LSTM_hyperparameter\Covid19\tuner0.json


In [None]:
tuner_lstm.search(X_all_train_lstm, 
                 y_all_train, 
                 epochs=100, 
                 validation_data=(X_all_test_lstm, y_all_test), 
                 callbacks=[stop_early], verbose=2)

In [38]:
best_hps_lstm = tuner_lstm.get_best_hyperparameters()[0]
h_lstm = tuner_lstm.hypermodel.build(best_hps_lstm)

h_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 21, 5)             340       
_________________________________________________________________
dropout_2 (Dropout)          (None, 21, 5)             0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 21, 5)             220       
_________________________________________________________________
dropout_3 (Dropout)          (None, 21, 5)             0         
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 5)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6         
Total params: 566
Trainable params: 566
Non-trainable params: 0
________________________________________________________

In [104]:
best_hps_lstm = tuner_lstm.get_best_hyperparameters()[0]
h_lstm = tuner_lstm.hypermodel.build(best_hps_lstm)
h_lstm.fit(X_all_train_lstm, 
            y_all_train, 
            epochs=100, 
            validation_data=(X_all_test_lstm, y_all_test), 
            callbacks=[stop_early], 
            verbose=2)

Epoch 1/100
346/346 - 39s - loss: 0.0795 - mae: 0.1675 - mse: 0.0795 - val_loss: 0.0193 - val_mae: 0.0682 - val_mse: 0.0193
Epoch 2/100
346/346 - 27s - loss: 0.0437 - mae: 0.0951 - mse: 0.0437 - val_loss: 0.0193 - val_mae: 0.0672 - val_mse: 0.0193
Epoch 3/100
346/346 - 22s - loss: 0.0434 - mae: 0.0947 - mse: 0.0434 - val_loss: 0.0201 - val_mae: 0.0720 - val_mse: 0.0201
Epoch 4/100
346/346 - 21s - loss: 0.0432 - mae: 0.0948 - mse: 0.0432 - val_loss: 0.0197 - val_mae: 0.0722 - val_mse: 0.0197
Epoch 5/100
346/346 - 21s - loss: 0.0438 - mae: 0.0975 - mse: 0.0438 - val_loss: 0.0193 - val_mae: 0.0662 - val_mse: 0.0193
Epoch 6/100
346/346 - 21s - loss: 0.0435 - mae: 0.0964 - mse: 0.0435 - val_loss: 0.0194 - val_mae: 0.0662 - val_mse: 0.0194


<keras.callbacks.History at 0x13f42efe250>

### 3.1. Overall Performance

In [115]:
X_all_test_lstm[0:1].shape

(1, 21, 10)

In [118]:
h_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 21, 145)           90480     
_________________________________________________________________
dropout_7 (Dropout)          (None, 21, 145)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 21, 145)           168780    
_________________________________________________________________
dropout_8 (Dropout)          (None, 21, 145)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 21, 5)             3020      
_________________________________________________________________
dropout_9 (Dropout)          (None, 21, 5)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 21, 1)            

In [116]:
h_lstm.predict(X_all_test_lstm[0:1])

array([[[1.0061033],
        [1.0042149],
        [1.0040555],
        [1.0040313],
        [1.0040228],
        [1.0040193],
        [1.0040171],
        [1.0040158],
        [1.0040148],
        [1.0040139],
        [1.0040132],
        [1.0040123],
        [1.0040116],
        [1.0040109],
        [1.0040102],
        [1.0040096],
        [1.004009 ],
        [1.0040084],
        [1.0040079],
        [1.0040075],
        [1.0040069]]], dtype=float32)

In [108]:
lstm_pred_1step.shape

(357, 21, 1)

In [None]:
lstm_pred_1step = h_lstm.predict(X_all_test_lstm)

pd.Series(evaluate(y_true=y_all_test, 
          y_pred=lstm_pred_1step), 
          name='all')