In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
import seaborn as sns

from plotting import plot_formatter

# Data 

In [2]:
# Load the preprocessed COVID-19 table, keep only the five countries of
# interest, and index the result by date. Written as a single chain so the
# cell builds `data` in one step without in-place mutation.
data = (
    pd.read_csv('../Data/Covid19_Data.csv', index_col=0, parse_dates=['Date'])
    .loc[lambda frame: frame.CountryName.isin(['England', 'France', 'Germany', 'Spain', 'Italy'])]
    .set_index('Date')
)
data.head()

Unnamed: 0_level_0,CountryName,ConfirmedCases,ConfirmedDeaths,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,...,Vaccinated,FullyVaccinated,DailyVaccination,HeatIndexC,humidity,tempC,windspeedKmph,precipMM,DewPointC,pressure
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,5,82,6,17,0.2,2,1032
2020-01-23,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,3,91,5,6,0.0,2,1033
2020-01-24,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,2,82,5,7,0.0,0,1024
2020-01-25,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,3,84,4,9,0.0,0,1021
2020-01-26,Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,2,77,4,7,0.0,-1,1018


In [41]:
from datetime import datetime, timedelta
import pandas as pd


def generate_data(file_path, country, train_start_date, train_end_date, test_period='short',
                  return_test=False):
    """
    Build the training (and optionally the test) dataset for one country.

    Parameters
    ----------
    file_path: str
        Path to the data file containing the preprocessed data. The CSV is
        read with its first column as the index and must provide ``Date``
        and ``CountryName`` columns.
    country: str
        Name of the country to use
    train_start_date: str
        The beginning period of training data (inclusive), format of yyyy-mm-dd
    train_end_date: str
        The ending date of training data (inclusive), format of yyyy-mm-dd
    test_period: str
        'short' (one week ahead) or 'long' (four weeks ahead). The test
        window starts on the day after ``train_end_date``.
    return_test: bool, default False
        When False (default) only the training dataframe is returned, which
        keeps existing callers working. When True a ``(train, test)`` tuple
        is returned.

    Return
    ------
    pandas.DataFrame
        The training dataframe, when ``return_test`` is False.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The training and test dataframes, when ``return_test`` is True.

    Raises
    ------
    ValueError
        If ``test_period`` is not 'short' or 'long', or if ``train_end_date``
        does not match the yyyy-mm-dd format.
    """
    # Number of weeks covered by each supported test horizon.
    horizon_weeks = {'short': 1, 'long': 4}
    if test_period not in horizon_weeks:
        # Fail early with a clear message instead of hitting an unbound
        # ``test_end_date`` further down.
        raise ValueError(
            "test_period must be one of %s, got %r" % (sorted(horizon_weeks), test_period)
        )

    # Read Data
    df = pd.read_csv(file_path, index_col=0, parse_dates=['Date']).set_index('Date')

    # Filtering Country
    df_country = df.loc[df.CountryName == country]

    # Training df (label slicing on the date index includes both end points)
    df_train = df_country.loc[train_start_date:train_end_date]

    # Test df: starts the day after training ends. Because .loc slicing is
    # inclusive on both ends, the last test day is start + N weeks - 1 day,
    # so 'short' yields exactly 7 days and 'long' exactly 28 days.
    test_start_date = datetime.strptime(train_end_date, '%Y-%m-%d') + timedelta(days=1)
    test_end_date = (
        test_start_date + timedelta(weeks=horizon_weeks[test_period]) - timedelta(days=1)
    )
    df_test = df_country.loc[test_start_date:test_end_date]

    if return_test:
        return df_train, df_test
    return df_train

In [42]:
# Show the date index of the frame generate_data returns for Italy.
generate_data(
    file_path='../Data/Covid19_Data.csv',
    country='Italy',
    train_start_date='2020-10-01',
    train_end_date='2020-11-1',
    test_period='short',
).index

DatetimeIndex(['2020-10-01', '2020-10-02', '2020-10-03', '2020-10-04',
               '2020-10-05', '2020-10-06', '2020-10-07', '2020-10-08',
               '2020-10-09', '2020-10-10', '2020-10-11', '2020-10-12',
               '2020-10-13', '2020-10-14', '2020-10-15', '2020-10-16',
               '2020-10-17', '2020-10-18', '2020-10-19', '2020-10-20',
               '2020-10-21', '2020-10-22', '2020-10-23', '2020-10-24',
               '2020-10-25', '2020-10-26', '2020-10-27', '2020-10-28',
               '2020-10-29', '2020-10-30', '2020-10-31', '2020-11-01'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [40]:
# NOTE(review): this is the same call as the previous cell, but the stored
# output below lists 2020-11-02..2020-11-09, which does not match the
# `return df_train` in the generate_data definition visible in this notebook.
# This cell's execution count (40) precedes the definition cell's (41), so the
# output presumably comes from an earlier version of the function that
# returned the test frame. Restart the kernel and re-run to refresh it.
generate_data(file_path='../Data/Covid19_Data.csv',
              country='Italy', train_start_date='2020-10-01', train_end_date='2020-11-1', test_period='short').index

DatetimeIndex(['2020-11-02', '2020-11-03', '2020-11-04', '2020-11-05',
               '2020-11-06', '2020-11-07', '2020-11-08', '2020-11-09'],
              dtype='datetime64[ns]', name='Date', freq=None)

__Variant Emergence Dates__

* Alpha $\rightarrow$ Sep-2020 
* Beta $\rightarrow$ May-2020 
* Gamma $\rightarrow$ Nov-2020 
* Delta $\rightarrow$ Oct-2020
* Lambda $\rightarrow$ Dec-2020
* Mu $\rightarrow$ Jan-2021