In [1]:
import numpy as np
import pandas as pd



# Policy-indicator (NPI) columns that are kept from the Covid-19 dataset.
NPI_COLS = [
    # C*: containment / closure indicators
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    # E*: economic indicators
    'E1_Income support',
    'E2_Debt/contract relief',
    # H*: health-system indicators
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
    'H6_Facial Coverings',
    'H7_Vaccination policy',
    'H8_Protection of elderly people',
]

# Input files, relative to the notebook's working directory.
covid_path = '../Data/Covid19_Europe_20210710.csv'
vaccination_path = '../Data/Vaccination.csv'
population_path = '../Data/Population.csv'
weather_path = '../Data/Weather.csv'

# Countries included in the final dataset.
COUNTRIES = [
    'Germany',
    'Italy',
    'France',
    'Spain',
    'Denmark',
    'England',
    'Netherlands',
    'Belgium',
    'Sweden',
    'Switzerland',
    'Norway',
]

### Functions

In [4]:
def _prepare_covid(file_path, countries_list):
    """
    Generate the Covid-19 dataset for a given list of countries.

    Reads the CSV at ``file_path``, keeps the identification, case/death and
    NPI columns, and keeps the rows whose ``CountryName`` is in
    ``countries_list``. 'England' is handled as a special case: it is looked
    up as ``RegionName == 'England'`` under ``CountryName == 'United Kingdom'``
    and relabelled so that its ``CountryName`` becomes 'England'.

    Parameters
    ----------
    file_path : str or file-like
        Location of the Covid-19 CSV (passed straight to ``pd.read_csv``).
    countries_list : list of str
        Country names to keep; may contain 'England'.

    Returns
    -------
    pandas.DataFrame
        Filtered data without the ``RegionName`` column. The index is the
        row position in the source file (it is not reset after filtering).

    Raises
    ------
    ValueError
        If no row matches ``countries_list``.
    """
    covid_cols = ['CountryName', 'RegionName', 'Date', 'ConfirmedCases', 'ConfirmedDeaths']

    # Main Covid-19 dataset
    df = pd.read_csv(file_path, index_col=0, parse_dates=['Date']).reset_index(drop=True)
    # Only keep needed columns
    df = df[covid_cols + NPI_COLS]
    # Filter for countries
    df_filtered = df.loc[df.CountryName.isin(countries_list)]
    # Exception for England
    if 'England' in countries_list:
        is_england = (df.CountryName == 'United Kingdom') & (df.RegionName == 'England')
        # Relabel only the extracted England rows. Renaming after the concat
        # would also relabel any other 'United Kingdom' rows that were selected
        # because 'United Kingdom' itself is in countries_list.
        england = df.loc[is_england].assign(CountryName='England')
        df_filtered = pd.concat([df_filtered, england], axis='rows', sort=False)
    # Drop unnecessary columns (non in-place: df_filtered may be a slice of df)
    df_filtered = df_filtered.drop(columns=['RegionName'])

    if df_filtered.empty:
        raise ValueError('Check the list of countries again. The output length is zero!')

    return df_filtered

In [5]:
def _prepare_vaccination(file_path, countries_list):

    # Read data
    df = pd.read_csv(file_path, index_col=0, parse_dates=['date'])
    # Drop unnecessary columns
    df.drop(columns=['iso_code'], inplace=True)
    # Change column names
    df.columns = ['CountryName', 'Date', 'Vaccinated', 'FullyVaccinated', 'DailyVaccination']
    # Filter for countries
    df_filtered = df.loc[df.CountryName.isin(countries_list), :]

    return df_filtered

In [6]:
def _prepare_population(file_path, countries_list):
    
    df = pd.read_csv(file_path)
    # Drop unnecessary columns
    df.drop(columns=['Country Code'], inplace=True)
    # Change Columns Names
    df.columns = ['CountryName', 'Population2020']
    # Filter Countries
    df_filtered = df.loc[df.CountryName.isin(countries_list)]
    if 'England' in countries_list:
        # population of England
        df_filtered = df_filtered.append({'CountryName': 'England', 
                                          'Population2020': 56550000}, 
                                         ignore_index=True)

    return df_filtered

In [14]:
def _prepare_weather(file_path, countries_list):

    
    # Read Data
    df = pd.read_csv(file_path, index_col=0, parse_dates=['date_time'])
    # Change Columns Names
    df.columns = ['CountryName', 'Date', 'HeatIndexC', 'humidity', 
                  'tempC', 'windspeedKmph', 'precipMM', 'DewPointC', 'pressure']
    # Filter Countries
    df_filtered = df.loc[df.CountryName.isin(countries_list)]

    return df_filtered

In [8]:
def _preprocess_covid(covid_df, population_df, window_size=7):
    """
    Fill missing values in the Covid-19 table and create the variables of interest.

    Steps, all performed per ``CountryName``:

    1. ``ConfirmedCases`` and ``ConfirmedDeaths`` are linearly interpolated
       between known values (``limit_area='inside'``); rows that are still
       missing afterwards are dropped.
    2. Every column in ``NPI_COLS`` is forward-filled, and any remaining gap
       is set to 0.
    3. ``population_df`` is left-joined on ``CountryName``.
    4. Daily and smoothed counts, growth ratios and the prediction target
       are derived.

    NOTE: the daily differences assume that rows are ordered chronologically
    within each country and that the index of ``covid_df`` is unique; no
    sorting is performed here.

    Parameters
    ----------
    covid_df : pandas.DataFrame
        Needs ``CountryName``, ``ConfirmedCases``, ``ConfirmedDeaths`` and
        the ``NPI_COLS`` columns. It is not modified.
    population_df : pandas.DataFrame
        Needs ``CountryName`` and ``Population2020``.
    window_size : int, default 7
        Length (in rows) of the trailing window used for the smoothed counts.

    Returns
    -------
    pandas.DataFrame
        The cleaned data plus ``Population2020``, ``NewCases``, ``NewDeaths``,
        ``SmoothNewCases``, ``SmoothNewDeaths``, ``CaseRatio``,
        ``DeathRatio``, ``ProportionInfected`` and ``PredictionRatio``.
    """
    df_new = covid_df.copy()

    # Interpolate inside the known range, then drop rows that are still missing.
    # transform() always returns a result aligned with the original index;
    # groupby.apply() prepends the group keys on pandas >= 2.0, which breaks
    # the index alignment the previous update() call depended on.
    for cumulative_col in ('ConfirmedCases', 'ConfirmedDeaths'):
        df_new[cumulative_col] = df_new.groupby('CountryName')[cumulative_col].transform(
            lambda group: group.interpolate(limit_area='inside'))
        df_new.dropna(subset=[cumulative_col], inplace=True)

    # Policies: carry the last known value forward, 0 before the first one
    df_new[NPI_COLS] = df_new.groupby('CountryName')[NPI_COLS].ffill().fillna(0)

    # Attach the population of each country
    final_df = pd.merge(df_new, population_df, how='left', on='CountryName')

    # Compute number of new cases and deaths each day
    final_df['NewCases'] = final_df.groupby('CountryName').ConfirmedCases.diff().fillna(0)
    final_df['NewDeaths'] = final_df.groupby('CountryName').ConfirmedDeaths.diff().fillna(0)

    # Replace negative values (which do not make sense for these columns) with 0
    final_df['NewCases'] = final_df['NewCases'].clip(lower=0)
    final_df['NewDeaths'] = final_df['NewDeaths'].clip(lower=0)

    # Compute smoothed versions of new cases and deaths each day.
    # reset_index(0, drop=True) removes the group level added by rolling(),
    # so the result aligns with final_df's index again.
    final_df['SmoothNewCases'] = final_df.groupby('CountryName')['NewCases'].rolling(
        window_size, center=False).mean().fillna(0).reset_index(0, drop=True).round()
    final_df['SmoothNewDeaths'] = final_df.groupby('CountryName')['NewDeaths'].rolling(
        window_size, center=False).mean().fillna(0).reset_index(0, drop=True).round()

    # Compute the day-over-day ratio of smoothed new cases and deaths.
    # 0 -> 0 gives NaN and 0 -> positive gives inf; both are mapped to a ratio of 1.
    final_df['CaseRatio'] = final_df.groupby('CountryName').SmoothNewCases.pct_change().fillna(0).replace(np.inf, 0) + 1
    final_df['DeathRatio'] = final_df.groupby('CountryName').SmoothNewDeaths.pct_change().fillna(0).replace(np.inf, 0) + 1

    # Add column for proportion of population infected
    final_df['ProportionInfected'] = final_df['ConfirmedCases'] / final_df['Population2020']

    # Create column of value to predict
    final_df['PredictionRatio'] = final_df['CaseRatio'] / (1 - final_df['ProportionInfected'])

    return final_df

In [10]:
def _preprocess_vaccination(df):
    
    df_new = df.copy()
    # daily vaccination: put 0 for missing values
    df_new.DailyVaccination.fillna(0, inplace=True)
    
    # FullyVaccinated: forward fill (fill with the latest value, since it is cumulative), and 0 for the rest
    df_new.update(df_new.groupby('CountryName')['FullyVaccinated'].ffill().fillna(0))
    
    # Vaccinated: first forward fill, then replace with 0 for the rest.
    df_new.update(df_new.groupby('CountryName')['Vaccinated'].ffill().fillna(0))
    
    return df_new

In [11]:
def load_data(covid_path, vaccination_path, population_path, weather_path, countries_list):
    """
    Build the merged dataset for ``countries_list``.

    Each source file is loaded and filtered, the vaccination and Covid-19
    tables are preprocessed (the latter with a 7-row smoothing window), and
    the vaccination and weather tables are then left-joined onto the
    Covid-19 table on ``CountryName`` and ``Date``. Because the joins are
    left joins, Covid-19 rows without a match keep NaN in the joined columns.
    """
    # Load and filter each source
    covid_raw = _prepare_covid(covid_path, countries_list)
    vaccination_raw = _prepare_vaccination(vaccination_path, countries_list)
    population = _prepare_population(population_path, countries_list)
    weather = _prepare_weather(weather_path, countries_list)

    # Preprocessing
    vaccination_clean = _preprocess_vaccination(vaccination_raw)
    covid_clean = _preprocess_covid(covid_raw, population, window_size=7)

    # Merging datasets: Covid-19 rows are the base, extras are joined in order
    join_keys = ['CountryName', 'Date']
    combined = covid_clean
    for extra_table in (vaccination_clean, weather):
        combined = pd.merge(combined, extra_table, on=join_keys, how='left')

    return combined

### Implementation

In [12]:
# Preview the renamed, country-filtered vaccination table.
_prepare_vaccination(vaccination_path, COUNTRIES)

Unnamed: 0,CountryName,Date,Vaccinated,FullyVaccinated,DailyVaccination
3166,Belgium,2020-12-28,304.0,,
3167,Belgium,2020-12-29,318.0,10.0,14.0
3168,Belgium,2020-12-30,807.0,19.0,252.0
3169,Belgium,2020-12-31,821.0,19.0,172.0
3170,Belgium,2021-01-01,823.0,19.0,130.0
...,...,...,...,...,...
29363,Switzerland,2021-07-07,4509017.0,3430353.0,59998.0
29364,Switzerland,2021-07-08,4522512.0,3487532.0,59820.0
29365,Switzerland,2021-07-09,4536188.0,3543723.0,59379.0
29366,Switzerland,2021-07-10,4542501.0,3579108.0,58597.0


In [10]:
# Preview the renamed, country-filtered weather table.
_prepare_weather(weather_path, COUNTRIES)

Unnamed: 0,CountryName,Date,MaxTempC,MintempC,HeatIndexC,Humidity,TempC,WindSpeedKPH
0,Belgium,2020-01-01,9,3,5,86,9,7
1,Belgium,2020-01-02,7,2,4,87,7,16
2,Belgium,2020-01-03,10,5,8,86,10,25
3,Belgium,2020-01-04,7,3,5,86,7,15
4,Belgium,2020-01-05,6,3,4,89,6,11
...,...,...,...,...,...,...,...,...
552,Switzerland,2021-07-06,27,13,21,74,27,5
553,Switzerland,2021-07-07,20,15,18,89,20,6
554,Switzerland,2021-07-08,15,13,14,97,15,3
555,Switzerland,2021-07-09,22,11,17,77,22,4


In [14]:
# Preview the population table for the selected countries.
_prepare_population(population_path, COUNTRIES)

Unnamed: 0,CountryName,Population2020
0,Belgium,11555997.0
1,Switzerland,8636896.0
2,Germany,83240525.0
3,Denmark,5831404.0
4,Spain,47351567.0
5,France,67391582.0
6,Italy,59554023.0
7,Netherlands,17441139.0
8,Norway,5379475.0
9,Sweden,10353442.0


In [15]:
# Build the merged Covid-19 / vaccination / weather dataset for the selected countries.
final_df = load_data(covid_path, vaccination_path, population_path, weather_path, COUNTRIES)

In [16]:
# Inspect the first rows of the merged dataset.
final_df.head()

Unnamed: 0,CountryName,Date,ConfirmedCases,ConfirmedDeaths,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,...,Vaccinated,FullyVaccinated,DailyVaccination,HeatIndexC,humidity,tempC,windspeedKmph,precipMM,DewPointC,pressure
0,Belgium,2020-01-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,3,81,5,5,0.0,0,1037
1,Belgium,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,3,87,6,12,0.0,1,1031
2,Belgium,2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,4,84,7,3,0.0,1,1024
3,Belgium,2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,4,80,6,7,0.0,0,1021
4,Belgium,2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,4,88,8,16,0.0,3,1016


### Save

In [17]:
# Count the missing values per column before saving.
final_df.isna().sum()

CountryName                                0
Date                                       0
ConfirmedCases                             0
ConfirmedDeaths                            0
C1_School closing                          0
C2_Workplace closing                       0
C3_Cancel public events                    0
C4_Restrictions on gatherings              0
C5_Close public transport                  0
C6_Stay at home requirements               0
C7_Restrictions on internal movement       0
C8_International travel controls           0
E1_Income support                          0
E2_Debt/contract relief                    0
H1_Public information campaigns            0
H2_Testing policy                          0
H3_Contact tracing                         0
H6_Facial Coverings                        0
H7_Vaccination policy                      0
H8_Protection of elderly people            0
Population2020                             0
NewCases                                   0
NewDeaths 

In [18]:
# Save the merged dataset; with to_csv's defaults the index is written as the first column.
final_df.to_csv('../Data/Covid19_Data.csv')