In [1]:
# Import the Libraries for Google Drive Document Retrieval
from pydrive.auth import GoogleAuth
import pandas as pd
from pydrive.drive import GoogleDrive
import glob

#Libraries to handel and prepare data 
import importlib
import datetime
import holidays
import progressbar
from time import sleep
import pycountry
import math

In [2]:
# A browser window will open. login using the appropriate account.
gauth = GoogleAuth()
gauth.LocalWebserverAuth() #

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=267819126046-3da5rmgml4iemor49ermcv23tnaj02t5.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


In [3]:
drive = GoogleDrive(gauth)

In [4]:
drive

<pydrive.drive.GoogleDrive at 0x7fb2cd428640>

## Holidays auxiliary code

In [5]:
#Evaluates wether a given date fro a given country is a national holiday or not.
def getHoliday(country, date):
    
    res=False
    if hasattr(holidays, country):
        function_string = 'holidays.' + country
        mod_name, func_name = function_string.rsplit('.',1)
        mod = importlib.import_module(mod_name)
        func = getattr(mod, func_name)
        if(isinstance(func, int)==False):
            result = func()
            res = result.get(date) is not None
 
    return res

## Retrieve data from Google Drive

### Read single file 

In [6]:
#Reads a csv file in a certain drive folder, skipping n first nrows. 
#Returns a dataframe, if not found returns empty dataframe
def read_csv_file(folder, fileName, nrows):
    
    listed_mortality = drive.ListFile({'q': "title contains '" + fileName + ".csv' and '" + folder + "' in parents"}).GetList()
    
    if len(listed_mortality)>0:
        title = listed_mortality[0]['title']
        id = listed_mortality[0]['id']
        each_file = drive.CreateFile({'id': id})
        each_file.GetContentFile(title)
        df = pd.read_csv(title, skiprows=nrows)
    else:
        
        df=pd.DataFrame()
    
    return df

### Grouping policies 

Method used to group policies as the average of policies in the selected week. It has been decided to group values on Sundays, since excess mortality data comes out every Sunday.
The rest of the variables are calculated as detailed below:
* Day x Policies = policies and trends of x - 18 days.
* Week x Policies = mean of policies and trends in the selected week.

Every day in the week has a policies related from 18 previous days (average death limit for covid death in patients)

In [7]:
def group_policies(filename):

    df = pd.read_csv(filename)
    df['Holiday']=df.apply(lambda x: getHoliday(x['Code'], x['Day']), axis=1)
    df['Date'] = pd.to_datetime(df['Day']) + pd.to_timedelta(18, unit='d')
    
    #Group to closest Monday
    df = df.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
    #df = df.groupby(['Code', pd.Grouper(key='Date', freq='W-MON', label='left',closed='left')]).mean().reset_index()
        
    return df

Loop through all files contained in folder and aggregate them using "group_policies" auxiliary method as detailed above.

In [9]:
def df_group_policies_trends_others():
    listed = drive.ListFile({'q': "title contains '.csv' and '1XpkEI-8Zfe0IQCzoa9Pas6Y_hpjGVhGE' in parents"}).GetList()
    list_id = []
    list_title = []
    for file in listed:
        list_id.append(file['id'])
        list_title.append(file['title'])
        
    bar = progressbar.ProgressBar(maxval=15, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    
    df = pd.DataFrame()

    i=0

    for id, title in zip(list_id, list_title):
        print(title)
        bar.update(i+1)
        sleep(0.1)
        
        each_file = drive.CreateFile({'id': id})
        each_file.GetContentFile(title)

        if(i==0):
            df = group_policies(title)
        else:
            df_aux = group_policies(title)
            df = pd.merge(df, df_aux, on=["Code", "Date"])
        i=i+1
            
    return df

# Create Dataframe with all Features

## Policies, Mobility Trends and Holidays

The call to this method reads and groups data as described in df_group_policies_trends_others and may take some time

In [10]:
%%time
df_policies=df_group_policies_trends_others()

[====                                                                    ]   6%

covid-contact-tracing.csv




changes-visitors-covid.csv




debt-relief-covid.csv




income-support-covid.csv




covid-19-testing-policy.csv




international-travel-covid.csv




internal-movement-covid.csv




public-transport-covid.csv




public-campaigns-covid.csv




face-covering-policies-covid.csv




stay-at-home-covid.csv




public-gathering-rules-covid.csv




public-events-covid.csv




workplace-closures-covid.csv




school-closures-covid .csv
CPU times: user 6min 37s, sys: 3.06 s, total: 6min 40s
Wall time: 7min 42s


In [11]:
#Delete duplicated Holiday columns and view output
del df_policies['Holiday_x']
del df_policies['Holiday_y']
df_policies['Holiday']=df_policies['Holiday']*7
df_policies

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,restrictions_internal_movements,close_public_transport,public_information_campaigns,facial_coverings,stay_home_requirements,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,Holiday
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,1.0
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,0.0,0.000000,0.285714,0.0,0.0,0.0,0.0,0.0,1.285714,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,0.0,1.000000,2.000000,3.0,2.0,3.0,2.0,2.0,1.000000,0.0
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,0.0,1.000000,2.000000,3.0,2.0,3.0,2.0,2.0,1.000000,0.0
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,0.0,1.000000,2.000000,3.0,2.0,3.0,2.0,2.0,1.000000,0.0
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,0.0,0.571429,2.000000,3.0,2.0,3.0,2.0,2.0,1.000000,0.0


In [12]:
#Lets check for Holiday in Spain related to previous 18 days.
df_policies[(df_policies['Code']=='ESP') & (df_policies['Holiday'] > 0)]

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,restrictions_internal_movements,close_public_transport,public_information_campaigns,facial_coverings,stay_home_requirements,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,Holiday
2189,ESP,2020-05-24,1.0,-84.897857,-46.714286,26.448857,-73.347,-58.612286,-63.204,2.0,...,1.0,1.0,2.0,0.857143,2.0,4.0,2.0,3.0,3.0,1.0
2204,ESP,2020-09-06,1.0,-16.795857,-4.857,3.489571,-35.918429,66.857143,-31.877571,2.0,...,2.0,0.0,2.0,3.0,1.0,4.0,2.0,2.0,2.0,1.0
2212,ESP,2020-11-01,1.0,-24.816286,-3.571571,5.204,-24.102143,6.0,-20.979714,2.0,...,2.0,0.0,2.0,3.285714,1.0,4.0,2.0,2.0,3.0,1.0
2215,ESP,2020-11-22,1.0,-33.673571,-1.632714,8.020286,-27.571429,-7.714286,-19.734714,2.0,...,2.0,0.0,2.0,4.0,2.0,4.0,2.0,2.0,1.0,1.0
2220,ESP,2020-12-27,1.0,-32.979429,-0.265286,9.469429,-29.898,-19.550857,-22.979714,2.0,...,2.0,0.0,2.0,4.0,2.0,4.0,2.0,2.0,1.0,2.0
2223,ESP,2021-01-17,1.0,-28.591857,2.510143,10.469429,-34.102143,-12.571429,-37.040857,2.0,...,2.0,0.0,2.0,4.0,2.0,4.0,2.0,2.0,3.0,1.0
2224,ESP,2021-01-24,1.0,-31.203857,-1.571571,12.693857,-40.061143,-16.612143,-43.591857,2.0,...,2.0,0.0,2.0,4.0,2.0,4.0,2.0,2.0,3.0,2.0


## Weather Data

Collects weather data from all stations in the world and relate them to each country taking average of values. The treatment and creation of this file can be found in the file weather_data which uses Google BigQuery and Google Maps API in order to retrieve all the desired information

In [13]:
df_weather=read_csv_file('1yXs7YOhttKS7STixDmqD1kv1woRYWfUd','weather_final_df',0)
df_weather=df_weather.dropna()
#Cast country code to standard used in our dataset
for country in pycountry.countries: 
    df_weather.loc[df_weather['country']==country.alpha_2, 'Code'] = country.alpha_3
#Apply grouping
df_weather=df_weather[['Code','date', 'temp', 'prcp']]
df_weather['Date'] = pd.to_datetime(df_weather['date']) + pd.to_timedelta(18, unit='d')
df_weather = df_weather.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
df_weather

In [78]:
df_merge=df_policies

In [79]:
#Combine original dataframe with retrieved information
df_merge = pd.merge(df_merge, df_weather, on=["Code", "Date"], how="left")
df_merge.head()

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,public_information_campaigns,facial_coverings,stay_home_requirements,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,Holiday,temp,prcp
0,ABW,2020-03-08,0.0,-0.233333,-3.111,1.911,6.572333,9.755667,-8.889,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,ABW,2020-03-15,0.0,1.051,-3.050714,2.33,8.013571,7.656429,-13.826429,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,ABW,2020-04-05,0.0,-12.347,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,0.285714,0.0,0.0,0.0,0.0,0.0,1.285714,1.0,,


## Doctors and Nurses

Doctors and nurses information per 1000 

In [80]:
doctors_df=read_csv_file('17wrWTb6BZMqWkNrqYipTQXo7IOxqekOf','doctors',0)
nurses_df=read_csv_file('17wrWTb6BZMqWkNrqYipTQXo7IOxqekOf','nurses',0)

doctors_df=doctors_df[['LOCATION', 'TIME', 'Value']]
doctors_df_18=doctors_df[doctors_df['TIME'] == 2018]
nurses_df=nurses_df[['LOCATION', 'TIME', 'Value']]
nurses_df=nurses_df[nurses_df['TIME'] == 2018]

for index, row in doctors_df.iterrows(): 
    df_merge.loc[df_merge['Code']==row.LOCATION , 'doctors_per_1000'] = row.Value
for index, row in nurses_df.iterrows(): 
    df_merge.loc[df_merge['Code']==row.LOCATION , 'nurses_per_1000'] = row.Value
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,stay_home_requirements,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,,,,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,0.0,0.0,0.0,0.0,0.000000,1.0,,,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,,,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,,,,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,0.0,0.0,0.0,0.0,1.285714,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,2.0,3.0,2.0,2.0,1.000000,0.0,23.986111,0.0,,
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,2.0,3.0,2.0,2.0,1.000000,0.0,,,,
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,2.0,3.0,2.0,2.0,1.000000,0.0,,,,
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,2.0,3.0,2.0,2.0,1.000000,0.0,,,,


## Hospital Beds

Hospital beds per 1000 

In [81]:
hospital_beds_df=read_csv_file('1YlfutCzBqYwPhZw-7OsfLC3k1uv1LYfa','hospital_beds', 3)
hospital_beds_df[hospital_beds_df['Country Code']=='ESP']
#Select latest value between 2010 and 2020 
years=hospital_beds_df.columns[54:-1]

latest_values=[]

#Get latest value for hospital beds
for index, row in hospital_beds_df.iterrows():
    
    aux_year=2010
    
    for year in years:
        
        if((math.isnan(row[year])==False)):
            
            aux_year=year
            
    latest_values.append(row[str(aux_year)])

hospital_beds_df['latest_value']=latest_values
hospital_beds_df=hospital_beds_df[['Country Code', 'latest_value']]
hospital_beds_df.columns=['Code', 'beds_per_1000']
hospital_beds_df.head()

Unnamed: 0,Code,beds_per_1000
0,ABW,
1,AFG,0.39
2,AGO,
3,ALB,2.89
4,AND,


In [82]:
df_merge = pd.merge(df_merge, hospital_beds_df, on=["Code"], how="left")
df_merge.head()

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000
0,ABW,2020-03-08,0.0,-0.233333,-3.111,1.911,6.572333,9.755667,-8.889,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
1,ABW,2020-03-15,0.0,1.051,-3.050714,2.33,8.013571,7.656429,-13.826429,0.0,...,0.0,0.0,0.0,0.0,1.0,,,,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,
4,ABW,2020-04-05,0.0,-12.347,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,0.0,0.0,0.0,1.285714,1.0,,,,,


## International Tourism

Collect international toruism data from previous years and select last occurence 

In [83]:
tourism_df=read_csv_file('1L_YnKR2utwlvzE2ykY47xGsZeLwi3RKu','international-tourism-number-of-arrivals', 0)

tourism_df=tourism_df.dropna()
years=tourism_df['Year'].unique()
countries=tourism_df['Code'].unique()

#Iterate through all values and collect the values for the maximum year recorded for the selected country
latest_values=[]

max_year=1995

for country in countries:
    max_year=1995
    for year in years:
        if((year > max_year) & (len(tourism_df[(tourism_df['Year']==year) & (tourism_df['Code']==country)]) > 0)):
            max_year = year

    latest_values.append(tourism_df[(tourism_df['Year']==max_year) & (tourism_df['Code']==country)]['International tourism, number of arrivals'].values[0])

tourism_df_latest=pd.DataFrame()
tourism_df_latest['Code']=countries
tourism_df_latest['number_of_arrivals']=latest_values
tourism_df_latest  

Unnamed: 0,Code,number_of_arrivals
0,ALB,4.070000e+06
1,DZA,2.039000e+06
2,ASM,2.010000e+04
3,AND,2.831000e+06
4,AGO,3.970000e+05
...,...,...
198,VNM,1.001300e+07
199,OWID_WRL,1.244961e+09
200,YEM,3.667000e+05
201,ZMB,9.560000e+05


In [84]:
df_merge = pd.merge(df_merge, tourism_df_latest, on=["Code"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,cancel_public_events,workplace_closures,school_closures,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,0.0,0.0,0.000000,0.0,,,,,,1102000.0
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,0.0,0.0,0.000000,1.0,,,,,,1102000.0
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,0.0,0.000000,0.0,,,,,,1102000.0
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,0.0,0.000000,0.0,,,,,,1102000.0
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,0.0,0.0,1.285714,1.0,,,,,,1102000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,2.0,2.0,1.000000,0.0,23.986111,0.0,,,1.7,2168000.0
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,2.0,2.0,1.000000,0.0,,,,,1.7,2168000.0
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,2.0,2.0,1.000000,0.0,,,,,1.7,2168000.0
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,2.0,2.0,1.000000,0.0,,,,,1.7,2168000.0


## Urbanization

Collect % urbanization population data in each country from previous years and select last occurence  

In [85]:
urbanization_df=read_csv_file('1XrFzNCmJRUYxATxI6gSfwXrk9aXjYP2K','share-of-population-urban', 0)

urbanization_df=urbanization_df.dropna()
years=urbanization_df['Year'].unique()
countries=urbanization_df['Code'].unique()

#Iterate through all values and collect the values for the maximum year recorded for the selected country
latest_values=[]

for country in countries:
    max_year=2010
    for year in years:
        if((year > max_year) & (len(urbanization_df[(urbanization_df['Year']==year) & (urbanization_df['Code']==country)]) > 0)):
            max_year = year

    latest_values.append(urbanization_df[(urbanization_df['Year']==max_year) & (urbanization_df['Code']==country)]['Urban population (% of total)'].values[0])

urbanization_df_latest=pd.DataFrame()
urbanization_df_latest['Code']=countries
urbanization_df_latest['urban_population']=latest_values
urbanization_df_latest

Unnamed: 0,Code,urban_population
0,AFG,25.250000
1,ALB,59.383000
2,DZA,72.052000
3,ASM,87.170000
4,AND,88.150000
...,...,...
210,VNM,35.213000
211,OWID_WRL,54.826518
212,YEM,36.016000
213,ZMB,42.976000


In [86]:
df_merge = pd.merge(df_merge, urbanization_df_latest, on=["Code"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,workplace_closures,school_closures,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,0.0,0.000000,0.0,,,,,,1102000.0,43.293
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,0.0,0.000000,1.0,,,,,,1102000.0,43.293
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,0.000000,0.0,,,,,,1102000.0,43.293
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,0.000000,0.0,,,,,,1102000.0,43.293
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,0.0,1.285714,1.0,,,,,,1102000.0,43.293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,2.0,1.000000,0.0,23.986111,0.0,,,1.7,2168000.0,32.237
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,2.0,1.000000,0.0,,,,,1.7,2168000.0,32.237
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,2.0,1.000000,0.0,,,,,1.7,2168000.0,32.237
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,2.0,1.000000,0.0,,,,,1.7,2168000.0,32.237


## Vaccination

Retrieve vaccination data grouping done by week 

In [87]:
vaccination_df=read_csv_file('1mLYB7CX0BYOkGqRp6QWhDKV_FrXOlDIr','covid-vaccination-doses-per-capita', 0)
vaccination_df['Date'] = pd.to_datetime(vaccination_df['Day']) + pd.to_timedelta(18, unit='d')
vaccination_df=vaccination_df[['Code', 'Date', 'total_vaccinations_per_hundred']]
vaccination_df = vaccination_df.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
vaccination_df.columns=['Code', 'Date', 'total_vaccinations_per_100']
vaccination_df

Unnamed: 0,Code,Date,total_vaccinations_per_100
0,ABW,2021-04-18,24.130000
1,ABW,2021-05-02,41.670000
2,ABW,2021-05-09,52.935714
3,ABW,2021-05-16,65.184000
4,AFG,2021-03-14,0.000000
...,...,...,...
1847,ZWE,2021-04-18,0.518571
1848,ZWE,2021-04-25,0.950000
1849,ZWE,2021-05-02,1.528571
1850,ZWE,2021-05-09,2.107143


In [88]:
df_merge = pd.merge(df_merge, vaccination_df, on=["Code", "Date"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,school_closures,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,0.000000,0.0,,,,,,1102000.0,43.293,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,0.000000,1.0,,,,,,1102000.0,43.293,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.000000,0.0,,,,,,1102000.0,43.293,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.000000,0.0,,,,,,1102000.0,43.293,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,1.285714,1.0,,,,,,1102000.0,43.293,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,1.000000,0.0,23.986111,0.0,,,1.7,2168000.0,32.237,0.518571
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,1.000000,0.0,,,,,1.7,2168000.0,32.237,0.950000
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,1.000000,0.0,,,,,1.7,2168000.0,32.237,1.528571
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,1.000000,0.0,,,,,1.7,2168000.0,32.237,2.107143


## Youth Unemployment

#### Youth unemployment data for all countries, selecting last value occurence

In [89]:
youth_unemployment_df=read_csv_file('1Vv6uPieFDONx3kSA9V6vbgzJSireUJDN','youth_unemployment', 3)

#Select latest value between 2010 and 2020 
years=youth_unemployment_df.columns[54:-1]

#Iterate through all values and collect the values for the maximum year recorded for the selected country
latest_values=[]

for index, row in youth_unemployment_df.iterrows():
    
    aux_year=2010
    
    for year in years:
        
        if((math.isnan(row[year])==False)):
            
            aux_year=year
            
    latest_values.append(row[str(aux_year)])
    
youth_unemployment_df['latest_value']=latest_values
youth_unemployment_df=youth_unemployment_df[['Country Code', 'latest_value']]
youth_unemployment_df.columns=['Code', '%youth_unemployment_total']
youth_unemployment_df

Unnamed: 0,Code,%youth_unemployment_total
0,ABW,
1,AFG,17.219999
2,AGO,16.260000
3,ALB,26.990000
4,AND,
...,...,...
259,XKX,
260,YEM,24.240000
261,ZAF,57.470001
262,ZMB,22.629999


In [90]:
df_merge = pd.merge(df_merge, youth_unemployment_df, on=["Code"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,0.0,,,,,,1102000.0,43.293,,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,1.0,,,,,,1102000.0,43.293,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,0.0,,,,,,1102000.0,43.293,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,0.0,,,,,,1102000.0,43.293,,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,1.0,,,,,,1102000.0,43.293,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,0.0,23.986111,0.0,,,1.7,2168000.0,32.237,0.518571,8.13
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,0.0,,,,,1.7,2168000.0,32.237,0.950000,8.13
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,0.0,,,,,1.7,2168000.0,32.237,1.528571,8.13
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,0.0,,,,,1.7,2168000.0,32.237,2.107143,8.13


## Life Expectancy

In [91]:
life_expectancy_df=read_csv_file('1946XP73K-KBKOYjZMTrR-kObEkr7VZ23','life_expectancy', 3)

#Select latest value between 2010 and 2020 
years=life_expectancy_df.columns[54:-1]

#Iterate through all values and collect the values for the maximum year recorded for the selected country
latest_values=[]

for index, row in life_expectancy_df.iterrows():
    
    aux_year=2010
    
    for year in years:
        
        if((math.isnan(row[year])==False)):
            
            aux_year=year
            
    latest_values.append(row[str(aux_year)])
    
life_expectancy_df['latest_value']=latest_values
life_expectancy_df=life_expectancy_df[['Country Code', 'latest_value']]
life_expectancy_df.columns=['Code', 'life_expectancy']
life_expectancy_df

Unnamed: 0,Code,life_expectancy
0,ABW,76.152000
1,AFG,64.486000
2,AGO,60.782000
3,ALB,78.458000
4,AND,
...,...,...
259,XKX,72.195122
260,YEM,66.096000
261,ZAF,63.857000
262,ZMB,63.510000


In [92]:
df_merge = pd.merge(df_merge, life_expectancy_df, on=["Code"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,,,,,,1102000.0,43.293,,,76.152
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,,,,,,1102000.0,43.293,,,76.152
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,,,,,,1102000.0,43.293,,,76.152
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,,,,,,1102000.0,43.293,,,76.152
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,,,,,,1102000.0,43.293,,,76.152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,23.986111,0.0,,,1.7,2168000.0,32.237,0.518571,8.13,61.195
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,,,,,1.7,2168000.0,32.237,0.950000,8.13,61.195
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,,,,,1.7,2168000.0,32.237,1.528571,8.13,61.195
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,,,,,1.7,2168000.0,32.237,2.107143,8.13,61.195


## %Of Population greater than 65

In [93]:
df_population_gr_65=read_csv_file('1z4gqhcx55nLbhJ9v6__Re4UEglUt7-Dz','ages65_and_older', 3)
years=df_population_gr_65.columns[54:-1]

latest_values=[]

for index, row in df_population_gr_65.iterrows():
    
    aux_year=2010
    
    for year in years:
        
        if((math.isnan(row[year])==False)):
            
            aux_year=year
            
    latest_values.append(row[str(aux_year)])
    
df_population_gr_65['latest_value']=latest_values
df_population_gr_65=df_population_gr_65[['Country Code', 'latest_value']]
df_population_gr_65.columns=['Code', '%df_population_gr_65']
df_population_gr_65
df_merge = pd.merge(df_merge, df_population_gr_65, on=["Code"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,,,,,1102000.0,43.293,,,76.152,14.058355
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,,,,,1102000.0,43.293,,,76.152,14.058355
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,,,,,1102000.0,43.293,,,76.152,14.058355
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,,,,,1102000.0,43.293,,,76.152,14.058355
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,,,,,1102000.0,43.293,,,76.152,14.058355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,0.0,,,1.7,2168000.0,32.237,0.518571,8.13,61.195,2.980608
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,,,,1.7,2168000.0,32.237,0.950000,8.13,61.195,2.980608
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,,,,1.7,2168000.0,32.237,1.528571,8.13,61.195,2.980608
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,,,,1.7,2168000.0,32.237,2.107143,8.13,61.195,2.980608


## Median Age

In [94]:
df_median_age=read_csv_file('1jQ_mcxB7JKIlp5xJ9d4zSmjdJQyAwKp0','median-age',0)
years=df_median_age['Year'].unique()
years=years[1:15]
countries=df_median_age['Code'].unique()
countries = [x for x in countries if str(x) != 'nan']
latest_values=[]

for country in countries:
    max_year=2010
    for year in years:
        if((year > max_year) & (len(df_median_age[(df_median_age['Year']==year) & (df_median_age['Code']==country)]) > 0)):
            max_year = year

    latest_values.append(df_median_age[(df_median_age['Year']==max_year) & (df_median_age['Code']==country)]['UN Population Division (Median Age) (2017)'].values[0])
df_median_age=pd.DataFrame()
df_median_age['Code']=countries
df_median_age['UN Population Division (Median Age) (2017)']=latest_values
df_merge = pd.merge(df_merge, df_median_age, on=["Code"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65,UN Population Division (Median Age) (2017)
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,,,,1102000.0,43.293,,,76.152,14.058355,41.200001
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,,,,1102000.0,43.293,,,76.152,14.058355,41.200001
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,,,,1102000.0,43.293,,,76.152,14.058355,41.200001
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,,,,1102000.0,43.293,,,76.152,14.058355,41.200001
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,,,,1102000.0,43.293,,,76.152,14.058355,41.200001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,,,1.7,2168000.0,32.237,0.518571,8.13,61.195,2.980608,19.600000
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,,,1.7,2168000.0,32.237,0.950000,8.13,61.195,2.980608,19.600000
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,,,1.7,2168000.0,32.237,1.528571,8.13,61.195,2.980608,19.600000
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,,,1.7,2168000.0,32.237,2.107143,8.13,61.195,2.980608,19.600000


## Excess Mortality  (Previous 35 days)

In [95]:
df_excess_mortality_prev=read_csv_file('1tJBHRAMU_xPLljHaotn4rxeYP5JCkb9j','excess-mortality-p-scores',0)
df_excess_mortality_prev['Date'] = pd.to_datetime(df_excess_mortality_prev['Day']) + pd.to_timedelta(35, unit='d')
df_excess_mortality_prev = df_excess_mortality_prev.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
df_excess_mortality_prev.columns=['Code', 'Date', 'Excess mortality P-scores, all ages Prev 35 days']
df_excess_mortality_prev

Unnamed: 0,Code,Date,"Excess mortality P-scores, all ages Prev 35 days"
0,ALB,2020-03-08,-10.65
1,ALB,2020-04-05,2.17
2,ALB,2020-05-10,0.62
3,ALB,2020-06-07,3.23
4,ALB,2020-07-05,6.15
...,...,...,...
3211,UZB,2021-01-10,10.71
3212,UZB,2021-02-07,6.41
3213,UZB,2021-03-07,17.12
3214,UZB,2021-04-04,4.76


In [96]:
df_merge = pd.merge(df_merge, df_excess_mortality_prev, on=["Code", "Date"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65,UN Population Division (Median Age) (2017),"Excess mortality P-scores, all ages Prev 35 days"
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,,,1102000.0,43.293,,,76.152,14.058355,41.200001,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,,,1102000.0,43.293,,,76.152,14.058355,41.200001,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,,,1102000.0,43.293,,,76.152,14.058355,41.200001,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,,,1102000.0,43.293,,,76.152,14.058355,41.200001,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,,,1102000.0,43.293,,,76.152,14.058355,41.200001,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,,1.7,2168000.0,32.237,0.518571,8.13,61.195,2.980608,19.600000,
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,,1.7,2168000.0,32.237,0.950000,8.13,61.195,2.980608,19.600000,
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,,1.7,2168000.0,32.237,1.528571,8.13,61.195,2.980608,19.600000,
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,,1.7,2168000.0,32.237,2.107143,8.13,61.195,2.980608,19.600000,


## Excess Mortality  (Previous 28 days)

In [97]:
df_excess_mortality_prev=read_csv_file('1tJBHRAMU_xPLljHaotn4rxeYP5JCkb9j','excess-mortality-p-scores',0)
df_excess_mortality_prev['Date'] = pd.to_datetime(df_excess_mortality_prev['Day']) + pd.to_timedelta(28, unit='d')
df_excess_mortality_prev = df_excess_mortality_prev.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
df_excess_mortality_prev.columns=['Code', 'Date', 'Excess mortality P-scores, all ages Prev 28 days']
df_excess_mortality_prev

Unnamed: 0,Code,Date,"Excess mortality P-scores, all ages Prev 28 days"
0,ALB,2020-03-01,-10.65
1,ALB,2020-03-29,2.17
2,ALB,2020-05-03,0.62
3,ALB,2020-05-31,3.23
4,ALB,2020-06-28,6.15
...,...,...,...
3211,UZB,2021-01-03,10.71
3212,UZB,2021-01-31,6.41
3213,UZB,2021-02-28,17.12
3214,UZB,2021-03-28,4.76


In [98]:
df_merge = pd.merge(df_merge, df_excess_mortality_prev, on=["Code", "Date"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65,UN Population Division (Median Age) (2017),"Excess mortality P-scores, all ages Prev 35 days","Excess mortality P-scores, all ages Prev 28 days"
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,,1102000.0,43.293,,,76.152,14.058355,41.200001,,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,,1102000.0,43.293,,,76.152,14.058355,41.200001,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,,1102000.0,43.293,,,76.152,14.058355,41.200001,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,,1102000.0,43.293,,,76.152,14.058355,41.200001,,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,,1102000.0,43.293,,,76.152,14.058355,41.200001,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,1.7,2168000.0,32.237,0.518571,8.13,61.195,2.980608,19.600000,,
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,1.7,2168000.0,32.237,0.950000,8.13,61.195,2.980608,19.600000,,
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,1.7,2168000.0,32.237,1.528571,8.13,61.195,2.980608,19.600000,,
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,1.7,2168000.0,32.237,2.107143,8.13,61.195,2.980608,19.600000,,


## Excess Mortality (Previous 7 days)

In [99]:
df_excess_mortality_prev=read_csv_file('1tJBHRAMU_xPLljHaotn4rxeYP5JCkb9j','excess-mortality-p-scores',0)
df_excess_mortality_prev['Date'] = pd.to_datetime(df_excess_mortality_prev['Day']) + pd.to_timedelta(7, unit='d')
df_excess_mortality_prev = df_excess_mortality_prev.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
df_excess_mortality_prev.columns=['Code', 'Date', 'Excess mortality P-scores, all ages Prev 7 days']
df_excess_mortality_prev

Unnamed: 0,Code,Date,"Excess mortality P-scores, all ages Prev 7 days"
0,ALB,2020-02-09,-10.65
1,ALB,2020-03-08,2.17
2,ALB,2020-04-12,0.62
3,ALB,2020-05-10,3.23
4,ALB,2020-06-07,6.15
...,...,...,...
3211,UZB,2020-12-13,10.71
3212,UZB,2021-01-10,6.41
3213,UZB,2021-02-07,17.12
3214,UZB,2021-03-07,4.76


In [100]:
df_merge = pd.merge(df_merge, df_excess_mortality_prev, on=["Code", "Date"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65,UN Population Division (Median Age) (2017),"Excess mortality P-scores, all ages Prev 35 days","Excess mortality P-scores, all ages Prev 28 days","Excess mortality P-scores, all ages Prev 7 days"
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,1102000.0,43.293,,,76.152,14.058355,41.200001,,,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,1102000.0,43.293,,,76.152,14.058355,41.200001,,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,1102000.0,43.293,,,76.152,14.058355,41.200001,,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,1102000.0,43.293,,,76.152,14.058355,41.200001,,,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,1102000.0,43.293,,,76.152,14.058355,41.200001,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,2168000.0,32.237,0.518571,8.13,61.195,2.980608,19.600000,,,
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,2168000.0,32.237,0.950000,8.13,61.195,2.980608,19.600000,,,
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,2168000.0,32.237,1.528571,8.13,61.195,2.980608,19.600000,,,
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,2168000.0,32.237,2.107143,8.13,61.195,2.980608,19.600000,,,


## Excess Mortality

In [102]:
df_excess_mortality=read_csv_file('1tJBHRAMU_xPLljHaotn4rxeYP5JCkb9j','excess-mortality-p-scores',0)
df_excess_mortality['Date'] = pd.to_datetime(df_excess_mortality['Day'])
df_excess_mortality = df_excess_mortality.groupby(['Code', pd.Grouper(key='Date', freq='W-SUN')]).mean().reset_index()
df_excess_mortality

Unnamed: 0,Code,Date,"Excess mortality P-scores, all ages"
0,ALB,2020-02-02,-10.65
1,ALB,2020-03-01,2.17
2,ALB,2020-04-05,0.62
3,ALB,2020-05-03,3.23
4,ALB,2020-05-31,6.15
...,...,...,...
3211,UZB,2020-12-06,10.71
3212,UZB,2021-01-03,6.41
3213,UZB,2021-01-31,17.12
3214,UZB,2021-02-28,4.76


In [103]:
df_merge = pd.merge(df_merge, df_excess_mortality, on=["Code", "Date"], how="left")
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65,UN Population Division (Median Age) (2017),"Excess mortality P-scores, all ages Prev 35 days","Excess mortality P-scores, all ages Prev 28 days","Excess mortality P-scores, all ages Prev 7 days","Excess mortality P-scores, all ages"
0,ABW,2020-03-08,0.0,-0.233333,-3.111000,1.911000,6.572333,9.755667,-8.889000,0.0,...,43.293,,,76.152,14.058355,41.200001,,,,
1,ABW,2020-03-15,0.0,1.051000,-3.050714,2.330000,8.013571,7.656429,-13.826429,0.0,...,43.293,,,76.152,14.058355,41.200001,,,,
2,ABW,2020-03-22,0.0,1.734714,0.755143,1.020286,3.979714,7.836857,-4.469286,0.0,...,43.293,,,76.152,14.058355,41.200001,,,,
3,ABW,2020-03-29,0.0,2.265429,3.775571,0.224571,3.775429,7.877714,2.020143,0.0,...,43.293,,,76.152,14.058355,41.200001,,,,
4,ABW,2020-04-05,0.0,-12.347000,5.428714,4.102143,-6.877714,0.489857,-3.795857,0.0,...,43.293,,,76.152,14.058355,41.200001,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7911,ZWE,2021-04-18,0.0,13.775571,39.591857,-0.469571,15.204286,17.244857,4.857143,0.0,...,32.237,0.518571,8.13,61.195,2.980608,19.600000,,,,
7912,ZWE,2021-04-25,0.0,12.591857,38.347000,0.857429,15.122286,20.796143,-10.183714,0.0,...,32.237,0.950000,8.13,61.195,2.980608,19.600000,,,,
7913,ZWE,2021-05-02,0.0,11.489714,34.816429,1.061286,15.040857,22.489857,-0.857000,0.0,...,32.237,1.528571,8.13,61.195,2.980608,19.600000,,,,
7914,ZWE,2021-05-09,0.0,13.999857,38.510429,0.918429,18.591714,23.530571,6.306143,0.0,...,32.237,2.107143,8.13,61.195,2.980608,19.600000,,,,


## Calculation of Raw Deaths

In [106]:
df_raw_deaths=read_csv_file('1avaGZk0Zlrx786lM6wOdaF_74Ssq68K2','excess-mortality-raw-death-count',0)

df_raw_deaths=df_raw_deaths[['Code', 'Day', 'average_deaths_2015_2019_all_ages']]
df_raw_deaths['month']=pd.to_datetime(df_raw_deaths['Day']).dt.month
df_raw_deaths=df_raw_deaths[['Code', 'month', 'average_deaths_2015_2019_all_ages']]
df_raw_deaths=df_raw_deaths.groupby(['Code','month']).agg('sum').reset_index()
df_raw_deaths['average_deaths_2015_2019_all_ages']=df_raw_deaths['average_deaths_2015_2019_all_ages']/4

df_merge['month']=pd.to_datetime(df_merge['Date']).dt.month
df_merge = pd.merge(df_merge, df_raw_deaths, on=["Code", "month"])
df_merge['deaths_prev_7']=(df_merge['average_deaths_2015_2019_all_ages']/100)*df_merge['Excess mortality P-scores, all ages Prev 7 days']
df_merge['deaths_prev_28']=(df_merge['average_deaths_2015_2019_all_ages']/100)*df_merge['Excess mortality P-scores, all ages Prev 28 days']
df_merge['deaths_prev_35']=(df_merge['average_deaths_2015_2019_all_ages']/100)*df_merge['Excess mortality P-scores, all ages Prev 35 days']
df_merge['deaths']=((df_merge['average_deaths_2015_2019_all_ages']/100)*df_merge['Excess mortality P-scores, all ages'])

df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,"Excess mortality P-scores, all ages Prev 35 days","Excess mortality P-scores, all ages Prev 28 days","Excess mortality P-scores, all ages Prev 7 days","Excess mortality P-scores, all ages",month,average_deaths_2015_2019_all_ages,deaths_prev_7,deaths_prev_28,deaths_prev_35,deaths
0,AUS,2020-03-08,2.0,1.050000,1.722333,-0.739000,4.977667,-2.089000,7.927667,0.0,...,3.32,4.42,4.92,4.67,3,3126.250,153.811500,138.180250,103.791500,145.995875
1,AUS,2020-03-15,2.0,0.761714,1.510143,-1.286000,7.697286,-0.819857,10.567714,0.0,...,4.42,6.18,4.67,4.62,3,3126.250,145.995875,193.202250,138.180250,144.432750
2,AUS,2020-03-22,2.0,1.285714,3.367429,-1.081714,9.346857,0.938857,10.877857,0.0,...,6.18,4.53,4.62,7.28,3,3126.250,144.432750,141.619125,193.202250,227.591000
3,AUS,2020-03-29,2.0,0.714429,8.979571,-0.387571,5.448857,-2.857143,8.326571,0.0,...,4.53,4.92,7.28,12.13,3,3126.250,227.591000,153.811500,141.619125,379.214125
4,AUS,2021-03-07,2.0,-14.061000,-1.653143,5.306143,-37.918286,-16.959143,-10.775571,1.0,...,,,,,3,3126.250,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3926,USA,2021-01-31,1.0,-23.979714,-11.387857,11.449143,-39.571429,-22.591714,-28.999857,1.0,...,44.00,47.46,32.45,28.11,1,74249.625,24094.003313,35238.872025,32669.835000,20871.569588
3927,USA,2021-02-07,1.0,-24.673429,-12.857143,10.795714,-40.183714,-19.183714,-28.816286,1.0,...,47.46,38.43,28.11,25.01,2,57847.700,16260.988470,22230.871110,27454.518420,14467.709770
3928,USA,2021-02-14,1.0,-25.836714,-14.142714,10.734429,-41.285714,-21.000143,-28.816143,1.0,...,38.43,36.07,25.01,15.60,2,57847.700,14467.709770,20865.665390,22230.871110,9024.241200
3929,USA,2021-02-21,1.0,-25.938714,-14.551000,10.714000,-41.714286,-25.775714,-27.938714,1.0,...,36.07,32.45,15.60,12.85,2,57847.700,9024.241200,18771.578650,20865.665390,7433.429450


## Calculation of Accumulated Deaths 

In [109]:
df_population=read_csv_file('14URaFs8YAJmJzXQuHzT7qV6uqGRUr9oW','populations',3)

In [110]:
#Select latest value between 2010 and 2020 
years=df_population.columns[54:-1]
years

Index(['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',
       '2019', '2020'],
      dtype='object')

In [111]:
#Iterate through all values and collect the values for the maximum year recorded for the selected country
latest_values=[]

for index, row in df_population.iterrows():
    
    aux_year=2010
    
    for year in years:
        
        if((math.isnan(row[year])==False)):
            
            aux_year=year
            
    latest_values.append(row[str(aux_year)])

In [112]:
df_population['latest_value']=latest_values
df_population=df_population[['Country Code', 'latest_value']]
df_population.columns=['Code', 'population']
df_population

Unnamed: 0,Code,population
0,ABW,106314.0
1,AFG,38041754.0
2,AGO,31825295.0
3,ALB,2854191.0
4,AND,77142.0
...,...,...
259,XKX,1794248.0
260,YEM,29161922.0
261,ZAF,58558270.0
262,ZMB,17861030.0


In [113]:
df_merge = pd.merge(df_merge, df_population, on=["Code"], how="left")
df_merge=df_merge.sort_values(['Code','Date'], ascending=[True, True])

In [114]:
df_merge=df_merge.sort_values(['Code','Date'], ascending=[True, True])

In [115]:
accumulated=[]
country='AUS'
cumsum=0

for index, row in df_merge.iterrows():
    country_last=row['Code']
    
    if(country!=country_last):
        
        cumsum=0
        country=row['Code']
        if(row['deaths_prev_28']  > 0):
            cumsum=cumsum+row['deaths_prev_28'] 
        
    else:
        
        if(row['deaths_prev_28']  > 0):
            cumsum=cumsum+row['deaths_prev_28'] 
    
    accumulated.append(cumsum)

df_merge['accumulated']=accumulated
df_merge['accumulated']=100*df_merge['accumulated']/df_merge['population']

## Calculation of R0

In [116]:
df_merge['R0_prev28']=df_merge['deaths_prev_28']/df_merge['deaths_prev_35']
df_merge['R0']=df_merge['deaths']/df_merge['deaths_prev_7']

In [117]:
df_merge

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,month,average_deaths_2015_2019_all_ages,deaths_prev_7,deaths_prev_28,deaths_prev_35,deaths,population,accumulated,R0_prev28,R0
0,AUS,2020-03-08,2.0,1.050000,1.722333,-0.739000,4.977667,-2.089000,7.927667,0.0,...,3,3126.25,153.811500,138.180250,103.791500,145.995875,25364307.0,0.000545,1.331325,0.949187
1,AUS,2020-03-15,2.0,0.761714,1.510143,-1.286000,7.697286,-0.819857,10.567714,0.0,...,3,3126.25,145.995875,193.202250,138.180250,144.432750,25364307.0,0.001306,1.398190,0.989293
2,AUS,2020-03-22,2.0,1.285714,3.367429,-1.081714,9.346857,0.938857,10.877857,0.0,...,3,3126.25,144.432750,141.619125,193.202250,227.591000,25364307.0,0.001865,0.733010,1.575758
3,AUS,2020-03-29,2.0,0.714429,8.979571,-0.387571,5.448857,-2.857143,8.326571,0.0,...,3,3126.25,227.591000,153.811500,141.619125,379.214125,25364307.0,0.002471,1.086093,1.666209
8,AUS,2020-04-05,2.0,-0.898000,15.163286,0.448857,-1.061286,-4.959143,6.959000,0.0,...,4,2564.75,311.104175,119.773825,126.185700,290.073225,25364307.0,0.002943,0.949187,0.932399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3882,USA,2021-04-11,1.0,-8.816429,-4.244714,7.040714,-27.183571,4.265286,-25.775714,1.0,...,4,53949.95,,,,,328239523.0,0.241952,,
3883,USA,2021-04-18,1.0,-8.469429,-4.000143,6.632286,-26.163143,12.040857,-24.877429,1.0,...,4,53949.95,,,,,328239523.0,0.241952,,
3884,USA,2021-04-25,1.0,-8.020429,-0.142857,6.449143,-25.469286,17.755000,-27.979714,1.0,...,4,53949.95,,,,,328239523.0,0.241952,,
3890,USA,2021-05-02,1.0,-7.979571,-1.816286,5.816143,-24.571429,24.347000,-26.428571,1.0,...,5,64689.30,,,,,328239523.0,0.241952,,


In [119]:
df_final=df_merge.dropna(subset=['R0'])

In [120]:
df_final

Unnamed: 0,Code,Date,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,...,month,average_deaths_2015_2019_all_ages,deaths_prev_7,deaths_prev_28,deaths_prev_35,deaths,population,accumulated,R0_prev28,R0
0,AUS,2020-03-08,2.0,1.050000,1.722333,-0.739000,4.977667,-2.089000,7.927667,0.0,...,3,3126.250,153.811500,138.180250,103.791500,145.995875,25364307.0,0.000545,1.331325,0.949187
1,AUS,2020-03-15,2.0,0.761714,1.510143,-1.286000,7.697286,-0.819857,10.567714,0.0,...,3,3126.250,145.995875,193.202250,138.180250,144.432750,25364307.0,0.001306,1.398190,0.989293
2,AUS,2020-03-22,2.0,1.285714,3.367429,-1.081714,9.346857,0.938857,10.877857,0.0,...,3,3126.250,144.432750,141.619125,193.202250,227.591000,25364307.0,0.001865,0.733010,1.575758
3,AUS,2020-03-29,2.0,0.714429,8.979571,-0.387571,5.448857,-2.857143,8.326571,0.0,...,3,3126.250,227.591000,153.811500,141.619125,379.214125,25364307.0,0.002471,1.086093,1.666209
8,AUS,2020-04-05,2.0,-0.898000,15.163286,0.448857,-1.061286,-4.959143,6.959000,0.0,...,4,2564.750,311.104175,119.773825,126.185700,290.073225,25364307.0,0.002943,0.949187,0.932399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3926,USA,2021-01-31,1.0,-23.979714,-11.387857,11.449143,-39.571429,-22.591714,-28.999857,1.0,...,1,74249.625,24094.003313,35238.872025,32669.835000,20871.569588,328239523.0,0.204830,1.078636,0.866256
3927,USA,2021-02-07,1.0,-24.673429,-12.857143,10.795714,-40.183714,-19.183714,-28.816286,1.0,...,2,57847.700,16260.988470,22230.871110,27454.518420,14467.709770,328239523.0,0.211602,0.809735,0.889719
3928,USA,2021-02-14,1.0,-25.836714,-14.142714,10.734429,-41.285714,-21.000143,-28.816143,1.0,...,2,57847.700,14467.709770,20865.665390,22230.871110,9024.241200,328239523.0,0.217959,0.938590,0.623750
3929,USA,2021-02-21,1.0,-25.938714,-14.551000,10.714000,-41.714286,-25.775714,-27.938714,1.0,...,2,57847.700,9024.241200,18771.578650,20865.665390,7433.429450,328239523.0,0.223678,0.899640,0.823718


In [121]:
df_final.to_csv('/Users/lucas_botella/Downloads/df_covid_processed_FINAL.csv')

## Correlation Matrix

In [122]:
import numpy as np

In [123]:
rs = np.random.RandomState(0)
df = pd.DataFrame(rs.rand(10, 10))
corr = df_final.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,contact_tracing,retail_and_recreation,grocery_and_pharmacy,residential,transit_stations,parks,workplaces,debt_relief,income_support,testing_policy,international_travel_controls,restrictions_internal_movements,close_public_transport,public_information_campaigns,facial_coverings,stay_home_requirements,restriction_gatherings,cancel_public_events,workplace_closures,school_closures,Holiday,temp,prcp,doctors_per_1000,nurses_per_1000,beds_per_1000,number_of_arrivals,urban_population,total_vaccinations_per_100,%youth_unemployment_total,life_expectancy,%df_population_gr_65,UN Population Division (Median Age) (2017),"Excess mortality P-scores, all ages Prev 35 days","Excess mortality P-scores, all ages Prev 28 days","Excess mortality P-scores, all ages Prev 7 days","Excess mortality P-scores, all ages",month,average_deaths_2015_2019_all_ages,deaths_prev_7,deaths_prev_28,deaths_prev_35,deaths,population,accumulated,R0_prev28,R0
contact_tracing,1.0,0.062096,0.105148,-0.110533,0.100956,0.086961,0.077936,0.146533,0.248446,0.339724,0.181435,-0.030047,-0.127878,0.206137,0.253915,0.006394,0.041454,-0.011825,-0.024113,-0.059289,-0.002876,0.088763,0.148911,0.151211,0.131965,0.283937,-0.059549,0.135217,0.114785,0.013968,0.225462,0.120841,0.197888,-0.068194,-0.066505,-0.06124,-0.068745,0.186392,-0.123004,-0.12522,-0.121706,-0.120513,-0.129454,-0.129962,-0.016825,-0.043816,-0.006173
retail_and_recreation,0.062096,1.0,0.784111,-0.9091,0.865888,0.602722,0.781909,-0.223859,-0.198187,0.023436,-0.261586,-0.580037,-0.467518,-0.187015,-0.190388,-0.668262,-0.583042,-0.592644,-0.672887,-0.624317,-0.123584,0.218731,0.044619,0.036924,0.137512,0.113178,-0.07531,0.034253,0.02491,-0.073684,0.009558,0.156683,0.13218,-0.37899,-0.388812,-0.277035,-0.186889,0.265566,-0.011703,-0.105951,-0.158019,-0.155366,-0.065283,-0.017877,-0.265267,-0.000418,-0.0023
grocery_and_pharmacy,0.105148,0.784111,1.0,-0.833892,0.777924,0.526959,0.719299,-0.226951,-0.099857,0.056276,-0.240647,-0.445437,-0.479808,-0.19717,-0.089044,-0.499089,-0.37355,-0.428222,-0.50155,-0.482388,-0.081769,0.060384,-0.008003,0.132791,0.104301,0.268587,-0.024385,0.08809,0.135831,-0.03493,0.024032,0.250234,0.2311,-0.315986,-0.336538,-0.239225,-0.156538,0.148741,-0.008711,-0.083003,-0.135338,-0.126985,-0.043988,-0.023825,-0.038537,-0.010815,-0.011044
residential,-0.110533,-0.9091,-0.833892,1.0,-0.899786,-0.584562,-0.830042,0.233779,0.141997,0.007483,0.28796,0.598998,0.547696,0.224071,0.139005,0.674206,0.565873,0.581525,0.67178,0.636689,0.161993,-0.108429,-0.054979,-0.152086,-0.102891,-0.301737,0.03162,0.073281,-0.034051,0.033666,0.022221,-0.314315,-0.311163,0.404562,0.421503,0.320903,0.227643,-0.237819,0.043654,0.155875,0.201674,0.194835,0.115191,0.072009,0.192972,0.002409,-0.002634
transit_stations,0.100956,0.865888,0.777924,-0.899786,1.0,0.467812,0.820027,-0.270159,-0.290349,-0.049048,-0.406856,-0.589797,-0.506468,-0.288222,-0.149158,-0.634906,-0.62048,-0.599725,-0.683792,-0.618492,-0.171247,0.184734,0.058799,0.036212,0.019754,0.293693,-0.079133,-0.037993,0.047683,-0.068561,-0.0504,0.13573,0.163319,-0.333214,-0.343966,-0.224593,-0.123922,0.186611,-0.047617,-0.141384,-0.195861,-0.191585,-0.095609,-0.059438,-0.183148,0.002465,0.000671
parks,0.086961,0.602722,0.526959,-0.584562,0.467812,1.0,0.18132,-0.046067,0.194289,0.113921,-0.069172,-0.341005,-0.25965,-0.012853,-0.15016,-0.394394,-0.227957,-0.305202,-0.274227,-0.276573,0.039914,0.207163,-0.041424,0.179647,0.167456,0.154604,-0.043752,0.028305,0.131056,-0.002524,0.096512,0.364254,0.318174,-0.306078,-0.309686,-0.29241,-0.266985,0.248132,-0.070377,-0.170004,-0.182187,-0.180341,-0.157976,-0.097691,-0.259471,0.014038,0.017776
workplaces,0.077936,0.781909,0.719299,-0.830042,0.820027,0.18132,1.0,-0.305912,-0.329792,-0.047102,-0.281775,-0.492661,-0.449524,-0.259785,-0.138568,-0.542178,-0.574354,-0.541738,-0.657679,-0.611849,-0.292314,0.083647,0.065203,-0.007966,0.049898,0.190109,-0.086594,-0.000859,0.104612,-0.064556,-0.007913,0.059316,0.088923,-0.334519,-0.350881,-0.231673,-0.134379,0.170918,-0.050011,-0.134877,-0.189132,-0.181636,-0.089382,-0.047161,-0.141517,-0.003864,0.01195
debt_relief,0.146533,-0.223859,-0.226951,0.233779,-0.270159,-0.046067,-0.305912,1.0,0.216495,0.263928,0.172584,0.172185,0.251471,0.245705,0.309129,0.276329,0.269534,0.202586,0.262695,0.308999,-0.038333,0.242439,-0.020112,-0.108406,-0.480717,-0.094983,-0.085695,-0.153374,-0.052099,0.165106,-0.165926,-0.126235,-0.091041,0.158917,0.164822,0.123018,0.086513,0.202193,-0.152015,-0.046053,-0.016824,-0.01768,-0.068235,-0.139103,0.198861,-0.01472,-0.001372
income_support,0.248446,-0.198187,-0.099857,0.141997,-0.290349,0.194289,-0.329792,0.216495,1.0,0.30239,0.314824,0.181952,0.07205,0.33486,0.2335,0.198557,0.453083,0.328565,0.272421,0.151873,0.122219,-0.123699,0.011428,0.219023,0.294247,0.057165,0.102282,0.210546,0.165566,0.029597,0.27323,0.248124,0.197572,0.035397,0.034691,-0.030982,-0.071687,0.148635,0.031475,-0.005897,0.03288,0.03314,-0.035943,-0.018175,0.152641,0.014924,0.025086
testing_policy,0.339724,0.023436,0.056276,0.007483,-0.049048,0.113921,-0.047102,0.263928,0.30239,1.0,0.237042,0.047556,0.03082,0.333927,0.397765,0.142817,0.345977,0.189578,0.163903,0.070358,-0.033154,0.1089,0.071646,0.005223,0.103513,0.092484,0.090159,0.070912,0.215157,0.058891,0.259678,0.070454,0.105074,0.022573,0.014577,-0.01802,-0.046689,0.233811,0.086045,0.021883,0.031069,0.032398,0.013923,0.093954,0.138939,-0.064377,-0.045589


## Countries includeed in the analysis

In [403]:
import pycountry
from matplotlib import pyplot as plt

In [404]:
country_list=df_final.Code.unique()
country_list.size

40

In [405]:
def country_name(country):
    return pycountry.countries.get(alpha_3=country).name

In [406]:
def country_code_2_lower(country):
    return pycountry.countries.get(alpha_3=country).alpha_2.lower()

In [407]:
country_name('AUT')

'Austria'

In [408]:
country_code_2_lower('AUT')

'at'

In [409]:
country_names_list=[country_name(country) for country in country_list]
country_names_list

['Australia',
 'Austria',
 'Belgium',
 'Bulgaria',
 'Canada',
 'Switzerland',
 'Chile',
 'Colombia',
 'Czechia',
 'Germany',
 'Denmark',
 'Ecuador',
 'Spain',
 'Estonia',
 'Finland',
 'France',
 'United Kingdom',
 'Greece',
 'Guatemala',
 'Croatia',
 'Hungary',
 'Israel',
 'Italy',
 'Korea, Republic of',
 'Lithuania',
 'Luxembourg',
 'Latvia',
 'Mexico',
 'Malta',
 'Netherlands',
 'Norway',
 'New Zealand',
 'Poland',
 'Portugal',
 'Romania',
 'Slovakia',
 'Slovenia',
 'Sweden',
 'Taiwan, Province of China',
 'United States']

In [410]:
country_names_list_code2_lower=[country_code_2_lower(country) for country in country_list]
country_names_list_code2_lower

['au',
 'at',
 'be',
 'bg',
 'ca',
 'ch',
 'cl',
 'co',
 'cz',
 'de',
 'dk',
 'ec',
 'es',
 'ee',
 'fi',
 'fr',
 'gb',
 'gr',
 'gt',
 'hr',
 'hu',
 'il',
 'it',
 'kr',
 'lt',
 'lu',
 'lv',
 'mx',
 'mt',
 'nl',
 'no',
 'nz',
 'pl',
 'pt',
 'ro',
 'sk',
 'si',
 'se',
 'tw',
 'us']

In [411]:
pip install geopandas

Note: you may need to restart the kernel to use updated packages.


In [412]:
from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim
   
# declare an empty list to store
# latitude and longitude of values 
# of city column
longitude = []
latitude = []
   
# function to find the coordinate
# of a given city 
def findGeocode(city):
       
    # try and catch is used to overcome
    # the exception thrown by geolocator
    # using geocodertimedout  
    try:
          
        # Specify the user_agent as your
        # app name it should not be none
        geolocator = Nominatim(user_agent="your_app_name")
          
        return geolocator.geocode(city)
      
    except GeocoderTimedOut:
          
        return findGeocode(city)    

In [413]:
findGeocode('Spain')

Location(España, (39.3260685, -4.8379791, 0.0))

In [414]:
df_plot_countries= pd.DataFrame()
df_plot_countries['Country']=country_names_list
df_plot_countries.head()

Unnamed: 0,Country
0,Australia
1,Austria
2,Belgium
3,Bulgaria
4,Canada


In [416]:
# each value from city column
# will be fetched and sent to
# function find_geocode  
latitude=[]
longitude=[]
for i in (df_plot_countries["Country"]):
      
    if findGeocode(i) != None:
           
        loc = findGeocode(i)
          
        # coordinates returned from 
        # function is stored into
        # two separate list
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
       
    # if coordinate for a city not
    # found, insert "NaN" indicating 
    # missing value 
    else:
        latitude.append(np.nan)
        longitude.append(np.nan)

GeocoderUnavailable: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /search?q=Taiwan%2C+Province+of+China&format=json&limit=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))

In [None]:
df_plot_countries['latitude']=latitude
df_plot_countries['longitude']=longitude
df_plot_countries

In [None]:
pip install folium

In [None]:
# Create a world map to show distributions of users 
import folium
from folium.plugins import MarkerCluster
#empty map
world_map= folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)
#for each coordinate, create circlemarker of user percent
for i in range(len(df_plot_countries)):
        lat = df_plot_countries.iloc[i]['latitude']
        long = df_plot_countries.iloc[i]['longitude']
        radius=5
        popup_text = """Country : {}<br>"""
        popup_text = popup_text.format(df_plot_countries.iloc[i]['Country']
                                   )
        folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster)
#show the map
world_map

In [None]:
pip install pygal_maps_world

In [None]:
!pip3 install cairosvg

In [None]:
import pygal
from pygal.maps.world import World
worldmap_chart = World()

worldmap_chart.title = 'Countries included in the analysis'

for i in range(len(country_names_list)):
    worldmap_chart.add(country_names_list[i], country_names_list_code2_lower[i])
    
worldmap_chart.render_to_file('mymap.html')

In [None]:
from IPython.core.display import display, HTML

In [None]:
display(HTML(worldmap_chart.render(is_unicode=True)))