In [1]:
import numpy as np
import pandas as pd
import folium
import matplotlib.pyplot as plt

In [2]:
country_geo = 'datasets/all_countries.geo.json'

In [51]:
## Gather all sets
covid_data = pd.read_csv('datasets/COVID-19-worldwide.csv')
pop_dens = pd.read_csv('datasets/population_density.csv')
tourism_data = pd.read_csv('datasets/tourism_data.csv')
#sqmt = pd.read_csv('datasets/squaremeter_per_country.csv')
urban_area = pd.read_csv('datasets/urban_landarea.csv')
urban_pop = pd.read_csv('datasets/urban_pop.csv')
rural_pop = pd.read_csv('datasets/rural_pop.csv')
trust_in_pol = pd.read_csv('datasets/trust_in_politicians.csv')
gov_eff = pd.read_csv('datasets/gov_effectiveness.csv')

## Set correct indices
trust_in_pol.set_index('Country ISO3',inplace=True)
gov_eff.set_index('Country ISO3',inplace=True)
rural_pop.set_index('Country Code',inplace=True)
urban_pop.set_index('Country Code',inplace=True)
tourism_data.set_index('Country Code',inplace=True)
pop_dens.set_index('Country Code',inplace=True)
urban_area.set_index('Country Code',inplace=True)

In [52]:
# Clean datasets
trust_in_pol = trust_in_pol.loc[(trust_in_pol['Indicator'] == 'Public trust in politicians') & (trust_in_pol['Subindicator Type'] == '1-7 Best')]
trust_in_pol = trust_in_pol[['Country Name','2017-2018']]
trust_in_pol['2017-2018'] = trust_in_pol['2017-2018'].round(3);

gov_stats = gov_eff[['Country Name','Indicator','Subindicator Type','2018']]
gov_stats_corruption = gov_stats.loc[(gov_stats['Indicator'] == 'Control of Corruption') & (gov_stats['Subindicator Type'] == 'Estimate')]
gov_stats_effectiveness = gov_stats.loc[(gov_stats['Indicator'] == 'Government Effectiveness') & (gov_stats['Subindicator Type'] == 'Estimate')]
gov_stats_rule = gov_stats.loc[(gov_stats['Indicator'] == 'Rule of Law') & (gov_stats['Subindicator Type'] == 'Estimate')]
gov_stats_reg = gov_stats.loc[(gov_stats['Indicator'] == 'Regulatory Quality') & (gov_stats['Subindicator Type'] == 'Estimate')]
gov_stats_full = gov_stats_corruption.append([gov_stats_effectiveness,gov_stats_rule,gov_stats_reg])
gov_stats_full.sort_index()
gov_stats_full.dropna(inplace=True)

rural_2018 = rural_pop[['Country Name','2018']]
rural_2018 = rural_2018.drop(index=['SSF','WLD','HIC','OED','PST','ECS','IBT','LMY','EUU','MIC',
                                    'IBD','EMU','UMC','LTE','EAS','EAR','TEC','EAP','TEA','ECA',
                                    'LMC','LCN','MEA','ARB','NAC','TLA','CEB','LAC','MNA','TMN',
                                    'SST','OSS','LDC','TSA','SAS','TSS','SSA','FCS','HPC','PRE'])

urban_2018 = urban_pop[['Country Name','2018']]
urban_2018 = urban_2018.drop(index=['SSF','WLD','HIC','OED','PST','ECS','IBT','LMY','EUU','MIC',
                                    'IBD','EMU','UMC','LTE','EAS','EAR','TEC','EAP','TEA','ECA',
                                    'LMC','LCN','MEA','ARB','NAC','TLA','CEB','LAC','MNA','TMN',
                                    'SST','OSS','LDC','TSA','SAS','TSS','SSA','FCS','HPC','PRE'])

pop_dens = pop_dens[['Country Name','2018']]
tourism_data = tourism_data[['Country Name', '2018']]
t_data = tourism_data.drop(index=['WLD','HIC','OED','PST','ECS','IBT','LMY','EUU','MIC','IBD','EMU',
                            'UMC','LTE','EAS','EAR','TEC','EAP','TEA','ECA','LMC','LCN','MEA',
                            'ARB','NAC','TLA','CEB','LAC','MNA','TMN','SST','OSS','LDC','TSA',
                            'SAS'])

In [53]:
t = covid_data.groupby('countryterritoryCode').count()
new_df = covid_data
new_df = new_df.set_index('countryterritoryCode')
new_df['total_deaths'] = 0
new_df['total_cases'] = 0
new_df['deaths_per_100k'] = 0
new_df['cases_per_100k'] = 0
new_df['current_infection_rate'] = 0
new_df['tourists_per_100k'] = 0
new_df['urban_pop_per_100k'] = 0
new_df['rural_percent'] = 0
new_df['trust_in_politicians'] = 0
new_df['rule_of_law'] = 0
new_df['gov_effectiveness'] = 0
new_df['reg_quality'] = 0
new_df['corruption_control'] = 0

new_df = new_df[::-1]

for idx, row in t.iterrows():
    new_df.loc[[idx],['total_deaths']] = new_df.loc[idx]['deaths'].cumsum(axis=0)
    new_df.loc[[idx],['total_cases']] = new_df.loc[idx]['cases'].cumsum(axis=0)
    
    new_df.loc[[idx],['deaths_per_100k']] = (100000 * (new_df.loc[idx]['total_deaths'] / new_df.loc[idx]['popData2018'])).round(3)
    new_df.loc[[idx],['cases_per_100k']] = (100000 * (new_df.loc[idx]['total_cases'] / new_df.loc[idx]['popData2018'])).round(3)
    
    if(idx in urban_2018.index):
        new_df.loc[[idx],['urban_pop_per_100k']] = (100000 * (urban_2018.loc[idx]['2018'] / new_df.loc[idx]['popData2018'])).round(3)
        
    if(idx in rural_2018.index):
        new_df.loc[[idx],['rural_percent']] = rural_2018.loc[idx]['2018']

    if(idx in trust_in_pol.index):
        new_df.loc[[idx],['trust_in_politicians']] = trust_in_pol.loc[idx]['2017-2018']
        
    if(idx in gov_stats_full.index):
        new_df.loc[[idx],['rule_of_law']] = gov_stats_rule.loc[idx]['2018']
        new_df.loc[[idx],['gov_effectiveness']] = gov_stats_effectiveness.loc[idx]['2018']
        new_df.loc[[idx],['reg_quality']] = gov_stats_corruption.loc[idx]['2018']
        new_df.loc[[idx],['corruption_control']] = gov_stats_reg.loc[idx]['2018']
    
    if(idx in tourism_data.index):
        tourists_total = tourism_data.loc[idx]['2018']
        new_df.loc[[idx],['tourists_per_100k']] = 100000*(tourists_total / new_df.loc[idx]['popData2018']).round(3)
        
for i in range(2,len(new_df)):
    current_country = new_df.iloc[i].name
    previous_country = new_df.iloc[i-1].name
    if(current_country == previous_country):
        previous_cases = new_df.iloc[i-1,new_df.columns.get_loc('cases')]
        current_cases = new_df.iloc[i,new_df.columns.get_loc('cases')]
        # Avoiding zero-division
        if(previous_cases > 0 and current_cases > 0):
            new_df.iloc[i,new_df.columns.get_loc('current_infection_rate')] = (current_cases / previous_cases).round(1)
            
new_df = new_df[::-1]

In [57]:
full_set = new_df.drop(columns=['day','month','year','cases','deaths'])

In [54]:
new_df.loc['USA'].head()

Unnamed: 0_level_0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,popData2018,continentExp,...,cases_per_100k,current_infection_rate,tourists_per_100k,urban_pop_per_100k,rural_percent,trust_in_politicians,rule_of_law,gov_effectiveness,reg_quality,corruption_control
countryterritoryCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
USA,4/27/2020,27,4,2020,26857,1687,United_States_of_America,US,327167434.0,America,...,295.234,0.6,24400.0,82135.336,17.744,4.846,1.453255,1.576998,1.323218,1.577987
USA,4/26/2020,26,4,2020,48529,2172,United_States_of_America,US,327167434.0,America,...,287.025,2.3,24400.0,82135.336,17.744,4.846,1.453255,1.576998,1.323218,1.577987
USA,4/25/2020,25,4,2020,21352,1054,United_States_of_America,US,327167434.0,America,...,272.192,0.8,24400.0,82135.336,17.744,4.846,1.453255,1.576998,1.323218,1.577987
USA,4/24/2020,24,4,2020,26543,3179,United_States_of_America,US,327167434.0,America,...,265.666,1.5,24400.0,82135.336,17.744,4.846,1.453255,1.576998,1.323218,1.577987
USA,4/23/2020,23,4,2020,17588,1721,United_States_of_America,US,327167434.0,America,...,257.553,0.5,24400.0,82135.336,17.744,4.846,1.453255,1.576998,1.323218,1.577987


In [None]:
new_df[new_df['dateRep'] == '4/27/2020'].sort_values('urban_pop_per_100k',ascending=False)[:20]

In [None]:
m = folium.Map(location=[48, -102], zoom_start=3)
folium.Choropleth(
    geo_data=country_geo,
    name='choropleth',
    data=present_day_df,
    columns=[present_day_df.index,'deaths_per_100k'],
    key_on='properties.iso_a3',
    fill_color='OrRd',
    fill_opacity=0.5,
    line_opacity=0.2,
    legend_name='pop',
    bins=bins,
    reset=True
).add_to(m)

In [None]:
m

In [None]:
for i in range(1,len(t_data)):
    country_code = t_data.iloc[i].name
    if (country_code in new_df.index):
        tourists_total = t_data.iloc[i,t_data.columns.get_loc('2018')]
        new_df.loc[[country_code],['tourists_per_100k']] = (tourists_total / new_df.loc[country_code]['popData2018']).round(3)

In [58]:
full_set.corr()

Unnamed: 0,popData2018,total_deaths,total_cases,deaths_per_100k,cases_per_100k,current_infection_rate,tourists_per_100k,urban_pop_per_100k,rural_percent,trust_in_politicians,rule_of_law,gov_effectiveness,reg_quality,corruption_control
popData2018,1.0,0.104782,0.170267,-0.030447,-0.061788,0.019784,-0.114633,-0.09703,0.133853,0.147535,-0.065434,0.001975,-0.087481,-0.07939
total_deaths,0.104782,1.0,0.891762,0.366677,0.223253,0.015808,-0.026223,0.071414,-0.060151,0.063405,0.091233,0.10025,0.081478,0.096012
total_cases,0.170267,0.891762,1.0,0.230798,0.181921,0.017661,-0.03594,0.070583,-0.058089,0.093907,0.09296,0.109803,0.083429,0.098508
deaths_per_100k,-0.030447,0.366677,0.230798,1.0,0.754194,0.024582,0.190983,0.130803,-0.144754,-0.073013,0.091124,0.088869,0.087026,0.081062
cases_per_100k,-0.061788,0.223253,0.181921,0.754194,1.0,0.026658,0.328044,0.074272,-0.191261,-0.100139,0.159803,0.15434,0.154869,0.137739
current_infection_rate,0.019784,0.015808,0.017661,0.024582,0.026658,1.0,0.005395,0.024466,-0.00648,0.025047,-0.005297,0.00174,-0.009054,0.002825
tourists_per_100k,-0.114633,-0.026223,-0.03594,0.190983,0.328044,0.005395,1.0,0.251166,-0.197618,-0.15113,0.222266,0.210469,0.188594,0.169157
urban_pop_per_100k,-0.09703,0.071414,0.070583,0.130803,0.074272,0.024466,0.251166,1.0,-0.73447,0.357996,0.418758,0.44511,0.417546,0.416923
rural_percent,0.133853,-0.060151,-0.058089,-0.144754,-0.191261,-0.00648,-0.197618,-0.73447,1.0,-0.179425,-0.473526,-0.504538,-0.471526,-0.474961
trust_in_politicians,0.147535,0.063405,0.093907,-0.073013,-0.100139,0.025047,-0.15113,0.357996,-0.179425,1.0,0.556481,0.589699,0.557741,0.546372
