In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pycountry
import pycountry_convert as pc
from scipy import stats

# Loading Data

## Gross Domestic Product

In [2]:
# Income group per country: keep only the name and group columns and expose
# the country name under 'pais', the join key used by the later merges.
df_gdp_group = (
    pd.read_csv('data/gdp_group.csv', delimiter=';')
    .loc[:, ['TableName', 'IncomeGroup']]
    .rename(columns={'TableName': 'pais'})
)

# GDP per capita: a two-column file, relabelled by position.
# NOTE(review): later output in this notebook shows gdp_pc values around 1e9,
# which looks large for a per-capita figure - TODO confirm the number
# formatting (decimal/thousands separators) of this CSV.
df_gdp_pc = pd.read_csv(
    'data/gdp_percapita.csv',
    delimiter=';',
    dtype={'2018': np.float32},
)
df_gdp_pc.columns = ['pais', 'gdp_pc']

# Total GDP: same layout as the per-capita file, relabelled by position.
df_gdp = pd.read_csv(
    'data/gdp_values.csv',
    delimiter=';',
    dtype={'2018': np.float32},
)
df_gdp.columns = ['pais', 'gdp']

# Viral-potential table, loaded as-is; it is joined to the main frame on
# (Country, Day) further down.
df_potential = pd.read_csv('data/viral_potential.csv')

## Flights Percent Loss

In [3]:
# Flight-loss table, read as-is. The preview below shows one row per country,
# week (valor_semana) and weekday (dia_semana), with pct_loss as the loss figure.
df_pct_loss = pd.read_csv('data/flights_pct_loss_week.csv')

In [4]:
# Restrict the study window to weeks 15 through 22, both bounds included.
df_pct_loss = df_pct_loss[df_pct_loss['valor_semana'].between(15, 22)]

In [5]:
# df_pct_loss = df_pct_loss[['pais', 'pct_loss']].groupby(['pais']).mean().reset_index()

In [6]:
# Display the week-filtered flight-loss rows (pandas truncates the long frame).
df_pct_loss

Unnamed: 0,pais,valor_semana,dia_semana,voos_soma,jan_mean,pct_loss,date,continent
69,Algeria,15,0,3.0,130.076923,0.976937,2020-04-11,Africa
70,Algeria,15,1,3.0,130.076923,0.976937,2020-04-11,Africa
71,Algeria,15,2,17.0,130.076923,0.869308,2020-04-11,Africa
72,Algeria,15,3,24.0,130.076923,0.815494,2020-04-11,Africa
73,Algeria,15,4,5.0,130.076923,0.961561,2020-04-11,Africa
...,...,...,...,...,...,...,...,...
8536,Kazakhstan,22,3,12.0,133.192308,0.909905,2020-05-30,Asia
8539,Latvia,15,6,1.0,198.615385,0.994965,2020-04-11,Europe
8541,Morocco,22,0,2.0,110.653846,0.981926,2020-05-30,Africa
8542,Singapore,19,6,6.0,1038.461538,0.994222,2020-05-09,Asia


## Economic Impact

In [7]:
# Economic-impact indicator per country; only the country name (renamed to the
# shared join key 'pais') and the Output-CLI percent impact column are kept.
df_eco_impct = (
    pd.read_csv('data/all_variables_response_economic_countries.csv')
    .loc[:, ['Country', 'Output-CLI percent impact']]
    .rename(columns={'Country': 'pais'})
)

## Population

In [8]:
# The socio-stats file is in long format (one row per country and variable):
# keep the 'Population' rows only and expose them as (pais, pop).
df_pop = (
    pd.read_csv('data/socio_stats_countries.csv')
    .loc[lambda stats_long: stats_long['variable'] == 'Population', ['Name', 'value']]
    .rename(columns={'Name': 'pais', 'value': 'pop'})
)

## Export Goods and Services

In [9]:
# Exports of goods and services: semicolon-delimited, cp1252-encoded file with
# the '2018' column read as float32.
# The two columns are relabelled by position, so the file must contain exactly
# two columns in (country, value) order - TODO confirm against the CSV header.
df_goods_serv = pd.read_csv('data/export_goods_serv.csv', delimiter =';', dtype={'2018': np.float32}, encoding='cp1252')
df_goods_serv.columns = ['pais', 'value_goods_serv']

## Covid Deaths

In [10]:
# Snapshot of total deaths per country on 2020-05-18 (Date is compared as a
# plain string), exposed as (pais, TotalDeaths0518).
df_covid = (
    pd.read_csv('data/total_cases_countries_normalized.csv')
    .loc[lambda cases: cases['Date'] == '2020-05-18', ['Name', 'TotalDeaths']]
    .rename(columns={'Name': 'pais', 'TotalDeaths': 'TotalDeaths0518'})
)


# 1st Occurrence of Covid Deaths

In [11]:
# Full, unfiltered case table (all dates). This reads the same CSV path that
# the df_covid cell reads for its single-date snapshot.
covid = pd.read_csv('data/total_cases_countries_normalized.csv')

In [12]:
# For every country, record the `Day` of the first row (in file order) whose
# TotalDeaths is not 0. The result is df_first_date with columns (pais, 1stDay).
# Note: rows where TotalDeaths is NaN also satisfy `!= 0` and therefore count
# as a match - TODO confirm the source has no missing death counts.
first_date = []

for pais in covid.Name.unique():
    auxdf = covid[covid.Name == pais]
    auxdf = auxdf[auxdf.TotalDeaths != 0]
    # A country with no matching row gets NaN. Without this explicit fallback
    # `fd` would keep the previous country's value (or be undefined on the very
    # first iteration), silently assigning a wrong first day.
    fd = auxdf.iloc[0].Day if not auxdf.empty else np.nan
    first_date.append([pais, fd])
df_first_date = pd.DataFrame(first_date, columns = ['pais', '1stDay' ])

## Eigenvector Centrality

In [13]:
# Eigenvector centrality per country.
# Columns are relabelled by position, so the CSV must contain exactly two
# columns in (country, centrality) order - TODO confirm against the file header.
df_eigenvec = pd.read_csv('data/eigenvector_centrality.csv')
df_eigenvec.columns = ['pais', 'centrality']

## Target Probability

In [14]:
# Mean of 'prob' over all rows sharing the same 'target', returned as a flat
# two-column frame (target, prob).
df_target_prob = (
    pd.read_csv('data/probability_matrix.csv')
    .groupby('target')[['prob']]
    .mean()
    .reset_index()
)

In [15]:
# Align with the other tables: 'target' becomes the join key 'pais' and the
# averaged probability becomes 'prob_target'.
df_target_prob = df_target_prob.rename(columns={'target': 'pais', 'prob': 'prob_target'})

## Coordinates

In [16]:
# Country coordinates; keep name/latitude/longitude and shorten the labels to
# (pais, lat, long).
df_lat_long = (
    pd.read_csv('data/lat-long.csv', delimiter=';')
    .loc[:, ['name', 'latitude', 'longitude']]
    .rename(columns={'name': 'pais', 'latitude': 'lat', 'longitude': 'long'})
)

## Solving country names problem

In [17]:
# Countries whose names are rewritten below: Hong Kong, Macau, Russia,
# Slovakia and South Korea. Taiwan is not remapped by this helper.

def rename_dataworldbank(d):
    """Return a copy of `d` with five long-form country names shortened.

    `d` must have a 'pais' column. Values of that column matching a key of the
    mapping below are replaced by the corresponding short name; every other
    value is left unchanged. In the returned frame 'pais' is the first column
    and the row index is a fresh 0..n-1 range. The input frame is not modified.
    """
    short_names = {
        'Hong Kong SAR, China': 'Hong Kong',
        'Macao SAR, China': 'Macau',
        'Russian Federation': 'Russia',
        'Slovak Republic': 'Slovakia',
        'Korea, Rep.': 'South Korea',
    }
    # Round-trip through the index so the rename targets labels rather than cells.
    return d.set_index('pais').rename(index=short_names).reset_index()

# Normalise country names in the three tables that go through
# rename_dataworldbank (GDP, GDP per capita, exports).
df_gdp, df_gdp_pc, df_goods_serv = map(
    rename_dataworldbank,
    (df_gdp, df_gdp_pc, df_goods_serv),
)


## Merging DataFrames

In [18]:
# Start from the flight-loss rows and attach every country-level table with a
# left join on 'pais', so flight rows are kept even when a table has no entry
# for that country (the attached columns are then NaN).
df = (
    df_pct_loss
    .merge(df_gdp_group, how='left', on='pais')
    .merge(df_gdp, how='left', on='pais')
    .merge(df_gdp_pc, how='left', on='pais')
    .merge(df_eco_impct, how='left', on='pais')
    .merge(df_pop, how='left', on='pais')
    .merge(df_goods_serv, how='left', on='pais')
    .merge(df_covid, how='left', on='pais')
    .merge(df_first_date, how='left', on='pais')
    .merge(df_eigenvec, how='left', on='pais')
    .merge(df_target_prob, how='left', on='pais')
    .merge(df_lat_long, how='left', on='pais')
)

In [19]:
# Lookup from pycountry's country name to its alpha_3 code.
mapping = dict((country.name, country.alpha_3) for country in pycountry.countries)
paises = df.pais
# One code per row of df; a name with no exact match in the lookup yields None.
cod_pais = list(map(mapping.get, paises))

In [20]:
# Attach the per-row country codes; cod_pais was built from df.pais, so it is
# positionally aligned with df's rows.
df['cod_pais'] = cod_pais

In [21]:
# Display the merged frame (flight-loss rows plus the country-level columns).
df

Unnamed: 0,pais,valor_semana,dia_semana,voos_soma,jan_mean,pct_loss,date,continent,IncomeGroup,gdp,...,Output-CLI percent impact,pop,value_goods_serv,TotalDeaths0518,1stDay,centrality,prob_target,lat,long,cod_pais
0,Algeria,15,0,3.0,130.076923,0.976937,2020-04-11,Africa,Upper middle income,1.740000e+11,...,,42228429.0,,548.0,73.0,0.084755,0.004530,28.033886,1.659626,DZA
1,Algeria,15,1,3.0,130.076923,0.976937,2020-04-11,Africa,Upper middle income,1.740000e+11,...,,42228429.0,,548.0,73.0,0.084755,0.004530,28.033886,1.659626,DZA
2,Algeria,15,2,17.0,130.076923,0.869308,2020-04-11,Africa,Upper middle income,1.740000e+11,...,,42228429.0,,548.0,73.0,0.084755,0.004530,28.033886,1.659626,DZA
3,Algeria,15,3,24.0,130.076923,0.815494,2020-04-11,Africa,Upper middle income,1.740000e+11,...,,42228429.0,,548.0,73.0,0.084755,0.004530,28.033886,1.659626,DZA
4,Algeria,15,4,5.0,130.076923,0.961561,2020-04-11,Africa,Upper middle income,1.740000e+11,...,,42228429.0,,548.0,73.0,0.084755,0.004530,28.033886,1.659626,DZA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3589,Kazakhstan,22,3,12.0,133.192308,0.909905,2020-05-30,Asia,Upper middle income,1.790000e+11,...,,18272430.0,6.708301e+10,34.0,90.0,0.111078,0.005259,48.019573,66.923684,KAZ
3590,Latvia,15,6,1.0,198.615385,0.994965,2020-04-11,Europe,High income,3.440923e+10,...,,1927174.0,2.108244e+10,19.0,95.0,0.102359,0.008210,56.879635,24.603189,LVA
3591,Morocco,22,0,2.0,110.653846,0.981926,2020-05-30,Africa,Lower middle income,1.180000e+11,...,,36029138.0,4.312939e+10,192.0,72.0,0.097476,0.004595,31.791702,-7.092620,MAR
3592,Singapore,19,6,6.0,1038.461538,0.994222,2020-05-09,Asia,High income,3.640000e+11,...,,5638676.0,6.631230e+11,22.0,82.0,0.094453,0.012056,1.352083,103.819836,SGP


In [22]:
# Derived columns, added in this order:
#   gdp_pcap / value_goods_serv_pcap - totals divided by population;
#   Day - running day number computed from the week number and weekday.
# 'pais' is renamed to 'Country' so the frame can be joined with df_potential.
# NOTE(review): Day assumes df_potential counts days on the same origin as
# (valor_semana - 1) * 7 + dia_semana - TODO confirm.
df = (
    df
    .assign(
        gdp_pcap=lambda frame: frame['gdp'] / frame['pop'],
        value_goods_serv_pcap=lambda frame: frame['value_goods_serv'] / frame['pop'],
    )
    .rename(columns={'pais': 'Country'})
    .assign(Day=lambda frame: ((frame['valor_semana'] - 1) * 7) + frame['dia_semana'])
)

df.head()

Unnamed: 0,Country,valor_semana,dia_semana,voos_soma,jan_mean,pct_loss,date,continent,IncomeGroup,gdp,...,TotalDeaths0518,1stDay,centrality,prob_target,lat,long,cod_pais,gdp_pcap,value_goods_serv_pcap,Day
0,Algeria,15,0,3.0,130.076923,0.976937,2020-04-11,Africa,Upper middle income,174000000000.0,...,548.0,73.0,0.084755,0.00453,28.033886,1.659626,DZA,4120.446842,,98
1,Algeria,15,1,3.0,130.076923,0.976937,2020-04-11,Africa,Upper middle income,174000000000.0,...,548.0,73.0,0.084755,0.00453,28.033886,1.659626,DZA,4120.446842,,99
2,Algeria,15,2,17.0,130.076923,0.869308,2020-04-11,Africa,Upper middle income,174000000000.0,...,548.0,73.0,0.084755,0.00453,28.033886,1.659626,DZA,4120.446842,,100
3,Algeria,15,3,24.0,130.076923,0.815494,2020-04-11,Africa,Upper middle income,174000000000.0,...,548.0,73.0,0.084755,0.00453,28.033886,1.659626,DZA,4120.446842,,101
4,Algeria,15,4,5.0,130.076923,0.961561,2020-04-11,Africa,Upper middle income,174000000000.0,...,548.0,73.0,0.084755,0.00453,28.033886,1.659626,DZA,4120.446842,,102


In [23]:
# Display the viral-potential table (one row per Country and Day).
df_potential

Unnamed: 0,Country,Day,DailyCases,ActiveCases,centrality,ViralPotential
0,Algeria,0,0.0,0.0,0.071152,0.000000
1,Algeria,1,0.0,0.0,0.071152,0.000000
2,Algeria,2,0.0,0.0,0.071152,0.000000
3,Algeria,3,0.0,0.0,0.071152,0.000000
4,Algeria,4,0.0,0.0,0.071152,0.000000
...,...,...,...,...,...,...
8009,United States,135,20782.0,350837.0,0.168602,0.823113
8010,United States,136,27143.0,348063.0,0.168602,0.816605
8011,United States,137,25508.0,339616.0,0.168602,0.796787
8012,United States,138,24487.0,334815.0,0.168602,0.785523


In [24]:
# Inner join on (Country, Day): flight rows without a matching viral-potential
# row are dropped here. Both inputs carry a 'centrality' column, so pandas
# suffixes them: centrality_x comes from df, centrality_y from df_potential.
df = pd.merge(df, df_potential, how='inner', on=['Country', 'Day'])

df

Unnamed: 0,Country,valor_semana,dia_semana,voos_soma,jan_mean,pct_loss,date,continent,IncomeGroup,gdp,...,lat,long,cod_pais,gdp_pcap,value_goods_serv_pcap,Day,DailyCases,ActiveCases,centrality_y,ViralPotential
0,Algeria,15,0,3.0,130.076923,0.976937,2020-04-11,Africa,Upper middle income,1.740000e+11,...,28.033886,1.659626,DZA,4120.446842,,98,103.0,1193.0,0.071152,0.001181
1,Algeria,15,1,3.0,130.076923,0.976937,2020-04-11,Africa,Upper middle income,1.740000e+11,...,28.033886,1.659626,DZA,4120.446842,,99,45.0,1204.0,0.071152,0.001192
2,Algeria,15,2,17.0,130.076923,0.869308,2020-04-11,Africa,Upper middle income,1.740000e+11,...,28.033886,1.659626,DZA,4120.446842,,100,104.0,1270.0,0.071152,0.001257
3,Algeria,15,3,24.0,130.076923,0.815494,2020-04-11,Africa,Upper middle income,1.740000e+11,...,28.033886,1.659626,DZA,4120.446842,,101,94.0,1299.0,0.071152,0.001286
4,Algeria,15,4,5.0,130.076923,0.961561,2020-04-11,Africa,Upper middle income,1.740000e+11,...,28.033886,1.659626,DZA,4120.446842,,102,95.0,1352.0,0.071152,0.001339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2717,Iceland,15,3,2.0,94.576923,0.978853,2020-04-11,Europe,High income,2.587847e+10,...,64.963051,-19.020835,ISL,73368.114606,34708.795586,101,32.0,846.0,0.089691,0.001056
2718,Indonesia,17,6,1.0,114.423077,0.991261,2020-04-25,Asia,Lower middle income,1.040000e+12,...,-0.789275,113.921327,IDN,3885.476642,791.785416,118,275.0,4641.0,0.065662,0.004241
2719,Indonesia,18,0,1.0,114.423077,0.991261,2020-05-02,Asia,Lower middle income,1.040000e+12,...,-0.789275,113.921327,IDN,3885.476642,791.785416,119,214.0,4539.0,0.065662,0.004147
2720,Latvia,15,6,1.0,198.615385,0.994965,2020-04-11,Europe,High income,3.440923e+10,...,56.879635,24.603189,LVA,17854.760564,10939.559990,104,21.0,275.0,0.111276,0.000426


In [25]:
# Date range that survived the inner join with df_potential.
print(df['date'].min(), df['date'].max())

2020-04-11 2020-05-16


In [26]:
# List the available columns before choosing the aggregation keys.
df.columns

Index(['Country', 'valor_semana', 'dia_semana', 'voos_soma', 'jan_mean',
       'pct_loss', 'date', 'continent', 'IncomeGroup', 'gdp', 'gdp_pc',
       'Output-CLI percent impact', 'pop', 'value_goods_serv',
       'TotalDeaths0518', '1stDay', 'centrality_x', 'prob_target', 'lat',
       'long', 'cod_pais', 'gdp_pcap', 'value_goods_serv_pcap', 'Day',
       'DailyCases', 'ActiveCases', 'centrality_y', 'ViralPotential'],
      dtype='object')

In [27]:
# Collapse the daily rows to one row per (Country, valor_semana), averaging
# pct_loss, DailyCases, ActiveCases and ViralPotential. The country-level
# values (jan_mean, gdp, gdp_pc, pop, value_goods_serv, centrality_y) are put
# in the key list only so that they are carried through to the result.
# Caveat: groupby discards rows whose key contains NaN, so any country missing
# one of those values vanishes from the result - e.g. Algeria, whose
# value_goods_serv is NaN in the merged frame shown above.
# NOTE(review): confirm that dropping such countries is intended.
df = df.groupby(['Country', 'jan_mean', 'gdp', 'gdp_pc', 'pop', 'value_goods_serv', 'centrality_y', 'valor_semana'])[['pct_loss', 'DailyCases', 'ActiveCases', 'ViralPotential']].mean().reset_index()

df

Unnamed: 0,Country,jan_mean,gdp,gdp_pc,pop,value_goods_serv,centrality_y,valor_semana,pct_loss,DailyCases,ActiveCases,ViralPotential
0,Argentina,265.692308,5.200000e+11,1.168395e+09,44494502.0,7.680732e+10,0.050207,15,0.939242,92.714286,1330.428571,0.000929
1,Argentina,265.692308,5.200000e+11,1.168395e+09,44494502.0,7.680732e+10,0.050207,16,0.923112,103.857143,1377.714286,0.000963
2,Argentina,265.692308,5.200000e+11,1.168395e+09,44494502.0,7.680732e+10,0.050207,17,0.916660,119.571429,1494.428571,0.001044
3,Argentina,265.692308,5.200000e+11,1.168395e+09,44494502.0,7.680732e+10,0.050207,18,0.926338,143.285714,1805.142857,0.001261
4,Argentina,265.692308,5.200000e+11,1.168395e+09,44494502.0,7.680732e+10,0.050207,19,0.900529,143.714286,1966.142857,0.001374
...,...,...,...,...,...,...,...,...,...,...,...,...
366,United States,57956.346154,2.050000e+13,6.279459e+09,326687501.0,2.501310e+12,0.168602,16,0.592730,28873.714286,422243.714286,0.990643
367,United States,57956.346154,2.050000e+13,6.279459e+09,326687501.0,2.501310e+12,0.168602,17,0.581759,29460.428571,409404.142857,0.960519
368,United States,57956.346154,2.050000e+13,6.279459e+09,326687501.0,2.501310e+12,0.168602,18,0.534273,27447.285714,400802.571429,0.940339
369,United States,57956.346154,2.050000e+13,6.279459e+09,326687501.0,2.501310e+12,0.168602,19,0.510918,24536.857143,381981.714286,0.896183


In [28]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [29]:
# OLS of the weekly mean flight loss on total GDP, using every country-week
# row of df as one observation.
# NOTE(review): each country contributes several weekly rows, so observations
# are presumably not independent; the default (nonrobust) standard errors may
# be too small - consider clustering by country.
mod = smf.ols(formula='pct_loss ~ gdp', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.226
Method:                 Least Squares   F-statistic:                     109.2
Date:                Sat, 12 Jun 2021   Prob (F-statistic):           1.45e-22
Time:                        16:09:25   Log-Likelihood:                 280.60
No. Observations:                 371   AIC:                            -557.2
Df Residuals:                     369   BIC:                            -549.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.8378      0.006    131.252      0.0

In [30]:
# OLS of the weekly mean flight loss on exports of goods and services.
mod = smf.ols(formula='pct_loss ~ value_goods_serv', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.216
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     101.7
Date:                Sat, 12 Jun 2021   Prob (F-statistic):           2.76e-21
Time:                        16:09:25   Log-Likelihood:                 277.67
No. Observations:                 371   AIC:                            -551.3
Df Residuals:                     369   BIC:                            -543.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.8532      0.007  

In [31]:
# Same specification as the earlier `pct_loss ~ gdp` cell; re-fitting it here
# reproduces that summary and overwrites `mod` / `res` with the GDP-only model.
mod = smf.ols(formula='pct_loss ~ gdp', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.226
Method:                 Least Squares   F-statistic:                     109.2
Date:                Sat, 12 Jun 2021   Prob (F-statistic):           1.45e-22
Time:                        16:09:25   Log-Likelihood:                 280.60
No. Observations:                 371   AIC:                            -557.2
Df Residuals:                     369   BIC:                            -549.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.8378      0.006    131.252      0.0

In [32]:
# OLS of the weekly mean flight loss on the weekly mean ViralPotential.
mod = smf.ols(formula='pct_loss ~ ViralPotential', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.042
Method:                 Least Squares   F-statistic:                     17.09
Date:                Sat, 12 Jun 2021   Prob (F-statistic):           4.41e-05
Time:                        16:09:25   Log-Likelihood:                 240.91
No. Observations:                 371   AIC:                            -477.8
Df Residuals:                     369   BIC:                            -470.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.8204      0.007    119.

In [33]:
# Two-regressor OLS: flight loss on GDP and ViralPotential together.
mod = smf.ols(formula='pct_loss ~ gdp + ViralPotential', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.300
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     78.72
Date:                Sat, 12 Jun 2021   Prob (F-statistic):           3.47e-29
Time:                        16:09:25   Log-Likelihood:                 298.57
No. Observations:                 371   AIC:                            -591.1
Df Residuals:                     368   BIC:                            -579.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.8396      0.006    137.

In [34]:
# Adds one categorical term per country (C(Country)) to the GDP +
# ViralPotential model.
# NOTE(review): gdp was a grouping key alongside Country in the aggregation
# step, so it appears to be constant within each country and therefore
# collinear with the country dummies; its coefficient is likely not separately
# identified in this specification - confirm before interpreting it.
mod = smf.ols(formula='pct_loss ~ gdp + ViralPotential + C(Country)', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.803
Method:                 Least Squares   F-statistic:                     25.66
Date:                Sat, 12 Jun 2021   Prob (F-statistic):           6.88e-90
Time:                        16:09:25   Log-Likelihood:                 566.89
No. Observations:                 371   AIC:                            -1010.
Df Residuals:                     309   BIC:                            -767.0
Df Model:                          61                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [37]:
# Extends the country-dummy model with one categorical term per week
# (C(valor_semana)).
# NOTE(review): as in the country-only specification, gdp appears constant
# within each country and thus collinear with C(Country) - confirm before
# interpreting its coefficient.
mod = smf.ols(formula='pct_loss ~ gdp + ViralPotential + C(Country) + C(valor_semana)', data=df)

res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:               pct_loss   R-squared:                       0.871
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     31.17
Date:                Sun, 13 Jun 2021   Prob (F-statistic):          3.88e-102
Time:                        08:07:01   Log-Likelihood:                 612.76
No. Observations:                 371   AIC:                            -1092.
Df Residuals:                     304   BIC:                            -829.1
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep