# Covid19 and Population Data and GDP Data Joined

In [1]:
import pandas as pd
import numpy as np

print('pandas version: ', pd.__version__)
print('numpy version: ', np.version.version)

pandas version:  1.0.3
numpy version:  1.18.1


In [2]:
# prepare the covid dataframe from covid data of Johns Hopkins University

df_covid = pd.read_csv('data/2020-04-11.csv')

# construct a dataframe with six columns: 
# Country_Region (index), Confirmed, Deaths, Recovered, Active
df_covid = df_covid.groupby('Country_Region').agg({'Confirmed': np.sum, 'Deaths': np.sum, 'Recovered': np.sum, 'Active': np.sum})

df_covid['fatality_rate'] = df_covid.Deaths / df_covid.Confirmed

df_covid

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,555,18,32,505,0.032432
Albania,433,23,197,213,0.053118
Algeria,1825,275,460,1090,0.150685
Andorra,601,26,71,504,0.043261
Angola,19,2,4,13,0.105263
...,...,...,...,...,...
West Bank and Gaza,268,2,57,209,0.007463
Western Sahara,4,0,0,4,0.000000
Yemen,1,0,0,1,0.000000
Zambia,40,2,28,10,0.050000


In [3]:
# prepare the population dataframe from population data of united nations.
# The population columns are all in thousand persons. 
population = pd.read_csv('data/WPP2019_TotalPopulationBySex.csv')
pop2019 = population[population.Time == 2019]

pop2019.Location.replace(
    {'Bolivia (Plurinational State of)':'Bolivia',
    'Brunei Darussalam':'Brunei', 
    'Myanmar': 'Burma', 
    'Congo':'Congo (Brazzaville)', 
    'Democratic Republic of the Congo':'Congo (Kinshasa)', 
    'Côte d\'Ivoire':'Cote d\'Ivoire', 
    'Iran (Islamic Republic of)':'Iran', 
    'Republic of Korea':'Korea, South', 
    'Lao People\'s Democratic Republic':'Laos', 
    'Republic of Moldova':'Moldova', 
    'Russian Federation':'Russia', 
    'Syrian Arab Republic':'Syria', 
    'China, Taiwan Province of China':'Taiwan*', 
    'United Republic of Tanzania':'Tanzania', 
    'United States of America':'US', 
    'Venezuela (Bolivarian Republic of)':'Venezuela', 
    'Viet Nam':'Vietnam', 
    'State of Palestine':'West Bank and Gaza'}, inplace=True)

df_pop2019 = pop2019.drop(columns=['LocID','VarID','Variant','Time','MidPeriod'])
df_pop2019

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


Unnamed: 0,Location,PopMale,PopFemale,PopTotal,PopDensity
69,Afghanistan,19529.727,18512.030,38041.757,58.269
953,Africa,653513.680,654550.496,1308064.176,44.119
1837,African Group,652644.714,653675.858,1306320.572,44.464
1988,African Union,652949.469,653953.561,1306903.030,44.085
2139,African Union: Central Africa,76945.498,77068.207,154013.705,29.192
...,...,...,...,...,...
277314,World,3889034.611,3824433.594,7713468.205,59.291
278198,World Bank Regional Groups (developing only),3261387.881,3191129.174,6452517.055,70.258
278349,Yemen,14692.284,14469.638,29161.922,55.234
279233,Zambia,8843.214,9017.820,17861.034,24.026


In [4]:
# prepare the gdp dataframe from gdp data of world bank.
gdp = pd.read_csv('data/gdp_world_bank/gdp_world_bank.csv', skiprows=4)
gdp.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2549721000.0,2534637000.0,2581564000.0,2649721000.0,2691620000.0,2646927000.0,2700559000.0,,,
1,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,...,17804280000.0,20001620000.0,20561050000.0,20484870000.0,19907110000.0,19362640000.0,20191760000.0,19362970000.0,,
2,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,111789700000.0,128052900000.0,136709900000.0,145712200000.0,116193600000.0,101123900000.0,122123800000.0,105751000000.0,,
3,Albania,ALB,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,12890870000.0,12319780000.0,12776280000.0,13228240000.0,11386930000.0,11861350000.0,13025060000.0,15102500000.0,,
4,Andorra,AND,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,3442063000.0,3164615000.0,3281585000.0,3350736000.0,2811489000.0,2877312000.0,3013387000.0,3236544000.0,,


In [5]:
df_gdp = gdp[['Country Name', '2018']]
df_gdp

Unnamed: 0,Country Name,2018
0,Aruba,
1,Afghanistan,1.936297e+10
2,Angola,1.057510e+11
3,Albania,1.510250e+10
4,Andorra,3.236544e+09
...,...,...
259,Kosovo,7.938991e+09
260,"Yemen, Rep.",2.691440e+10
261,South Africa,3.682889e+11
262,Zambia,2.672007e+10


In [6]:
df_gdp.rename(columns={"Country Name": "Location"}, errors="raise", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [7]:
countries_covid19_only = df_covid[~df_covid.index.isin(df_gdp.Location)]
countries_covid19_only.index

Index(['Bahamas', 'Brunei', 'Burma', 'Congo (Brazzaville)', 'Congo (Kinshasa)',
       'Czechia', 'Diamond Princess', 'Egypt', 'Gambia', 'Holy See', 'Iran',
       'Korea, South', 'Kyrgyzstan', 'Laos', 'MS Zaandam', 'Russia',
       'Saint Kitts and Nevis', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'Slovakia', 'Syria', 'Taiwan*',
       'US', 'Venezuela', 'Western Sahara', 'Yemen'],
      dtype='object', name='Country_Region')

In [8]:
len(countries_covid19_only)

26

In [9]:
df_gdp[df_gdp.Location.str.contains('Yemen')]

Unnamed: 0,Location,2018
260,"Yemen, Rep.",26914400000.0


In [10]:
df_covid[df_covid.index.str.contains('Macao')]

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [11]:
df_gdp.Location.replace(
    {'Bahamas, The':'Bahamas', 
    'Brunei Darussalam':'Brunei', 
    'Myanmar': 'Burma', 
    'Congo, Rep.':'Congo (Brazzaville)', 
    'Congo, Dem. Rep.':'Congo (Kinshasa)', 
    'Czech Republic':'Czechia', 
    'Egypt, Arab Rep.':'Egypt',
    'Gambia, The':'Gambia',
    'Iran, Islamic Rep.':'Iran',
    'Korea, Rep.':'Korea, Rep.',
    'Kyrgyz Republic':'Kyrgyzstan',
    'Lao PDR':'Laos',
    'Russian Federation':'Russia',
    'St. Kitts and Nevis':'Saint Kitts and Nevis',
    'St. Lucia':'Saint Lucia',
    'St. Vincent and the Grenadines':'Saint Vincent and the Grenadines',
    'Slovak Republic':'Slovakia',
    'Syrian Arab Republic':'Syria',
    'United States':'US',
    'Venezuela, RB':'Venezuela',
    'Yemen, Rep.':'Yemen'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [12]:
# now join the covid and population data frames

df_covid19_pop2019 = pd.merge(df_covid, df_pop2019, how='inner', left_on='Country_Region', right_on='Location')
df_covid19_pop2019.head()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,fatality_rate,Location,PopMale,PopFemale,PopTotal,PopDensity
0,555,18,32,505,0.032432,Afghanistan,19529.727,18512.03,38041.757,58.269
1,433,23,197,213,0.053118,Albania,1466.785,1414.128,2880.913,105.143
2,1825,275,460,1090,0.150685,Algeria,21749.666,21303.388,43053.054,18.076
3,601,26,71,504,0.043261,Andorra,,,77.146,164.14
4,19,2,4,13,0.105263,Angola,15744.779,16080.52,31825.299,25.528


In [13]:
df_covid19_pop2019.shape

(182, 10)

In [14]:
# locations that are in the covid19 dataset but not in population or gdp datasets:
# Diamond Princess, Ms Zaandam, Kosovo, Taiwan, Holy See.

df_overview = pd.merge(df_covid19_pop2019, df_gdp, how='inner', left_on='Location', right_on='Location')
df_overview.head()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,fatality_rate,Location,PopMale,PopFemale,PopTotal,PopDensity,2018
0,555,18,32,505,0.032432,Afghanistan,19529.727,18512.03,38041.757,58.269,19362970000.0
1,433,23,197,213,0.053118,Albania,1466.785,1414.128,2880.913,105.143,15102500000.0
2,1825,275,460,1090,0.150685,Algeria,21749.666,21303.388,43053.054,18.076,173758000000.0
3,601,26,71,504,0.043261,Andorra,,,77.146,164.14,3236544000.0
4,19,2,4,13,0.105263,Angola,15744.779,16080.52,31825.299,25.528,105751000000.0


In [15]:
df_overview.shape

(178, 11)

In [16]:
df_overview.rename(columns={"2018": "gdp_2018"}, errors="raise", inplace=True)
df_overview

Unnamed: 0,Confirmed,Deaths,Recovered,Active,fatality_rate,Location,PopMale,PopFemale,PopTotal,PopDensity,gdp_2018
0,555,18,32,505,0.032432,Afghanistan,19529.727,18512.030,38041.757,58.269,1.936297e+10
1,433,23,197,213,0.053118,Albania,1466.785,1414.128,2880.913,105.143,1.510250e+10
2,1825,275,460,1090,0.150685,Algeria,21749.666,21303.388,43053.054,18.076,1.737580e+11
3,601,26,71,504,0.043261,Andorra,,,77.146,164.140,3.236544e+09
4,19,2,4,13,0.105263,Angola,15744.779,16080.520,31825.299,25.528,1.057510e+11
...,...,...,...,...,...,...,...,...,...,...,...
173,258,0,144,114,0.000000,Vietnam,48151.352,48310.756,96462.108,311.098,2.452137e+11
174,268,2,57,209,0.007463,West Bank and Gaza,2526.350,2455.072,4981.422,827.479,1.461590e+10
175,1,0,0,1,0.000000,Yemen,14692.284,14469.638,29161.922,55.234,2.691440e+10
176,40,2,28,10,0.050000,Zambia,8843.214,9017.820,17861.034,24.026,2.672007e+10


In [17]:
df_overview['gdp_per_capita'] = df_overview.gdp_2018 / (df_overview.PopTotal * 1000)
df_overview.set_index('Location', inplace=True)

In [18]:
df_overview.sort_values(by = 'gdp_per_capita', ascending=False).head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate,PopMale,PopFemale,PopTotal,PopDensity,gdp_2018,gdp_per_capita
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Monaco,92,1,5,86,0.01087,,,38.967,26152.349,7184844000.0,184382.790377
Luxembourg,3270,62,500,2708,0.01896,311.11,304.62,615.73,237.734,70885330000.0,115124.041192
Switzerland,25107,1036,12100,11971,0.041263,4260.661,4330.7,8591.361,217.415,705140400000.0,82075.512153
Norway,6409,119,32,6258,0.018568,2716.812,2662.047,5378.859,14.726,434166600000.0,80717.233047
Ireland,8928,320,25,8583,0.035842,2422.418,2460.08,4882.498,70.874,382487500000.0,78338.483811
Iceland,1689,8,841,840,0.004737,170.23,168.807,339.037,3.382,25878480000.0,76329.355675
Qatar,2728,6,247,2475,0.002199,2133.521,698.55,2832.071,243.934,191362100000.0,67569.664713
Singapore,2299,8,528,1763,0.00348,3038.226,2766.117,5804.343,8291.919,364156700000.0,62738.65238
US,526396,20463,31270,0,0.038874,162826.299,166238.618,329064.917,35.974,20544340000000.0,62432.494002
Denmark,6191,260,2111,3820,0.041996,2869.672,2902.205,5771.877,136.033,355675300000.0,61622.125538


In [19]:
corr1 = df_overview.fatality_rate.corr(df_overview.gdp_per_capita)
corr1

-0.05684092353607492