# COVID-19, Population, and GDP Data Joined

In [2]:
import pandas as pd
import numpy as np

# Report the library versions this notebook was executed with.
for label, version in (
    ('pandas version: ', pd.__version__),
    ('numpy version: ', np.version.version),
):
    print(label, version)

pandas version:  1.0.3
numpy version:  1.18.1


In [4]:
# prepare the covid dataframe from covid data of Johns Hopkins University

# Keep the raw report under its own name so the aggregated frame below
# never overwrites the data it was built from.
df_covid_raw = pd.read_csv('data/2020-04-02.csv')

# Aggregate to one row per country.
# Resulting frame: Country_Region (index) plus five columns:
# Confirmed, Deaths, Recovered, Active, fatality_rate
COUNT_COLUMNS = ['Confirmed', 'Deaths', 'Recovered', 'Active']
df_covid = df_covid_raw.groupby('Country_Region')[COUNT_COLUMNS].sum()

# Deaths per confirmed case. Rows with zero confirmed cases are left as NaN
# instead of producing inf from a division by zero.
df_covid['fatality_rate'] = (
    df_covid['Deaths'] / df_covid['Confirmed']
).where(df_covid['Confirmed'] > 0)

df_covid

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,273,6,10,257,0.021978
Albania,277,16,76,185,0.057762
Algeria,986,86,61,839,0.087221
Andorra,428,15,10,403,0.035047
Angola,8,2,1,5,0.250000
...,...,...,...,...,...
Venezuela,146,5,43,98,0.034247
Vietnam,233,0,75,158,0.000000
West Bank and Gaza,161,1,18,142,0.006211
Zambia,39,1,0,38,0.025641


In [5]:
# prepare the population dataframe from population data of united nations.
population = pd.read_csv('data/WPP2019_TotalPopulationBySex.csv')

# UN location names -> the country names used as df_covid's Country_Region
# index, so the two frames can be joined on country name.
UN_TO_COVID_NAMES = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    'Myanmar': 'Burma',
    'Congo': 'Congo (Brazzaville)',
    'Democratic Republic of the Congo': 'Congo (Kinshasa)',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Iran (Islamic Republic of)': 'Iran',
    'Republic of Korea': 'Korea, South',
    "Lao People's Democratic Republic": 'Laos',
    'Republic of Moldova': 'Moldova',
    'Russian Federation': 'Russia',
    'Syrian Arab Republic': 'Syria',
    'China, Taiwan Province of China': 'Taiwan*',
    'United Republic of Tanzania': 'Tanzania',
    'United States of America': 'US',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'State of Palestine': 'West Bank and Gaza',
}

# Take an explicit copy of the 2019 rows: renaming locations on a filtered
# slice in place is chained assignment (SettingWithCopyWarning, and it may
# silently not apply at all). Series.replace with a dict matches whole values
# only, so 'Congo' does not touch 'Democratic Republic of the Congo'.
pop2019 = population.loc[population.Time == 2019].copy()
pop2019['Location'] = pop2019['Location'].replace(UN_TO_COVID_NAMES)

# Drop the identifier/metadata columns; Location and the Pop* columns remain.
df_pop2019 = pop2019.drop(columns=['LocID', 'VarID', 'Variant', 'Time', 'MidPeriod'])
df_pop2019

Unnamed: 0,Location,PopMale,PopFemale,PopTotal,PopDensity
69,Afghanistan,19529.727,18512.030,38041.757,58.269
953,Africa,653513.680,654550.496,1308064.176,44.119
1837,African Group,652644.714,653675.858,1306320.572,44.464
1988,African Union,652949.469,653953.561,1306903.030,44.085
2139,African Union: Central Africa,76945.498,77068.207,154013.705,29.192
...,...,...,...,...,...
277314,World,3889034.611,3824433.594,7713468.205,59.291
278198,World Bank Regional Groups (developing only),3261387.881,3191129.174,6452517.055,70.258
278349,Yemen,14692.284,14469.638,29161.922,55.234
279233,Zambia,8843.214,9017.820,17861.034,24.026


In [9]:
# prepare the gdp dataframe from gdp data of world bank.
# The first four lines of the file are skipped before the header row is read.
gdp_csv_path = 'data/gdp_world_bank/gdp_world_bank.csv'
gdp = pd.read_csv(gdp_csv_path, skiprows=4)

# Preview the first rows only; the frame has one column per year.
gdp.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2549721000.0,2534637000.0,2581564000.0,2649721000.0,2691620000.0,2646927000.0,2700559000.0,,,
1,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,...,17804280000.0,20001620000.0,20561050000.0,20484870000.0,19907110000.0,19362640000.0,20191760000.0,19362970000.0,,
2,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,111789700000.0,128052900000.0,136709900000.0,145712200000.0,116193600000.0,101123900000.0,122123800000.0,105751000000.0,,
3,Albania,ALB,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,12890870000.0,12319780000.0,12776280000.0,13228240000.0,11386930000.0,11861350000.0,13025060000.0,15102500000.0,,
4,Andorra,AND,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,3442063000.0,3164615000.0,3281585000.0,3350736000.0,2811489000.0,2877312000.0,3013387000.0,3236544000.0,,


In [10]:
# Keep only the country name and the 2018 GDP value.
# .copy() makes df_gdp an independent frame rather than a slice of `gdp`,
# so modifying df_gdp afterwards cannot trigger SettingWithCopyWarning
# or be silently lost.
df_gdp = gdp[['Country Name', '2018']].copy()
df_gdp

Unnamed: 0,Country Name,2018
0,Aruba,
1,Afghanistan,1.936297e+10
2,Angola,1.057510e+11
3,Albania,1.510250e+10
4,Andorra,3.236544e+09
...,...,...
259,Kosovo,7.938991e+09
260,"Yemen, Rep.",2.691440e+10
261,South Africa,3.682889e+11
262,Zambia,2.672007e+10


In [24]:
# Name the country column "Location" (the same name the population frame uses).
# The frame is rebuilt from `gdp` and rename() returns a new object, so this
# cell can be re-run safely; an in-place rename with errors="raise" fails with
# KeyError on the second run because "Country Name" is already gone.
df_gdp = gdp[['Country Name', '2018']].rename(
    columns={"Country Name": "Location"}, errors="raise"
)

In [25]:
# Countries in the covid frame whose name has no exact match among the
# GDP frame's Location values.
has_gdp_row = df_covid.index.isin(df_gdp['Location'])
countries_covid19_only = df_covid.loc[~has_gdp_row]
countries_covid19_only.index

Index(['Bahamas', 'Brunei', 'Burma', 'Congo (Brazzaville)', 'Congo (Kinshasa)',
       'Czechia', 'Diamond Princess', 'Egypt', 'Gambia', 'Holy See', 'Iran',
       'Korea, South', 'Kyrgyzstan', 'Laos', 'MS Zaandam', 'Russia',
       'Saint Kitts and Nevis', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'Slovakia', 'Syria', 'Taiwan*',
       'US', 'Venezuela'],
      dtype='object', name='Country_Region')

In [26]:
# Number of covid countries without a matching GDP Location.
countries_covid19_only.shape[0]

24

In [36]:
# Look for a GDP row whose Location contains the literal text 'Tai'.
# na=False treats missing names as "no match" (a NaN in the mask would make
# the boolean indexing raise); regex=False does a plain substring search.
df_gdp[df_gdp['Location'].str.contains('Tai', na=False, regex=False)]

Unnamed: 0,Location,2018


In [None]:
# Align World Bank country names with the names used in df_covid's index.
# TODO: this mapping covers only some of the names reported as unmatched by
# the countries_covid19_only check (e.g. 'Czechia', 'Egypt', 'Gambia', 'Iran',
# 'Korea, South', 'Russia', 'US', 'Venezuela' still need entries).
WB_TO_COVID_NAMES = {
    'Bahamas, The': 'Bahamas',
    'Brunei Darussalam': 'Brunei',
    'Myanmar': 'Burma',
    'Congo, Rep.': 'Congo (Brazzaville)',
    'Congo, Dem. Rep.': 'Congo (Kinshasa)',
}

# Build a new frame instead of calling replace(inplace=True) on the column
# accessor: the in-place form is chained mutation and may not reach df_gdp.
# Re-running this cell is harmless because the replaced names are not keys.
df_gdp = df_gdp.assign(Location=df_gdp['Location'].replace(WB_TO_COVID_NAMES))

In [6]:
# now join the covid and population data frames

# Inner join: only countries present in both frames are kept. The covid side
# is matched on its Country_Region index level, the population side on its
# Location column.
df_covid19_pop2019 = df_covid.merge(
    df_pop2019,
    left_on='Country_Region',
    right_on='Location',
    how='inner',
)
df_covid19_pop2019.head()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,fatality_rate,Location,PopMale,PopFemale,PopTotal,PopDensity
0,273,6,10,257,0.021978,Afghanistan,19529.727,18512.03,38041.757,58.269
1,277,16,76,185,0.057762,Albania,1466.785,1414.128,2880.913,105.143
2,986,86,61,839,0.087221,Algeria,21749.666,21303.388,43053.054,18.076
3,428,15,10,403,0.035047,Andorra,,,77.146,164.14
4,8,2,1,5,0.25,Angola,15744.779,16080.52,31825.299,25.528
