# Covid-19 Data Wrangling in Python

## Fatality statistics and ranks among countries

In [35]:
import pandas as pd
import numpy as np

# Record the library versions this notebook was executed with.
print('pandas version: ', pd.__version__)
print('numpy version: ', np.__version__)

pandas version:  1.0.3
numpy version:  1.18.1


In [36]:
# Read the daily report snapshot (one row per reported location) and
# preview its first five rows.
covid_daily_df = pd.read_csv(filepath_or_buffer='data/2020-04-04.csv')

covid_daily_df.head(5)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-04-04 23:34:21,34.223334,-82.461707,6,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-04-04 23:34:21,30.295065,-92.414197,65,2,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-04-04 23:34:21,37.767072,-75.632346,8,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-04-04 23:34:21,43.452658,-116.241552,360,3,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-04-04 23:34:21,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [38]:
# (rows, columns) of the loaded daily report
covid_daily_df.shape

(2679, 12)

In [40]:
# List every distinct Country_Region value found in the daily report
# and show how many there are.

countries = covid_daily_df['Country_Region'].unique()
print(countries)
countries.size

['US' 'Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Burundi' 'Cabo Verde'
 'Cambodia' 'Cameroon' 'Central African Republic' 'Chad' 'Chile'
 'Colombia' 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica'
 "Cote d'Ivoire" 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess'
 'Djibouti' 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt'
 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini'
 'Ethiopia' 'Fiji' 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana'
 'Greece' 'Grenada' 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti'
 'Holy See' 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran'
 'Iraq' 'Ireland' 'Israel' 'It

181

In [41]:
# Total confirmed cases per country: the report has one row per location,
# so rows are summed within each Country_Region.
# The aggregation is named with the string alias 'sum' rather than np.sum;
# newer pandas versions warn when a NumPy callable is passed to agg().
covid_daily_df.groupby('Country_Region').agg({'Confirmed': 'sum'})

Unnamed: 0_level_0,Confirmed
Country_Region,Unnamed: 1_level_1
Afghanistan,299
Albania,333
Algeria,1251
Andorra,466
Angola,10
...,...
Venezuela,155
Vietnam,240
West Bank and Gaza,217
Zambia,39


In [42]:
# Construct a dataframe indexed by Country_Region with four summed columns:
# Confirmed, Deaths, Recovered, Active.
# Selecting the columns and calling .sum() avoids passing np.sum to agg(),
# which newer pandas versions flag with a FutureWarning.
df_country_cases = (
    covid_daily_df
    .groupby('Country_Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
)

df_country_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,299,7,10,282
Albania,333,20,99,214
Algeria,1251,130,90,1031
Andorra,466,17,21,428
Angola,10,2,2,6
...,...,...,...,...
Venezuela,155,7,52,96
Vietnam,240,0,90,150
West Bank and Gaza,217,1,21,195
Zambia,39,1,2,36


In [43]:
# Count the countries that do not have any confirmed corona cases yet.
int((df_country_cases['Confirmed'] == 0).sum())

0

In [44]:
# The previous cell found no country with zero confirmed cases, so the
# ratio below is defined for every row.
# NOTE: this adds the fatality_rate column to df_country_cases in place.

df_country_cases['fatality_rate'] = df_country_cases['Deaths'] / df_country_cases['Confirmed']

# Keep only countries with at least one death and more than 30 confirmed
# cases, ordered from the highest fatality rate to the lowest.
fatalities = df_country_cases.loc[
    (df_country_cases['Deaths'] > 0) & (df_country_cases['Confirmed'] > 30)
].sort_values(by='fatality_rate', ascending=False)

fatalities.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Marino,259,32,27,200,0.123552
Italy,124632,15362,20996,88274,0.123259
Congo (Kinshasa),154,18,3,133,0.116883
Bangladesh,70,8,30,32,0.114286
Algeria,1251,130,90,1031,0.103917
United Kingdom,42477,4320,215,37942,0.101702
Netherlands,16727,1656,262,14809,0.099002
Spain,126168,11947,34219,80002,0.094691
Indonesia,2092,191,150,1751,0.0913
France,90848,7574,15572,67702,0.08337


In [69]:
# Same filter as for `fatalities`, but ordered from the lowest
# fatality rate to the highest.
fatalities_asc = (
    df_country_cases
    .query('(Deaths > 0) & (Confirmed > 30)')
    .sort_values(by='fatality_rate', ascending=True)
)

fatalities_asc.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New Zealand,950,1,127,822,0.001053
Latvia,509,1,1,507,0.001965
Kuwait,479,1,93,385,0.002088
Slovakia,471,1,10,460,0.002123
Qatar,1325,3,109,1213,0.002264
Iceland,1417,4,396,1017,0.002823
Cote d'Ivoire,245,1,25,219,0.004082
Costa Rica,435,2,13,420,0.004598
West Bank and Gaza,217,1,21,195,0.004608
Singapore,1189,6,297,886,0.005046


In [46]:
# Total deaths summed over every country in the report.
df_country_cases['Deaths'].sum()

64606

In [47]:
# Pooled fatality rate over the countries kept in `fatalities`, i.e. those
# with at least one death and more than 30 confirmed cases.

fatality_partial = fatalities['Deaths'].sum() / fatalities['Confirmed'].sum()

fatality_partial

0.054003064098532845

In [48]:
# Pooled fatality rate over all countries, regardless of how many
# confirmed cases and deaths each one reports.

fatality_overall = df_country_cases['Deaths'].sum() / df_country_cases['Confirmed'].sum()

fatality_overall

0.05395501104471753

In [49]:
# Position of a country in the fatality-rate ordering of `fatalities`.

def fatality_rank(country: str, asc: bool) -> int:
    """Return the zero-based position of ``country`` in the fatality ordering.

    The module-level ``fatalities`` frame is sorted by ``fatality_rate`` on
    every call and the position of ``country`` in the sorted index is
    returned as-is, so the first entry has position 0 (unlike the
    one-based ranks produced by ``get_fatality_and_rank``).

    Parameters
    ----------
    country : str
        Index label (Country_Region) to look up in ``fatalities``.
    asc : bool
        True to order from lowest to highest rate, False for the reverse.

    Raises
    ------
    KeyError
        Raised by the index lookup when ``country`` is not a label in
        ``fatalities``.
    """
    ordered = fatalities.sort_values(by='fatality_rate', ascending=asc)
    return ordered.index.get_loc(country)


In [50]:
# Country labels ordered from the lowest to the highest fatality rate;
# a label's position in this index plus one is its ascending rank.
ranks_asc = fatalities.sort_values(by='fatality_rate', ascending=True).index

def get_fatality_and_rank(country: str) -> pd.Series:
    """Return the fatality rate of ``country`` with its one-based ranks.

    The result is a Series with four entries: ``Country_Region``,
    ``fatality_rate``, ``rank_asc`` (1 = lowest rate) and ``rank_dsc``
    (1 = highest rate). Both ranks are derived from the position of
    ``country`` in the module-level ``ranks_asc`` index.

    Raises
    ------
    KeyError
        Raised by the index lookup when ``country`` is not a label in
        ``fatalities``.
    """
    position = ranks_asc.get_loc(country)
    total = len(ranks_asc)

    return pd.Series({
        'Country_Region': country,
        'fatality_rate': fatalities.loc[country].fatality_rate,
        'rank_asc': position + 1,
        # total - position is the same as total - rank_asc + 1
        'rank_dsc': total - position,
    })


In [51]:
# Smoke-test get_fatality_and_rank on a single country and display the
# returned Series.

get_fatality_and_rank('Italy')

Country_Region       Italy
fatality_rate     0.123259
rank_asc               120
rank_dsc                 2
dtype: object

In [52]:
# How do these countries score in the fatality_rate?

cols = ['Country_Region', 'fatality_rate', 'rank_asc', 'rank_dsc']
# NOTE: this rebinds `countries`, which an earlier cell used for the array
# of all unique Country_Region values.
countries = ['Austria', 'China', 'Germany', 'Italy', 'New Zealand', 'San Marino', 'Singapore', 'US']

# Collect one Series per country and build the frame in a single step.
# DataFrame.append copied the whole frame on every iteration, was
# deprecated in pandas 1.4 and no longer exists in pandas 2.0.
df_fatality_scores = pd.DataFrame(
    [get_fatality_and_rank(country) for country in countries],
    columns=cols,
)

df_fatality_scores

Unnamed: 0,Country_Region,fatality_rate,rank_asc,rank_dsc
0,Austria,0.015788,47,75
1,China,0.040343,86,36
2,Germany,0.015027,43,79
3,Italy,0.123259,120,2
4,New Zealand,0.001053,1,121
5,San Marino,0.123552,121,1
6,Singapore,0.005046,10,112
7,US,0.02722,68,54


In [53]:
# Summary statistics of the per-country fatality rate in `fatalities`.
fatalities['fatality_rate'].describe()

count    121.000000
mean       0.032446
std        0.028943
min        0.001053
25%        0.011171
50%        0.024507
75%        0.045161
max        0.123552
Name: fatality_rate, dtype: float64

In [54]:
# Median and mode of the per-country fatality rate.
# Series.mode() returns a Series holding every most-frequent value, which
# is why the formatted message below spans several lines.

fatality_median = fatalities['fatality_rate'].median()
fatality_mode = fatalities['fatality_rate'].mode()

print("Fatality median is {} and mode is {}".format(fatality_median, fatality_mode))


Fatality median is 0.024506873879258817 and mode is 0    0.007407
1    0.073171
dtype: float64


In [55]:
# The mode above is not meaningful for a continuous rate, so work with
# intervals instead.

# Assign every country's fatality rate to one of 10 equal-width intervals.
bins = pd.cut(fatalities['fatality_rate'], bins=10, include_lowest=True)

bins

# I'd like to see the content of each bin, what to do?
# (One option to try: bins.value_counts(sort=False) for the count per
# interval, or fatalities.groupby(bins).groups for the members.)

Country_Region
San Marino                            (0.111, 0.124]
Italy                                 (0.111, 0.124]
Congo (Kinshasa)                      (0.111, 0.124]
Bangladesh                            (0.111, 0.124]
Algeria                              (0.0991, 0.111]
                                  ...               
Qatar               (-6.999999999999997e-05, 0.0133]
Slovakia            (-6.999999999999997e-05, 0.0133]
Kuwait              (-6.999999999999997e-05, 0.0133]
Latvia              (-6.999999999999997e-05, 0.0133]
New Zealand         (-6.999999999999997e-05, 0.0133]
Name: fatality_rate, Length: 121, dtype: category
Categories (10, interval[float64]): [(-6.999999999999997e-05, 0.0133] < (0.0133, 0.0256] < (0.0256, 0.0378] < (0.0378, 0.0501] ... (0.0746, 0.0868] < (0.0868, 0.0991] < (0.0991, 0.111] < (0.111, 0.124]]

## Joined with Population data from United Nations 

Population figures are given in thousands of persons.  

In [56]:
# Read the population table and preview its first five rows.
population = pd.read_csv(filepath_or_buffer='data/WPP2019_TotalPopulationBySex.csv')
population.head(5)

Unnamed: 0,LocID,Location,VarID,Variant,Time,MidPeriod,PopMale,PopFemale,PopTotal,PopDensity
0,4,Afghanistan,2,Medium,1950,1950.5,4099.243,3652.874,7752.117,11.874
1,4,Afghanistan,2,Medium,1951,1951.5,4134.756,3705.395,7840.151,12.009
2,4,Afghanistan,2,Medium,1952,1952.5,4174.45,3761.546,7935.996,12.156
3,4,Afghanistan,2,Medium,1953,1953.5,4218.336,3821.348,8039.684,12.315
4,4,Afghanistan,2,Medium,1954,1954.5,4266.484,3884.832,8151.316,12.486


In [57]:
# We only need the rows for the year 2019.
# .copy() makes pop2019 an independent frame, so modifying it later does
# not operate on a slice of `population`.
pop2019 = population[population.Time == 2019].copy()
pop2019.head()

Unnamed: 0,LocID,Location,VarID,Variant,Time,MidPeriod,PopMale,PopFemale,PopTotal,PopDensity
69,4,Afghanistan,2,Medium,2019,2019.5,19529.727,18512.03,38041.757,58.269
953,903,Africa,2,Medium,2019,2019.5,653513.68,654550.496,1308064.176,44.119
1837,1823,African Group,2,Medium,2019,2019.5,652644.714,653675.858,1306320.572,44.464
1988,1560,African Union,2,Medium,2019,2019.5,652949.469,653953.561,1306903.03,44.085
2139,2080,African Union: Central Africa,2,Medium,2019,2019.5,76945.498,77068.207,154013.705,29.192


In [58]:
# Number of rows (locations) in the 2019 population slice.
pop2019.shape[0]

477

### Now we need to unify the location names of the population data and the corona data

In [59]:
# The population data contains many more location names than the corona
# data, so start from the corona side: keep the countries whose name does
# not appear among the population Location values.

countryNames_covid19_only = df_country_cases[~df_country_cases.index.isin(pop2019.Location)]

countryNames_covid19_only.index

Index(['Bolivia', 'Brunei', 'Burma', 'Congo (Brazzaville)', 'Congo (Kinshasa)',
       'Cote d'Ivoire', 'Diamond Princess', 'Iran', 'Korea, South', 'Kosovo',
       'Laos', 'MS Zaandam', 'Moldova', 'Russia', 'Syria', 'Taiwan*',
       'Tanzania', 'US', 'Venezuela', 'Vietnam', 'West Bank and Gaza'],
      dtype='object', name='Country_Region')

In [60]:
# Number of corona country names without a match in the population data.
countryNames_covid19_only.shape[0]

21

In [61]:
# Look up how the population data spells the location containing 'Pales'.
pop2019.loc[pop2019['Location'].str.contains('Pales')]

Unnamed: 0,LocID,Location,VarID,Variant,Time,MidPeriod,PopMale,PopFemale,PopTotal,PopDensity
236754,275,State of Palestine,2,Medium,2019,2019.5,2526.35,2455.072,4981.422,827.479


In [62]:
# Map the location names used in the population data to the spellings used
# in the corona data, so both frames can be joined on the country name.
un_to_covid_names = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    'Myanmar': 'Burma',
    'Congo': 'Congo (Brazzaville)',
    'Democratic Republic of the Congo': 'Congo (Kinshasa)',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Iran (Islamic Republic of)': 'Iran',
    'Republic of Korea': 'Korea, South',
    "Lao People's Democratic Republic": 'Laos',
    'Republic of Moldova': 'Moldova',
    'Russian Federation': 'Russia',
    'Syrian Arab Republic': 'Syria',
    'China, Taiwan Province of China': 'Taiwan*',
    'United Republic of Tanzania': 'Tanzania',
    'United States of America': 'US',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'State of Palestine': 'West Bank and Gaza',
}

# Assign the renamed column back to the frame. Calling
# replace(..., inplace=True) through `pop2019.Location` mutates an
# intermediate Series: pandas 2.x warns about it, and with Copy-on-Write
# enabled the frame is left unchanged.
pop2019 = pop2019.assign(Location=pop2019['Location'].replace(un_to_covid_names))

In [63]:
# Keep only the location name and the population figures; the identifier,
# variant and time columns are not needed for the join.

df_pop2019 = pop2019.drop(['LocID', 'VarID', 'Variant', 'Time', 'MidPeriod'], axis='columns')
df_pop2019.head()

Unnamed: 0,Location,PopMale,PopFemale,PopTotal,PopDensity
69,Afghanistan,19529.727,18512.03,38041.757,58.269
953,Africa,653513.68,654550.496,1308064.176,44.119
1837,African Group,652644.714,653675.858,1306320.572,44.464
1988,African Union,652949.469,653953.561,1306903.03,44.085
2139,African Union: Central Africa,76945.498,77068.207,154013.705,29.192


In [64]:
# Join the corona and population frames on the country name.
# An inner join keeps only the countries present in both frames.

df_covid19_pop2019 = df_country_cases.merge(
    df_pop2019,
    how='inner',
    left_on='Country_Region',
    right_on='Location',
)
df_covid19_pop2019.head()

Unnamed: 0,Confirmed,Deaths,Recovered,Active,fatality_rate,Location,PopMale,PopFemale,PopTotal,PopDensity
0,299,7,10,282,0.023411,Afghanistan,19529.727,18512.03,38041.757,58.269
1,333,20,99,214,0.06006,Albania,1466.785,1414.128,2880.913,105.143
2,1251,130,90,1031,0.103917,Algeria,21749.666,21303.388,43053.054,18.076
3,466,17,21,428,0.036481,Andorra,,,77.146,164.14
4,10,2,2,6,0.2,Angola,15744.779,16080.52,31825.299,25.528


In [65]:
# (rows, columns) of the joined frame
df_covid19_pop2019.shape

(178, 10)

In [66]:
# Is the fatality rate correlated with population density?
# Series.corr computes the Pearson coefficient by default.

corr_fatality_popDensity = df_covid19_pop2019['fatality_rate'].corr(df_covid19_pop2019['PopDensity'])
corr_fatality_popDensity

-0.05805112816227241

In [67]:
# Correlation between confirmed cases and total population.
corr_confirmed_popTotal = df_covid19_pop2019['Confirmed'].corr(df_covid19_pop2019['PopTotal'])
corr_confirmed_popTotal

0.2770900392937214

In [68]:
# Correlation between deaths and total population.
corr_deaths_popTotal = df_covid19_pop2019['Deaths'].corr(df_covid19_pop2019['PopTotal'])
corr_deaths_popTotal

0.16903918100873583