# Covid-19 Data Wrangling in Python

In [1]:
import pandas as pd
import numpy as np

print('pandas version: ', pd.__version__)
print('numpy version: ', np.version.version)

pandas version:  1.0.1
numpy version:  1.18.1


In [2]:
covid_daily_df = pd.read_csv('data/2020-03-27.csv')

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-27 22:14:55,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-27 22:14:55,30.295065,-92.414197,8,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-27 22:14:55,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-27 22:14:55,43.452658,-116.241552,54,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-27 22:14:55,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [3]:
covid_daily_df.dtypes

FIPS              float64
Admin2             object
Province_State     object
Country_Region     object
Last_Update        object
Lat               float64
Long_             float64
Confirmed           int64
Deaths              int64
Recovered           int64
Active              int64
Combined_Key       object
dtype: object

In [4]:
covid_daily_df.shape

(3429, 12)

In [5]:
# convert the FIPS County Code column to type of integer

covid_daily_df.FIPS = covid_daily_df.FIPS.fillna(-1) # get rid of NA 
covid_daily_df.FIPS = covid_daily_df.FIPS.astype(int)

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001,Abbeville,South Carolina,US,2020-03-27 22:14:55,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001,Acadia,Louisiana,US,2020-03-27 22:14:55,30.295065,-92.414197,8,1,0,0,"Acadia, Louisiana, US"
2,51001,Accomack,Virginia,US,2020-03-27 22:14:55,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001,Ada,Idaho,US,2020-03-27 22:14:55,43.452658,-116.241552,54,0,0,0,"Ada, Idaho, US"
4,19001,Adair,Iowa,US,2020-03-27 22:14:55,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [6]:
# show all unique Country_Region codes

countries = covid_daily_df.Country_Region.unique()
print(countries)
len(countries)

['US' 'Canada' 'China' 'Netherlands' 'Australia' 'United Kingdom'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Brunei'
 'Bulgaria' 'Burkina Faso' 'Burma' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Central African Republic' 'Chad' 'Chile' 'Colombia'
 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan

176

In [7]:
# statistics grouped by countries
covid_daily_df.groupby('Country_Region').agg({'Confirmed': np.sum})

Unnamed: 0_level_0,Confirmed
Country_Region,Unnamed: 1_level_1
Afghanistan,110
Albania,186
Algeria,409
Andorra,267
Angola,4
...,...
Venezuela,107
Vietnam,163
West Bank and Gaza,91
Zambia,22


In [8]:
# construct a dataframe with five columns: 
# Country_Region (index), Confirmed, Deaths, Recovered, Active
df_country_cases = covid_daily_df.groupby('Country_Region').agg({'Confirmed': np.sum, 'Deaths': np.sum, 'Recovered': np.sum, 'Active': np.sum})

df_country_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,110,4,2,104
Albania,186,8,31,147
Algeria,409,26,29,354
Andorra,267,3,1,263
Angola,4,0,0,4
...,...,...,...,...
Venezuela,107,1,31,75
Vietnam,163,0,20,143
West Bank and Gaza,91,1,17,73
Zambia,22,0,0,22


In [9]:
df_country_cases.dtypes

Confirmed    int64
Deaths       int64
Recovered    int64
Active       int64
dtype: object

In [10]:
# show counties that do not have any corona patients yet
len(df_country_cases.loc[df_country_cases['Confirmed'] == 0])


0

In [50]:
fatalities_asc = df_country_cases.query('(Deaths > 0) & (Confirmed > 100)').sort_values(by = 'fatality_rate', ascending = True)

fatalities_asc[:25]

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
South Africa,1170,1,31,1138,0.000855
Estonia,575,1,11,563,0.001739
Serbia,457,1,0,456,0.002188
Iceland,890,2,97,791,0.002247
Saudi Arabia,1104,3,35,1066,0.002717
Singapore,732,2,183,547,0.002732
Armenia,329,1,28,300,0.00304
Chile,1610,5,43,1562,0.003106
Russia,1036,4,45,987,0.003861
Czechia,2279,9,11,2259,0.003949


In [23]:
# last operation tells that all countries in this csv file have confirmed cases. 
# now we can calculate the fatality rate

df_country_cases['fatality_rate'] = df_country_cases.Deaths / df_country_cases.Confirmed

fatalities = df_country_cases.query('(Deaths > 0) & (Confirmed > 100)') \
                            .sort_values(by = 'fatality_rate', ascending = False)

fatalities[:25]

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,86498,9134,10950,66414,0.105598
San Marino,223,21,4,198,0.09417
Iraq,458,40,122,296,0.087336
Indonesia,1046,87,46,913,0.083174
Spain,65719,5138,9357,51224,0.078181
Iran,32332,2378,11133,18821,0.073549
Philippines,803,54,31,718,0.067248
Morocco,345,23,11,311,0.066667
Algeria,409,26,29,354,0.06357
Netherlands,8647,547,6,8094,0.063259


In [36]:
indices = fatalities.sort_values(by = 'fatality_rate', ascending = True).index

indices.get_loc('Germany')

18

In [37]:
# define a function which returns the rank of the fatality

def fatality_rank( countryName: str, asc: bool ) -> int:
    indices = fatalities.sort_values(by = 'fatality_rate', ascending = asc).index
    return indices.get_loc(countryName)


In [48]:
# How does Germany score in the fatality?

fatality_de = fatalities.loc['Germany'].fatality_rate
rank = fatality_rank('Germany', True)

print("Fatality of Germany is {}, it ranks {} among the countries which have more than 100 confirmed cases.".format(fatality_de, rank) )


Fatality of Germany is 0.006722887303178628, it ranks 18 among the countries which have more than 100 confirmed cases.


In [42]:
# basic statistics of the fatality
fatalities.fatality_rate.describe()

count    83.000000
mean      0.024420
std       0.024414
min       0.000855
25%       0.007533
50%       0.014895
75%       0.033356
max       0.105598
Name: fatality_rate, dtype: float64

In [43]:
# median and mode of fatality rate

fatality_median = fatalities.fatality_rate.median()
fatality_mode = fatalities.fatality_rate.mode()

print("Fatality median is {} and mode is {}".format(fatality_median, fatality_mode))


Fatality median is 0.014894984997856837 and mode is 0    0.009346
dtype: float64


In [39]:
# the mode above does not make sense, let's fix it.

# first, segment and sort fatality values into bins
bins = pd.cut(fatalities['fatality_rate'], 10, include_lowest=True)

# I'd like to see the content of each bin, what to do?

Country_Region
Italy              (0.0951, 0.106]
San Marino        (0.0846, 0.0951]
Iraq              (0.0846, 0.0951]
Indonesia         (0.0742, 0.0846]
Spain             (0.0742, 0.0846]
                       ...        
Saudi Arabia    (-0.00025, 0.0113]
Iceland         (-0.00025, 0.0113]
Serbia          (-0.00025, 0.0113]
Estonia         (-0.00025, 0.0113]
South Africa    (-0.00025, 0.0113]
Name: fatality_rate, Length: 83, dtype: category
Categories (10, interval[float64]): [(-0.00025, 0.0113] < (0.0113, 0.0218] < (0.0218, 0.0323] < (0.0323, 0.0428] ... (0.0637, 0.0742] < (0.0742, 0.0846] < (0.0846, 0.0951] < (0.0951, 0.106]]