# Covid-19 Data Wrangling in Python

In [1]:
import pandas as pd
import numpy as np

# record the library versions this notebook was run with, for reproducibility
print('pandas version: ', pd.__version__)
print('numpy version: ', np.version.version)

pandas version:  1.0.1
numpy version:  1.18.1


In [2]:
# load the daily report CSV (path is relative to the notebook's working directory)
covid_daily_df = pd.read_csv('data/2020-03-28.csv')

# peek at the first rows to check that the columns parsed as expected
covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-28 23:05:37,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-28 23:05:37,30.295065,-92.414197,9,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-28 23:05:37,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-28 23:05:37,43.452658,-116.241552,76,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-28 23:05:37,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [3]:
# inspect the dtype pandas inferred for each column
covid_daily_df.dtypes

FIPS              float64
Admin2             object
Province_State     object
Country_Region     object
Last_Update        object
Lat               float64
Long_             float64
Confirmed           int64
Deaths              int64
Recovered           int64
Active              int64
Combined_Key       object
dtype: object

In [4]:
# (number of rows, number of columns) of the loaded report
covid_daily_df.shape

(3430, 12)

In [5]:
# Convert the FIPS county code column from float to integer.
# Missing codes are replaced by the sentinel -1 first, because a plain
# integer column cannot hold NA values.
covid_daily_df['FIPS'] = covid_daily_df['FIPS'].fillna(-1).astype(int)

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001,Abbeville,South Carolina,US,2020-03-28 23:05:37,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001,Acadia,Louisiana,US,2020-03-28 23:05:37,30.295065,-92.414197,9,1,0,0,"Acadia, Louisiana, US"
2,51001,Accomack,Virginia,US,2020-03-28 23:05:37,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001,Ada,Idaho,US,2020-03-28 23:05:37,43.452658,-116.241552,76,0,0,0,"Ada, Idaho, US"
4,19001,Adair,Iowa,US,2020-03-28 23:05:37,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [6]:
# list the distinct Country_Region values and count them

countries = covid_daily_df['Country_Region'].unique()
print(countries)
countries.size

['US' 'Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Brunei'
 'Bulgaria' 'Burkina Faso' 'Burma' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Central African Republic' 'Chad' 'Chile' 'Colombia'
 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan

177

In [7]:
# total confirmed cases per country
# The string 'sum' selects the built-in groupby reduction. Passing the NumPy
# callable np.sum to agg() is deprecated in recent pandas releases and emits
# a FutureWarning; the result is the same.
covid_daily_df.groupby('Country_Region').agg({'Confirmed': 'sum'})

Unnamed: 0_level_0,Confirmed
Country_Region,Unnamed: 1_level_1
Afghanistan,110
Albania,197
Algeria,454
Andorra,308
Angola,5
...,...
Venezuela,119
Vietnam,174
West Bank and Gaza,98
Zambia,28


In [8]:
# Build a per-country frame indexed by Country_Region with four summed
# columns: Confirmed, Deaths, Recovered, Active.
# Selecting the columns and calling .sum() replaces the dict of np.sum
# callables, which recent pandas releases deprecate (FutureWarning), and
# avoids repeating the same reducer four times.
df_country_cases = (
    covid_daily_df
    .groupby('Country_Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
)

df_country_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,110,4,2,104
Albania,197,10,31,156
Algeria,454,29,31,394
Andorra,308,3,1,304
Angola,5,0,0,5
...,...,...,...,...
Venezuela,119,2,39,78
Vietnam,174,0,21,153
West Bank and Gaza,98,1,18,79
Zambia,28,0,0,28


In [9]:
# confirm the aggregated columns kept their integer dtype
df_country_cases.dtypes

Confirmed    int64
Deaths       int64
Recovered    int64
Active       int64
dtype: object

In [10]:
# count the countries that do not have any confirmed corona patients yet
int(df_country_cases['Confirmed'].eq(0).sum())


0

In [11]:
# The previous check found no country with zero confirmed cases, so dividing
# by Confirmed cannot hit a zero denominator.
df_country_cases['fatality_rate'] = df_country_cases['Deaths'] / df_country_cases['Confirmed']

# keep countries with at least one death and more than 100 confirmed cases,
# ordered from the highest fatality rate to the lowest
fatalities = (
    df_country_cases
    .query('(Deaths > 0) & (Confirmed > 100)')
    .sort_values(by='fatality_rate', ascending=False)
)

fatalities.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,92472,10023,12384,70065,0.10839
San Marino,224,22,6,196,0.098214
Indonesia,1155,102,59,994,0.088312
Iraq,506,42,131,333,0.083004
Spain,73235,5982,12285,54968,0.081682
Iran,35408,2517,11679,21212,0.071086
Netherlands,9819,640,6,9173,0.06518
Algeria,454,29,31,394,0.063877
Philippines,1075,68,35,972,0.063256
Egypt,576,36,121,419,0.0625


In [12]:
# same filter as `fatalities`, ordered from the lowest fatality rate upwards
fatalities_asc = (
    df_country_cases
    .query('(Deaths > 0) & (Confirmed > 100)')
    .sort_values(by='fatality_rate', ascending=True)
)

fatalities_asc.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
South Africa,1187,1,31,1155,0.000842
Estonia,645,1,20,624,0.00155
Qatar,590,1,45,544,0.001695
Iceland,963,2,114,847,0.002077
Armenia,407,1,30,376,0.002457
Singapore,802,2,198,602,0.002494
Chile,1909,6,61,1842,0.003143
Russia,1264,4,49,1211,0.003165
Israel,3619,12,89,3518,0.003316
Saudi Arabia,1203,4,37,1162,0.003325


In [24]:
# total deaths over the countries kept in `fatalities`
# (i.e. those with Deaths > 0 and Confirmed > 100, not every country in the file)
fatalities.Deaths.sum()

30616

In [22]:
# pooled fatality rate: total deaths divided by total confirmed cases over the
# countries kept in `fatalities` (this is not the mean of the per-country rates)
fatality_overall = fatalities['Deaths'].sum() / fatalities['Confirmed'].sum()

fatality_overall

0.04663625216113087

In [15]:
# position of a country in the fatality-rate ordering

def fatality_rank(country: str, asc: bool) -> int:
    """Return the zero-based position of ``country`` in the sorted ``fatalities`` frame.

    Reads the module-level ``fatalities`` DataFrame, sorts it by
    ``fatality_rate`` and looks up ``country`` in the resulting index.

    Parameters
    ----------
    country : str
        Index label (a Country_Region value) to look up.
    asc : bool
        Sort direction: ``True`` puts the lowest rate at position 0,
        ``False`` puts the highest rate at position 0.

    Returns
    -------
    int
        Zero-based position, so the first entry is 0, not 1.

    Raises
    ------
    KeyError
        If ``country`` is not a label in the ``fatalities`` index.
    """
    ordered = fatalities.sort_values(by='fatality_rate', ascending=asc)
    return ordered.index.get_loc(country)


In [16]:
def get_fatality_and_rank(country: str):
    """Summarise one country's fatality rate and its rank among ``fatalities``.

    Reads the module-level ``fatalities`` DataFrame.

    Parameters
    ----------
    country : str
        Index label (a Country_Region value) present in ``fatalities``.

    Returns
    -------
    pandas.Series
        Entries ``Country_Region``, ``fatality_rate``, ``rank_asc`` and
        ``rank_dsc``. Both ranks are one-based: ``rank_asc == 1`` is the
        lowest rate and ``rank_dsc == 1`` is the highest.

    Raises
    ------
    KeyError
        If ``country`` is not a label in the ``fatalities`` index.
    """
    rate = fatalities.loc[country, 'fatality_rate']

    by_rate_asc = fatalities.sort_values(by='fatality_rate', ascending=True).index
    position = by_rate_asc.get_loc(country)  # zero-based position, lowest rate first

    summary = {
        'Country_Region': country,
        'fatality_rate': rate,
        'rank_asc': position + 1,
        # mirror of rank_asc: len - rank_asc + 1 == len - position
        'rank_dsc': len(by_rate_asc) - position,
    }
    return pd.Series(summary)


In [17]:
# smoke-test get_fatality_and_rank on a single country

get_fatality_and_rank('Italy')

Country_Region      Italy
fatality_rate     0.10839
rank_asc               89
rank_dsc                1
dtype: object

In [18]:
# how do these countries score in the fatality_rate?

cols = ['Country_Region', 'fatality_rate', 'rank_asc', 'rank_dsc']
countries = ['Austria', 'China', 'Germany', 'Italy', 'Singapore', 'South Africa', 'US']

# Collect one Series per country and build the frame in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame row by row copies it on every iteration.
df_fatality_scores = pd.DataFrame(
    [get_fatality_and_rank(country) for country in countries],
    columns=cols,
)

df_fatality_scores

Unnamed: 0,Country_Region,fatality_rate,rank_asc,rank_dsc
0,Austria,0.008221,24,66
1,China,0.040232,74,16
2,Germany,0.007505,20,70
3,Italy,0.10839,89,1
4,Singapore,0.002494,6,84
5,South Africa,0.000842,1,89
6,US,0.016678,42,48


In [19]:
# summary statistics (count, mean, std, min, quartiles, max) of the per-country fatality rate
fatalities.fatality_rate.describe()

count    89.000000
mean      0.025028
std       0.024105
min       0.000842
25%       0.008027
50%       0.017766
75%       0.030461
max       0.108390
Name: fatality_rate, dtype: float64

In [20]:
# median and mode of the per-country fatality rate
# NOTE: Series.mode() returns every value tied for the highest frequency.
# For a continuous rate where each value occurs once, that is the whole column,
# so the printed "mode" is not informative as-is.

fatality_median = fatalities['fatality_rate'].median()
fatality_mode = fatalities['fatality_rate'].mode()

print("Fatality median is {} and mode is {}".format(fatality_median, fatality_mode))


Fatality median is 0.017766497461928935 and mode is 0     0.000842
1     0.001550
2     0.001695
3     0.002077
4     0.002457
        ...   
84    0.081682
85    0.083004
86    0.088312
87    0.098214
88    0.108390
Length: 89, dtype: float64


In [21]:
# The raw mode is meaningless for a continuous rate, so group the rates first.

# Step 1: assign every country's fatality rate to one of 10 equal-width bins
bins = pd.cut(fatalities['fatality_rate'], bins=10, include_lowest=True)

bins

# TODO: inspect the content of each bin. Suggested starting points (not run here):
# bins.value_counts(sort=False) for the number of countries per bin, or
# fatalities.groupby(bins) to list the countries that fall into each bin.

Country_Region
Italy                             (0.0976, 0.108]
San Marino                        (0.0976, 0.108]
Indonesia                        (0.0869, 0.0976]
Iraq                             (0.0761, 0.0869]
Spain                            (0.0761, 0.0869]
                              ...                
Armenia         (-0.00026500000000000004, 0.0116]
Iceland         (-0.00026500000000000004, 0.0116]
Qatar           (-0.00026500000000000004, 0.0116]
Estonia         (-0.00026500000000000004, 0.0116]
South Africa    (-0.00026500000000000004, 0.0116]
Name: fatality_rate, Length: 89, dtype: category
Categories (10, interval[float64]): [(-0.00026500000000000004, 0.0116] < (0.0116, 0.0224] < (0.0224, 0.0331] < (0.0331, 0.0439] ... (0.0654, 0.0761] < (0.0761, 0.0869] < (0.0869, 0.0976] < (0.0976, 0.108]]