# Covid-19 Data Wrangling in Python

In [1]:
import pandas as pd
import numpy as np

# Record the library versions this notebook was executed with.
print('pandas version: ', pd.__version__)
print('numpy version: ', np.__version__)

pandas version:  1.0.1
numpy version:  1.18.1


In [2]:
# Read the daily report csv into a DataFrame and preview its first rows.
covid_daily_df = pd.read_csv(filepath_or_buffer='data/2020-03-30.csv')

covid_daily_df.head(n=5)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-30 22:52:45,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-30 22:52:45,30.295065,-92.414197,11,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-30 22:52:45,37.767072,-75.632346,6,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-30 22:52:45,43.452658,-116.241552,113,2,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-30 22:52:45,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [3]:
# inspect the column dtypes pandas inferred while reading the csv
covid_daily_df.dtypes

FIPS              float64
Admin2             object
Province_State     object
Country_Region     object
Last_Update        object
Lat               float64
Long_             float64
Confirmed           int64
Deaths              int64
Recovered           int64
Active              int64
Combined_Key       object
dtype: object

In [4]:
# (number of rows, number of columns) of the loaded frame
covid_daily_df.shape

(3439, 12)

In [6]:
# Convert the FIPS County Code column from float to integer.
# A plain integer column cannot hold NA, so missing codes are first replaced
# by the sentinel value -1.
covid_daily_df['FIPS'] = covid_daily_df['FIPS'].fillna(-1).astype(int)

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001,Abbeville,South Carolina,US,2020-03-30 22:52:45,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001,Acadia,Louisiana,US,2020-03-30 22:52:45,30.295065,-92.414197,11,1,0,0,"Acadia, Louisiana, US"
2,51001,Accomack,Virginia,US,2020-03-30 22:52:45,37.767072,-75.632346,6,0,0,0,"Accomack, Virginia, US"
3,16001,Ada,Idaho,US,2020-03-30 22:52:45,43.452658,-116.241552,113,2,0,0,"Ada, Idaho, US"
4,19001,Adair,Iowa,US,2020-03-30 22:52:45,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [9]:
# List every distinct Country_Region value, then show how many there are.

countries = covid_daily_df['Country_Region'].unique()
print(countries)
len(countries)

['US' 'Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Botswana' 'Brazil'
 'Brunei' 'Bulgaria' 'Burkina Faso' 'Burma' 'Cabo Verde' 'Cambodia'
 'Cameroon' 'Central African Republic' 'Chad' 'Chile' 'Colombia'
 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jama

178

In [7]:
# statistics grouped by countries: total confirmed cases per Country_Region.
# The aggregation is named by the string 'sum' rather than the np.sum callable;
# newer pandas releases emit a FutureWarning when NumPy callables are passed to agg.
covid_daily_df.groupby('Country_Region').agg({'Confirmed': 'sum'})

Unnamed: 0_level_0,Confirmed
Country_Region,Unnamed: 1_level_1
Afghanistan,170
Albania,223
Algeria,584
Andorra,370
Angola,7
...,...
Venezuela,135
Vietnam,203
West Bank and Gaza,116
Zambia,35


In [8]:
# construct a dataframe with five columns:
# Country_Region (index), Confirmed, Deaths, Recovered, Active
# Every case column is summed per country. Using the built-in groupby sum
# avoids passing np.sum to agg, which newer pandas releases warn about.
case_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
df_country_cases = covid_daily_df.groupby('Country_Region')[case_columns].sum()

df_country_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,170,4,2,164
Albania,223,11,44,168
Algeria,584,35,37,512
Andorra,370,8,10,352
Angola,7,2,0,5
...,...,...,...,...
Venezuela,135,3,39,93
Vietnam,203,0,55,148
West Bank and Gaza,116,1,18,97
Zambia,35,0,0,35


In [10]:
# count the countries that do not have any confirmed corona cases yet
len(df_country_cases[df_country_cases['Confirmed'].eq(0)])


0

In [11]:
# The last operation shows that every country in this csv file has confirmed
# cases, so the fatality rate (Deaths / Confirmed) never divides by zero.

df_country_cases['fatality_rate'] = df_country_cases['Deaths'] / df_country_cases['Confirmed']

# Keep countries with at least one death and more than 100 confirmed cases,
# ordered from the highest to the lowest fatality rate.
has_deaths = df_country_cases['Deaths'] > 0
has_enough_cases = df_country_cases['Confirmed'] > 100
fatalities = df_country_cases[has_deaths & has_enough_cases].sort_values(by='fatality_rate', ascending=False)

fatalities.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,101739,11591,14620,75528,0.113929
San Marino,230,25,13,192,0.108696
Spain,87956,7716,16780,63460,0.087726
Indonesia,1414,122,75,1217,0.08628
Netherlands,11817,865,253,10699,0.0732
Iraq,630,46,152,432,0.073016
France,45170,3030,7964,34176,0.06708
Iran,41495,2757,13911,24827,0.066442
United Kingdom,22453,1411,171,20871,0.062842
Egypt,656,41,150,465,0.0625


In [23]:
# Countries with at least one death and more than 100 confirmed cases,
# ordered from the lowest to the highest fatality rate.
eligible = (df_country_cases['Deaths'] > 0) & (df_country_cases['Confirmed'] > 100)
fatalities_asc = df_country_cases[eligible].sort_values(by='fatality_rate', ascending=True)

fatalities_asc.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Qatar,693,1,51,641,0.001443
New Zealand,589,1,63,525,0.001698
Iceland,1086,2,157,927,0.001842
South Africa,1326,3,31,1292,0.002262
Uruguay,310,1,0,309,0.003226
Chile,2449,8,156,2285,0.003267
Kazakhstan,302,1,21,280,0.003311
Israel,4695,16,161,4518,0.003408
Singapore,879,3,228,648,0.003413
Australia,4361,17,257,4087,0.003898


In [14]:
# total number of deaths over the rows of `fatalities`
fatalities['Deaths'].sum()

37534

In [26]:
# fatality rate over the rows of `fatalities`
# (countries with more than 100 confirmed cases and at least one death)

total_deaths_partial = fatalities['Deaths'].sum()
total_confirmed_partial = fatalities['Confirmed'].sum()
fatality_partial = total_deaths_partial / total_confirmed_partial

fatality_partial

0.04820140466500833

In [25]:
# fatality rate over all countries, regardless of how many confirmed cases and deaths.

total_deaths_overall = df_country_cases['Deaths'].sum()
total_confirmed_overall = df_country_cases['Confirmed'].sum()
fatality_overall = total_deaths_overall / total_confirmed_overall

fatality_overall

0.048036402446428456

In [16]:
# define a function which returns the rank of the fatality

def fatality_rank(country: str, asc: bool, source: pd.DataFrame = None) -> int:
    """Return the 0-based position of `country` when sorted by fatality rate.

    Note that the result is zero-based: the first country in the requested
    ordering has rank 0 (unlike `get_fatality_and_rank`, which adds 1).

    Parameters
    ----------
    country : str
        Index label of the country to look up.
    asc : bool
        True to rank from lowest to highest fatality rate, False for the
        opposite direction.
    source : pd.DataFrame, optional
        Frame indexed by country with a 'fatality_rate' column. When omitted,
        the notebook-level `fatalities` frame is used, which is the original
        behaviour.

    Returns
    -------
    int
        Position of `country` in the sorted index.

    Raises
    ------
    KeyError
        If `country` is not present in the index.
    """
    # Fall back to the notebook-level frame only when no explicit frame is given.
    table = fatalities if source is None else source
    ordered_index = table.sort_values(by='fatality_rate', ascending=asc).index
    return ordered_index.get_loc(country)


In [17]:
# Country labels ordered from the lowest to the highest fatality rate
# (ascending is the default sort direction of sort_values).
ranks_asc = fatalities.sort_values('fatality_rate').index

def get_fatality_and_rank(country: str, source: pd.DataFrame = None) -> pd.Series:
    """Return the fatality rate of `country` together with its 1-based ranks.

    Parameters
    ----------
    country : str
        Index label of the country to look up.
    source : pd.DataFrame, optional
        Frame indexed by country with a 'fatality_rate' column. When omitted,
        the notebook-level `fatalities` frame and its precomputed ascending
        ordering `ranks_asc` are used, which is the original behaviour.

    Returns
    -------
    pd.Series
        Entries 'Country_Region', 'fatality_rate', 'rank_asc' (1 = lowest
        rate) and 'rank_dsc' (1 = highest rate).

    Raises
    ------
    KeyError
        If `country` is not present in the index.
    """
    if source is None:
        # Default path: reuse the notebook-level frame and its cached ordering.
        table, order = fatalities, ranks_asc
    else:
        table = source
        order = source.sort_values(by='fatality_rate', ascending=True).index

    # Scalar lookup of the rate for this country.
    fatality = table.loc[country, 'fatality_rate']

    # get_loc is 0-based, so shift by one to obtain a human-friendly rank.
    rank_asc = order.get_loc(country) + 1
    # The descending rank mirrors the ascending one around the table length.
    rank_dsc = len(order) - rank_asc + 1

    return pd.Series({'Country_Region': country, 'fatality_rate': fatality, 'rank_asc': rank_asc, 'rank_dsc': rank_dsc})


In [18]:
# quick manual check of get_fatality_and_rank for a single country

get_fatality_and_rank(country='Italy')

Country_Region       Italy
fatality_rate     0.113929
rank_asc                96
rank_dsc                 1
dtype: object

In [19]:
# how do these countries score in the fatality_rate?

cols = ['Country_Region', 'fatality_rate', 'rank_asc', 'rank_dsc']
countries = ['Austria', 'China', 'Germany', 'Italy', 'Qatar', 'Singapore', 'US']

# Collect one row per country and build the frame in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# appending inside a loop copies the whole frame on every iteration.
score_rows = [get_fatality_and_rank(country) for country in countries]
df_fatality_scores = pd.DataFrame(score_rows, columns=cols)

df_fatality_scores

Unnamed: 0,Country_Region,fatality_rate,rank_asc,rank_dsc
0,Austria,0.011229,30,67
1,China,0.040244,77,20
2,Germany,0.009643,27,70
3,Italy,0.113929,96,1
4,Qatar,0.001443,1,96
5,Singapore,0.003413,9,88
6,US,0.018405,45,52


In [20]:
# summary statistics (count, mean, std, quartiles, extremes) of the fatality rate
fatalities['fatality_rate'].describe()

count    96.000000
mean      0.026130
std       0.023618
min       0.001443
25%       0.008511
50%       0.020262
75%       0.033060
max       0.113929
Name: fatality_rate, dtype: float64

In [21]:
# median and mode of fatality rate

fatality_median = fatalities['fatality_rate'].median()
# Series.mode() returns a Series because there can be several modes.
fatality_mode = fatalities['fatality_rate'].mode()

# Format the mode values as a plain list; formatting the Series itself would
# embed its multi-line repr (index and dtype) in the message.
print("Fatality median is {} and mode is {}".format(fatality_median, fatality_mode.tolist()))


Fatality median is 0.020261576256727025 and mode is 0    0.023529
dtype: float64


In [22]:
# The mode above does not make sense for continuous values, so let's fix it.

# First, segment the fatality values into 10 equal-width bins;
# each country is labelled with the interval its rate falls into.
bins = pd.cut(fatalities['fatality_rate'], bins=10, include_lowest=True)

bins

# TODO: inspect the content of each bin. Grouping by the bin labels, e.g.
# fatalities.groupby(bins), or counting them with bins.value_counts(),
# would be one way to do it.

Country_Region
Italy              (0.103, 0.114]
San Marino         (0.103, 0.114]
Spain            (0.0802, 0.0914]
Indonesia        (0.0802, 0.0914]
Netherlands      (0.0689, 0.0802]
                      ...        
Uruguay         (0.00033, 0.0127]
South Africa    (0.00033, 0.0127]
Iceland         (0.00033, 0.0127]
New Zealand     (0.00033, 0.0127]
Qatar           (0.00033, 0.0127]
Name: fatality_rate, Length: 96, dtype: category
Categories (10, interval[float64]): [(0.00033, 0.0127] < (0.0127, 0.0239] < (0.0239, 0.0352] < (0.0352, 0.0464] ... (0.0689, 0.0802] < (0.0802, 0.0914] < (0.0914, 0.103] < (0.103, 0.114]]