# Covid-19 Data Wrangling in Python

In [2]:
import pandas as pd
import numpy as np

# report the library versions this notebook was executed with
print('pandas version: ', pd.__version__)
print('numpy version: ', np.__version__)

pandas version:  1.0.1
numpy version:  1.18.1


In [3]:
# load one daily snapshot of case counts from a local CSV file
# NOTE(review): the column layout (FIPS, Admin2, Province_State, ...) looks like a
# JHU CSSE daily report — TODO confirm and record the data source
covid_daily_df = pd.read_csv('data/2020-03-29.csv')

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-29 23:08:25,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-29 23:08:25,30.295065,-92.414197,9,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-29 23:08:25,37.767072,-75.632346,3,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-29 23:08:25,43.452658,-116.241552,92,1,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-29 23:08:25,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [3]:
# inspect the column dtypes that read_csv inferred
covid_daily_df.dtypes

FIPS              float64
Admin2             object
Province_State     object
Country_Region     object
Last_Update        object
Lat               float64
Long_             float64
Confirmed           int64
Deaths              int64
Recovered           int64
Active              int64
Combined_Key       object
dtype: object

In [4]:
# (number of rows, number of columns) of the loaded snapshot
covid_daily_df.shape

(3434, 12)

In [5]:
# cast the FIPS county code column from float to integer;
# missing codes are first replaced by the sentinel -1, because NaN cannot be cast to int
covid_daily_df['FIPS'] = covid_daily_df['FIPS'].fillna(-1).astype(int)

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001,Abbeville,South Carolina,US,2020-03-29 23:08:25,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001,Acadia,Louisiana,US,2020-03-29 23:08:25,30.295065,-92.414197,9,1,0,0,"Acadia, Louisiana, US"
2,51001,Accomack,Virginia,US,2020-03-29 23:08:25,37.767072,-75.632346,3,0,0,0,"Accomack, Virginia, US"
3,16001,Ada,Idaho,US,2020-03-29 23:08:25,43.452658,-116.241552,92,1,0,0,"Ada, Idaho, US"
4,19001,Adair,Iowa,US,2020-03-29 23:08:25,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [6]:
# list every distinct Country_Region value, then show how many there are

countries = covid_daily_df['Country_Region'].unique()
print(countries)
len(countries)

['US' 'Canada' 'United Kingdom' 'China' 'Netherlands' 'Australia'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Brunei'
 'Bulgaria' 'Burkina Faso' 'Burma' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Central African Republic' 'Chad' 'Chile' 'Colombia'
 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan

177

In [7]:
# total confirmed cases per country
# (the aggregation is named by the string 'sum': it yields the same totals as np.sum,
#  but newer pandas versions emit a FutureWarning when the NumPy callable is passed)
covid_daily_df.groupby('Country_Region').agg({'Confirmed': 'sum'})

Unnamed: 0_level_0,Confirmed
Country_Region,Unnamed: 1_level_1
Afghanistan,120
Albania,212
Algeria,511
Andorra,334
Angola,7
...,...
Venezuela,119
Vietnam,188
West Bank and Gaza,109
Zambia,29


In [8]:
# construct a dataframe with five columns:
# Country_Region (index), Confirmed, Deaths, Recovered, Active
# Selecting the count columns and calling .sum() once replaces the dict that repeated
# np.sum per column (passing np.sum to agg triggers a FutureWarning in newer pandas).
case_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
df_country_cases = covid_daily_df.groupby('Country_Region')[case_columns].sum()

df_country_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,120,4,2,114
Albania,212,10,33,169
Algeria,511,31,31,449
Andorra,334,6,1,327
Angola,7,2,0,5
...,...,...,...,...
Venezuela,119,2,39,78
Vietnam,188,0,25,163
West Bank and Gaza,109,1,18,90
Zambia,29,0,0,29


In [9]:
# count the countries that do not have any confirmed corona cases yet
no_case_mask = df_country_cases['Confirmed'] == 0
len(df_country_cases.loc[no_case_mask])


0

In [10]:
# the previous check shows that every country in this csv file has confirmed cases,
# so Confirmed is never zero and can be used as the divisor of the fatality rate

df_country_cases['fatality_rate'] = df_country_cases['Deaths'] / df_country_cases['Confirmed']

# keep countries with at least one death and more than 100 confirmed cases,
# ordered from the highest fatality rate downwards
fatalities = (
    df_country_cases
    .query('(Deaths > 0) & (Confirmed > 100)')
    .sort_values(by='fatality_rate', ascending=False)
)

fatalities.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,97689,10779,13030,73880,0.11034
San Marino,224,22,6,196,0.098214
Indonesia,1285,114,64,1107,0.088716
Spain,80110,6803,14709,58598,0.084921
Iraq,547,42,143,362,0.076782
Netherlands,10930,772,253,9905,0.070631
Iran,38309,2640,12391,23278,0.068913
Egypt,609,40,132,437,0.065681
France,40708,2611,7226,30871,0.06414
United Kingdom,19780,1231,151,18398,0.062235


In [11]:
# same filter as used for `fatalities`, but ordered from the lowest fatality rate upwards
fatalities_asc = (
    df_country_cases
    .query('(Deaths > 0) & (Confirmed > 100)')
    .sort_values(by='fatality_rate', ascending=True)
)

fatalities_asc.head(25)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
South Africa,1280,2,31,1247,0.001563
Qatar,634,1,48,585,0.001577
New Zealand,514,1,56,457,0.001946
Iceland,1020,2,135,883,0.001961
Chile,2139,7,75,2057,0.003273
Uruguay,304,1,0,303,0.003289
Kazakhstan,284,1,20,263,0.003521
Israel,4247,15,132,4100,0.003532
Singapore,844,3,212,629,0.003555
Australia,3984,16,244,3724,0.004016


In [12]:
# total number of deaths over the filtered countries
fatalities['Deaths'].sum()

33888

In [13]:
# overall fatality rate of the filtered countries: total deaths divided by total confirmed cases
fatality_overall = fatalities['Deaths'].sum() / fatalities['Confirmed'].sum()

fatality_overall

0.04727764314922006

In [14]:
# define a function which returns the rank of the fatality

def fatality_rank(country: str, asc: bool, frame: pd.DataFrame = None) -> int:
    """Return the position of a country in the table sorted by fatality rate.

    Parameters
    ----------
    country : str
        Index label of the country to look up.
    asc : bool
        Sort direction: True puts the lowest fatality rate at position 0,
        False puts the highest fatality rate at position 0.
    frame : pd.DataFrame, optional
        Table with a 'fatality_rate' column, indexed by country. When omitted,
        the notebook-level `fatalities` frame is used, as before.

    Returns
    -------
    int
        Zero-based position as reported by ``Index.get_loc``. Note that
        `get_fatality_and_rank` in this notebook reports 1-based ranks instead.

    Raises
    ------
    KeyError
        If `country` is not present in the index of the frame.
    """
    if frame is None:
        # fall back to the notebook global to stay compatible with existing calls
        frame = fatalities
    ordered_index = frame.sort_values(by='fatality_rate', ascending=asc).index
    return ordered_index.get_loc(country)


In [16]:
# country labels of `fatalities` ordered by ascending fatality rate (position 0 = lowest rate);
# computed once here so that get_fatality_and_rank does not have to re-sort on every call
ranks_asc = fatalities.sort_values(by = 'fatality_rate', ascending = True).index

def get_fatality_and_rank(country: str) -> pd.Series:
    """Collect the fatality rate of a country together with its 1-based ranks.

    Reads the notebook-level `fatalities` frame and the `ranks_asc` index.

    Parameters
    ----------
    country : str
        Index label of the country in `fatalities`.

    Returns
    -------
    pd.Series
        Entries 'Country_Region', 'fatality_rate', 'rank_asc' (1 = lowest rate)
        and 'rank_dsc' (1 = highest rate).
    """
    rate = fatalities.loc[country, 'fatality_rate']

    # get_loc is zero-based, ranks are reported starting at 1
    position_from_lowest = ranks_asc.get_loc(country) + 1
    # mirror the ascending rank to obtain the rank counted from the highest rate
    position_from_highest = len(ranks_asc) - position_from_lowest + 1

    record = {
        'Country_Region': country,
        'fatality_rate': rate,
        'rank_asc': position_from_lowest,
        'rank_dsc': position_from_highest,
    }
    return pd.Series(record)


In [17]:
# test the function get_fatality_and_rank on a single country

get_fatality_and_rank('Italy')

Country_Region      Italy
fatality_rate     0.11034
rank_asc               96
rank_dsc                1
dtype: object

In [18]:
# how do these countries score in the fatality_rate?

cols = ['Country_Region', 'fatality_rate', 'rank_asc', 'rank_dsc']
countries = ['Austria', 'China', 'Germany', 'Italy', 'Singapore', 'South Africa', 'US']

# Build all rows first and create the frame once: DataFrame.append copied the whole
# frame on every iteration and no longer exists in pandas 2.0 and later.
df_fatality_scores = pd.DataFrame(
    [get_fatality_and_rank(country) for country in countries],
    columns=cols,
)

df_fatality_scores

Unnamed: 0,Country_Region,fatality_rate,rank_asc,rank_dsc
0,Austria,0.009786,32,65
1,China,0.040233,79,18
2,Germany,0.008584,27,70
3,Italy,0.11034,96,1
4,Singapore,0.003555,9,88
5,South Africa,0.001563,1,96
6,US,0.017511,46,51


In [19]:
# basic statistics (count, mean, std, quartiles, min/max) of the fatality rate in `fatalities`
fatalities.fatality_rate.describe()

count    96.000000
mean      0.024676
std       0.023249
min       0.001563
25%       0.008315
50%       0.017777
75%       0.030575
max       0.110340
Name: fatality_rate, dtype: float64

In [20]:
# median and mode of fatality rate
# (mode() returns a Series, so it is printed together with its index and dtype)

fatality_median = fatalities['fatality_rate'].median()
fatality_mode = fatalities['fatality_rate'].mode()

print("Fatality median is {} and mode is {}".format(fatality_median, fatality_mode))


Fatality median is 0.01777744701800987 and mode is 0    0.018868
dtype: float64


In [21]:
# the mode above does not make sense for a continuous value, let's fix it.

# first, assign each fatality rate to one of 10 bins
bins = pd.cut(fatalities['fatality_rate'], bins=10, include_lowest=True)

bins

# TODO: I'd like to see the content of each bin, what to do?

Country_Region
Italy                            (0.0995, 0.11]
San Marino                     (0.0886, 0.0995]
Indonesia                      (0.0886, 0.0995]
Spain                          (0.0777, 0.0886]
Iraq                           (0.0668, 0.0777]
                             ...               
Chile           (0.0004499999999999999, 0.0124]
Iceland         (0.0004499999999999999, 0.0124]
New Zealand     (0.0004499999999999999, 0.0124]
Qatar           (0.0004499999999999999, 0.0124]
South Africa    (0.0004499999999999999, 0.0124]
Name: fatality_rate, Length: 96, dtype: category
Categories (10, interval[float64]): [(0.0004499999999999999, 0.0124] < (0.0124, 0.0233] < (0.0233, 0.0342] < (0.0342, 0.0451] ... (0.0668, 0.0777] < (0.0777, 0.0886] < (0.0886, 0.0995] < (0.0995, 0.11]]