# Covid-19 Data Wrangling in Python

In [5]:
import pandas as pd
import numpy as np

print('pandas version: ', pd.__version__)
print('numpy version: ', np.version.version)

pandas version:  1.0.1
numpy version:  1.18.1


In [19]:
covid_daily_df = pd.read_csv('data/2020-03-26.csv')

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-26 23:48:35,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-26 23:48:35,30.295065,-92.414197,3,0,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-26 23:48:35,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-26 23:48:35,43.452658,-116.241552,39,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-26 23:48:35,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [20]:
covid_daily_df.dtypes

FIPS              float64
Admin2             object
Province_State     object
Country_Region     object
Last_Update        object
Lat               float64
Long_             float64
Confirmed           int64
Deaths              int64
Recovered           int64
Active              int64
Combined_Key       object
dtype: object

In [21]:
covid_daily_df.shape

(3421, 12)

In [23]:
# convert the FIPS County Code column to type of integer

covid_daily_df.FIPS = covid_daily_df.FIPS.fillna(-1) # get rid of NA 
covid_daily_df.FIPS = covid_daily_df.FIPS.astype(int)

covid_daily_df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001,Abbeville,South Carolina,US,2020-03-26 23:48:35,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001,Acadia,Louisiana,US,2020-03-26 23:48:35,30.295065,-92.414197,3,0,0,0,"Acadia, Louisiana, US"
2,51001,Accomack,Virginia,US,2020-03-26 23:48:35,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001,Ada,Idaho,US,2020-03-26 23:48:35,43.452658,-116.241552,39,0,0,0,"Ada, Idaho, US"
4,19001,Adair,Iowa,US,2020-03-26 23:48:35,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [25]:
# show all unique Country_Region codes

countries = covid_daily_df.Country_Region.unique()
print(countries)
len(countries)

['US' 'Canada' 'China' 'Netherlands' 'Australia' 'United Kingdom'
 'Denmark' 'France' 'Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola'
 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Austria' 'Azerbaijan'
 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize'
 'Benin' 'Bhutan' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Brunei'
 'Bulgaria' 'Burkina Faso' 'Cabo Verde' 'Cambodia' 'Cameroon'
 'Central African Republic' 'Chad' 'Chile' 'Colombia'
 'Congo (Brazzaville)' 'Congo (Kinshasa)' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Cyprus' 'Czechia' 'Diamond Princess' 'Djibouti'
 'Dominica' 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia' 'Fiji'
 'Finland' 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada'
 'Guatemala' 'Guinea' 'Guinea-Bissau' 'Guyana' 'Haiti' 'Holy See'
 'Honduras' 'Hungary' 'Iceland' 'India' 'Indonesia' 'Iran' 'Iraq'
 'Ireland' 'Israel' 'Italy' 'Jamaica' 'Japan' 'Jorda

175

In [30]:
# statistics grouped by countries
covid_daily_df.groupby('Country_Region').agg({'Confirmed': np.sum})

Unnamed: 0_level_0,Confirmed,Deaths
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,94,4
Albania,174,6
Algeria,367,25
Andorra,224,3
Angola,4,0
...,...,...
Venezuela,107,0
Vietnam,153,0
West Bank and Gaza,84,1
Zambia,16,0


In [31]:
# construct a dataframe with five columns: 
# Country_Region (index), Confirmed, Deaths, Recovered, Active
df_country_cases = covid_daily_df.groupby('Country_Region').agg({'Confirmed': np.sum, 'Deaths': np.sum, 'Recovered': np.sum, 'Active': np.sum})

df_country_cases

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,94,4,2,88
Albania,174,6,17,151
Algeria,367,25,29,313
Andorra,224,3,1,220
Angola,4,0,0,4
...,...,...,...,...
Venezuela,107,0,15,92
Vietnam,153,0,20,133
West Bank and Gaza,84,1,17,66
Zambia,16,0,0,16


In [39]:
df_country_cases.dtypes

Confirmed    int64
Deaths       int64
Recovered    int64
Active       int64
dtype: object

In [41]:
# show counties that do not have any corona patients yet
df_country_cases.loc[df_country_cases['Confirmed'] == 0]


Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [52]:
# confirmed that all countries in this csv file have confirmed cases. 
# now we can calculate the fatality rate

df_country_cases['fatality_rate'] = df_country_cases.Deaths / df_country_cases.Confirmed

fatalities = df_country_cases.where((df_country_cases.Deaths > 0) & (df_country_cases.Confirmed > 1000)) \
                                .sort_values(by = 'fatality_rate', ascending = False)

fatalities[:25]

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,fatality_rate
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Italy,80589.0,8215.0,10361.0,62013.0,0.101937
Iran,29406.0,2234.0,10457.0,16715.0,0.075971
Spain,57786.0,4365.0,7015.0,46406.0,0.075537
Netherlands,7468.0,435.0,6.0,7027.0,0.058249
France,29551.0,1698.0,4955.0,22898.0,0.05746
United Kingdom,11812.0,580.0,150.0,11082.0,0.049103
China,81782.0,3291.0,74181.0,4310.0,0.040241
Belgium,6235.0,220.0,675.0,5340.0,0.035285
Japan,1387.0,47.0,359.0,981.0,0.033886
Sweden,2840.0,77.0,16.0,2747.0,0.027113
