In [1]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime

# COVID-19

In [2]:
# Location of the raw COVID-19 cases CSV, relative to this notebook
covid19_path = '../Datasets/Pandemic/COVID-19 Cases.csv'

In [3]:
# Load the raw CSV into a DataFrame and preview the first five rows
covid19_df = pd.read_csv(filepath_or_buffer=covid19_path)
covid19_df.head(n=5)


Unnamed: 0,Case_Type,Cases,Difference,Date,Country_Region,Province_State,Admin2,Combined_Key,FIPS,Lat,Long,Table_Names,Prep_Flow_Runtime
0,Deaths,0,0,2/27/2020,Bahamas,,,,,25.0343,-77.3963,Time Series,4/7/2020 8:34:39 PM
1,Confirmed,16,0,2/16/2020,Germany,,,,,51.0,9.0,Time Series,4/7/2020 8:34:39 PM
2,Deaths,0,0,1/31/2020,Canada,Alberta,,,,53.9333,-116.5765,Time Series,4/7/2020 8:34:39 PM
3,Deaths,0,0,3/4/2020,Australia,Queensland,,,,-28.0167,153.4,Time Series,4/7/2020 8:34:39 PM
4,Confirmed,0,0,1/25/2020,Suriname,,,,,3.9193,-56.0278,Time Series,4/7/2020 8:34:39 PM


In [4]:
# Keep only the columns needed downstream and give some of them clearer names.
kept_columns = [
    'Date',
    'Case_Type',
    'Cases',
    'Difference',
    'Country_Region',
    'Province_State',
    'Admin2',
]
renamed_columns = {
    'Cases': 'Cumulative_Cases',
    'Difference': 'Daily_Cases',
    'Country_Region': 'Country',
    'Province_State': 'State',
    'Admin2': 'County',
}
covid19_df_2 = covid19_df.loc[:, kept_columns].rename(columns=renamed_columns)
covid19_df_2.head()

Unnamed: 0,Date,Case_Type,Cumulative_Cases,Daily_Cases,Country,State,County
0,2/27/2020,Deaths,0,0,Bahamas,,
1,2/16/2020,Confirmed,16,0,Germany,,
2,1/31/2020,Deaths,0,0,Canada,Alberta,
3,3/4/2020,Deaths,0,0,Australia,Queensland,
4,1/25/2020,Confirmed,0,0,Suriname,,


In [5]:
# Give missing grouping labels the placeholder 'None' so their rows are not
# dropped when the frame is pivoted (pivot_table discards NaN group keys).
# Only the columns the pivot groups on are filled: a blanket fillna('None')
# would also write the string 'None' into the numeric count columns and
# break the later sum of Daily_Cases.
pivot_key_columns = ['Date', 'Case_Type', 'Country', 'State', 'County']
covid19_df_2 = covid19_df_2.fillna({column: 'None' for column in pivot_key_columns})
covid19_df_2.head()

Unnamed: 0,Date,Case_Type,Cumulative_Cases,Daily_Cases,Country,State,County
0,2/27/2020,Deaths,0,0,Bahamas,,
1,2/16/2020,Confirmed,16,0,Germany,,
2,1/31/2020,Deaths,0,0,Canada,Alberta,
3,3/4/2020,Deaths,0,0,Australia,Queensland,
4,1/25/2020,Confirmed,0,0,Suriname,,


In [6]:
# Pivot so that each Case_Type value (Confirmed, Deaths) becomes its own column,
# holding the summed Daily_Cases for every (Date, Country, State, County) group.
# NOTE(review): Date is still a plain string here (the CSV is read without date
# parsing), so the pivoted index is ordered lexicographically, not chronologically.
covid19_pivoted_df = covid19_df_2.pivot_table(
    index=['Date', 'Country', 'State', 'County'],
    columns='Case_Type',
    values='Daily_Cases',
    # Use the string alias: passing the builtin `sum` raises a FutureWarning in
    # pandas >= 2.1 and is slated to change behaviour in a future release.
    aggfunc='sum',
)
covid19_pivoted_df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Case_Type,Confirmed,Deaths
Date,Country,State,County,Unnamed: 4_level_1,Unnamed: 5_level_1
1/22/2020,Afghanistan,,,0,0
1/22/2020,Albania,,,0,0
1/22/2020,Algeria,,,0,0
1/22/2020,Andorra,,,0,0
1/22/2020,Angola,,,0,0


In [7]:
# Turn the (Date, Country, State, County) index levels back into ordinary columns.
# The result is rebound to the same name instead of using inplace=True, so the
# frame object shown by the pivot cell is not mutated behind the reader's back.
# Re-running this cell on its own would add a spurious 'index' column; re-run the
# pivot cell first.
covid19_pivoted_df = covid19_pivoted_df.reset_index()
covid19_pivoted_df.head()

Case_Type,Date,Country,State,County,Confirmed,Deaths
0,1/22/2020,Afghanistan,,,0,0
1,1/22/2020,Albania,,,0,0
2,1/22/2020,Algeria,,,0,0
3,1/22/2020,Andorra,,,0,0
4,1/22/2020,Angola,,,0,0


In [8]:
# Drop every row whose Confirmed and Deaths counts are both zero, then renumber
# the index from 0. A missing (NaN) count compares as "not equal to 0", so rows
# containing one are kept.
cols_of_interest = ['Confirmed', 'Deaths']
has_nonzero_count = covid19_pivoted_df[cols_of_interest].ne(0).any(axis=1)
covid19_cleaned_df = covid19_pivoted_df.loc[has_nonzero_count].reset_index(drop=True)
covid19_cleaned_df.head()

Case_Type,Date,Country,State,County,Confirmed,Deaths
0,1/23/2020,China,Anhui,,8,0
1,1/23/2020,China,Beijing,,8,0
2,1/23/2020,China,Chongqing,,3,0
3,1/23/2020,China,Fujian,,4,0
4,1/23/2020,China,Gansu,,2,0


In [9]:
# Write the cleaned frame to a new CSV file, leaving out the row index
clean_csv_path = '../Datasets/Pandemic/COVID19-clean.csv'
covid19_cleaned_df.to_csv(clean_csv_path, index=False)