# COVID-19: United States Confirmed Cases & Deaths 

The following data cleaning and merging steps prepare the data for loading into Power BI for further visualization and analysis.

In [141]:
# Packages / libraries
import os 
import numpy as np 
import pandas as pd
from datetime import datetime

import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest of the notebook,
# including pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

The data used in this analysis is maintained by the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE): https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

In [142]:
# load raw data from https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
# NOTE(review): both paths are absolute locations on one user's Windows desktop, so this cell
# only runs on a machine that has the two CSV files saved at exactly these paths.
confirmed = pd.read_csv("C:/Users/garcr/Desktop/time_series_covid19_confirmed_US.csv")
deaths = pd.read_csv("C:/Users/garcr/Desktop/time_series_covid19_deaths_US.csv")

# Confirm that the dataframes have the same number of rows (one per US county / territory /
# other reporting unit, not countries) and the same date columns
# Deaths dataframe has an extra column 'Population'
print('The shape of confirmed is:', confirmed.shape)
print('The shape of deaths is:', deaths.shape)

# Preview the wide layout: identifier columns followed by one column per reporting date
confirmed.head()

The shape of confirmed is: (3262, 107)
The shape of deaths is: (3262, 108)


Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20,4/1/20,4/2/20,4/3/20,4/4/20,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20,4/19/20,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20
0,16.0,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,316.0,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,5,12,14,15,27,29,32,37,45,51,55,56,58,69,77,82,84,93,112,113,121,121,128,130,133,133,133,133,135,135,136,136,136,136,136,136,139,141,141,141
2,580.0,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,6,6,8,8,8,8,8,11,11,11,11,11,11,11,13,13,13,14,14,14,14,14,14,14,14,14
3,630.0,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,5,5,5,14,21,23,31,39,51,64,79,100,127,174,239,286,316,316,452,475,513,573,620,683,725,788,897,903,923,974,1043,1068,1118,1213,1252,1298,1252,1416,1276,1307,1371
4,850.0,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,3,3,6,6,7,17,17,17,19,22,23,30,30,30,30,37,40,42,43,43,45,45,50,51,51,51,51,51,51,51,53,53,53,53,54,54,54,55,57


In [143]:
# Remove the identifier columns that are not needed downstream.
# Both frames share these columns, so the same in-place drop is applied to each.
for frame in (confirmed, deaths):
    frame.drop(columns=['iso2', 'iso3', 'code3', 'FIPS', 'Combined_Key'], inplace=True)

In [144]:
# Reshape from wide (one column per date) to long (one row per location per date).
# Every column not listed in id_vars is treated as a date column: its header goes into 'Date'
# and its cell goes into the default 'value' column.
# var_name is passed as a scalar string, as the pandas API documents; a list-like var_name is
# only meaningful for MultiIndex columns and is rejected by newer pandas releases.
confirmed2 = confirmed.melt(
    id_vars=['UID', 'Admin2', 'Province_State', 'Country_Region', 'Lat', 'Long_'],
    var_name='Date',
)
# The deaths frame carries the extra 'Population' identifier column
deaths2 = deaths.melt(
    id_vars=['UID', 'Admin2', 'Province_State', 'Country_Region', 'Lat', 'Long_', 'Population'],
    var_name='Date',
)

print('The shape of confirmed is:', confirmed2.shape)
print('The shape of deaths is:', deaths2.shape)

The shape of confirmed is: (313152, 8)
The shape of deaths is: (313152, 9)


In [145]:
# Convert the Date column (melted column headers such as '1/22/20') into datetime objects.
# The month/day/two-digit-year format is stated explicitly instead of being inferred, so
# parsing is deterministic and an unexpected header raises an error rather than being guessed at.
confirmed2['Date'] = pd.to_datetime(confirmed2['Date'], format='%m/%d/%y')
deaths2['Date'] = pd.to_datetime(deaths2['Date'], format='%m/%d/%y')

confirmed2.head()

Unnamed: 0,UID,Admin2,Province_State,Country_Region,Lat,Long_,Date,value
0,16.0,,American Samoa,US,-14.271,-170.132,2020-01-22,0
1,316.0,,Guam,US,13.4443,144.7937,2020-01-22,0
2,580.0,,Northern Mariana Islands,US,15.0979,145.6739,2020-01-22,0
3,630.0,,Puerto Rico,US,18.2208,-66.5901,2020-01-22,0
4,850.0,,Virgin Islands,US,18.3358,-64.8963,2020-01-22,0


In [146]:
# Replace column headings with descriptive names.
# rename() matches whole column names only, unlike columns.str.replace(), which substitutes
# the pattern wherever it appears inside any header.
confirmed2 = confirmed2.rename(columns={'value': 'Confirmed', 'Admin2': 'County'})
deaths2 = deaths2.rename(columns={'value': 'Deaths', 'Admin2': 'County'})

# Investigate NULLs before join
print(confirmed2.isnull().sum())
print(deaths2.isnull().sum())

UID                96
County            672
Province_State      0
Country_Region      0
Lat                96
Long_              96
Date                0
Confirmed           0
dtype: int64
UID                96
County            672
Province_State      0
Country_Region      0
Lat                96
Long_              96
Population          0
Date                0
Deaths              0
dtype: int64


The null county values are a result of the District of Columbia, and the 5 inhabited US territories (American Samoa, Guam, Northern Mariana Islands, Puerto Rico, US Virgin Islands) not having counties, in addition to the Grand Princess and Diamond Princess cruise ships.

The null UID, Lat, and Long_ values all come from the "Southwest" county entries in Utah. Every "Southwest" row is missing these three fields, so those rows are removed from both the Confirmed and Deaths dataframes.

In [147]:
# Keep the 'Southwest' rows aside in case they need to be inspected
southwest_c = confirmed2.loc[confirmed2['County'].eq('Southwest')]
southwest_d = deaths2.loc[deaths2['County'].eq('Southwest')]

# Keep every row whose County is not 'Southwest'; rows with a missing County are retained
confirmed2 = confirmed2.loc[~confirmed2['County'].eq('Southwest')]
deaths2 = deaths2.loc[~deaths2['County'].eq('Southwest')]

# Re-count nulls after the removal
print(confirmed2.isnull().sum())
print(deaths2.isnull().sum())

UID                 0
County            672
Province_State      0
Country_Region      0
Lat                 0
Long_               0
Date                0
Confirmed           0
dtype: int64
UID                 0
County            672
Province_State      0
Country_Region      0
Lat                 0
Long_               0
Population          0
Date                0
Deaths              0
dtype: int64


In [148]:
# Combine confirmed cases with deaths: an outer join on location + date that brings the
# 'Population' and 'Deaths' columns across from the deaths frame.
covid = pd.merge(
    confirmed2,
    deaths2[['UID', 'Province_State', 'Country_Region', 'Date', 'Population', 'Deaths']],
    how='outer',
    on=['UID', 'Province_State', 'Country_Region', 'Date'],
)

# Compare row/column counts before and after the join
print('\nThe shape of confirmed is:', confirmed2.shape)
print('The shape of deaths is:', deaths2.shape)
print('The shape of the joined dataframe is:', covid.shape)


The shape of confirmed is: (313056, 8)
The shape of deaths is: (313056, 9)
The shape of the joined dataframe is: (313056, 10)


The joined dataframe has 10 columns because Deaths and Population have been added to the confirmed dataframe. 

In [149]:
# Fill County NaN values with Province_State values (applies only to DC, US territories, and cruise ships).
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on the selected
# column is a chained operation that pandas deprecates and that leaves `covid` unchanged once
# Copy-on-Write is enabled.
covid['County'] = covid['County'].fillna(covid['Province_State'])

# Null count per column after the fill
covid.isnull().sum()

UID               0
County            0
Province_State    0
Country_Region    0
Lat               0
Long_             0
Date              0
Confirmed         0
Population        0
Deaths            0
dtype: int64

In [150]:
# Add Month-Year column
# strftime('%b-%Y') renders each date as abbreviated month name + 4-digit year (e.g. 'Jan-2020');
# the resulting column holds plain strings, not datetimes.
covid['Month-Year'] = covid['Date'].dt.strftime('%b-%Y')
#covid.tail()

In [151]:
# Build a shifted copy of the data: each row gets a 'Temp Date' one day after its own date, and
# its totals are relabelled as the "previous day" values ('Confirmed - 1', 'Deaths - 1', 'Date - 1').
temp = (
    covid
    .assign(**{'Temp Date': covid['Date'] + pd.Timedelta(days=1)})
    .rename(columns={'Confirmed': 'Confirmed - 1',
                     'Deaths': 'Deaths - 1',
                     'Date': 'Date - 1'})
)

# Left join on location with Date == Temp Date, so each row picks up the totals recorded one day
# earlier; rows with no earlier record keep NaN / NaT in the joined columns.
covid_19 = pd.merge(
    covid,
    temp[['UID', 'Province_State', 'Country_Region',
          'Confirmed - 1', 'Deaths - 1', 'Temp Date', 'Date - 1']],
    how='left',
    left_on=['UID', 'Province_State', 'Country_Region', 'Date'],
    right_on=['UID', 'Province_State', 'Country_Region', 'Temp Date'],
)
print(covid_19.shape)
covid_19.head()

(313056, 15)


Unnamed: 0,UID,County,Province_State,Country_Region,Lat,Long_,Date,Confirmed,Population,Deaths,Month-Year,Confirmed - 1,Deaths - 1,Temp Date,Date - 1
0,16.0,American Samoa,American Samoa,US,-14.271,-170.132,2020-01-22,0,55641,0,Jan-2020,,,NaT,NaT
1,316.0,Guam,Guam,US,13.4443,144.7937,2020-01-22,0,164229,0,Jan-2020,,,NaT,NaT
2,580.0,Northern Mariana Islands,Northern Mariana Islands,US,15.0979,145.6739,2020-01-22,0,55144,0,Jan-2020,,,NaT,NaT
3,630.0,Puerto Rico,Puerto Rico,US,18.2208,-66.5901,2020-01-22,0,2933408,0,Jan-2020,,,NaT,NaT
4,850.0,Virgin Islands,Virgin Islands,US,18.3358,-64.8963,2020-01-22,0,107268,0,Jan-2020,,,NaT,NaT


In [152]:
# Daily figures = cumulative total for the day minus the cumulative total of the previous day.
# Rows without a previous-day value produce NaN here.
covid_19 = covid_19.assign(**{
    'Daily Confirmed': covid_19['Confirmed'].sub(covid_19['Confirmed - 1']),
    'Daily Deaths': covid_19['Deaths'].sub(covid_19['Deaths - 1']),
})

print(covid_19.shape)
covid_19.head()

(313056, 17)


Unnamed: 0,UID,County,Province_State,Country_Region,Lat,Long_,Date,Confirmed,Population,Deaths,Month-Year,Confirmed - 1,Deaths - 1,Temp Date,Date - 1,Daily Confirmed,Daily Deaths
0,16.0,American Samoa,American Samoa,US,-14.271,-170.132,2020-01-22,0,55641,0,Jan-2020,,,NaT,NaT,,
1,316.0,Guam,Guam,US,13.4443,144.7937,2020-01-22,0,164229,0,Jan-2020,,,NaT,NaT,,
2,580.0,Northern Mariana Islands,Northern Mariana Islands,US,15.0979,145.6739,2020-01-22,0,55144,0,Jan-2020,,,NaT,NaT,,
3,630.0,Puerto Rico,Puerto Rico,US,18.2208,-66.5901,2020-01-22,0,2933408,0,Jan-2020,,,NaT,NaT,,
4,850.0,Virgin Islands,Virgin Islands,US,18.3358,-64.8963,2020-01-22,0,107268,0,Jan-2020,,,NaT,NaT,,


In [153]:
# Include daily numbers for the first day of data where there is no previous date available (2020-01-22):
# on that day the daily figure is simply the cumulative total.
# The write uses a single .loc[row_mask, column] call. The chained form
# covid_19[col].loc[mask] = ... assigns into an intermediate Series, which pandas warns about
# and which no longer updates the frame under Copy-on-Write.
is_first_day = covid_19['Date'] == '2020-01-22'
covid_19.loc[is_first_day, 'Daily Confirmed'] = covid_19.loc[is_first_day, 'Confirmed']
covid_19.loc[is_first_day, 'Daily Deaths'] = covid_19.loc[is_first_day, 'Deaths']

# Delete the helper columns that were only needed to compute the daily numbers
covid_19 = covid_19.drop(columns=['Confirmed - 1', 'Deaths - 1', 'Temp Date', 'Date - 1'])

covid_19.head()

Unnamed: 0,UID,County,Province_State,Country_Region,Lat,Long_,Date,Confirmed,Population,Deaths,Month-Year,Daily Confirmed,Daily Deaths
0,16.0,American Samoa,American Samoa,US,-14.271,-170.132,2020-01-22,0,55641,0,Jan-2020,0.0,0.0
1,316.0,Guam,Guam,US,13.4443,144.7937,2020-01-22,0,164229,0,Jan-2020,0.0,0.0
2,580.0,Northern Mariana Islands,Northern Mariana Islands,US,15.0979,145.6739,2020-01-22,0,55144,0,Jan-2020,0.0,0.0
3,630.0,Puerto Rico,Puerto Rico,US,18.2208,-66.5901,2020-01-22,0,2933408,0,Jan-2020,0.0,0.0
4,850.0,Virgin Islands,Virgin Islands,US,18.3358,-64.8963,2020-01-22,0,107268,0,Jan-2020,0.0,0.0


In [154]:
# Sanity check: total the daily figures across all locations per date and show the latest dates
daily_sum = covid_19.groupby('Date')[['Daily Confirmed']].sum()
print(daily_sum.tail())
daily_deaths = covid_19.groupby('Date')[['Daily Deaths']].sum()
print(daily_deaths.tail())

            Daily Confirmed
Date                       
2020-04-22          28127.0
2020-04-23          34020.0
2020-04-24          36188.0
2020-04-25          32796.0
2020-04-26          27631.0
            Daily Deaths
Date                    
2020-04-22        2326.0
2020-04-23        2312.0
2020-04-24        1769.0
2020-04-25        2262.0
2020-04-26        1126.0


In [155]:
# Export the data as a delimited text file to load into Power BI
# NOTE: the file is written to the current working directory under the name 'UScovid' (no file
# extension) and is tab-delimited (sep='\t'), not comma-delimited. The DataFrame index is
# written as the first column because the index argument is left at its default.
covid_19.to_csv('UScovid', sep='\t')