# COVID-19: United States Confirmed Cases & Deaths 

The following data cleaning and merging is performed to load the data into Power BI for further visualization and analysis.

In [1]:
# Packages / libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

The data collected for this analysis is operated by the Johns Hopkins University Center for Systems Science and Engineering (https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series)

In [2]:
# load raw data from https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
confirmed = pd.read_csv("C:/Users/garcr/Desktop/time_series_covid19_confirmed_US.csv")
deaths = pd.read_csv("C:/Users/garcr/Desktop/time_series_covid19_deaths_US.csv")

# Confirm that the dataframes have the same number of rows (countries) and columns (dates)
# Deaths dataframe has an extra column 'Population'
print('The shape of confirmed is:', confirmed.shape)
print('The shape of deaths is:', deaths.shape)

confirmed.head()
#deaths.head()

The shape of confirmed is: (3342, 492)
The shape of deaths is: (3342, 493)


Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21,5/14/21,5/15/21,5/16/21
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,6918,6918,6920,6920,6926,6928,6938,6971,7001,7005
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,21107,21123,21131,21135,21154,21170,21191,21290,21392,21411
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,2307,2307,2308,2308,2310,2314,2317,2319,2320,2320
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,2604,2605,2607,2607,2609,2612,2615,2630,2645,2647
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,6651,6656,6660,6661,6678,6680,6694,6750,6771,6773


In [3]:
# Drop unnecessary columns
confirmed.drop(['iso2','iso3','code3','FIPS','Combined_Key'], axis=1, inplace=True)
deaths.drop(['iso2','iso3','code3','FIPS','Combined_Key'], axis=1, inplace=True)

#confirmed.head()
#deaths.head()

In [4]:
# Melt the date columns into one column
confirmed2 = pd.melt(confirmed, id_vars=[
                     'UID', 'Admin2', 'Province_State', 'Country_Region', 'Lat', 'Long_'], var_name=['Date'])
deaths2 = pd.melt(deaths, id_vars=['UID', 'Admin2', 'Province_State',
                                   'Country_Region', 'Lat', 'Long_', 'Population'], var_name=['Date'])

print('The shape of confirmed is:', confirmed2.shape)
print('The shape of deaths is:', deaths2.shape)

# confirmed2.head()

The shape of confirmed is: (1607502, 8)
The shape of deaths is: (1607502, 9)


In [5]:
# Convert Date column into datetime objects (xxxx-xx-xx)
confirmed2['Date'] = pd.to_datetime(confirmed2['Date'])
deaths2['Date'] = pd.to_datetime(deaths2['Date'])

In [6]:
# Replace column headings with detailed fields
confirmed2.columns = confirmed2.columns.str.replace('value', 'Confirmed')
confirmed2.columns = confirmed2.columns.str.replace('Admin2', 'County')
deaths2.columns = deaths2.columns.str.replace('value', 'Deaths')
deaths2.columns = deaths2.columns.str.replace('Admin2', 'County')

# Investigate NULLs before join
print(confirmed2.isnull().sum())
print(deaths2.isnull().sum())

UID                  0
County            2886
Province_State       0
Country_Region       0
Lat                  0
Long_                0
Date                 0
Confirmed            0
dtype: int64
UID                  0
County            2886
Province_State       0
Country_Region       0
Lat                  0
Long_                0
Population           0
Date                 0
Deaths               0
dtype: int64


The null county values are due to the the District of Columbia, the 5 inhabited US territories (American Samoa, Guam, Northern Mariana Islands, Puerto Rico, US Virgin Islands) and the Grand Princess and Diamond Princess cruise ships having no counties. The null county value will be filled with the 'Province_State' name of the corresponding territory or cruise ship. 

In [7]:
# Join the dataframes
covid = confirmed2.merge(deaths2[['UID', 'Province_State', 'Country_Region', 'Date', 'Deaths', 'Population']],
                         how='outer',
                         left_on=['UID', 'Province_State',
                                  'Country_Region', 'Date'],
                         right_on=['UID', 'Province_State', 'Country_Region', 'Date'])

# Investigate the shape of the dataframe after the join
print('\nThe shape of confirmed is:', confirmed2.shape)
print('The shape of deaths is:', deaths2.shape)
print('The shape of the joined dataframe is:', covid.shape)

# print(covid.isnull().sum())
# covid.tail()


The shape of confirmed is: (1607502, 8)
The shape of deaths is: (1607502, 9)
The shape of the joined dataframe is: (1607502, 10)


The joined dataframe "covid" has 10 columns because Deaths and Population has been added to the confirmed dataframe. 

In [8]:
# Fill County NaN values with Province_State values (Applies only to DC, US territories, and cruise ships)
covid['County'].fillna(covid['Province_State'], inplace=True)

# covid.head()
covid.isnull().sum()

UID               0
County            0
Province_State    0
Country_Region    0
Lat               0
Long_             0
Date              0
Confirmed         0
Deaths            0
Population        0
dtype: int64

In [9]:
# Add Month-Year column
covid['Month-Year'] = covid['Date'].dt.strftime('%b-%Y')
# covid.tail()

In [10]:
# Copy df
temp = covid.copy()

# Create columns for previous date's COVID-19 cases to create daily aggregates and a running total
temp['Current Date'] = covid['Date'] + pd.Timedelta(days=1)
temp.rename(columns={'Confirmed': 'Confirmed - 1',
                     'Deaths': 'Deaths - 1', 'Date': 'Date - 1'}, inplace=True)

# Perform left join on DFs
covid_19 = covid.merge(temp[['UID', 'Province_State', 'Country_Region', 'Confirmed - 1', 'Deaths - 1',
                             'Current Date', 'Date - 1']], how='left',
                       left_on=['UID', 'Province_State',
                                'Country_Region', 'Date'],
                       right_on=['UID', 'Province_State', 'Country_Region', 'Current Date'])
print(covid_19.shape)
# covid_19.head()

(1607502, 15)


In [11]:
# Calculate the daily numbers for confirmed cases & deaths (current aggregate - previous aggregate = daily confirmed)
covid_19['Daily Confirmed'] = covid_19['Confirmed'] - covid_19['Confirmed - 1']
covid_19['Daily Deaths'] = covid_19['Deaths'] - covid_19['Deaths - 1']

print(covid_19.shape)
# covid_19.head()

(1607502, 17)


In [12]:
# Include daily numbers for the first day of data where there is no previous date available (2020-01-22)
covid_19['Daily Confirmed'].loc[covid_19['Date'] == '2020-01-22'] = covid_19['Confirmed']
covid_19['Daily Deaths'].loc[covid_19['Date'] == '2020-01-22'] = covid_19['Deaths']

# Delete unnecessary columns
covid_19.drop(['Confirmed - 1', 'Deaths - 1', 'Current Date', 'Date - 1'], axis=1, inplace=True)
print(covid_19.shape)
#covid_19.head()

(1607502, 13)


In [13]:
# Add Week Number column 
covid_19['Week Number'] = covid_19['Date'].dt.strftime('%Y-%W')

# Calculate weekly sum by state (Week is Monday-Sunday)
weekly_covid = covid_19.groupby(['Province_State', 'Week Number']).agg(
    {'Daily Confirmed': 'sum', 'Daily Deaths': 'sum'}).reset_index()

# Rename 'Daily' metrics to 'Weekly' metrics
weekly_covid.columns = weekly_covid.columns.str.replace(
    'Daily Confirmed', 'Weekly Confirmed')
weekly_covid.columns = weekly_covid.columns.str.replace(
    'Daily Deaths', 'Weekly Deaths')

# View most recent weekly numbers by state/territory
weekly_covid[weekly_covid['Week Number'] == weekly_covid['Week Number'].max()].head()

Unnamed: 0,Province_State,Week Number,Weekly Confirmed,Weekly Deaths
69,Alabama,2021-19,9095.0,60.0
139,Alaska,2021-19,584.0,4.0
209,American Samoa,2021-19,0.0,0.0
279,Arizona,2021-19,4148.0,50.0
349,Arkansas,2021-19,1272.0,33.0


In [14]:
# Calculate the weekly percentage change in confirmed cases and deaths for each state
weekly_change = weekly_covid.groupby(['Province_State', 'Week Number']).agg(
    {'Weekly Confirmed': 'sum', 'Weekly Deaths': 'sum'}).pct_change().reset_index()

In [15]:
# Add the weekly percentage change calculated above to the weekly_covid dataframe
weekly_covid[['Weekly Confirmed % Change', 'Weekly Deaths % Change']
             ] = weekly_change[['Weekly Confirmed', 'Weekly Deaths']]

# Replace NaN with 0 (zero percent change from prior week)
weekly_covid.replace(np.nan, 0, inplace=True)
weekly_covid.tail()

Unnamed: 0,Province_State,Week Number,Weekly Confirmed,Weekly Deaths,Weekly Confirmed % Change,Weekly Deaths % Change
4055,Wyoming,2021-15,394.0,2.0,-0.075117,1.0
4056,Wyoming,2021-16,429.0,2.0,0.088832,0.0
4057,Wyoming,2021-17,446.0,2.0,0.039627,0.0
4058,Wyoming,2021-18,481.0,3.0,0.078475,0.5
4059,Wyoming,2021-19,456.0,2.0,-0.051975,-0.333333


In [16]:
# Check the most recent US daily confirmed cases and deaths
daily_sum = covid_19.groupby(['Date']).sum()[['Daily Confirmed','Daily Deaths']]
#print(daily_sum.tail())

# Check the most recent US weekly confirmed cases and deaths 
weekly_sum = weekly_covid.groupby('Week Number').sum()[['Weekly Confirmed','Weekly Deaths']]
print(weekly_sum.tail())

             Weekly Confirmed  Weekly Deaths
Week Number                                 
2021-15              472154.0         5071.0
2021-16              407147.0         4933.0
2021-17              344463.0         4737.0
2021-18              286716.0         4538.0
2021-19              232489.0         4116.0


In [17]:
# Export Data as csv to load into Power BI
covid_19.to_csv('UScovid', sep='\t')
weekly_covid.to_csv('weekly_covid', sep='\t')