In [1]:
# Import the requests library.
import requests
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import numpy as np
import awswrangler as wr

In [14]:
# Create variables for each of the data files
nyt_data = 'https://group3data.s3.us-west-2.amazonaws.com/us-counties.csv'
hospital_data = 'https://group3data.s3.us-west-2.amazonaws.com/COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_Facility.csv'
vaccination_data = 'https://group3data.s3.us-west-2.amazonaws.com/COVID-19_Vaccinations_in_the_United_States_County.csv'

In [15]:
# read in data from S3 bucket and create dataframes
nyt_data_df = pd.read_csv(nyt_data)
hospital_data_df = pd.read_csv(hospital_data)
vaccination_data_df = pd.read_csv(vaccination_data)

# Data Cleaning:
## NYT Data

In [17]:
# Find the data types for the NYT data
nyt_data_df.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths    float64
dtype: object

In [20]:
# Convert the date column in the NYT data to a datetime object using pd.to_datetime
nyt_data_df['date'] = pd.to_datetime(nyt_data_df['date'])
nyt_data_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [22]:
# 1. Remove the data for dates before 7-31-20 and after 10-22-21
# 2. Bin the date column by weeks ending every friday
# 3. Sum the cases and deaths columns for each week binned

# Step 1:
nyt_data_filtered = nyt_data_df[nyt_data_df['date'] >= '2020-07-25']
nyt_data_filtered = nyt_data_filtered[nyt_data_filtered['date'] <= '2021-10-22']
nyt_data_filtered

Unnamed: 0,date,county,state,fips,cases,deaths
366652,2020-07-25,Autauga,Alabama,1001.0,932,21.0
366653,2020-07-25,Baldwin,Alabama,1003.0,2662,18.0
366654,2020-07-25,Barbour,Alabama,1005.0,552,4.0
366655,2020-07-25,Bibb,Alabama,1007.0,318,2.0
366656,2020-07-25,Blount,Alabama,1009.0,637,1.0
...,...,...,...,...,...,...
1842600,2021-10-22,Sweetwater,Wyoming,56037.0,7414,79.0
1842601,2021-10-22,Teton,Wyoming,56039.0,5119,14.0
1842602,2021-10-22,Uinta,Wyoming,56041.0,3781,26.0
1842603,2021-10-22,Washakie,Wyoming,56043.0,1676,33.0


In [27]:
# Step 2:
# Step 3:
columns_to_sum = [nyt_data_filtered['cases'], nyt_data_filtered['deaths']]
nyt_data_grouped = nyt_data_filtered.groupby(['fips', pd.Grouper(key='date', freq='W-FRI')]).agg({'cases':'sum','deaths':'sum'}).reset_index()
nyt_data_grouped

Unnamed: 0,fips,date,cases,deaths
0,1001.0,2020-07-31,6817,147.0
1,1001.0,2020-08-07,7503,150.0
2,1001.0,2020-08-14,8531,158.0
3,1001.0,2020-08-21,9009,161.0
4,1001.0,2020-08-28,9484,161.0
...,...,...,...,...
208977,78030.0,2021-09-24,24720,297.0
208978,78030.0,2021-10-01,24982,304.0
208979,78030.0,2021-10-08,25204,308.0
208980,78030.0,2021-10-15,25290,311.0


In [28]:
# Resort the nyt_data_grouped by date
nyt_data_grouped = nyt_data_grouped.sort_values(by='date')
nyt_data_grouped

Unnamed: 0,fips,date,cases,deaths
0,1001.0,2020-07-31,6817,147.0
167222,48119.0,2020-07-31,88,0.0
32676,13241.0,2020-07-31,1161,21.0
167287,48121.0,2020-07-31,44680,448.0
32611,13239.0,2020-07-31,185,7.0
...,...,...,...,...
179867,48507.0,2021-10-22,17555,371.0
51697,19017.0,2021-10-22,24875,472.0
142867,40151.0,2021-10-22,11280,163.0
51177,19001.0,2021-10-22,7311,241.0


In [29]:
# Set each column to a string value so we can concatonate them together 
nyt_data_grouped['fips'] = nyt_data_grouped['fips'].astype('str')
nyt_data_grouped['date'] = nyt_data_grouped['date'].astype('str')
nyt_data_grouped['fips'].dtype

dtype('O')

In [30]:
# Concatonate the date column and the fips column to create fips_date
nyt_data_grouped['fips_date'] = nyt_data_grouped['fips'] + nyt_data_grouped['date']
nyt_data_grouped

Unnamed: 0,fips,date,cases,deaths,fips_date
0,1001.0,2020-07-31,6817,147.0,1001.02020-07-31
167222,48119.0,2020-07-31,88,0.0,48119.02020-07-31
32676,13241.0,2020-07-31,1161,21.0,13241.02020-07-31
167287,48121.0,2020-07-31,44680,448.0,48121.02020-07-31
32611,13239.0,2020-07-31,185,7.0,13239.02020-07-31
...,...,...,...,...,...
179867,48507.0,2021-10-22,17555,371.0,48507.02021-10-22
51697,19017.0,2021-10-22,24875,472.0,19017.02021-10-22
142867,40151.0,2021-10-22,11280,163.0,40151.02021-10-22
51177,19001.0,2021-10-22,7311,241.0,19001.02021-10-22


## Hospital Data

In [31]:
# Convert the date column in the hospital data to a datetime object using pd.to_datetime
hospital_data_df['collection_week'] = pd.to_datetime(hospital_data_df['collection_week'])
# Check to verify the date is now a datetime object
hospital_data_df.dtypes

hospital_pk                                                        object
collection_week                                            datetime64[ns]
state                                                              object
ccn                                                                object
hospital_name                                                      object
                                                                ...      
total_personnel_covid_vaccinated_doses_one_7_day                  float64
total_personnel_covid_vaccinated_doses_all_7_day                  float64
previous_week_patients_covid_vaccinated_doses_one_7_day           float64
previous_week_patients_covid_vaccinated_doses_all_7_day           float64
is_corrected                                                         bool
Length: 106, dtype: object

In [32]:
# The NYT data is already sorted by date
# Sort the hospital data by date
hospital_data_df = hospital_data_df.sort_values(by='collection_week')
hospital_data_df

# The hospital data is binned by collection week ending on every friday

Unnamed: 0,hospital_pk,collection_week,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_pediatric_covid_confirmed_7_day_coverage,previous_day_admission_adult_covid_suspected_7_day_coverage,previous_day_admission_pediatric_covid_suspected_7_day_coverage,previous_week_personnel_covid_vaccinated_doses_administered_7_day,total_personnel_covid_vaccinated_doses_none_7_day,total_personnel_covid_vaccinated_doses_one_7_day,total_personnel_covid_vaccinated_doses_all_7_day,previous_week_patients_covid_vaccinated_doses_one_7_day,previous_week_patients_covid_vaccinated_doses_all_7_day,is_corrected
324747,110045,2020-07-31,GA,110045,NORTHEAST GEORGIA MEDICAL CENTER BARROW,316 NORTH BROAD STREET,WINDER,30680.0,Short Term,13013.0,...,0,0,0,,,,,,,False
319367,180045,2020-07-31,KY,180045,ST ELIZABETH FLORENCE,4900 HOUSTON ROAD,FLORENCE,41042.0,Short Term,21015.0,...,7,7,7,,,,,,,False
319371,100168,2020-07-31,FL,100168,BOCA RATON REGIONAL HOSPITAL,800 MEADOWS RD,BOCA RATON,33486.0,Short Term,12099.0,...,7,7,7,,,,,,,False
319374,161322,2020-07-31,IA,161322,DALLAS COUNTY HOSPITAL,610 TENTH STREET,PERRY,50220.0,Critical Access Hospitals,19049.0,...,7,7,7,,,,,,,False
319375,050660,2020-07-31,CA,050660,USC KENNETH NORRIS JR CANCER HOSPITAL,1441 EASTLAKE AVE,LOS ANGELES,90089.0,Short Term,6037.0,...,7,7,7,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10378,341320,2021-10-22,NC,341320,ALLEGHANY COUNTY MEMORIAL HOSPITAL,617 DOCTORS STREET,SPARTA,28675.0,Critical Access Hospitals,37005.0,...,7,7,7,,,,,,,False
10377,451311,2021-10-22,TX,451311,SWEENY COMMUNITY HOSPITAL,305 NORTH MCKINNEY,SWEENY,77480.0,Critical Access Hospitals,48039.0,...,7,7,7,,,,,,,False
6377,171318,2021-10-22,KS,171318,SEDAN CITY HOSPITAL,300 NORTH STREET,SEDAN,67361.0,Critical Access Hospitals,20019.0,...,7,7,7,0.0,12.0,-999999.0,29.0,0.0,0.0,False
10372,180011,2021-10-22,KY,180011,CHI SAINT JOSEPH LONDON,1001 SAINT JOSEPH LANE,LONDON,40741.0,Short Term,21125.0,...,7,7,7,0.0,0.0,0.0,0.0,0.0,0.0,False


In [33]:
# Convert the hospital date and fips_code column to strings
hospital_data_df.rename(columns={'collection_week': 'date'}, inplace=True)
hospital_data_df['date'] = hospital_data_df['date'].astype('str')
hospital_data_df['fips_code'] = hospital_data_df['fips_code'].astype('str')

In [34]:
# Concat the date and fips_code for new columns fips_date
hospital_data_df['fips_date'] = hospital_data_df['fips_code'] + hospital_data_df['date']
hospital_data_df

Unnamed: 0,hospital_pk,date,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_adult_covid_suspected_7_day_coverage,previous_day_admission_pediatric_covid_suspected_7_day_coverage,previous_week_personnel_covid_vaccinated_doses_administered_7_day,total_personnel_covid_vaccinated_doses_none_7_day,total_personnel_covid_vaccinated_doses_one_7_day,total_personnel_covid_vaccinated_doses_all_7_day,previous_week_patients_covid_vaccinated_doses_one_7_day,previous_week_patients_covid_vaccinated_doses_all_7_day,is_corrected,fips_date
324747,110045,2020-07-31,GA,110045,NORTHEAST GEORGIA MEDICAL CENTER BARROW,316 NORTH BROAD STREET,WINDER,30680.0,Short Term,13013.0,...,0,0,,,,,,,False,13013.02020-07-31
319367,180045,2020-07-31,KY,180045,ST ELIZABETH FLORENCE,4900 HOUSTON ROAD,FLORENCE,41042.0,Short Term,21015.0,...,7,7,,,,,,,False,21015.02020-07-31
319371,100168,2020-07-31,FL,100168,BOCA RATON REGIONAL HOSPITAL,800 MEADOWS RD,BOCA RATON,33486.0,Short Term,12099.0,...,7,7,,,,,,,False,12099.02020-07-31
319374,161322,2020-07-31,IA,161322,DALLAS COUNTY HOSPITAL,610 TENTH STREET,PERRY,50220.0,Critical Access Hospitals,19049.0,...,7,7,,,,,,,False,19049.02020-07-31
319375,050660,2020-07-31,CA,050660,USC KENNETH NORRIS JR CANCER HOSPITAL,1441 EASTLAKE AVE,LOS ANGELES,90089.0,Short Term,6037.0,...,7,7,,,,,,,False,6037.02020-07-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10378,341320,2021-10-22,NC,341320,ALLEGHANY COUNTY MEMORIAL HOSPITAL,617 DOCTORS STREET,SPARTA,28675.0,Critical Access Hospitals,37005.0,...,7,7,,,,,,,False,37005.02021-10-22
10377,451311,2021-10-22,TX,451311,SWEENY COMMUNITY HOSPITAL,305 NORTH MCKINNEY,SWEENY,77480.0,Critical Access Hospitals,48039.0,...,7,7,,,,,,,False,48039.02021-10-22
6377,171318,2021-10-22,KS,171318,SEDAN CITY HOSPITAL,300 NORTH STREET,SEDAN,67361.0,Critical Access Hospitals,20019.0,...,7,7,0.0,12.0,-999999.0,29.0,0.0,0.0,False,20019.02021-10-22
10372,180011,2021-10-22,KY,180011,CHI SAINT JOSEPH LONDON,1001 SAINT JOSEPH LANE,LONDON,40741.0,Short Term,21125.0,...,7,7,0.0,0.0,0.0,0.0,0.0,0.0,False,21125.02021-10-22


In [35]:
# Create a new df to groupby the new fips_date column so each row is unique
agg_hospital_data_df = hospital_data_df.groupby(['fips_date']).agg({'total_beds_7_day_sum':'sum','all_adult_hospital_beds_7_day_sum':'sum', 'all_adult_hospital_inpatient_beds_7_day_sum':'sum', 'inpatient_beds_used_7_day_sum':'sum', 'all_adult_hospital_inpatient_bed_occupied_7_day_sum':'sum', 
                                                                                                                       'total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum':'sum', 'total_adult_patients_hospitalized_confirmed_covid_7_day_sum': 'sum', 'inpatient_beds_7_day_sum':'sum', 'total_icu_beds_7_day_sum':'sum', 
                                                                                                                      'total_staffed_adult_icu_beds_7_day_sum':'sum', 'icu_beds_used_7_day_sum':'sum', 'staffed_adult_icu_bed_occupancy_7_day_sum':'sum', 'staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum':'sum', 'staffed_icu_adult_patients_confirmed_covid_7_day_sum':'sum'}).reset_index()
agg_hospital_data_df

Unnamed: 0,fips_date,total_beds_7_day_sum,all_adult_hospital_beds_7_day_sum,all_adult_hospital_inpatient_beds_7_day_sum,inpatient_beds_used_7_day_sum,all_adult_hospital_inpatient_bed_occupied_7_day_sum,total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum,total_adult_patients_hospitalized_confirmed_covid_7_day_sum,inpatient_beds_7_day_sum,total_icu_beds_7_day_sum,total_staffed_adult_icu_beds_7_day_sum,icu_beds_used_7_day_sum,staffed_adult_icu_bed_occupancy_7_day_sum,staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum,staffed_icu_adult_patients_confirmed_covid_7_day_sum
0,10001.02020-07-31,2100.0,1939.0,1688.0,1713.0,1558.0,152.0,84.0,1876.0,322.0,203.0,266.0,189.0,28.0,28.0
1,10001.02020-08-07,2100.0,1939.0,1694.0,1672.0,1548.0,112.0,71.0,1876.0,322.0,203.0,273.0,188.0,22.0,22.0
2,10001.02020-08-14,2100.0,1939.0,1210.0,1141.0,1028.0,132.0,55.0,1340.0,198.0,134.0,183.0,128.0,29.0,29.0
3,10001.02020-08-21,2100.0,1939.0,1729.0,1650.0,1502.0,93.0,57.0,1908.0,285.0,193.0,245.0,171.0,23.0,23.0
4,10001.02020-08-28,2100.0,1939.0,1939.0,1730.0,1624.0,65.0,49.0,2100.0,315.0,203.0,251.0,182.0,19.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159931,nan2021-09-24,1218.0,1204.0,1008.0,526.0,447.0,0.0,0.0,1218.0,49.0,49.0,37.0,37.0,0.0,0.0
159932,nan2021-10-01,1218.0,1204.0,1008.0,523.0,441.0,0.0,0.0,1218.0,49.0,49.0,34.0,34.0,0.0,0.0
159933,nan2021-10-08,1218.0,1204.0,1008.0,524.0,443.0,0.0,0.0,1218.0,49.0,49.0,38.0,38.0,0.0,0.0
159934,nan2021-10-15,1218.0,1204.0,1008.0,540.0,462.0,0.0,0.0,1218.0,49.0,49.0,41.0,41.0,0.0,0.0


# Merge the data sets

In [None]:
# Merge the NYT and hospital dataframes

merged_df = hospital_data_df.merge(nyt_data_grouped)
merged_df