In [1]:
# Import the requests library.
import requests
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import numpy as np
import awswrangler as wr

In [2]:
# Create variables for each of the data files
nyt_data = 'https://group3data.s3.us-west-2.amazonaws.com/us-counties.csv'
hospital_data = 'https://group3data.s3.us-west-2.amazonaws.com/COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_Facility.csv'
vaccination_data = 'https://group3data.s3.us-west-2.amazonaws.com/COVID-19_Vaccinations_in_the_United_States_County.csv'

In [3]:
# read in data from S3 bucket and create dataframes
nyt_data_df = pd.read_csv(nyt_data)
hospital_data_df = pd.read_csv(hospital_data)
vaccination_data_df = pd.read_csv(vaccination_data)

# Data Cleaning:
## NYT Data

In [4]:
# Find the data types for the NYT data
nyt_data_df.dtypes

date       object
county     object
state      object
fips      float64
cases       int64
deaths    float64
dtype: object

In [5]:
# Convert the date column in the NYT data to a datetime object using pd.to_datetime
nyt_data_df['date'] = pd.to_datetime(nyt_data_df['date'])
nyt_data_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0


In [6]:
# 1. Remove the data for dates before 7-31-20 and after 10-22-21
# 2. Bin the date column by weeks ending every friday
# 3. Sum the cases and deaths columns for each week binned

# Step 1:
nyt_data_filtered = nyt_data_df[nyt_data_df['date'] >= '2020-07-18']
nyt_data_filtered = nyt_data_filtered[nyt_data_filtered['date'] <= '2021-10-29']
nyt_data_filtered

Unnamed: 0,date,county,state,fips,cases,deaths
344228,2020-07-18,Autauga,Alabama,1001.0,827,21.0
344229,2020-07-18,Baldwin,Alabama,1003.0,1819,15.0
344230,2020-07-18,Barbour,Alabama,1005.0,483,3.0
344231,2020-07-18,Bibb,Alabama,1007.0,264,2.0
344232,2020-07-18,Blount,Alabama,1009.0,458,1.0
...,...,...,...,...,...,...
1865346,2021-10-29,Sweetwater,Wyoming,56037.0,7575,80.0
1865347,2021-10-29,Teton,Wyoming,56039.0,5204,14.0
1865348,2021-10-29,Uinta,Wyoming,56041.0,3829,26.0
1865349,2021-10-29,Washakie,Wyoming,56043.0,1730,35.0


In [7]:
# Step 2:
# Step 3:
columns_to_sum = [nyt_data_filtered['cases'], nyt_data_filtered['deaths']]
nyt_data_grouped = nyt_data_filtered.groupby(['fips', pd.Grouper(key='date', freq='W-FRI')]).agg({'cases':'sum','deaths':'sum'}).reset_index()
nyt_data_grouped

Unnamed: 0,fips,date,cases,deaths
0,1001.0,2020-07-24,6103,147.0
1,1001.0,2020-07-31,6817,147.0
2,1001.0,2020-08-07,7503,150.0
3,1001.0,2020-08-14,8531,158.0
4,1001.0,2020-08-21,9009,161.0
...,...,...,...,...
215376,78030.0,2021-10-01,24982,304.0
215377,78030.0,2021-10-08,25204,308.0
215378,78030.0,2021-10-15,25290,311.0
215379,78030.0,2021-10-22,25435,315.0


In [8]:
# Resort the nyt_data_grouped by date
nyt_data_grouped = nyt_data_grouped.sort_values(by='date')
nyt_data_grouped

Unnamed: 0,fips,date,cases,deaths
0,1001.0,2020-07-24,6103,147.0
55227,19077.0,2020-07-24,752,35.0
113913,31101.0,2020-07-24,66,0.0
162903,47027.0,2020-07-24,256,0.0
28587,13087.0,2020-07-24,2854,56.0
...,...,...,...,...
150041,42009.0,2021-10-29,47133,1187.0
149974,42007.0,2021-10-29,154109,3285.0
35621,13297.0,2021-10-29,105674,2386.0
35286,13287.0,2021-10-29,11911,338.0


In [9]:
# Set each column to a string value so we can concatonate them together 
nyt_data_grouped['fips'] = nyt_data_grouped['fips'].astype('str')
nyt_data_grouped['date'] = nyt_data_grouped['date'].astype('str')
nyt_data_grouped['fips'].dtype

dtype('O')

In [10]:
# Concatonate the date column and the fips column to create fips_date
nyt_data_grouped['fips_date'] = nyt_data_grouped['fips'] + nyt_data_grouped['date']
nyt_data_grouped

Unnamed: 0,fips,date,cases,deaths,fips_date
0,1001.0,2020-07-24,6103,147.0,1001.02020-07-24
55227,19077.0,2020-07-24,752,35.0,19077.02020-07-24
113913,31101.0,2020-07-24,66,0.0,31101.02020-07-24
162903,47027.0,2020-07-24,256,0.0,47027.02020-07-24
28587,13087.0,2020-07-24,2854,56.0,13087.02020-07-24
...,...,...,...,...,...
150041,42009.0,2021-10-29,47133,1187.0,42009.02021-10-29
149974,42007.0,2021-10-29,154109,3285.0,42007.02021-10-29
35621,13297.0,2021-10-29,105674,2386.0,13297.02021-10-29
35286,13287.0,2021-10-29,11911,338.0,13287.02021-10-29


In [11]:
# Get the difference in the number of cases/deaths instead of the cumulative total
test_nyt = nyt_data_grouped[nyt_data_grouped['fips'] == '1001.0'].sort_values(by='date')
#nyt_data_grouped['cases'] = nyt_data_grouped['cases'].diff()

test_nyt['cases'] = test_nyt['cases'].diff()
test_nyt['deaths'] = test_nyt['deaths'].diff()
test_nyt

Unnamed: 0,fips,date,cases,deaths,fips_date
0,1001.0,2020-07-24,,,1001.02020-07-24
1,1001.0,2020-07-31,714.0,0.0,1001.02020-07-31
2,1001.0,2020-08-07,686.0,3.0,1001.02020-08-07
3,1001.0,2020-08-14,1028.0,8.0,1001.02020-08-14
4,1001.0,2020-08-21,478.0,3.0,1001.02020-08-21
...,...,...,...,...,...
62,1001.0,2021-10-01,1035.0,55.0,1001.02021-10-01
63,1001.0,2021-10-08,688.0,27.0,1001.02021-10-08
64,1001.0,2021-10-15,403.0,14.0,1001.02021-10-15
65,1001.0,2021-10-22,414.0,11.0,1001.02021-10-22


In [12]:
# test diff() on whole nyt dataframe
test_df = nyt_data_grouped.sort_values(by=['fips','date'])
test_df['cases'] = test_df['cases'].diff()
test_df['deaths'] = test_df['deaths'].diff()
test_df

Unnamed: 0,fips,date,cases,deaths,fips_date
21016,10001.0,2020-07-24,,,10001.02020-07-24
21017,10001.0,2020-07-31,715.0,59.0,10001.02020-07-31
21018,10001.0,2020-08-07,651.0,12.0,10001.02020-08-07
21019,10001.0,2020-08-14,772.0,7.0,10001.02020-08-14
21020,10001.0,2020-08-21,1170.0,3.0,10001.02020-08-21
...,...,...,...,...,...
21011,9015.0,2021-10-01,1555.0,39.0,9015.02021-10-01
21012,9015.0,2021-10-08,1554.0,35.0,9015.02021-10-08
21013,9015.0,2021-10-15,1489.0,9.0,9015.02021-10-15
21014,9015.0,2021-10-22,1673.0,16.0,9015.02021-10-22


In [13]:
test_df.isnull().sum()
        

fips         0
date         0
cases        1
deaths       1
fips_date    0
dtype: int64

## Hospital Data

In [14]:
# Convert the date column in the hospital data to a datetime object using pd.to_datetime
hospital_data_df['collection_week'] = pd.to_datetime(hospital_data_df['collection_week'])
# Check to verify the date is now a datetime object
hospital_data_df.dtypes

hospital_pk                                                        object
collection_week                                            datetime64[ns]
state                                                              object
ccn                                                                object
hospital_name                                                      object
                                                                ...      
total_personnel_covid_vaccinated_doses_one_7_day                  float64
total_personnel_covid_vaccinated_doses_all_7_day                  float64
previous_week_patients_covid_vaccinated_doses_one_7_day           float64
previous_week_patients_covid_vaccinated_doses_all_7_day           float64
is_corrected                                                         bool
Length: 106, dtype: object

In [15]:
# The NYT data is already sorted by date
# Sort the hospital data by date
hospital_data_df = hospital_data_df.sort_values(by='collection_week')
hospital_data_df

# The hospital data is binned by collection week ending on every friday

Unnamed: 0,hospital_pk,collection_week,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_pediatric_covid_confirmed_7_day_coverage,previous_day_admission_adult_covid_suspected_7_day_coverage,previous_day_admission_pediatric_covid_suspected_7_day_coverage,previous_week_personnel_covid_vaccinated_doses_administered_7_day,total_personnel_covid_vaccinated_doses_none_7_day,total_personnel_covid_vaccinated_doses_one_7_day,total_personnel_covid_vaccinated_doses_all_7_day,previous_week_patients_covid_vaccinated_doses_one_7_day,previous_week_patients_covid_vaccinated_doses_all_7_day,is_corrected
324747,110045,2020-07-31,GA,110045,NORTHEAST GEORGIA MEDICAL CENTER BARROW,316 NORTH BROAD STREET,WINDER,30680.0,Short Term,13013.0,...,0,0,0,,,,,,,False
319367,180045,2020-07-31,KY,180045,ST ELIZABETH FLORENCE,4900 HOUSTON ROAD,FLORENCE,41042.0,Short Term,21015.0,...,7,7,7,,,,,,,False
319371,100168,2020-07-31,FL,100168,BOCA RATON REGIONAL HOSPITAL,800 MEADOWS RD,BOCA RATON,33486.0,Short Term,12099.0,...,7,7,7,,,,,,,False
319374,161322,2020-07-31,IA,161322,DALLAS COUNTY HOSPITAL,610 TENTH STREET,PERRY,50220.0,Critical Access Hospitals,19049.0,...,7,7,7,,,,,,,False
319375,050660,2020-07-31,CA,050660,USC KENNETH NORRIS JR CANCER HOSPITAL,1441 EASTLAKE AVE,LOS ANGELES,90089.0,Short Term,6037.0,...,7,7,7,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10378,341320,2021-10-22,NC,341320,ALLEGHANY COUNTY MEMORIAL HOSPITAL,617 DOCTORS STREET,SPARTA,28675.0,Critical Access Hospitals,37005.0,...,7,7,7,,,,,,,False
10377,451311,2021-10-22,TX,451311,SWEENY COMMUNITY HOSPITAL,305 NORTH MCKINNEY,SWEENY,77480.0,Critical Access Hospitals,48039.0,...,7,7,7,,,,,,,False
6377,171318,2021-10-22,KS,171318,SEDAN CITY HOSPITAL,300 NORTH STREET,SEDAN,67361.0,Critical Access Hospitals,20019.0,...,7,7,7,0.0,12.0,-999999.0,29.0,0.0,0.0,False
10372,180011,2021-10-22,KY,180011,CHI SAINT JOSEPH LONDON,1001 SAINT JOSEPH LANE,LONDON,40741.0,Short Term,21125.0,...,7,7,7,0.0,0.0,0.0,0.0,0.0,0.0,False


In [16]:
# Convert the hospital date and fips_code column to strings
hospital_data_df.rename(columns={'collection_week': 'date'}, inplace=True)
hospital_data_df['date'] = hospital_data_df['date'].astype('str')
hospital_data_df['fips_code'] = hospital_data_df['fips_code'].astype('str')

In [17]:
# Concat the date and fips_code for new columns fips_date
hospital_data_df['fips_date'] = hospital_data_df['fips_code'] + hospital_data_df['date']
hospital_data_df

Unnamed: 0,hospital_pk,date,state,ccn,hospital_name,address,city,zip,hospital_subtype,fips_code,...,previous_day_admission_adult_covid_suspected_7_day_coverage,previous_day_admission_pediatric_covid_suspected_7_day_coverage,previous_week_personnel_covid_vaccinated_doses_administered_7_day,total_personnel_covid_vaccinated_doses_none_7_day,total_personnel_covid_vaccinated_doses_one_7_day,total_personnel_covid_vaccinated_doses_all_7_day,previous_week_patients_covid_vaccinated_doses_one_7_day,previous_week_patients_covid_vaccinated_doses_all_7_day,is_corrected,fips_date
324747,110045,2020-07-31,GA,110045,NORTHEAST GEORGIA MEDICAL CENTER BARROW,316 NORTH BROAD STREET,WINDER,30680.0,Short Term,13013.0,...,0,0,,,,,,,False,13013.02020-07-31
319367,180045,2020-07-31,KY,180045,ST ELIZABETH FLORENCE,4900 HOUSTON ROAD,FLORENCE,41042.0,Short Term,21015.0,...,7,7,,,,,,,False,21015.02020-07-31
319371,100168,2020-07-31,FL,100168,BOCA RATON REGIONAL HOSPITAL,800 MEADOWS RD,BOCA RATON,33486.0,Short Term,12099.0,...,7,7,,,,,,,False,12099.02020-07-31
319374,161322,2020-07-31,IA,161322,DALLAS COUNTY HOSPITAL,610 TENTH STREET,PERRY,50220.0,Critical Access Hospitals,19049.0,...,7,7,,,,,,,False,19049.02020-07-31
319375,050660,2020-07-31,CA,050660,USC KENNETH NORRIS JR CANCER HOSPITAL,1441 EASTLAKE AVE,LOS ANGELES,90089.0,Short Term,6037.0,...,7,7,,,,,,,False,6037.02020-07-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10378,341320,2021-10-22,NC,341320,ALLEGHANY COUNTY MEMORIAL HOSPITAL,617 DOCTORS STREET,SPARTA,28675.0,Critical Access Hospitals,37005.0,...,7,7,,,,,,,False,37005.02021-10-22
10377,451311,2021-10-22,TX,451311,SWEENY COMMUNITY HOSPITAL,305 NORTH MCKINNEY,SWEENY,77480.0,Critical Access Hospitals,48039.0,...,7,7,,,,,,,False,48039.02021-10-22
6377,171318,2021-10-22,KS,171318,SEDAN CITY HOSPITAL,300 NORTH STREET,SEDAN,67361.0,Critical Access Hospitals,20019.0,...,7,7,0.0,12.0,-999999.0,29.0,0.0,0.0,False,20019.02021-10-22
10372,180011,2021-10-22,KY,180011,CHI SAINT JOSEPH LONDON,1001 SAINT JOSEPH LANE,LONDON,40741.0,Short Term,21125.0,...,7,7,0.0,0.0,0.0,0.0,0.0,0.0,False,21125.02021-10-22


In [18]:
# Create a new df to groupby the new fips_date column so each row is unique
agg_hospital_data_df = hospital_data_df.groupby(['fips_date']).agg({'total_beds_7_day_sum':'sum','all_adult_hospital_beds_7_day_sum':'sum', 'all_adult_hospital_inpatient_beds_7_day_sum':'sum', 'inpatient_beds_used_7_day_sum':'sum', 'all_adult_hospital_inpatient_bed_occupied_7_day_sum':'sum', 
                                                                                                                       'total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum':'sum', 'total_adult_patients_hospitalized_confirmed_covid_7_day_sum': 'sum', 'inpatient_beds_7_day_sum':'sum', 'total_icu_beds_7_day_sum':'sum', 
                                                                                                                      'total_staffed_adult_icu_beds_7_day_sum':'sum', 'icu_beds_used_7_day_sum':'sum', 'staffed_adult_icu_bed_occupancy_7_day_sum':'sum', 'staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum':'sum', 'staffed_icu_adult_patients_confirmed_covid_7_day_sum':'sum'}).reset_index()
agg_hospital_data_df

Unnamed: 0,fips_date,total_beds_7_day_sum,all_adult_hospital_beds_7_day_sum,all_adult_hospital_inpatient_beds_7_day_sum,inpatient_beds_used_7_day_sum,all_adult_hospital_inpatient_bed_occupied_7_day_sum,total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum,total_adult_patients_hospitalized_confirmed_covid_7_day_sum,inpatient_beds_7_day_sum,total_icu_beds_7_day_sum,total_staffed_adult_icu_beds_7_day_sum,icu_beds_used_7_day_sum,staffed_adult_icu_bed_occupancy_7_day_sum,staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum,staffed_icu_adult_patients_confirmed_covid_7_day_sum
0,10001.02020-07-31,2100.0,1939.0,1688.0,1713.0,1558.0,152.0,84.0,1876.0,322.0,203.0,266.0,189.0,28.0,28.0
1,10001.02020-08-07,2100.0,1939.0,1694.0,1672.0,1548.0,112.0,71.0,1876.0,322.0,203.0,273.0,188.0,22.0,22.0
2,10001.02020-08-14,2100.0,1939.0,1210.0,1141.0,1028.0,132.0,55.0,1340.0,198.0,134.0,183.0,128.0,29.0,29.0
3,10001.02020-08-21,2100.0,1939.0,1729.0,1650.0,1502.0,93.0,57.0,1908.0,285.0,193.0,245.0,171.0,23.0,23.0
4,10001.02020-08-28,2100.0,1939.0,1939.0,1730.0,1624.0,65.0,49.0,2100.0,315.0,203.0,251.0,182.0,19.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159931,nan2021-09-24,1218.0,1204.0,1008.0,526.0,447.0,0.0,0.0,1218.0,49.0,49.0,37.0,37.0,0.0,0.0
159932,nan2021-10-01,1218.0,1204.0,1008.0,523.0,441.0,0.0,0.0,1218.0,49.0,49.0,34.0,34.0,0.0,0.0
159933,nan2021-10-08,1218.0,1204.0,1008.0,524.0,443.0,0.0,0.0,1218.0,49.0,49.0,38.0,38.0,0.0,0.0
159934,nan2021-10-15,1218.0,1204.0,1008.0,540.0,462.0,0.0,0.0,1218.0,49.0,49.0,41.0,41.0,0.0,0.0


# Merge the data sets

In [19]:
# Merge the NYT and hospital dataframes
merged_df = agg_hospital_data_df.merge(nyt_data_grouped)
merged_df

Unnamed: 0,fips_date,total_beds_7_day_sum,all_adult_hospital_beds_7_day_sum,all_adult_hospital_inpatient_beds_7_day_sum,inpatient_beds_used_7_day_sum,all_adult_hospital_inpatient_bed_occupied_7_day_sum,total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum,total_adult_patients_hospitalized_confirmed_covid_7_day_sum,inpatient_beds_7_day_sum,total_icu_beds_7_day_sum,total_staffed_adult_icu_beds_7_day_sum,icu_beds_used_7_day_sum,staffed_adult_icu_bed_occupancy_7_day_sum,staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum,staffed_icu_adult_patients_confirmed_covid_7_day_sum,fips,date,cases,deaths
0,10001.02020-07-31,2100.0,1939.0,1688.0,1713.0,1558.0,152.0,84.0,1876.0,322.0,203.0,266.0,189.0,28.0,28.0,10001.0,2020-07-31,14925,737.0
1,10001.02020-08-07,2100.0,1939.0,1694.0,1672.0,1548.0,112.0,71.0,1876.0,322.0,203.0,273.0,188.0,22.0,22.0,10001.0,2020-08-07,15576,749.0
2,10001.02020-08-14,2100.0,1939.0,1210.0,1141.0,1028.0,132.0,55.0,1340.0,198.0,134.0,183.0,128.0,29.0,29.0,10001.0,2020-08-14,16348,756.0
3,10001.02020-08-21,2100.0,1939.0,1729.0,1650.0,1502.0,93.0,57.0,1908.0,285.0,193.0,245.0,171.0,23.0,23.0,10001.0,2020-08-21,17518,759.0
4,10001.02020-08-28,2100.0,1939.0,1939.0,1730.0,1624.0,65.0,49.0,2100.0,315.0,203.0,251.0,182.0,19.0,19.0,10001.0,2020-08-28,17907,763.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159059,9015.02021-09-24,951.0,937.0,783.0,565.0,565.0,56.0,42.0,783.0,119.0,119.0,34.0,34.0,14.0,10.0,9015.0,2021-09-24,85476,1400.0
159060,9015.02021-10-01,956.0,942.0,788.0,530.0,530.0,61.0,57.0,788.0,119.0,119.0,32.0,32.0,15.0,14.0,9015.0,2021-10-01,87031,1439.0
159061,9015.02021-10-08,956.0,942.0,788.0,557.0,557.0,87.0,85.0,788.0,119.0,119.0,32.0,32.0,20.0,20.0,9015.0,2021-10-08,88585,1474.0
159062,9015.02021-10-15,955.0,941.0,787.0,521.0,521.0,76.0,74.0,787.0,119.0,119.0,26.0,26.0,19.0,19.0,9015.0,2021-10-15,90074,1483.0


# Experiementation
Below is experiemntation on the data sets to clean them further

In [20]:
# NYT data Cleanup
nyt_test_df = nyt_data_grouped
# 1. create a list that holds all of the unique fips data
fips_list = list(nyt_data_grouped['fips'].unique())
fips_list

['1001.0',
 '19077.0',
 '31101.0',
 '47027.0',
 '13087.0',
 '19075.0',
 '31105.0',
 '47025.0',
 '19073.0',
 '31107.0',
 '47023.0',
 '13089.0',
 '31109.0',
 '4007.0',
 '19071.0',
 '31111.0',
 '47021.0',
 '47019.0',
 '72007.0',
 '13091.0',
 '19069.0',
 '31117.0',
 '47017.0',
 '47015.0',
 '19067.0',
 '31119.0',
 '13085.0',
 '47029.0',
 '47031.0',
 '31099.0',
 '47045.0',
 '13077.0',
 '31083.0',
 '47043.0',
 '54109.0',
 '19085.0',
 '31087.0',
 '47041.0',
 '13079.0',
 '19083.0',
 '31089.0',
 '31091.0',
 '13093.0',
 '47039.0',
 '13081.0',
 '72009.0',
 '1085.0',
 '19081.0',
 '31093.0',
 '47035.0',
 '31095.0',
 '47033.0',
 '13083.0',
 '55001.0',
 '19079.0',
 '31097.0',
 '47037.0',
 '72011.0',
 '31121.0',
 '47013.0',
 '55005.0',
 '19055.0',
 '31141.0',
 '46129.0',
 '31143.0',
 '46127.0',
 '72003.0',
 '13105.0',
 '19053.0',
 '31145.0',
 '46125.0',
 '31147.0',
 '46123.0',
 '13107.0',
 '19051.0',
 '31149.0',
 '46121.0',
 '1089.0',
 '55007.0',
 '31151.0',
 '46119.0',
 '13109.0',
 '19049.0',
 '31153.

In [58]:
# 1.a each fips code corresponds to 65 rows 1 fips per 65 dates
nyt_test_df = nyt_test_df.sort_values(by=['fips', 'date'])
nyt_test_df.head()

Unnamed: 0,fips,date,cases,deaths,fips_date
21016,10001.0,2020-07-24,14210,678.0,10001.02020-07-24
21017,10001.0,2020-07-31,14925,737.0,10001.02020-07-31
21018,10001.0,2020-08-07,15576,749.0,10001.02020-08-07
21019,10001.0,2020-08-14,16348,756.0,10001.02020-08-14
21020,10001.0,2020-08-21,17518,759.0,10001.02020-08-21


In [76]:
# 2. Loop through the list to create a new df and sort by date

for x in fips[:75]:
    i = 0
    if nyt_test_df['fips'][i] == x:
        print(nyt_test_df['fips'][i])
        i += 1
        print(i)
    else:
        print('not working :(')
        print(nyt_test_df['fips'][i])
        print(x)
#             cases = []
#             cases.append(nyt_test_df['cases'][y])
#             test_cases.append(cases.diff())

not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not working :(
1001.0
10001.0
not workin

In [53]:
# for x in nyt_test_df['fips'][:75]:
#     print(x)

In [87]:
fips = list(nyt_data_filtered['fips'])
len(fips)

1521123

In [83]:
nyt_data_filtered.dtypes

date      datetime64[ns]
county            object
state             object
fips             float64
cases              int64
deaths           float64
dtype: object