In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load Covid test data by zip code in New York City from April 1st 2020 through May 1st 2020.
# Rows with any missing value are discarded, and MODZCTA is cast to an integer
# 'ZipCode' column so it can serve as the join key against the census table.
tests_by_zip = (
    pd.read_csv('data/accum-nychealth-tests-by-zcta.csv')
    .dropna()
    .assign(ZipCode=lambda frame: frame['MODZCTA'].astype('int'))
)

In [3]:
# Load the census table and derive an integer 'ZipCode' column from 'NAME'.
ny_zips = pd.read_csv('data/NY_Census_Zip.csv')

# 'ZCTA5 <digits>, New York' -> '<digits>'; values that do not match the
# pattern are left unchanged by replace().
zip_text = ny_zips['NAME'].replace(to_replace=r'ZCTA5 (\d+), New York', value=r'\1', regex=True)

# Keep only rows whose extracted value is made of digits. The vectorised
# .str.fullmatch treats a missing NAME as "not a zip" (na=False) instead of
# raising, and only admits text that int() can actually parse.
# NOTE(review): the dropped rows are presumably header/description rows in the
# census export -- confirm against the CSV.
has_zip = zip_text.str.fullmatch(r'\d+', na=False)

# Build the filtered frame and the new column in one step so nothing is
# assigned into a slice of the original frame.
ny_zips = ny_zips.loc[has_zip].assign(ZipCode=zip_text.loc[has_zip].astype('int'))

In [4]:
# Length, in days, of the rolling window used for the mean incidence rate.
days = 5

# Join test counts to census rows on 'ZipCode' (merge's default inner join),
# keep the six columns of interest, give them readable names, and normalise dtypes.
df = (
    tests_by_zip
    .merge(ny_zips, on='ZipCode')
    .loc[:, ['GEO_ID', 'ZipCode', 'P001001', 'Total', 'Positive', 'Timestamp']]
    .rename(columns={
        'ZipCode': 'Zip Code',
        'P001001': 'Population',
        'Total': 'Total Tests',
        'Positive': 'Positive CVD',
    })
    .assign(
        Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], format='mixed'),
        Population=lambda frame: frame['Population'].astype('int'),
    )
    # Positive results as a percentage of the zip code's population.
    .assign(**{'Incidence Rate': lambda frame: frame['Positive CVD'] / frame['Population'] * 100})
)

In [5]:
# Add a time-windowed rolling mean of 'Incidence Rate', computed separately
# for every zip code, then flatten the result back into a regular frame.
mean_col = f'{days} Day Mean Incidence Rate'
window = f'{days}d'  # time-based window, so the index must be a sorted DatetimeIndex

per_zip_frames = []
for _, group in df.groupby('Zip Code'):
    # verify_integrity=True raises if a zip code has duplicate timestamps,
    # which would make the time-based window ambiguous.
    group = group.set_index('Timestamp', verify_integrity=True).sort_index()
    per_zip_frames.append(
        group.assign(**{mean_col: group['Incidence Rate'].rolling(window).mean()})
    )

# Concatenate once instead of growing a frame inside the loop (which re-copies
# every previously accumulated row on each iteration).
if per_zip_frames:
    new_df = pd.concat(per_zip_frames)
else:
    # No rows to process: keep the expected schema, including the new column.
    new_df = df.set_index('Timestamp').assign(**{mean_col: pd.Series(dtype='float64')})

# Move 'Timestamp' from the index back to an ordinary (first) column.
df = new_df.reset_index()

In [6]:
# Write the per-zip table to CSV without the row index. The file name is built
# from `days` -- the same value that names the rolling-mean column -- so the
# name cannot drift from the window length actually used.
df.to_csv(f'{days}_Day_Mean_Incidence_Rate_Per_NY_Zip_Code.csv', index=False)