# NYC Health COVID-19 Data Scraper
### March 30, 2020
### Matthew J. Beattie
### University of Oklahoma

Cases obtained from _https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf_

Deaths obtained from _https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths.pdf_

In [1]:
import pandas as pd
import re
import PyPDF2 as pdf
import tempfile
import urllib.request
from datetime import date

# Report date (year, month, day).  These values stamp the 'date' column of every
# dataframe built below and form the YYYYMMDD prefix of the output CSV filename.
# NOTE(review): presumably these should match the "as of" date printed in the
# downloaded PDFs -- confirm before each run.
yr = 2020
mo = 4
day = 1

## Define Common Routines

In [2]:
"""
textfrompdf()
Reads in PDF file from NYC website or local file.  To read a local file, the user
must flag the readfrom variable as 'local'
"""
def textfrompdf(url,readfrom='online'):
    """
    Return the text extracted from the first page of a PDF.

    Parameters
    ----------
    url : str
        Address of the PDF to download, or a filesystem path when
        readfrom is 'local'.
    readfrom : str, optional
        'local' reads `url` as a file on disk; any other value (the default is
        'online') fetches `url` with urllib.

    Returns
    -------
    str
        The result of extractText() on page 0 of the PDF.
    """
    # Fetch the raw PDF bytes; context managers release the connection / file
    # handle even if the read fails.
    if readfrom != 'local':
        with urllib.request.urlopen(url) as response:
            dataIn = response.read()
    else:
        with open(url, 'rb') as localf:
            dataIn = localf.read()

    # The PDF reader is handed a seekable binary stream, so spool the bytes
    # through a temp file.  The text must be extracted before the `with` block
    # exits, because the temp file is closed (and removed) at that point.
    with tempfile.TemporaryFile() as fp:
        # Write the pdf data to a temp file and rewind to the start
        fp.write(dataIn)
        fp.seek(0)

        pdfReader = pdf.PdfFileReader(fp)
        # The page count itself is not used; the call is retained from the
        # original flow.
        pdfReader.getNumPages()

        # Get the first page
        return pdfReader.getPage(0).extractText()

## Read in NYC Cases Data

In [3]:
# Read in text from the PDF file and display
# url is the live NYC Health cases summary.  localurl names a local copy that
# could be passed to textfrompdf with readfrom='local' instead; it is not used
# in this cell.
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf'
localurl = 'covid-19-daily-data-summary.pdf'
# Download the PDF and extract the text of its first page
casespage = textfrompdf(url,readfrom='online')

# Bare expression so the notebook shows the raw extracted text
casespage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily \nData\n Summary\n  The data in this report reflect events and activities\n as of\n April 1,\n 2020\n at 4:30 \nPM.  All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 Cases\n . Total Cases\n Total\n 45707 Median Age (Range)\n 49 (0\n-105) Age Group\n  -  0 to 17\n 816 (2%)\n -  18 to 44\n 18767 (41%)\n -  45 to 64\n 16104 (35%)\n -  65 to 74\n 5584 (12%)\n -  75 and over\n 4328 (9%)\n -  Unknown\n 108 Age 50 and over\n  -  Yes\n 22061 (48%)\n -  No 23538 (52%)\n Sex\n  -  Female\n 20522 (45%)\n -  Male\n 25128 (55%)\n -  Unknown\n 57 Borough\n  -  Bronx\n 8607 (19%)\n -  Brooklyn\n 12274 (27%)\n -  Manhattan\n 7022 (15%)\n -  Queens\n 15217 (33%)\n -  Staten Island\n 2552 (6%)\n -  Unknown\n 35 Deaths\n 1374      \n \n   \n \n  '

## Extract the COVID cases by Age

In [4]:
# Extract the values for the cases by Age.  Ignore Unknown Age.
# Patterns are raw strings so that escapes such as \( and \d are handed to the
# regex engine as written, rather than relying on Python passing unrecognised
# string escapes through (which triggers a warning on current versions).

# Drop the report header and summary lines that precede the first age row
tblText = re.sub(r"Coronavirus.*Cases.*0 to 17", "0 to 17", casespage, flags=re.DOTALL)
# Drop the trailing "Deaths" line and everything after it
tblText = re.sub(r"Deaths.*", "", tblText, flags=re.DOTALL)
# Drop everything from the first remaining "Unknown" onward; this removes the
# Unknown age row together with the later sections of the table
tblText = re.sub(r"Unknown.*", "", tblText, flags=re.DOTALL)
# Strip each percentage such as "(41%"; its closing ")" stays behind as a marker
tblText = re.sub(r"\(\d+%", "", tblText, flags=re.DOTALL)
# Replace the " )" marker plus the filler before the next label with a row separator
tblText = re.sub(r" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
# Replace the newline and filler between a label and its count with a field separator
tblText = re.sub(r"\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
# Remove the leftover " )" and trailing filler after the final row
tblText = re.sub(r" \).+", "", tblText, flags=re.DOTALL)
# Normalise the age-group labels
tblText = re.sub(r"0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub(r"18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub(r"45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub(r"65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub(r"75 and over", "Over_75", tblText, flags=re.DOTALL)


In [5]:
# Display the parsed text to check its "label,count;label,count" format
tblText

'0-17,816;18-44,18767;45-64,16104;65-74,5584;Over_75,4328'

In [6]:
# Break the parsed text apart at the ";" row separators and keep only the
# pieces that contain at least one digit, i.e. the rows that carry data
rows = [piece for piece in tblText.split(";") if re.search(r"[0-9]", piece) is not None]

In [7]:
# Display the split rows to check the result
rows

['0-17,816', '18-44,18767', '45-64,16104', '65-74,5584', 'Over_75,4328']

In [8]:
# Turn every "label,count" row into a [label, count] pair: the label is the text
# before the first comma, the count is the text after the last comma as an int
agedata = [
    [re.search("^([^,])+", line).group(), int(re.search("[^,]+$", line).group())]
    for line in rows
]

In [9]:
# Wrap the age records in a dataframe, tag every row with its feature, record
# type and report date, then display it
agedf = pd.DataFrame(agedata, columns=['category','count']).assign(
    feature='age',
    type='cases',
    date=date(yr,mo,day),
)
agedf

Unnamed: 0,category,count,feature,type,date
0,0-17,816,age,cases,2020-04-01
1,18-44,18767,age,cases,2020-04-01
2,45-64,16104,age,cases,2020-04-01
3,65-74,5584,age,cases,2020-04-01
4,Over_75,4328,age,cases,2020-04-01


## Extract the COVID cases by Sex

In [10]:
# Extract the values for the cases by Sex.  Ignore Unknown Sex.
# Patterns are raw strings so that escapes such as \( and \d are handed to the
# regex engine as written, rather than relying on Python passing unrecognised
# string escapes through (which triggers a warning on current versions).

# Drop the report header and the sections that precede the Female row
tblText = re.sub(r"Coronavirus.*Cases.*Female", "Female", casespage, flags=re.DOTALL)
# Drop the trailing "Deaths" line and everything after it
tblText = re.sub(r"Deaths.*", "", tblText, flags=re.DOTALL)
# Drop everything from the first remaining "Unknown" onward; this removes the
# Unknown sex row together with the section that follows it
tblText = re.sub(r"Unknown.*", "", tblText, flags=re.DOTALL)
# Strip each percentage such as "(45%"; its closing ")" stays behind as a marker
tblText = re.sub(r"\(\d+%", "", tblText, flags=re.DOTALL)
# Replace the " )" marker plus the filler before the next label with a row separator
tblText = re.sub(r" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
# Replace the newline and filler between a label and its count with a field separator
tblText = re.sub(r"\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
# Remove the leftover " )" and trailing filler after the final row
tblText = re.sub(r" \).+", "", tblText, flags=re.DOTALL)

In [11]:
# Break the parsed text apart at the ";" row separators and keep only the
# pieces that contain at least one digit, i.e. the rows that carry data
rows = [piece for piece in tblText.split(";") if re.search(r"[0-9]", piece) is not None]

In [12]:
# Turn every "label,count" row into a [label, count] pair: the label is the text
# before the first comma, the count is the text after the last comma as an int
sexdata = [
    [re.search("^([^,])+", line).group(), int(re.search("[^,]+$", line).group())]
    for line in rows
]

# Wrap the sex records in a dataframe, tag every row with its feature, record
# type and report date, then display it
sexdf = pd.DataFrame(sexdata, columns=['category','count']).assign(
    feature='sex',
    type='cases',
    date=date(yr,mo,day),
)
sexdf

Unnamed: 0,category,count,feature,type,date
0,Female,20522,sex,cases,2020-04-01
1,Male,25128,sex,cases,2020-04-01


## Extract the COVID cases by Borough

In [13]:
# Extract the values for the cases by Borough.  Ignore Unknown Borough.
# Patterns are raw strings so that escapes such as \( and \d are handed to the
# regex engine as written, rather than relying on Python passing unrecognised
# string escapes through (which triggers a warning on current versions).

# Drop the report header and the sections that precede the Bronx row
tblText = re.sub(r"Coronavirus.*Cases.*Bronx", "Bronx", casespage, flags=re.DOTALL)
# Drop the trailing "Deaths" line and everything after it
tblText = re.sub(r"Deaths.*", "", tblText, flags=re.DOTALL)
# Drop the Unknown borough row (and anything that follows it)
tblText = re.sub(r"Unknown.*", "", tblText, flags=re.DOTALL)
# Strip each percentage such as "(19%"; its closing ")" stays behind as a marker
tblText = re.sub(r"\(\d+%", "", tblText, flags=re.DOTALL)
# Replace the " )" marker plus the filler before the next label with a row separator
tblText = re.sub(r" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
# Replace the newline and filler between a label and its count with a field separator
tblText = re.sub(r"\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
# Remove the leftover " )" and trailing filler after the final row
tblText = re.sub(r" \).+", "", tblText, flags=re.DOTALL)
# Make the two-word borough name a single token
tblText = re.sub(r"Staten Island", "Staten_Island", tblText, flags=re.DOTALL)

# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]", row) is not None]

In [14]:
# Convert every "label,count" row into a [label, count] pair: the label is the
# text before the first comma, the count is the text after the last comma as an int
boroughdata = [
    [re.search("^([^,])+", line).group(), int(re.search("[^,]+$", line).group())]
    for line in rows
]

# Wrap the borough records in a dataframe and tag every row with its feature,
# record type and report date
boroughdf = pd.DataFrame(boroughdata, columns=['category','count']).assign(
    feature='borough',
    type='cases',
    date=date(yr,mo,day),
)

# Stack the Age, Sex, and Borough frames into one cases dataframe with a fresh
# 0..n-1 index, then display it
casesdf = pd.concat([agedf, sexdf, boroughdf]).reset_index(drop=True)
casesdf

Unnamed: 0,category,count,feature,type,date
0,0-17,816,age,cases,2020-04-01
1,18-44,18767,age,cases,2020-04-01
2,45-64,16104,age,cases,2020-04-01
3,65-74,5584,age,cases,2020-04-01
4,Over_75,4328,age,cases,2020-04-01
5,Female,20522,sex,cases,2020-04-01
6,Male,25128,sex,cases,2020-04-01
7,Bronx,8607,borough,cases,2020-04-01
8,Brooklyn,12274,borough,cases,2020-04-01
9,Manhattan,7022,borough,cases,2020-04-01


## Read in NYC Deaths Data
Note that the deaths parsing is a bit more efficient: the entire table is handled in one pass instead of being broken into separate Age, Sex, and Borough extractions.

In [16]:
# Read in text from PDF and show in raw format
# url is the live NYC Health deaths summary.  localurl names a local copy that
# could be passed to textfrompdf with readfrom='local' instead; it is not used
# in this cell.
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths.pdf'
localurl = 'covid-19-daily-data-summary-deaths-200330.pdf'
# Download the PDF and extract the text of its first page
deathspage = textfrompdf(url,readfrom='online')
# Bare expression so the notebook shows the raw extracted text
deathspage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily \nData\n Summary\n  The data in this report reflect \nevents and activities\n as of\n April 1,\n 2020\n at \n4:30 \nPM.  All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 Deaths\n  . Underlying \nConditions\n1 No \nUnderlying \nConditions\n Underlying \nConditions \nPending\n Total\n Age Group\n     -  0 to 17\n 1 0 0 1 -  18 to 44\n 51 4 23 78 -  45 to 64\n 254 9 70 333 -  65 to 74\n 257 1 76 334 -  75 and over\n 431 4 193 628 Sex\n     -  Female\n 373 4 135 512 -  Male\n 620 14 227 861 -  Unknown\n 1 0 0 1 Borough\n     -  Bronx\n 305 4 51 360 -  Brooklyn\n 207 4 117 328 -  Manhattan\n 113 1 51 165 -  Queens\n 311 9 127 447 -  Staten Island\n 57 0 16 73 -  Unknown\n 1 0 0 1 Total\n 994 18 362 1374  1Underlying illnesses include Diabetes, Lung Disease, Cancer, Immunodefic

## Extract the COVID Deaths and Parse

In [17]:
# Parse the deathspage text into a set of records
# Patterns are raw strings so that the \- escape is handed to the regex engine
# as written, rather than relying on Python passing unrecognised string escapes
# through (which triggers a warning on current versions).

# Drop the report header and column headings that precede the first age row
tblText = re.sub(r"Coronavirus.*0 to 17", "0 to 17", deathspage, flags=re.DOTALL)
# Drop the closing "Total" row and the footnote text after it
tblText = re.sub(r"Total.*", "", tblText, flags=re.DOTALL)
# Newlines sit between a label and its numbers; make them field separators
tblText = re.sub(r"\n", ",", tblText, flags=re.DOTALL)
# Each " -" bullet plus the filler before the next label becomes a row separator
tblText = re.sub(r" \-.+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
# Remove any remaining " -" bullet together with everything after it
tblText = re.sub(r" \-.+", "", tblText, flags=re.DOTALL)
# Remove the "Sex" and "Borough" section headings (up to the next row separator)
tblText = re.sub(r" Sex.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub(r" Borough.+?(?=;)", "", tblText, flags=re.DOTALL)
# Normalise the age-group labels
tblText = re.sub(r"0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub(r"18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub(r"45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub(r"65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub(r"75 and over", "Over_75", tblText, flags=re.DOTALL)
# Collapse ", " to "," and then turn the remaining spaces between numbers into commas
tblText = re.sub(r", ", ",", tblText, flags=re.DOTALL)
tblText = re.sub(r" ", ",", tblText, flags=re.DOTALL)
# The previous step split the two-word borough name; rejoin it as one token
tblText = re.sub(r"Staten,Island", "Staten_Island", tblText, flags=re.DOTALL)
# Drop the Unknown sex row (everything from "Unknown" up to the next "B")
tblText = re.sub(r"Unknown.+?(?=B)", "", tblText, flags=re.DOTALL)
# Drop the Unknown borough row at the end
tblText = re.sub(r"Unknown.*", "", tblText, flags=re.DOTALL)


In [18]:
# Display parsed text to check format: rows are separated by ";" and the fields
# within a row by ","
tblText

'0-17,1,0,0,1;18-44,51,4,23,78;45-64,254,9,70,333;65-74,257,1,76,334;Over_75,431,4,193,628;Female,373,4,135,512;Male,620,14,227,861;Bronx,305,4,51,360;Brooklyn,207,4,117,328;Manhattan,113,1,51,165;Queens,311,9,127,447;Staten_Island,57,0,16,73;'

In [19]:
# Break the parsed text apart at the ";" row separators and keep only the
# pieces that contain at least one digit, i.e. the rows that carry data
rows = [piece for piece in tblText.split(";") if re.search(r"[0-9]", piece) is not None]

In [20]:
# Display the split rows to check the result
rows

['0-17,1,0,0,1',
 '18-44,51,4,23,78',
 '45-64,254,9,70,333',
 '65-74,257,1,76,334',
 'Over_75,431,4,193,628',
 'Female,373,4,135,512',
 'Male,620,14,227,861',
 'Bronx,305,4,51,360',
 'Brooklyn,207,4,117,328',
 'Manhattan,113,1,51,165',
 'Queens,311,9,127,447',
 'Staten_Island,57,0,16,73']

In [21]:
# Convert text into a list of rows
# Each parsed row reads "category,n1,n2,n3,total".  The trailing total is
# discarded, the three remaining counts become ints, and a feature tag is
# appended according to the category label.
dftemp = []
for line in rows:
    fields = line.split(",")[:-1]
    fields[1:4] = [int(fields[pos]) for pos in (1, 2, 3)]
    label = fields[0]
    if label[0].isdigit() or label[0] == 'O':
        # Age groups start with a digit, or with 'O' for "Over_75"
        feature = 'age'
    elif label in ('Female', 'Male'):
        feature = 'sex'
    else:
        feature = 'borough'
    dftemp.append(fields + [feature])
   

In [22]:
# Transpose list of rows into higher normal form
# Every wide row (category, three counts, feature) is unpivoted into three
# [feature, category, type, count] records, one per death type.
deathtype = ['death_underlying','death_no_underlying','death_underlying_pending']
deathsdata = [
    [wide[-1], wide[0], kind, wide[pos]]
    for wide in dftemp
    for pos, kind in enumerate(deathtype, start=1)
]
   

In [23]:
# Wrap the death records in a dataframe, stamp every row with the report date,
# then display it
deathsdf = pd.DataFrame(
    deathsdata,
    columns=['feature','category','type','count'],
).assign(date=date(yr,mo,day))
deathsdf

Unnamed: 0,feature,category,type,count,date
0,age,0-17,death_underlying,1,2020-04-01
1,age,0-17,death_no_underlying,0,2020-04-01
2,age,0-17,death_underlying_pending,0,2020-04-01
3,age,18-44,death_underlying,51,2020-04-01
4,age,18-44,death_no_underlying,4,2020-04-01
5,age,18-44,death_underlying_pending,23,2020-04-01
6,age,45-64,death_underlying,254,2020-04-01
7,age,45-64,death_no_underlying,9,2020-04-01
8,age,45-64,death_underlying_pending,70,2020-04-01
9,age,65-74,death_underlying,257,2020-04-01


In [24]:
# Stack the cases and deaths frames into one dataframe with a fresh 0..n-1
# index, then display it
dailydf = pd.concat([casesdf, deathsdf]).reset_index(drop=True)
dailydf

Unnamed: 0,category,count,feature,type,date
0,0-17,816,age,cases,2020-04-01
1,18-44,18767,age,cases,2020-04-01
2,45-64,16104,age,cases,2020-04-01
3,65-74,5584,age,cases,2020-04-01
4,Over_75,4328,age,cases,2020-04-01
5,Female,20522,sex,cases,2020-04-01
6,Male,25128,sex,cases,2020-04-01
7,Bronx,8607,borough,cases,2020-04-01
8,Brooklyn,12274,borough,cases,2020-04-01
9,Manhattan,7022,borough,cases,2020-04-01


In [25]:
# Build the output CSV filename from the report date.
# The stamp is YYYYMMDD; month and day are left-padded with a zero to two
# digits so that filenames sort chronologically.
yrstr = str(yr)
mostr = str(mo).zfill(2)
daystr = str(day).zfill(2)
datestr = yrstr + mostr + daystr
fname = datestr + '-NYCHealth-Daily-COVID-data.csv'

In [26]:
# Write the combined cases and deaths dataframe to the dated CSV file,
# comma-separated and without the dataframe index column
dailydf.to_csv(fname,sep=",",index=False)