# NYC Health COVID-19 Data Scraper
### March 30, 2020
### Matthew J. Beattie
### University of Oklahoma

Cases obtained from _https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf_

Deaths obtained from _https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths.pdf_

In [1]:
import pandas as pd
import re
import PyPDF2 as pdf
import tempfile
import urllib.request
from datetime import date

yr = 2020
mo = 4
day = 5

## Define Common Routines

In [2]:
"""
textfrompdf()
Reads in PDF file from NYC website or local file.  To read a local file, the user
must flag the readfrom variable as 'local'
"""
def textfrompdf(url,readfrom='online'):
    if readfrom != 'local':
        dataIn = urllib.request.urlopen(url).read()
    else:
        localf = open(url, 'rb')
        dataIn = localf.read()
        localf.close()

    fp = tempfile.TemporaryFile()

    # Write the pdf data to a temp file
    fp.write(dataIn)

    pdfReader = pdf.PdfFileReader(fp)
    pdfReader.getNumPages()

    # Get the first page
    return pdfReader.getPage(0).extractText()

## Read in NYC Cases Data

In [16]:
# Read in text from the PDF file and display
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-04052020-2.pdf'
localurl = 'covid-19-daily-data-summary.pdf'
casespage = textfrompdf(url,readfrom='online')

casespage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily \nData\n Summary\n  The data in this report reflect events and activities\n as of\n April \n5, 2020\n at 4:45 PM.  All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 Cases\n . Total Cases\n Total\n 64955 Median Age (Range)\n 50 (0\n-107) Age Group\n  -  0 to 17\n 1135 (2%)\n -  18 to 44\n 25383 (39%)\n -  45 to 64\n 23135 (36%)\n -  65 to 74\n 8260 (13%)\n -  75 and over\n 6905 (11%)\n -  Unknown\n 137 Age 50 and over\n  -  Yes\n 32802 (51%)\n -  No 32016 (49%)\n Sex\n  -  Female\n 29607 (46%)\n -  Male\n 35256 (54%)\n -  Unknown\n 92 Borough\n  -  Bronx\n 12738 (20%)\n -  Brooklyn\n 17520 (27%)\n -  Manhattan\n 9251 (14%)\n -  Queens\n 21781 (34%)\n -  Staten Island\n 3628 (6%)\n -  Unknown\n 37 Deaths\n 2472           \n \n  '

## Extract the COVID cases by Age

In [17]:
# Extract the values for the cases by Age.  Ignore Unknown Age. 
tblText = re.sub("Coronavirus.*Cases.*0 to 17", "0 to 17", casespage, flags=re.DOTALL)
tblText = re.sub("Deaths.*","", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("\(\d+%", "", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub("\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+", "", tblText, flags=re.DOTALL)
tblText = re.sub("0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub("18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub("45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub("65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub("75 and over", "Over_75", tblText, flags=re.DOTALL)


In [18]:
tblText

'0-17,1135;18-44,25383;45-64,23135;65-74,8260;Over_75,6905'

In [19]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [20]:
rows

['0-17,1135', '18-44,25383', '45-64,23135', '65-74,8260', 'Over_75,6905']

In [21]:
# Convert the rows of text into a list of lists
agedata = []
for row in rows:
    record = []
    field1 = re.search("^([^,])+", row).group()
    field2 = re.search("[^,]+$", row).group()
    record.append(field1)
    record.append(int(field2))
    agedata.append(record)

In [22]:
# Convert the list into a dataframe and display 
agedf = pd.DataFrame(agedata, columns=['category','count'])
agedf['feature']='age'
agedf['type']='cases'
agedf['date']=date(yr,mo,day)
agedf

Unnamed: 0,category,count,feature,type,date
0,0-17,1135,age,cases,2020-04-05
1,18-44,25383,age,cases,2020-04-05
2,45-64,23135,age,cases,2020-04-05
3,65-74,8260,age,cases,2020-04-05
4,Over_75,6905,age,cases,2020-04-05


## Extract the COVID cases by Sex

In [23]:
# Extract the values for the cases by Sex.  Ignore Unknown Sex. 
tblText = re.sub("Coronavirus.*Cases.*Female", "Female", casespage, flags=re.DOTALL)
tblText = re.sub("Deaths.*","", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("\(\d+%", "", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub("\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+", "", tblText, flags=re.DOTALL)

In [24]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [25]:
# Convert the rows into a list of lists
sexdata = []
for row in rows:
    record = []
    field1 = re.search("^([^,])+", row).group()
    field2 = re.search("[^,]+$", row).group()
    record.append(field1)
    record.append(int(field2))
    sexdata.append(record)

# Convert the list into a dataframe and display
sexdf = pd.DataFrame(sexdata, columns=['category','count'])
sexdf['feature']='sex'
sexdf['type']='cases'
sexdf['date']=date(yr,mo,day)
sexdf

Unnamed: 0,category,count,feature,type,date
0,Female,29607,sex,cases,2020-04-05
1,Male,35256,sex,cases,2020-04-05


## Extract the COVID cases by Borough

In [26]:
# Extract the values for the cases by Borough.  Ignore Unknown Borough. 
tblText = re.sub("Coronavirus.*Cases.*Bronx", "Bronx", casespage, flags=re.DOTALL)
tblText = re.sub("Deaths.*","", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("\(\d+%", "", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub("\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+", "", tblText, flags=re.DOTALL)
tblText = re.sub("Staten Island", "Staten_Island", tblText, flags=re.DOTALL)

# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [27]:
# Covert the rows of text into a list of lists
boroughdata = []
for row in rows:
    record = []
    field1 = re.search("^([^,])+", row).group()
    field2 = re.search("[^,]+$", row).group()
    record.append(field1)
    record.append(int(field2))
    boroughdata.append(record)

# Convert the list into a dataframe and display    
boroughdf = pd.DataFrame(boroughdata, columns=['category','count'])
boroughdf['feature']='borough'
boroughdf['type']='cases'
boroughdf['date']=date(yr,mo,day)

# Concatenate Age, Sex, and Borough Stats into a cases dataframe and display
casesdf = pd.concat([agedf, sexdf, boroughdf]).reset_index().drop(columns=['index'])
casesdf

Unnamed: 0,category,count,feature,type,date
0,0-17,1135,age,cases,2020-04-05
1,18-44,25383,age,cases,2020-04-05
2,45-64,23135,age,cases,2020-04-05
3,65-74,8260,age,cases,2020-04-05
4,Over_75,6905,age,cases,2020-04-05
5,Female,29607,sex,cases,2020-04-05
6,Male,35256,sex,cases,2020-04-05
7,Bronx,12738,borough,cases,2020-04-05
8,Brooklyn,17520,borough,cases,2020-04-05
9,Manhattan,9251,borough,cases,2020-04-05


## Read in NYC Deaths Data
Note that the deaths program is a bit more efficient.  We handle the entire table in one pass instead of breaking it into Age, Sex, and Borough.

In [31]:
# Read in text from PDF and show in raw format
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-04052020-2.pdf'
localurl = 'covid-19-daily-data-summary-deaths-200330.pdf'
deathspage = textfrompdf(url,readfrom='online')
deathspage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily \nData\n Summary\n  The data in this report reflect events and \nactivities\n as of\n April \n5, 2020\n at \n4:45 PM.  All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 Deaths\n . Underlying \nConditions\n1 No \nUnderlying \nConditions\n Underlying \nConditions \nPending\n Total\n Age Group\n     -  0 to 17\n 2 0 0 2 -  18 to 44\n 88 9 43 140 -  45 to 64\n 411 21 155 587 -  65 to 74\n 410 4 198 612 -  75 and over\n 624 4 503 1131 Sex\n     -  Female\n 573 7 340 920 -  Male\n 960 31 559 1550 -  Unknown\n 2 0 0 2 Borough\n     -  Bronx\n 532 9 86 627 -  Brooklyn\n 328 7 333 668 -  Manhattan\n 175 7 95 277 -  Queens\n 425 15 331 771 -  Staten Island\n 74 0 54 128 -  Unknown\n 1 0 0 1 Total\n 1535 38 899 2472  1Underlying illnesses include Diabetes, Lung Disease, Cancer, Immun

## Extract the COVID Deaths and Parse

In [32]:
# Parse the deathspage text into a set of records
tblText = re.sub("Coronavirus.*0 to 17", "0 to 17", deathspage, flags=re.DOTALL)
tblText = re.sub("Total.*","", tblText, flags=re.DOTALL)
tblText = re.sub("\n", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \-.+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub(" \-.+", "", tblText, flags=re.DOTALL)
tblText = re.sub(" Sex.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub(" Borough.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub("0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub("18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub("45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub("65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub("75 and over", "Over_75", tblText, flags=re.DOTALL)
tblText = re.sub(", ", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" ", ",", tblText, flags=re.DOTALL)
tblText = re.sub("Staten,Island", "Staten_Island", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.+?(?=B)", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)


In [33]:
# Display parsed text to check format
tblText

'0-17,2,0,0,2;18-44,88,9,43,140;45-64,411,21,155,587;65-74,410,4,198,612;Over_75,624,4,503,1131;Female,573,7,340,920;Male,960,31,559,1550;Bronx,532,9,86,627;Brooklyn,328,7,333,668;Manhattan,175,7,95,277;Queens,425,15,331,771;Staten_Island,74,0,54,128;'

In [34]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [35]:
rows

['0-17,2,0,0,2',
 '18-44,88,9,43,140',
 '45-64,411,21,155,587',
 '65-74,410,4,198,612',
 'Over_75,624,4,503,1131',
 'Female,573,7,340,920',
 'Male,960,31,559,1550',
 'Bronx,532,9,86,627',
 'Brooklyn,328,7,333,668',
 'Manhattan,175,7,95,277',
 'Queens,425,15,331,771',
 'Staten_Island,74,0,54,128']

In [36]:
# Convert text into a list of rows
dftemp = []
for row in rows:
    record = row.split(",")
    del record[-1]
    for i in range(1,4):
        record[i] = int(record[i])
    if (record[0][0].isdigit() or record[0][0]=='O'):
        record.append('age')
    elif (record[0]=='Female' or record[0]=='Male'):
        record.append('sex')
    else:
        record.append('borough')
    dftemp.append(record)
   

In [37]:
# Transpose list of rows into higher normal form
deathsdata = []
deathtype = ['death_underlying','death_no_underlying','death_underlying_pending']
for row in dftemp:
    for i in range(1,4):
        record = []
        record.append(row[-1])
        record.append(row[0])
        record.append(deathtype[i-1])
        record.append(row[i])
        deathsdata.append(record)
   

In [38]:
# Convert list into pandas dataframe and display
deathsdf = pd.DataFrame(deathsdata, columns=['feature','category','type','count'])
deathsdf['date']=date(yr,mo,day)
deathsdf

Unnamed: 0,feature,category,type,count,date
0,age,0-17,death_underlying,2,2020-04-05
1,age,0-17,death_no_underlying,0,2020-04-05
2,age,0-17,death_underlying_pending,0,2020-04-05
3,age,18-44,death_underlying,88,2020-04-05
4,age,18-44,death_no_underlying,9,2020-04-05
5,age,18-44,death_underlying_pending,43,2020-04-05
6,age,45-64,death_underlying,411,2020-04-05
7,age,45-64,death_no_underlying,21,2020-04-05
8,age,45-64,death_underlying_pending,155,2020-04-05
9,age,65-74,death_underlying,410,2020-04-05


In [39]:
# Concatenate cases and deaths dataframes and display
dailydf = pd.concat([casesdf, deathsdf]).reset_index().drop(columns=['index'])
dailydf

Unnamed: 0,category,count,feature,type,date
0,0-17,1135,age,cases,2020-04-05
1,18-44,25383,age,cases,2020-04-05
2,45-64,23135,age,cases,2020-04-05
3,65-74,8260,age,cases,2020-04-05
4,Over_75,6905,age,cases,2020-04-05
5,Female,29607,sex,cases,2020-04-05
6,Male,35256,sex,cases,2020-04-05
7,Bronx,12738,borough,cases,2020-04-05
8,Brooklyn,17520,borough,cases,2020-04-05
9,Manhattan,9251,borough,cases,2020-04-05


In [28]:
# Save the dataframe to a CSV file
#yrstr = str(yr)
#mostr = str(mo) if mo >= 10 else ('0'+str(mo))
#daystr = str(day) if day >= 10 else ('0'+str(day))
#datestr = yrstr + mostr + daystr
#fname = datestr + '-NYCHealth-Daily-COVID-data.csv'

In [29]:
#dailydf.to_csv(fname,sep=",",index=False)

## Read in NYC Hospitalization Data, Extract and Parse

In [40]:
# Read in text from PDF and show in raw format
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-hospitalizations-04032020-1.pdf'
localurl = 'covid-19-daily-data-summary-hospitals-200330.pdf'
hosppage = textfrompdf(url,readfrom='online')
hosppage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily \nData\n Summary\n  The data in this report reflect events and activities\n as of\n April \n3, 2020\n at \n4:30\n PM.\n  All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 \nHospitalizations\n  . Ever \nHospitalized \nCases\n1 Total\n Cases\n Age Group\n   -  0 to 17\n 92 (9%)\n 988 -  18 to 44\n 1962 (9%)\n 22434 -  45 to 64\n 4368 (22%)\n 19971 -  65 to 74\n 2572 (36%)\n 7064 -  75 and over\n 2745 (48%)\n 5709 -  Unknown\n 0 (0%)\n 123 Sex\n   -  Female\n 4765 (19%)\n 25536 -  Male\n 6970 (23%)\n 30672 -  Unknown\n 4 (5%)\n 81 Borough\n   -  Bronx\n 2924 (27%)\n 10766 -  Brooklyn\n 2826 (18%)\n 15327 -  Manhattan\n 1414 (17%)\n 8221 -  Queens\n 3889 (21%)\n 18823 -  Staten Island\n 677 (22%)\n 3117 -  Unknown\n 9 (26%)\n 35 Total\n 11739 (21%)\n 56289  1Percentages are ro

In [41]:
# Parse the deathspage text into a set of records
tblText = re.sub("Coronavirus.*0 to 17", "0 to 17", hosppage, flags=re.DOTALL)
tblText = re.sub("Total.*","", tblText, flags=re.DOTALL)
tblText = re.sub(" \([0-9]{1,3}%\)\n ", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \-.+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub(" Sex.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub(" Borough.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub("0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub("18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub("45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub("65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub("75 and over", "Over_75", tblText, flags=re.DOTALL)
tblText = re.sub("\n ", ",", tblText, flags=re.DOTALL)
tblText = re.sub("Staten Island", "Staten_Island", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.+?(?=F)", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.+?(?=B)", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)


In [42]:
tblText

'0-17,92,988;18-44,1962,22434;45-64,4368,19971;65-74,2572,7064;Over_75,2745,5709;Female,4765,25536;Male,6970,30672;Bronx,2924,10766;Brooklyn,2826,15327;Manhattan,1414,8221;Queens,3889,18823;Staten_Island,677,3117;'

In [43]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]
rows

['0-17,92,988',
 '18-44,1962,22434',
 '45-64,4368,19971',
 '65-74,2572,7064',
 'Over_75,2745,5709',
 'Female,4765,25536',
 'Male,6970,30672',
 'Bronx,2924,10766',
 'Brooklyn,2826,15327',
 'Manhattan,1414,8221',
 'Queens,3889,18823',
 'Staten_Island,677,3117']

In [44]:
# Convert text into a list of rows
dftemp = []
for row in rows:
    record = row.split(",")
    del record[-1]
    for i in range(1,1):
        record[i] = int(record[i])
    if (record[0][0].isdigit() or record[0][0]=='O'):
        record.append('age')
    elif (record[0]=='Female' or record[0]=='Male'):
        record.append('sex')
    else:
        record.append('borough')
    dftemp.append(record)
   

In [45]:
dftemp

[['0-17', '92', 'age'],
 ['18-44', '1962', 'age'],
 ['45-64', '4368', 'age'],
 ['65-74', '2572', 'age'],
 ['Over_75', '2745', 'age'],
 ['Female', '4765', 'sex'],
 ['Male', '6970', 'sex'],
 ['Bronx', '2924', 'borough'],
 ['Brooklyn', '2826', 'borough'],
 ['Manhattan', '1414', 'borough'],
 ['Queens', '3889', 'borough'],
 ['Staten_Island', '677', 'borough']]

In [46]:
# Convert list into pandas dataframe and display
hospdf = pd.DataFrame(dftemp, columns=['category','count','feature'])
hospdf['date']=date(yr,mo,day)
hospdf['type']='hospitalizations'
hospdf

Unnamed: 0,category,count,feature,date,type
0,0-17,92,age,2020-04-05,hospitalizations
1,18-44,1962,age,2020-04-05,hospitalizations
2,45-64,4368,age,2020-04-05,hospitalizations
3,65-74,2572,age,2020-04-05,hospitalizations
4,Over_75,2745,age,2020-04-05,hospitalizations
5,Female,4765,sex,2020-04-05,hospitalizations
6,Male,6970,sex,2020-04-05,hospitalizations
7,Bronx,2924,borough,2020-04-05,hospitalizations
8,Brooklyn,2826,borough,2020-04-05,hospitalizations
9,Manhattan,1414,borough,2020-04-05,hospitalizations


In [47]:
# Concatenate cases and deaths dataframes and display
dailydf = pd.concat([casesdf, deathsdf, hospdf]).reset_index().drop(columns=['index'])
dailydf

Unnamed: 0,category,count,feature,type,date
0,0-17,1135,age,cases,2020-04-05
1,18-44,25383,age,cases,2020-04-05
2,45-64,23135,age,cases,2020-04-05
3,65-74,8260,age,cases,2020-04-05
4,Over_75,6905,age,cases,2020-04-05
5,Female,29607,sex,cases,2020-04-05
6,Male,35256,sex,cases,2020-04-05
7,Bronx,12738,borough,cases,2020-04-05
8,Brooklyn,17520,borough,cases,2020-04-05
9,Manhattan,9251,borough,cases,2020-04-05


In [48]:
# Save the dataframe to a CSV file
yrstr = str(yr)
mostr = str(mo) if mo >= 10 else ('0'+str(mo))
daystr = str(day) if day >= 10 else ('0'+str(day))
datestr = yrstr + mostr + daystr
fname = datestr + '-NYCHealth-Daily-COVID-data.csv'

In [49]:
dailydf.to_csv(fname,sep=",",index=False)