# NYC Health COVID-19 Data Scraper
### March 30, 2020
### Matthew J. Beattie
### University of Oklahoma

Cases obtained from _https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary.pdf_

Deaths obtained from _https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths.pdf_

In [1]:
import pandas as pd
import re
import PyPDF2 as pdf
import tempfile
import urllib.request
from datetime import date

yr = 2020
mo = 5
day = 15

## Define Common Routines

In [2]:
"""
textfrompdf()
Reads in PDF file from NYC website or local file.  To read a local file, the user
must flag the readfrom variable as 'local'
"""
def textfrompdf(url,readfrom='online'):
    if readfrom != 'local':
        dataIn = urllib.request.urlopen(url).read()
    else:
        localf = open(url, 'rb')
        dataIn = localf.read()
        localf.close()

    fp = tempfile.TemporaryFile()

    # Write the pdf data to a temp file
    fp.write(dataIn)

    pdfReader = pdf.PdfFileReader(fp)
    pdfReader.getNumPages()

    # Get the first page
    return pdfReader.getPage(0).extractText()

## Read in NYC Cases Data

In [3]:
# Read in text from the PDF file and display
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-05152020-1.pdf'
localurl = 'covid-19-daily-data-summary.pdf'
casespage = textfrompdf(url,readfrom='online')

casespage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily Data Summary\n  The data in this report reflect events and activities\n as of\n May 1\n4, 2020\n at 6:00\n PM.  All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 Cases\n . Total Cases\n Total\n 187848 Median Age \n(Range)\n 51 (0\n-110) Age Group\n  -  0 to 17\n 4824 (3%)\n -  18 to 44\n 68835 (37%)\n -  45 to 64\n 68520 (37%)\n -  65 to 74\n 23370 (12%)\n -  75 and over\n 21887 (12%)\n -  Unknown\n 412 Age 50 and over\n  -  Yes\n 97808 (52%)\n -  No 89628 (48%)\n Sex\n  -  Female\n 90959 (48%)\n -  Male\n 96764 (52%)\n -  Unknown\n 125 Borough\n  -  Bronx\n 42656 (23%)\n -  Brooklyn\n 51131 (27%)\n -  Manhattan\n 23274 (12%)\n -  Queens\n 57865 (31%)\n -  Staten Island\n 12832 (7%)\n -  Unknown\n 90 Deaths\n  -  Confirmed\n 15422 (75%)\n -  Probable\n 5054 (25%)\n    '

## Extract the COVID cases by Age

In [4]:
# Extract the values for the cases by Age.  Ignore Unknown Age. 
tblText = re.sub("Coronavirus.*Cases.*0 to 17", "0 to 17", casespage, flags=re.DOTALL)
tblText = re.sub("Deaths.*","", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("\(\d+%", "", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub("\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+", "", tblText, flags=re.DOTALL)
tblText = re.sub("0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub("18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub("45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub("65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub("75 and over", "Over_75", tblText, flags=re.DOTALL)


In [5]:
tblText

'0-17,4824;18-44,68835;45-64,68520;65-74,23370;Over_75,21887'

In [6]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [7]:
rows

['0-17,4824', '18-44,68835', '45-64,68520', '65-74,23370', 'Over_75,21887']

In [8]:
# Convert the rows of text into a list of lists
agedata = []
for row in rows:
    record = []
    field1 = re.search("^([^,])+", row).group()
    field2 = re.search("[^,]+$", row).group()
    record.append(field1)
    record.append(int(field2))
    agedata.append(record)

In [9]:
# Convert the list into a dataframe and display 
agedf = pd.DataFrame(agedata, columns=['category','count'])
agedf['feature']='age'
agedf['type']='cases'
agedf['date']=date(yr,mo,day)
agedf

Unnamed: 0,category,count,feature,type,date
0,0-17,4824,age,cases,2020-05-15
1,18-44,68835,age,cases,2020-05-15
2,45-64,68520,age,cases,2020-05-15
3,65-74,23370,age,cases,2020-05-15
4,Over_75,21887,age,cases,2020-05-15


## Extract the COVID cases by Sex

In [10]:
# Extract the values for the cases by Sex.  Ignore Unknown Sex. 
tblText = re.sub("Coronavirus.*Cases.*Female", "Female", casespage, flags=re.DOTALL)
tblText = re.sub("Deaths.*","", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("\(\d+%", "", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub("\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+", "", tblText, flags=re.DOTALL)

In [11]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [12]:
# Convert the rows into a list of lists
sexdata = []
for row in rows:
    record = []
    field1 = re.search("^([^,])+", row).group()
    field2 = re.search("[^,]+$", row).group()
    record.append(field1)
    record.append(int(field2))
    sexdata.append(record)

# Convert the list into a dataframe and display
sexdf = pd.DataFrame(sexdata, columns=['category','count'])
sexdf['feature']='sex'
sexdf['type']='cases'
sexdf['date']=date(yr,mo,day)
sexdf

Unnamed: 0,category,count,feature,type,date
0,Female,90959,sex,cases,2020-05-15
1,Male,96764,sex,cases,2020-05-15


## Extract the COVID cases by Borough

In [13]:
# Extract the values for the cases by Borough.  Ignore Unknown Borough. 
tblText = re.sub("Coronavirus.*Cases.*Bronx", "Bronx", casespage, flags=re.DOTALL)
tblText = re.sub("Deaths.*","", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("\(\d+%", "", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub("\n.+?(?=[0-9])", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \).+", "", tblText, flags=re.DOTALL)
tblText = re.sub("Staten Island", "Staten_Island", tblText, flags=re.DOTALL)

# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [14]:
# Covert the rows of text into a list of lists
boroughdata = []
for row in rows:
    record = []
    field1 = re.search("^([^,])+", row).group()
    field2 = re.search("[^,]+$", row).group()
    record.append(field1)
    record.append(int(field2))
    boroughdata.append(record)

# Convert the list into a dataframe and display    
boroughdf = pd.DataFrame(boroughdata, columns=['category','count'])
boroughdf['feature']='borough'
boroughdf['type']='cases'
boroughdf['date']=date(yr,mo,day)

# Concatenate Age, Sex, and Borough Stats into a cases dataframe and display
casesdf = pd.concat([agedf, sexdf, boroughdf]).reset_index().drop(columns=['index'])
casesdf

Unnamed: 0,category,count,feature,type,date
0,0-17,4824,age,cases,2020-05-15
1,18-44,68835,age,cases,2020-05-15
2,45-64,68520,age,cases,2020-05-15
3,65-74,23370,age,cases,2020-05-15
4,Over_75,21887,age,cases,2020-05-15
5,Female,90959,sex,cases,2020-05-15
6,Male,96764,sex,cases,2020-05-15
7,Bronx,42656,borough,cases,2020-05-15
8,Brooklyn,51131,borough,cases,2020-05-15
9,Manhattan,23274,borough,cases,2020-05-15


## Read in NYC Deaths Data
Note that the deaths program is a bit more efficient.  We handle the entire table in one pass instead of breaking it into Age, Sex, and Borough.

In [15]:
# Read in text from PDF and show in raw format
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-deaths-05152020-1.pdf'
localurl = 'covid-19-daily-data-summary-deaths-200330.pdf'
deathspage = textfrompdf(url,readfrom='online')
deathspage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily Data Summary\n  The data in this report reflect events and activities\n as of\n May 1\n4, 2020\n at \n6:00\n PM. All data in this report are preliminary and subject to change as cases continue to be investigated. \n These data include cases in NYC residents and foreign residents treated in NYC facilities.\n This table shows only confirmed deaths\n. A death is considered co\nnfirmed when the person \nhad a positive COVID\n-19 laboratory test.\n   NYC COVID\n-19 Deaths Among Confirmed Cases\n . Underlying \nConditions\n1  No \nUnderlying \nConditions \n Underlying \nConditions \nUnknown \n Total \n Age Group\n     -  0 to 17\n 7 1 0 8 -  18 to 44\n 485 19 106 610 -  45 to 64\n 2957 74 418 3449 -  65 to 74\n 3085 3 753 3841 -  75 and over\n 5818 2 1691 7511 -  Unknown\n 1 0 2 3 Sex\n     -  Female\n 4904 11 1129 6044 -  Male\n 7436 88 1837 9361 -  Unknown\n 13 0 4 17 Borough\n     -  Bronx\n 2991 16 286 3293 -  Brooklyn\n 3640 25 1040 4705 

## Extract the COVID Deaths and Parse

In [16]:
# Parse the deathspage text into a set of records
tblText = re.sub("Coronavirus.*0 to 17", "0 to 17", deathspage, flags=re.DOTALL)
tblText = re.sub("Total.*","", tblText, flags=re.DOTALL)
tblText = re.sub("\n", ",", tblText, flags=re.DOTALL)
tblText = re.sub("0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub("18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub("45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub("65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub("75 and over", "Over_75", tblText, flags=re.DOTALL)
tblText = re.sub(" -.+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub(", ", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" ", ",", tblText, flags=re.DOTALL)
tblText = re.sub(",Sex.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub(",Borough.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.+?(?=[A-Z])", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)
tblText = re.sub("Staten,Island", "Staten_Island", tblText, flags=re.DOTALL)


In [17]:
# Display parsed text to check format
tblText

'0-17,7,1,0,8;18-44,485,19,106,610;45-64,2957,74,418,3449;65-74,3085,3,753,3841;Over_75,5818,2,1691,7511;Female,4904,11,1129,6044;Male,7436,88,1837,9361;Bronx,2991,16,286,3293;Brooklyn,3640,25,1040,4705;Manhattan,1561,8,488,2057;Queens,3625,50,935,4610;Staten_Island,530,0,218,748;'

In [18]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]

In [19]:
rows

['0-17,7,1,0,8',
 '18-44,485,19,106,610',
 '45-64,2957,74,418,3449',
 '65-74,3085,3,753,3841',
 'Over_75,5818,2,1691,7511',
 'Female,4904,11,1129,6044',
 'Male,7436,88,1837,9361',
 'Bronx,2991,16,286,3293',
 'Brooklyn,3640,25,1040,4705',
 'Manhattan,1561,8,488,2057',
 'Queens,3625,50,935,4610',
 'Staten_Island,530,0,218,748']

In [20]:
# Convert text into a list of rows
dftemp = []
for row in rows:
    record = row.split(",")
    del record[-1]
    for i in range(1,4):
        record[i] = int(record[i])
    if (record[0][0].isdigit() or record[0][0]=='O'):
        record.append('age')
    elif (record[0]=='Female' or record[0]=='Male'):
        record.append('sex')
    else:
        record.append('borough')
    dftemp.append(record)
   

In [21]:
# Transpose list of rows into higher normal form
deathsdata = []
deathtype = ['death_underlying','death_no_underlying','death_underlying_pending']
for row in dftemp:
    for i in range(1,4):
        record = []
        record.append(row[-1])
        record.append(row[0])
        record.append(deathtype[i-1])
        record.append(row[i])
        deathsdata.append(record)
   

In [22]:
# Convert list into pandas dataframe and display
deathsdf = pd.DataFrame(deathsdata, columns=['feature','category','type','count'])
deathsdf['date']=date(yr,mo,day)
deathsdf

Unnamed: 0,feature,category,type,count,date
0,age,0-17,death_underlying,7,2020-05-15
1,age,0-17,death_no_underlying,1,2020-05-15
2,age,0-17,death_underlying_pending,0,2020-05-15
3,age,18-44,death_underlying,485,2020-05-15
4,age,18-44,death_no_underlying,19,2020-05-15
5,age,18-44,death_underlying_pending,106,2020-05-15
6,age,45-64,death_underlying,2957,2020-05-15
7,age,45-64,death_no_underlying,74,2020-05-15
8,age,45-64,death_underlying_pending,418,2020-05-15
9,age,65-74,death_underlying,3085,2020-05-15


In [23]:
# Concatenate cases and deaths dataframes and display
dailydf = pd.concat([casesdf, deathsdf]).reset_index().drop(columns=['index'])
dailydf

Unnamed: 0,category,count,feature,type,date
0,0-17,4824,age,cases,2020-05-15
1,18-44,68835,age,cases,2020-05-15
2,45-64,68520,age,cases,2020-05-15
3,65-74,23370,age,cases,2020-05-15
4,Over_75,21887,age,cases,2020-05-15
5,Female,90959,sex,cases,2020-05-15
6,Male,96764,sex,cases,2020-05-15
7,Bronx,42656,borough,cases,2020-05-15
8,Brooklyn,51131,borough,cases,2020-05-15
9,Manhattan,23274,borough,cases,2020-05-15


In [28]:
# Save the dataframe to a CSV file
#yrstr = str(yr)
#mostr = str(mo) if mo >= 10 else ('0'+str(mo))
#daystr = str(day) if day >= 10 else ('0'+str(day))
#datestr = yrstr + mostr + daystr
#fname = datestr + '-NYCHealth-Daily-COVID-data.csv'

In [29]:
#dailydf.to_csv(fname,sep=",",index=False)

## Read in NYC Hospitalization Data, Extract and Parse

In [24]:
# Read in text from PDF and show in raw format
url = 'https://www1.nyc.gov/assets/doh/downloads/pdf/imm/covid-19-daily-data-summary-hospitalizations-05152020-1.pdf'
localurl = 'covid-19-daily-data-summary-hospitals-200330.pdf'
hosppage = textfrompdf(url,readfrom='online')
hosppage

'Coronavirus Disease 2019 (COVID\n-19) \n  Daily Data Summary\n  The data in this report reflect events and activities\n as of\n May 1\n4, 2020 at \n6:00\n PM. \n All data in this report are preliminary and subject to change as cases continue to be investigated. \n These\n data \ninclude \ncases in NYC residents and foreign residents treated in NYC facilities\n.  NYC COVID\n-19 Hospitalizations Among Confirmed Cases\n  . Ever \n Hospitalized Cases\n1 All Cases\n Age Group\n   -  0 to 17\n 414 (9%)\n 4824 -  18 to 44\n 7496 (11%)\n 68835 -  45 to 64\n 16854 (25%)\n 68520 -  65 to 74\n 11091 (47%)\n 23370 -  75 and over\n 13720 (63%)\n 21887 -  Unknown\n 5 (1%)\n 412 Sex\n   -  Female\n 21296 (23%)\n 90959 -  Male\n 28256 (29%)\n 96764 -  Unknown\n 28 (22%)\n 125 Borough\n   -  Bronx\n 11223 (26%)\n 42656 -  Brooklyn\n 13253 (26%)\n 51131 -  Manhattan\n 7251 (31%)\n 23274 -  Queens\n 15656 (27%)\n 57865 -  Staten Island\n 2174 (17%)\n 12832 -  Unknown\n 23 (26%)\n 90 Total\n 49580 (26%)\

In [25]:
# Parse the deathspage text into a set of records
tblText = re.sub("Coronavirus.*0 to 17", "0 to 17", hosppage, flags=re.DOTALL)
tblText = re.sub("Total.*","", tblText, flags=re.DOTALL)
tblText = re.sub("\n ", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" \([0-9]{1,3}%\),", ",", tblText, flags=re.DOTALL)
tblText = re.sub(" -.+?(?=[a-zA-Z0-9])", ";", tblText, flags=re.DOTALL)
tblText = re.sub(" Sex.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub(" Borough.+?(?=;)", "", tblText, flags=re.DOTALL)
tblText = re.sub("0 to 17", "0-17", tblText, flags=re.DOTALL)
tblText = re.sub("18 to 44", "18-44", tblText, flags=re.DOTALL)
tblText = re.sub("45 to 64", "45-64", tblText, flags=re.DOTALL)
tblText = re.sub("65 to 74", "65-74", tblText, flags=re.DOTALL)
tblText = re.sub("75 and over", "Over_75", tblText, flags=re.DOTALL)
tblText = re.sub(" \n", "_", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.+?(?=F)", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.+?(?=B)", "", tblText, flags=re.DOTALL)
tblText = re.sub("Unknown.*", "", tblText, flags=re.DOTALL)


In [26]:
tblText

'0-17,414,4824;18-44,7496,68835;45-64,16854,68520;65-74,11091,23370;Over_75,13720,21887;Female,21296,90959;Male,28256,96764;Bronx,11223,42656;Brooklyn,13253,51131;Manhattan,7251,23274;Queens,15656,57865;Staten Island,2174,12832;'

In [27]:
# Split the extracted text into rows and remove any rows that do not contain data
rows = re.split(";", tblText)
rows = [row for row in rows if re.search(r"[0-9]",row)!=None]
rows

['0-17,414,4824',
 '18-44,7496,68835',
 '45-64,16854,68520',
 '65-74,11091,23370',
 'Over_75,13720,21887',
 'Female,21296,90959',
 'Male,28256,96764',
 'Bronx,11223,42656',
 'Brooklyn,13253,51131',
 'Manhattan,7251,23274',
 'Queens,15656,57865',
 'Staten Island,2174,12832']

In [28]:
# Convert text into a list of rows
dftemp = []
for row in rows:
    record = row.split(",")
    del record[-1]
    for i in range(1,1):
        record[i] = int(record[i])
    if (record[0][0].isdigit() or record[0][0]=='O'):
        record.append('age')
    elif (record[0]=='Female' or record[0]=='Male'):
        record.append('sex')
    else:
        record.append('borough')
    dftemp.append(record)
   

In [29]:
dftemp

[['0-17', '414', 'age'],
 ['18-44', '7496', 'age'],
 ['45-64', '16854', 'age'],
 ['65-74', '11091', 'age'],
 ['Over_75', '13720', 'age'],
 ['Female', '21296', 'sex'],
 ['Male', '28256', 'sex'],
 ['Bronx', '11223', 'borough'],
 ['Brooklyn', '13253', 'borough'],
 ['Manhattan', '7251', 'borough'],
 ['Queens', '15656', 'borough'],
 ['Staten Island', '2174', 'borough']]

In [30]:
# Convert list into pandas dataframe and display
hospdf = pd.DataFrame(dftemp, columns=['category','count','feature'])
hospdf['date']=date(yr,mo,day)
hospdf['type']='hospitalizations'
hospdf

Unnamed: 0,category,count,feature,date,type
0,0-17,414,age,2020-05-15,hospitalizations
1,18-44,7496,age,2020-05-15,hospitalizations
2,45-64,16854,age,2020-05-15,hospitalizations
3,65-74,11091,age,2020-05-15,hospitalizations
4,Over_75,13720,age,2020-05-15,hospitalizations
5,Female,21296,sex,2020-05-15,hospitalizations
6,Male,28256,sex,2020-05-15,hospitalizations
7,Bronx,11223,borough,2020-05-15,hospitalizations
8,Brooklyn,13253,borough,2020-05-15,hospitalizations
9,Manhattan,7251,borough,2020-05-15,hospitalizations


In [31]:
# Concatenate cases and deaths dataframes and display
dailydf = pd.concat([casesdf, deathsdf, hospdf]).reset_index().drop(columns=['index'])
dailydf

Unnamed: 0,category,count,feature,type,date
0,0-17,4824,age,cases,2020-05-15
1,18-44,68835,age,cases,2020-05-15
2,45-64,68520,age,cases,2020-05-15
3,65-74,23370,age,cases,2020-05-15
4,Over_75,21887,age,cases,2020-05-15
5,Female,90959,sex,cases,2020-05-15
6,Male,96764,sex,cases,2020-05-15
7,Bronx,42656,borough,cases,2020-05-15
8,Brooklyn,51131,borough,cases,2020-05-15
9,Manhattan,23274,borough,cases,2020-05-15


In [32]:
# Save the dataframe to a CSV file
yrstr = str(yr)
mostr = str(mo) if mo >= 10 else ('0'+str(mo))
daystr = str(day) if day >= 10 else ('0'+str(day))
datestr = yrstr + mostr + daystr
fname = datestr + '-NYCHealth-Daily-COVID-data.csv'

In [33]:
dailydf.to_csv(fname,sep=",",index=False)