
# Outbreak Data Web Scrape

From: [outbreaks](http://www.phsa.ca/current-outbreaks)

---



In [70]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

***Northern***

In [1174]:
re = requests.get('https://www.northernhealth.ca/health-topics/current-outbreaks')
s = BeautifulSoup(re.text,'html5lib')
table =s.find_all('td')

In [1175]:
d = ['city', 'facility', 'outbreaktype', 'datedeclared']
outbreak = {}
for each,i in zip(table,d):
    outbreak[i] = each.text.strip()

In [1176]:
northern = pd.DataFrame(outbreak, index = [0])
northern['health_authority'] = "northern"

In [1177]:
northern.datedeclared = northern.datedeclared.str.replace('\n\n\t\t\t', '-')

In [1178]:
northern

Unnamed: 0,city,facility,outbreaktype,datedeclared,health_authority
0,None to date,,,,northern


***Interior Health***

In [1179]:
ih = requests.get('https://www.interiorhealth.ca/YourEnvironment/CommunicableDiseaseControl/Outbreaks/Pages/default.aspx')
ih = BeautifulSoup(ih.text,'html5lib')
outbreaks = ih.find_all('table')[7]

In [1180]:
outbreaks.find('td').text

'There are no items to show in this view of the "Outbreaks" list.'

***Vancouver Island***

In [1181]:
vi = requests.get('https://www.healthspace.ca/Clients/VIHA/VIHA_Website.nsf/Outbreak')
vi = BeautifulSoup(vi.text,'html5lib')
outbreaks = vi.find('td')

In [1182]:
outbreaks.getText()

' Currently, there are no reported outbreaks in VIHA hospitals or long term care facilities.'

***Vancouver Coastal Health***

In [2]:
from PyPDF2 import PdfFileReader
import io
import re

In [7]:
vch = requests.get('http://www.vch.ca/Documents/facility-outbreak-bulletin.pdf')

In [8]:
with io.BytesIO(vch.content) as f:
            pdf = PdfFileReader(f)
            information = pdf.getDocumentInfo()
            pages = pdf.getNumPages()
            for i in range(0, pages):
                page = pdf.getPage(i)
                page_content = page.extractText()           
                print(page_content)

Facility Outbreak Bulletin
This bulletin lists ongoing and recently ended outbreaks in licensed long-term and acute care
facilities throughout Vancouver Coastal Health, as of 01:00 PM, 06-Aug-2020
DISEASE
LOCATION
RESTRICTIONS
IMPOSED
RESTRICTIONS
LIFTED
FACILITY
COVID-19
9020 Bridgeport Road, Richmond
05-Aug-20
Richmond Lions Manor - Bridgeport, 2nd Floor
COVID-19
803 West 12th Avenue, Vancouver
06-Aug-20
Joseph & Rosalie Segal Family Health Centre, 8th 
Floor
COVID-19
1081 Burrard Street, Vancouver
16-Jul-20
28-Jul-20
St. Paul's Hospital, NICU
COVID-19
7801 Argyle St, Vancouver
09-Jun-20
Holy Family Hospital, LTCF (Rehabilitation Unit 
declared over)
Page 1 of 1
Red text denotes updates from previously issued bulletin
Grey text indicates that restrictions have been lifted
Restriction Imposed: 
Restrictions Lifted: 
               Date which outbreak measures were introduced
            Date which outbreak measures were discontinued



----

In [9]:
spl = page_content.split('\n')

In [10]:
spldf = pd.Series(spl)

In [30]:
spldf.iloc[10:29]

10                                             COVID-19
11                       9020 Bridgeport Road, Richmond
12                                            05-Aug-20
13         Richmond Lions Manor - Bridgeport, 2nd Floor
14                                             COVID-19
15                      803 West 12th Avenue, Vancouver
16                                            06-Aug-20
17    Joseph & Rosalie Segal Family Health Centre, 8th 
18                                                Floor
19                                             COVID-19
20                       1081 Burrard Street, Vancouver
21                                            16-Jul-20
22                                            28-Jul-20
23                            St. Paul's Hospital, NICU
24                                             COVID-19
25                            7801 Argyle St, Vancouver
26                                            09-Jun-20
27     Holy Family Hospital, LTCF (Rehabilitatio

In [97]:
vch_outbreaks = pd.DataFrame({'info' : spldf.iloc[11:29]})

In [98]:
vch_outbreaks.iloc[11] = vch_outbreaks.iloc[11:13].agg(' '.join)
vch_outbreaks.iloc[6] = vch_outbreaks.iloc[6:8].agg(' '.join)
vch_outbreaks.iloc[16] = vch_outbreaks.iloc[16:17].agg(' '.join)
vch_outbreaks.drop([vch_outbreaks.index[7] , vch_outbreaks.index[12], vch_outbreaks.index[17]], inplace = True)

In [99]:
# vch_outbreaks.reset_index()

In [100]:
vch_outbreaks = vch_outbreaks.groupby(vch_outbreaks.index // 5).agg('\n'.join)['info'].str.split('\n',expand=True)

In [101]:
vch_outbreaks.rename(columns = {0: 'address', 1: 'datedeclared/restrictionsimposed', 2:'facility', 3:'outbreaktype'}, inplace = True)

In [102]:
vch_outbreaks['health_authority'] = 'vancouver coastal health'

In [103]:
vch_outbreaks

Unnamed: 0,address,datedeclared/restrictionsimposed,facility,outbreaktype,health_authority
2,"9020 Bridgeport Road, Richmond",05-Aug-20,"Richmond Lions Manor - Bridgeport, 2nd Floor",COVID-19,vancouver coastal health
3,"803 West 12th Avenue, Vancouver",06-Aug-20,"Joseph & Rosalie Segal Family Health Centre, 8...",COVID-19,vancouver coastal health
4,"1081 Burrard Street, Vancouver",16-Jul-20,"28-Jul-20 St. Paul's Hospital, NICU",COVID-19,vancouver coastal health
5,"7801 Argyle St, Vancouver",09-Jun-20,"Holy Family Hospital, LTCF (Rehabilitation Unit",,vancouver coastal health


***Fraser***

In [71]:
f = requests.get(r'https://www.fraserhealth.ca/patients-and-visitors/current-outbreaks#.XzGwMS0ZNQI')
fraser = BeautifulSoup(f.text,'html5lib')
table = fraser.find_all('td')

In [72]:
newlist = []

for td in table:
        try:
            if td.find('h5').getText() == '\xa0':
                # if no info is provided, it will mess up indexing
                pass
            elif td.find('h5').getText() == '':
                # if no info is provided, it will mess up indexing
                pass
            else:    
                newlist.append(td.find('h5').getText())
        except AttributeError:
            pass
        
            
        
df = pd.DataFrame(newlist)

In [78]:
newlist

['July 31, 2020',
 'Dania Home',
 'Residential',
 'COVID-19',
 'August 4, 2020',
 'Maple Ridge Seniors Village',
 'Residential',
 'COVID-19',
 'August 6, 2020',
 'Derby Manor',
 'Residential',
 'COVID-19',
 'August 7, 2020',
 'George Derby Centre',
 'Residential',
 'COVID-19',
 'August 8, 2020',
 'New Vista Care Society',
 'Residential',
 'COVID-19']

In [93]:
fraser = df.groupby(df.index // 4).agg('\n'.join)[0].str.split('\n',expand=True)

In [94]:
fraser.rename(columns = {0:'datedeclared/restrictionsimposed', 1:'facility', 2:'facilitytype', 3:'outbreaktype'}, inplace = True)

In [95]:
fraser['health_authority'] = 'fraser'

In [96]:
fraser

Unnamed: 0,datedeclared/restrictionsimposed,facility,facilitytype,outbreaktype,health_authority
0,"July 31, 2020",Dania Home,Residential,COVID-19,fraser
1,"August 4, 2020",Maple Ridge Seniors Village,Residential,COVID-19,fraser
2,"August 6, 2020",Derby Manor,Residential,COVID-19,fraser
3,"August 7, 2020",George Derby Centre,Residential,COVID-19,fraser
4,"August 8, 2020",New Vista Care Society,Residential,COVID-19,fraser


### Merge facilities

In [113]:
# merge 2 includes scrapes up to aug 10
merge2 = pd.merge(fraser, vch_outbreaks, how = 'outer')

In [116]:
# add Northern outbreak at Terraceview Lodge from April 9, 2020
northern = pd.DataFrame({'facility':'Terraceview Lodge - Lakelse Unit',
              'outbreaktype': 'Respiratory Illness',
              'datedeclared/restrictionsimposed' : 'April 9, 2020-Declared over: April 13, 2020',
            'health_authority' : 'northern'}, index = [0])

aug10 = pd.merge(merge2, northern, how = 'outer')

In [117]:
# forward fill where applicable
aug10.facilitytype.ffill(inplace = True)
aug10.outbreaktype.ffill(inplace = True)

aug10

Unnamed: 0,datedeclared/restrictionsimposed,facility,facilitytype,outbreaktype,health_authority,address
0,"July 31, 2020",Dania Home,Residential,COVID-19,fraser,
1,"August 4, 2020",Maple Ridge Seniors Village,Residential,COVID-19,fraser,
2,"August 6, 2020",Derby Manor,Residential,COVID-19,fraser,
3,"August 7, 2020",George Derby Centre,Residential,COVID-19,fraser,
4,"August 8, 2020",New Vista Care Society,Residential,COVID-19,fraser,
5,05-Aug-20,"Richmond Lions Manor - Bridgeport, 2nd Floor",Residential,COVID-19,vancouver coastal health,"9020 Bridgeport Road, Richmond"
6,06-Aug-20,"Joseph & Rosalie Segal Family Health Centre, 8...",Residential,COVID-19,vancouver coastal health,"803 West 12th Avenue, Vancouver"
7,16-Jul-20,"28-Jul-20 St. Paul's Hospital, NICU",Residential,COVID-19,vancouver coastal health,"1081 Burrard Street, Vancouver"
8,09-Jun-20,"Holy Family Hospital, LTCF (Rehabilitation Unit",Residential,COVID-19,vancouver coastal health,"7801 Argyle St, Vancouver"
9,"April 9, 2020-Declared over: April 13, 2020",Terraceview Lodge - Lakelse Unit,Residential,Respiratory Illness,northern,


In [119]:
aug10.to_csv('../data/bc_ltc_scraped.csv')

# Caveats:

1. VCH outbreak status is colour coded and requires custom scripts for each new outbreak
2. All only report active outbreaks (no information on previous outbreaks)
3. VCH also reports hospital outbreaks - not just LTCs