In [2]:
import urllib3
import certifi
from bs4 import BeautifulSoup, element
import re
import openpyxl
import pickle
import json
import geopy
import requests
import pandas as pd
import numpy as np

# a few handy url generator funcitons
base_url = lambda start_num=1: "https://www.dss.virginia.gov/facility/search/alf.cgi?rm=Search;;Start={start_num}".format(start_num=start_num)
loc_url = lambda loc_id: "https://www.dss.virginia.gov/facility/search/alf.cgi?rm=Details;ID={loc_id}".format(loc_id=loc_id)
insp_url = lambda inspection_id, loc_id: "https://www.dss.virginia.gov/facility/search/alf.cgi?rm=Inspection;Inspection={inspection_id};ID={loc_id}".format(inspection_id=inspection_id, loc_id=loc_id)

http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',  # Force certificate check.
    ca_certs=certifi.where(),  # Path to the Certifi bundle.
)



In [3]:
### I couldn't get the google maps api to work. I created an api_key under my personal google account but it didn't seem to work
### I believe this was needed for some spatial analysis this project needed so not sure it's needed.

# Define geolocator from GooglemapsV3 api :: no key required :: output projection EPSG:3857 Spherical Mercator (Web Mercator)
#geolocator = geopy.geocoders.GoogleV3(api_key='AIzaSyDDWgZjAuas0i7U2BSpvGR8gn8KPMItLcE')
#domain='maps.googleapis.com', 

def get_page(url):
    '''Get all of our page data in a consistent fashion'''

    r = http.request('GET', url)
    # lxml is much better than stock python parser
    return BeautifulSoup(r.data, 'lxml')


In [4]:
def get_key(tag):
    '''a lot of time we will need to extract a key from a tag'''
    return tag.get_text().strip().strip(':').lower().replace(' ', '_').encode('ascii', 'ignore')

In [5]:
def get_loc_ids(start_num=1):
    '''Location id Generator function.
    This will automatically handeling paging of main id lookup, but you can skip ahead by passing in a number representing the ids place in global list, 1 based index'''

    done = False
    while not done:
        print ('Fetching some location ids')
        soup = get_page(base_url(start_num))

        num_locs = int(re.search('\t(\d{1,9}) records', soup.find_all('table')[1].find_all('td')[1].text).group(1))

        ids = ([int(re.search(';ID=(\d{1,9});', a['href']).group(1)) for a in soup.find_all('table')[3].find_all('a')])

        for loc_id in ids:
            start_num += 1
            yield loc_id

        if start_num >= num_locs:
            done = True


In [432]:
encoding = 'utf-8'
def parse_loc(loc_id):
    '''Fetch detailed info for a single location based on id'''

    print("Fetching info for location id =", loc_id)
    
    soup = get_page(loc_url(loc_id))

    location_info = {
        '_type': 'location_info',
        'id': loc_id
    }

    # big breakdowns go by tables
    basic_info, additional_info, inspection_info = soup.find_all('table')[:3]

    # first table has a bunch of data in fairly unstructured format
    name_and_address, city_zip, phone_number = basic_info.find_all('tr')

    location_info.update({
        'phone_number': phone_number.get_text().strip()
    })

    parsed_name_address = [line.strip() for line in name_and_address.get_text().split('\n') if line.strip()]
    location_info.update({
        'name': parsed_name_address[0],
        'address' : parsed_name_address[1],
        'city_zip' : city_zip.get_text().strip()
    })
    
### Commented out Google API geolocator since API key didn't seem to work    
    # Get address for geolocator with city, state and without \n
    #gcode_address = city_zip.get_text().strip()#' '.join([' '.join(parsed_name_address[1:]), city_zip.get_text().strip()])

### Address needs to have name stripped from it.    
    #location_info.update({'address': name_and_address.get_text().strip()})

    # there are a lot of additional info that follows the general format of <td>key</td><td>value</td>
    # but some need some extra parsing
    extra_parsing = {
        'ages': lambda ages: ages.replace('\t', '').replace('\n', ''),
        'inspector': lambda inspector_info: [line.strip() for line in inspector_info.split('\n') if line.strip()]
    }
    for row in additional_info.find_all('tr')[:-1]:
        key = get_key(row.find_all('td')[0])
        key = str(key, encoding)
        val = row.find_all('td')[1].get_text().strip()
        if key in extra_parsing:
            val = extra_parsing[key](val)

        location_info.update({key: val})

    if 'inspector' in location_info:
        location_info.update({
            'inspector_name': location_info['inspector'][0],
            'inspector_phone': location_info['inspector'][1]
        })
        del location_info['inspector']

    if inspection_info.table:
        inspection_ids = [int(re.search(';Inspection=(\d{1,6});', tag.a['href']).group(1)) for tag in inspection_info.table.find_all('tr')[1:]]
        location_info['inspections'] = [parse_inspection(insp_id, loc_id) for insp_id in inspection_ids]

    else:
        location_info['inspections'] = []

    return location_info



In [435]:
def parse_inspection(insp_id, loc_id):
    '''To get inspection data, you need to give the site both the inspection id and location id'''

    print (" Fetching info for inspection id =", insp_id)

    soup = get_page(insp_url(insp_id, loc_id))

    inspection_info = {
        '_type': 'inspection_info',
        'id': insp_id,
        'loc_id': loc_id
    }

    # there is some redundant info about location, then some relevant stuff
    date, complaint = soup.find('div', id='main_content').find_all('p')[3:5]
    inspection_info.update({
        'date': date.get_text().split('\n')[5].strip(),
        'complaint': complaint.get_text().split('\n')[3].strip()
    })
    
    parsers = {
        'areas_reviewed': lambda areas_reviewed: areas_reviewed.get_text().strip(),
        'technical_assistance': lambda technical_assistance: technical_assistance.get_text().strip(),
        'comments': lambda comments: comments.get_text().strip().replace('\r', '\n'),
        'violations': parse_violations
    }

    # also have a variable number of tables, id'd by <dt>s
    table_ids = [get_key(tag) for tag in soup.find_all('dt')]

    for key, tag in zip(table_ids, soup.find_all('table')[:len(table_ids)]):
        key = str(key, encoding)
        inspection_info[key] = parsers[key](tag)

  
    return inspection_info

In [473]:
# we will need a lot of specialized parsers
encoding = 'utf-8'
def parse_violations(violations):

    parsers = {
        'standard_#': lambda val: val.strip(),
        'description': lambda val: val.strip().replace('\r', '\n'),
        'complaint_related': lambda val: val.strip(),
        'plan_of_correction': lambda val: val.strip().replace('\r', '\n')
    }
    line_num = 0
    violation_lines = violations.find_all('tr')
    violations_info = []
    violation_info = {}

    while line_num < len(violation_lines):

        if violation_lines[line_num].td is None:
            # there seems to be blank lines after 'complain_related' that don't have <td>s
            pass

        elif violation_lines[line_num].hr:
            violations_info.append(violation_info)
            violation_info = {}

        else:
            raw_key, val = violation_lines[line_num].get_text().split(':', 1)
            key = raw_key.strip().strip(':').lower().replace(' ', '_').encode('ascii', 'ignore')
            key = str(key, encoding)
            violation_info[key] = parsers[key](val)

        line_num += 1

    return violations_info


In [228]:
## Get all location IDs
ids = list(get_loc_ids())
#df_ids = pd.DataFrame(ids,columns=["LocationIDs"])

Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids
Fetching some location ids


In [229]:
## Testing top N location IDs to view data
top_n_idx = np.argsort(ids)[-10:]
top_n_values = [ids[i] for i in top_n_idx]
top_n_values

[47824, 47825, 47826, 47827, 47828, 47869, 47870, 47883, 47918, 48088]

In [474]:
del loc

## Create dataframe to append data to
loc = pd.DataFrame()

In [475]:
loc_id = top_n_values ## Sample of facilities #ids ## All facilities
    #top_n_values ## Sample of facilities 
for i in loc_id:
    parse_loc(i)
    loc = loc.append(parse_loc(i), ignore_index = True)


Fetching info for location id = 47824
 Fetching info for inspection id = 30331
Fetching info for location id = 47824
 Fetching info for inspection id = 30331
Fetching info for location id = 47825
 Fetching info for inspection id = 30333
Fetching info for location id = 47825
 Fetching info for inspection id = 30333
Fetching info for location id = 47826
 Fetching info for inspection id = 30334
Fetching info for location id = 47826
 Fetching info for inspection id = 30334
Fetching info for location id = 47827
 Fetching info for inspection id = 30478
 Fetching info for inspection id = 30335
Fetching info for location id = 47827
 Fetching info for inspection id = 30478
 Fetching info for inspection id = 30335
Fetching info for location id = 47828
 Fetching info for inspection id = 30664
Fetching info for location id = 47828
 Fetching info for inspection id = 30664
Fetching info for location id = 47869
 Fetching info for inspection id = 30722
Fetching info for location id = 47869
 Fetching i

In [476]:
loc

Unnamed: 0,_type,address,administrator,capacity,city_zip,expiration_date,facility_type,id,inspections,inspector_name,inspector_phone,license_type,name,phone_number,qualification,business_hours
0,location_info,10140 Hastings Drive,Mr. Mike Williams,25,"MANASSAS, VA 20110","Aug. 31, 2022",Assisted Living Facility,47824.0,"[{'_type': 'inspection_info', 'id': 30331, 'lo...",Laura Lunceford:,(540) 219-9264,Conditional,English Meadows Prince William Campus,No phone number available,Non-Ambulatory\n\n\nResidential and Assisted L...,
1,location_info,106 Westminster Drive,Ms. Micaela Gordon,24,"FRONT ROYAL, VA 22630","Aug. 31, 2022",Assisted Living Facility,47825.0,"[{'_type': 'inspection_info', 'id': 30333, 'lo...",Rhonda L Whitmer:,(540) 292-5932,Conditional,Lavender Hills Front Royal Campus,No phone number available,Residential and Assisted Living Care\n\n\nSpec...,"24 hour operation - 24 hour operation, \r\n\t..."
2,location_info,680 University Lane #200,Ms Terrika Neely,29,"ORANGE, VA 22960","Aug. 31, 2022",Assisted Living Facility,47826.0,"[{'_type': 'inspection_info', 'id': 30334, 'lo...",Rhonda L Whitmer:,(540) 292-5932,Conditional,Lavender Hills Orange Campus,(540) 661-3333,Residential and Assisted Living Care\n\n\nSpec...,"24 hour operation - 24 hour operation, \r\n\t..."
3,location_info,110 Spanish Oak Rd,Ms. Samatha Ait Keys,41,"STEPHENS CITY, VA 22655","Aug. 31, 2022",Assisted Living Facility,47827.0,"[{'_type': 'inspection_info', 'id': 30478, 'lo...",Rhonda L Whitmer:,(540) 292-5932,Conditional,English Meadows Stephens City Campus,(540) 868-0200,Residential and Assisted Living Care,"24 hour operation - 24 hour operation, \r\n\t..."
4,location_info,5100 Fillmore Avenue,Mr. Darnell Jenkins,215,"ALEXANDRIA, VA 22311","Oct. 31, 2022",Assisted Living Facility,47828.0,"[{'_type': 'inspection_info', 'id': 30664, 'lo...",Marshall G Massenberg:,(703) 431-4247,Conditional,The Fountains at Washington House,(703) 845-5100,Residential and Assisted Living Care\n\n\nNon-...,
5,location_info,301 Village Circle,Cirena West,30,"BRISTOL, VA 24201","Nov. 30, 2022",Assisted Living Facility,47869.0,"[{'_type': 'inspection_info', 'id': 30722, 'lo...",Crystal Mullins:,(276) 608-1067,Conditional,Memory Care at Bristol,(276) 477-5334,Residential and Assisted Living Care\n\n\nNon-...,
6,location_info,27468 Overbrook Dr.,Rachel Moore,34,"MEADOWVIEW, VA 24361","Oct. 31, 2022",Assisted Living Facility,47870.0,"[{'_type': 'inspection_info', 'id': 30566, 'lo...",Crystal Mullins:,(276) 608-1067,Conditional,"Hillcrest Residential Living, LLC",(276) 944-3150,Ambulatory Only\n\n\nResidential care only,24 hours/day-7days/week - 24 hours/day-7days/...
7,location_info,5113 Cavedo Lane,Letitia Beasley Wilder,8,"RICHMOND, VA 23231","Sept. 21, 2022",Assisted Living Facility,47883.0,"[{'_type': 'inspection_info', 'id': 30702, 'lo...",Belinda Dyson:,(804) 662-9780,Conditional,Continued Care II,(804) 447-9049,Residential care only\n\n\nAmbulatory Only,"24 hours per day , \r\n\tSunday-Saturday 7 day..."
8,location_info,15089 Harmony Hills Lane,Anna Laura Henderson,82,"ABINGDON, VA 24210","Oct. 14, 2022",Assisted Living Facility,47918.0,"[{'_type': 'inspection_info', 'id': 30548, 'lo...",Crystal Mullins:,(276) 608-1067,Conditional,English Meadows Abingdon Campus,(276) 619-4572,Residential and Assisted Living Care\n\n\nNon-...,
9,location_info,10521 Wylie Lane,Timothy Jones,7,"GLEN ALLEN, VA 23059","Oct. 24, 2022",Assisted Living Facility,48088.0,"[{'_type': 'inspection_info', 'id': 30591, 'lo...",Belinda Dyson:,(804) 662-9780,Conditional,Truu Life LLC,(804) 305-8708,Ambulatory Only\n\n\nResidential care only,"7 days per week 365 days per year , \r\n\tSund..."


In [477]:
## rename id to loc_id for merging with inspection data
loc = loc.rename({"id":"loc_id"},axis=1)

In [478]:
## convert float to int
loc['loc_id'] = loc['loc_id'].astype('int')

In [479]:
## Expand for each inspection
loc_insp = loc.explode('inspections').reset_index(drop=True)

In [480]:
## break out inspection column on its own to expand table to merge back to loc table
insp = loc_insp['inspections']
insp = insp.apply(pd.Series)

In [481]:
insp

Unnamed: 0,_type,id,loc_id,date,complaint,areas_reviewed,comments,technical_assistance,violations
0,inspection_info,30331,47824,"Jan. 28, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,"Date of Inspection: January 28, 2022Type of In...",,
1,inspection_info,30333,47825,"Jan. 27, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,An unannounced initial inspection was conducte...,The local health department will be contacted ...,
2,inspection_info,30334,47826,"Jan. 28, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,An unannounced initial inspection was conducte...,,
3,inspection_info,30478,47827,"March 15, 2022",No,22VAC40-73 RESIDENT CARE AND RELATED SERVICES,A non-mandated self-report-monitoring inspecti...,,[]
4,inspection_info,30335,47827,"Jan. 27, 2022",No,,An unannounced initial inspection was conducte...,The local health department will be contacted ...,
5,inspection_info,30664,47828,"April 29, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,An unannounced initial inspection was conducte...,Documentation was discussed with the provider.,
6,inspection_info,30722,47869,"April 18, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,The licensing inspector for Memory Care at Bri...,,"[{'standard_#': '22VAC40-73-710-C', 'descripti..."
7,inspection_info,30566,47870,"April 5, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,The licensing inspector for Hillcrest Resident...,,[]
8,inspection_info,30702,47883,"May 24, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,Type of inspection: Monitoring Date(s) of ins...,,
9,inspection_info,30429,47883,"March 22, 2022",No,22VAC40-73 ADMINISTRATION AND ADMINISTRATIVE S...,An initial announced on-site inspection was co...,,


In [482]:
insp_viol = insp.explode('violations').reset_index(drop=True)
insp_viol = insp_viol['violations']
insp_viol = insp_viol.apply(pd.Series)

In [483]:
insp_viol

Unnamed: 0,0,description,plan_of_correction,standard_#
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,Based on resident record review and observatio...,Resident #1 bed rails were removed from reside...,22VAC40-73-710-C
7,,,,
8,,,,
9,,,,


In [484]:
insp_viol = insp.join(insp_viol)

In [485]:
insp_viol.drop(insp_viol.columns[[0,8,9]] , axis=1, inplace = True)
insp_viol

Unnamed: 0,id,loc_id,date,complaint,areas_reviewed,comments,technical_assistance,description,plan_of_correction,standard_#
0,30331,47824,"Jan. 28, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,"Date of Inspection: January 28, 2022Type of In...",,,,
1,30333,47825,"Jan. 27, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,An unannounced initial inspection was conducte...,The local health department will be contacted ...,,,
2,30334,47826,"Jan. 28, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,An unannounced initial inspection was conducte...,,,,
3,30478,47827,"March 15, 2022",No,22VAC40-73 RESIDENT CARE AND RELATED SERVICES,A non-mandated self-report-monitoring inspecti...,,,,
4,30335,47827,"Jan. 27, 2022",No,,An unannounced initial inspection was conducte...,The local health department will be contacted ...,,,
5,30664,47828,"April 29, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,An unannounced initial inspection was conducte...,Documentation was discussed with the provider.,,,
6,30722,47869,"April 18, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,The licensing inspector for Memory Care at Bri...,,Based on resident record review and observatio...,Resident #1 bed rails were removed from reside...,22VAC40-73-710-C
7,30566,47870,"April 5, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,The licensing inspector for Hillcrest Resident...,,,,
8,30702,47883,"May 24, 2022",No,22VAC40-73 GENERAL PROVISIONS22VAC40-73 ADMINI...,Type of inspection: Monitoring Date(s) of ins...,,,,
9,30429,47883,"March 22, 2022",No,22VAC40-73 ADMINISTRATION AND ADMINISTRATIVE S...,An initial announced on-site inspection was co...,,,,


In [486]:
## merge inspection table back to loc table
final = loc.merge(insp_viol, on='loc_id', how='left')

In [489]:
final = final.drop(columns=['inspections'])

In [490]:
final.to_csv('dss.csv',index=False) 