In [1]:
import urllib3
import certifi
from bs4 import BeautifulSoup, element
import re
import openpyxl
import pickle
import json
import geopy
import requests
import pandas as pd
import numpy as np

# a few handy url generator funcitons
base_url = lambda start_num=1: "https://www.dss.virginia.gov/facility/search/alf.cgi?rm=Search;;Start={start_num}".format(start_num=start_num)
loc_url = lambda loc_id: "https://www.dss.virginia.gov/facility/search/alf.cgi?rm=Details;ID={loc_id}".format(loc_id=loc_id)
insp_url = lambda inspection_id, loc_id: "https://www.dss.virginia.gov/facility/search/alf.cgi?rm=Inspection;Inspection={inspection_id};ID={loc_id}".format(inspection_id=inspection_id, loc_id=loc_id)

http = urllib3.PoolManager(
    cert_reqs='CERT_REQUIRED',  # Force certificate check.
    ca_certs=certifi.where(),  # Path to the Certifi bundle.
)



In [2]:
### I couldn't get the google maps api to work. I created an api_key under my personal google account but it didn't seem to work
### I believe this was needed for some spatial analysis this project needed so not sure it's needed.

# Define geolocator from GooglemapsV3 api :: no key required :: output projection EPSG:3857 Spherical Mercator (Web Mercator)
#geolocator = geopy.geocoders.GoogleV3(api_key='AIzaSyDDWgZjAuas0i7U2BSpvGR8gn8KPMItLcE')
#domain='maps.googleapis.com', 

def get_page(url):
    '''Get all of our page data in a consistent fashion'''

    r = http.request('GET', url)
    # lxml is much better than stock python parser
    return BeautifulSoup(r.data, 'lxml')


In [3]:
def get_key(tag):
    '''a lot of time we will need to extract a key from a tag'''
    return tag.get_text().strip().strip(':').lower().replace(' ', '_').encode('ascii', 'ignore')

In [4]:
def get_loc_ids(start_num=1):
    '''Location id Generator function.
    This will automatically handeling paging of main id lookup, but you can skip ahead by passing in a number representing the ids place in global list, 1 based index'''

    done = False
    while not done:
        #print ('Fetching some location ids')
        soup = get_page(base_url(start_num))

        num_locs = int(re.search('\t(\d{1,9}) records', soup.find_all('table')[1].find_all('td')[1].text).group(1))

        ids = ([int(re.search(';ID=(\d{1,9});', a['href']).group(1)) for a in soup.find_all('table')[3].find_all('a')])

        for loc_id in ids:
            start_num += 1
            yield loc_id

        if start_num >= num_locs:
            done = True


In [5]:
def parse_loc(loc_id):
    '''Fetch detailed info for a single location based on id'''

    #print("Fetching info for location id =", loc_id)
    
    soup = get_page(loc_url(loc_id))

    location_info = {
        '_type': 'location_info',
        'id': loc_id
    }

    # big breakdowns go by tables
    basic_info, additional_info, inspection_info = soup.find_all('table')[:3]

    # first table has a bunch of data in fairly unstructured format
    name_and_address, city_zip, phone_number = basic_info.find_all('tr')

    location_info.update({
        'phone_number': phone_number.get_text().strip()
    })

    parsed_name_address = [line.strip() for line in name_and_address.get_text().split('\n') if line.strip()]
    location_info.update({
        'name': parsed_name_address[0]
    })
    
### Commented out Google API geolocator since API key didn't seem to work    
    # Get address for geolocator with city, state and without \n
    #gcode_address = city_zip.get_text().strip()#' '.join([' '.join(parsed_name_address[1:]), city_zip.get_text().strip()])

### Address needs to have name stripped from it.    
    location_info.update({'address': name_and_address.get_text().strip()})

    # there are a lot of additional info that follows the general format of <td>key</td><td>value</td>
    # but some need some extra parsing
    extra_parsing = {
        'ages': lambda ages: ages.replace('\t', '').replace('\n', ''),
        'inspector': lambda inspector_info: [line.strip() for line in inspector_info.split('\n') if line.strip()]
    }
    for row in additional_info.find_all('tr')[:-1]:
        key = get_key(row.find_all('td')[0])
        val = row.find_all('td')[1].get_text().strip()
        if key in extra_parsing:
            val = extra_parsing[key](val)

        location_info.update({key: val})

    if 'inspector' in location_info:
        location_info.update({
            'inspector_name': location_info['inspector'][0],
            'inspector_phone': location_info['inspector'][1]
        })
        del location_info['inspector']

    if inspection_info.table:
        inspection_ids = [int(re.search(';Inspection=(\d{1,6});', tag.a['href']).group(1)) for tag in inspection_info.table.find_all('tr')[1:]]
        location_info['inspections'] = [parse_inspection(insp_id, loc_id) for insp_id in inspection_ids]

    else:
        location_info['inspections'] = []
    
    return location_info



In [16]:
def parse_inspection(insp_id, loc_id):
    '''To get inspection data, you need to give the site both the inspection id and location id'''

    #print (" Fetching info for inspection id =", insp_id)

    soup = get_page(insp_url(insp_id, loc_id))

    inspection_info = {
        '_type': 'inspection_info',
        'id': insp_id,
        'loc_id': loc_id
    }

    # there is some redundant info about location, then some relevant stuff
    date, complaint = soup.find('div', id='main_content').find_all('p')[3:5]
    inspection_info.update({
        'date': date.get_text().split('\n')[5].strip(),
        'complaint': complaint.get_text().split('\n')[3].strip()
    })
    
   ## pull in areas_reviewed    
    areas_reviewed,comments=soup.find_all('table')[:2]
    inspection_info.update({
        'areas_reviewed': parse_areas_reviewed(areas_reviewed),
        'comments': parse_areas_reviewed(comments)
    })
    

  
    return inspection_info

In [17]:
# we will need a lot of specialized parsers
def parse_violations(violations):

    parsers = {
        'standard_#': lambda val: val.strip(),
        'description': lambda val: val.strip().replace('\r', '\n'),
        'complaint_related': lambda val: val.strip(),
        'action_to_be_taken': lambda val: val.strip().replace('\r', '\n')
    }
    line_num = 0
    violation_lines = violations.find_all('tr')
    violations_info = []
    violation_info = {}

    while line_num < len(violation_lines):

        if violation_lines[line_num].td is None:
            # there seems to be blank lines after 'complain_related' that don't have <td>s
            pass

        elif violation_lines[line_num].hr:
            violations_info.append(violation_info)
            violation_info = {}

        else:
            raw_key, val = violation_lines[line_num].get_text().split(':', 1)
            key = raw_key.strip().strip(':').lower().replace(' ', '_').encode('ascii', 'ignore')
            violation_info[key] = parsers[key](val)

        line_num += 1

    return violations_info


In [10]:
def parse_areas_reviewed(areas_reviewed):
    if areas_reviewed.br:
        return [areas_reviewed.br.previousSibling.strip()] + [foo.nextSibling.strip() for foo in areas_reviewed.find_all('br') if isinstance(foo.nextSibling, element.NavigableString)]

    return [areas_reviewed.td.text.strip()]

    parsers = {
        'areas_reviewed': parse_areas_reviewed,
        'technical_assistance': lambda technical_assistance: technical_assistance.get_text().strip(),
        'comments': lambda comments: comments.get_text().strip().replace('\r', '\n'),
        'violations': parse_violations
    }

    # also have a variable number of tables, id'd by <dt>s
    table_ids = [get_key(tag) for tag in soup.find_all('dt')]

    for key, tag in zip(table_ids, soup.find_all('table')[:len(table_ids)]):
        inspection_info[key] = parsers[key](tag)

    return inspection_info

In [11]:
## Get all location IDs
ids = list(get_loc_ids())
#df_ids = pd.DataFrame(ids,columns=["LocationIDs"])

In [12]:
## Testing top N location IDs to view data
top_n_idx = np.argsort(ids)[-10:]
top_n_values = [ids[i] for i in top_n_idx]
top_n_values

[47824, 47825, 47826, 47827, 47828, 47869, 47870, 47883, 47918, 48088]

In [30]:
## del df

## Create dataframe to append data to
df = pd.DataFrame()

In [31]:
loc_id = ids ## All facilities
    #top_n_values ## Sample of facilities 
for i in loc_id:
    parse_loc(i)
    df = df.append(parse_loc(i), ignore_index = True)


Fetching info for location id = 8364
 Fetching info for inspection id = 29026
 Fetching info for inspection id = 27827
 Fetching info for inspection id = 26908
 Fetching info for inspection id = 26257
 Fetching info for inspection id = 24888
Fetching info for location id = 8364
 Fetching info for inspection id = 29026
 Fetching info for inspection id = 27827
 Fetching info for inspection id = 26908
 Fetching info for inspection id = 26257
 Fetching info for inspection id = 24888
Fetching info for location id = 21384
 Fetching info for inspection id = 30175
 Fetching info for inspection id = 30029
 Fetching info for inspection id = 28403
 Fetching info for inspection id = 27808
 Fetching info for inspection id = 27422
 Fetching info for inspection id = 27137
 Fetching info for inspection id = 26678
 Fetching info for inspection id = 25792
 Fetching info for inspection id = 24587
Fetching info for location id = 21384
 Fetching info for inspection id = 30175
 Fetching info for inspection 

ValueError: not enough values to unpack (expected 2, got 1)

In [25]:
df
df.to_csv('dss.csv',index=False) 

In [None]:
## Used for testing specific locations/inspections
soup = get_page(insp_url(30548,47918))
#areas_reviewed = soup.find('div', id='main_content').find_all('b')[3:8]
#areas_reviewed,comments,violations =soup.find_all('table')[:2]
#areas_reviewed.find_all('br')
#test = parse_areas_reviewed(areas_reviewed)
#test2 = parse_areas_reviewed(comments)
#comments
#areas_reviewed = areas_reviewed.find_all('tr')
#test = areas_reviewed.get_text().strip()
#test = areas_reviewed.td.text.strip()
vio = soup.find('div', id='main_content').find_all('a name')
vio

In [None]:
def dump_locs_csv(loc_array, file_name='dss_virginia.csv'):
    field_names = [
        'id',
        'phone_number',
        'business_hours',
        'name',
        'locality',
        'inspector_phone',
        'address',
        'ages',
        'administrator',
        'longitude',
        'facility_type',
        'latitude',
        'expiration_date',
        'license_type',
        'capacity',
        'inspector_name',
        'fips'
    ]
    import csv
    with open(file_name, 'wb') as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=field_names, extrasaction='ignore')

        csv_writer.writeheader()

        for loc in loc_array:
            csv_writer.writerows(loc)

In [None]:
def dump_locs_xslx(loc_array, file_name= "dss_virginia.xlsx"):
    loc_field_order = ['id', 'name', 'facility_type', 'license_type', 'capacity', 'locality', 'ages', 'address', 'phone_number', 'fips', 'web_link']
    wb = openpyxl.Workbook()

    loc_ws = wb['Sheet']
    loc_ws.title = 'Location Information'

    for c, field in enumerate(loc_field_order):
        loc_ws.cell(row=1, column=c + 1).value = field

    for i, loc_info in enumerate(loc_array):
        for c, field in enumerate(loc_field_order):
            if field != 'web_link':
                loc_ws.cell(row=i + 2, column=c + 1).value = loc_info.get(field, None)
            else:
                loc_ws.cell(row=i + 2, column=c + 1).value = loc_url(loc_info['id'])

    wb.save(file_name)