# Alachua County restaurant inspection analysis
_This will take restaurant inspection data by the state of Florida and format it in a more reader-friendly way for publication in print and online. We'll filter for the most egregious current violations at restaurants in Alachua County._

After importing pandas, etc., this reads in the state's year-to-date summary report for District 5, which includes Alachua County, and handles the exception raised if the file is not found (the output probably needs to be stored in a variable so it can be written into the output file). The raw file has no headers and 82 columns, so this keeps only five of the columns and adds headers for those. Finally, it displays the first five rows of values.

In [1]:
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError


In [2]:
# Read the year-to-date District 5 summary report from the state FTP server.
# The raw file has no header row, so header=None keeps the first inspection
# from being swallowed as column names.
try:
    insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
                       header=None,
                       usecols=[2, 14, 18, 80, 81])
except IOError:
    print("The file is not accessible.")
    # Re-raise: without the data, the next statement would only fail with a
    # less helpful NameError on `insp`.
    raise
# The selected columns come back in file order, so these names line up with
# positions 2, 14, 18, 80 and 81.
insp.columns = ["CountyName", "InspectDate",
                "NumHighVio", "LicenseID", "VisitID"]

In [3]:
# keep only the inspections recorded for Alachua County
alachua = insp.loc[insp['CountyName'].eq('Alachua')]

In [4]:
# keep only the inspections that logged one or more high-priority violations
alachua = alachua.loc[alachua['NumHighVio'].gt(0)]

In [5]:
# Change the date strings to datetime values.  assign() builds a new frame
# rather than writing a column into the filtered slice, which avoids pandas'
# SettingWithCopyWarning.
alachua = alachua.assign(InspectDate=pd.to_datetime(alachua['InspectDate']))
# sort with the most recent inspections first
alachua = alachua.sort_values('InspectDate', ascending=False)

__The goal of this next step is to select a date range ending 'today'; for now the length of that range is hard-coded.__

In [6]:
# TODO: allow user input to set the look-back window; 'input' doesn't seem
# to work here, so the number of days is a named constant for now.
daysBack = 90
today = pd.to_datetime('today')
# Keep both bounds as pandas Timestamps (midnight, daysBack days ago, up to
# now) so they compare cleanly with the datetime64 InspectDate column; a plain
# datetime.date is not reliably comparable with it.
startDay = today.normalize() - pd.Timedelta(days=daysBack)
alachua = alachua[(alachua['InspectDate'] > startDay) & (alachua['InspectDate'] < today)]


In [7]:
# Build one detailed-report URL per remaining inspection from its VisitID
# and LicenseID, and collect them in order.
result = []
for index, rows in alachua.iterrows():
    visitID, licID = rows['VisitID'], rows['LicenseID']
    # The template pads each placeholder with spaces; every space is removed
    # again so the finished URL contains none.
    urls = (
        "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID= %s &licid= %s" % (visitID, licID)
    ).replace(' ', '')
    result.append(urls)

urlList = result
print(urlList)

['https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6533891&licid=7003533', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6362752&licid=6772721', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6538630&licid=7009925', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6517783&licid=6982298', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6539001&licid=6312191', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6444510&licid=6475603', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6292423&licid=6040839', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6463635&licid=6370423', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6451801&licid=5858850', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6535804&licid=3296677', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6535766&licid=5430232'

__Now it's time to use those urls to access the detailed reports with inspector comments, and scrape those. This will need to be a "for loop" that cycles through all urls and then puts the output variables somehow, like a database?__

In [None]:
def get_inspect_detail(urlList):
    """Scrape one detailed inspection report into a list of records.

    Despite its name, ``urlList`` is a single report URL: it is passed
    straight to ``urlopen``.

    Returns a list holding one dict with the keys 'Restaurant', 'License',
    'Rank', 'Expires', 'Primary', 'Secondary', 'Address', 'Result' and
    'Observed1' through 'Observed10'.  Returns an empty list when the page
    has no ``<font face="verdana">`` elements beyond the first ten.  A field
    whose position does not exist on the page is set to ''.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(urlList) as html:
        soup = bs4.BeautifulSoup(html.read(), 'lxml')
    # The first ten verdana <font> elements are skipped; every field below is
    # read by its position in the remaining list.
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    if not details:
        return []

    def text_at(position):
        # Pages with fewer elements than expected yield '' instead of raising
        # IndexError.
        return details[position].text if position < len(details) else ''

    # One record per page: every field is read from a fixed position, so
    # there is nothing to iterate over.
    detailsLib = {
        'Restaurant': text_at(0),
        'License': text_at(2),
        'Rank': text_at(4),
        'Expires': text_at(6),
        'Primary': text_at(8),
        'Secondary': text_at(10),
        'Address': text_at(12),
        'Result': text_at(20),
    }
    # Observed1..Observed10 sit at the even positions 34, 36, ..., 52.
    for number in range(1, 11):
        detailsLib['Observed%d' % number] = text_at(32 + 2 * number)

    return [detailsLib]

# Scrape every detailed report and pool the resulting records in one list.
all_results = []
for inspurl in urlList:
    try:
        details = get_inspect_detail(inspurl)
    except (HTTPError, IndexError) as err:
        # A report that fails to load, or that lacks the element positions
        # the scraper reads, should not abort the whole run.
        print("Skipping %s: %s" % (inspurl, err))
        continue
    all_results.extend(details)
    
  

#for 'Restaurant', siteName in detailsLib.items()
#        print(siteName)

In [None]:
# Show the name and address of every scraped record.  The records live in
# all_results; detailsLib exists only inside get_inspect_detail.
for record in all_results:
    print(record['Restaurant'])
    address = record['Address']
    street = address.split(' ')
    print(address)


In [None]:
# Fetch a single report (`urls` holds the last URL produced by the URL-building
# loop) and dump every verdana <font> element for inspection.
with urlopen(urls) as html:  # the response is closed once the page is parsed
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
# find_all is the current spelling of the legacy findAll alias.
details = soup.find_all('font', {'face': 'verdana'})
print(details)

# more of a regex job, but less dependent on the site layout

In [None]:
def clean_up(text, strip_chars=(), replace_extras=None):
    """Strip unwanted edge characters and normalize whitespace in ``text``.

    Args:
        text: The string to clean.
        strip_chars: Strings to remove, together with whitespace, from the
            start and end of every line.
        replace_extras: Optional mapping of substring -> replacement applied
            while whitespace is normalized; longer keys are tried first.

    Returns:
        The cleaned string: each run of whitespace collapsed to one space
        (unless ``replace_extras`` maps that exact run to something else)
        and leading/trailing whitespace removed.
    """
    replace_extras = {} if replace_extras is None else replace_extras

    # Whitespace is always part of the alternation, so an empty strip_chars
    # never produces an empty regex alternative.
    strip_items = '|'.join([re.escape(s) for s in strip_chars] + [r'\s'])
    strip_re = r'^(?:{0})+|(?:{0})+$'.format(strip_items)
    # flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, not `flags`.
    text = re.sub(strip_re, '', text, flags=re.MULTILINE)

    # Longest keys first so a shorter key cannot pre-empt a longer one that
    # starts at the same position.
    replace_keys = sorted(replace_extras, key=len, reverse=True)
    replace_re = '|'.join([re.escape(s) for s in replace_keys] + [r'\s+'])
    return re.sub(
        replace_re,
        # Anything matched that is not a replace_extras key is a whitespace
        # run, which becomes a single space.
        lambda match: replace_extras.get(match.group(), ' '),
        text
    ).strip()

# clean_up returns a new string and leaves `details` untouched, so keep the
# result and print that.
cleaned = clean_up(str(details))
print(cleaned)

In [None]:
def getNgrams(content, n):
    """Return every run of ``n`` consecutive words in ``content``.

    Newlines, square brackets, digits and '+' characters are replaced with
    spaces, non-ASCII characters are dropped, and the text is split on
    spaces.

    Args:
        content: The text to split into n-grams.
        n: Number of words per n-gram.

    Returns:
        A list of n-grams in reading order, each a list of ``n`` words; the
        list is empty when ``content`` has fewer than ``n`` words.
    """
    # NOTE(review): this character class removes every digit, bracket and
    # '+'.  If the goal was only to drop markers such as "[12]", the pattern
    # should be r'\[\d+\]' instead - confirm.
    content = re.sub(r'\n|[\[\]\d+]', ' ', content)
    # Round-trip through bytes to discard anything outside ASCII.
    content = content.encode('utf-8').decode('ascii', 'ignore')
    words = [word for word in content.split(' ') if word != '']
    return [words[i:i + n] for i in range(len(words) - n + 1)]

# NOTE(review): `content` is not assigned in any of the cells shown here, so a
# string must be bound to that name before this call - confirm which text
# was intended.
getNgrams(content, 2)
