# Alachua County restaurant inspection analysis
This will take restaurant inspection data by the state of Florida and format it in a more reader-friendly way for publication in print and online. We'll filter for the most egregious current violations at restaurants in Alachua County.

__After importing Pandas and Datetime, this reads in the state summary report year-to-date for District 5, which includes Alachua County, and adds an exception in case the file is not found (output probably needs to be set as variable so it can be written into the output file). The raw file has no headers and 82 columns. So this removes all but five columns and adds headers for those. Finally, it displays the first five rows of values.__

In [1]:
import pandas as pd
import bs4
import datetime 
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError

In [2]:
try:
    insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv", 
                               usecols=[2,14,18,80,81])
    
except IOError:
    print("The file is not accessible.")
insp.columns = ["CountyName", "InspectDate", "NumHighVio", "LicenseID", "VisitID"]

__This creates a DataFrame named 'alachua', filters out records for Alachua County:__

In [3]:
# filter for alachua county restaurants with high violations
insp[(insp.CountyName == 'Alachua') & (insp.NumHighVio > 0)]

Unnamed: 0,CountyName,InspectDate,NumHighVio,LicenseID,VisitID
5,Alachua,03/21/2018,1.0,5399007,6509808
6,Alachua,03/20/2018,2.0,5399007,6280880
24,Alachua,11/22/2017,1.0,6621480,6306950
26,Alachua,04/30/2018,1.0,6621480,6433038
30,Alachua,07/21/2017,1.0,6381936,6302632
31,Alachua,03/01/2018,1.0,6943776,6489116
32,Alachua,12/05/2017,5.0,6381936,6349967
36,Alachua,02/06/2018,1.0,3638712,6377421
38,Alachua,01/25/2018,1.0,6767936,6396972
39,Alachua,10/04/2017,1.0,6767936,6341203


In [4]:
# change date string to date object, sort most recent# chang 
insp['InspectDate'] = pd.to_datetime(insp['InspectDate'])
insp = insp.sort_values('InspectDate', ascending=False)

In [5]:
# prefer to let user set timedelta
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=14)
insp = insp[(insp['InspectDate'] > startDay) & (insp['InspectDate'] < today)]
len(insp)

141

In [6]:
# takes LicenseID and VisitID, passes it into the urls for detailed reports# takes 
result = []
for index, rows in insp.iterrows():
    visitID = rows['VisitID']
    licID = rows['LicenseID']
    urls = "https://www.myfloridalicense.com/inspectionDetail.asp? \
        InspVisitID= %s &licid= %s" % (visitID, licID)
    urls = urls.replace(' ', '')
    result.append(urls)

urlList = result
print(urlList) #not needed later
len(urlList) #not needed later

['https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6540174&licid=6017101', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6452493&licid=5874202', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6285785&licid=5707105', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6446676&licid=6523472', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6263363&licid=6545268', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6542587&licid=5369216', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6366387&licid=6748879', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6539293&licid=6089798', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6332727&licid=2130081', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6503829&licid=2205735', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6541810&licid=3680650'

141

In [7]:
urlList = ['https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6353619&licid=6683171', 'https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID=6452493&licid=5874202']
# loop through the url list to gather inspection details
detailsLib = {}
def get_inspect_detail(html):
    html = urlopen(urlList)
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    result = []
    for detail in details:
        siteName = details[0].text
        licNum = details[2].text
        siteRank = details[4].text
        expDate = details[6].text
        primeStatus = details[8].text
        secStatus = details[10].text
        siteAddress = details[12].text
        inspectResult = details[20].text
        
#       observed1 = details[34].get_text
#       observed2 = details[36].text
#       observed3 = details[38].text
#       observed4 = details[40].text
#       observed5 = details[42].text
#       observed6 = details[44].text
#       observed7 = details[46].text
#       observed8 = details[48].text
#       observed9 = details[50].text
#       observed10 = details[52].text

        detailsLib = {
            'Restaurant': siteName,
            'License': licNum,
            'Rank': siteRank,
            'Expires': expDate,
            'Primary': primeStatus,
            'Secondary': secStatus,
            'Address': siteAddress,
            'Result': inspectResult,
            
#            'Observed1': observed1,
#            'Observed2': observed2,
#            'Observed3': observed3,
#            'Observed4': observed4,
#            'Observed5': observed5,
#            'Observed6': observed6,
#            'Observed7': observed7,
#            'Observed8': observed8,
#            'Observed9': observed9,
#            'Observed10': observed10
        }
        result.append(detailsLib)
        
    return result

#all_results = []
for inspurl in urlList:
    html = urlopen(inspurl)
    #details = get_inspect_detail(url)
    #all_results.extend(details)

[]


In [None]:
print(detailsLib['Restaurant'])
street = address.split(' ')
print(detailsLib['Address'])

In [None]:
def clean_up(text, strip_chars=[], replace_extras={}):
    # Handle strip_chars
    strip_items = '|'.join(re.escape(s) for s in strip_chars)
    strip_re = r'^(?:{}|\s)+|(?:{}|\s)+$'.format(strip_items, strip_items)
    text = re.sub(strip_re, '', text, re.MULTILINE)

    # Normalize whitespace and handle replace_extras
    replace_keys = list(replace_extras.keys())
    replace_keys.sort(key=len, reverse=True)
    replace_re = '|'.join([re.escape(s) for s in replace_keys] + [r'\s+'])
    return re.sub(
        replace_re,
        lambda match: replace_extras.get(match.group(), ' '),
        text
    ).strip()

clean_up(str(details))
print(details)

In [None]:
def getNgrams(content, n):
    content = re.sub('\n|[[\d+\]]', ' ', content)
    content = bytes(content, 'UTF-8')
    content = content.decode('ascii', 'ignore')
    content = content.split(' ')
    content = [word for word in content if word != '']
    output = []
    for i in range(len(content)-n+1):
        output.append(content[i:i+n])
    return ouput

getNgrams(content, 2)

__Time for some Beautiful Soup.__