# Data

In [287]:
import pandas as pd
import json as json
import math
import random
import pprint

# Directories (relative to the notebook's working directory) holding the
# input CSV files and receiving the generated JSON files.
MOBILE_FOOD_SCHEDULE = "../data/mobile_food_schedule/"
RESTAURANT_SCORES = "../data/restaurant_scores/SFFoodProgram_Complete_Data/"
OUTPUT_PATH = "../data/json/"

# The mobile food schedule is read with pandas' default encoding.
mobile_food_data_df = pd.read_csv(MOBILE_FOOD_SCHEDULE + "Mobile_Food_Schedule.csv")

# The three restaurant score tables are all read as latin-1.
businesses_plus_df, inspections_plus_df, violations_plus_df = (
    pd.read_csv(RESTAURANT_SCORES + csv_name, encoding='latin-1')
    for csv_name in ("businesses_plus.csv", "inspections_plus.csv", "violations_plus.csv")
)

In [288]:
# Create dict from violation, inspection, and business data
# @key: a business id
# @value: dict with three entries:
#     "business_data":   dict of column -> value for the business row
#     "inspection_data": list of dicts (column -> value), one per inspection row
#     "violation_data":  list of dicts (column -> value), one per violation row
#
# Missing values (float NaN) are stored as None.
# print all_restaurant_data[538] for an example
def _row_to_record(row, columns):
    """Return a plain dict of column -> value for one DataFrame row.

    Float NaN cells become None. Text values are stored untouched: json.dump
    serialises text directly, whereas calling .encode('utf-8') on them yields
    bytes that json.dump rejects on Python 3 and, for Python 2 byte strings,
    forces an implicit ASCII decode that fails on non-ASCII data.
    """
    record = {}
    for col in columns:
        value = row[col]
        # isinstance (rather than an exact type match) also catches float
        # subclasses such as numpy.float64.
        if isinstance(value, float) and math.isnan(value):
            value = None
        record[col] = value
    return record


def _restaurant_entry(restaurant_data, business_id):
    """Return the entry for business_id, creating an empty one if absent."""
    return restaurant_data.setdefault(
        business_id,
        {"business_data": {}, "inspection_data": [], "violation_data": []})


all_restaurant_data = {}

# add business data (a later row for the same business id overwrites the
# columns written by an earlier one)
business_columns = businesses_plus_df.columns.values.tolist()
for index, row in businesses_plus_df.iterrows():
    entry = _restaurant_entry(all_restaurant_data, row["business_id"])
    entry["business_data"].update(_row_to_record(row, business_columns))

# add inspection data
inspection_columns = inspections_plus_df.columns.values.tolist()
for index, row in inspections_plus_df.iterrows():
    entry = _restaurant_entry(all_restaurant_data, row["business_id"])
    entry["inspection_data"].append(_row_to_record(row, inspection_columns))

# add violation data
violation_columns = violations_plus_df.columns.values.tolist()
for index, row in violations_plus_df.iterrows():
    entry = _restaurant_entry(all_restaurant_data, row["business_id"])
    entry["violation_data"].append(_row_to_record(row, violation_columns))

In [289]:
# Convert the mobile food schedule into a list of plain dicts, one per row
# (column -> value). Missing values (float NaN) become None so that json.dump
# writes null rather than the non-standard NaN token.
mobile_food_columns = mobile_food_data_df.columns.values.tolist()
mobile_food_data = []
for index, row in mobile_food_data_df.iterrows():
    obj = {}
    for col in mobile_food_columns:
        datum = row[col]
        # isinstance (rather than an exact type match) also catches float
        # subclasses such as numpy.float64.
        if isinstance(datum, float) and math.isnan(datum):
            datum = None
        obj[col] = datum
    mobile_food_data.append(obj)

In [290]:
# Neighborhood names taken from the choropleth map.
neighborhoods = [
    "Seacliff", "Lake Street", "Presidio National Park", "Presidio Terrace",
    "Inner Richmond", "Sutro Heights", "Lincoln Park / Ft. Miley",
    "Outer Richmond", "Golden Gate Park", "Presidio Heights",
    "Laurel Heights / Jordan Park", "Lone Mountain", "Anza Vista",
    "Cow Hollow", "Union Street", "Marina", "Telegraph Hill",
    "Downtown / Union Square", "Tenderloin", "Civic Center", "Hayes Valley",
    "Alamo Square", "Panhandle", "Haight Ashbury", "Lower Haight",
    "Mint Hill", "Duboce Triangle", "Cole Valley", "Rincon Hill",
    "South Beach", "South of Market", "Showplace Square", "Mission Bay",
    "Yerba Buena Island", "Treasure Island", "Mission Dolores", "Castro",
    "Outer Sunset", "Parkside", "Stonestown", "Parkmerced", "Lakeshore",
    "Golden Gate Heights", "Forest Hill", "West Portal", "Clarendon Heights",
    "Midtown Terrace", "Laguna Honda", "Upper Market", "Dolores Heights",
    "Mission", "Potrero Hill", "Dogpatch", "Central Waterfront",
    "Diamond Heights", "Fairmount", "Peralta Heights", "Holly Park",
    "Merced Manor", "Balboa Terrace", "Ingleside", "Merced Heights",
    "Outer Mission", "Ingleside Terraces", "Mt. Davidson Manor",
    "Monterey Heights", "Westwood Highlands", "Westwood Park",
    "Miraloma Park", "Crocker Amazon", "McLaren Park", "Sunnydale",
    "Visitacion Valley", "India Basin", "Hunters Point",
    "Candlestick Point SRA", "Northern Waterfront", "Cayuga", "Oceanview",
    "Apparel City", "Bernal Heights", "Noe Valley", "Produce Market",
    "Bayview", "Silver Terrace", "Bret Harte", "Little Hollywood", "Portola",
    "University Mound", "St. Marys Park", "Mission Terrace", "Excelsior",
    "Sunnyside", "Glen Park", "Aquatic Park / Ft. Mason", "Fishermans Wharf",
    "Cathedral Hill", "Japantown", "Pacific Heights", "Lower Pacific Heights",
    "Western Addition", "Chinatown", "Nob Hill", "Lower Nob Hill",
    "Polk Gulch", "North Beach", "Russian Hill", "Financial District",
    "Inner Sunset", "Parnassus Heights", "Forest Knolls", "Buena Vista",
    "Corona Heights", "Ashbury Heights", "Eureka Valley", "St. Francis Wood",
    "Sherwood Forest",
]

# Every business gets a neighborhood drawn at random from `neighborhoods`;
# no geographic lookup is performed here.
# A dedicated, seeded generator combined with a sorted iteration order makes
# the assignment identical on every run without touching the global random
# state, so the exported JSON files are reproducible.
NEIGHBORHOOD_SEED = 42
neighborhood_rng = random.Random(NEIGHBORHOOD_SEED)
for business_id in sorted(all_restaurant_data):
    all_restaurant_data[business_id]["business_data"]["neighborhood"] = neighborhood_rng.choice(neighborhoods)

In [291]:
# Remove businesses that are missing longitude or latitude data.
# Written to run on both Python 2 and Python 3: dict.items() exists on both
# (iteritems() is Python 2 only), and printing one pre-formatted string gives
# the same text with the print statement and the print() function.
print("Data PRE-filter length:  %d" % len(all_restaurant_data))
all_restaurant_data = {
    business_id: entry
    for business_id, entry in all_restaurant_data.items()
    # .get() is falsy when the key is absent and when the stored value is
    # None (or any other falsy value such as 0), matching the original
    # "key present and value truthy" test.
    if entry["business_data"].get("latitude") and entry["business_data"].get("longitude")
}
print("Data POST-filter length:  %d" % len(all_restaurant_data))

Data PRE-filter length:  7622
Data POST-filter length:  4669


In [292]:
# Write both datasets to OUTPUT_PATH in JSON format.
#   mobile_food_data.json : the list of mobile food records
#   restaurant_data.json  : a one-element list wrapping the dict of
#                           violation, inspection, and business data
json_exports = (
    ("mobile_food_data.json", mobile_food_data),
    ("restaurant_data.json", [all_restaurant_data]),
)
for json_name, payload in json_exports:
    with open(OUTPUT_PATH + json_name, "w") as outfile:
        json.dump(payload, outfile)

# Neighborhood Data

In [293]:
def calculate_avg_inspection_score(neighborhood, restaurant_data=None):
    """Average one inspection score per business across a neighborhood.

    For every business whose business_data["neighborhood"] equals
    `neighborhood`, the lowest non-None "Score" among its inspection records
    is taken; businesses without any scored inspection are skipped. The
    result is the mean of those per-business scores.

    NOTE(review): this function was originally described as using the "most
    recent" inspection score, but the records are ranked by "Score" itself,
    so the lowest score is what gets averaged. That behaviour is kept here;
    confirm which one is intended.

    @neighborhood:string: a neighborhood name
    @restaurant_data:dict: optional mapping of business id -> entry with
        "business_data" and "inspection_data"; defaults to the module-level
        all_restaurant_data.
    @returns: the average as a float, or None when no business in the
        neighborhood has a scored inspection (instead of dividing by zero).
    """
    if restaurant_data is None:
        restaurant_data = all_restaurant_data

    total_score = 0.0
    num_scores_added = 0
    for business in restaurant_data.values():
        if business["business_data"]["neighborhood"] != neighborhood:
            continue
        # A list comprehension (unlike filter()) yields a list on both
        # Python 2 and Python 3.
        scores = [elt["Score"] for elt in business["inspection_data"]
                  if elt["Score"] is not None]
        if scores:
            total_score += min(scores)
            num_scores_added += 1

    if num_scores_added == 0:
        return None
    return total_score / num_scores_added
    
def calculate_avg_violation_score(neighborhood, restaurant_data=None):
    """Average number of violation records per business in a neighborhood.

    Every business whose business_data["neighborhood"] equals `neighborhood`
    counts towards the average, including businesses with no violations.

    @neighborhood:string: a neighborhood name
    @restaurant_data:dict: optional mapping of business id -> entry with
        "business_data" and "violation_data"; defaults to the module-level
        all_restaurant_data.
    @returns: the average as a float, or None when the neighborhood contains
        no businesses (instead of dividing by zero).
    """
    if restaurant_data is None:
        restaurant_data = all_restaurant_data

    violation_counts = [
        len(business["violation_data"])
        for business in restaurant_data.values()
        if business["business_data"]["neighborhood"] == neighborhood
    ]
    if not violation_counts:
        return None
    return float(sum(violation_counts)) / len(violation_counts)

In [294]:
# Per-neighborhood inspection and violation aggregates for the choropleth.
# @key:string: a neighborhood name
# @value:dict: {"avg_inspection_score": ..., "avg_violation_score": ...}
chloropleth_data = dict(
    (
        n,
        {
            "avg_inspection_score": calculate_avg_inspection_score(n),
            "avg_violation_score": calculate_avg_violation_score(n),
        },
    )
    for n in neighborhoods
)

In [295]:
# Save the per-neighborhood choropleth aggregates as a JSON file.
chloropleth_json_path = OUTPUT_PATH + "chloropleth_data.json"
with open(chloropleth_json_path, "w") as chloropleth_file:
    json.dump(chloropleth_data, chloropleth_file)