# Food Safety in Chicago with Respect to Public Health

In [None]:
import pandas as pd
import seaborn as sn
import numpy as np

In [None]:
import geopandas as gpd
from geopandas.tools import sjoin
from shapely.geometry import Point
from shapely.wkt import dumps, loads

In [None]:
# creating dataframes from csv files
df_food_inspections = pd.read_csv("data/foodinspections.csv")
df_community_areas = pd.read_csv("data/Community_Areas.csv")
df_public_health = pd.read_csv("data/Public_Health.csv")

#### Evaluation of food inspection data
Looking if there is any missing data. From the documentation of the dataset on [Food inspections Chicago](https://www.kaggle.com/chicago/chicago-food-inspections) we can see that there is no data in the columns "Historical Wards 2003-2015", "Zip Codes", "Community Areas", "Census Tracts" and "Wards". 
We want to explore this further and make sure of these numbers. 

In [5]:
# full dataset wih all columns
df_food_inspections.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Results,Violations,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards
0,2320315,SERENDIPITY CHILDCARE,SERENDIPITY CHILDCARE,2216009.0,Daycare Above and Under 2 Years,Risk 1 (High),1300 W 99TH ST,CHICAGO,IL,60643.0,...,Pass,,41.714168,-87.655291,"{'longitude': '41.7141680989703', 'latitude': ...",,,,,
1,2320342,YOLK TEST KITCHEN,YOLK TEST KITCHEN,2589655.0,Restaurant,Risk 1 (High),1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,...,Pass w/ Conditions,23. PROPER DATE MARKING AND DISPOSITION - Comm...,41.913588,-87.682203,"{'longitude': '41.9135877900482', 'latitude': ...",,,,,
2,2320328,LAS ASADAS MEXICAN GRILL,LAS ASADAS MEXICAN GRILL,2583309.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,...,Out of Business,,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':...",,,,,
3,2320319,LA PALAPITA,LA PALAPITA,2694702.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,...,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':...",,,,,
4,2320228,47TH ST CANTINA,47TH ST CANTINA,2678250.0,Liquor,Risk 3 (Low),4311 W 47TH ST,CHICAGO,IL,60632.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.807662,-87.73148,"{'longitude': '41.80766199360051', 'latitude':...",,,,,


In [6]:
# seeing the amount of missing values in each column in the dataframe 
print("Sum of missing values per column in Food Inspections dataset")
print(df_food_inspections.isnull().sum())

# the row count for the dataset
print("\nTotal length of dataframe")
count_rows_fi = df_food_inspections.shape[0] 
print(count_rows_fi)

Sum of missing values per column in Food Inspections dataset
Inspection ID                      0
DBA Name                           0
AKA Name                        2455
License #                         17
Facility Type                   4776
Risk                              72
Address                            0
City                             138
State                             42
Zip                               50
Inspection Date                    0
Inspection Type                    1
Results                            0
Violations                     51691
Latitude                         681
Longitude                        681
Location                         681
Historical Wards 2003-2015    194814
Zip Codes                     194814
Community Areas               194814
Census Tracts                 194814
Wards                         194814
dtype: int64

Total length of dataframe
194814


Can here see that the columns mentioned above in fact have no values. We therefore choose to delete these columns. 

In [8]:
# deletion of empty columns, 'inplace = true' to apply directly to the dataframe. 
df_food_inspections.drop(["Historical Wards 2003-2015", "Zip Codes", "Community Areas", "Census Tracts", "Wards"], axis = 1, inplace = True)
df_food_inspections.head()

KeyError: "['Historical Wards 2003-2015' 'Zip Codes' 'Community Areas'\n 'Census Tracts' 'Wards'] not found in axis"

As we are looking at how the food inspections affect the public health, it is quite important to be able to see the correlations between the different facilities and in which area they are located. To be able to do this we will merge the Food Inspections dataset with the [Community Area](https://data.cityofchicago.org/dataset/Community-Areas/vrxf-vc4k/data?fbclid=IwAR2YiR_0kgW1s0iSrKFti5LXmy7zTqQDQqDpFGdaTQ92jS-TYA0gDsU5LzU) dataset.   

*Documentation geopandas [here](http://geopandas.org/data_structures.html)*

In [9]:
# making a new dataframe with geodataframe locations column as well as of longitude/latitude
geo = [Point(xy) for xy in zip(df_food_inspections['Longitude'], df_food_inspections['Latitude'])]

# Coordinate reference system : WGS84
crs = {'init': 'epsg:4326'}

df_geo_fi = gpd.GeoDataFrame(df_food_inspections, crs=crs, geometry=geo)

In [10]:
df_geo_fi.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location,geometry
0,2320315,SERENDIPITY CHILDCARE,SERENDIPITY CHILDCARE,2216009.0,Daycare Above and Under 2 Years,Risk 1 (High),1300 W 99TH ST,CHICAGO,IL,60643.0,2019-10-23T00:00:00.000,License Re-Inspection,Pass,,41.714168,-87.655291,"{'longitude': '41.7141680989703', 'latitude': ...",POINT (-87.65529 41.71417)
1,2320342,YOLK TEST KITCHEN,YOLK TEST KITCHEN,2589655.0,Restaurant,Risk 1 (High),1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,2019-10-23T00:00:00.000,Canvass,Pass w/ Conditions,23. PROPER DATE MARKING AND DISPOSITION - Comm...,41.913588,-87.682203,"{'longitude': '41.9135877900482', 'latitude': ...",POINT (-87.68220 41.91359)
2,2320328,LAS ASADAS MEXICAN GRILL,LAS ASADAS MEXICAN GRILL,2583309.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,2019-10-23T00:00:00.000,Canvass,Out of Business,,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':...",POINT (-87.72004 41.80803)
3,2320319,LA PALAPITA,LA PALAPITA,2694702.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,2019-10-23T00:00:00.000,License,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':...",POINT (-87.72004 41.80803)
4,2320228,47TH ST CANTINA,47TH ST CANTINA,2678250.0,Liquor,Risk 3 (Low),4311 W 47TH ST,CHICAGO,IL,60632.0,2019-10-22T00:00:00.000,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.807662,-87.73148,"{'longitude': '41.80766199360051', 'latitude':...",POINT (-87.73148 41.80766)


In [36]:
# NB!! you must address that there is 618 missing values for longitude and latitude. 

# join dataframes by geopoint (if geopoint inside multipolygon then join)
df_community_areas.head()

#df_geo_fi['the_geom'] = np.nan


#for multipoly in df_community_areas['the_geom']:
#    for point in df_geo_fi['geometry']:
#        if multipoly.contains(point):
#            df_geo_fi.loc[df_geo_fi['the_geom']] = multipoly
            




    


Unnamed: 0,_feature_id,_feature_id_string,the_geom,area,perimeter,comarea_,comarea_id,area_numbe,community,area_num_1,shape_area,shape_len
0,1,CommAreas.1,MULTIPOLYGON (((-87.60914087616999 41.84469250...,0,0,0,0,35,DOUGLAS,35,46004620.0,31027.05451
1,2,CommAreas.2,MULTIPOLYGON (((-87.59215283878491 41.81692934...,0,0,0,0,36,OAKLAND,36,16913960.0,19565.506153
2,3,CommAreas.3,MULTIPOLYGON (((-87.62879823732865 41.80189303...,0,0,0,0,37,FULLER PARK,37,19916700.0,25339.08975
3,4,CommAreas.4,MULTIPOLYGON (((-87.60670812560363 41.81681377...,0,0,0,0,38,GRAND BOULEVARD,38,48492500.0,28196.837157
4,5,CommAreas.5,MULTIPOLYGON (((-87.59215283878491 41.81692934...,0,0,0,0,39,KENWOOD,39,29071740.0,23325.167906


In [68]:
multipolygons = []

for multipoly in df_community_areas['the_geom']:
    mp = loads(multipoly)
    multipolygons.append(mp)

In [69]:
df_community_areas['the_geom'] = multipolygons

In [None]:
print(multipolygons[3])

for point in df_geo_fi['geometry']:
    for mp in multipolygons:
        if mp.contains(point):
            df_geo_fi.loc[df_geo_fi['the_geom']] 

For one of the other columns "Violations" we can see a great deal of values are missing. These columns might still be important though, to get the extent of the missing values we find the percentage. 

In [52]:
# finding percentage of missing values in "Violations"
perc_violations = 100 * df_food_inspections['Violations'].isnull().sum() / count_rows_fi
print("Percentage of missing values in 'Violations': %.2f" %(perc_violations))

Percentage of missing values in 'Violations': 26.53
