In [52]:
import re
import warnings
import os
import folium
import geopandas as gpd
import pandas as pd
import numpy as np
import statsmodels as sm
import seaborn as sns # for beautiful graphs
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
import scipy.stats as stats # to calculate r^2 for linear regressions
import branca.colormap as cm

from scipy.stats import powerlaw # for plotting linear regressions
from folium import plugins
from folium.plugins import HeatMap
from IPython.display import IFrame
from mapping_functions import generateBaseMap, geodataframe, chlorepleth_map, adding_Marker

%matplotlib inline
sns.set()
warnings.filterwarnings('ignore')

# Visual Exploration of the data

> In this part, we explore the data by mapping. 

In [53]:
# import the cleaning dataset 
data = pd.read_csv('data/clean_dataset.csv', delimiter = ',')

# drop the unnamed column
data.drop(['Unnamed: 0'], axis = 1,inplace = True)

# show the dataframe
display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2352734,CHILI'S T-I,CHILI'S (T1-B14),34169.0,Restaurant,Risk 1 (High),11601 W TOUHY AVE,60666.0,2019-12-04,Canvass,Pass,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,42.008536,-87.914428,"{'latitude': '-87.91442843927047', 'longitude'..."
1,2352727,PORTAGE PARK DAY NURSERY,MOSAIC EARLY CHILDHOOD ACADEMY,2215815.0,Children's Services Facility,Risk 1 (High),5332-5334 W ADDISON ST,60641.0,2019-12-04,Canvass,Pass,,41.946065,-87.760722,"{'latitude': '-87.76072227616888', 'longitude'..."
2,2352738,AMARIT RESTAURANT,AMARIT RESTAURANT,1801618.0,Restaurant,Risk 1 (High),600 S DEARBORN ST,60605.0,2019-12-04,Canvass Re-Inspection,Pass,,41.874481,-87.629357,"{'latitude': '-87.62935653990546', 'longitude'..."


In [54]:
# convert the zip column into an str
data.zip = data.zip.astype(str)
    
# reformat the zip code writing in order to compare it with the zip code in geojson file (for vizualisation step)
data['zip'] = data['zip'].apply(lambda x : x.split('.')[0])

# Area with the highest number of inspection 

> It is important to see if there is a homogeneous inspection repartition in Chicago. Let's check if there are areas with higher inspection counts than others.

In [55]:
# mapping the areas with the highest number of inspections 
data['count'] = 1
base_map = generateBaseMap()
HeatMap(data = data[['latitude', 'longitude', 'count']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(base_map)
#base_map.save('base_map.html')
IFrame(src = 'maps/base_map.html', width = 700, height = 600)

**Obsersations:** we can observe that the areas in the Chicago periphery are less inspected. This is coherent with what we expected since there are more facilities in a city center and also near the sea.

# Creation of the geo dataframe 

In [56]:
# creating our geodataframe based on the basic dataframe
gdf = geodataframe(data)
gdf.head(3)

Unnamed: 0,objectid,shape_area,shape_len,zip,geometry,centroid_lon,centroid_lat,facility_number_per_zip
0,33,106052287.488,42720.0444058,60647,(POLYGON ((-87.67762151065281 41.9177578010629...,-87.702259,41.921098,4628
1,34,127476050.762,48103.7827213,60639,(POLYGON ((-87.72683253163021 41.9226462671259...,-87.755996,41.920456,3436
2,35,45069038.4783,27288.6096123,60707,(POLYGON ((-87.78500237831095 41.9091478547167...,-87.795738,41.919948,735


# Mapping Chlorepeth

In [57]:
style_function = lambda x: {'fillColor': '#ffffff', 
                            'color':'#000000', 
                            'fillOpacity': 0.1, 
                            'weight': 0.1}

highlight_function = lambda x: {'fillColor': '#000000', 
                                'color':'#000000', 
                                'fillOpacity': 0.50, 
                                'weight': 0.1}


# Proportion of failed inspection per zip area

> The aim of this part is to see if there is a correlation between the localization of the facilities and the results of the inspection. Here, we focus on the facilities that failed the inspection. 

In [58]:
# creating a dataset with only the fail results 
fail_data = data[~data['results'].isin(['Pass','Pass w/ Conditions'])]

# counting the number of fail per zip code
fail_count_per_zip = pd.DataFrame(fail_data.groupby('zip')['results'].count()).reset_index()
fail_count_per_zip.rename(columns = {'results': 'fail_count'}, inplace = True)

As we saw in the heat map above, some areas are more inspected that others. Thus, we need to calculate a proportion of fails per zip code ( not an absolute count since it will be influenced by the facilities count in a particular area).

In [59]:
# merge with gdf dataframe and facility count dataframe 
fail_gdf = pd.merge(gdf,fail_count_per_zip, on = 'zip')

# proportion of facility that failed the investigation per zip code
fail_gdf['fail_proportion'] = round(fail_gdf['fail_count'].divide(fail_gdf['facility_number_per_zip'])*100,2)
fail_gdf.head(3)

Unnamed: 0,objectid,shape_area,shape_len,zip,geometry,centroid_lon,centroid_lat,facility_number_per_zip,fail_count,fail_proportion
0,33,106052287.488,42720.0444058,60647,(POLYGON ((-87.67762151065281 41.9177578010629...,-87.702259,41.921098,4628,1018,22.0
1,34,127476050.762,48103.7827213,60639,(POLYGON ((-87.72683253163021 41.9226462671259...,-87.755996,41.920456,3436,796,23.17
2,35,45069038.4783,27288.6096123,60707,(POLYGON ((-87.78500237831095 41.9091478547167...,-87.795738,41.919948,735,145,19.73


In [60]:
fail_map = chlorepleth_map('Fail proportion',fail_gdf,['zip','fail_proportion'],'Fail proportion','Reds')
#fail_map.save('fail_map.html')
#IFrame(src = 'maps/fail_map.html', width = 700, height = 600)


**Obervations:** We can observe that some zip code areas have a higher number of failed inspections than others. We can notice a high number of failed in the middle south of Chicago. 

# Mapping the proportion of pass and pass w/conditions

In [61]:
# creating a dataset with only the pass and pass w/conditions results 
pass_data = data[~data['results'].isin(['Fail'])]

# counting the number of pass inspections per zip code
pass_count_per_zip = pd.DataFrame(pass_data.groupby('zip')['results'].count()).reset_index()
pass_count_per_zip.rename(columns = {'results': 'pass_count'}, inplace = True)

In [62]:
# merge with gdf dataframe and facility count dataframe 
pass_gdf = pd.merge(pass_count_per_zip,gdf, on = 'zip')

# proportion of facility that passed the investigation per zip code
pass_gdf['pass_proportion'] = pass_gdf['pass_count'].divide(pass_gdf['facility_number_per_zip'])
pass_gdf.head(3)

Unnamed: 0,zip,pass_count,objectid,shape_area,shape_len,geometry,centroid_lon,centroid_lat,facility_number_per_zip,pass_proportion
0,60601,2206,27,9166245.79985,19804.5821088,(POLYGON ((-87.62271368513663 41.8888359224763...,-87.621022,41.886404,2517,0.87644
1,60602,945,26,4847124.8171,14448.1749926,(POLYGON ((-87.60996958016865 41.8843572835999...,-87.623661,41.883318,1099,0.859873
2,60603,1209,19,4560228.98203,13672.6822885,(POLYGON ((-87.61633485348139 41.8821117438214...,-87.624471,41.880647,1408,0.858665


In [63]:
pass_map = chlorepleth_map('Pass proportion',pass_gdf,['zip','pass_proportion'],'Pass proportion','YlGn')
#pass_map.save('pass_map.html')
IFrame(src = 'maps/pass_map.html', width = 700, height = 600)

**Obervations**: This map leads to the same conclusion than the one above , as expected.

# Proportion of poisoning complain in a given zip area

In [64]:
# creating a dataset with only the inspections due to poisoning
poisoning_data = data[data['inspection_type'] == 'Suspected Food Poisoning']

# counting the number of fail per zip code
poisoning_count_per_zip = pd.DataFrame(poisoning_data.groupby('zip')['results'].count()).reset_index()
poisoning_count_per_zip.rename(columns = {'results': 'poisoning_count'}, inplace = True)

In [65]:
# merge with gdf dataframe and facility count dataframe 
poisoning_gdf = pd.merge(gdf,poisoning_count_per_zip, on = 'zip')

# proportion of facility that passed the investigation per zip code
poisoning_gdf['poisoning_proportion'] = round((poisoning_gdf['poisoning_count'].divide(poisoning_gdf['facility_number_per_zip']))*100,2)
poisoning_gdf.head(3)

Unnamed: 0,objectid,shape_area,shape_len,zip,geometry,centroid_lon,centroid_lat,facility_number_per_zip,poisoning_count,poisoning_proportion
0,33,106052287.488,42720.0444058,60647,(POLYGON ((-87.67762151065281 41.9177578010629...,-87.702259,41.921098,4628,18,0.39
1,34,127476050.762,48103.7827213,60639,(POLYGON ((-87.72683253163021 41.9226462671259...,-87.755996,41.920456,3436,17,0.49
2,35,45069038.4783,27288.6096123,60707,(POLYGON ((-87.78500237831095 41.9091478547167...,-87.795738,41.919948,735,5,0.68


In [66]:
poisoning_map = chlorepleth_map ('Poisoning proportion',poisoning_gdf,['zip','poisoning_proportion'],'Suspected poisoning proportion','PuRd')
#poisoning_map.save('poisoning_map.html')
IFrame(src = 'maps/poisoning_map.html', width = 700, height = 600)

**Observations** : We can observe that some areas had a higher number of inspections due to suspected food poisoning. These areas do not seem to necessary correlate with the pass/fail results of inspections. Also, it seems that we have a high concentration of this kind of investigation in the center of Chicago near to the sea. We need to investigate more about that.
> It seems like the area where there are the more inspections due to supsected food poisoning are near the sea. We  need to investigate more about this because of the presence of e-coli in sea water. Does it play a role in this complain about food poisoning ? To do so, we will study later a dataframe which provide predictions about e-coli concentration at beaches in Chicago.

# E. Coli Prevention in Chicago Beach Waters

Escherichia coli is a bacterium that lives in the intestines of humans and animals alike and also causes food poisoning illness. A major source of E. coli infections is undercooked beef. Other sources of E. coli bacteria include drinking or swimming in water that is contaminated by sewage. E. coli bacterium, which is present in stool, can be passed from person-to-person as a result of improper hygiene or handwashing practices.

People can become infected when a contaminated city or town water supply has not been properly treated with chlorine or when people accidentally swallow contaminated water while swimming in a lake, pool, or irrigation canal.
The bacteria can also spread from one person to another, usually when an infected person does not wash his or her hands well after a bowel movement. E. coli can spread from an infected person's hands to other people or to objects.

> The Chicago Park District issues swim advisories at beaches along Chicago's Lake Michigan lakefront based on E. coli levels. The dataset below shows predicted E. coli levels based on an experimental analytical modeling approach.

In [67]:
ecoli_df = pd.read_csv('data/beach-e.-coli-predictions.csv',delimiter = ',')

> US Environmental Protection Agency (USEPA) recommends notifying the public when E. coli bacteria levels are above the federal water quality Beach Action Value (BAV), which is 235*CFU. Thus, we will keep only the predicted values that are above this limit.

In [68]:
data_filtered = ecoli_df[ecoli_df['Predicted Level'] > 235]

In [69]:
high_ecoli_concentration = pd.DataFrame(data_filtered.groupby('Beach Name')['Predicted Level'].count())
high_ecoli_concentration.reset_index(inplace = True)
high_ecoli_concentration

Unnamed: 0,Beach Name,Predicted Level
0,12th Street,47
1,57th Street,9
2,Foster,49
3,Hartigan (Albion),4
4,Howard,5
5,Juneway,9
6,Leone,9
7,Margaret T Burroughs (31st),72
8,Marion Mahony Griffin (Jarvis),9
9,North Avenue,7


In [70]:
# merging the data_filtered dataframe with the high_ecoli_concentration dataframe to have beaches' location
high_ecoli_concentration_location = pd.merge(data_filtered,high_ecoli_concentration,on = 'Beach Name', how = 'inner')

# groupby beaches and creating a dataframe
high_ecoli_concentration_location = high_ecoli_concentration_location.groupby('Beach Name').first()

high_ecoli_concentration_location = pd.DataFrame(high_ecoli_concentration_location)
high_ecoli_concentration_location.rename(columns = {'Predicted Level_y': 'High measure count'}, inplace=True)
high_ecoli_concentration_location.reset_index(inplace = True)
high_ecoli_concentration_location.sort_values(by=['High measure count'], ascending = False)

high_ecoli_concentration_location.head(3)

Unnamed: 0,Beach Name,Date,Prediction Source,Predicted Level_x,RecordID,Latitude,Longitude,Location,High measure count
0,12th Street,2017-06-05T00:00:00,DNA Model,240.8,12thStreet20170605,41.8638,-87.6082,"{'needs_recoding': False, 'longitude': '-87.60...",47
1,57th Street,2017-07-20T00:00:00,DNA Model,312.6,57thStreet20170720,41.7911,-87.5797,"{'needs_recoding': False, 'longitude': '-87.57...",9
2,Foster,2017-05-26T00:00:00,DNA Model,334.7,Foster20170526,41.9785,-87.6515,"{'needs_recoding': False, 'longitude': '-87.65...",49


In [71]:
# create a map using the Map() function and the coordinates for Chicago
m = folium.Map(location=[41.8600, -87.6298], zoom_start = 12)

for i in range(len(high_ecoli_concentration_location)):
    popup = str(high_ecoli_concentration_location['Beach Name'].values[i]) + '\n'+'#High measure count :'+ str(high_ecoli_concentration_location['High measure count'].values[i])
    if (high_ecoli_concentration_location['High measure count'].values[i]  < 10):
        colour = 'green'
    if (high_ecoli_concentration_location['High measure count'].values[i] < 30 and high_ecoli_concentration_location['High measure count'].values[i]>10 ):
        colour = 'orange'
    if (high_ecoli_concentration_location['High measure count'].values[i] >= 30):
        colour = 'red'
    adding_Marker(m,high_ecoli_concentration_location.Longitude.values[i], high_ecoli_concentration_location.Latitude.values[i], popup , colour)

#m.save('ecoli_map.html')
IFrame(src = 'maps/ecoli_map.html', width = 700, height = 600)

**Observations** :  We observe that we have a high e-coli sea water concentration in some beaches in Chicago. What is interesting to notice, is the 12th Street beach which is located at Chicago center and correlate with a high number of inspection due to suspected food poisoning. We need to investigate this fact in order to understand how this both parameter could be linked.
> We can try to overlap this map with the one above.

In [72]:
for i in range(len(high_ecoli_concentration_location)):
    popup = str(high_ecoli_concentration_location['Beach Name'].values[i]) + '\n'+'High measure count :'+ str(high_ecoli_concentration_location['High measure count'].values[i])
    if (high_ecoli_concentration_location['High measure count'].values[i]  < 10):
        colour = 'green'
    if (high_ecoli_concentration_location['High measure count'].values[i] <30 and high_ecoli_concentration_location['High measure count'].values[i]>10 ):
        colour = 'orange'
    if (high_ecoli_concentration_location['High measure count'].values[i] >= 30):
        colour = 'red'
    adding_Marker(poisoning_map,high_ecoli_concentration_location.Longitude.values[i], high_ecoli_concentration_location.Latitude.values[i], popup , colour)

    
#poisoning_map.save('ecoli_wpoisoning.html')
IFrame(src = 'maps/ecoli_wpoisoning.html', width = 700, height = 600)

**Observations** : When we overlap this data, we observe that in most cases the high concentration of e-coli is correlated with a higher of inspection due to food poisoning.

# Is there a link with Chicago Sanitation Violations ? 

In [73]:
# import the cleaning dataset 
sanitation_df = pd.read_csv('data/sanitation.csv', delimiter = ',')
sanitation_df.head(5)

Unnamed: 0,Creation Date,Status,Completion Date,Service Request Number,Type of Service Request,What is the Nature of this Code Violation?,Street Address,ZIP Code,X Coordinate,Y Coordinate,Ward,Police District,Community Area,Latitude,Longitude,Location
0,08/01/2017,Completed - Dup,08/04/2017,17-05101063,Sanitation Code Violation,Garbage in alley,3016 W MONTROSE AVE,60618.0,1155406.0,1929085.0,33.0,17.0,14.0,41.961215,-87.704035,"(41.961215172275, -87.704034715236)"
1,05/31/2017,Completed,08/04/2017,17-03559234,Sanitation Code Violation,Other,3359 W 19TH ST,60623.0,1154205.0,1890509.0,24.0,10.0,29.0,41.855383,-87.709482,"(41.855383440674, -87.709481507782)"
2,07/14/2017,Completed,08/04/2017,17-04636140,Sanitation Code Violation,Garbage in alley,7212 S 73RD ST ER,60619.0,1182810.0,1857288.0,8.0,3.0,69.0,41.763603,-87.605521,"(41.763602787373, -87.605520591847)"
3,07/09/2017,Completed,08/04/2017,17-04526947,Sanitation Code Violation,Other,3418 W GRENSHAW ST,60624.0,1153756.0,1894848.0,24.0,11.0,29.0,41.867299,-87.711014,"(41.867298732126, -87.711014045253)"
4,08/03/2017,Completed,08/04/2017,17-05156088,Sanitation Code Violation,,2659 N MASON AVE,60639.0,1136274.0,1917063.0,30.0,25.0,19.0,41.928589,-87.774662,"(41.928588854849, -87.774662171276)"


In [74]:
# counting number of sanitation complaints per zip code area
sanitation_df_per_zip = sanitation_df.groupby('ZIP Code')['Service Request Number'].count()
sanitation_df_per_zip = pd.DataFrame(sanitation_df_per_zip)
sanitation_df_per_zip.reset_index(inplace = True)
sanitation_df_per_zip.rename(columns = {'Service Request Number': 'violation_count','ZIP Code':'zip'}, inplace=True)

In [75]:
sanitation_df_per_zip.head(5)

Unnamed: 0,zip,violation_count
0,0.0,2
1,60601.0,196
2,60602.0,169
3,60603.0,75
4,60604.0,59


In [76]:
# creation of a geodataframe

# path to geoJson file 
geo = os.path.join('data/Boundaries-ZIPCodes.geojson')

# creation of a geodataframe using geopandas
gdf_sanitation = gpd.read_file(geo)

# add a column with the x-coordinate of the multipolygon
gdf_sanitation['centroid_lon'] = gdf_sanitation['geometry'].centroid.x

# add a column with the y-coordinate of the multipolygon
gdf_sanitation['centroid_lat'] = gdf_sanitation['geometry'].centroid.y

# setting a projection  by assigning the WGS84 latitude-longitude CRS to the crs attribute
gdf_sanitation.crs = {'init' :'epsg:4326'}
    
# convert the zip column into an str
sanitation_df_per_zip.zip = sanitation_df_per_zip.zip.astype(str)
    
# reformat the zip code writing in order to compare it with the zip code in geojson file (for vizualisation step)
sanitation_df_per_zip['zip'] = sanitation_df_per_zip['zip'].apply(lambda x : x.split('.')[0])
    
# merge with the geodataframe
gdf_sanitation = pd.merge(gdf,sanitation_df_per_zip,on='zip')
    

In [77]:
gdf_sanitation.head(5)

Unnamed: 0,objectid,shape_area,shape_len,zip,geometry,centroid_lon,centroid_lat,facility_number_per_zip,violation_count
0,33,106052287.488,42720.0444058,60647,(POLYGON ((-87.67762151065281 41.9177578010629...,-87.702259,41.921098,4628,5888
1,34,127476050.762,48103.7827213,60639,(POLYGON ((-87.72683253163021 41.9226462671259...,-87.755996,41.920456,3436,4281
2,35,45069038.4783,27288.6096123,60707,(POLYGON ((-87.78500237831095 41.9091478547167...,-87.795738,41.919948,735,715
3,51,3450671.14336,7909.89040711,60707,(POLYGON ((-87.80662355756071 41.9345111843722...,-87.811606,41.936196,735,715
4,36,70853834.3797,42527.9896789,60622,(POLYGON ((-87.6670686895295 41.88885188496992...,-87.684212,41.903126,4264,3563


In [78]:
sanitation_map = chlorepleth_map('Sanitation violation',gdf_sanitation,['zip','violation_count'],'Sanitation Violation Count','BuPu')
#sanitation_map.save('sanitation_map.html')
#IFrame(src = 'maps/sanitation_map.html', width = 700, height = 600)

**Observations**: We can observe sanitation problems in south Chicago. But, it seems that there is no correlation between those sanitations problem and the poisoning. 

> We will investigate this further during milestone 3. 

# Interactif map for the datastory

## Sanitation

In [79]:
NIL = folium.features.GeoJson(
    gdf_sanitation,
    style_function=style_function, 
    control=False,
    highlight_function=highlight_function, 
    tooltip=folium.features.GeoJsonTooltip(
        fields=['zip','violation_count'],
        aliases=['Zip: ','Number of sanitation violations: '],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
    )
)
sanitation_map.add_child(NIL)
sanitation_map.keep_in_front(NIL)
folium.LayerControl().add_to(sanitation_map)


<folium.map.LayerControl at 0x1a2736b490>

In [80]:
sanitation_map.save('maps/interactif_sanitation_map.html')

## Fail 

In [81]:
FIL = folium.features.GeoJson(
    fail_gdf,
    style_function=style_function, 
    control=False,
    highlight_function=highlight_function, 
    tooltip=folium.features.GeoJsonTooltip(
        fields=['zip','facility_number_per_zip','fail_proportion'],
        aliases=['Zip: ','Number of inspections','% of failed inspections: '],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
    )
)
fail_map.add_child(FIL)
fail_map.keep_in_front(FIL)
folium.LayerControl().add_to(fail_map)


<folium.map.LayerControl at 0x1a23217d90>

In [82]:
fail_map.save('maps/interactif_fail_map.html')

## Poisoning with e-coli 

In [83]:
PIL = folium.features.GeoJson(
    poisoning_gdf,
    style_function=style_function, 
    control=False,
    highlight_function=highlight_function, 
    tooltip=folium.features.GeoJsonTooltip(
        fields=['zip','facility_number_per_zip','poisoning_count','poisoning_proportion'],
        aliases=['Zip: ','Total number of inspections','Number of inspection due to suspected food poisoning','Percentage: '],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
    )
)
poisoning_map.add_child(PIL)
poisoning_map.keep_in_front(PIL)
folium.LayerControl().add_to(poisoning_map)


<folium.map.LayerControl at 0x1a227a4d90>

In [84]:
poisoning_map.save('maps/interactif_poison_map.html')

## Zip code with the highest number of inspections

In [85]:
inspection_map = chlorepleth_map ('Inspections proportion',gdf,['zip','facility_number_per_zip'],'Number of inspections','YlGnBu')

IIL = folium.features.GeoJson(
    gdf,
    style_function=style_function, 
    control=False,
    highlight_function=highlight_function, 
    tooltip=folium.features.GeoJsonTooltip(
        fields=['zip','facility_number_per_zip'],
        aliases=['Zip: ','Number of inspections :'],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
    )
)
inspection_map.add_child(IIL)
inspection_map.keep_in_front(IIL)
folium.LayerControl().add_to(inspection_map)

<folium.map.LayerControl at 0x1a253c4250>

In [86]:
inspection_map.save('maps/interactif_inspection_map.html')