# Which factors are important for food poisoning?

In [36]:
import os 
import folium
import warnings
import requests
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import folium.plugins as plugins
import scipy.stats as stats
import plotly.io as pio
import plotly.offline as py
import colorlover as cl

from matplotlib.ticker import MaxNLocator
from requests import get
from IPython.display import IFrame
from plotly import graph_objects as go
from pandas.io.json import json_normalize
from shapely.geometry import Polygon
from shapely.geometry import Point
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score , precision_score , f1_score, recall_score, matthews_corrcoef, roc_auc_score
from sklearn.metrics import classification_report 

# self made classes
from violation_preprocessing import violation_separator, violations_dataframe
from mapping_functions import chlorepleth_map, geodataframe, generateBaseMap, adding_CircleMarker, adding_Marker

# Silence every warning so the rendered notebook stays presentable.
# NOTE(review): this filter is global, so deprecation warnings raised by the
# libraries used below are hidden as well.
warnings.filterwarnings('ignore') 
# Use plotly's 'iframe' renderer as the default for fig.show().
pio.renderers.default = 'iframe'
# Display matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to subsequent plots.
sns.set()

# Evaluating Risk factors 

Let's take the data with the violations of each restaurant in Chicago.

In [37]:
# Load the cleaned inspection table and discard the leftover
# 'Unnamed: 0' column that came along with the CSV.
data = pd.read_csv('data/clean_dataset.csv').drop(columns = 'Unnamed: 0')

# Turn the zip codes into text and keep only what precedes the first '.',
# so they can be compared with the zip codes of the geojson file
# (visualisation step).
data['zip'] = data['zip'].astype(str).str.split('.').str[0]

In [38]:
data.head(2)

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2352734,CHILI'S T-I,CHILI'S (T1-B14),34169.0,Restaurant,Risk 1 (High),11601 W TOUHY AVE,60666,2019-12-04,Canvass,Pass,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,42.008536,-87.914428,"{'latitude': '-87.91442843927047', 'longitude'..."
1,2352727,PORTAGE PARK DAY NURSERY,MOSAIC EARLY CHILDHOOD ACADEMY,2215815.0,Children's Services Facility,Risk 1 (High),5332-5334 W ADDISON ST,60641,2019-12-04,Canvass,Pass,,41.946065,-87.760722,"{'latitude': '-87.76072227616888', 'longitude'..."


In [39]:
# Number of inspections recorded per zip code (one row per zip); used later
# as the denominator when normalising per-zip counts.
# The original called reset_index() without keeping its result, which was a
# no-op; the result is now assigned so 'zip' is a regular column.
data_zip = (
    data.groupby('zip')['inspection_id']
        .count()
        .rename('total_count')
        .reset_index()
)

data_zip.head(3)

Unnamed: 0_level_0,total_count
zip,Unnamed: 1_level_1
60601,2517
60602,1099
60603,1408


This dataset gives us the following information:  
1. The risk level of the restaurants (a factor for E. coli poisoning) in each zip code
2. The number of inspections due to suspected food poisoning per zip code

In [40]:
# Keep only the numeric risk level (1, 2 or 3) from labels such as
# "Risk 1 (High)". The digit is captured with a regular expression rather
# than by character position, so labels that do not follow the pattern
# become NaN instead of making the float conversion fail.
data['risk'] = data['risk'].astype(str)
data['risk_level'] = pd.to_numeric(
    data['risk'].str.extract(r'^Risk\s+(\d)', expand = False),
    errors = 'coerce'
).astype(float)

# Subset of inspections that were triggered by a suspected food poisoning.
poisoning_data = data[data['inspection_type'] == 'Suspected Food Poisoning']

In [41]:
# Share (in %) of suspected-food-poisoning inspections per risk level.
# The counts are reindexed on the three known levels so the resulting lists
# always hold exactly one value per level, ordered 1, 2, 3 (i.e. High,
# Medium, Low), even when a level is absent from the data.
risk_levels = [1.0, 2.0, 3.0]
risk_level_count = list(
    poisoning_data.groupby('risk_level')['inspection_id']
                  .count()
                  .reindex(risk_levels, fill_value = 0)
)
total = len(poisoning_data)
# guard against an empty selection to avoid dividing by zero
percentage = [(a / total) * 100 if total else 0.0 for a in risk_level_count]

In [42]:
percentage

[87.72348033373063, 12.038140643623361, 0.23837902264600713]

In [49]:
# Bar chart: share of suspected-food-poisoning inspections per risk level.

bars = ['High','Medium','Low']

fig = go.Figure()

# single trace: one bar per risk level
fig.add_trace(go.Bar(
    x = bars,
    y = percentage,
    visible = True,
    marker_color = 'maroon'
))

# Axis title fonts are set through title.font; the older `titlefont`
# shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    autosize = False,
    width = 400,
    height = 400,
    template = "plotly_white",
    title = dict(
        text = 'Risk level associated w/ suspected food poisoning',
        y = 0.9,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top',
        font = dict(size = 14)
    ),
    yaxis = dict(
        title = dict(text = 'Percentage %', font = dict(size = 14))
    ),
    xaxis = dict(
        title = dict(text = 'Risk level', font = dict(size = 14))
    )
)

fig.show()

### Proportion of inspection due to suspected food poisoning per zip

In [10]:
# Per zip code: number of rows with a non-null licence, over all inspections...
facility_number_per_zip = data.groupby('zip')['license'].count().reset_index()

# ...and over the suspected-food-poisoning inspections only.
pois_facility_number_per_zip = poisoning_data.groupby('zip')['license'].count().reset_index()

# Join both counts on the zip code so the proportion can be computed.
# NOTE(review): the default inner join keeps only zip codes that appear in
# both tables, so zips without any poisoning inspection are left out.
number_facilities_df = (
    facility_number_per_zip
    .merge(pois_facility_number_per_zip, on = 'zip')
    .rename(columns = {'license_x': 'total number', 'license_y': 'poisoning number'})
)

# Frequency (in %) of inspections due to food poisoning in a given zip.
number_facilities_df['frequency(%)'] = (
    number_facilities_df['poisoning number'] / number_facilities_df['total number'] * 100
)

In [11]:
number_facilities_df.head(2)

Unnamed: 0,zip,total number,poisoning number,frequency(%)
0,60601,2517,25,0.993246
1,60602,1099,6,0.545951


### Average risk level per zip

The risk level of the restaurants:

In [12]:
# Number of inspections per zip code where the risk level is high (level 1).
# (The original first assigned a two-column slice of `data` to risk_per_zip
# and overwrote it on the next line; that dead assignment is removed.)
high_risk_per_zip = (
    data[data['risk_level'] == 1.0]
    .groupby('zip')['inspection_id']
    .count()
    .rename('high_risk_count')
    .reset_index()
)

# Attach the per-zip totals and express the high-risk count as a percentage
# of all inspections recorded in that zip code.
risk_per_zip = pd.merge(high_risk_per_zip, data_zip, on = 'zip')
risk_per_zip['percentage'] = risk_per_zip['high_risk_count'].divide(risk_per_zip['total_count'])*100

risk_per_zip.head(3)

Unnamed: 0,zip,high_risk_count,total_count,percentage
0,60601,2010,2517,79.856973
1,60602,856,1099,77.88899
2,60603,1167,1408,82.883523


In [13]:
# Map showing, per Chicago zip code, the proportion of inspections that
# concern high-risk facilities.

# Geodataframe built from the inspection table by the project helper,
# then joined with the per-zip high-risk percentages.
gdf = geodataframe(data)
risk_gdf = pd.merge(gdf, risk_per_zip, on = 'zip')


def style_function(feature):
    """Default look of the tooltip layer: almost transparent white polygons."""
    return {'fillColor': '#ffffff',
            'color': '#000000',
            'fillOpacity': 0.1,
            'weight': 0.1}


def highlight_function(feature):
    """Look of a polygon while hovered: semi-opaque black fill."""
    return {'fillColor': '#000000',
            'color': '#000000',
            'fillOpacity': 0.50,
            'weight': 0.1}


# Choropleth built by the project helper (arguments passed as in the original call).
risk_map = chlorepleth_map ('Risk proportion',risk_gdf,['zip','percentage'],'Average Risk Level','YlOrRd')

# light base tiles, not listed in the layer control
folium.TileLayer('CartoDB positron', name = "Light Map", control = False).add_to(risk_map)

# Overlay carrying the hover highlight and the tooltip (zip + percentage).
RIL = folium.features.GeoJson(
    risk_gdf,
    style_function = style_function,
    control = False,
    highlight_function = highlight_function,
    tooltip = folium.features.GeoJsonTooltip(
        fields = ['zip','percentage'],
        aliases = ['Zip: ','Proportion of facilities associated with high risk :'],
        style = ("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
    )
)
risk_map.add_child(RIL)
risk_map.keep_in_front(RIL)
folium.LayerControl().add_to(risk_map)

# Write the map before embedding it, so the IFrame always shows the map built
# by this run instead of a stale (or missing) file from an earlier session.
map_path = 'maps/interactif_risk_map.html'
os.makedirs(os.path.dirname(map_path), exist_ok = True)
risk_map.save(map_path)
IFrame(src = map_path, width = 700, height = 600)

In [14]:
# Bring together, per zip code, the food-poisoning inspection frequency and
# the high-risk percentage (default inner join on 'zip').
risk_correlation = number_facilities_df.merge(risk_per_zip, on = 'zip')

risk_correlation.head(3)

Unnamed: 0,zip,total number,poisoning number,frequency(%),high_risk_count,total_count,percentage
0,60601,2517,25,0.993246,2010,2517,79.856973
1,60602,1099,6,0.545951,856,1099,77.88899
2,60603,1408,16,1.136364,1167,1408,82.883523


In [15]:
# Scatter plot: one marker per zip code, comparing the percentage of
# high-risk inspections (x) with the percentage of inspections due to
# suspected food poisoning (y).

fig = go.Figure()

# one marker per zip code; the zip code is attached as the marker text
fig.add_trace(go.Scatter(
    x = risk_correlation['percentage'],
    y = risk_correlation['frequency(%)'],
    visible = True,
    marker_color = 'maroon',
    mode = 'markers',
    text = risk_correlation['zip']
))

# Red rectangle drawn as a closed polyline with hard-coded corners.
# NOTE(review): the original comment only says this trace is "for website";
# presumably it highlights a region of interest - confirm the coordinates.
fig.add_trace(go.Scatter(
    x = [43.8, 45.8, 45.8, 43.8, 43.8],
    y = [1.18, 1.18, 1.11, 1.11, 1.18],
    mode = "lines",
    line = go.scatter.Line(color = "red"),
    showlegend = False
))

# Axis title fonts are set through title.font; the older `titlefont`
# shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    autosize = False,
    width = 700,
    height = 700,
    template = "plotly_white",
    title = dict(
        text = 'Poisoning inspections and facilities associated with high risk',
        y = 0.9,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top'
    ),
    yaxis = dict(
        title = dict(
            text = 'Percentage of facilities inspected for food poisoning',
            font = dict(size = 14)
        )
    ),
    xaxis = dict(
        title = dict(
            text = 'Percentage of high risk facilities per zip code',
            font = dict(size = 14)
        )
    ),
    showlegend = False
)

fig.show()

Focus only on 60827 (high poisoning percentage and low risk), 60621 and 60656 (low poisoning percentage and a high proportion of high-risk facilities); 60611 reflects the maximum correlation.
Observations: 
> 60827: low proportion of high-risk facilities and low number of sanitation violations, but a high number of food poisoning inspections and of failed inspections. Why? See the data story of Riverdale.

In [16]:
# Zip codes treated as 'outliers' and removed before measuring the correlation.
outliers = ['60827','60656','60611']

# explicit "not in the list" filter (same rows as comparing zip != list in query)
risk_correlation_without_outliers = risk_correlation[~risk_correlation['zip'].isin(outliers)]

# Spearman rank correlation between the high-risk percentage and the
# food-poisoning inspection frequency
print('The spearman coefficient is : ',stats.spearmanr(risk_correlation_without_outliers['percentage'],risk_correlation_without_outliers['frequency(%)']))


fig = go.Figure()

# one marker per remaining zip code; the zip code is attached as the marker text
fig.add_trace(go.Scatter(
    x = risk_correlation_without_outliers['percentage'],
    y = risk_correlation_without_outliers['frequency(%)'],
    visible = True,
    marker_color = 'maroon',
    mode = 'markers',
    text = risk_correlation_without_outliers['zip']
))

# Axis title fonts are set through title.font; the older `titlefont`
# shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    autosize = False,
    width = 700,
    height = 700,
    template = "plotly_white",
    title = dict(
        text = 'Poisoning inspections and facilities associated with high risk',
        y = 0.9,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top'
    ),
    yaxis = dict(
        title = dict(
            text = 'Percentage of facilities inspected for food poisoning',
            font = dict(size = 14)
        )
    ),
    xaxis = dict(
        title = dict(
            text = 'Percentage of high risk facilities per zip code',
            font = dict(size = 14)
        )
    )
)

fig.show()

The spearman coefficient is :  SpearmanrResult(correlation=0.37760421370164654, pvalue=0.0044815939194058916)


# FAILURE AND POISONING 

In [17]:
# Share (in %) of suspected-food-poisoning inspections per inspection result.
inspections_per_result = poisoning_data.groupby('results')['inspection_id'].count()
results_count = list(inspections_per_result)
total = len(poisoning_data)
percentage = [(count / total) * 100 for count in results_count]

# Non-null count of every column, per inspection result.
poisoning_data.groupby('results').count()

Unnamed: 0_level_0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,inspection_type,violations,latitude,longitude,location,risk_level
results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Fail,214,214,214,214,214,214,214,214,214,214,211,214,214,214,214
Pass,394,394,392,394,394,394,394,394,394,394,377,394,394,394,394
Pass w/ Conditions,231,231,230,231,231,231,231,231,231,231,230,231,231,231,231


In [44]:
# Bar chart: outcome of the inspections triggered by a suspected food poisoning.
# Values and labels are computed here from poisoning_data instead of reading
# the shared `percentage` variable (also assigned by the risk-level cell), so
# the figure cannot pick up another cell's numbers when cells run out of
# order, and each bar is always paired with its own label.
results_share = (
    poisoning_data.groupby('results')['inspection_id'].count()
    / len(poisoning_data) * 100
)
bars = list(results_share.index)

fig = go.Figure()

# single trace: one bar per inspection result
fig.add_trace(go.Bar(
    x = bars,
    y = list(results_share),
    visible = True,
    marker_color = 'maroon'
))

# Axis title fonts are set through title.font; the older `titlefont`
# shorthand is deprecated and rejected by recent Plotly releases.
fig.update_layout(
    autosize = False,
    width = 400,
    height = 400,
    template = "plotly_white",
    title = dict(
        text = 'Inspections due to suspected food poisoning',
        y = 0.9,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top'
    ),
    yaxis = dict(
        title = dict(text = 'Percentage %', font = dict(size = 14))
    ),
    xaxis = dict(
        title = dict(text = 'Inspection results', font = dict(size = 14))
    )
)

fig.show()

Main conclusion: we can't use the inspection results alone to assess food safety within a facility. We need to investigate further to understand which factors may cause foodborne illness.