In [1]:
# map functions
import os 
import folium
import warnings
import requests

import folium.plugins as plugins
import plotly.io as pio
import plotly.offline as py
import pandas as pd
import numpy as np
import seaborn as sns;
import colorlover as cl 
import geopandas as gpd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import scipy.stats as stats

from matplotlib.ticker import MaxNLocator
from requests import get
from IPython.display import IFrame
from pandas.io.json import json_normalize
from shapely.geometry import Polygon
from shapely.geometry import Point
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score , precision_score , f1_score, recall_score, matthews_corrcoef, roc_auc_score
from sklearn.metrics import classification_report 
from plotly import graph_objects as go
from violation_preprocessing import violation_separator, violations_dataframe

# to ignore the warnings and make the notebook more presentable
warnings.filterwarnings('ignore') 
pio.renderers.default = 'iframe'
%matplotlib inline
sns.set()

# Vizualisation with outbreak dataset 

Source of de dataset : https://wwwn.cdc.gov/norsdashboard/
In order to open in , need ot install xlrd ( pip or conda install )

In [2]:
# import the new dataset
data_outbreak_0 = pd.read_excel('data/NationalOutbreakPublicDataTool.xlsx')

data_outbreak_0.head(3)

Unnamed: 0,Year,Month,State,Primary Mode,Etiology,Serotype or Genotype,Etiology Status,Setting,Illnesses,Hospitalizations,...,Deaths,Info on Deaths,Food Vehicle,Food Contaminated Ingredient,IFSAC Category,Water Exposure,Water Type,Animal Type,Animal Type Specify,Water Status
0,2009,1,Minnesota,Food,Norovirus,,Suspected,Restaurant - Sit-down dining,2,0.0,...,0.0,2.0,,,,,,,,
1,2009,1,Minnesota,Food,Norovirus,,Confirmed,,16,0.0,...,0.0,16.0,,,,,,,,
2,2009,1,Minnesota,Food,Norovirus,,Suspected,Restaurant - Sit-down dining,5,0.0,...,0.0,5.0,,,,,,,,


### Data Preprocessing 

In [3]:
# selection of column of interest
interest_columns = ['Year','State','Etiology','Setting','Illnesses','Hospitalizations','Food Vehicle','Food Contaminated Ingredient']

# dataframe of interest
data_outbreak = data_outbreak_0[interest_columns]

data_outbreak.head(3)

Unnamed: 0,Year,State,Etiology,Setting,Illnesses,Hospitalizations,Food Vehicle,Food Contaminated Ingredient
0,2009,Minnesota,Norovirus,Restaurant - Sit-down dining,2,0.0,,
1,2009,Minnesota,Norovirus,,16,0.0,,
2,2009,Minnesota,Norovirus,Restaurant - Sit-down dining,5,0.0,,


In [4]:
# counting the number of outbreak per state 
data_outbreak_count = pd.DataFrame(data_outbreak.groupby('State').count())

data_outbreak_count.reset_index(inplace = True)

#selecting only the column of interest for the mapping
data_outbreak_count = data_outbreak_count[['State','Year']]

data_outbreak_count.rename(columns = {'Year': 'count','State':'name'}, inplace = True)

data_outbreak_count.head(3)

Unnamed: 0,name,count
0,Alabama,255
1,Alaska,94
2,Arizona,323


For the plot : https://github.com/python-visualization/folium/blob/master/examples/data/us-states.json

In [5]:
# path to geoJson file 
us_geo = os.path.join('data/us-states.json')

# creation of a geodataframe using geopandas
us_gdf = gpd.read_file(us_geo)

# add a column with the x-coordinate of the multipolygon
us_gdf['centroid_y'] = us_gdf['geometry'].centroid.x

# add a column with the y-coordinate of the multipolygon
us_gdf['centroid_x'] = us_gdf['geometry'].centroid.y

# setting a projection  by assigning the WGS84 latitude-longitude CRS to the crs attribute
us_gdf.crs = {'init' :'epsg:4326'}

In [6]:
us_outbreak_gdf = pd.merge(us_gdf,data_outbreak_count, on = 'name')

#us_outbreak_gdf.rename(columns = {,'id':'id_old'}, inplace = True)

us_outbreak_gdf.head(3)

Unnamed: 0,id,name,geometry,centroid_y,centroid_x,count
0,AL,Alabama,"POLYGON ((-87.359296 35.00118, -85.606675 34.9...",-86.827783,32.789907,255
1,AK,Alaska,"(POLYGON ((-131.602021 55.117982, -131.569159 ...",-152.373738,64.227768,94
2,AZ,Arizona,"POLYGON ((-109.042503 37.000263, -109.04798 31...",-111.663296,34.293393,323


In [7]:
# creating a basic map of Chicago
map_ = folium.Map(location = [us_gdf.centroid_x.mean(),us_gdf.centroid_y.mean()], control_scale = True, zoom_start = 4)
folium.TileLayer('CartoDB positron', name = "Light Map", control = False).add_to(map_)


#myscale = (us_outbreak_gdf['count'].quantile((0,0.1,0.75,0.9,0.98,1))).tolist()
myscale = list([0,500,1000,2000,3000])
folium.Choropleth(
        geo_data = us_geo,
        name = 'US Outbreak',
        data = us_outbreak_gdf,
        columns = ['id','count'],
        key_on = 'feature.id',
        fill_color = 'Blues',
        fill_opacity = 0.8,
        line_opacity = 1,
        legend_name = 'Outbreaks numbers',
        threshold_scale = myscale,
        smooth_factor = 0
        
    ).add_to(map_)

style_function = lambda x: {'fillColor': '#ffffff', 
                            'color':'#000000', 
                            'fillOpacity': 0.1, 
                            'weight': 0.1}

highlight_function = lambda x: {'fillColor': '#000000', 
                                'color':'#000000', 
                                'fillOpacity': 0.50, 
                                'weight': 0.1}


L = folium.features.GeoJson(
    us_outbreak_gdf,
    style_function = style_function, 
    control = False,
    highlight_function = highlight_function, 
    tooltip = folium.features.GeoJsonTooltip(
        fields = ['name','count'],
        aliases = ['State : ','Number of outbreak :'],
        style = ("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
    )
)

folium.Marker(
        location = [41.8600, -87.6298], # coordinates for the marker 
        popup = 'CHICAGO ' ,  # pop-up label for the marker
        icon = folium.Icon(color = 'darkred', icon ='arrow-down')
    ).add_to(map_)


map_.add_child(L)
map_.keep_in_front(L)
folium.LayerControl().add_to(map_)

<folium.map.LayerControl at 0x1a21feafd0>

In [8]:
#add text and focux on illinois 
#map_.save("US_map.html")
map_

### Food plot 

In [9]:
#selection of column of interest
interest_columns = ['Etiology','Illnesses','Hospitalizations','Food Vehicle','Food Contaminated Ingredient']

#creating a new dataset with food
data_food = data_outbreak_0[interest_columns]

#selecting only our state of interest : Illinois 
#data_food=data_food[data_food['State'] == 'Illinois'] 

#removing all the unknown values
data_food = data_food.dropna()

data_food.head(3)

Unnamed: 0,Etiology,Illnesses,Hospitalizations,Food Vehicle,Food Contaminated Ingredient
12,Bacillus cereus,13,0.0,"rice, white",rice
32,Norovirus Genogroup I,9,0.0,"oysters, raw",oysters
33,Shigella sonnei,96,0.0,multiple foods,lemon; lime


In [10]:
#cleaning the food vehicle column

# data_food['Food Vehicle']= data_food['Food Vehicle'].str.split(r",;")
data_food['Food Vehicle'] = data_food['Food Vehicle'].str.split()

data_food = data_food.explode('Food Vehicle')

# cleaning the food column
data_food['Food Vehicle'] = data_food['Food Vehicle'].str.replace('[^\w\s]','')

ignore = ['unspecified','bbq','other','and','unpasteurized','gravy','ground','raw']
#delete unspecified 
data_food = data_food[~data_food['Food Vehicle'].str.contains('|'.join(ignore))]

total_food = data_food['Food Vehicle'].nunique()

total_illnesses = data_food['Illnesses'].sum()

print('There is ', total_food ,' different ingredients that can cause a foodborn illness.')

data_food.head(3)

There is  561  different ingredients that can cause a foodborn illness.


Unnamed: 0,Etiology,Illnesses,Hospitalizations,Food Vehicle,Food Contaminated Ingredient
12,Bacillus cereus,13,0.0,rice,rice
12,Bacillus cereus,13,0.0,white,rice
32,Norovirus Genogroup I,9,0.0,oysters,oysters


In [11]:
grouped_df = pd.DataFrame(data_food.groupby('Food Vehicle').first())

ill_df = pd.DataFrame(data_food.groupby('Food Vehicle')['Illnesses'].sum())

ill_df.reset_index(inplace = True)

merged = pd.merge(grouped_df,ill_df, on = 'Food Vehicle')

merged.rename(columns = {'Illnesses_y': 'illnesses_cases'}, inplace = True)


In [12]:
merged.sort_values('illnesses_cases', ascending = False)

Unnamed: 0,Food Vehicle,Etiology,Illnesses_x,Hospitalizations,Food Contaminated Ingredient,illnesses_cases
105,chicken,Bacillus cereus,20,1.0,chicken,4759
37,beef,Clostridium perfringens,5,0.0,ground beef,3937
423,salad,Scombroid toxin,2,0.0,tuna,3715
527,turkey,Norovirus,32,0.0,lettuce; turkey,3396
376,pork,Clostridium perfringens,20,0.0,"pork, other",2953
...,...,...,...,...,...,...
515,tofu,Clostridium botulinum,2,2.0,tofu,2
141,crepes,Norovirus,2,0.0,"egg; sausage, pork; egg; sausage, pork",2
131,confit,Salmonella enterica,2,0.0,chicken; chili sauce; onion; tomato,2
125,coffee,Cleaning agents,2,0.0,coffee,2


In [13]:
merged['percentage'] = round((merged['illnesses_cases']/total_illnesses)*100,2)

In [14]:
merged = merged.sort_values('percentage', ascending = False)
merged

Unnamed: 0,Food Vehicle,Etiology,Illnesses_x,Hospitalizations,Food Contaminated Ingredient,illnesses_cases,percentage
105,chicken,Bacillus cereus,20,1.0,chicken,4759,5.97
37,beef,Clostridium perfringens,5,0.0,ground beef,3937,4.94
423,salad,Scombroid toxin,2,0.0,tuna,3715,4.66
527,turkey,Norovirus,32,0.0,lettuce; turkey,3396,4.26
376,pork,Clostridium perfringens,20,0.0,"pork, other",2953,3.71
...,...,...,...,...,...,...,...
131,confit,Salmonella enterica,2,0.0,chicken; chili sauce; onion; tomato,2,0.00
342,oyster,Norovirus Genogroup I,3,0.0,"oysters, raw",3,0.00
125,coffee,Cleaning agents,2,0.0,coffee,2,0.00
121,clam,Scombroid toxin,2,0.0,clams,2,0.00


In [15]:
merged_plot = merged[merged['percentage'] > 1.25]
merged_plot

Unnamed: 0,Food Vehicle,Etiology,Illnesses_x,Hospitalizations,Food Contaminated Ingredient,illnesses_cases,percentage
105,chicken,Bacillus cereus,20,1.0,chicken,4759,5.97
37,beef,Clostridium perfringens,5,0.0,ground beef,3937,4.94
423,salad,Scombroid toxin,2,0.0,tuna,3715,4.66
527,turkey,Norovirus,32,0.0,lettuce; turkey,3396,4.26
376,pork,Clostridium perfringens,20,0.0,"pork, other",2953,3.71
308,milk,Campylobacter jejuni,81,1.0,"milk, whole milk unpasteurized",2352,2.95
274,lettuce,Norovirus unknown,24,0.0,lettuce,1201,1.51
226,ham,Bacillus cereus; Staphylococcus aureus,14,0.0,ham; turkey,1113,1.4
526,tuna,Scombroid toxin,2,0.0,tuna steak,1024,1.29


In [16]:
other = merged[merged['percentage'] <= 1.25]
other_percentage = other.percentage.sum()

In [17]:
merged_plot.loc[0] = ['others', 'many', 'NAN','NAN','NAN','NAN',other_percentage]

In [18]:
# data used for the plots 
merged_plot

Unnamed: 0,Food Vehicle,Etiology,Illnesses_x,Hospitalizations,Food Contaminated Ingredient,illnesses_cases,percentage
105,chicken,Bacillus cereus,20,1,chicken,4759,5.97
37,beef,Clostridium perfringens,5,0,ground beef,3937,4.94
423,salad,Scombroid toxin,2,0,tuna,3715,4.66
527,turkey,Norovirus,32,0,lettuce; turkey,3396,4.26
376,pork,Clostridium perfringens,20,0,"pork, other",2953,3.71
308,milk,Campylobacter jejuni,81,1,"milk, whole milk unpasteurized",2352,2.95
274,lettuce,Norovirus unknown,24,0,lettuce,1201,1.51
226,ham,Bacillus cereus; Staphylococcus aureus,14,0,ham; turkey,1113,1.4
526,tuna,Scombroid toxin,2,0,tuna steak,1024,1.29
0,others,many,NAN,NAN,NAN,NAN,69.49


In [19]:
# plotting of pie chart

fig = go.Figure()


# set up the first trace
fig.add_trace(go.Pie(
                    labels = merged_plot['Food Vehicle'],
                    values = merged_plot['percentage'],
                    visible = True,
                    marker_colors = cl.scales['10']['div']['RdBu']
                )
             )

fig.layout.update(
    autosize = False,
    width = 400,
    height = 400,
    template = "plotly_white",
    #paper_bgcolor='#333'
)

fig.layout.update(
    title = go.layout.Title(
        text = 'Food',
        y = 0.9,
        x = 0.5,
        xanchor = 'center',
        yanchor = 'top',
    )
)

fig.show()