In [97]:
import pandas as pd
import numpy as np
import requests
from requests import get
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup # for web scraping
import seaborn as sns # for beautiful graphs
import scipy.stats as stats # to calculate r^2 for linear regressions
from scipy.stats import powerlaw # for plotting linear regressions
import statsmodels as sm
import matplotlib.ticker as mtick
import re
sns.set()

# Dataset Cleaning 

First , we need to remove the empty columns from the dataset.

In [3]:
#opening the data
data= pd.read_csv('/Users/Mariam/Desktop/chicago-food-inspections/food-inspections.csv',delimiter=',')

# drop all the empty columns
data.drop(['Historical Wards 2003-2015', 'Zip Codes', 'Community Areas','Census Tracts','Wards'], axis=1,inplace=True)

#show the dataframe
display(data.head(3))

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2320315,SERENDIPITY CHILDCARE,SERENDIPITY CHILDCARE,2216009.0,Daycare Above and Under 2 Years,Risk 1 (High),1300 W 99TH ST,CHICAGO,IL,60643.0,2019-10-23T00:00:00.000,License Re-Inspection,Pass,,41.714168,-87.655291,"{'longitude': '41.7141680989703', 'latitude': ..."
1,2320342,YOLK TEST KITCHEN,YOLK TEST KITCHEN,2589655.0,Restaurant,Risk 1 (High),1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,2019-10-23T00:00:00.000,Canvass,Pass w/ Conditions,23. PROPER DATE MARKING AND DISPOSITION - Comm...,41.913588,-87.682203,"{'longitude': '41.9135877900482', 'latitude': ..."
2,2320328,LAS ASADAS MEXICAN GRILL,LAS ASADAS MEXICAN GRILL,2583309.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,2019-10-23T00:00:00.000,Canvass,Out of Business,,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':..."


We need to standardize columns format in order to make it more friendly to use.

In [4]:
#We create a function that fills empty space by '_' and lower case all the letters (reformat all column headers)
def standardize(column):
    column = column.lower().replace(" ", "_")
    column = re.sub('\W+',"", column)
    if len(column) > 1:
        if column[-1] == "_":
            return column[:-1]
    return column

#application of the function to the dataset
data.columns = [standardize(x) for x in data.columns]
display(data.columns)


Index(['inspection_id', 'dba_name', 'aka_name', 'license', 'facility_type',
       'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
       'inspection_type', 'results', 'violations', 'latitude', 'longitude',
       'location'],
      dtype='object')

We need to check if the inspection ID is unique. If it is not the case ,we need to remove the duplicates as an ID reffers to an unique inspection.

In [5]:
#Check is the inspection ID is unique
display(data['inspection_id'].is_unique)

False

In [6]:
#Removing the duplicates 
data.drop_duplicates('inspection_id', inplace=True)

#check if all the duplicates were removed 
display(data['inspection_id'].is_unique)

True

We need to remove all the NA values since we can't use this kind of information. But we need to remove them only from particular column. ( ex : for the violations , NA only means that there were no violations so we need to keep this NA).

In [7]:
#Remove NA values from relevent columns
data.dropna(subset=['inspection_date','license','latitude','longitude'],inplace=True)

We need to remove the time of inspection (useless information) in order to clean the inspection date column

In [8]:
#We remove the time ( all the caracters after 'T')
data['inspection_date']=data['inspection_date'].apply(lambda x : x.split('T')[0])
display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2320315,SERENDIPITY CHILDCARE,SERENDIPITY CHILDCARE,2216009.0,Daycare Above and Under 2 Years,Risk 1 (High),1300 W 99TH ST,CHICAGO,IL,60643.0,2019-10-23,License Re-Inspection,Pass,,41.714168,-87.655291,"{'longitude': '41.7141680989703', 'latitude': ..."
1,2320342,YOLK TEST KITCHEN,YOLK TEST KITCHEN,2589655.0,Restaurant,Risk 1 (High),1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,2019-10-23,Canvass,Pass w/ Conditions,23. PROPER DATE MARKING AND DISPOSITION - Comm...,41.913588,-87.682203,"{'longitude': '41.9135877900482', 'latitude': ..."
2,2320328,LAS ASADAS MEXICAN GRILL,LAS ASADAS MEXICAN GRILL,2583309.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,2019-10-23,Canvass,Out of Business,,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':..."


We need to check if there are only information from Chicago.

In [9]:
#check if we have only data from chicage, if not we need to remove all the extra information
data.city.unique()

array(['CHICAGO', nan, 'Chicago', 'CCHICAGO', 'CHICAGO.',
       'CHESTNUT STREET', 'CHICAGOCHICAGO', 'chicago', 'CHICAGOHICAGO',
       'CHicago', '312CHICAGO', 'BEDFORD PARK', 'CHCICAGO',
       'CHARLES A HAYES', 'CHCHICAGO', 'CHICAGOI', 'SUMMIT', 'WESTMONT',
       'LOMBARD', 'INACTIVE', 'alsip', 'BLUE ISLAND'], dtype=object)

We need to remove :   Bedford Park (Gas Station) ; Blue Island; Lombard ( a village near to Chicago) , Summit ( a city near to Chicago) ; WESTMONT ( village near Chicago) ; aslip (suburb of chicago)
We need to replace by chicago : 'CHARLES A HAYES'  (postal location), 312Chicago (Restaurant) ; CHICAGOI (Chicago)  ; CHESTNUT STREET (street in chicago); INACTIVE (out of business restaurant in Chicago); Chestnut street ;

In [10]:
#Check if the state is unique
display(data.state.unique())

#As the state is unique and we will not use this columns for our further investigations, we can drop it 
data.drop(['state'], axis=1,inplace=True)

#Check if there are other city than Chicago
display(data.inspection_id.groupby(data['city']).count())

#Drop the selected locations 
data = data[~data['city'].isin(["BEDFORD PARK", "BLUE ISLAND", "LOMBARD","SUMMIT","WESTMONT","alsip"])]

#check if the column is clean 
display(data.inspection_id.groupby(data['city']).count())

#now that we are sure that we have only information from Chicago ,we can delete the city columns
data.drop(['city'], axis=1,inplace=True)

array(['IL', nan], dtype=object)

city
312CHICAGO              2
BEDFORD PARK            2
BLUE ISLAND             1
CCHICAGO               45
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            193192
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               317
INACTIVE                8
LOMBARD                 1
SUMMIT                  4
WESTMONT                1
alsip                   1
chicago                82
Name: inspection_id, dtype: int64

city
312CHICAGO              2
CCHICAGO               45
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            193192
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               317
INACTIVE                8
chicago                82
Name: inspection_id, dtype: int64

If we explore the license numbers, we find that there are some null license number. We need to remove them,

In [11]:
# Drop "0.0" licenses
data = data[data.license != 0.0]

In [93]:
#Converting inspection dates into float numbers
data.inspection_date = data.inspection_date.astype('datetime64[ns]')

#make sure that the latitude and longitude are float numbers
data['latitude'] = data['latitude'].astype(float)
data['longitude'] = data['longitude'].astype(float)

# Dataset with only the last inspection results 

In [77]:
#we want to selection only the informations from the last inspection for each facility
date_last_inspection=pd.DataFrame(data.groupby('license')['inspection_date'].max())
date_last_inspection
#by merging 'data' and 'date_last_inspection' we can create a data frame containing only infromations about the last inspection for each facility
data_last_inspection=pd.merge(date_last_inspection,data, on=['license', 'inspection_date'])
data_last_inspection.groupby('license').count()
data_last_inspection.rename(columns={'inspection_date': 'last_inspection_date'}, inplace=True)
data_last_inspection.head(3)

Unnamed: 0,license,last_inspection_date,inspection_id,dba_name,aka_name,facility_type,risk,address,zip,inspection_type,results,violations,latitude,longitude,location
0,1.0,2010-06-04,250567,HARVEST CRUSADES MINISTRIES,HARVEST CRUSADES MINISTRIES,Special Event,Risk 2 (Medium),118 N CENTRAL AVE,60644.0,Special Events (Festivals),Pass,,41.882845,-87.765095,"{'longitude': '41.88284507471884', 'latitude':..."
1,2.0,2018-02-13,2144871,COSI,COSI,Restaurant,Risk 1 (High),230 W MONROE ST,60606.0,Canvass,Pass w/ Conditions,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,41.880757,-87.634709,"{'longitude': '41.88075715864721', 'latitude':..."
2,9.0,2019-08-09,2304407,XANDO COFFEE & BAR / COSI SANDWICH BAR,XANDO COFFEE & BAR / COSI SANDWICH BAR,Restaurant,Risk 1 (High),116 S MICHIGAN AVE,60603.0,Canvass,Pass w/ Conditions,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.880396,-87.624502,"{'longitude': '41.88039583825962', 'latitude':..."


# Basic Score for the last inspection of a facility

We want to make the violation column more readible.
Using the followingl link https://webapps1.chicago.gov/healthinspection/Code_Violations.jsp#minor  , each number is associated with a unique violation.
For simplification purpose, we will consider only the violations from 1 to 44 and the violation number 70. 

In [78]:
#function that split the violations number from the comments

def violation_separator(violations):
    violation_number = pd.Series([])   #creating an empty dataframe in order to stock the violation numbers
    if type(violations) == str:
        violations = violations.split(' | ') #each different violation is separated by a ' | ' in a dataframe cell
        for violation in violations:        #now, we can iterate on the differente violations of each inspection
            index = "#" + violation.split('.')[0]  #the index refers to the violation number
            violation_number[index] = 1 #add 1 if there is a violation #.. and 0 if not.
    return violation_number

#we create a new data frame by applying the function to the dataset thhe old one and fill the nan value by 0 . 1= violation , 0= no violation
violations_data = data_last_inspection.violations.apply(violation_separator).fillna(0)

In [79]:
# Generate column names
critical = [("#" + str(num)) for num in range(1, 15)]
serious = [("#" + str(num)) for num in range(15, 30)]
minor = [("#" + str(num)) for num in range(30, 45)]
minor.append("#70")

# Create complete list of column names
columns = critical + serious + minor

# Create dataframe using column names, violation data and inspection ID
violations_data = pd.DataFrame(violations_data, columns=columns)
violations_data['inspection_id'] = data_last_inspection.inspection_id
violations_data['license'] = data_last_inspection.license

In [80]:
#counting the kind of violation per inspection ID
violation_counts = pd.DataFrame({
    "critical_count": violations_data[critical].sum(axis=1),
    "serious_count": violations_data[serious].sum(axis=1),
    "minor_count": violations_data[minor].sum(axis=1)
})

violation_counts['inspection_id'] = data_last_inspection.inspection_id
violation_counts['license'] = data_last_inspection.license
# Display selection of sums dataframe
violation_counts.iloc[3:6]

Unnamed: 0,critical_count,serious_count,minor_count,inspection_id,license
3,3.0,0.0,1.0,2222357,40.0
4,0.0,0.0,0.0,1418967,43.0
5,0.0,0.0,2.0,2145199,62.0


In [81]:
#minor violatios are more frequent .
violation_counts.sum()

critical_count    1.309800e+04
serious_count     5.197000e+03
minor_count       2.091100e+04
inspection_id     6.282335e+10
license           6.640389e+10
dtype: float64

Here, we want to create a basic score that depends on the violations type (critical, serious or minor).
critical = 3 points
serious= 2 points
minor=1 points 
The lower the score, the better.

In [92]:
#calculation of the basic score for each inspection
basic_score=violation_counts.apply(lambda x: x.critical_count*3+x.serious_count*2+x.minor_count*1, axis = 1)
violation_counts['basic_score'] = basic_score
violation_counts.sort_values('basic_score', ascending=False)

Unnamed: 0,critical_count,serious_count,minor_count,inspection_id,license,basic_score
26333,5.0,5.0,8.0,2269090,2304080.0,33.0
21780,3.0,6.0,7.0,2261633,2146568.0,28.0
15518,4.0,2.0,10.0,1575732,1964481.0,26.0
26560,5.0,3.0,4.0,2290366,2308923.0,25.0
3769,6.0,3.0,1.0,2279417,38365.0,25.0
...,...,...,...,...,...,...
15040,0.0,0.0,0.0,2050788,1942588.0,0.0
15039,0.0,0.0,0.0,1370071,1942579.0,0.0
15038,0.0,0.0,0.0,120390,1942574.0,0.0
15037,0.0,0.0,0.0,1098332,1942557.0,0.0


In [157]:
#add the name and the location information to this dataframe 
data_score = pd.merge(violation_counts,data[['license','zip','longitude','latitude']],on='license', how='left')
data_score.head(3)


Unnamed: 0,critical_count,serious_count,minor_count,inspection_id,license,basic_score,zip,longitude,latitude
0,0.0,0.0,0.0,250567,1.0,0.0,60644.0,-87.765095,41.882845
1,1.0,0.0,1.0,2144871,2.0,4.0,60606.0,-87.634709,41.880757
2,1.0,0.0,1.0,2144871,2.0,4.0,60606.0,-87.634709,41.880757


In [231]:
#We can compute the average score per zip code + rajouter la division
score_zip=data_score.groupby('zip')['basic_score'].sum()

#convertion into a dataframe (more friendly to use)
data_score_zip=pd.DataFrame(score_zip)

#we want to have zip as a column not as an index
data_score_zip.reset_index(level=0, inplace=True)

#convert into string (useful for the choropleth) -> passer en cleaning
data_score_zip.zip=data_score_zip.zip.astype(str)

#check if the convertion is donne correctly
data_score_zip.dtypes

#reformat the zip code writing in order to compare it with the zip code in geojson file
data_score_zip['zip']=data_score_zip['zip'].apply(lambda x : x.split('.')[0])
display(data_score_zip.head(3))





Unnamed: 0,zip,basic_score
0,60601,9229.0
1,60602,5523.0
2,60603,5005.0


# Area with the highest number of inspection 

In [242]:
%%capture
!pip install folium
!pip install geopandas
import folium
from folium import plugins
from folium.plugins import HeatMap
import os
import geopandas as gpd

%matplotlib inline

In [320]:
#function that generate basic maps 

def generateBaseMap(default_location=[41.8600, -87.6298], default_zoom_start=10):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map



In [146]:
#Mapping the areas with the highest number of inspections 

data['count'] = 1
base_map = generateBaseMap()
HeatMap(data=data[['latitude', 'longitude', 'count']].groupby(['latitude', 'longitude']).sum().reset_index().values.tolist(), radius=8, max_zoom=13).add_to(base_map)
base_map

# Basic score per zip code area

In [279]:
#path to geoJson file 
geo = os.path.join('/Users/Mariam/Desktop/Boundaries-ZIPCodes.geojson')

#creation of a geodataframe using geopandas
gdf = gpd.read_file(geo)
#add a column with the x-coordinate of the multipolygon
gdf['centroid_lon']=gdf['geometry'].centroid.x
#add a column with the y-coordinate of the multipolygon
gdf['centroid_lat']=gdf['geometry'].centroid.y
#setting a projection  by assigning the WGS84 latitude-longitude CRS to the crs attribute
gdf.crs = {'init' :'epsg:4326'}
gdf_with_score=pd.merge(gdf,data_score_zip,on='zip')
gdf_with_score

Unnamed: 0,objectid,shape_area,shape_len,zip,geometry,centroid_lon,centroid_lat,basic_score
0,33,106052287.488,42720.0444058,60647,"MULTIPOLYGON (((-87.67762 41.91776, -87.67761 ...",-87.702259,41.921098,27362.0
1,34,127476050.762,48103.7827213,60639,"MULTIPOLYGON (((-87.72683 41.92265, -87.72693 ...",-87.755996,41.920456,12454.0
2,35,45069038.4783,27288.6096123,60707,"MULTIPOLYGON (((-87.78500 41.90915, -87.78531 ...",-87.795738,41.919948,2568.0
3,51,3450671.14336,7909.89040711,60707,"MULTIPOLYGON (((-87.80662 41.93451, -87.80686 ...",-87.811606,41.936196,2568.0
4,36,70853834.3797,42527.9896789,60622,"MULTIPOLYGON (((-87.66707 41.88885, -87.66707 ...",-87.684212,41.903126,21919.0
...,...,...,...,...,...,...,...,...
56,57,155285532.005,53406.9156168,60623,"MULTIPOLYGON (((-87.69479 41.83008, -87.69486 ...",-87.716914,41.848458,13386.0
57,58,211114779.439,58701.3253749,60629,"MULTIPOLYGON (((-87.68306 41.75786, -87.68306 ...",-87.712752,41.775196,12657.0
58,59,211696050.967,58466.1602979,60620,"MULTIPOLYGON (((-87.62373 41.72167, -87.62388 ...",-87.653430,41.739570,5539.0
59,60,125424284.172,52377.8545408,60637,"MULTIPOLYGON (((-87.57691 41.79511, -87.57700 ...",-87.603050,41.781003,4979.0


In [317]:
#choropleth map

map_zip_score=generateBaseMap()

folium.Choropleth(
    geo_data=geo,
    name='choropleth',
    data=data_score_zip,
    columns=['zip', 'basic_score'],
    key_on='feature.properties.zip',
    fill_color='YlOrRd',
    fill_opacity=0.8,
    line_opacity=1,
    legend_name='Basic score'
).add_to(map_zip_score)





<folium.features.Choropleth at 0x1b1c287090>

In [326]:
# add markers with basic information
fg = folium.FeatureGroup(name='Zip Code Info')
for lat, lon,val, name in zip(gdf_with_score['centroid_lat'].tolist(),
                               gdf_with_score['centroid_lon'].tolist(),
                               gdf_with_score['basic_score'].tolist(),
                               gdf_with_score['zip'].tolist()):
    html = f"""
    <h2>{name}<\h2><br>
    <h4>Basic Score : {int(round(val,0))} HUF <\h4>
    """
    fg.add_child(folium.Marker(location=[lat, lon], popup=html))

map_zip_score.add_child(fg)
folium.LayerControl().add_to(map_zip_score)
map_zip_score