In [6]:
import pandas as pd
import numpy as np
import requests
from requests import get
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup # for web scraping
import seaborn as sns # for beautiful graphs
import scipy.stats as stats # to calculate r^2 for linear regressions
from scipy.stats import powerlaw # for plotting linear regressions
import statsmodels as sm
import matplotlib.ticker as mtick
import re
sns.set()

# Dataset Cleaning 

First , we need to remove the empty columns from the dataset.

In [252]:
#opening the data
data= pd.read_csv('/Users/ahmedbenromdhane/Desktop/food-inspections.csv')

# drop all the empty columns
data.drop(['Historical Wards 2003-2015', 'Zip Codes', 'Community Areas','Census Tracts','Wards'], axis=1,inplace=True)

#show the dataframe
data

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2320971,JUMPSTART EARLY LEARNING ACADEMY,JUMPSTART EARLY LEARNING ACADEMY,2589822.0,Children's Services Facility,Risk 1 (High),7559 W ADDISON ST,CHICAGO,IL,60634.0,2019-11-01T00:00:00.000,Canvass,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.945065,-87.816734,"{'longitude': '41.945064857019986', 'latitude'..."
1,2320918,BEEFSTEAK,BEEFSTEAK,2698445.0,Restaurant,Risk 1 (High),303 E SUPERIOR ST,CHICAGO,IL,60611.0,2019-11-01T00:00:00.000,License,Pass,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,41.895692,-87.620143,"{'longitude': '41.895692401410514', 'latitude'..."
2,2320986,BABA'S COFFEE,BABA'S COFFEE,2423353.0,Restaurant,Risk 1 (High),5544-5546 N KEDZIE AVE,CHICAGO,IL,60625.0,2019-11-01T00:00:00.000,Canvass,No Entry,,41.982582,-87.708996,"{'longitude': '41.98258181784537', 'latitude':..."
3,2320910,J.T.'S GENUINE SANDWICH,J.T.'S GENUINE SANDWICH,2689893.0,Restaurant,Risk 1 (High),3970 N ELSTON AVE,CHICAGO,IL,60618.0,2019-11-01T00:00:00.000,License,Pass,51. PLUMBING INSTALLED; PROPER BACKFLOW DEVICE...,41.953378,-87.718848,"{'longitude': '41.95337788158545', 'latitude':..."
4,2320904,"KID'Z COLONY DAYCARE, INC.","KID'Z COLONY DAYCARE, INC.",2215609.0,Daycare Above and Under 2 Years,Risk 1 (High),6287 S ARCHER AVE,CHICAGO,IL,60638.0,2019-11-01T00:00:00.000,Canvass,Fail,16. FOOD-CONTACT SURFACES: CLEANED & SANITIZED...,41.793235,-87.777776,"{'longitude': '41.7932347787373', 'latitude': ..."
5,2320969,JUST A PIZZA PLUS INC,JUST A PIZZA PLUS,75583.0,Restaurant,Risk 1 (High),5136 S ARCHER AVE,CHICAGO,IL,60632.0,2019-11-01T00:00:00.000,Complaint,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.800619,-87.731143,"{'longitude': '41.8006193315046', 'latitude': ..."
6,2320901,GIFT 4 KIDS DAYCARE CENTER LLC,GIFT 4 KIDS DAYCARE,2341291.0,Daycare Above and Under 2 Years,Risk 1 (High),1305-1307 W 111TH ST,CHICAGO,IL,60643.0,2019-11-01T00:00:00.000,Canvass Re-Inspection,Pass,56. ADEQUATE VENTILATION & LIGHTING; DESIGNATE...,41.692098,-87.654809,"{'longitude': '41.6920980711081', 'latitude': ..."
7,2320922,FIFTH CAMPUS,HUNTER PERKINS CAMPUS,2510248.0,School,Risk 2 (Medium),1700 W 83RD ST,CHICAGO,IL,60620.0,2019-11-01T00:00:00.000,Canvass,Pass,47. FOOD & NON-FOOD CONTACT SURFACES CLEANABLE...,41.743149,-87.665746,"{'longitude': '41.74314889790767', 'latitude':..."
8,2320960,CARNICERIA Y TAQUERIA TIERRA,CARNICERIA Y TAQUERIA TIERRA,2428138.0,Grocery Store,Risk 1 (High),3312-3314 W NORTH AVE,CHICAGO,IL,60647.0,2019-11-01T00:00:00.000,Canvass Re-Inspection,Pass,,41.910185,-87.709907,"{'longitude': '41.91018535990397', 'latitude':..."
9,2320905,MCDONALD'S #4305,MCDONALD'S,1922105.0,Restaurant,Risk 2 (Medium),4844 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-11-01T00:00:00.000,Complaint,Pass w/ Conditions,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,41.969754,-87.689403,"{'longitude': '41.96975372057437', 'latitude':..."


We need to standardize columns format in order to make it more friendly to use.

In [253]:
#We create a function that fills empty space by '_' and lower case all the letters (reformat all column headers)
def standardize(column):
    column = column.lower().replace(" ", "_")
    column = re.sub('\W+',"", column)
    if len(column) > 1:
        if column[-1] == "_":
            return column[:-1]
    return column

#application of the function to the dataset
data.columns = [standardize(x) for x in data.columns]
display(data.columns)


Index(['inspection_id', 'dba_name', 'aka_name', 'license', 'facility_type',
       'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
       'inspection_type', 'results', 'violations', 'latitude', 'longitude',
       'location'],
      dtype='object')

We need to check if the inspection ID is unique. If it is not the case ,we need to remove the duplicates as an ID reffers to an unique inspection.

In [254]:
#Check is the inspection ID is unique
display(data['inspection_id'].is_unique)

False

In [255]:
#Removing the duplicates 
data.drop_duplicates('inspection_id', inplace=True)

#check if all the duplicates were removed 
display(data['inspection_id'].is_unique)

True

We need to remove all the NA values since we can't use this kind of information. But we need to remove them only from particular column. ( ex : for the violations , NA only means that there were no violations so we need to keep this NA).

In [256]:
#Remove NA values from relevent columns
data.dropna(subset=['inspection_date','license','latitude','longitude'],inplace=True)

We need to remove the time of inspection (useless information) in order to clean the inspection date column

In [257]:
#We remove the time ( all the caracters after 'T')
data['inspection_date']=data['inspection_date'].apply(lambda x : x.split('T')[0])
display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2320971,JUMPSTART EARLY LEARNING ACADEMY,JUMPSTART EARLY LEARNING ACADEMY,2589822.0,Children's Services Facility,Risk 1 (High),7559 W ADDISON ST,CHICAGO,IL,60634.0,2019-11-01,Canvass,Pass w/ Conditions,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.945065,-87.816734,"{'longitude': '41.945064857019986', 'latitude'..."
1,2320918,BEEFSTEAK,BEEFSTEAK,2698445.0,Restaurant,Risk 1 (High),303 E SUPERIOR ST,CHICAGO,IL,60611.0,2019-11-01,License,Pass,39. CONTAMINATION PREVENTED DURING FOOD PREPAR...,41.895692,-87.620143,"{'longitude': '41.895692401410514', 'latitude'..."
2,2320986,BABA'S COFFEE,BABA'S COFFEE,2423353.0,Restaurant,Risk 1 (High),5544-5546 N KEDZIE AVE,CHICAGO,IL,60625.0,2019-11-01,Canvass,No Entry,,41.982582,-87.708996,"{'longitude': '41.98258181784537', 'latitude':..."


We need to check if there are only information from Chicago.

In [258]:
#check if we have only data from chicage, if not we need to remove all the extra information
data.city.unique()

array(['CHICAGO', nan, 'Chicago', 'CCHICAGO', 'CHICAGO.',
       'CHESTNUT STREET', 'CHICAGOCHICAGO', 'chicago', 'CHICAGOHICAGO',
       'CHicago', '312CHICAGO', 'BEDFORD PARK', 'CHCICAGO',
       'CHARLES A HAYES', 'CHCHICAGO', 'CHICAGOI', 'SUMMIT', 'WESTMONT',
       'LOMBARD', 'INACTIVE', 'alsip', 'BLUE ISLAND'], dtype=object)

We need to remove :   Bedford Park (Gas Station) ; Blue Island; Lombard ( a village near to Chicago) , Summit ( a city near to Chicago) ; WESTMONT ( village near Chicago) ; aslip (suburb of chicago)
We need to replace by chicago : 'CHARLES A HAYES'  (postal location), 312Chicago (Restaurant) ; CHICAGOI (Chicago)  ; CHESTNUT STREET (street in chicago); INACTIVE (out of business restaurant in Chicago); Chestnut street ;

In [259]:
#Check if the state is unique
display(data.state.unique())

#As the state is unique and we will not use this columns for our further investigations, we can drop it 
data.drop(['state'], axis=1,inplace=True)

#Check if there are other city than Chicago
display(data.inspection_id.groupby(data['city']).count())

#Drop the selected locations 
data = data[~data['city'].isin(["BEDFORD PARK", "BLUE ISLAND", "LOMBARD","SUMMIT","WESTMONT","alsip"])]

#check if the column is clean 
display(data.inspection_id.groupby(data['city']).count())

#now that we are sure that we have only information from Chicago ,we can delete the city columns
data.drop(['city'], axis=1,inplace=True)

array(['IL', nan], dtype=object)

city
312CHICAGO              2
BEDFORD PARK            2
BLUE ISLAND             1
CCHICAGO               46
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            193715
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               317
INACTIVE                8
LOMBARD                 1
SUMMIT                  4
WESTMONT                1
alsip                   1
chicago                82
Name: inspection_id, dtype: int64

city
312CHICAGO              2
CCHICAGO               46
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            193715
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               317
INACTIVE                8
chicago                82
Name: inspection_id, dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


If we explore the license numbers, we find that there are some null license number. We need to remove them,

In [260]:
# Drop "0.0" licenses
data = data[data.license != 0.0]

We want to make the violation column more readible.
Using ( mettre un le link ) , each number is associated with a unique violation.

In [138]:
#function that split the violations number from the comments

def violation_separator(violations):
    violation_number = pd.Series([])   #creating an empty dataframe in order to stock the violation numbers
    if type(violations) == str:
        violations = violations.split(' | ') #each different violation is separated by a ' | ' in a dataframe cell
        for violation in violations:        #now, we can iterate on the differente violations of each inspection
            index = "#" + violation.split('.')[0]  #the index refers to the violation number
            violation_number[index] = 1 #add 1 if there is a violation #.. and 0 if not.
    return violation_number

#apply the function to the dataset and fill the nan value by 0 . 1= violation , 0= no violation
violations_data = data.violations.apply(violation_separator).fillna(0)

KeyboardInterrupt: 

In [None]:
violations_data

In [261]:
#Converting inspection dates into float numbers
data.inspection_date = data.inspection_date.astype('datetime64[ns]')

In [262]:
#Sorting by ascending = False the inspection dates for each facility (license)
#Each df is a dataframe and contains the inspection data of a given facility
dfs = []
for elem, df in data.groupby(['license']):
    dfs.append(df.sort_values(by='inspection_date',ascending=False)) 


In [263]:
#We concatenate to obtain our entire dataframe sorted by inspection date for each license
d = pd.concat(dfs,sort=True).reset_index().drop(columns = 'index')

In [264]:
data2 = d.set_index(['license','inspection_id','inspection_date'])

In [265]:
#data with sorted inspection dates for each facility
data2 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,address,aka_name,dba_name,facility_type,inspection_type,latitude,location,longitude,results,risk,violations,zip
license,inspection_id,inspection_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,250567,2010-06-04,118 N CENTRAL AVE,HARVEST CRUSADES MINISTRIES,HARVEST CRUSADES MINISTRIES,Special Event,Special Events (Festivals),41.882845,"{'longitude': '41.88284507471884', 'latitude':...",-87.765095,Pass,Risk 2 (Medium),,60644.0
2.0,2144871,2018-02-13,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass w/ Conditions,Risk 1 (High),3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,60606.0
2.0,2050308,2017-05-12,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass,Risk 1 (High),33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,60606.0
2.0,1977093,2016-12-14,230 W MONROE ST,COSI,COSI,Restaurant,Short Form Complaint,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass w/ Conditions,Risk 1 (High),3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,60606.0
2.0,1970902,2016-11-04,230 W MONROE ST,COSI,COSI,Restaurant,Canvass Re-Inspection,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass,Risk 1 (High),"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",60606.0
2.0,1970312,2016-10-27,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Fail,Risk 1 (High),3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,60606.0
2.0,1684285,2016-03-22,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass w/ Conditions,Risk 1 (High),3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,60606.0
2.0,1607237,2015-12-14,230 W MONROE ST,COSI,COSI,Restaurant,Complaint,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass,Risk 1 (High),33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,60606.0
2.0,1418938,2015-04-21,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass,Risk 1 (High),31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...,60606.0
2.0,1447571,2014-11-20,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass,Risk 1 (High),31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...,60606.0


In [266]:
#We can keep only the last date of inspection (first inspection date for each facility)
data_with_last_Inspection_date = d[d.groupby('license')['inspection_date'].transform('max') == d['inspection_date']]

In [267]:
data3 = data_with_last_Inspection_date.set_index(['license','inspection_id','inspection_date'])

In [268]:
data3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,address,aka_name,dba_name,facility_type,inspection_type,latitude,location,longitude,results,risk,violations,zip
license,inspection_id,inspection_date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,250567,2010-06-04,118 N CENTRAL AVE,HARVEST CRUSADES MINISTRIES,HARVEST CRUSADES MINISTRIES,Special Event,Special Events (Festivals),41.882845,"{'longitude': '41.88284507471884', 'latitude':...",-87.765095,Pass,Risk 2 (Medium),,60644.0
2.0,2144871,2018-02-13,230 W MONROE ST,COSI,COSI,Restaurant,Canvass,41.880757,"{'longitude': '41.88075715864721', 'latitude':...",-87.634709,Pass w/ Conditions,Risk 1 (High),3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,60606.0
9.0,2304407,2019-08-09,116 S MICHIGAN AVE,XANDO COFFEE & BAR / COSI SANDWICH BAR,XANDO COFFEE & BAR / COSI SANDWICH BAR,Restaurant,Canvass,41.880396,"{'longitude': '41.88039583825962', 'latitude':...",-87.624502,Pass w/ Conditions,Risk 1 (High),"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",60603.0
40.0,2222357,2018-09-14,233 N MICHIGAN AVE,COSI,COSI,Restaurant,Complaint,41.886567,"{'longitude': '41.886567370886944', 'latitude'...",-87.624385,Pass w/ Conditions,Risk 1 (High),2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,60601.0
43.0,1418967,2015-05-04,28 E JACKSON BLVD,COSI,COSI,,Canvass,41.878342,"{'longitude': '41.87834161206342', 'latitude':...",-87.626675,Out of Business,Risk 3 (Low),,60604.0
62.0,2145199,2018-02-20,230 W WASHINGTON ST,XANDO COFFEE & BAR / COSI SANDWICH BAR,XANDO COFFEE & BAR / COSI SANDWICH BAR,Restaurant,Canvass,41.883318,"{'longitude': '41.88331785985083', 'latitude':...",-87.634769,Pass,Risk 1 (High),33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,60606.0
85.0,1361105,2013-09-20,55 E GRAND AVE,XANDO COFFEE & BAR / COSI SANDWICH BAR,XANDO COFFEE & BAR / COSI SANDWICH BAR,Restaurant,Canvass,41.891591,"{'longitude': '41.891590741083505', 'latitude'...",-87.625867,Out of Business,Risk 1 (High),,60611.0
99.0,1982143,2017-01-26,203 N LA SALLE ST,COSI,XANDO COFFEE & BAR / COSI SANDWICH BAR,Restaurant,Canvass,41.885822,"{'longitude': '41.885822047853026', 'latitude'...",-87.632304,Out of Business,Risk 1 (High),,60601.0
104.0,1150252,2012-04-04,8433-8435 S PULASKI RD,VITO & NICK'S LOUNGE,VITO & NICK'S LOUNGE,,Canvass,41.739329,"{'longitude': '41.739329410001126', 'latitude'...",-87.721440,Out of Business,Risk 3 (Low),,60652.0
115.0,2285484,2019-04-19,3714 S HALSTED ST,JOHN SCHALLER,JOHN SCHALLER,Restaurant,Canvass,41.827185,"{'longitude': '41.82718501563474', 'latitude':...",-87.646170,Out of Business,Risk 1 (High),,60609.0
