In [1]:
import re
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Data Cleaning 

In [2]:
# read the raw food-inspection records from the local CSV file
data = pd.read_csv('data/food-inspections.csv', sep=',')

# preview the first three rows
display(data.head(3))

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Results,Violations,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards
0,2345969,LORDANCHILD CHRISTIAN DAY CARE INC.,LORDANCHILD CHRISTIAN DAY CARE INC.,2215931.0,Children's Services Facility,Risk 1 (High),3344 W 79TH ST,CHICAGO,IL,60652.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.749915,-87.706628,"{'longitude': '41.749914910076974', 'latitude'...",,,,,
1,2345986,TACOS & SALSAS LLC,TACOS & SALSAS,2684121.0,Mobile Food Preparer,Risk 2 (Medium),2300 S THROOP ST,CHICAGO,IL,60608.0,...,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.850451,-87.658798,"{'longitude': '41.85045102427', 'latitude': '-...",,,,,
2,2345977,"KIMBALL DAY CARE CENTER & KINDERGARTEN , INC.","KIMBALL DAY CARE CENTER & KINDERGARTEN , INC.",2215859.0,Daycare (2 - 6 Years),Risk 1 (High),1636 N KIMBALL AVE,CHICAGO,IL,60647.0,...,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.911155,-87.711859,"{'longitude': '41.911154536126396', 'latitude'...",,,,,


### 1 - General cleaning

In this first part, we will do general cleaning : homogenize the format of our dataframe and remove the duplicates, empty columns, NA values and null values.

In [42]:
# remove the columns that carry no values
data.drop(
    columns=[
        'Historical Wards 2003-2015',
        'Zip Codes',
        'Community Areas',
        'Census Tracts',
        'Wards',
    ],
    inplace=True,
)

# preview the remaining columns
display(data.head(3))

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2320315,SERENDIPITY CHILDCARE,SERENDIPITY CHILDCARE,2216009.0,Daycare Above and Under 2 Years,Risk 1 (High),1300 W 99TH ST,CHICAGO,IL,60643.0,2019-10-23T00:00:00.000,License Re-Inspection,Pass,,41.714168,-87.655291,"{'longitude': '41.7141680989703', 'latitude': ..."
1,2320342,YOLK TEST KITCHEN,YOLK TEST KITCHEN,2589655.0,Restaurant,Risk 1 (High),1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,2019-10-23T00:00:00.000,Canvass,Pass w/ Conditions,23. PROPER DATE MARKING AND DISPOSITION - Comm...,41.913588,-87.682203,"{'longitude': '41.9135877900482', 'latitude': ..."
2,2320328,LAS ASADAS MEXICAN GRILL,LAS ASADAS MEXICAN GRILL,2583309.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,2019-10-23T00:00:00.000,Canvass,Out of Business,,41.808025,-87.720037,"{'longitude': '41.80802515275297', 'latitude':..."


> We can create a function that standardizes the column names in order to make the data friendlier to use. It removes the '#' character, replaces spaces with '_' and lower-cases all the letters.

In [6]:
def standardize(column):
    '''
    Normalize a column name: strip '#', lower-case it and use '_' for spaces.

    Parameters
    ----------
    column: str
        raw column name

    Returns
    -------
    str
        the normalized column name; when the result is longer than one
        character and ends with '_', that single trailing '_' is removed
    '''

    cleaned = column.replace("#", "").lower().replace(" ", "_")

    # e.g. "License #" becomes "license_": drop that one trailing underscore,
    # but leave a name made of a single character untouched
    if len(cleaned) > 1 and cleaned.endswith("_"):
        cleaned = cleaned[:-1]
    return cleaned

# rename every column of the dataset with its standardized form
data.columns = list(map(standardize, data.columns))

display(data.columns)


Index(['inspection_id', 'dba_name', 'aka_name', 'license', 'facility_type',
       'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
       'inspection_type', 'results', 'violations', 'latitude', 'longitude',
       'location', 'historical_wards_2003-2015', 'zip_codes',
       'community_areas', 'census_tracts', 'wards'],
      dtype='object')

> We need to check whether the inspection ID is unique. If it is not, we need to remove the duplicates, since an ID refers to a unique inspection.

In [7]:
# check whether each inspection ID appears only once
display(data.inspection_id.is_unique)

False

In [8]:
# keep only the first row found for each inspection ID
data.drop_duplicates(subset='inspection_id', keep='first', inplace=True)

# confirm that no duplicated inspection ID is left
display(data.inspection_id.is_unique)

True

> We need to remove the NA values since we can't use this kind of information. However, we only remove them from particular columns (for example, an NA in the violations column only means that there were no violations, so we need to keep it).

In [9]:
# drop the rows where any of the columns listed below is missing
data.dropna(
    subset=[
        'inspection_date',
        'license',
        'latitude',
        'longitude',
        'inspection_type',
    ],
    inplace=True,
)

> If we explore the license numbers, we find that some of them are equal to 0. We need to remove these rows.

In [10]:
# keep only the rows whose license number is different from 0.0
data = data.loc[data['license'].ne(0.0)]

### 2 - Cleaning the column *inspection_date*

We need to remove the time of inspection (useless information) in order to clean the inspection date column.

In [11]:
# Parse the inspection timestamps (e.g. '2019-10-23T00:00:00.000') into
# datetime values and truncate them to midnight, since the time of day is not
# needed. pd.to_datetime + normalize is vectorized and, unlike splitting each
# string on 'T', does not raise when the cell is re-run on a column that has
# already been converted.
data['inspection_date'] = (
    pd.to_datetime(data['inspection_date'])
    .dt.normalize()
    .astype('datetime64[ns]')  # pin the same dtype the column had before
)

display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,city,state,zip,...,results,violations,latitude,longitude,location,historical_wards_2003-2015,zip_codes,community_areas,census_tracts,wards
0,2345969,LORDANCHILD CHRISTIAN DAY CARE INC.,LORDANCHILD CHRISTIAN DAY CARE INC.,2215931.0,Children's Services Facility,Risk 1 (High),3344 W 79TH ST,CHICAGO,IL,60652.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.749915,-87.706628,"{'longitude': '41.749914910076974', 'latitude'...",,,,,
1,2345986,TACOS & SALSAS LLC,TACOS & SALSAS,2684121.0,Mobile Food Preparer,Risk 2 (Medium),2300 S THROOP ST,CHICAGO,IL,60608.0,...,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.850451,-87.658798,"{'longitude': '41.85045102427', 'latitude': '-...",,,,,
2,2345977,"KIMBALL DAY CARE CENTER & KINDERGARTEN , INC.","KIMBALL DAY CARE CENTER & KINDERGARTEN , INC.",2215859.0,Daycare (2 - 6 Years),Risk 1 (High),1636 N KIMBALL AVE,CHICAGO,IL,60647.0,...,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.911155,-87.711859,"{'longitude': '41.911154536126396', 'latitude'...",,,,,


### 3 - Cleaning of the column *city* and removing the column *state*

We need to check if there is only information from Chicago. After, we can remove the columns *city* and *state* since we don't need them for further use.

In [12]:
# list every distinct value of the city column to spot entries that are not Chicago
data['city'].unique()

array(['CHICAGO', nan, 'Chicago', 'CCHICAGO', 'CHICAGO.',
       'CHESTNUT STREET', 'CHICAGOCHICAGO', 'chicago', 'CHICAGOHICAGO',
       'CHicago', '312CHICAGO', 'BEDFORD PARK', 'CHCICAGO',
       'CHARLES A HAYES', 'CHCHICAGO', 'CHICAGOI', 'SUMMIT', 'WESTMONT',
       'LOMBARD', 'INACTIVE', 'BLUE ISLAND'], dtype=object)

> We need to remove: Bedford Park (gas station); Blue Island; Lombard (a village near Chicago); Summit (a city near Chicago); Westmont (a village near Chicago); Alsip (a suburb of Chicago).
We need to replace by Chicago: 'CHARLES A HAYES' (postal location); '312CHICAGO' (restaurant); 'CHICAGOI' (Chicago); 'CHESTNUT STREET' (street in Chicago); 'INACTIVE' (out-of-business restaurant in Chicago).

In [13]:
# check which values the state column takes
display(data.state.unique())

# the state column is not used in the rest of the analysis, so drop it
# (reassignment instead of inplace=True, to avoid mutating a filtered slice)
data = data.drop(columns=['state'])

# count the inspections per city spelling to spot locations outside Chicago
display(data.inspection_id.groupby(data['city']).count())

# Drop the inspections located in neighbouring municipalities.
# The comparison is made on a trimmed, upper-cased copy of the city name so
# that every spelling variant is caught: the previous case-sensitive match
# against "alsip" could not match an upper-case entry. Rows with a missing
# city are kept.
outside_chicago = ["BEDFORD PARK", "BLUE ISLAND", "LOMBARD", "SUMMIT", "WESTMONT", "ALSIP"]
city_key = data['city'].str.strip().str.upper()
data = data[~city_key.isin(outside_chicago)]

# check if the column is clean
display(data.inspection_id.groupby(data['city']).count())

# the remaining spellings all refer to Chicago, so the city column can be deleted
data = data.drop(columns=['city'])

array(['IL', nan], dtype=object)

city
312CHICAGO              2
BEDFORD PARK            2
BLUE ISLAND             1
CCHICAGO               45
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            194108
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               318
INACTIVE                8
LOMBARD                 1
SUMMIT                  4
WESTMONT                1
chicago                82
Name: inspection_id, dtype: int64

city
312CHICAGO              2
CCHICAGO               45
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            194108
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               318
INACTIVE                8
chicago                82
Name: inspection_id, dtype: int64

### 4 - Cleaning the column *inspection_result*

In [51]:
# count the inspections for each kind of result
data['inspection_id'].groupby(data['results']).count()

results
Business Not Located        49
Fail                     37354
No Entry                  6158
Not Ready                 1843
Out of Business          16676
Pass                    104678
Pass w/ Conditions       26566
Name: inspection_id, dtype: int64

> We are only interested in 3 types of results: Pass, Pass w/ Conditions and Fail. We need to remove the others.

In [52]:
# discard the inspections whose result is one of the values listed below
data = data.loc[
    ~data['results'].isin(
        ['Out of Business', 'Business Not Located', 'No Entry', 'Not Ready']
    )
]

### 5 - Cleaning the location information:  *zip* , *latitude* and *longitude*

In [67]:
# make sure that the latitude and longitude are float numbers
data['latitude'] = data['latitude'].astype(float)
data['longitude'] = data['longitude'].astype(float)

# Convert the zip code to a string without the float suffix
# ('60652.0' -> '60652') so it can be compared with the zip codes of the
# geojson file used in the visualization step.
# Missing zip codes are left as NaN (na_action='ignore'): a plain astype(str)
# would turn them into the literal string 'nan', which would then be treated
# as a real zip code when grouping or mapping.
data['zip'] = data['zip'].map(lambda z: str(z).split('.')[0], na_action='ignore')

### 6 - Cleaning the column *inspection_type*

We want to focus only on inspections about food and not alcohol, so we cannot keep the 'task force' inspections. We also decided to remove the license inspections.

In [14]:
# patterns an inspection type must contain (at least one) to be kept
searchfor = ['Re', 'Canvass','Food','Complaint']
# patterns that exclude an inspection type even when it matched above
ignore = ['Fire','Not Ready','Recent Inspection','License','Tag Removal','Recent inspection','SFP']

# step 1: keep the rows matching any of the wanted patterns
is_wanted = data['inspection_type'].str.contains('|'.join(searchfor))
data = data.loc[is_wanted]

# step 2: among those, remove the rows matching any of the ignored patterns
is_ignored = data['inspection_type'].str.contains('|'.join(ignore))
data = data.loc[~is_ignored]

# count the non-missing values of every column for each remaining inspection type
data.groupby('inspection_type').count()

Unnamed: 0_level_0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,results,violations,latitude,longitude,location,historical_wards_2003-2015,zip_codes,community_areas,census_tracts,wards
inspection_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Canvass,102930,102930,102060,102930,98837,102928,102930,102909,102930,102930,78702,102930,102930,102930,0,0,0,0,0
Canvass Re-Inspection,20490,20490,20417,20490,20476,20490,20490,20486,20490,20490,14830,20490,20490,20490,0,0,0,0,0
Complaint,18049,18049,17973,18049,18007,18049,18049,18048,18049,18049,16929,18049,18049,18049,0,0,0,0,0
Complaint Re-Inspection,7506,7506,7471,7506,7502,7506,7506,7506,7506,7506,5344,7506,7506,7506,0,0,0,0,0
Short Form Complaint,6706,6706,6676,6706,6694,6706,6706,6705,6706,6706,4897,6706,6706,6706,0,0,0,0,0
Suspected Food Poisoning,848,848,845,848,848,848,848,848,848,848,817,848,848,848,0,0,0,0,0
Suspected Food Poisoning Re-inspection,191,191,191,191,191,191,191,191,191,191,138,191,191,191,0,0,0,0,0


***For coordination purposes within the group, we will export this clean dataset to a csv file.***

In [15]:
# write the cleaned dataframe to a csv file in the current directory
data.to_csv('./clean_dataset.csv')