In [1]:
import re
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Data Cleaning 

In [3]:
# Load the raw food-inspection records from the local CSV file
data = pd.read_csv('data/food-inspections.csv', sep=',')

# Preview the first three rows
display(data.head(3))

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Results,Violations,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards
0,2346127,CREPE HOUSE CAFE,CREPE HOUSE CAFE,2637127.0,Restaurant,Risk 1 (High),5033 N ELSTON AVE,CHICAGO,IL,60630.0,...,Pass,,41.972349,-87.746825,"{'latitude': '-87.74682508578468', 'longitude'...",,,,,
1,2346148,EDIBLE ARRANGEMENTS #1250,EDIBLE ARRANGEMENTS #1250,2703497.0,Restaurant,Risk 2 (Medium),1783 W HOWARD ST,CHICAGO,IL,60626.0,...,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",42.01929,-87.675802,"{'latitude': '-87.67580161669515', 'longitude'...",,,,,
2,2346153,PINK'S CHILD CARE ACADEMY III,PINK'S CHILD CARE ACADEMY III,2215653.0,Daycare Combo 1586,Risk 1 (High),2914 W 87TH ST,CHICAGO,IL,60652.0,...,No Entry,,41.735418,-87.695297,"{'latitude': '-87.69529667271209', 'longitude'...",,,,,


### 1 - General cleaning

In this first part, we do some general cleaning: we homogenize the format of our dataframe and remove the duplicates, the empty columns, and the rows with NA or null values.

In [4]:
# Columns reported as empty in the raw export; they carry no information
empty_columns = [
    'Historical Wards 2003-2015',
    'Zip Codes',
    'Community Areas',
    'Census Tracts',
    'Wards',
]
data.drop(columns=empty_columns, inplace=True)

# Preview the dataframe without those columns
display(data.head(3))

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2346127,CREPE HOUSE CAFE,CREPE HOUSE CAFE,2637127.0,Restaurant,Risk 1 (High),5033 N ELSTON AVE,CHICAGO,IL,60630.0,2019-11-22T00:00:00.000,Canvass Re-Inspection,Pass,,41.972349,-87.746825,"{'latitude': '-87.74682508578468', 'longitude'..."
1,2346148,EDIBLE ARRANGEMENTS #1250,EDIBLE ARRANGEMENTS #1250,2703497.0,Restaurant,Risk 2 (Medium),1783 W HOWARD ST,CHICAGO,IL,60626.0,2019-11-22T00:00:00.000,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",42.01929,-87.675802,"{'latitude': '-87.67580161669515', 'longitude'..."
2,2346153,PINK'S CHILD CARE ACADEMY III,PINK'S CHILD CARE ACADEMY III,2215653.0,Daycare Combo 1586,Risk 1 (High),2914 W 87TH ST,CHICAGO,IL,60652.0,2019-11-22T00:00:00.000,Canvass,No Entry,,41.735418,-87.695297,"{'latitude': '-87.69529667271209', 'longitude'..."


> We can create a function that standardizes the column names in order to make the data friendlier to use. It removes '#', replaces spaces with '_' and lower-cases all the letters.

In [5]:
def standardize(column):
    '''
    Standardize a column name: remove '#', lower-case it, replace each
    space with '_' and strip the trailing '_' characters left behind.

    Parameters
    ----------
    column: str
        column name to be standardized

    Returns
    -------
    column: str
        column name after standardisation
    '''

    normalized = column.replace("#", "").lower().replace(" ", "_")

    # Empty or one-character names are returned as they are (a lone "_" is kept).
    if len(normalized) <= 1:
        return normalized

    # Strip every trailing underscore, not just the last one, so that names
    # such as "Zip # " become "zip" rather than "zip_".
    trimmed = normalized.rstrip("_")

    # A name made only of underscores collapses to a single "_" instead of "".
    return trimmed if trimmed else "_"

# Rename every column of the dataset with its standardized form
data.columns = list(map(standardize, data.columns))

display(data.columns)


Index(['inspection_id', 'dba_name', 'aka_name', 'license', 'facility_type',
       'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
       'inspection_type', 'results', 'violations', 'latitude', 'longitude',
       'location'],
      dtype='object')

> We need to check whether the inspection ID is unique. If it is not, we need to remove the duplicates, since an ID refers to a unique inspection.

In [6]:
# Does every inspection ID appear only once?
display(data.inspection_id.is_unique)

False

In [7]:
# Drop the rows whose inspection ID has already been seen
data.drop_duplicates(subset=['inspection_id'], inplace=True)

# Confirm that no duplicated ID is left
display(data.inspection_id.is_unique)

True

> We need to remove the NA values since we can't use this kind of information. However, we only remove them from particular columns (for example, an NA in the violations column only means that there were no violations, so we need to keep those rows).

In [8]:
# Discard the rows that have a missing value in any of the relevant columns
required_columns = ['inspection_date', 'license', 'latitude', 'longitude', 'inspection_type']
data.dropna(subset=required_columns, inplace=True)

> If we explore the license numbers, we find that some of them are null (0.0). We need to remove them.

In [9]:
# Keep only the rows whose license number is different from 0.0
has_valid_license = data['license'].ne(0.0)
data = data[has_valid_license]

### 2 - Cleaning the column *inspection_date*

We need to remove the time of inspection (useless information) in order to clean the inspection date column.

In [10]:
# Keep only the calendar date: everything from the 'T' separator onwards is the time.
# The vectorized .str accessor replaces the previous row-by-row lambda.
date_only = data['inspection_date'].str.split('T').str[0]

# Convert the date strings to pandas datetimes (datetime64[ns]), not to floats
data['inspection_date'] = date_only.astype('datetime64[ns]')

display(data.head(3))

Unnamed: 0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,results,violations,latitude,longitude,location
0,2346127,CREPE HOUSE CAFE,CREPE HOUSE CAFE,2637127.0,Restaurant,Risk 1 (High),5033 N ELSTON AVE,CHICAGO,IL,60630.0,2019-11-22,Canvass Re-Inspection,Pass,,41.972349,-87.746825,"{'latitude': '-87.74682508578468', 'longitude'..."
1,2346148,EDIBLE ARRANGEMENTS #1250,EDIBLE ARRANGEMENTS #1250,2703497.0,Restaurant,Risk 2 (Medium),1783 W HOWARD ST,CHICAGO,IL,60626.0,2019-11-22,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",42.01929,-87.675802,"{'latitude': '-87.67580161669515', 'longitude'..."
2,2346153,PINK'S CHILD CARE ACADEMY III,PINK'S CHILD CARE ACADEMY III,2215653.0,Daycare Combo 1586,Risk 1 (High),2914 W 87TH ST,CHICAGO,IL,60652.0,2019-11-22,Canvass,No Entry,,41.735418,-87.695297,"{'latitude': '-87.69529667271209', 'longitude'..."


### 3 - Cleaning of the column *city* and removing the column *state*

We need to check that the dataset only contains information from Chicago. Afterwards, we can remove the columns *city* and *state* since we don't need them for further use.

In [11]:
# List the distinct city spellings to see whether anything other than Chicago is present
data['city'].unique()

array(['CHICAGO', nan, 'Chicago', 'CCHICAGO', 'CHICAGO.',
       'CHESTNUT STREET', 'CHICAGOCHICAGO', 'chicago', 'CHICAGOHICAGO',
       'CHicago', '312CHICAGO', 'BEDFORD PARK', 'CHCICAGO',
       'CHARLES A HAYES', 'CHCHICAGO', 'CHICAGOI', 'SUMMIT', 'WESTMONT',
       'LOMBARD', 'INACTIVE', 'BLUE ISLAND'], dtype=object)

> We need to remove: Bedford Park (gas station); Blue Island; Lombard (a village near Chicago); Summit (a city near Chicago); Westmont (a village near Chicago); Alsip (a suburb of Chicago).
We need to replace by Chicago: 'CHARLES A HAYES' (postal location); '312CHICAGO' (restaurant); 'CHICAGOI' (Chicago); 'CHESTNUT STREET' (a street in Chicago); 'INACTIVE' (out-of-business restaurant in Chicago).

In [12]:
# check which states are present
display(data.state.unique())

# the state column is not used in our further investigations, so we can drop it
data.drop(['state'], axis = 1, inplace = True)

# check if there are other cities than Chicago
display(data.groupby('city')['inspection_id'].count())

# Localities outside Chicago that must be dropped. The comparison is made on the
# trimmed, upper-cased city so that every spelling of a locality (e.g. 'alsip',
# 'Alsip', 'ALSIP') is caught; the previous exact match was case-sensitive.
# Rows with a missing city are kept, as before.
cities_outside_chicago = ["BEDFORD PARK", "BLUE ISLAND", "LOMBARD", "SUMMIT", "WESTMONT", "ALSIP"]
is_outside_chicago = data['city'].str.strip().str.upper().isin(cities_outside_chicago)
data = data[~is_outside_chicago]

# check if the column is clean
display(data.groupby('city')['inspection_id'].count())

# now that only Chicago spellings are left, we can delete the city column
data.drop(['city'], axis=1,inplace=True)

array(['IL', nan], dtype=object)

city
312CHICAGO              2
BEDFORD PARK            2
BLUE ISLAND             1
CCHICAGO               44
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            194253
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               318
INACTIVE                8
LOMBARD                 1
SUMMIT                  4
WESTMONT                1
chicago                82
Name: inspection_id, dtype: int64

city
312CHICAGO              2
CCHICAGO               44
CHARLES A HAYES         4
CHCHICAGO               6
CHCICAGO                3
CHESTNUT STREET        11
CHICAGO            194253
CHICAGO.                2
CHICAGOCHICAGO          7
CHICAGOHICAGO           2
CHICAGOI                3
CHicago                12
Chicago               318
INACTIVE                8
chicago                82
Name: inspection_id, dtype: int64

### 4 - Cleaning the column *results*

In [13]:
# Count the inspections recorded for each result category
data['inspection_id'].groupby(data['results']).count()

results
Business Not Located        50
Fail                     37665
No Entry                  6266
Not Ready                 1883
Out of Business          16772
Pass                    105133
Pass w/ Conditions       27117
Name: inspection_id, dtype: int64

> We are only interested in 3 types of results: Pass, Pass w/ Conditions and Fail. We need to remove the others.

In [14]:
# Result categories we are not interested in; rows carrying them are removed
unwanted_results = ['Out of Business', 'Business Not Located', 'No Entry', 'Not Ready']
has_unwanted_result = data['results'].isin(unwanted_results)
data = data[~has_unwanted_result]

### 5 - Cleaning the location information:  *zip* , *latitude* and *longitude*

In [15]:
# make sure that the latitude and longitude are float numbers
data['latitude'] = data['latitude'].astype(float)
data['longitude'] = data['longitude'].astype(float)

# Convert the zip code to a string without its decimal part ('60630.0' -> '60630') so that
# it can be compared with the zip codes of the geojson file (visualization step).
# Any missing zip code stays missing (NaN) instead of becoming the literal string 'nan'.
zip_is_missing = data['zip'].isna()
data['zip'] = data['zip'].astype(str).str.split('.').str[0].mask(zip_is_missing)

### 6 - Cleaning the column *inspection_type*

We want to focus only on inspections about food and not alcohol, so we cannot keep the 'task force' inspections. We also decided to remove the license inspections.

In [16]:
# Keywords of the inspection types to keep, and keywords of those to exclude
searchfor = ['Re', 'Canvass','Food','Complaint']
ignore = ['Fire','Not Ready','Recent Inspection','License','Tag Removal','Recent inspection','SFP']

# Each keyword list becomes one alternation pattern
keep_pattern = '|'.join(searchfor)
exclude_pattern = '|'.join(ignore)

# A row is kept when its type matches a wanted keyword and no excluded keyword
inspection_types = data['inspection_type']
is_wanted = inspection_types.str.contains(keep_pattern)
is_excluded = inspection_types.str.contains(exclude_pattern)
data = data[is_wanted & ~is_excluded]

# Non-null counts per remaining inspection type
data.groupby('inspection_type').count()

Unnamed: 0_level_0,inspection_id,dba_name,aka_name,license,facility_type,risk,address,zip,inspection_date,results,violations,latitude,longitude,location
inspection_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Canvass,81763,81763,81453,81763,81565,81763,81763,81763,81763,81763,78598,81763,81763,81763
Canvass Re-Inspection,20196,20196,20126,20196,20187,20196,20196,20196,20196,20196,14657,20196,20196,20196
Complaint,17272,17272,17200,17272,17254,17272,17272,17272,17272,17272,16895,17272,17272,17272
Complaint Re-Inspection,7411,7411,7378,7411,7407,7411,7411,7411,7411,7411,5284,7411,7411,7411
Short Form Complaint,6606,6606,6576,6606,6599,6606,6606,6606,6606,6606,4876,6606,6606,6606
Suspected Food Poisoning,838,838,835,838,838,838,838,838,838,838,817,838,838,838
Suspected Food Poisoning Re-inspection,191,191,191,191,191,191,191,191,191,191,138,191,191,191


***For coordination purposes within the group, we export this clean dataset to a csv file.***

In [18]:
# Write the cleaned dataframe to a csv file
data.to_csv(path_or_buf=r'data/clean_dataset.csv')