# DineSafe Data Analysis

## Data Wrangling

In [2]:
#relevant libraries
import datetime
import json
import os
import re
import warnings
import xml.etree.ElementTree as ET

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
#xmltodict is very useful here, let's get it

In [3]:
pip install xmltodict

Note: you may need to restart the kernel to use updated packages.


In [4]:
from xmltodict import parse
from collections import ChainMap


def _parse_xml_file(path):
    """Parse the XML file at ``path`` with xmltodict and return the parsed dict.

    The file is opened in binary mode inside a ``with`` block so the handle is
    closed once parsing has finished instead of being left open.
    """
    with open(path, 'rb') as xml_file:
        return parse(xml_file)


#need to flatten the XML as pandas has a hard time parsing this with from_XML()
raw_dict = _parse_xml_file('ds_od_xml 2022-11-13.xml')
raw_dict1 = dict(ChainMap(*raw_dict.values())) #remove the initial header key
raw_data_1 = pd.json_normalize(raw_dict1,record_path='ESTABLISHMENT')

#need to re-encode this file because this file is encoded in latin-1 and Pandas can't decode it for some reason
#the tree is read as latin-1 and written back out as UTF-8 to 'out.xml', which is then parsed like the other files
parser = ET.XMLParser(encoding='latin-1')
latin1_tree = ET.parse('ds 2022-01-13.xml',parser)
latin1_tree.write('out.xml', encoding='utf-8')
raw_dict = _parse_xml_file('out.xml')
raw_dict2 = dict(ChainMap(*raw_dict.values())) #remove the initial dinesafe header key
raw_data_2 = pd.json_normalize(raw_dict2,record_path='ESTABLISHMENT')

#this older file is in UTF-8, hurray, but still too highly nested for Pandas
raw_dict = _parse_xml_file('dinesafe 2019-04-08.xml')
raw_dict3 = dict(ChainMap(*raw_dict.values()))
raw_data_3 = pd.json_normalize(raw_dict3,record_path='ROW')

#finally, hand-coded chains
#explicit dtypes per column so the CSV is not left to type inference
chains = pd.read_csv(
    'dinesafe 2019-04-08 chains.csv',
    encoding='latin-1',
    dtype={
        'Establishment ID':'int',
        'Inspection ID':'int',
        'Company/Franchise':'str',
        'Name':'str',
        'Type':'str',
        'Address':'str',
        'Lat':'float',
        'Long':'float',
        'Status':'str',
        'Minimum inspections per year':'int',
        'Infraction details':'str',
        'Inspection date':'str',
        'Severity':'str',
        'Action':'str',
        'Court outcome':'str',
        'Amount fined':'float',
    },
)

In [1392]:
#processing the first file
#inspections are stored in four places: the root level of 'raw_data', as nested lists in 'inspection.infraction', as nested lists in 'inspection', and as dictionaries in 'inspection'
#raw_data_1 has some inspection results present right in the first level, and inspection data further nested into the XML

#let's start with grabbing just the top-level data
#select rows with a boolean mask (label-based) rather than feeding index labels to .iloc, which only lines up while raw_data_1 keeps its default 0..n-1 index
unnested_inspections = raw_data_1.loc[raw_data_1['INSPECTION.DATE'].notna(),:]
#the establishment-level STATUS is dropped so the inspection-level status can take over the 'STATUS' name
unnested_inspections = unnested_inspections.drop(['STATUS'],axis='columns').rename({'INSPECTION.STATUS':'STATUS','INSPECTION.DATE':'DATE','INSPECTION.INFRACTION.DEFICIENCY':'DEFICIENCY','INSPECTION.INFRACTION.SEVERITY':'SEVERITY','INSPECTION.INFRACTION.ACTION':'ACTION','INSPECTION.INFRACTION.CONVICTION_DATE':'CONVICTION_DATE','INSPECTION.INFRACTION.COURT_OUTCOME':'COURT_OUTCOME','INSPECTION.INFRACTION.AMOUNT_FINED':'AMOUNT_FINED'},axis='columns')
#unnested_inspections = unnested_inspections.loc[unnested_inspections['INSPECTION.INFRACTION'].isna(),:].drop(['INSPECTION.INFRACTION','INSPECTION'],axis='columns')

#let's also check the times where there's nothing in inspection but something put under inspection.infraction
split_embed_infractions = raw_data_1['INSPECTION.INFRACTION'].dropna().explode().reset_index().rename({'index':'level_0'},axis='columns') #shares index with raw_data
#need to split it from the index and rejoin later, because the from/to-dict denesting method drops duplicate keys so multiple infractions aren't recorded
flattened_inspections2 = pd.concat([split_embed_infractions.iloc[:,0],pd.DataFrame.from_dict(split_embed_infractions.iloc[:,1].to_dict(),orient='index')],axis='columns')
#does not have date, level_0 references raw_data index
#join to unnested_inspections to get the date back in
flattened_inspections2 = flattened_inspections2.join(unnested_inspections.loc[unnested_inspections['INSPECTION.INFRACTION'].notna(),:],rsuffix='.un',on='level_0')
#rows whose infractions were just flattened above are removed here so they are not concatenated twice further down
unnested_inspections = unnested_inspections.loc[unnested_inspections['INSPECTION.INFRACTION'].isna(),:]

#now for the inspection records shown as dicts in the 'inspection' column
#first remove null values - ones with no inspections
no_nulls = raw_data_1['INSPECTION'].dropna()

# now pull out single-infraction data
inspections = no_nulls.reset_index().rename({'index':'level_0'},axis='columns').explode('INSPECTION') #level_0 references raw_data
one_infraction = pd.json_normalize(inspections.loc[:,'INSPECTION'])
one_infraction = one_infraction.loc[one_infraction['INFRACTION'].isna(),:]
one_infraction = one_infraction.join(inspections.reset_index()) #now level_0 references raw_data

# now pull out multiple-infracted nested data
nested_data = pd.json_normalize(inspections.loc[:,'INSPECTION']) #find rows with nested inspection data
nested_data = pd.concat([inspections.loc[:,'level_0'].reset_index(),nested_data],axis='columns') #assign level 0 which refs raw_data
nested_data = nested_data.loc[nested_data['INFRACTION'].notna(),:] #use only rows with nested inspection data

#now unstack the multiple-infractions into a single set of columns
multi_infractions = pd.concat([nested_data.rename({'level_0':'raw_data_index'},axis='columns').reset_index(),pd.json_normalize(nested_data.loc[:,'INFRACTION'])],axis='columns')#index references inspections
#join the unstacked columns back to the index
#NOTE(review): the positional slices below (1:12, 11:, 17:) depend on the exact column order produced by json_normalize for this file - re-check them if the input schema changes
multi_infr_unstacked = pd.json_normalize(multi_infractions.drop(multi_infractions.columns[1:12],axis='columns').melt('level_0').loc[:,'value'])
multi_infr_index = multi_infractions.drop(multi_infractions.columns[1:12],axis='columns').melt('level_0').loc[:,'level_0']
multi_infr_unstacked = pd.concat([multi_infr_index,multi_infr_unstacked],axis='columns')

multi_infractions = multi_infr_unstacked.join(multi_infractions.set_index('level_0'),on='level_0')
multi_infractions = multi_infractions.loc[multi_infractions['SEVERITY'].notna(),:] #remove blank rows from melt
multi_infractions = multi_infractions.join(nested_data.drop('level_0',axis='columns').reset_index(),rsuffix='.i',on='level_0') #now we have added raw_data indices
multi_infractions = multi_infractions.drop(multi_infractions.columns[11:],axis='columns') #clean up a bit
multi_infractions = multi_infractions.drop('level_0',axis='columns').reset_index().join(raw_data_1,on='raw_data_index',rsuffix='.raw') #and join to raw_index
multi_infractions = multi_infractions.drop(multi_infractions.columns[17:],axis='columns') #clean up some more

#need to get all the infractions back into a single inspection dataframe then back to the restaurant list
one_infraction.rename({'INFRACTION.SEVERITY':'SEVERITY','INFRACTION.DEFICIENCY':'DEFICIENCY','INFRACTION.ACTION':'ACTION','INFRACTION.CONVICTION_DATE':'CONVICTION_DATE','INFRACTION.COURT_OUTCOME':'COURT_OUTCOME','INFRACTION.AMOUNT_FINED':'AMOUNT_FINED'},axis='columns',inplace=True)
infractions_1 = pd.concat([multi_infractions,unnested_inspections,flattened_inspections2])
#unnested inspections already has restaurant data since we took it from the top level

#clean things up
cleaned_data_1 = pd.concat([infractions_1.join(raw_data_1,on='level_0',rsuffix='.raw'),one_infraction.join(raw_data_1,on='level_0',rsuffix='.raw')])
#fill the gaps in each target column from its fallback column
#the filled column is assigned back to the frame; calling fillna(inplace=True) on a column selection only works while that selection is a view of the frame
fallback_columns = {
    'NAME':'NAME.raw',
    'ID':'ID.raw',
    'TYPE':'TYPE.raw',
    'ADDRESS':'ADDRESS.raw',
    'LATITUDE':'LATITUDE.raw',
    'LONGITUDE':'LONGITUDE.raw',
    'SEVERITY':'INSPECTION.INFRACTION.SEVERITY',
    'DEFICIENCY':'INSPECTION.INFRACTION.DEFICIENCY',
    'ACTION':'INSPECTION.INFRACTION.ACTION',
    'CONVICTION_DATE':'INSPECTION.INFRACTION.CONVICTION_DATE',
    'COURT_OUTCOME':'INSPECTION.INFRACTION.COURT_OUTCOME',
    'AMOUNT_FINED':'INSPECTION.INFRACTION.AMOUNT_FINED',
}
for target_column,fallback_column in fallback_columns.items():
    cleaned_data_1[target_column] = cleaned_data_1[target_column].fillna(cleaned_data_1[fallback_column])
#drop the suffixed duplicates, the still-nested columns and the helper index columns now that their values have been merged
cleaned_data_1 = cleaned_data_1.drop(['NAME.raw','ID.raw','STATUS.raw','TYPE.raw','ADDRESS.raw','LATITUDE.raw','LONGITUDE.raw','INSPECTION.raw',
                     'INSPECTION.INFRACTION.raw','SEVERITY.un','DEFICIENCY.un','ACTION.un','CONVICTION_DATE.un','COURT_OUTCOME.un','AMOUNT_FINED.un',
                     'INSPECTION.DATE','INSPECTION.INFRACTION','INSPECTION.STATUS','INSPECTION.INFRACTION.SEVERITY','INSPECTION.INFRACTION.DEFICIENCY',
                     'INSPECTION.INFRACTION.ACTION','INSPECTION.INFRACTION.CONVICTION_DATE','INSPECTION.INFRACTION.COURT_OUTCOME',
                     'INSPECTION.INFRACTION.AMOUNT_FINED','index','level_0','raw_data_index','INFRACTION','INSPECTION'],axis='columns')
cleaned_data_1.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 36937 entries, 0 to 16592
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   SEVERITY         22330 non-null  object
 1   DEFICIENCY       22330 non-null  object
 2   ACTION           22330 non-null  object
 3   CONVICTION_DATE  40 non-null     object
 4   COURT_OUTCOME    95 non-null     object
 5   AMOUNT_FINED     39 non-null     object
 6   STATUS           36937 non-null  object
 7   DATE             36937 non-null  object
 8   ID               36937 non-null  object
 9   NAME             36937 non-null  object
 10  TYPE             36937 non-null  object
 11  ADDRESS          36937 non-null  object
 12  LATITUDE         36937 non-null  object
 13  LONGITUDE        36937 non-null  object
dtypes: object(14)
memory usage: 4.2+ MB


In [1389]:
#cleaned_data_1.loc[cleaned_data_1['NAME'].str.contains('hashtag indi',case=False),:]

In [1394]:
#processing the second file
#inspections are stored in four places: the root level of 'raw_data', as nested lists in 'inspection.infraction', as nested lists in 'inspection', and as dictionaries in 'inspection'
#raw_data_2 has some inspection results present right in the first level, and inspection data further nested into the XML

#let's start with grabbing just the top-level data
#select rows with a boolean mask (label-based) rather than feeding index labels to .iloc, which only lines up while raw_data_2 keeps its default 0..n-1 index
unnested_inspections = raw_data_2.loc[raw_data_2['INSPECTION.DATE'].notna(),:]
#the establishment-level STATUS is dropped so the inspection-level status can take over the 'STATUS' name
unnested_inspections = unnested_inspections.drop(['STATUS'],axis='columns').rename({'INSPECTION.STATUS':'STATUS','INSPECTION.DATE':'DATE','INSPECTION.INFRACTION.DEFICIENCY':'DEFICIENCY','INSPECTION.INFRACTION.SEVERITY':'SEVERITY','INSPECTION.INFRACTION.ACTION':'ACTION','INSPECTION.INFRACTION.CONVICTION_DATE':'CONVICTION_DATE','INSPECTION.INFRACTION.COURT_OUTCOME':'COURT_OUTCOME','INSPECTION.INFRACTION.AMOUNT_FINED':'AMOUNT_FINED'},axis='columns')
#unnested_inspections = unnested_inspections.loc[unnested_inspections['INSPECTION.INFRACTION'].isna(),:].drop(['INSPECTION.INFRACTION','INSPECTION'],axis='columns')

#let's also check the times where there's nothing in inspection but something put under inspection.infraction
split_embed_infractions = raw_data_2['INSPECTION.INFRACTION'].dropna().explode().reset_index().rename({'index':'level_0'},axis='columns') #shares index with raw_data
#need to split it from the index and rejoin later, because the from/to-dict denesting method drops duplicate keys so multiple infractions aren't recorded
flattened_inspections2 = pd.concat([split_embed_infractions.iloc[:,0],pd.DataFrame.from_dict(split_embed_infractions.iloc[:,1].to_dict(),orient='index')],axis='columns')
#does not have date, level_0 references raw_data index
#join to unnested_inspections to get the date back in
flattened_inspections2 = flattened_inspections2.join(unnested_inspections.loc[unnested_inspections['INSPECTION.INFRACTION'].notna(),:],rsuffix='.un',on='level_0')
#rows whose infractions were just flattened above are removed here so they are not concatenated twice further down
unnested_inspections = unnested_inspections.loc[unnested_inspections['INSPECTION.INFRACTION'].isna(),:]

#now for the inspection records shown as dicts in the 'inspection' column
#first remove null values - ones with no inspections
no_nulls = raw_data_2['INSPECTION'].dropna()

# now pull out single-infraction data
inspections = no_nulls.reset_index().rename({'index':'level_0'},axis='columns').explode('INSPECTION') #level_0 references raw_data
one_infraction = pd.json_normalize(inspections.loc[:,'INSPECTION'])
one_infraction = one_infraction.loc[one_infraction['INFRACTION'].isna(),:]
one_infraction = one_infraction.join(inspections.reset_index()) #now level_0 references raw_data

# now pull out multiple-infracted nested data
nested_data = pd.json_normalize(inspections.loc[:,'INSPECTION']) #find rows with nested inspection data
nested_data = pd.concat([inspections.loc[:,'level_0'].reset_index(),nested_data],axis='columns') #assign level 0 which refs raw_data
nested_data = nested_data.loc[nested_data['INFRACTION'].notna(),:] #use only rows with nested inspection data

#now unstack the multiple-infractions into a single set of columns
multi_infractions = pd.concat([nested_data.rename({'level_0':'raw_data_index'},axis='columns').reset_index(),pd.json_normalize(nested_data.loc[:,'INFRACTION'])],axis='columns')#index references inspections
#join the unstacked columns back to the index
#NOTE(review): the positional slices below (1:12, 11:, 17:) depend on the exact column order produced by json_normalize for this file - re-check them if the input schema changes
multi_infr_unstacked = pd.json_normalize(multi_infractions.drop(multi_infractions.columns[1:12],axis='columns').melt('level_0').loc[:,'value'])
multi_infr_index = multi_infractions.drop(multi_infractions.columns[1:12],axis='columns').melt('level_0').loc[:,'level_0']
multi_infr_unstacked = pd.concat([multi_infr_index,multi_infr_unstacked],axis='columns')

multi_infractions = multi_infr_unstacked.join(multi_infractions.set_index('level_0'),on='level_0')
multi_infractions = multi_infractions.loc[multi_infractions['SEVERITY'].notna(),:] #remove blank rows from melt
multi_infractions = multi_infractions.join(nested_data.drop('level_0',axis='columns').reset_index(),rsuffix='.i',on='level_0') #now we have added raw_data indices
multi_infractions = multi_infractions.drop(multi_infractions.columns[11:],axis='columns') #clean up a bit
multi_infractions = multi_infractions.drop('level_0',axis='columns').reset_index().join(raw_data_2,on='raw_data_index',rsuffix='.raw') #and join to raw_index
multi_infractions = multi_infractions.drop(multi_infractions.columns[17:],axis='columns') #clean up some more

#need to get all the infractions back into a single inspection dataframe then back to the restaurant list
one_infraction.rename({'INFRACTION.SEVERITY':'SEVERITY','INFRACTION.DEFICIENCY':'DEFICIENCY','INFRACTION.ACTION':'ACTION','INFRACTION.CONVICTION_DATE':'CONVICTION_DATE','INFRACTION.COURT_OUTCOME':'COURT_OUTCOME','INFRACTION.AMOUNT_FINED':'AMOUNT_FINED'},axis='columns',inplace=True)
infractions_1 = pd.concat([multi_infractions,unnested_inspections,flattened_inspections2])
#unnested inspections already has restaurant data since we took it from the top level

#clean things up
cleaned_data_2 = pd.concat([infractions_1.join(raw_data_2,on='level_0',rsuffix='.raw'),one_infraction.join(raw_data_2,on='level_0',rsuffix='.raw')])
#fill the gaps in each target column from its fallback column
#the filled column is assigned back to the frame; calling fillna(inplace=True) on a column selection only works while that selection is a view of the frame
fallback_columns = {
    'NAME':'NAME.raw',
    'ID':'ID.raw',
    'TYPE':'TYPE.raw',
    'ADDRESS':'ADDRESS.raw',
    'LATITUDE':'LATITUDE.raw',
    'LONGITUDE':'LONGITUDE.raw',
    'SEVERITY':'INSPECTION.INFRACTION.SEVERITY',
    'DEFICIENCY':'INSPECTION.INFRACTION.DEFICIENCY',
    'ACTION':'INSPECTION.INFRACTION.ACTION',
    'CONVICTION_DATE':'INSPECTION.INFRACTION.CONVICTION_DATE',
    'COURT_OUTCOME':'INSPECTION.INFRACTION.COURT_OUTCOME',
    'AMOUNT_FINED':'INSPECTION.INFRACTION.AMOUNT_FINED',
}
for target_column,fallback_column in fallback_columns.items():
    cleaned_data_2[target_column] = cleaned_data_2[target_column].fillna(cleaned_data_2[fallback_column])
#drop the suffixed duplicates, the still-nested columns and the helper index columns now that their values have been merged
cleaned_data_2 = cleaned_data_2.drop(['NAME.raw','ID.raw','STATUS.raw','TYPE.raw','ADDRESS.raw','LATITUDE.raw','LONGITUDE.raw','INSPECTION.raw',
                     'INSPECTION.INFRACTION.raw','SEVERITY.un','DEFICIENCY.un','ACTION.un','CONVICTION_DATE.un','COURT_OUTCOME.un','AMOUNT_FINED.un',
                     'INSPECTION.DATE','INSPECTION.INFRACTION','INSPECTION.STATUS','INSPECTION.INFRACTION.SEVERITY','INSPECTION.INFRACTION.DEFICIENCY',
                     'INSPECTION.INFRACTION.ACTION','INSPECTION.INFRACTION.CONVICTION_DATE','INSPECTION.INFRACTION.COURT_OUTCOME',
                     'INSPECTION.INFRACTION.AMOUNT_FINED','index','level_0','raw_data_index','INFRACTION','INSPECTION'],axis='columns')
cleaned_data_2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 24361 entries, 0 to 9977
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   SEVERITY         14322 non-null  object
 1   DEFICIENCY       14322 non-null  object
 2   ACTION           14322 non-null  object
 3   CONVICTION_DATE  22 non-null     object
 4   COURT_OUTCOME    110 non-null    object
 5   AMOUNT_FINED     21 non-null     object
 6   STATUS           24361 non-null  object
 7   DATE             24361 non-null  object
 8   ID               24361 non-null  object
 9   NAME             24361 non-null  object
 10  TYPE             24361 non-null  object
 11  ADDRESS          24361 non-null  object
 12  LATITUDE         24361 non-null  object
 13  LONGITUDE        24361 non-null  object
dtypes: object(14)
memory usage: 2.8+ MB


In [1395]:
#check
#cleaned_data_2.info() #7814 rows
#spot-check a single establishment: case-insensitive match on the NAME column
#cross-validate with DineSafe webtool: https://www.toronto.ca/community-people/health-wellness-care/health-programs-advice/food-safety/dinesafe/#infraction_details/10613283/0
name_matches_4c = cleaned_data_2['NAME'].str.contains('4c bro',case=False)
cleaned_data_2.loc[name_matches_4c,:]

Unnamed: 0,SEVERITY,DEFICIENCY,ACTION,CONVICTION_DATE,COURT_OUTCOME,AMOUNT_FINED,STATUS,DATE,ID,NAME,TYPE,ADDRESS,LATITUDE,LONGITUDE
6,S - Significant,Use handwashing station other than for handwas...,Notice to Comply,,,,Conditional Pass,2021-02-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
7,S - Significant,USE FOOD EQUIPMENT NOT OF READILY CLEANABLE FO...,Notice to Comply,,,,Conditional Pass,2021-11-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
8,M - Minor,FOOD PREMISE NOT MAINTAINED TO KEEP FOOD-HANDL...,Notice to Comply,,,,Pass,2021-11-26,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
2505,M - Minor,FAIL TO ENSURE EQUIPMENT SURFACE SANITIZED AS ...,Notice to Comply,,,,Conditional Pass,2021-02-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
2506,M - Minor,FAIL TO ENSURE EQUIPMENT SURFACE SANITIZED AS ...,Notice to Comply,,,,Conditional Pass,2021-11-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
2507,M - Minor,FOOD PREMISE NOT MAINTAINED TO KEEP FOOD-HANDL...,Notice to Comply,,,,Pass,2021-11-26,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
5000,M - Minor,FOOD PREMISE NOT MAINTAINED TO KEEP FOOD-HANDL...,Notice to Comply,,,,Conditional Pass,2021-02-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
5001,M - Minor,Food premise not maintained with floors in goo...,Notice to Comply,,,,Conditional Pass,2021-11-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
6461,M - Minor,Food premise not maintained with walls in good...,Notice to Comply,,,,Conditional Pass,2021-02-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782
6462,M - Minor,FOOD PREMISE NOT MAINTAINED WITH FOOD HANDLING...,Notice to Comply,,,,Conditional Pass,2021-11-23,10433183,4C Broast Chicken,Restaurant,1758 LAWRENCE AVE E,43.74285,-79.30782


In [1396]:
#processing the third (oldest) file which is in a different format
#map this file's column names onto the names used by the other two cleaned frames
column_renames_3 = {
    'ESTABLISHMENT_ID':'ID',
    'ESTABLISHMENT_NAME':'NAME',
    'ESTABLISHMENTTYPE':'TYPE',
    'ESTABLISHMENT_ADDRESS':'ADDRESS',
    'ESTABLISHMENT_STATUS':'STATUS',
    'INSPECTION_DATE':'DATE',
    'INFRACTION_DETAILS':'DEFICIENCY',
}
#rename returns a new frame, raw_data_3 itself is left untouched
cleaned_data_3 = raw_data_3.rename(columns=column_renames_3)
#and that's it! what a much easier thing to work with

In [766]:
#print the column names, non-null counts and dtypes of the renamed third-file frame
cleaned_data_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88819 entries, 0 to 88818
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ROW_ID                       88819 non-null  object
 1   ID                           88819 non-null  object
 2   INSPECTION_ID                88819 non-null  object
 3   NAME                         88819 non-null  object
 4   TYPE                         88819 non-null  object
 5   ADDRESS                      88819 non-null  object
 6   LATITUDE                     88819 non-null  object
 7   LONGITUDE                    88819 non-null  object
 8   STATUS                       88819 non-null  object
 9   MINIMUM_INSPECTIONS_PERYEAR  88819 non-null  object
 10  DEFICIENCY                   60262 non-null  object
 11  DATE                         88819 non-null  object
 12  SEVERITY                     60262 non-null  object
 13  ACTION                       60

In [1397]:
#great, all the data is fully flattened, let's combine them together
#the three frames are stacked as-is, so index labels from each frame are kept and repeat across the combined frame
cleaned_dataset=pd.concat([cleaned_data_1,cleaned_data_2,cleaned_data_3]) #inspections

#now join chain assignments on establishment ID/ID
#reindex chains to make establishment ID the index for join
#chains_condensed = pd.concat([chains['Company/Franchise'],chains['Establishment ID']],axis='columns').set_index('Establishment ID').loc[~chains.index.duplicated(), :]
#inspection_chain = inspections.join(chains_condensed,on=inspections['ID'].astype('int'))

#reset the index as there are duplicates per inspection row, and remove overlapping inspections
#cleaned_dataset = inspection_chain.reset_index().rename({'index':'inspection_index'},axis='columns').loc[~inspection_chain.index.duplicated(), :]

#datetime assignment to dates
#errors='ignore' used to hand back the unparsed column without saying so; keep that fallback but make the failure visible
try:
    cleaned_dataset['DATE']=pd.to_datetime(cleaned_dataset['DATE'],yearfirst=True)
except (ValueError,TypeError) as date_error:
    warnings.warn(f"DATE column could not be parsed as datetimes and was left unchanged: {date_error}")

cleaned_dataset
#potential future work: new stores have opened since, so for the top 100 chains, try to find this keyword in each store name with this dict:

Unnamed: 0,SEVERITY,DEFICIENCY,ACTION,CONVICTION_DATE,COURT_OUTCOME,AMOUNT_FINED,STATUS,DATE,ID,NAME,TYPE,ADDRESS,LATITUDE,LONGITUDE,ROW_ID,INSPECTION_ID,MINIMUM_INSPECTIONS_PERYEAR
0,M - Minor,Operate food premise - equipment not construct...,Notice to Comply,,,,Pass,2022-05-26,10752656,# HASHTAG INDIA RESTAURANT,Food Take Out,1871 O'CONNOR DR,43.72199,-79.30349,,,
1,M - Minor,Fail to protect against entry of pests - Sec. ...,Notice to Comply,,,,Conditional Pass,2022-08-10,10752656,# HASHTAG INDIA RESTAURANT,Food Take Out,1871 O'CONNOR DR,43.72199,-79.30349,,,
2,M - Minor,FAIL TO ENSURE EQUIPMENT SURFACE SANITIZED AS ...,Notice to Comply,,,,Pass,2022-10-06,10737088,1 HOTELS - CASA MADERA,Restaurant,550 WELLINGTON ST W,43.64284,-79.40167,,,
3,M - Minor,Operate food premise - furniture not construct...,Notice to Comply,,,,Pass,2022-10-06,10737091,1 HOTELS - HARRIET'S,Restaurant,550 WELLINGTON ST W,43.64284,-79.40167,,,
4,M - Minor,Operate food premise - equipment not construct...,Notice to Comply,,,,Pass,2022-04-26,10727411,1 Hotels - 1 KITCHEN,Restaurant,550 WELLINGTON ST W,43.64284,-79.40167,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88814,,,,,,,Pass,2019-04-04,10666877,SCOTIABANK-GALLERY B,Cafeteria,40 BAY ST,43.64407,-79.37843,88815,104445080,2
88815,,,,,,,Pass,2019-04-05,10666885,FAST DELICIOUS,Restaurant,2633 YONGE ST,43.7153668309,-79.3999061013,88816,104445125,3
88816,,,,,,,Pass,2019-04-04,10666887,SCOTIABANK-GALLERY D,Cafeteria,40 BAY ST,43.64407,-79.37843,88817,104445135,2
88817,,,,,,,Pass,2019-04-02,10666903,BATHURST & FINCH COMMUNITY FOOD BANK,Food Bank,550 FINCH AVE W,43.773882768,-79.446117704,88818,104445231,1


In [1398]:
#export
#write the combined frame to a CSV file in the working directory; with pandas' defaults the index is written as the first column
cleaned_dataset.to_csv('cleaned_dataset2022-11-27.csv')