In [2]:
# Always include these two lines.
# They allow multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Importing required libraries
import requests  # To make HTTP requests and fetch data from APIs
import pandas as pd  # To store, manipulate, and clean tabular data
import sqlite3  # To interact with an SQLite database for data storage
import json  # To handle JSON data from APIs
import matplotlib.pyplot as plt  # Optional, for data visualization
import os # working with operating system functions


# Display confirmation
print("Libraries imported successfully!")

Libraries imported successfully!


In [15]:
import pandas as pd

# Load the JSON file into a DataFrame
df = pd.read_json('data/api_data.json')

# Preview the data
print(df.head())


   inspection_id                   dba_name                   aka_name  \
0        2616252  ROTI MODERN MEDITERRANEAN  ROTI MODERN MEDITERRANEAN   
1        2616253     MAGGIE GYROS & CHICKEN     MAGGIE GYROS & CHICKEN   
2        2616226    FORNO ROSSO ON RANDOLPH    FORNO ROSSO ON RANDOLPH   
3        2616137   HOOK POINT FISHERIES LLC   HOOK POINT FISHERIES LLC   
4        2616171          TO KOREAN CUISINE          TO KOREAN CUISINE   

    license_ facility_type           risk             address     city state  \
0  2594670.0    Restaurant  Risk 1 (High)  1012 W RANDOLPH ST  CHICAGO    IL   
1  1869774.0    Restaurant  Risk 1 (High)       349 E 47TH ST  CHICAGO    IL   
2  2358717.0    Restaurant  Risk 1 (High)  1048 W RANDOLPH ST  CHICAGO    IL   
3  3020919.0           NaN            NaN   4755 S TALMAN AVE  CHICAGO    IL   
4  2997283.0    Restaurant  Risk 1 (High)     3108 N BROADWAY  CHICAGO    IL   

       zip  ...    results   latitude  longitude  \
0  60607.0  ...       

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290850 entries, 0 to 290849
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   inspection_id                290850 non-null  int64  
 1   dba_name                     290850 non-null  object 
 2   aka_name                     288406 non-null  object 
 3   license_                     290832 non-null  float64
 4   facility_type                285615 non-null  object 
 5   risk                         290768 non-null  object 
 6   address                      290850 non-null  object 
 7   city                         290689 non-null  object 
 8   state                        290792 non-null  object 
 9   zip                          290809 non-null  float64
 10  inspection_date              290850 non-null  object 
 11  inspection_type              290849 non-null  object 
 12  results                      290850 non-null  object 
 13 

In [5]:
df.drop(columns=[
    'location',
    ':@computed_region_awaf_s7ux',
    ':@computed_region_6mkv_f3dw',
    ':@computed_region_vrxf_vc4k',
    ':@computed_region_bdys_3d7i',
    ':@computed_region_43wa_7qmu'
], inplace=True)


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 290342 entries, 0 to 290341
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   inspection_id                290342 non-null  int64  
 1   dba_name                     290342 non-null  object 
 2   aka_name                     287919 non-null  object 
 3   license_                     290324 non-null  float64
 4   facility_type                285136 non-null  object 
 5   risk                         290260 non-null  object 
 6   address                      290342 non-null  object 
 7   city                         290182 non-null  object 
 8   state                        290284 non-null  object 
 9   zip                          290302 non-null  float64
 10  inspection_date              290342 non-null  object 
 11  inspection_type              290341 non-null  object 
 12  results                      290342 non-null  object 
 13  lati

inspection_id

In [8]:
df = df[df['inspection_type'].notnull()]
print(df['inspection_id'].unique())

[2616252 2616253 2616226 ...   67757   70269   67732]


In [18]:
# Convert 'inspection_id' to numeric (non-convertible values become NaN)
df['inspection_id'] = pd.to_numeric(df['inspection_id'], errors='coerce')

# Drop rows where 'inspection_id' could not be converted (optional but recommended)
df = df.dropna(subset=['inspection_id'])

# Ensure it's of integer type (optional)
df['inspection_id'] = df['inspection_id'].astype(int)

# Drop duplicates based on 'inspection_id'
df = df.drop_duplicates(subset='inspection_id', keep='first')


dba_name & aka_name

In [7]:
df['dba_name'] = df['aka_name'].combine_first(df['dba_name'])

# Fill aka_name with updated dba_name if it's missing
df['aka_name'] = df['aka_name'].fillna(df['dba_name'])

# Make both columns fully uppercase
df['dba_name'] = df['dba_name'].str.upper()
df['aka_name'] = df['aka_name'].str.upper()

liscene_

In [8]:
df = df[df['license_'].notnull()]
df = df[df['license_'] != 0]
df['license_'] = pd.to_numeric(df['license_'], errors='coerce', downcast='integer')


facility_type

In [9]:
# Remove rows with missing facility_type


In [10]:
df = df.dropna(subset=['facility_type'])

In [11]:
import re

def map_facility_type(val):
    if pd.isna(val): return 'unknown'
    val = val.lower()
    if re.search(r'restaurant|restu|restaurant bar', val): return 'restaurant'
    if re.search(r'daycare|after school|children.*services|1023|combo', val): return 'daycare'
    if re.search(r'school|cafeteria|university|college', val): return 'school'
    if re.search(r'grocery|taqueria|butcher', val): return 'grocery store'
    if re.search(r'bakery|pastry', val): return 'bakery'
    if re.search(r'coffee|tea|cafe', val): return 'coffee shop'
    if re.search(r'ice cream|paleteria|frozen dessert|gelato', val): return 'ice cream'
    if re.search(r'tavern|bar|liquor|lounge', val): return 'tavern/bar'
    if re.search(r'mobile|push cart|vending machine', val): return 'mobile vendor'
    if re.search(r'chur|church', val): return 'church'
    if re.search(r'banquet|cater|event|venue', val): return 'event/catering'
    if re.search(r'shared kitchen|commissary', val): return 'shared kitchen'
    if re.search(r'hospital|nursing|rehab|senior|care', val): return 'healthcare'
    if re.search(r'convenience|dollar|store|retail', val): return 'retail store'
    if re.search(r'gym|fitness|health club|nutrition', val): return 'fitness'
    return 'other'

In [12]:
df['facility_type_clean'] = (
    df['facility_type']
    .str.lower()
    .str.strip()
    .str.replace(r'[-_/]', ' ', regex=True)
)

risk

In [37]:
df = df[df['risk'].notnull()]
print("\nUnique risk values:")
print(df['results'].unique())



Unique risk values:
['Pass' 'No Entry' 'Fail' 'Pass w/ Conditions' 'Not Ready']


address

state

In [14]:
df = df[df['state'] == 'IL']


city

In [15]:
df['city'] = df['city'].str.upper()

In [16]:
unique_counts = df['city'].value_counts()
print(unique_counts)

city
CHICAGO         283901
CCHICAGO            59
SCHAUMBURG          28
EVANSTON            21
MAYWOOD             16
                 ...  
MORTON GROVE         1
WHEATON              1
WILMETTE             1
WADSWORTH            1
LANSING              1
Name: count, Length: 69, dtype: int64


In [17]:
valid_cities = [
    'Chicago', 'Grayslake', 'Brookfield', 'Skokie', 'Evanston', 'Merrillville',
    'Burbank', 'Evergreen Park', 'Naperville', 'Matteson', 'Berwyn', 'Oak Park',
    'Highland Park', 'Plainfield', 'Western Springs', 'Schaumburg', 'Torrance',
    'Summit', 'Lake Zurich', 'Whiting', 'Glen Ellyn', 'Los Angeles', 'Calumet City',
    'Burnham', 'Oak Lawn', 'Morton Grove', 'Bridgeview', 'Griffith', 'New York',
    'Elmhurst', 'New Holstein', 'Algonquin', 'Niles', 'Lansing', 'Wadsworth',
    'Wilmette', 'Wheaton', 'Rosemont', 'Palos Park', 'Elk Grove Village', 'Cicero',
    'Maywood', 'Lake Bluff', 'Schiller Park', 'Bannockburn', 'Bloomingdale',
    'Norridge', 'Charles A Hayes', 'Chicago Heights', 'Justice', 'Tinley Park',
    'Lombard', 'East Hazel Crest', 'Country Club Hills', 'Streamwood',
    'Bolingbrook', 'Des Plaines', 'Olympia Fields', 'Alsip', 'Blue Island',
    'Glencoe', 'Frankfort', 'Broadview', 'Worth'
]


In [18]:
from rapidfuzz import process, fuzz
import pandas as pd

def clean_and_standardize_cities(df, valid_cities, threshold=80):
    print("Unique cities before cleaning:")
    print(df['city'].value_counts())

    # Normalize valid cities and create a map
    valid_cities_normalized = [city.lower().strip() for city in valid_cities]
    valid_city_map = dict(zip(valid_cities_normalized, valid_cities))  # e.g., 'morton grove' -> 'Morton Grove'

    # Normalize input city names
    df['city_normalized'] = df['city'].str.lower().str.strip()

    # Safe match function with error handling
    def best_match(city):
        result = process.extractOne(city, valid_cities_normalized, scorer=fuzz.ratio)
        if result:
            match, score, _ = result
            if score >= threshold:
                return valid_city_map[match]
        return None

    # Apply matching
    df['city_cleaned'] = df['city_normalized'].apply(best_match)

    # Filter matched rows
    df_cleaned = df[df['city_cleaned'].notnull()].copy()
    df_cleaned['city'] = df_cleaned['city_cleaned']
    df_cleaned.drop(columns=['city_normalized', 'city_cleaned'], inplace=True)

    print("\nUnique cities after cleaning:")
    print(df_cleaned['city'].value_counts())

    return df_cleaned


In [19]:
df = clean_and_standardize_cities(df, valid_cities)


Unique cities before cleaning:
city
CHICAGO         283901
CCHICAGO            59
SCHAUMBURG          28
EVANSTON            21
MAYWOOD             16
                 ...  
MORTON GROVE         1
WHEATON              1
WILMETTE             1
WADSWORTH            1
LANSING              1
Name: count, Length: 69, dtype: int64

Unique cities after cleaning:
city
Chicago               283996
Schaumburg                28
Evanston                  21
Maywood                   16
Elk Grove Village         13
Skokie                     9
Oak Park                   9
Cicero                     9
Elmhurst                   7
Calumet City               6
Berwyn                     6
Summit                     5
Worth                      5
Highland Park              4
Bridgeview                 4
Naperville                 4
Charles A Hayes            4
Plainfield                 4
Schiller Park              3
Rosemont                   3
East Hazel Crest           3
Alsip                      3

zip

In [20]:
df = df[df['zip'].notnull()]
df['zip'] = pd.to_numeric(df['zip'], errors='coerce', downcast='integer')

latitude and longitude

In [21]:
from mapbox import Geocoder
import pandas as pd
import time
from tqdm import tqdm

# 🔑 Replace this with your own Mapbox access token
MAPBOX_TOKEN = "pk.eyJ1IjoiZW1pbHlqdWFyZXoiLCJhIjoiY205dDVjeHZlMDhhZDJqb3QwanU2YTl3cyJ9.K6u54UVcXIlN7RZFj-cNLQ"
geocoder = Geocoder(access_token=MAPBOX_TOKEN)

# Add a progress bar
tqdm.pandas()

# Helper function to geocode an address
def mapbox_geocode(row):
    if pd.notna(row['latitude']) and pd.notna(row['longitude']):
        return row['latitude'], row['longitude']
    
    address = f"{row['address']}, {row['city']}, {row['state']} {int(row['zip']) if pd.notna(row['zip']) else ''}"
    try:
        response = geocoder.forward(address, limit=1)
        geojson = response.geojson()
        if geojson['features']:
            coords = geojson['features'][0]['geometry']['coordinates']
            # Mapbox returns [longitude, latitude]
            return coords[1], coords[0]
    except Exception as e:
        print(f"Error geocoding: {address} -> {e}")
    
    time.sleep(0.5)  # Throttle just in case
    return None, None

# Filter rows where lat/lon are missing
missing_coords = df[df['latitude'].isna() | df['longitude'].isna()]

# Apply Mapbox geocoding
coords = missing_coords.progress_apply(mapbox_geocode, axis=1)

# Fill in the lat/lon
df.loc[coords.index, ['latitude', 'longitude']] = list(coords)

100%|██████████| 946/946 [00:24<00:00, 38.53it/s]


inspection_date

In [22]:
df['inspection_date'] = pd.to_datetime(df['inspection_date'], format="%Y-%m-%dT%H:%M:%S.%f", errors='coerce')
df['inspection_date'][:3]

0   2025-04-23
1   2025-04-23
2   2025-04-23
Name: inspection_date, dtype: datetime64[ns]

inspection_type

In [23]:
df = df[df['inspection_type'].notnull()]

results

In [24]:
df = df[df['results'] != 'Out of Business']
df = df[df['results'] != 'Business Not Located']

violations

In [25]:
# Extracting the violation number from the violation

df["violations"] = df["violations"].fillna("")
violations_list = []
for violation in df["violations"]:
    violations_list += [violation.split(' | ')]

df["violations_list"] = violations_list
violations_list

[[''],
 [''],
 ['55. PHYSICAL FACILITIES INSTALLED, MAINTAINED & CLEAN - Comments: OBSERVED MISSING CEILING TILE PANEL ABOVE DISH MACHINE IN KITCHEN DISHWASH AREA. INSTRUCTED TO REPAIR, REPLACE MISSING PANEL AND MAINTAIN.'],
 ['37. FOOD PROPERLY LABELED; ORIGINAL CONTAINER - Comments: NOTED DRY INGREDIENTS AND OILS IN KITCHEN ON PREP TABLE, TOP AND BOTTOM SHELVES WITH OUT LABELS. INSTRUCTED TO PROVIDE LABELS OF COMMON NAMES OF ITEMS REMOVED FROM ITS ORIGINAL CONTAINERS. ',
  '54. GARBAGE & REFUSE PROPERLY DISPOSED; FACILITIES MAINTAINED - Comments: NOTED PILES OF SCRAP WOOD AND DIRT/DEBRIS ON THE GROUND ALONG EXTERIOR WALL OF BUILDING. INSTRUCTED TO REMOVE, CLEAN AND MAINTAIN AREA.'],
 ['3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL EMPLOYEE; KNOWLEDGE, RESPONSIBILITIES AND REPORTING - Comments: OBSERVED NO WRITTEN EMPLOYEE HEALTH POLICY ON THE PREMISES.INSTRUCTED TO PROVIDE A SIGNED EMPLOYEE HEALTH POLICY FOR EACH EMPLOYEE AND MAINTAIN AS REQUIRED. PRIORITY FOUNDATION VIOLATION 7-38-01

In [26]:
violation_number_list = []
import re

for violation in df["violations_list"]:
    number_list = []
    for v in violation:
        match = re.match(r"^\d+", v)  # Extract leading numbers
        if match:
            number_list.append(match.group())  # Append extracted violation number
    
    violation_number_list.append(number_list)  # Store numbers for the entity
df["violation_num_list"] = violation_number_list

In [27]:
# Check if there are any rows left with missing ZIP codes
missing_zip_count_after = df['license_'].isna().sum()
print(f"Number of missing License # after dropping rows: {missing_zip_count_after}")

Number of missing License # after dropping rows: 0


In [28]:
print("Null values count:")
print("dba_name:", df['inspection_type'].isnull().sum())

Null values count:
dba_name: 0


In [29]:
unique_counts = df['results'].value_counts()
print(unique_counts)

results
Pass                  149097
Fail                   55683
Pass w/ Conditions     43551
No Entry               12173
Not Ready               3657
Name: count, dtype: int64


In [30]:
unique_counts = df['inspection_type'].value_counts()
print(unique_counts)

inspection_type
Canvass                          126445
License                           38023
Canvass Re-Inspection             32159
Complaint                         26944
License Re-Inspection             12022
                                  ...  
SFP RECENTLY INSPECTED                1
SMOKING COMPLAINT                     1
Recent inspection                     1
expansion                             1
TASK FORCE PACKAGE GOODS 1474         1
Name: count, Length: 105, dtype: int64


In [31]:
# Display the first few rows of the DataFrame
print(df.head())

   inspection_id                   dba_name                   aka_name  \
0        2616252  ROTI MODERN MEDITERRANEAN  ROTI MODERN MEDITERRANEAN   
1        2616253     MAGGIE GYROS & CHICKEN     MAGGIE GYROS & CHICKEN   
2        2616226    FORNO ROSSO ON RANDOLPH    FORNO ROSSO ON RANDOLPH   
4        2616171          TO KOREAN CUISINE          TO KOREAN CUISINE   
6        2616216                 PUP SOCIAL                 PUP SOCIAL   

   license_ facility_type           risk             address     city state  \
0   2594670    Restaurant  Risk 1 (High)  1012 W RANDOLPH ST  Chicago    IL   
1   1869774    Restaurant  Risk 1 (High)       349 E 47TH ST  Chicago    IL   
2   2358717    Restaurant  Risk 1 (High)  1048 W RANDOLPH ST  Chicago    IL   
4   2997283    Restaurant  Risk 1 (High)     3108 N BROADWAY  Chicago    IL   
6   2997708        Liquor   Risk 3 (Low)  2200 N ASHLAND AVE  Chicago    IL   

     zip inspection_date inspection_type   results   latitude  longitude  \
0  6

In [32]:
null_counts = df.isnull().sum()
print("Null values in each column:")
print(null_counts)


Null values in each column:
inspection_id          0
dba_name               0
aka_name               0
license_               0
facility_type          0
risk                   0
address                0
city                   0
state                  0
zip                    0
inspection_date        0
inspection_type        0
results                0
latitude               0
longitude              0
violations             0
facility_type_clean    0
violations_list        0
violation_num_list     0
dtype: int64


In [32]:
count = ((df['results'] == 'Fail') & (df['violations'].isnull())).sum()
print("Number of 'Fail' rows with missing violations:", count)


Number of 'Fail' rows with missing violations: 0


In [34]:
df.to_csv('data/cleaned_chicago_data.csv', index=False)
