In [1]:
# 01b_clean_grocery_stores.ipynb

import pandas as pd
import os

# Input / output locations, relative to the notebook's working directory.
RAW_PATH = '../data/raw/food_inspections_filtered.csv'
CLEAN_PATH = '../data/cleaned/grocery_stores_labeled.csv'

# Read the raw inspections CSV into memory and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=RAW_PATH)
print("✅ Loaded food inspections dataset:", df.shape, sep="\n")
# NOTE: this is not the last expression of the cell, so Jupyter does not
# render the preview; it is evaluated and discarded.
df.head(n=5)

# Optional exploration: the 20 most frequent store names after upper-casing,
# useful for spotting spelling/spacing variants of the same chain.
print("🔍 Top DBA Name values:")
upper_names = df['DBA Name'].str.upper()
name_frequencies = upper_names.value_counts()
print(name_frequencies.head(20))

# Step 1: Create "junk store" keywords list
# Keywords are matched as plain substrings of the upper-cased store name, so
# short entries such as 'GAS' or 'SHELL' can also hit longer words; review
# the flagged rows before relying on the label.
junk_keywords = [
    'DOLLAR', '7-ELEVEN', '7 ELEVEN', 'MOBIL', 'SHELL', 'CIRCLE K',
    'GAS', 'CONVENIENCE', 'MINI MART', 'WALGREENS', 'CVS'
]

# Step 2: Standardize store names
# The saved column stays a plain upper-cased copy (missing names remain NaN).
df['DBA_NAME_CLEAN'] = df['DBA Name'].str.upper()

# Matching-only view of the names: remove whitespace around hyphens and
# collapse whitespace runs, so spelling variants seen in the data such as
# '7- ELEVEN' and '7 - ELEVEN' are recognised by the '7-ELEVEN' keyword.
names_for_matching = (
    df['DBA_NAME_CLEAN']
    .str.replace(r'\s*-\s*', '-', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)


def _contains_any(names, keywords):
    """Return a boolean Series: True where a name contains any keyword.

    Parameters
    ----------
    names : pd.Series
        Store names to test; missing values are allowed.
    keywords : list of str
        Literal substrings to look for (not regular expressions).

    Returns
    -------
    pd.Series
        Boolean mask aligned to ``names.index``. Missing names yield False
        instead of raising, which a Python ``in`` test on NaN would do.
    """
    hits = pd.Series(False, index=names.index)
    for keyword in keywords:
        # regex=False: treat the keyword literally; na=False: NaN -> no match.
        hits |= names.str.contains(keyword, regex=False, na=False)
    return hits


# Step 3: Flag junk vendors
df['IS_JUNK_STORE'] = _contains_any(names_for_matching, junk_keywords)

# Step 4: Flag likely full-service grocery stores
grocery_keywords = ['ALDI', 'JEWEL', 'MARIANO', 'WHOLE FOODS', 'FOOD MARKET', 'SUPERMARKET']
df['IS_REAL_GROCERY'] = _contains_any(names_for_matching, grocery_keywords)

# Save the labeled dataset
output_dir = os.path.dirname(CLEAN_PATH)
if output_dir:
    # A bare filename has an empty directory part, and os.makedirs('') raises.
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(CLEAN_PATH, index=False)

print(f"✅ Labeled grocery store data saved to {CLEAN_PATH}")

# Spot-check a few labeled rows. The sample is seeded so the preview is the
# same on every Restart & Run All, and capped at the frame length because
# DataFrame.sample raises when asked for more rows than exist.
preview_columns = ['DBA Name', 'IS_REAL_GROCERY', 'IS_JUNK_STORE']
preview_size = min(10, len(df))
print(df[preview_columns].sample(n=preview_size, random_state=42))


✅ Loaded food inspections dataset:
(35681, 17)
🔍 Top DBA Name values:
DBA Name
7-ELEVEN                                581
WHOLE FOODS MARKET                      329
CITGO                                   220
CERMAK PRODUCE                          138
TONY'S FINER FOODS ENTERPRISES, INC.    118
TREASURE ISLAND FOODS                   103
JEWEL FOOD STORE                         98
CERMAK FRESH MARKET                      94
PETE'S PRODUCE                           90
GATEWAY NEWSTAND                         87
HUDSON NEWS                              83
SOUTH LOOP MARKET                        69
FRESH MARKET PLACE                       66
MARIANO'S FRESH MARKET #8503             66
MARATHON                                 65
JEWEL FOOD  STORE # 3345                 61
7- ELEVEN                                61
FALCON FUEL                              61
7 - ELEVEN                               58
FOOD 4 LESS MIDWEST #552                 58
Name: count, dtype: int64
✅ Labeled groce