In [3]:
import pandas as pd
import os

# Input CSV to label and output CSV for the labeled result
# (both paths are relative to the current working directory)
RAW_PATH = '../data/cleaned/grocery_stores_v1_curated.csv'
CLEAN_PATH = '../data/cleaned/grocery_stores_cleaned_v1.csv'

# Read the input CSV into a DataFrame and report its (rows, columns) shape
df = pd.read_csv(RAW_PATH)
print("✅ Loaded food inspections dataset:")
print(df.shape)
# Bare expression that is not the last statement here, so its value is
# discarded rather than displayed.
df.head()

# Optional: the 20 most frequent store names, compared case-insensitively
# by upper-casing before counting
print("🔍 Top DBA Name values:")
top_names = df['DBA Name'].str.upper().value_counts().iloc[:20]
print(top_names)

# "Junk store" keywords (upper-case, matched as plain substrings).
# NOTE(review): substring matching means a short keyword such as 'GAS' also
# matches any longer word that contains it; confirm this is acceptable.
# NOTE(review): the top-names output lists CITGO, MARATHON and BP, none of
# which appear here; confirm whether they should be added.
junk_keywords = [
    'DOLLAR', '7-ELEVEN', '7 ELEVEN', 'MOBIL', 'SHELL', 'CIRCLE K',
    'GAS', 'CONVENIENCE', 'MINI MART', 'WALGREENS', 'CVS'
]

# Keywords (upper-case) used to flag likely full-service grocery stores
grocery_keywords = ['ALDI', 'JEWEL', 'MARIANO', 'WHOLE FOODS', 'FOOD MARKET', 'SUPERMARKET']


def _contains_any(names, keywords):
    """Return a boolean Series: True where a name contains any keyword.

    Args:
        names: Series of store names (already upper-cased by the caller).
        keywords: iterable of literal substrings to look for.

    Returns:
        Boolean Series aligned with ``names``. Matching is a literal,
        case-sensitive substring test (no regex). Missing names yield False
        instead of raising, and an empty keyword list yields all False.
    """
    matches = pd.Series(False, index=names.index)
    for keyword in keywords:
        # na=False makes rows with a missing name count as "no match"
        matches |= names.str.contains(keyword, regex=False, na=False)
    return matches


# Standardize store names so keyword matching is case-insensitive
df['DBA_NAME_CLEAN'] = df['DBA Name'].str.upper()

# Flag junk vendors
df['IS_JUNK_STORE'] = _contains_any(df['DBA_NAME_CLEAN'], junk_keywords)

# Flag likely full-service grocery stores
df['IS_REAL_GROCERY'] = _contains_any(df['DBA_NAME_CLEAN'], grocery_keywords)

# Save the labeled dataset, creating the output directory if needed.
# os.path.dirname returns '' for a bare filename, and os.makedirs('') raises,
# so the directory is only created when the path actually has one.
output_dir = os.path.dirname(CLEAN_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(CLEAN_PATH, index=False)

print(f"✅ Labeled grocery store data saved to {CLEAN_PATH}")
# Spot-check a random handful of rows; cap the sample size at the number of
# rows because sampling more rows than exist (without replacement) raises.
preview_size = min(10, len(df))
print(df[['DBA Name', 'IS_REAL_GROCERY', 'IS_JUNK_STORE']].sample(preview_size))


✅ Loaded food inspections dataset:
(4503, 24)
🔍 Top DBA Name values:
DBA Name
CITGO                 36
7-ELEVEN              31
GATEWAY NEWSTAND      18
SHELL                 16
MARATHON              14
FALCON FUEL           13
CIRCLE K              13
BP                    12
MOBIL                 12
WHOLE FOODS MARKET     9
GETIR                  8
SOUTH LOOP MARKET      7
FAMILY DOLLAR          7
GOPUFF                 7
GO GROCER              6
GO! GROCER             6
SAVE A LOT             6
BUYK                   6
ALDI                   5
MUNCHIES               5
Name: count, dtype: int64
✅ Labeled grocery store data saved to ../data/cleaned/grocery_stores_cleaned_v1.csv
                   DBA Name  IS_REAL_GROCERY  IS_JUNK_STORE
2217     Jeff Food & Liquor            False          False
2685  B & B Supermarket Llc             True          False
546             Home Run Bp            False          False
1326                     Bp            False          False
4213     Bp 