In [8]:
import pandas as pd
import os
from glob import glob

# Paths
REVIEW_FOLDER = "../data/unclassified_review/"
MASTER_FILE = "../data/cleaned/grocery_stores_v1_curated.csv"
OUTPUT_FILE = "../data/cleaned/grocery_stores_cleaned_v1.csv"

# 1. Load master dataset
master_df = pd.read_csv(MASTER_FILE)

# 2. Load all reviewed unclassified CSVs
reviewed_files = glob(os.path.join(REVIEW_FOLDER, "unclassified_*.csv"))
print(f"🔎 Found {len(reviewed_files)} reviewed neighborhood files")

reviewed_all = pd.concat([pd.read_csv(f) for f in reviewed_files], ignore_index=True)

# 3. Clean up index columns if needed
if 'Unnamed: 0' in reviewed_all.columns:
    reviewed_all = reviewed_all.drop(columns=['Unnamed: 0'])

# 4. Set index for updating (assumes ADDRESS_CLEAN + DBA_NAME_CLEAN are unique together)
reviewed_all['key'] = reviewed_all['ADDRESS_CLEAN'].str.upper().str.strip() + "::" + reviewed_all['DBA_NAME_CLEAN'].str.upper().str.strip()
master_df['key'] = master_df['ADDRESS_CLEAN'].str.upper().str.strip() + "::" + master_df['DBA_NAME_CLEAN'].str.upper().str.strip()

# 5. Merge reviewed decisions back into master
cols_to_update = ['IS_REAL_GROCERY', 'IS_JUNK_STORE', 'IS_REVIEWED_REAL', 'REVIEW_NOTES']
for col in cols_to_update:
    master_df.loc[
        master_df['key'].isin(reviewed_all['key']),
        col
    ] = master_df.loc[
        master_df['key'].isin(reviewed_all['key'])
    ]['key'].map(
        reviewed_all.set_index('key')[col]
    )

# 6. Drop the helper key column
master_df = master_df.drop(columns=['key'])

# 7. Save cleaned version
master_df.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Merged and saved to {OUTPUT_FILE}")


🔎 Found 76 reviewed neighborhood files
✅ Merged and saved to ../data/cleaned/grocery_stores_cleaned_v1.csv


  master_df.loc[
