In [7]:
# %% [markdown]
# # 09 · Merge real-store fix-ups  
# 
# Combines every `*_fixups.csv` in **data/review_logs/** with the
# current master (v3) and writes an *in-progress* cleaned file:
# 
# * `data/cleaned/grocery_stores_cleaned_v4.csv`
# 
# Run this notebook anytime you’ve finished reviewing another
# community area and want to see the maps update.

# %%
import pandas as pd
from pathlib import Path

# ---- paths -------------------------------------------------------------
ROOT = Path("..").resolve() # repo root (one level up)
MASTER_IN = ROOT / "data" / "cleaned" / "grocery_stores_cleaned_v3.csv"
FIXUP_DIR = ROOT / "data" / "review_logs"
OUT_FILE = ROOT / "data" / "cleaned" / "grocery_stores_cleaned_v4.csv"

print(f"🔄  Base master : {MASTER_IN}")
print(f"📂  Fix-ups dir : {FIXUP_DIR}")
print(f"💾  Writing to : {OUT_FILE}")

# %%
# ---- load master -------------------------------------------------------
master = pd.read_csv(MASTER_IN, dtype={"Zip": str})
print("Master rows:", len(master))

# %%
# ---- load & concatenate all *_fixups.csv ------------------------------
fixups = []
for csv in FIXUP_DIR.glob("*_fixups.csv"):
    df = pd.read_csv(csv, dtype={"Zip": str})
    df["__source_csv"] = csv.name          # keep a breadcrumb column
    fixups.append(df)
    
if not fixups:
    raise ValueError("No fix-up CSVs found!  Add some files to review_logs/ first.")

fixups_all = pd.concat(fixups, ignore_index=True)
print("Fix-up rows :", len(fixups_all))

# %%
# ---- merge logic -------------------------------------------------------
# We *trust* the fix-up rows: drop matching master rows, then concat.
key_cols = ["DBA_NAME_CLEAN", "ADDRESS_CLEAN"]     # whatever uniquely IDs each store

master_no_dupes = master.merge(
    fixups_all[key_cols], how="left", indicator=True, on=key_cols
).query("_merge == 'left_only'").drop(columns=["_merge"])

merged = pd.concat([master_no_dupes, fixups_all], ignore_index=True)

print("→  Final merged rows:", len(merged))

# Quick sanity check
display(
    merged.groupby(["IS_REAL_GROCERY", "IS_JUNK_STORE"])
          .size()
          .rename("count")
          .reset_index()
)

# %%
# ---- save --------------------------------------------------------------
OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(OUT_FILE, index=False)
print("✅  Saved:", OUT_FILE)


🔄  Base master : /Users/archangel/Desktop/chi-food-access-map/data/cleaned/grocery_stores_cleaned_v3.csv
📂  Fix-ups dir : /Users/archangel/Desktop/chi-food-access-map/data/review_logs
💾  Writing to : /Users/archangel/Desktop/chi-food-access-map/data/cleaned/grocery_stores_cleaned_v4.csv
Master rows: 4505
Fix-up rows : 603
→  Final merged rows: 4506


  fixups_all = pd.concat(fixups, ignore_index=True)


Unnamed: 0,IS_REAL_GROCERY,IS_JUNK_STORE,count
0,False,True,300
1,False,True,2
2,False,False,8
3,False,True,99
4,False,True,3787
5,True,False,278
6,True,True,25
7,True,False,7


✅  Saved: /Users/archangel/Desktop/chi-food-access-map/data/cleaned/grocery_stores_cleaned_v4.csv


🗺️  Refreshed JSON for v2 map → /Users/archangel/Desktop/chi-food-access-map/docs/grocery_stores_cleaned_v3.json


  json_df[col]
