In [4]:
import pandas as pd
import os

# Paths
CLEANED_PATH = "../data/cleaned/grocery_stores_cleaned_v2.csv"
REAL_REVIEW_PATH = "../data/real_review/"
OUTPUT_PATH = "../data/cleaned/grocery_stores_cleaned_v3.csv"

# Columns that identify a store (merge keys) and the reviewed flags that
# overwrite the base values when a review exists.
KEY_COLS = ['Address', 'DBA Name']
FLAG_COLS = ['IS_REAL_GROCERY', 'IS_JUNK_STORE']
required_cols = KEY_COLS + FLAG_COLS
optional_cols = ['REVIEW_NOTES']


def apply_review_updates(base, reviews):
    """Return a copy of *base* with reviewed values from *reviews* applied.

    Rows are matched on ``KEY_COLS``. For every matched row, a non-null
    reviewed value replaces the base value in each of ``FLAG_COLS`` and in
    any column of ``optional_cols`` present in *reviews*; unmatched rows and
    null reviewed values leave the base value as it was.

    Args:
        base: DataFrame holding the cleaned store records. Must contain
            ``KEY_COLS``.
        reviews: DataFrame of manual review results. Must contain
            ``required_cols``.

    Returns:
        A new DataFrame with the same number of rows as *base*.

    Raises:
        KeyError: If *base* lacks a key column or *reviews* lacks a
            required column.
    """
    missing_required = [col for col in required_cols if col not in reviews.columns]
    if missing_required:
        raise KeyError(f"Review data is missing required columns: {missing_required}")
    missing_keys = [col for col in KEY_COLS if col not in base.columns]
    if missing_keys:
        raise KeyError(f"Base data is missing key columns: {missing_keys}")

    # Warn if optional column is missing
    for col in optional_cols:
        if col not in reviews.columns:
            print(f"⚠️ Optional column missing from review data: {col}")

    update_cols = FLAG_COLS + [col for col in optional_cols if col in reviews.columns]

    # Keep one review per store (the last one seen); otherwise the left merge
    # below would emit one output row per duplicate review and inflate the
    # dataset.
    review_subset = reviews[KEY_COLS + update_cols].drop_duplicates(
        subset=KEY_COLS, keep='last'
    )

    # Rename explicitly instead of relying on merge suffixes: a suffix is only
    # added when the column exists on both sides, so the "_REVIEW" name would
    # be absent whenever the base lacks that column.
    temp_names = {col: f"{col}_REVIEW" for col in update_cols}
    merged = base.merge(
        review_subset.rename(columns=temp_names), on=KEY_COLS, how='left'
    )

    # Apply updates: reviewed value wins, base value fills the gaps.
    for col, temp_col in temp_names.items():
        if col in base.columns:
            merged[col] = merged[temp_col].combine_first(merged[col])
        else:
            merged[col] = merged[temp_col]

    # Drop only the temp columns created here, so an unrelated base column
    # whose name happens to end in "_REVIEW" is preserved.
    return merged.drop(columns=list(temp_names.values()))


# When run as a notebook cell or a script, __name__ is "__main__", so this
# block executes as before; the guard only keeps the file I/O from running
# if the code is imported.
if __name__ == "__main__":
    # Load base cleaned data
    df = pd.read_csv(CLEANED_PATH)

    # Load and combine all reviewed real files. Sorted so that "last review
    # wins" does not depend on the directory listing order.
    real_files = sorted(
        os.path.join(REAL_REVIEW_PATH, f)
        for f in os.listdir(REAL_REVIEW_PATH)
        if f.endswith(".csv")
    )
    if not real_files:
        raise FileNotFoundError(
            f"No reviewed CSV files found in {REAL_REVIEW_PATH}"
        )
    real = pd.concat([pd.read_csv(f) for f in real_files], ignore_index=True)

    # Subset, merge and apply updates
    df = apply_review_updates(df, real)

    # Save final merged version
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"✅ Merged real store review updates saved to {OUTPUT_PATH}")


✅ Merged real store review updates saved to ../data/cleaned/grocery_stores_cleaned_v3.csv
