In [7]:
import pandas as pd
import os

# Read the curated grocery-store table.
df = pd.read_csv("../data/cleaned/grocery_stores_v1_curated.csv")

# A store counts as "unclassified" when both flags compare equal to False.
# The explicit .copy() gives an independent frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
not_real_grocery = df['IS_REAL_GROCERY'].eq(False)
not_junk_store = df['IS_JUNK_STORE'].eq(False)
unclassified = df[not_real_grocery & not_junk_store].copy()

# Rows without a community are bucketed under a placeholder label so they are
# still counted and exported.
unclassified['Community'] = unclassified['Community'].fillna('Unknown')

# Tally unclassified stores per community and show the ten largest buckets.
community_counts = unclassified['Community'].value_counts()
unclassified_by_community = community_counts.reset_index()
unclassified_by_community.columns = ['Community', 'UnclassifiedCount']
print(unclassified_by_community.head(10))

# Make sure the review folder exists before writing into it.
output_dir = "../data/unclassified_review/"
os.makedirs(output_dir, exist_ok=True)

# Write one CSV per community. sort=False keeps the communities in order of
# first appearance, and each group keeps its rows in their original order.
for community, community_df in unclassified.groupby('Community', sort=False):
    slug = str(community).lower().replace(' ', '_')
    csv_path = os.path.join(output_dir, f"unclassified_{slug}.csv")
    community_df.to_csv(csv_path, index=False)
    print(f"✅ Saved {len(community_df)} records for '{community}' to {csv_path}")


         Community  UnclassifiedCount
0           AUSTIN                116
1       WEST RIDGE                 59
2  NEAR NORTH SIDE                 58
3      ALBANY PARK                 55
4        WEST TOWN                 54
5   SOUTH LAWNDALE                 53
6    HUMBOLDT PARK                 50
7             LOOP                 48
8   BELMONT CRAGIN                 42
9   NEAR WEST SIDE                 41
✅ Saved 40 records for 'GREATER GRAND CROSSING' to ../data/review/unclassified_greater_grand_crossing.csv
✅ Saved 11 records for 'WASHINGTON HEIGHTS' to ../data/review/unclassified_washington_heights.csv
✅ Saved 29 records for 'UPTOWN' to ../data/review/unclassified_uptown.csv
✅ Saved 13 records for 'WOODLAWN' to ../data/review/unclassified_woodlawn.csv
✅ Saved 16 records for 'WEST LAWN' to ../data/review/unclassified_west_lawn.csv
✅ Saved 36 records for 'LOWER WEST SIDE' to ../data/review/unclassified_lower_west_side.csv
✅ Saved 55 records for 'ALBANY PARK' to ../data/review