In [None]:
# 📦 Imports
import pandas as pd
import os

# 📍 File paths
RAW_PATH = '../data/raw/urban_agriculture_sites.csv'
CLEAN_PATH = '../data/cleaned/urban_agriculture_sites_cleaned.csv'

# ✂️ Columns carried into the cleaned dataset, in output order
columns_to_keep = [
    'Site Name', 'Address', 'City', 'Zip', 'Latitude', 'Longitude', 'Site Type'
]

# 🔠 Free-text columns that get title-cased and trimmed
text_columns = ['Site Name', 'Address', 'City', 'Site Type']


def _title_case(text):
    """Title-case and trim a string Series, repairing ``str.title`` artefacts.

    ``str.title`` treats digits and apostrophes as word boundaries, so it
    turns ``"1st"`` into ``"1St"`` and ``"joe's"`` into ``"Joe'S"``. Those two
    artefacts are lowercased again here; everything else is plain
    ``str.title`` followed by ``str.strip``. Missing values stay missing.
    """
    titled = text.str.title().str.strip()
    # Ordinal suffixes directly after a digit: "1St" -> "1st", "42Nd" -> "42nd".
    titled = titled.str.replace(
        r"\d(?:St|Nd|Rd|Th)\b", lambda m: m.group(0).lower(), regex=True
    )
    # Possessive "'S" after a lowercase letter: "Joe'S" -> "Joe's".
    # Names such as "O'Shea" are untouched because "O" is uppercase.
    titled = titled.str.replace(r"(?<=[a-z])(['\u2019])S\b", r"\1s", regex=True)
    return titled


def clean_sites(raw):
    """Return a cleaned copy of the raw urban agriculture sites table.

    Steps, in order:
      1. Keep only ``columns_to_keep``.
      2. Coerce Latitude/Longitude to numbers and drop rows whose coordinates
         are missing, non-numeric, zero, or outside the valid
         latitude [-90, 90] / longitude [-180, 180] ranges.
      3. Title-case and trim the free-text columns.
      4. Reduce ``Zip`` to its first run of five digits (missing if none).

    Args:
        raw: DataFrame containing at least the columns in ``columns_to_keep``.
            ``Zip`` should hold text rather than integers, otherwise leading
            zeros are already lost before this function sees the value.

    Returns:
        A new DataFrame; ``raw`` is not modified.

    Raises:
        KeyError: if any column in ``columns_to_keep`` is absent from ``raw``.
    """
    # Work on an explicit copy so the column assignments below neither touch
    # the caller's frame nor trigger SettingWithCopyWarning.
    sites = raw[columns_to_keep].copy()

    # 🧹 Drop rows with missing or invalid coordinates
    for axis in ('Latitude', 'Longitude'):
        # Non-numeric text becomes NaN and is removed by dropna below.
        sites[axis] = pd.to_numeric(sites[axis], errors='coerce')
    sites = sites.dropna(subset=['Latitude', 'Longitude'])
    has_valid_coords = (
        (sites['Latitude'] != 0)
        & (sites['Longitude'] != 0)
        & sites['Latitude'].between(-90, 90)
        & sites['Longitude'].between(-180, 180)
    )
    sites = sites[has_valid_coords].copy()

    # 🔠 Normalize text
    for column in text_columns:
        sites[column] = _title_case(sites[column])

    # 🧼 Clean up ZIP codes
    # expand=False yields a Series (the default returns a one-column DataFrame).
    sites['Zip'] = sites['Zip'].astype(str).str.extract(r'(\d{5})', expand=False)

    return sites


# Runs when executed as a script or notebook cell (where __name__ is
# "__main__"), but not when this file is imported for testing.
if __name__ == "__main__":
    # 📥 Load data
    # Read Zip as text so ZIP codes with leading zeros (e.g. 02134) survive.
    df = pd.read_csv(RAW_PATH, dtype={'Zip': str})
    print(f"✅ Loaded raw urban ag dataset: {df.shape[0]} rows")

    df = clean_sites(df)

    # 💾 Save cleaned dataset
    output_dir = os.path.dirname(CLEAN_PATH)
    if output_dir:  # os.makedirs('') raises when the path has no directory part
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(CLEAN_PATH, index=False)
    print(f"✅ Cleaned urban ag dataset saved to: {CLEAN_PATH}")
