In [7]:
import pandas as pd
import os

# Set paths
RAW_PATH = '../data/raw/urban_agriculture_sites.csv'
CLEAN_PATH = '../data/cleaned/urban_agriculture_sites_cleaned.csv'

# Load raw file
df = pd.read_csv(RAW_PATH)

print("✅ Loaded Urban Agriculture dataset:")
print(df.shape)
print("📊 Original columns:")
print(df.columns.tolist())
print(df.head())

# Step 1: Split Urban_Farms into Address and Coordinates
df_split = df['Urban_Farms'].str.split(',', expand=True)
df_split.columns = ['ADDRESS', 'COORDINATES']

# Step 2: Split coordinates into LATITUDE and LONGITUDE
df_split[['LATITUDE', 'LONGITUDE']] = df_split['COORDINATES'].str.extract(r'([0-9.-]+)[),\s]*([0-9.-]+)')


# Drop COORDINATES column
df_split = df_split.drop(columns=['COORDINATES'])

# Save cleaned version
os.makedirs(os.path.dirname(CLEAN_PATH), exist_ok=True)
df_split.to_csv(CLEAN_PATH, index=False)

print("✅ Cleaned dataset saved to", CLEAN_PATH)
print(df_split.head())


✅ Loaded Urban Agriculture dataset:
(12, 1)
📊 Original columns:
['Urban_Farms']
                                                                 Urban_Farms
ADDRESS            LATITUDE      LONGITUDE                          LOCATION
3333 S Iron Street 41.8332227100 -87.6610220800  (41.83322271, -87.66102208)
444 W Chicago      41.8966172500 -87.6404448500  (41.89661725, -87.64044485)
71st & Prairie     41.7656449200 -87.6184778500  (41.76564492, -87.61847785)
58th & Wood        41.7885099000 -87.6692952400   (41.7885099, -87.66929524)
✅ Cleaned dataset saved to ../data/cleaned/urban_agriculture_sites_cleaned.csv
                                                      ADDRESS     LATITUDE  \
ADDRESS            LATITUDE      LONGITUDE           LOCATION          NaN   
3333 S Iron Street 41.8332227100 -87.6610220800  (41.83322271  -87.6610220   
444 W Chicago      41.8966172500 -87.6404448500  (41.89661725  -87.6404448   
71st & Prairie     41.7656449200 -87.6184778500  (41.76564492  -87.