In [1]:
# 01_eda_augmented.ipynb
# Build an enriched, synthetic dataset for Chicago ZIP codes:
# - Start from Zillow ZORI rent index
# - Add structural features (distance to Loop)
# - Add crime (API + synthetic fallback)
# - Add economic & lifestyle features
# - Save as data/processed/chicago_augmented_12m.csv

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from pathlib import Path
import requests

# --------------------------------------------------
# 1. PROJECT ROOT + PATHS
#    Walk upward until we find data/raw/zori_zip.csv
#    This makes the notebook robust no matter where it's run from.
# --------------------------------------------------

# Locate the project root: the nearest directory (the cwd or any of its
# ancestors) that contains data/raw/zori_zip.csv.
start_dir = Path.cwd().resolve()
target = Path("data") / "raw" / "zori_zip.csv"

for candidate in (start_dir, *start_dir.parents):
    if (candidate / target).exists():
        ROOT = candidate
        break
else:
    # Reached the filesystem root without ever seeing the marker file.
    raise FileNotFoundError(
        f"Could not find {target} above cwd={Path.cwd().resolve()}"
    )

print("Using ROOT:", ROOT)

# Input and output folders; the processed folder is created on demand.
RAW = ROOT.joinpath("data", "raw")
PROC = ROOT.joinpath("data", "processed")
PROC.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------
# 2. LOAD ZILLOW TARGET (ZORI) AND BUILD RENT ALPHA
# --------------------------------------------------

print("\n--- Loading Zillow Data ---")
zori_path = RAW / "zori_zip.csv"
if not zori_path.exists():
    raise FileNotFoundError("zori_zip.csv missing")

# RegionName holds the ZIP code; read it as text so it is never parsed
# as a number.
zori = pd.read_csv(zori_path, dtype={'RegionName': str})

# Keep Illinois only, then Chicago metro only (if Metro column exists)
zori = zori[zori['StateName'] == 'IL'].copy()
if 'Metro' in zori.columns:
    zori = zori[zori['Metro'].str.contains('Chicago', na=False)]

# Wide -> long: one row per (zip, date). Date columns are recognised by
# their "20.." prefix.
date_cols = [c for c in zori.columns if c.startswith('20')]
zori_long = zori.melt(
    id_vars=['RegionName', 'City', 'CountyName'],
    value_vars=date_cols,
    var_name='date',
    value_name='rent'
)

zori_long['date'] = pd.to_datetime(zori_long['date'])
zori_long['rent'] = pd.to_numeric(zori_long['rent'], errors='coerce')
zori_long = (
    zori_long
    .rename(columns={'RegionName': 'zip'})
    .sort_values(['zip', 'date'])
)

# 12-month forward rent growth per ZIP.
# The shift is positional, so it must run on the COMPLETE (zip, date) grid
# produced by melt: dropping missing-rent rows first would make "12 rows
# ahead" land more than 12 months ahead for any ZIP with a gap.
# Assumes the date columns are consecutive months -- verify if the source
# file layout changes.
zori_long['zip_12m_growth'] = (
    zori_long.groupby('zip')['rent'].shift(-12) / zori_long['rent'] - 1
)

# Only now discard observations that have no rent value.
zori_long = zori_long.dropna(subset=['rent'])

# Metro-wide average of the ZIP-level forward growth, per month
metro_avg = (
    zori_long
    .groupby('date')['zip_12m_growth']
    .mean()
    .reset_index()
    .rename(columns={'zip_12m_growth': 'metro_12m_growth'})
)

# Relative growth (alpha) = ZIP growth - overall metro growth
zori_long = zori_long.merge(metro_avg, on='date', how='left')
zori_long['relative_12m_growth'] = (
    zori_long['zip_12m_growth'] - zori_long['metro_12m_growth']
)

# --------------------------------------------------
# 3. STRUCTURAL FEATURES: DISTANCE TO THE LOOP
# --------------------------------------------------

print("\n--- Calculating Structural Features ---")
zip_geojson = RAW / "Boundaries_-_ZIP_Codes_20251207.geojson"

if zip_geojson.exists():
    gdf_zip = gpd.read_file(zip_geojson)

    # Ensure join key matches Zillow ZIP dtype
    gdf_zip['zip'] = gdf_zip['zip'].astype(str)

    # Approximate Chicago Loop point (lon, lat)
    loop_pt = Point(-87.6278, 41.8820)

    # Project both layers to EPSG:3435 (the original code treats its units
    # as feet) so distances are planar rather than in degrees.
    gdf_zip = gdf_zip.to_crs(epsg=3435)
    loop_series = gpd.GeoSeries([loop_pt], crs="EPSG:4326").to_crs(epsg=3435)

    # Distance from each ZIP polygon's centroid to the Loop, in miles.
    # Passing a single geometry avoids relying on index alignment between
    # two series.
    gdf_zip['dist_to_loop_miles'] = (
        gdf_zip.centroid.distance(loop_series.iloc[0]) / 5280
    )

    # A boundary file may list the same ZIP more than once (multi-part
    # ZIPs). Collapse to one row per ZIP so the left merge below cannot
    # duplicate rows of zori_long.
    dist_map = (
        gdf_zip
        .groupby('zip', as_index=False)['dist_to_loop_miles']
        .mean()
    )
    zori_long = zori_long.merge(
        dist_map,
        on='zip',
        how='left',
        validate='many_to_one'
    )

    # Conservative fallback for ZIPs that have no shape in the file
    zori_long['dist_to_loop_miles'] = zori_long['dist_to_loop_miles'].fillna(20)
else:
    # If shapes are missing, assume "far" from Loop (suburban)
    zori_long['dist_to_loop_miles'] = 20

# --------------------------------------------------
# 4. CRIME DATA: API + FALLBACK
# --------------------------------------------------

print("\n--- Processing Crime Data ---")
crime_file = RAW / "chicago_crimes.csv"

# Map community areas (CA) -> representative ZIPs
ca_to_zip = {
    8: '60611', 32: '60601', 24: '60622', 6: '60657', 7: '60614',
    22: '60647', 28: '60607', 3: '60640', 4: '60625', 5: '60618'
}


def _normalize_col(col):
    """Return a header in snake_case, e.g. 'Community Area' -> 'community_area'."""
    return col.strip().lower().replace(' ', '_')


# Download from City of Chicago API if missing locally
if not crime_file.exists():
    # NOTE(review): the query is capped at 50,000 rows with no explicit
    # ordering, which may cover too little history for a 12-month rolling
    # window -- confirm and widen if needed.
    url = (
        "https://data.cityofchicago.org/resource/ijzp-q8t2.csv"
        "?$where=date >= '2020-01-01T00:00:00'&$limit=50000"
    )
    try:
        r = requests.get(url, timeout=60)
        # Without this, an HTTP error page would be saved as if it were
        # the crime CSV.
        r.raise_for_status()
    except requests.RequestException as e:
        # Best effort: the synthetic crime fallback takes over later.
        print(f"Crime download failed: {e}")
    else:
        crime_file.write_bytes(r.content)

# Try to join crime to ZORI
if crime_file.exists():
    try:
        # Accept both header styles: capitalized ("Community Area") and
        # snake_case ("community_area").
        wanted = {'date', 'primary_type', 'community_area'}
        crimes = pd.read_csv(
            crime_file,
            usecols=lambda c: _normalize_col(c) in wanted
        )
        crimes.columns = [_normalize_col(c) for c in crimes.columns]
        missing_cols = wanted - set(crimes.columns)
        if missing_cols:
            raise KeyError(f"crime file lacks columns: {sorted(missing_cols)}")

        crimes['date'] = pd.to_datetime(crimes['date'], errors='coerce')
        crimes['community_area'] = pd.to_numeric(
            crimes['community_area'], errors='coerce'
        )
        crimes = crimes.dropna(subset=['date', 'community_area'])
        if crimes.empty:
            raise ValueError("no usable crime rows")
        crimes['community_area'] = crimes['community_area'].astype(int)

        # Monthly aggregation: bucket every incident into its calendar
        # month (month-start timestamp) before counting.
        crimes['month'] = crimes['date'].dt.to_period('M').dt.to_timestamp()
        monthly = (
            crimes
            .groupby(['month', 'community_area'])
            .size()
            .unstack('community_area', fill_value=0)
        )

        # Insert months with no incidents as explicit zeros so a 12-row
        # window is always exactly 12 calendar months.
        all_months = pd.date_range(
            monthly.index.min(), monthly.index.max(), freq='MS'
        )
        monthly = monthly.reindex(all_months, fill_value=0)

        # Rolling 12-month sum per community area (NaN until 12 months
        # of history are available).
        rolling_12m = monthly.rolling(12).sum().rename_axis(index='month')
        crime_agg = (
            rolling_12m
            .reset_index()
            .melt(
                id_vars='month',
                var_name='community_area',
                value_name='crime_12m_sum'
            )
            .dropna(subset=['crime_12m_sum'])
        )

        # Map CA -> ZIP and then average within each ZIP/month
        crime_agg['zip'] = crime_agg['community_area'].map(ca_to_zip)
        crime_zip = (
            crime_agg
            .dropna(subset=['zip'])
            .groupby(['zip', 'month'])['crime_12m_sum']
            .mean()
            .reset_index()
        )
        crime_zip['zip'] = crime_zip['zip'].astype(str)

        # Join on (zip, calendar month) rather than on the exact timestamp,
        # so the match does not depend on which day of the month the rent
        # series uses. The result is assigned only after the merge succeeds,
        # so a failure leaves zori_long untouched.
        merged = (
            zori_long
            .assign(
                _month=zori_long['date'].dt.to_period('M').dt.to_timestamp()
            )
            .merge(
                crime_zip.rename(columns={'month': '_month'}),
                on=['zip', '_month'],
                how='left',
                validate='many_to_one'
            )
            .drop(columns='_month')
        )
        zori_long = merged
        print("Crime API merged.")
    except Exception as e:
        # Deliberate best-effort boundary: any failure here falls through
        # to the synthetic crime signal generated later.
        print(f"Crime merge failed: {e}")

# --------------------------------------------------
# 5. ADVANCED ECONOMIC + LIFESTYLE FEATURES
# --------------------------------------------------

# Synthesizes the economic and lifestyle columns on zori_long.
# NOTE: every np.random call below draws from NumPy's global RNG, so the
# generated values depend on the exact order of these statements (and on
# which branches run). Reordering them changes the output dataset.
print("\n--- Generating Advanced Features ---")

# ---------- Income ----------
# If no median_income column exists yet, synthesize one value per ZIP
# (constant across that ZIP's dates).
if 'median_income' not in zori_long.columns:
    np.random.seed(42)  # fixed seed -> same ZIP incomes on every run
    unique_zips = zori_long['zip'].unique()

    # One uniform draw on [35k, 120k) per ZIP
    income_levels = np.random.uniform(35000, 120000, len(unique_zips))
    zip_income_map = dict(zip(unique_zips, income_levels))
    zori_long['median_income'] = zori_long['zip'].map(zip_income_map)
else:
    unique_zips = zori_long['zip'].unique()
    # median_income already present; unique_zips is still needed below

# ---------- Crime fallback ----------
# Synthesize crime when the column is absent or sums to zero. pandas' sum
# skips NaN by default, so an all-NaN column (merge matched nothing) also
# triggers this branch.
if 'crime_12m_sum' not in zori_long.columns or zori_long['crime_12m_sum'].sum() == 0:
    # Min-max scale income to [0, 1].
    # NOTE(review): divides by zero (yielding NaN) if all incomes are equal.
    inc_norm = (
        (zori_long['median_income'] - zori_long['median_income'].min()) /
        (zori_long['median_income'].max() - zori_long['median_income'].min())
    )
    # Inverse-income signal: 800 at the lowest income, 0 at the highest,
    # plus Gaussian noise (sd 50) per row; negatives are clipped to 0.
    zori_long['crime_12m_sum'] = (
        (1 - inc_norm) * 800 + np.random.normal(0, 50, len(zori_long))
    )
    zori_long['crime_12m_sum'] = zori_long['crime_12m_sum'].clip(lower=0)

# Convenience handles
dist = zori_long['dist_to_loop_miles']
inc = zori_long['median_income']

# ---------- Transit score ----------
# Declines 3 points per mile from 100, bounded to [10, 100]; the noise is
# added AFTER the clip, so final values can fall slightly outside that range.
zori_long['transit_score'] = (
    (100 - dist * 3).clip(10, 100) + np.random.normal(0, 5, len(zori_long))
)

# ---------- Housing age ----------
# Declines 2 per mile from 100, bounded to [10, 90]; noise (sd 10) is added
# after the clip, so values are not strictly bounded.
zori_long['housing_age_median'] = (
    (100 - dist * 2).clip(10, 90) + np.random.normal(0, 10, len(zori_long))
)

# ---------- Vacancy rate ----------
# V-shaped in income: 2% at an income of 70k, rising linearly with the
# absolute distance from 70k (no noise term).
zori_long['vacancy_rate'] = (np.abs(inc - 70000) / 70000) * 0.1 + 0.02

# ---------- Poverty rate ----------
# Linear in income, capped to [0, 0.4]. The cap binds for every income
# below 90k, so with the synthetic incomes above most ZIPs sit at 0.4.
zori_long['poverty_rate'] = ((150000 - inc) / 150000).clip(0, 0.4)

# ---------- University proximity ----------
# Flag a random 10% of ZIPs (rounded down) as "college-adjacent";
# reseeding here makes the selection reproducible.
np.random.seed(99)
college_zips = np.random.choice(
    unique_zips,
    size=int(len(unique_zips) * 0.1),
    replace=False
)
zori_long['university_flag'] = np.where(
    zori_long['zip'].isin(college_zips),
    1,
    0
)

# ---------- Lifestyle indices ----------
# The presence of hipster_score gates ALL five lifestyle columns below.
if 'hipster_score' not in zori_long.columns:
    # Hipster: Gaussian bump peaking 4 miles from the Loop, plus an income
    # term (inc / 4000); bounded to [0, 100].
    zori_long['hipster_score'] = (
        np.exp(-(dist - 4) ** 2 / 10) * 100 + (inc / 4000)
    ).clip(0, 100)

    # Nightlife: step function -- 90 within 3 miles of the Loop, 20 beyond,
    # plus noise; bounded to [0, 100].
    zori_long['nightlife_index'] = (
        np.where(dist < 3, 90, 20) + np.random.normal(0, 5, len(zori_long))
    ).clip(0, 100)

    # Wellness: proportional to income (inc / 1500) plus noise; bounded
    # to [0, 100].
    zori_long['wellness_index'] = (
        (inc / 1500) + np.random.normal(0, 5, len(zori_long))
    ).clip(0, 100)

    # Pet-friendliness: 80 strictly between 3 and 12 miles out, 40
    # elsewhere, plus noise; bounded to [0, 100].
    zori_long['pet_index'] = (
        np.where((dist > 3) & (dist < 12), 80, 40)
        + np.random.normal(0, 10, len(zori_long))
    ).clip(0, 100)

    # School rating: weighted mix of income (60%) and distance (40%),
    # bounded to [1, 10]; deterministic (no noise).
    zori_long['school_rating'] = (
        (inc / 25000) * 0.6 + (dist / 4) * 0.4
    ).clip(1, 10)

# --------------------------------------------------
# 6. FINAL CLEANUP + SAVE
# --------------------------------------------------

# Guarantee a permits_12m_sum column: patch gaps in an existing one with 0,
# otherwise create an all-zero baseline.
if 'permits_12m_sum' in zori_long.columns:
    zori_long['permits_12m_sum'] = zori_long['permits_12m_sum'].fillna(0)
else:
    zori_long['permits_12m_sum'] = pd.Series(0, index=zori_long.index)

# Modelling table: rows with a defined target only; every remaining NaN
# (in any column) becomes 0.
has_target = zori_long['relative_12m_growth'].notna()
df_model = zori_long.loc[has_target].fillna(0)

# Write the result next to the other processed artefacts.
out_path = PROC.joinpath("chicago_augmented_12m.csv")
df_model.to_csv(out_path, index=False)
print(f"Saved. Shape: {df_model.shape}, Path: {out_path}")


Using ROOT: C:\Users\MaxGillum\Desktop\cmse492_project

--- Loading Zillow Data ---

--- Calculating Structural Features ---

--- Processing Crime Data ---


  crimes['date'] = pd.to_datetime(crimes['date'], errors='coerce')


Crime API merged.

--- Generating Advanced Features ---
Saved. Shape: (7410, 22), Path: C:\Users\MaxGillum\Desktop\cmse492_project\data\processed\chicago_augmented_12m.csv
