# Census Tract Characteristics (Urban/Rural, Metro, Commute)

This notebook retrieves tract-level characteristics from the Census Bureau APIs — 2020 decennial urban/rural population counts and the 2022 ACS 5-year mean commute time — merges them on the 11-digit tract `GEOID`, and saves the result to `data_dir` as `tract_characteristics.parquet` for downstream analysis.

In [None]:
# Enable autoreload and import configuration
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import requests

# Placeholder so linters see `data_dir` as a defined name. The `%run` below
# executes config.py, which is expected to rebind it; later cells use it with
# the `/` path operator, so it must end up path-like.
# NOTE(review): confirm config.py actually assigns `data_dir`.
data_dir = None
%run ../../config.py

In [None]:
# --- 2020 decennial census -------------------------------------------------
DECENNIAL_YEAR = 2020
URBAN_URL = f"https://api.census.gov/data/{DECENNIAL_YEAR}/dec/pl"
# Intended as total / urban / rural population.
# NOTE(review): in the PL 94-171 ("dec/pl") file, table P2 is Hispanic or
# Latino origin by race (P2_002N = Hispanic or Latino, P2_005N = White alone,
# not Hispanic). Urban/rural counts are published in the DHC file ("dec/dhc"),
# table P2, as P2_002N (urban) and P2_003N (rural). Verify against the API
# variable list before relying on these codes.
URBAN_VARS = [
    "P2_001N",
    "P2_002N",
    "P2_005N",
]

# --- ACS 5-year data profile -----------------------------------------------
ACS_YEAR = 2022
ACS_PROFILE_URL = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile"
COMMUTE_VAR = "DP03_0025E"  # Mean travel time to work (minutes)

# State FIPS codes dropped from every query. Puerto Rico ("72") is skipped to
# stay within API limits; adjust as needed.
EXCLUDED_STATE_FIPS = {"72"}


def fetch_state_fips(dataset_url=f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"):
    """Return the two-digit state FIPS codes offered by a Census dataset.

    Queries ``dataset_url`` for every ``state:*`` geography, left-pads each
    code to two characters, and drops any code in ``EXCLUDED_STATE_FIPS``.

    Args:
        dataset_url: Census API dataset endpoint to query. The default is
            built once, at definition time, from ``ACS_YEAR``.

    Returns:
        list[str]: State FIPS codes in the order the API returned them.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    response = requests.get(
        dataset_url,
        params={"get": "NAME", "for": "state:*"},
        timeout=30,
    )
    response.raise_for_status()

    # The API returns a list of rows whose first row is the header.
    rows = response.json()
    states = pd.DataFrame(rows[1:], columns=rows[0])

    codes = states["state"].str.zfill(2)
    keep = ~codes.isin(EXCLUDED_STATE_FIPS)
    return codes[keep].tolist()


def fetch_tract_data(base_url, variables, state_fips):
    """Download tract-level values for ``variables``, one request per state.

    Args:
        base_url: Census API dataset endpoint.
        variables: Iterable of variable codes to request; ``NAME`` is always
            requested in addition.
        state_fips: Iterable of two-digit state FIPS codes to query.

    Returns:
        pandas.DataFrame: One row per tract across all states, with every
        value as returned by the API (strings) and ``state``/``county``/
        ``tract`` zero-padded to 2/3/6 characters.

    Raises:
        requests.HTTPError: If any request returns an error status.
        RuntimeError: If no state yielded any data (including an empty
            ``state_fips``).
    """
    import warnings

    # Built once: it does not depend on the state, and unpacking lets callers
    # pass any iterable (list, tuple, generator) rather than only a list.
    get_clause = ",".join(["NAME", *variables])

    frames = []
    empty_states = []
    for state in state_fips:
        params = {
            "get": get_clause,
            "for": "tract:*",
            "in": f"state:{state}",
        }
        response = requests.get(base_url, params=params, timeout=60)
        response.raise_for_status()

        # A successful response can carry no body (e.g. HTTP 204 when a query
        # matches nothing); calling .json() on it would raise, so record the
        # state and move on instead.
        if response.status_code == 204 or not response.content:
            empty_states.append(state)
            continue

        # First row of the payload is the header, the rest are data rows.
        payload = response.json()
        t = pd.DataFrame(payload[1:], columns=payload[0])
        for column, width in (("state", 2), ("county", 3), ("tract", 6)):
            t[column] = t[column].str.zfill(width)
        frames.append(t)

    if empty_states:
        warnings.warn(
            f"Census API returned no tract data for state(s): {empty_states}",
            stacklevel=2,
        )
    if not frames:
        raise RuntimeError("No data returned from Census API.")
    return pd.concat(frames, ignore_index=True)


In [None]:
# Resolve the list of state FIPS codes once; every tract query below loops
# over it. The count includes whatever state-level geographies the API
# returns, minus EXCLUDED_STATE_FIPS.
state_fips = fetch_state_fips()
print(f"Fetching data for {len(state_fips)} states")
# Preview the first few codes as the cell output.
state_fips[:5]

In [None]:
# Urban/rural population is published in the 2020 Demographic and Housing
# Characteristics (DHC) file, table P2 "Urban and Rural". The PL 94-171 file
# that URBAN_URL/URBAN_VARS point at reuses the P2_* codes for a different
# table (Hispanic or Latino origin by race), so P2_002N/P2_005N fetched from
# it are not urban/rural counts. The endpoint and codes are pinned here so the
# fetch and the column rename cannot drift apart.
urban_rural_url = f"https://api.census.gov/data/{DECENNIAL_YEAR}/dec/dhc"
urban_rural_columns = {
    "P2_001N": "population_total",
    "P2_002N": "population_urban",
    "P2_003N": "population_rural",
}

urban_df = fetch_tract_data(urban_rural_url, list(urban_rural_columns), state_fips)
urban_df.rename(columns=urban_rural_columns, inplace=True)

# The API returns every value as a string; unparseable entries become NaN.
for col in urban_rural_columns.values():
    urban_df[col] = pd.to_numeric(urban_df[col], errors="coerce")

# 11-character tract identifier: state (2) + county (3) + tract (6).
urban_df["GEOID"] = urban_df["state"] + urban_df["county"] + urban_df["tract"]

print(f"Urban/rural rows: {len(urban_df):,}")
urban_df.head()

In [None]:
# Mean travel time to work per tract from the ACS 5-year data profile.
commute_df = fetch_tract_data(ACS_PROFILE_URL, [COMMUTE_VAR], state_fips)
commute_df.rename(columns={COMMUTE_VAR: "mean_commute_minutes"}, inplace=True)

# The API returns strings; unparseable entries become NaN.
commute_minutes = pd.to_numeric(commute_df["mean_commute_minutes"], errors="coerce")
# ACS marks estimates it cannot report with large negative sentinel values
# (e.g. -666666666). A travel time is never negative, so treat any negative
# value as missing instead of letting it skew downstream statistics.
commute_df["mean_commute_minutes"] = commute_minutes.where(commute_minutes >= 0)

# 11-character tract identifier: state (2) + county (3) + tract (6).
commute_df["GEOID"] = commute_df["state"] + commute_df["county"] + commute_df["tract"]

print(f"Commute rows: {len(commute_df):,}")
commute_df.head()

In [None]:
# Left join keeps every decennial tract; tracts with no ACS match get a NaN
# commute time.
# NOTE(review): the 2022 ACS tabulates Connecticut by planning region rather
# than by the legacy counties used in the 2020 census, so Connecticut GEOIDs
# may not match here. Check the missing-commute count by state to confirm.
merged = urban_df.merge(
    commute_df[["GEOID", "mean_commute_minutes"]], on="GEOID", how="left"
)

# Shares of the tract population. Division by a zero total yields inf or NaN;
# both are normalised to NaN.
merged["urban_share"] = merged["population_urban"].div(merged["population_total"]).replace([np.inf, -np.inf], np.nan)
merged["rural_share"] = merged["population_rural"].div(merged["population_total"]).replace([np.inf, -np.inf], np.nan)

# Majority-urban flag. A NaN share (zero-population or missing tract) would
# compare False and so be reported as rural; use the nullable boolean dtype
# and leave both flags as <NA> when the share is unknown.
share_missing = merged["urban_share"].isna()
merged["is_urban"] = (merged["urban_share"] >= 0.5).astype("boolean").mask(share_missing)
merged["is_rural"] = ~merged["is_urban"]

# NOTE(review): this compares a single tract's urban population with a 50,000
# threshold. Confirm that any tract can reach it; otherwise the flag is always
# False and a county/CBSA-level definition is needed.
merged["is_metro"] = merged["population_urban"] >= 50000  # proxy for metro (urbanized area threshold)
merged["GEOID"] = merged["GEOID"].str.zfill(11)

selected_cols = [
    "GEOID",
    "NAME",
    "population_total",
    "population_urban",
    "population_rural",
    "urban_share",
    "rural_share",
    "is_urban",
    "is_rural",
    "is_metro",
    "mean_commute_minutes",
]

merged = merged[selected_cols]
print(f"Merged dataset rows: {len(merged):,}")
merged.head()

In [None]:
# Write the merged tract table to Parquet under `data_dir`, without the
# DataFrame index. `data_dir` must support the `/` path operator.
# NOTE(review): presumably a pathlib.Path set by config.py — confirm.
output_path = data_dir / "tract_characteristics.parquet"
merged.to_parquet(output_path, index=False)
print(f"Saved {len(merged):,} rows to {output_path}")
# Show the destination as the cell output.
output_path

In [None]:
# One-row overview of the saved dataset: its size, its column names, and how
# many tracts lack an urban share or a commute time.
missing_counts = merged[["urban_share", "mean_commute_minutes"]].isna().sum()
summary = dict(
    rows=len(merged),
    columns=merged.columns.tolist(),
    urban_share_missing=int(missing_counts["urban_share"]),
    commute_missing=int(missing_counts["mean_commute_minutes"]),
)
pd.DataFrame([summary])