# Census Tract Characteristics (Urban/Rural, Metro, Commute)

This notebook retrieves tract-level characteristics from the Census APIs, including 2020 decennial urban/rural counts and 2022 ACS 5-year commute statistics, then saves the merged dataset to `data_dir` for downstream analysis.

In [1]:
# Enable autoreload and import configuration
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import requests

# Placeholder for linters; actual value defined via config
data_dir = None
%run ../../config.py

Project root: /Users/eric/proj/scratch/WirelessIncome
Data directory: /Users/eric/proj/scratch/WirelessIncome/data


In [None]:
ACS_YEAR = 2022
ACS_PROFILE_URL = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/profile"
COMMUTE_VAR = "DP03_0025E"  # Mean travel time to work (minutes)

DECENNIAL_YEAR = 2020
URBAN_URL = f"https://api.census.gov/data/{DECENNIAL_YEAR}/dec/pl"
URBAN_VARS = ["P2_001N", "P2_002N", "P2_005N"]  # Total, Urban, Rural population

CBSA_TRACT_URL = "https://www2.census.gov/geo/docs/maps-data/data/rel/cbsa_tract_rel_2020.txt"

EXCLUDED_STATE_FIPS = {"72"}  # Skip Puerto Rico to avoid API limits; can be adjusted


def fetch_state_fips(dataset_url=f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"):
    params = {"get": "NAME", "for": "state:*"}
    response = requests.get(dataset_url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    t = pd.DataFrame(data[1:], columns=data[0])
    t["state"] = t["state"].str.zfill(2)
    t = t[~t["state"].isin(EXCLUDED_STATE_FIPS)]
    return t["state"].tolist()


def fetch_tract_data(base_url, variables, state_fips):
    frames = []
    for state in state_fips:
        params = {
            "get": ",".join(["NAME"] + variables),
            "for": "tract:*",
            "in": f"state:{state}",
        }
        response = requests.get(base_url, params=params, timeout=60)
        response.raise_for_status()
        payload = response.json()
        t = pd.DataFrame(payload[1:], columns=payload[0])
        t["state"] = t["state"].str.zfill(2)
        t["county"] = t["county"].str.zfill(3)
        t["tract"] = t["tract"].str.zfill(6)
        frames.append(t)
    if not frames:
        raise RuntimeError("No data returned from Census API.")
    return pd.concat(frames, ignore_index=True)


In [3]:
state_fips = fetch_state_fips()
print(f"Fetching data for {len(state_fips)} states")
state_fips[:5]

Fetching data for 51 states


['01', '02', '04', '05', '06']

In [4]:
urban_df = fetch_tract_data(URBAN_URL, URBAN_VARS, state_fips)
urban_df.rename(
    columns={
        "P2_001N": "population_total",
        "P2_002N": "population_urban",
        "P2_005N": "population_rural",
    },
    inplace=True,
)
for col in ["population_total", "population_urban", "population_rural"]:
    urban_df[col] = pd.to_numeric(urban_df[col], errors="coerce")
urban_df["GEOID"] = urban_df["state"] + urban_df["county"] + urban_df["tract"]

print(f"Urban/rural rows: {len(urban_df):,}")
urban_df.head()

Urban/rural rows: 84,414


Unnamed: 0,NAME,population_total,population_urban,population_rural,state,county,tract,GEOID
0,"Census Tract 201, Autauga County, Alabama",1775,76,1376,1,1,20100,1001020100
1,"Census Tract 202, Autauga County, Alabama",2055,43,834,1,1,20200,1001020200
2,"Census Tract 203, Autauga County, Alabama",3216,100,2220,1,1,20300,1001020300
3,"Census Tract 204, Autauga County, Alabama",4246,158,3522,1,1,20400,1001020400
4,"Census Tract 205.01, Autauga County, Alabama",4322,156,3211,1,1,20501,1001020501


In [5]:
commute_df = fetch_tract_data(ACS_PROFILE_URL, [COMMUTE_VAR], state_fips)
commute_df.rename(columns={COMMUTE_VAR: "mean_commute_minutes"}, inplace=True)
commute_df["mean_commute_minutes"] = pd.to_numeric(commute_df["mean_commute_minutes"], errors="coerce")
commute_df["GEOID"] = commute_df["state"] + commute_df["county"] + commute_df["tract"]

print(f"Commute rows: {len(commute_df):,}")
commute_df.head()

Commute rows: 84,415


Unnamed: 0,NAME,mean_commute_minutes,state,county,tract,GEOID
0,Census Tract 201; Autauga County; Alabama,19.3,1,1,20100,1001020100
1,Census Tract 202; Autauga County; Alabama,25.9,1,1,20200,1001020200
2,Census Tract 203; Autauga County; Alabama,27.0,1,1,20300,1001020300
3,Census Tract 204; Autauga County; Alabama,23.0,1,1,20400,1001020400
4,Census Tract 205.01; Autauga County; Alabama,20.9,1,1,20501,1001020501


In [None]:
df = urban_df.merge(
    commute_df[["GEOID", "mean_commute_minutes"]], on="GEOID", how="left"
)

df["urban_share"] = df["population_urban"].div(df["population_total"]).replace([np.inf, -np.inf], np.nan)
df["rural_share"] = df["population_rural"].div(df["population_total"]).replace([np.inf, -np.inf], np.nan)
df["is_urban"] = df["urban_share"] >= 0.5
df["is_rural"] = ~df["is_urban"]
df["GEOID"] = df["GEOID"].str.zfill(11)

base_cols = [
    "GEOID",
    "NAME",
    "population_total",
    "population_urban",
    "population_rural",
    "urban_share",
    "rural_share",
    "is_urban",
    "is_rural",
    "mean_commute_minutes",
]

df = df[base_cols]
print(f"Merged dataset rows: {len(df):,}")
df.head()

Merged dataset rows: 84,414


Unnamed: 0,GEOID,NAME,population_total,population_urban,population_rural,urban_share,rural_share,is_urban,is_rural,is_metro,mean_commute_minutes
0,1001020100,"Census Tract 201, Autauga County, Alabama",1775,76,1376,0.042817,0.775211,False,True,False,19.3
1,1001020200,"Census Tract 202, Autauga County, Alabama",2055,43,834,0.020925,0.405839,False,True,False,25.9
2,1001020300,"Census Tract 203, Autauga County, Alabama",3216,100,2220,0.031095,0.690299,False,True,False,27.0
3,1001020400,"Census Tract 204, Autauga County, Alabama",4246,158,3522,0.037211,0.829487,False,True,False,23.0
4,1001020501,"Census Tract 205.01, Autauga County, Alabama",4322,156,3211,0.036094,0.742943,False,True,False,20.9


In [None]:
cbsa_df = pd.read_csv(CBSA_TRACT_URL, dtype=str)
cbsa_df.columns = [c.strip() for c in cbsa_df.columns]

if "GEOID" in cbsa_df.columns:
    cbsa_df["GEOID"] = cbsa_df["GEOID"].str.zfill(11)
else:
    def _find_col(options):
        for opt in options:
            match = next((col for col in cbsa_df.columns if col.lower() == opt or col.lower().endswith(opt)), None)
            if match:
                return match
        return None

    state_col = _find_col(["statefp", "state"])
    county_col = _find_col(["countyfp", "county"])
    tract_col = _find_col(["tractce", "tract"])
    if not all([state_col, county_col, tract_col]):
        raise ValueError("Could not identify state/county/tract columns in CBSA crosswalk")
    cbsa_df["GEOID"] = (
        cbsa_df[state_col].str.zfill(2)
        + cbsa_df[county_col].str.zfill(3)
        + cbsa_df[tract_col].str.zfill(6)
    )

rename_map = {
    "CBSAFP": "cbsa_fips",
    "NAME": "cbsa_name",
    "LSAD": "cbsa_lsad",
    "MEMI": "cbsa_metro_micro",
    "METDIVFP": "metdiv_fips",
}
cbsa_df = cbsa_df.rename(columns={k: v for k, v in rename_map.items() if k in cbsa_df.columns})
cbsa_df["cbsa_metro_micro"] = cbsa_df.get("cbsa_metro_micro")
cbsa_df["is_cbsa_metro"] = cbsa_df["cbsa_metro_micro"].eq("1")
cbsa_trim_cols = [
    "GEOID",
    "cbsa_fips",
    "cbsa_name",
    "cbsa_lsad",
    "cbsa_metro_micro",
    "is_cbsa_metro",
]
cbsa_trim = cbsa_df.loc[:, [col for col in cbsa_trim_cols if col in cbsa_df.columns]].drop_duplicates("GEOID")

print(f"CBSA crosswalk rows: {len(cbsa_trim):,}")
cbsa_trim.head()

In [None]:
df = df.merge(cbsa_trim, on="GEOID", how="left")

df["cbsa_fips"] = df["cbsa_fips"].fillna("")
df["cbsa_name"] = df["cbsa_name"].fillna("")
df["cbsa_lsad"] = df["cbsa_lsad"].fillna("")
df["cbsa_metro_micro"] = df["cbsa_metro_micro"].fillna("")
df["is_cbsa_metro"] = df["is_cbsa_metro"].fillna(False)
df["is_metro"] = df["is_cbsa_metro"]

ordered_cols = [
    "GEOID",
    "NAME",
    "population_total",
    "population_urban",
    "population_rural",
    "urban_share",
    "rural_share",
    "is_urban",
    "is_rural",
    "mean_commute_minutes",
    "cbsa_fips",
    "cbsa_name",
    "cbsa_lsad",
    "cbsa_metro_micro",
    "is_cbsa_metro",
    "is_metro",
]

df = df[[col for col in ordered_cols if col in df.columns]]
print("Metro flag summary:")
print(df["is_metro"].value_counts(dropna=False))

In [10]:
output_path = data_dir / "tract_characteristics.parquet"
df.to_parquet(output_path, index=False)
print(f"Saved {len(df):,} rows to {output_path}")
output_path

Saved 84,414 rows to /Users/eric/proj/scratch/WirelessIncome/data/tract_characteristics.parquet


PosixPath('/Users/eric/proj/scratch/WirelessIncome/data/tract_characteristics.parquet')

In [11]:
df.head()

Unnamed: 0,GEOID,NAME,population_total,population_urban,population_rural,urban_share,rural_share,is_urban,is_rural,is_metro,mean_commute_minutes
0,1001020100,"Census Tract 201, Autauga County, Alabama",1775,76,1376,0.042817,0.775211,False,True,False,19.3
1,1001020200,"Census Tract 202, Autauga County, Alabama",2055,43,834,0.020925,0.405839,False,True,False,25.9
2,1001020300,"Census Tract 203, Autauga County, Alabama",3216,100,2220,0.031095,0.690299,False,True,False,27.0
3,1001020400,"Census Tract 204, Autauga County, Alabama",4246,158,3522,0.037211,0.829487,False,True,False,23.0
4,1001020501,"Census Tract 205.01, Autauga County, Alabama",4322,156,3211,0.036094,0.742943,False,True,False,20.9


In [12]:
df.is_metro.value_counts()

is_metro
False    84414
Name: count, dtype: int64