# Census Tract Population Data

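This notebook retrieves the ACS 2022 5-Year total population estimate (variable `B01003_001E`) for every census tract in the contiguous United States and the District of Columbia from the Census API, cleans it, and saves it as a Parquet file for downstream analysis.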

**Important**: Run the cells in order, starting with the configuration cell below.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
# Load the project configuration by running config.py inside this notebook.
# The printed output of this cell reports the project root and data directory.
# NOTE(review): config.py is expected to assign `data_dir`, which the save cell
# at the end of the notebook reads — confirm against config.py.
data_dir = None  # Placeholder so Pylance sees the name; the real value is defined in config.py
%run ../../config.py

Project root: /Users/max/proj/WirelessIncome
Data directory: /Users/max/proj/WirelessIncome/data


In [2]:
import requests
from pathlib import Path

## Configure Census API

In [3]:
# Endpoint queried by the fetch cell, and the variable requested from it
# (renamed to "population" when the per-state tables are combined).
BASE_URL = "https://api.census.gov/data/2022/acs/acs5"
POPULATION_VAR = "B01003_001E"

# Two-digit state codes from 01 through 56, minus the numbers listed below.
# NOTE(review): the skipped numbers are presumably Alaska (02), Hawaii (15) and
# the unassigned FIPS codes 03, 07, 14, 43 and 52 — confirm against the FIPS table.
_SKIPPED_FIPS = {2, 3, 7, 14, 15, 43, 52}
STATE_FIPS = [f"{code:02d}" for code in range(1, 57) if code not in _SKIPPED_FIPS]

print(f"Fetching {POPULATION_VAR} for {len(STATE_FIPS)} states (contiguous US + DC)")

Fetching B01003_001E for 49 states (contiguous US + DC)


## Fetch Population by Tract

In [4]:
def fetch_state_tracts(session, state_fips):
    """Fetch the tract-level population table for one state.

    Parameters
    ----------
    session : requests.Session
        Open session used to issue the request; sharing one session across
        states lets the connection to the API host be reused.
    state_fips : str
        State code placed in the ``in=state:<code>`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the API. Column names come from the
        first row of the JSON payload (the header row).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If the response body is empty or contains no header row.
    """
    params = {
        "get": f"{POPULATION_VAR},NAME",
        "for": "tract:*",
        "in": f"state:{state_fips}"
    }
    response = session.get(BASE_URL, params=params, timeout=60)
    response.raise_for_status()

    # An empty body cannot be parsed as JSON; treat it like an empty payload so
    # the caller gets one clear error instead of a JSON decoding traceback.
    payload = response.json() if response.content else []
    if not payload:
        raise ValueError(f"Census API returned no data for state {state_fips}")

    header, *rows = payload
    return pd.DataFrame(rows, columns=header)


# Rebuilt from scratch on every run of this cell, so re-running never
# accumulates duplicate state tables.
all_data = []

with requests.Session() as session:
    for i, state_fips in enumerate(STATE_FIPS, start=1):
        # flush=True makes the progress prefix visible while the request is in flight.
        print(f"{i:02d}/{len(STATE_FIPS)} - state {state_fips}", end="  ", flush=True)
        state_tracts = fetch_state_tracts(session, state_fips)
        all_data.append(state_tracts)
        print(f"tracts: {len(state_tracts):,}")

print(f"\nFinished fetching {len(all_data)} states.")

01/49 - state 01  tracts: 1,437
02/49 - state 04  tracts: 1,437
02/49 - state 04  tracts: 1,765
03/49 - state 05  tracts: 1,765
03/49 - state 05  tracts: 823
04/49 - state 06  tracts: 823
04/49 - state 06  tracts: 9,129
05/49 - state 08  tracts: 9,129
05/49 - state 08  tracts: 1,447
06/49 - state 09  tracts: 1,447
06/49 - state 09  tracts: 884
07/49 - state 10  tracts: 884
07/49 - state 10  tracts: 262
08/49 - state 11  tracts: 262
08/49 - state 11  tracts: 206
09/49 - state 12  tracts: 206
09/49 - state 12  tracts: 5,160
10/49 - state 13  tracts: 5,160
10/49 - state 13  tracts: 2,796
11/49 - state 16  tracts: 2,796
11/49 - state 16  

KeyboardInterrupt: 

## Combine and Clean Data

In [None]:
# Refuse to build the combined table from a partial fetch. If the fetch cell was
# interrupted or skipped, all_data holds fewer tables than there are states, and
# every downstream count (and the saved Parquet file) would silently be incomplete.
if len(all_data) != len(STATE_FIPS):
    raise RuntimeError(
        f"Expected {len(STATE_FIPS)} state tables but found {len(all_data)}; "
        "re-run the fetch cell to completion before combining."
    )

tracts_raw = pd.concat(all_data, ignore_index=True)

df = (
    tracts_raw
    .rename(columns={POPULATION_VAR: "population", "NAME": "name"})
    .assign(
        # GEOID is the state, county and tract codes joined end to end.
        # NOTE(review): assumes the API delivers these codes as zero-padded
        # strings, so that "+" concatenates rather than adds — confirm.
        GEOID=lambda d: d["state"] + d["county"] + d["tract"],
        # Values that cannot be parsed as numbers become NaN and are counted
        # as missing in the summary cell.
        population=lambda d: pd.to_numeric(d["population"], errors="coerce"),
    )
    .loc[:, ["GEOID", "state", "county", "tract", "population", "name"]]
)

# Each tract must appear exactly once; duplicates would mean a state was fetched twice.
assert df["GEOID"].is_unique, "duplicate GEOIDs in combined tract table"

df.head()

## Population Summary

In [None]:
# Headline counts for the combined table: number of tract rows, number of
# distinct state codes, and how many rows have no usable population value.
summary = dict(
    tracts=len(df),
    states=df["state"].nunique(),
    missing_population=int(df["population"].isna().sum()),
)

# Single-row frame so the notebook renders the counts as a table.
t = pd.DataFrame.from_records([summary])
t

## Population Distribution

In [None]:
# Descriptive statistics for the population column of the combined tract table.
df.loc[:, "population"].describe()

## Save to Parquet

In [None]:
# data_dir starts as a None placeholder in the configuration cell and is meant
# to be replaced by config.py. Fail with a clear message rather than the
# TypeError that Path(None) would raise.
if data_dir is None:
    raise RuntimeError(
        "data_dir is not set; run the configuration cell at the top of the notebook first."
    )

output_path = Path(data_dir) / "census_tract_population_2022.parquet"
# Create the target directory if it does not exist yet (e.g. on a fresh checkout),
# so the write below does not fail on a missing folder.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path, index=False)

# Small record of what was written, displayed as the cell's output.
save_stats = {
    "rows": len(df),
    "output_path": str(output_path)
}

save_stats