<a href="https://colab.research.google.com/github/meganm2c/MTC-SF-Neighborhood-Amenity-Equity-Project/blob/megan%2Feda/sf_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
from census import Census
from us import states
import os
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from pathlib import Path

In [30]:
# Census API
from google.colab import userdata

# userdata.get() hands back the value stored in the Colab secret named
# 'censusAPIKey', i.e. the API key itself. It must go straight into Census():
# routing it through os.getenv() would look up an environment variable *named
# after the key*, return None, and silently build a client with no key.
# Despite its name, `censusAPIKey` is the Census client object, not the key string.
censusAPIKey = Census(userdata.get('censusAPIKey'))

In [43]:
# ACS 5-year estimate codes requested for every tract; "NAME" (the tract label)
# is prepended below so the request tuple keeps its original order.
_acs_estimate_codes = [
    "B03002_001E",  # total population
    "B03002_003E",  # non-hispanic white
    "B19083_001E",  # gini index
    "B19013_001E",  # median income
    "B17001_001E",  # poverty denominator
    "B17001_002E",  # poverty count
    "B08301_010E",  # public transit commuters
]
variables = ("NAME", *_acs_estimate_codes)

# ACS 5-year vintages to download, one API request per entry.
# NOTE(review): several of these 5-year windows presumably overlap
# (e.g. 2018 vs 2020) -- confirm that is acceptable for the trend analysis.
acs_years = [2015, 2018, 2020, 2022, 2023]

#helper to filter for valid tracts only
def clean_sf_tracts(df):
    """Return only the populated, regular tracts from an ACS tract table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``tract`` column and a ``population_total``
        column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels kept) in which ``tract`` is a
        6-character zero-padded string, ``population_total`` is numeric, and
        only rows with ``population_total > 0`` whose tract code does not start
        with "98" or "99" remain.
    """
    # Work on a copy so normalizing the columns below does not leak back into
    # the caller's frame (and cannot trigger SettingWithCopyWarning on a slice).
    df = df.copy()
    df["tract"] = df["tract"].astype(str).str.zfill(6)
    # Unparseable populations become NaN, which fails the "> 0" test below.
    df["population_total"] = pd.to_numeric(df["population_total"], errors="coerce")

    has_population = df["population_total"] > 0
    # Codes starting with 98/99 are dropped (intended to remove
    # water/Farallon/military tracts).
    is_special_tract = df["tract"].str.startswith(("98", "99"))
    return df[has_population & ~is_special_tract]

# Raw ACS code -> readable column name. Loop-invariant, so it is built once.
acs_column_names = {
    "B03002_001E": "population_total",
    "B03002_003E": "white/non_hispanic",
    "B19083_001E": "gini_index",
    "B19013_001E": "median_income",
    "B17001_001E": "poverty_denominator",
    "B17001_002E": "poverty_count",
    "B08301_010E": "transit_commuters"
}
acs_value_columns = list(acs_column_names.values())

# Build a tract-by-year panel: one ACS 5-year request per entry in acs_years,
# cleaned with clean_sf_tracts() and stacked into a single frame.
# NOTE(review): tract boundaries are presumably redrawn between the older and
# newer vintages, so the same GEOID may not describe the same area in every
# year -- confirm before comparing tracts across years.
sf_acs_year_dfs = []
for YEAR in acs_years:
    sf_acs = censusAPIKey.acs5.state_county_tract(
        variables,
        "06",      # CA
        "075",     # SF County
        Census.ALL,
        year=YEAR  # looping through acs_years
    )

    sf_acs_df = pd.DataFrame(sf_acs)
    sf_acs_df["GEOID"] = sf_acs_df["state"] + sf_acs_df["county"] + sf_acs_df["tract"]
    sf_acs_df = sf_acs_df.rename(columns=acs_column_names)

    # The Census API encodes "estimate not available" as large negative
    # sentinel numbers (e.g. -666666666). None of the requested measures can
    # legitimately be negative, so treat any negative value as missing instead
    # of letting it distort median income, the Gini index or the ratios below.
    sf_acs_values = sf_acs_df[acs_value_columns].apply(pd.to_numeric, errors="coerce")
    sf_acs_df[acs_value_columns] = sf_acs_values.where(sf_acs_values >= 0)

    #year column
    sf_acs_df["year"] = YEAR
    #deriving columns (a zero or missing denominator yields NaN, not an error)
    sf_acs_df["%_white_nonhisp"] = sf_acs_df["white/non_hispanic"] / sf_acs_df["population_total"]
    sf_acs_df["poverty_rate"] = sf_acs_df["poverty_count"] / sf_acs_df["poverty_denominator"]
    sf_acs_clean = clean_sf_tracts(sf_acs_df)
    sf_acs_year_dfs.append(sf_acs_clean)

final_sf_acs_years_panel = pd.concat(sf_acs_year_dfs, ignore_index=True)
final_sf_acs_years_panel

Unnamed: 0,NAME,population_total,white/non_hispanic,gini_index,median_income,poverty_denominator,poverty_count,transit_commuters,state,county,tract,GEOID,year,%_white_nonhisp,poverty_rate
0,"Census Tract 260.04, San Francisco County, Cal...",5015.0,528.0,0.4582,65862.0,5015.0,456.0,1099.0,06,075,026004,06075026004,2015,0.105284,0.090927
1,"Census Tract 301.01, San Francisco County, Cal...",4895.0,2915.0,0.4959,79071.0,4735.0,583.0,1326.0,06,075,030101,06075030101,2015,0.595506,0.123126
2,"Census Tract 330, San Francisco County, Califo...",8227.0,2891.0,0.4607,82527.0,8162.0,1088.0,1093.0,06,075,033000,06075033000,2015,0.351404,0.133301
3,"Census Tract 254.03, San Francisco County, Cal...",5154.0,1168.0,0.4495,73159.0,5128.0,611.0,1018.0,06,075,025403,06075025403,2015,0.226620,0.119150
4,"Census Tract 264.01, San Francisco County, Cal...",3937.0,137.0,0.4317,46150.0,3937.0,501.0,624.0,06,075,026401,06075026401,2015,0.034798,0.127254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1083,Census Tract 615.04; San Francisco County; Cal...,2133.0,390.0,0.4771,249539.0,2033.0,21.0,142.0,06,075,061504,06075061504,2023,0.182841,0.010330
1084,Census Tract 615.05; San Francisco County; Cal...,1162.0,291.0,0.2929,250001.0,1162.0,8.0,98.0,06,075,061505,06075061505,2023,0.250430,0.006885
1085,Census Tract 615.06; San Francisco County; Cal...,4918.0,1874.0,0.3948,250001.0,4918.0,174.0,285.0,06,075,061506,06075061506,2023,0.381049,0.035380
1086,Census Tract 615.07; San Francisco County; Cal...,1550.0,401.0,0.4970,167785.0,1539.0,60.0,436.0,06,075,061507,06075061507,2023,0.258710,0.038986


In [44]:
# SF ACS variable codebook: output folder plus one record per documented variable
codebook_dir = Path("sf_acs_variable_codebook")
codebook_dir.mkdir(parents=True, exist_ok=True)

# Each row reads (variable_name, acs_code, description, why_needed), so every
# variable's attributes sit side by side instead of in four parallel lists.
_codebook_fields = ("variable_name", "acs_code", "description", "why_needed")
_codebook_rows = [
    ("pop_total", "B03002_001E", "Total population",
     "Normalization denominator for all per-capita amenity metrics"),
    ("white_nonhisp", "B03002_003E", "White Non-Hispanic (count)",
     "Demographic composition (race)"),
    ("pct_white_nonhisp", "derived", "White Non-Hispanic proportion",
     "Key racial equity variable"),
    ("gini", "B19083_001E", "Income inequality (0–1)",
     "Income inequality metric for regressions"),
    ("median_income", "B19013_001E", "Median household income",
     "Economic stratification and control variable"),
    ("poverty_denom", "B17001_001E", "Poverty denominator",
     "Required for poverty rate"),
    ("poverty_count", "B17001_002E", "Poverty count",
     "Required for poverty rate"),
    ("poverty_rate", "derived", "Poverty rate",
     "Socioeconomic covariate (SES)"),
    ("transit_commuters", "B08301_010E", "Workers commuting by public transit",
     "Commute access proxy (accessibility indicator)"),
]

# Transpose the rows back into the column-oriented dict of lists
tracking = {
    field: [row[position] for row in _codebook_rows]
    for position, field in enumerate(_codebook_fields)
}

acs_codebook = pd.DataFrame(tracking)

# Write the codebook into codebook_dir in two formats, then show both paths.

# CSV copy
csv_path = codebook_dir.joinpath("sf_acs_variable_codebook.csv")
acs_codebook.to_csv(path_or_buf=csv_path, index=False)

# Markdown copy (maybe add to report?)
md_path = codebook_dir.joinpath("sf_acs_variable_codebook.md")
acs_codebook.to_markdown(buf=md_path, index=False)

(csv_path, md_path)

(PosixPath('sf_acs_variable_codebook/sf_acs_variable_codebook.csv'),
 PosixPath('sf_acs_variable_codebook/sf_acs_variable_codebook.md'))