In [0]:
%run /Workspace/Repos/yubin.park@mimilabs.ai/mimi-common-utils/ingestion_utils

In [0]:
%run /Workspace/Repos/yubin.park@mimilabs.ai/mimi-census/acs_composite_measures

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf, broadcast
from pyspark.sql.types import StringType
from dateutil.parser import parse

# Directory (under /Volumes) holding the 2023 ACS 5-year source data files.
# NOTE(review): not referenced in the cells visible here — presumably used by
# later cells or by the %run helper notebooks; confirm before renaming.
volumepath = "/Volumes/mimi_ws_1/census/src/acs/2023_5yr_data"

In [0]:
# Census summary-level code -> human-readable description of the geography.
# Keys are zero-padded 3-character strings (e.g. "040"), listed in ascending
# order with exactly one entry per code.
#
# Sources:
# https://www.census.gov/programs-surveys/geography/technical-documentation/naming-convention/cartographic-boundary-file/carto-boundary-summary-level.html
# http://mcdc2.missouri.edu/pub/data/sf32000/Techdoc/ch4_summary_level_seq_chart.pdf
# http://www2.census.gov/acs2011_1yr/summaryfile/ACS_2011_SF_Tech_Doc.pdf
# https://gist.github.com/ryanpitts/5265134
geo_mapping = {
    "010": "United States",
    "020": "Region",
    "030": "Division",
    "040": "State",
    "050": "State-County",
    "060": "State-County-County Subdivision",
    "061": "Minor Civil Division (MCD)/Census County Division (CCD) (10,000+)",
    "062": "Minor Civil Division (MCD)/Census County Division (CCD) (<10,000)",
    "063": "Minor Civil Division (MCD)/Census County Division (CCD) (2500+)",
    "064": "Minor Civil Division (MCD)/Census County Division (CCD) (< 2500 in Metro Area)",
    "067": "State (Puerto Rico Only)-County-County Subdivision-Subbarrio",
    "070": "State-County-County Subdivision-Place/Remainder",
    "071": "County Subdivision-Place (10,000+)/Remainder",
    "072": "County Subdivision-Place (2500+)/Remainder",
    "080": "State-County-County Subdivision-Place/Remainder-Census Tract",
    "082": "County Subdivision-Place(2500+)/Remainder-Census Tract",
    "085": "State-County-County Subdivision-Place/Remainder-Census Tract-Urban/Rural",
    "090": "State-County-County Subdivision-Place/Remainder-Census Tract-Urban/Rural-Block Group",
    "091": "County Subdivision-Place/Remainder-Census Tract-Block Group",
    "101": "State-County-Census Tract-Block",
    "140": "State-County-Census Tract",
    "144": "State-County-Census Tract-American Indian Area/Alaska Native Area/Hawaiian Home Land",
    "150": "State-County-Census Tract-Block Group",
    "154": "State-County-Census Tract-Block Group-American Indian Area/Alaska Native Area/Hawaiian Home Land",
    "155": "State-Place-County",
    "157": "State-Place (no CDPs)-County",
    "158": "State-Place-County-Census Tract",
    "160": "State-Place",
    "161": "State-Place (10,000+)",
    "162": "State-Place (no CDPs)",
    "170": "State-Consolidated City",
    "172": "State-Consolidated City-Place Within Consolidated City",
    "200": "American Indian Reservation with Trust Lands",
    "201": "American Indian Reservation with Trust Lands: Reservation Only",
    "202": "American Indian Reservations with Trust Lands: Trust Lands Only",
    "203": "American Indian Reservation No Trust Lands/Tribal Jurisdiction Sa/Etc",
    "204": "American Indian Trust Lands (With No Reservation)",
    "205": "American Indian Reservation with Trust Lands: Reservation Only-State",
    "206": "American Indian Reservation with Trust Lands: Trust Lands Only-State",
    "207": "American Indian Reservation No Trust Lands/Tribal Jurisdiction Sa/Etc-State",
    "208": "American Indian Trust Lands (With No Reservation)-State",
    "210": "State-American Indian Reservation",
    "211": "State-American Indian Reservation Only",
    "212": "State-American Indian Reservation Trust Land Only",
    "215": "State-American Indian Reservation Jurisdiction",
    "216": "State-American Indian Trust Lands",
    "220": "American Indian Reservation Jurisdiction-Co",
    "221": "American Indian Trust Lands Only-Co",
    "230": "State-Alaska Native Regional Corporation",
    "250": "American Indian Area/Alaska Native Area/Hawaiian Home Land",
    "251": "American Indian Area/Alaska Native Area/Hawaiian Home Land-Tribal Subdivision/Remainder",
    "252": "American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)",
    "253": "American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-Tribal Subdivision/Remainder",
    "254": "American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land",
    "255": "American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-Tribal Subdivision/Remainder",
    "256": "Specified American Indian Area-Tribal Census Tract",
    "257": "Specified American Indian Area-Tribal Subdivision/Remainder-Tribal Census Tract",
    "258": "Specified American Indian Area-Tribal Census Tract-Tribal Block Group",
    "259": "Specified American Indian Area-Tribal Subdivision/Remainder-Tribal Census Tract-Tribal Block Group",
    "260": "American Indian Area/Alaska Native Area/Hawaiian Home Land-State",
    "261": "State-American Indian Area/Alaska Native Area/Hawaiian Home Land-County-County Subdivision",
    "262": "American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-State",
    "263": "State-American Indian Area/Alaska Native Area/Hawaiian Home Land-County-County Subdivision-Place/Remainder",
    "264": "American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-State",
    "265": "State-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-County-County Subdivision",
    "266": "State-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-County-County Subdivision-Place/Remainder",
    "267": "State-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-County-County Subdivision",
    "268": "State-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-County-County Subdivision-Place/Remainder",
    "269": "American Indian Area/Alaska Native Area/Hawaiian Home Land-Place-Remainder",
    "270": "American Indian Area/Alaska Native Area/Hawaiian Home Land-State-County",
    "271": "American Indian Area/Alaska Native Area/Hawaiian Home Land-State-County-County Subdivision",
    "272": "American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-State-County",
    "273": "American Indian Area/Alaska Native Area/Hawaiian Home Land-State-County-County Subdivision-Place/Remainder",
    "274": "American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-State-County",
    "275": "American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-State-County-County Subdivision",
    "276": "American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-State-County-County Subdivision-Place/Remainder",
    "277": "American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-State-County-County Subdivision",
    "278": "American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-State-County-County Subdivision-Place/Remainder",
    "280": "State-American Indian Area/Alaska Native Area/Hawaiian Home Land",
    "281": "State-American Indian Area/Alaska Native Area/Hawaiian Home Land-Tribal Subdivision/Remainder",
    "282": "State-American Indian Area/Alaska Native Area/Hawaiian Home Land-County",
    "283": "State-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)",
    "284": "State-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-Tribal Subdivision/Remainder",
    "285": "State-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-County",
    "286": "State-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land",
    "287": "State-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-Tribal Subdivision/Remainder",
    "288": "State-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-County",
    "290": "American Indian Area/Alaska Native Area/Hawaiian Home Land-Tribal Subdivision/Remainder-State",
    "291": "Specified American Indian Area (Reservation Only)-Tribal Census Tract",
    "292": "Specified American Indian Area (Off-Reservation Trust Land Only)-Tribal Census Tract",
    "293": "Specified American Indian Area (Reservation Only)-Tribal Census Tract-Tribal Block Group",
    "294": "Specified American Indian Area (Off-Reservation Trust Land Only)-Tribal Census Tract-Tribal Block Group",
    "300": "Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)",
    "301": "Primary Metropolitan Statistical Area",
    "310": "Core Based Statistical Area (CBSA)",
    "311": "Core Based Statistical Area (CBSA)-State",
    "312": "Core Based Statistical Area (CBSA)-State-Principal City",
    "313": "Core Based Statistical Area (CBSA)-State-County",
    "314": "Metropolitan Statistical Area (MSA)/Metropolitan Division",
    "315": "Metropolitan Statistical Area (MSA)/Metropolitan Division-State",
    "316": "Metropolitan Statistical Area (MSA)/Metropolitan Division-State-County",
    "319": "State-Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)",
    "320": "State-Core Based Statistical Area (CBSA)",
    "321": "State-Core Based Statistical Area (CBSA)-Principal City",
    "322": "State-Core Based Statistical Area (CBSA)-County",
    "323": "State-Metropolitan Statistical Area (MSA)/Metropolitan Division",
    "324": "State-Metropolitan Statistical Area (MSA)/Metropolitan Division-County",
    "329": "Metropolitan Statistical Area (MSA) (no CMSAs)-State-County",
    "330": "Combined Statistical Area (CSA)",
    "331": "Combined Statistical Area (CSA)-State",
    "332": "Combined Statistical Area (CSA)-Core Based Statistical Area (CBSA)",
    "333": "Combined Statistical Area (CSA)-Core Based Statistical Area (CBSA)-State",
    "335": "Combined New England City and Town Area",
    "336": "Combined New England City and Town Area-State",
    "337": "Combined New England City and Town Area-New England City and Town Area (NECTA)",
    "338": "Combined New England City and Town Area-New England City and Town Area (NECTA)-State",
    "340": "State-Combined Statistical Area (CSA)",
    "341": "State-Combined Statistical Area (CSA)-Core Based Statistical Area (CBSA)",
    "345": "State-Combined New England City and Town Area",
    "346": "State-Combined New England City and Town Area-New England City and Town Area",
    "350": "New England City and Town Area (NECTA)",
    "351": "New England City and Town Area (NECTA)-State",
    "352": "New England City and Town Area (NECTA)-State-Principal City",
    "353": "New England City and Town Area (NECTA)-State-County",
    "354": "New England City and Town Area (NECTA)-State-County-County Subdivision",
    "355": "New England City and Town Area (NECTA)-NECTA Division",
    "356": "New England City and Town Area (NECTA)-NECTA Division-State",
    "357": "New England City and Town Area (NECTA)-NECTA Division-State-County",
    "358": "New England City and Town Area (NECTA)-NECTA Division-State-County-County Subdivision",
    "360": "State-New England City and Town Area (NECTA)",
    "361": "State-New England City and Town Area (NECTA)-Principal City",
    "362": "State-New England City and Town Area (NECTA)-County",
    "363": "State-New England City and Town Area (NECTA)-County-County Subdivision",
    "364": "State-New England City and Town Area (NECTA)-NECTA Division",
    "365": "State-New England City and Town Area (NECTA)-NECTA Division-County",
    "366": "State-New England City and Town Area (NECTA)-NECTA Division-County-County Subdivision",
    "370": "New England County Metropolitan Area",
    "371": "New England County Metropolitan Area-State",
    "372": "New England County Metropolitan Area-State-Central City",
    "373": "New England County Metropolitan Area-State-County",
    "374": "State-New England County Metropolitan Area",
    "375": "State-New England County Metropolitan Area-Central City",
    "376": "State-New England County Metropolitan Area-County",
    "380": "Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)",
    "381": "Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-State",
    "382": "Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-State-Central City",
    "383": "Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-State-County",
    "384": "Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-State (New England only)-County-County Subdivision",
    "385": "Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area",
    "386": "Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area-State",
    "387": "Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area-State-County",
    "388": "Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area-State (New England only)-County-County Subdivision",
    "390": "State-Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)",
    "391": "State-Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-Central City",
    "392": "State-Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-County",
    "393": "State (New England only)-Metropolitan Statistical Area (MSA)/Consolidated Metropolitan Statistical Area (CMSA)-County-County Subdivision",
    "395": "State-Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area",
    "396": "State-Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area-County",
    "397": "State (New England only)-Consolidated Metropolitan Statistical Area (CMSA)-Primary Metropolitan Statistical Area-County-County Subdivision",
    "400": "Urban Area",
    "410": "Urban Area-State",
    "420": "State-Urban Area",
    "430": "Urban Area-State-County",
    "431": "State-Urban Area-County",
    "440": "Urban Area-State-County-County Subdivision",
    "441": "State-Urban Area-County-County Subdivision",
    "450": "Urban Area-State-County-County Subdivision-Place/Remainder",
    "451": "State-Urban Area-County-County Subdivision-Place/Remainder",
    "460": "Urban Area-State-Central Place",
    "461": "State-Urban Area-Central Place",
    "462": "Urban Area-State-Consolidated City",
    "463": "State-Urban Area-Consolidated City",
    "464": "Urban Area-State-Consolidated City-Place Within Consolidated City",
    "465": "State-Urban Area-Consolidated City-Place Within Consolidated City",
    "500": "State-Congressional District",
    "510": "State-Congressional District-County",
    "511": "State-Congressional District-County-Census Tract",
    "521": "State-Congressional District-County-County Subdivision",
    "531": "State-Congressional District-Place/Remainder",
    "541": "State-Congressional District-Consolidated City",
    "542": "State-Congressional District-Consolidated City-Place Within Consolidated City",
    "550": "State-Congressional District-American Indian Area/Alaska Native Area/Hawaiian Home Land",
    "551": "State-Congressional District-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)",
    "552": "State-Congressional District-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land",
    "553": "State-Congressional District-American Indian Area/Alaska Native Area/Hawaiian Home Land-Tribal Subdivision/Remainder",
    "554": "State-Congressional District-American Indian Area/Alaska Native Area (Reservation or Statistical Entity Only)-Tribal Subdivision/Remainder",
    "555": "State-Congressional District-American Indian Area (Off-Reservation Trust Land Only)/Hawaiian Home Land-Tribal Subdivision/Remainder",
    "560": "State-Congressional District-Alaska Native Regional Corporation",
    "610": "State Senate District",
    "612": "State Senate District-County",
    "613": "State Senate District-County-Minor Civil Division (MCD)-Place",
    "614": "State Senate District-Place",
    "620": "State House District",
    "622": "State House District-County",
    "623": "State House District-County-Minor Civil Division (MCD)-Place",
    "624": "State House District-Place",
    "700": "Voting Tabulation District (VTD)",
    "740": "Block Group [split by Voting Tabulation District (VTD), Minor Civil Division (MCD), and Place]",
    "750": "Census Block (pl94 files)",
    "795": "State-Public Use Microdata Sample Area (PUMA)",
    "850": "3-digit ZIP Code Tabulation Area (ZCTA3)",
    "851": "State-3-digit ZIP Code Tabulation Area (ZCTA3)",
    "852": "State-3-digit ZIP Code Tabulation Area (ZCTA3)-County",
    "860": "5-digit ZIP Code Tabulation Area (ZCTA5)",
    "870": "5-digit ZIP Code Tabulation Area (ZCTA5)-State",
    "871": "State-5-digit ZIP Code Tabulation Area (ZCTA5)",
    "880": "5-digit ZIP Code Tabulation Area (ZCTA5)-County",
    "881": "State-5-digit ZIP Code Tabulation Area (ZCTA5)-County",
    "901": "County Set",
    "930": "Metropolitan Planning Organization Region (CTPP)",
    "935": "State-County-Combined Zone (CTPP)",
    "940": "State-County-Traffic Analysis Zone (CTPP)",
    "950": "State-School District (Elementary)",
    "960": "State-School District (Secondary)",
    "970": "State-School District (Unified)",
}
def lookup_description(code):
    """Translate a summary-level code into its description.

    Args:
        code: Value used as a key into ``geo_mapping``.

    Returns:
        The description stored in ``geo_mapping`` for ``code``, or the
        string "Unknown" when ``code`` has no entry.
    """
    if code in geo_mapping:
        return geo_mapping[code]
    return "Unknown"


# Spark UDF wrapper around lookup_description; yields a string column.
lookup_udf = udf(lookup_description, returnType=StringType())

In [0]:
# Pipe-delimited geography file that accompanies the ACS tables.
filepath_geos = "/Volumes/mimi_ws_1/census/src/acs/downloads/Geos20235YR.txt"

# Subset of the geography file's columns that is kept (names as they appear
# in the file header; they are lower-cased on the way out).
geo_key_columns = [
    'GEO_ID',      # Primary key for joining
    'NAME',        # Human-readable name (e.g., "Census Tract 9501, Autauga County, Alabama")
    'STUSAB',      # State abbreviation (e.g., "AL", "CA")
    'SUMLEVEL',    # Geographic level (140=tract, 150=block group, 050=county)
    'STATE',       # State FIPS code
    'COUNTY',      # County FIPS code
    'TRACT',       # Census tract (if applicable)
    'BLKGRP',      # Block group (if applicable)
    'PLACE',       # Place/city FIPS code (if applicable)
    'ZCTA5'        # ZIP code tabulation area (if applicable)
]

# inferSchema=False keeps every field as a string, so zero-padded codes such
# as sumlevel "040" still match the string keys used by the lookup UDF.
# The select/alias is done in one projection via a comprehension so that no
# loop variable (previously named `col`) is left behind in the notebook
# namespace, where it could shadow pyspark's `col` function.
df = (
    spark.read.csv(filepath_geos, header=True, sep='|', inferSchema=False)
    .select(*[F.col(name).alias(name.lower()) for name in geo_key_columns])
    # Attach the human-readable description of each summary level.
    .withColumn("sumlevel_desc", lookup_udf(F.col("sumlevel")))
)

In [0]:
# =============================================================================
# ACS VARIABLES WITH CORRECT FORMAT (B01001_E001 instead of B01001_001E)
# =============================================================================

acs_variables = {
    # =============================================================================
    # INCOME & ECONOMIC STATUS
    # =============================================================================
    "B19013": {
        "median_household_income": clean_acs_value(F.col("B19013_E001"))
    },
    
    "B19301": {
        "per_capita_income": clean_acs_value(F.col("B19301_E001"))
    },
    
    "B17001": {
        "poverty_rate_all_ages": validate_rate(
            safe_divide("B17001_E002", "B17001_E001")
        ),
        "child_poverty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B17001_E004", "B17001_E005", "B17001_E006",  # Male age groups
                    "B17001_E007", "B17001_E008", "B17001_E009",
                    "B17001_E018", "B17001_E019", "B17001_E020",  # Female age groups
                    "B17001_E021", "B17001_E022", "B17001_E023"
                ]),
                safe_column_sum(["B17001_E003", "B17001_E017"])  # Total male + female under 18
            )
        )
    },
    
    "B19083": {
        "gini_index_income_inequality": F.col("B19083_E001")
    },
    
    "B19057": {
        "public_assistance_rate": validate_rate(
            safe_divide("B19057_E002", "B19057_E001")
        )
    },
    
    "B22003": {
        "snap_recipients_rate": validate_rate(
            safe_divide("B22003_E002", "B22003_E001")
        )
    },
    
    "B23025": {
        "unemployment_rate": validate_rate(
            safe_divide("B23025_E005", "B23025_E002")
        ),
        "employment_rate": validate_rate(
            safe_divide("B23025_E004", "B23025_E002")
        ),
        "labor_force_participation_rate": validate_rate(
            safe_divide("B23025_E002", "B23025_E001")
        )
    },
    
    # =============================================================================
    # EDUCATION
    # =============================================================================
    "B15003": {
        "less_than_high_school_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B15003_E002", "B15003_E003", "B15003_E004", "B15003_E005",
                    "B15003_E006", "B15003_E007", "B15003_E008", "B15003_E009",
                    "B15003_E010", "B15003_E011", "B15003_E012", "B15003_E013",
                    "B15003_E014", "B15003_E015", "B15003_E016"
                ]),
                F.col("B15003_E001")
            )
        ),
        "high_school_graduate_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B15003_E017", "B15003_E018"]),
                F.col("B15003_E001")
            )
        ),
        "some_college_or_associates_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B15003_E019", "B15003_E020", "B15003_E021"]),
                F.col("B15003_E001")
            )
        ),
        "bachelors_degree_or_higher_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B15003_E022", "B15003_E023", "B15003_E024", "B15003_E025"
                ]),
                F.col("B15003_E001")
            )
        ),
        "graduate_degree_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B15003_E023", "B15003_E024", "B15003_E025"]),
                F.col("B15003_E001")
            )
        )
    },
    
    # =============================================================================
    # HOUSING
    # =============================================================================
    "B25077": {
        "median_home_value": clean_acs_value(F.col("B25077_E001"))
    },
    
    "B25064": {
        "median_gross_rent": clean_acs_value(F.col("B25064_E001"))
    },
    
    "B25003": {
        "owner_occupied_rate": validate_rate(
            safe_divide("B25003_E002", "B25003_E001")
        ),
        "renter_occupied_rate": validate_rate(
            safe_divide("B25003_E003", "B25003_E001")
        )
    },
    
    "B25002": {
        "vacant_housing_rate": validate_rate(
            safe_divide("B25002_E003", "B25002_E001")
        )
    },
    
    "B25070": {
        "rent_burden_over_30pct_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B25070_E007", "B25070_E008", "B25070_E009", "B25070_E010"
                ]),
                F.col("B25070_E001")
            )
        ),
        "severe_rent_burden_over_50pct_rate": validate_rate(
            safe_divide("B25070_E010", "B25070_E001")
        )
    },
    
    "B25014": {
        "overcrowded_housing_rate_households": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B25014_E005", "B25014_E006", "B25014_E007",
                    "B25014_E011", "B25014_E012", "B25014_E013"
                ]),
                F.col("B25014_E001")
            )
        ),
        "severely_overcrowded_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B25014_E006", "B25014_E007",
                    "B25014_E012", "B25014_E013"
                ]),
                F.col("B25014_E001")
            )
        )
    },
    
    # =============================================================================
    # DEMOGRAPHICS & AGE
    # =============================================================================
    "B01003": {
        "total_population": F.col("B01003_E001")
    },
    
    "B01001": {
        "age_under_18_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B01001_E003", "B01001_E004", "B01001_E005", "B01001_E006",
                    "B01001_E027", "B01001_E028", "B01001_E029", "B01001_E030"
                ]),
                F.col("B01001_E001")
            )
        ),
        "age_65_and_over_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B01001_E020", "B01001_E021", "B01001_E022",
                    "B01001_E023", "B01001_E024", "B01001_E025",
                    "B01001_E044", "B01001_E045", "B01001_E046",
                    "B01001_E047", "B01001_E048", "B01001_E049"
                ]),
                F.col("B01001_E001")
            )
        ),
        "working_age_18_64_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B01001_E007", "B01001_E008", "B01001_E009", "B01001_E010",
                    "B01001_E011", "B01001_E012", "B01001_E013", "B01001_E014",
                    "B01001_E015", "B01001_E016", "B01001_E017", "B01001_E018", "B01001_E019",
                    "B01001_E031", "B01001_E032", "B01001_E033", "B01001_E034",
                    "B01001_E035", "B01001_E036", "B01001_E037", "B01001_E038",
                    "B01001_E039", "B01001_E040", "B01001_E041", "B01001_E042", "B01001_E043"
                ]),
                F.col("B01001_E001")
            )
        )
    },
    
    "B01002": {
        "median_age": clean_acs_value(F.col("B01002_E001")),
        "median_age_male": clean_acs_value(F.col("B01002_E002")),
        "median_age_female": clean_acs_value(F.col("B01002_E003"))
    },
    
    # =============================================================================
    # RACE & ETHNICITY
    # =============================================================================
    "B02001": {
        "white_alone_rate": validate_rate(
            safe_divide("B02001_E002", "B02001_E001")
        ),
        "black_african_american_rate": validate_rate(
            safe_divide("B02001_E003", "B02001_E001")
        ),
        "american_indian_alaska_native_rate": validate_rate(
            safe_divide("B02001_E004", "B02001_E001")
        ),
        "asian_rate": validate_rate(
            safe_divide("B02001_E005", "B02001_E001")
        ),
        "native_hawaiian_pacific_islander_rate": validate_rate(
            safe_divide("B02001_E006", "B02001_E001")
        ),
        "other_race_rate": validate_rate(
            safe_divide("B02001_E007", "B02001_E001")
        ),
        "two_or_more_races_rate": validate_rate(
            safe_divide("B02001_E008", "B02001_E001")
        )
    },
    
    "B03003": {
        "hispanic_latino_rate": validate_rate(
            safe_divide("B03003_E003", "B03003_E001")
        )
    },
    
    # =============================================================================
    # IMMIGRATION & LANGUAGE
    # =============================================================================
    "B05002": {
        "foreign_born_rate": validate_rate(
            safe_divide("B05002_E013", "B05002_E001")
        ),
        "naturalized_citizen_rate": validate_rate(
            safe_divide("B05002_E014", "B05002_E001")
        ),
        "not_us_citizen_rate": validate_rate(
            safe_divide("B05002_E021", "B05002_E001")
        )
    },
    
    "B16001": {
        "speaks_language_other_than_english_rate": validate_rate(
            safe_divide(
                F.col("B16001_E001") - F.col("B16001_E002"),
                F.col("B16001_E001")
            )
        )
    },
    
    "B16004": {
        "limited_english_proficiency_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B16004_E007", "B16004_E008", "B16004_E012", "B16004_E013",
                    "B16004_E017", "B16004_E018", "B16004_E022", "B16004_E023",
                    "B16004_E029", "B16004_E030", "B16004_E034", "B16004_E035",
                    "B16004_E039", "B16004_E040", "B16004_E044", "B16004_E045",
                    "B16004_E051", "B16004_E052", "B16004_E056", "B16004_E057",
                    "B16004_E061", "B16004_E062", "B16004_E066", "B16004_E067"
                ]),
                F.col("B16004_E001")
            )
        )
    },
    
    # =============================================================================
    # FAMILY STRUCTURE
    # =============================================================================
    "B11001": {
        "family_households_rate": validate_rate(
            safe_divide("B11001_E002", "B11001_E001")
        ),
        "married_couple_families_rate": validate_rate(
            safe_divide("B11001_E003", "B11001_E001")
        ),
        "single_person_households_rate": validate_rate(
            safe_divide("B11001_E008", "B11001_E001")
        )
    },
    
    "B11005": {
        "female_headed_households_with_children_rate": validate_rate(
            safe_divide("B11005_E007", "B11005_E001")
        ),
        "male_headed_households_with_children_rate": validate_rate(
            safe_divide("B11005_E005", "B11005_E001")
        )
    },
    
    # =============================================================================
    # EMPLOYMENT & OCCUPATION
    # =============================================================================
    "C24010": {
        "management_business_science_arts_rate": validate_rate(
            safe_divide(
                safe_column_sum(["C24010_E003", "C24010_E039"]),
                F.col("C24010_E001")
            )
        )
    },
    
    "C24030": {
        "manufacturing_employment_rate": validate_rate(
            safe_divide(
                safe_column_sum(["C24030_E007", "C24030_E034"]),
                F.col("C24030_E001")
            )
        ),
        "retail_trade_rate": validate_rate(
            safe_divide(
                safe_column_sum(["C24030_E009", "C24030_E036"]),
                F.col("C24030_E001")
            )
        ),
        "healthcare_social_assistance_rate": validate_rate(
            safe_divide(
                safe_column_sum(["C24030_E018", "C24030_E045"]),
                F.col("C24030_E001")
            )
        )
    },
    
    # =============================================================================
    # TRANSPORTATION
    # =============================================================================
    "B08303": {
        "commute_over_30_minutes_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B08303_E008", "B08303_E009", "B08303_E010",
                    "B08303_E011", "B08303_E012", "B08303_E013"
                ]),
                F.col("B08303_E001")
            )
        ),
        "commute_over_60_minutes_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B08303_E012", "B08303_E013"]),
                F.col("B08303_E001")
            )
        )
    },
    
    "B08301": {
        "drove_alone_to_work_rate": validate_rate(
            safe_divide("B08301_E003", "B08301_E001")
        ),
        "carpooled_to_work_rate": validate_rate(
            safe_divide("B08301_E004", "B08301_E001")
        ),
        "public_transit_use_rate": validate_rate(
            safe_divide("B08301_E010", "B08301_E001")
        ),
        "walked_to_work_rate": validate_rate(
            safe_divide("B08301_E019", "B08301_E001")
        ),
        "worked_from_home_rate": validate_rate(
            safe_divide("B08301_E021", "B08301_E001")
        )
    },
    
    "B25044": {
        "no_vehicle_households_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B25044_E003", "B25044_E010"]),
                F.col("B25044_E001")
            )
        )
    },
    
    # =============================================================================
    # HEALTH & DISABILITY
    # =============================================================================
    "B18101": {
        "disability_rate_all_ages": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B18101_E004", "B18101_E007", "B18101_E010",
                    "B18101_E013", "B18101_E016", "B18101_E019",
                    "B18101_E023", "B18101_E026", "B18101_E029",
                    "B18101_E032", "B18101_E035", "B18101_E038"
                ]),
                F.col("B18101_E001")
            )
        )
    },
    
    "B27001": {
        "no_health_insurance_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B27001_E005", "B27001_E008", "B27001_E011", "B27001_E014",
                    "B27001_E017", "B27001_E020", "B27001_E023", "B27001_E026", "B27001_E029",
                    "B27001_E033", "B27001_E036", "B27001_E039", "B27001_E042",
                    "B27001_E045", "B27001_E048", "B27001_E051", "B27001_E054", "B27001_E057"
                ]),
                F.col("B27001_E001")
            )
        )
    },
    
    # =============================================================================
    # TECHNOLOGY ACCESS
    # =============================================================================
    "B28002": {
        "broadband_internet_rate": validate_rate(
            safe_divide("B28002_E004", "B28002_E001")
        ),
        "no_internet_rate": validate_rate(
            safe_divide("B28002_E013", "B28002_E001")
        )
    },
    
    "B28001": {
        "has_computer_rate": validate_rate(
            safe_divide(
                F.col("B28001_E001") - F.col("B28001_E011"),
                F.col("B28001_E001")
            )
        ),
        "no_computer_rate": validate_rate(
            safe_divide("B28001_E011", "B28001_E001")
        )
    },
    
    # =============================================================================
    # ADDITIONAL HOUSING CHARACTERISTICS
    # =============================================================================
    "B25024": {
        "single_family_detached_rate": validate_rate(
            safe_divide("B25024_E002", "B25024_E001")
        ),
        "mobile_homes_rate": validate_rate(
            safe_divide("B25024_E010", "B25024_E001")
        ),
        "apartment_5plus_units_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B25024_E007", "B25024_E008", "B25024_E009"]),
                F.col("B25024_E001")
            )
        )
    },
    
    "B25034": {
        "built_before_1980_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B25034_E008", "B25034_E009", "B25034_E010", "B25034_E011"
                ]),
                F.col("B25034_E001")
            )
        ),
        "built_2000_or_later_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B25034_E002", "B25034_E003", "B25034_E004"]),
                F.col("B25034_E001")
            )
        )
    },
    
    # =============================================================================
    # ADDITIONAL DEMOGRAPHICS
    # =============================================================================
    "B21001": {
        "veterans_rate": validate_rate(
            safe_divide("B21001_E002", "B21001_E001")
        )
    },
    
    "B10051": {
        "grandparents_responsible_for_grandchildren_rate": validate_rate(
            safe_divide("B10051_E002", "B10051_E001")
        )
    },
    
    "B07003": {
        "moved_in_past_year_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B07003_E007", "B07003_E010", "B07003_E013", "B07003_E016"]),
                # Different house in same county + Different county + Different state + Abroad
                F.col("B07003_E001")
            )
        ),
        "moved_from_different_state_rate": validate_rate(
            safe_divide("B07003_E013", "B07003_E001")
        )
    },

    # =============================================================================
    # HEALTHCARE INSURANCE COVERAGE
    # =============================================================================
    "B27010": {
        "medicare_coverage_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Under 19 - single and multiple coverage types
                    "B27010_E006",  # Medicare only
                    "B27010_E012",  # Employer + Medicare
                    "B27010_E013",  # Medicare + Medicaid
                    # 19-34 - single and multiple coverage types
                    "B27010_E022",  # Medicare only
                    "B27010_E028",  # Employer + Medicare
                    "B27010_E029",  # Medicare + Medicaid
                    # 35-64 - single and multiple coverage types
                    "B27010_E038",  # Medicare only
                    "B27010_E044",  # Employer + Medicare
                    "B27010_E045",  # Direct purchase + Medicare
                    "B27010_E046",  # Medicare + Medicaid
                    # 65+ - single and multiple coverage types
                    "B27010_E055",  # Medicare only
                    "B27010_E060",  # Employer + Medicare
                    "B27010_E061",  # Direct purchase + Medicare
                    "B27010_E062",  # Medicare + Medicaid (dual eligible)
                    # Note: E063-E065 may contain some Medicare but mixed with others
                ]),
                F.col("B27010_E001")
            )
        ),
        
        "medicaid_coverage_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Under 19
                    "B27010_E007",  # Medicaid only
                    "B27010_E013",  # Medicare + Medicaid
                    # 19-34
                    "B27010_E023",  # Medicaid only
                    "B27010_E029",  # Medicare + Medicaid
                    # 35-64
                    "B27010_E039",  # Medicaid only
                    "B27010_E046",  # Medicare + Medicaid
                    # 65+ (Note: No Medicaid-only column for 65+)
                    "B27010_E062",  # Medicare + Medicaid
                ]),
                F.col("B27010_E001")
            )
        ),
        
        # Specifically for Medicare Advantage analysis
        "medicare_65plus_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B27010_E055",  # Medicare only
                    "B27010_E060",  # Employer + Medicare
                    "B27010_E061",  # Direct purchase + Medicare
                    "B27010_E062",  # Medicare + Medicaid
                ]),
                F.col("B27010_E051")  # Total 65+ population
            )
        ),
        
        # Dual eligible (Medicare + Medicaid) for all ages
        "dual_eligible_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B27010_E013",  # Under 19
                    "B27010_E029",  # 19-34
                    "B27010_E046",  # 35-64
                    "B27010_E062",  # 65+
                ]),
                F.col("B27010_E001")
            )
        ),
    },

    "B27022": {
        "no_insurance_55_64_rate": validate_rate(
            safe_divide("B27022_E008", "B27022_E002")
        )
    },

    "B18135": {
        # Public coverage for disabled 19-64 (includes Medicare for SSDI recipients)
        "public_insurance_19_64_disability_rate": validate_rate(
            safe_divide("B18135_E017", "B18135_E014")
        ),
        
        # Uninsured disabled 19-64
        "uninsured_19_64_disability_rate": validate_rate(
            safe_divide("B18135_E018", "B18135_E014")
        ),
        
        # Disability rate among 19-64
        "disability_rate_19_64": validate_rate(
            safe_divide("B18135_E014", "B18135_E013")
        ),
        
        # For 65+ with disability
        "public_coverage_65plus_disability_rate": validate_rate(
            safe_divide("B18135_E028", "B18135_E025")  # Most 65+ public is Medicare
        ),
    },

    # =============================================================================
    # DISABILITY BY TYPE
    # =============================================================================
    "B18102": {
        "hearing_difficulty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B18102_E004", "B18102_E007", "B18102_E010",
                    "B18102_E013", "B18102_E016", "B18102_E019",
                    "B18102_E023", "B18102_E026", "B18102_E029",
                    "B18102_E032", "B18102_E035", "B18102_E038"
                ]),
                F.col("B18102_E001")
            )
        )
    },

    "B18103": {
        "vision_difficulty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B18103_E004", "B18103_E007", "B18103_E010",
                    "B18103_E013", "B18103_E016", "B18103_E019",
                    "B18103_E023", "B18103_E026", "B18103_E029",
                    "B18103_E032", "B18103_E035", "B18103_E038"
                ]),
                F.col("B18103_E001")
            )
        )
    },

    "B18104": {
        "cognitive_difficulty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Male WITH cognitive difficulty
                    "B18104_E004",  # 5 to 17 years
                    "B18104_E007",  # 18 to 34 years
                    "B18104_E010",  # 35 to 64 years
                    "B18104_E013",  # 65 to 74 years
                    "B18104_E016",  # 75 years and over
                    # Female WITH cognitive difficulty
                    "B18104_E020",  # 5 to 17 years
                    "B18104_E023",  # 18 to 34 years
                    "B18104_E026",  # 35 to 64 years
                    "B18104_E029",  # 65 to 74 years
                    "B18104_E032"   # 75 years and over
                ]),
                F.col("B18104_E001")
            )
        )
    },

    "B18105": {
        "ambulatory_difficulty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Male WITH ambulatory difficulty
                    "B18105_E004",  # 5 to 17 years
                    "B18105_E007",  # 18 to 34 years
                    "B18105_E010",  # 35 to 64 years
                    "B18105_E013",  # 65 to 74 years
                    "B18105_E016",  # 75 years and over
                    # Female WITH ambulatory difficulty
                    "B18105_E020",  # 5 to 17 years
                    "B18105_E023",  # 18 to 34 years
                    "B18105_E026",  # 35 to 64 years
                    "B18105_E029",  # 65 to 74 years
                    "B18105_E032"   # 75 years and over
                ]),
                F.col("B18105_E001")
            )
        )
    },

    "B18106": {
        "self_care_difficulty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Male WITH self-care difficulty
                    "B18106_E004", "B18106_E007", "B18106_E010",
                    "B18106_E013", "B18106_E016",
                    # Female WITH self-care difficulty  
                    "B18106_E020", "B18106_E023", "B18106_E026",
                    "B18106_E029", "B18106_E032"
                ]),
                F.col("B18106_E001")
            )
        ),
        "youth_self_care_difficulty_rate": safe_divide(  # Ages 5-17
            safe_column_sum(["B18106_E004", "B18106_E020"]),
            safe_column_sum(["B18106_E003", "B18106_E019"])
        ),

        "working_age_self_care_difficulty_rate": safe_divide(  # Ages 18-64
            safe_column_sum(["B18106_E007", "B18106_E010", "B18106_E023", "B18106_E026"]),
            safe_column_sum(["B18106_E006", "B18106_E009", "B18106_E022", "B18106_E025"])
        ),

        "elderly_self_care_difficulty_rate": safe_divide(  # Ages 65+
            safe_column_sum(["B18106_E013", "B18106_E016", "B18106_E029", "B18106_E032"]),
            safe_column_sum(["B18106_E012", "B18106_E015", "B18106_E028", "B18106_E031"])
        )
    },

    "B18107": {
        "independent_living_difficulty_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Male WITH independent living difficulty (18+)
                    "B18107_E004",  # 18 to 34 years
                    "B18107_E007",  # 35 to 64 years
                    "B18107_E010",  # 65 to 74 years
                    "B18107_E013",  # 75 years and over
                    # Female WITH independent living difficulty (18+)
                    "B18107_E017",  # 18 to 34 years
                    "B18107_E020",  # 35 to 64 years
                    "B18107_E023",  # 65 to 74 years
                    "B18107_E026"   # 75 years and over
                ]),
                F.col("B18107_E001")
            )
        )
    },

    # =============================================================================
    # SENIOR-SPECIFIC DEMOGRAPHICS
    # =============================================================================
    "B17024": {
        "poverty_65_plus_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B17024_E107", "B17024_E108", "B17024_E109",  # 65-74: <50%, 50-74%, 75-99%
                    "B17024_E120", "B17024_E121", "B17024_E122"   # 75+: <50%, 50-74%, 75-99%
                ]),
                safe_column_sum(["B17024_E106", "B17024_E119"])  # Total 65-74 + 75+
            )
        ),
        
        "near_poverty_65_plus_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    # Below 125% of poverty (includes below 100%)
                    "B17024_E107", "B17024_E108", "B17024_E109", "B17024_E110",  # 65-74
                    "B17024_E120", "B17024_E121", "B17024_E122", "B17024_E123"   # 75+
                ]),
                safe_column_sum(["B17024_E106", "B17024_E119"])
            )
        ),
        
        "severe_poverty_65_plus_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B17024_E107", "B17024_E120"]),  # Below 50% only
                safe_column_sum(["B17024_E106", "B17024_E119"])
            )
        ),
        
        # Additional: 100-199% of poverty (near-poor but above poverty line)
        "low_income_not_poverty_65_plus_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B17024_E110", "B17024_E111", "B17024_E112", "B17024_E114",  # 65-74: 100-199%
                    "B17024_E123", "B17024_E124", "B17024_E125", "B17024_E127"   # 75+: 100-199%
                ]),
                safe_column_sum(["B17024_E106", "B17024_E119"])
            )
        )
    },

    # =============================================================================
    # SENIOR INCOME SOURCES
    # =============================================================================
    "B19055": {
        "social_security_income_rate": validate_rate(
            safe_divide("B19055_E002", "B19055_E001")
        ),
        "mean_social_security_income": clean_acs_value(F.col("B19055_E003"))
    },

    "B19059": {
        "retirement_income_rate": validate_rate(
            safe_divide("B19059_E002", "B19059_E001")
        )
    },

    "B19056": {
        "supplemental_security_income_rate": validate_rate(
            safe_divide("B19056_E002", "B19056_E001")
        )
    },

    # =============================================================================
    # LINGUISTIC ISOLATION & HOUSEHOLD COMPOSITION
    # =============================================================================
    "B16005": {
        # Keep only these unique measures:
        "spanish_speakers_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B16005_E004", "B16005_E026"]),
                F.col("B16005_E001")
            )
        ),
        
        # Add if you want more language detail:
        "spanish_limited_english_rate": validate_rate(
            safe_divide(
                safe_column_sum([
                    "B16005_E007", "B16005_E008",  # Native Spanish LEP
                    "B16005_E029", "B16005_E030"   # Foreign born Spanish LEP
                ]),
                F.col("B16005_E001")
            )
        ),
        
        "asian_language_speakers_rate": validate_rate(
            safe_divide(
                safe_column_sum(["B16005_E014", "B16005_E036"]),
                F.col("B16005_E001")
            )
        )
    },

    "B11007": {
        "living_alone_65_plus_rate": validate_rate(
            safe_divide(
                "B11007_E003",  # Households with 65+: 1-person household
                "B11007_E001"  # Total households
            )
        ),
        
        "households_with_elderly_rate": validate_rate(
            safe_divide(
                "B11007_E002",  # Households with one or more people 65+
                "B11007_E001"  # Total households
            )
        ),
        
        "elderly_family_households_rate": validate_rate(
            safe_divide(
                "B11007_E005",  # Households with 65+: 2+ person: Family households
                "B11007_E001"  # Total households
            )
        )
    },

    # =============================================================================
    # HOUSING CONDITIONS (HEALTH-RELEVANT)
    # =============================================================================
    "B25047": {
        "lacking_complete_plumbing_rate": validate_rate(
            safe_divide("B25047_E003", "B25047_E001")
        )
    },

    "B25051": {
        "lacking_complete_kitchen_rate": validate_rate(
            safe_divide("B25051_E003", "B25051_E001")
        )
    }
}

In [0]:
def _is_acs_value_column(column_name):
    """Decide whether a column read from an ACS table file should be cast to Double.

    Args:
        column_name: Column name exactly as it appears in the file header.

    Returns:
        False for any column whose upper-cased name contains a geographic /
        identifier marker (GEO, NAME, STATE, ...), so those stay as strings.
        Otherwise True when the name contains one of the known estimate/margin
        suffixes, or when it ends in 'E' or 'M' followed by three digits.
    """
    name_upper = column_name.upper()
    identifier_markers = ('GEO', 'NAME', 'STATE', 'COUNTY',
                          'TRACT', 'PLACE', 'LOGRECNO', 'SUMLEVEL')
    # Identifier check takes precedence over the suffix checks below.
    if any(marker in name_upper for marker in identifier_markers):
        return False
    if any(suffix in name_upper
           for suffix in ('_E001', '_E002', '_E003', '_M001', '_M002')):
        return True
    return (len(column_name) > 6
            and column_name[-4] in ('E', 'M')
            and column_name[-3:].isdigit())


# Build one DataFrame per ACS table listed in `acs_variables`, keyed by table id.
# Each frame holds only `geo_id` plus the derived measures for that table.
df_coll = {}
for filepath in Path(volumepath).glob("*.dat"):
    # The table id is the last '-'-separated piece of the file stem, upper-cased.
    table_key = filepath.stem.split('-')[-1].upper()

    if table_key not in acs_variables:
        continue

    # Read everything as strings first (most reliable for ACS data)
    df_tab = spark.read.csv(
        str(filepath),
        sep="|",
        header=True,
        inferSchema=False,  # Everything as strings
        nullValue="",
        nanValue="null"
    )

    # Cast every ACS data column to Double in ONE projection. Calling
    # withColumn once per column adds a projection per call, which makes the
    # query plan balloon on wide tables; a single select keeps the same
    # column names and order without that cost.
    df_tab = df_tab.select([
        F.col(c).cast(DoubleType()).alias(c) if _is_acs_value_column(c) else F.col(c)
        for c in df_tab.columns
    ])

    new_columns = []

    # Derived measures are added sequentially so the evaluation order of the
    # expressions in `acs_variables[table_key]` is unchanged.
    for column_name, column_eq in acs_variables[table_key].items():
        df_tab = df_tab.withColumn(column_name, column_eq)
        new_columns.append(column_name)
    df_tab = df_tab.withColumnRenamed('GEO_ID', 'geo_id')
    df_tab = df_tab.select('geo_id', *new_columns)
    df_coll[table_key] = df_tab

In [0]:
# Attach each table's derived measures to `df` with a LEFT join on geo_id:
# every geo_id already in `df` is kept, and a geo_id that appears only in a
# table frame is not added.
for table_key, df_tab in df_coll.items():
    n_measures = len(df_tab.columns) - 1  # all columns except the geo_id join key
    df = df.join(df_tab, on='geo_id', how='left')
    print(f"Added {table_key}: {n_measures} variables")

In [0]:
# Add the composite measures to `df` one at a time, in the mapping's order,
# echoing each measure name just before its column is added.
for measure_name, measure_expr in composite_measure.items():
    print(measure_name)
    df = df.withColumn(measure_name, measure_expr)

In [0]:
# Stamp the provenance columns (source file date, source file name, load date)
# onto `df`, then overwrite the target table with the result.
(
    df
    .withColumn('mimi_src_file_date', F.lit(parse('2023-12-31').date()))
    .withColumn('mimi_src_file_name', F.lit('2023_5YRData.zip'))
    .withColumn('mimi_dlt_load_date', F.lit(datetime.today().date()))
    .write
    .mode('overwrite')
    .saveAsTable('mimi_ws_1.census.acs2023_5yr_sf2')
)

In [0]:
%sql
-- Set the table-level description on the table the preceding cell writes
-- (acs2023_5yr_sf2), so the comment lands on the data produced by this notebook.
COMMENT ON TABLE mimi_ws_1.census.acs2023_5yr_sf2 IS '# [American Community Survey (ACS) 5 Year Estimates Summary File](https://www2.census.gov/programs-surveys/acs/summary_file/2023/) - 2019-2023 | interval: snapshot, resolution: geoid'

In [0]:
variables = {
  "median_household_income": "Median household income in dollars from table B19013. Uses B19013_E001 with ACS special values cleaned to null.",
  
  "per_capita_income": "Per capita income in dollars from table B19301. Uses B19301_E001 with ACS special values cleaned to null.",
  
  "poverty_rate_all_ages": "Percentage of population below poverty level from table B17001. Calculated as ratio of people below poverty (B17001_E002) to total population for whom poverty status is determined (B17001_E001).",
  
  "child_poverty_rate": "Percentage of children under 18 below poverty level from table B17001. Calculated as ratio of children below poverty (sum of male and female age groups under 18) to total children under 18.",
  
  "gini_index_income_inequality": "Gini coefficient measuring income inequality from table B19083. Uses B19083_E001 directly.",
  
  "public_assistance_rate": "Percentage of households receiving public assistance income from table B19057. Calculated as ratio of households with public assistance (B19057_E002) to total households (B19057_E001).",
  
  "snap_recipients_rate": "Percentage of households receiving SNAP benefits from table B22003. Calculated as ratio of households with SNAP (B22003_E002) to total households (B22003_E001).",
  
  "unemployment_rate": "Percentage of labor force that is unemployed from table B23025. Calculated as ratio of unemployed (B23025_E005) to labor force (B23025_E002).",
  
  "employment_rate": "Percentage of labor force that is employed from table B23025. Calculated as ratio of employed (B23025_E004) to labor force (B23025_E002).",
  
  "labor_force_participation_rate": "Percentage of population 16+ in labor force from table B23025. Calculated as ratio of labor force (B23025_E002) to population 16+ (B23025_E001).",
  
  "less_than_high_school_rate": "Percentage of population 25+ with less than high school education from table B15003. Calculated as ratio of education levels below high school graduation to total population 25+.",
  
  "high_school_graduate_rate": "Percentage of population 25+ with high school diploma or equivalent from table B15003. Calculated as ratio of high school graduates and equivalency holders to total population 25+.",
  
  "some_college_or_associates_rate": "Percentage of population 25+ with some college or associate's degree from table B15003. Calculated as ratio of some college, associate's degree holders to total population 25+.",
  
  "bachelors_degree_or_higher_rate": "Percentage of population 25+ with bachelor's degree or higher from table B15003. Calculated as ratio of bachelor's, master's, professional, and doctoral degree holders to total population 25+.",
  
  "graduate_degree_rate": "Percentage of population 25+ with graduate or professional degree from table B15003. Calculated as ratio of master's, professional, and doctoral degree holders to total population 25+.",
  
  "median_home_value": "Median value of owner-occupied housing units in dollars from table B25077. Uses B25077_E001 with ACS special values cleaned to null.",
  
  "median_gross_rent": "Median gross rent for renter-occupied units in dollars from table B25064. Uses B25064_E001 with ACS special values cleaned to null.",
  
  "owner_occupied_rate": "Percentage of occupied housing units that are owner-occupied from table B25003. Calculated as ratio of owner-occupied units (B25003_E002) to total occupied units (B25003_E001).",
  
  "renter_occupied_rate": "Percentage of occupied housing units that are renter-occupied from table B25003. Calculated as ratio of renter-occupied units (B25003_E003) to total occupied units (B25003_E001).",
  
  "vacant_housing_rate": "Percentage of housing units that are vacant from table B25002. Calculated as ratio of vacant units (B25002_E003) to total housing units (B25002_E001).",
  
  "rent_burden_over_30pct_rate": "Percentage of renter households paying 30% or more of income for rent from table B25070. Calculated as ratio of households paying 30-34.9%, 35-39.9%, 40-49.9%, and 50%+ to total renter households.",
  
  "severe_rent_burden_over_50pct_rate": "Percentage of renter households paying 50% or more of income for rent from table B25070. Calculated as ratio of households paying 50%+ (B25070_E010) to total renter households (B25070_E001).",
  
  "overcrowded_housing_rate_households": "Percentage of households living in overcrowded conditions (more than 1 person per room) from table B25014. Calculated as ratio of households with 1.01-1.50 and 1.51+ persons per room to total households.",
  
  "severely_overcrowded_rate": "Percentage of households living in severely overcrowded conditions (more than 1.5 persons per room) from table B25014. Calculated as ratio of households with 1.51+ persons per room to total households.",
  
  "total_population": "Total population count from table B01003. Uses B01003_E001 directly.",
  
  "age_under_18_rate": "Percentage of population under 18 years old from table B01001. Calculated as ratio of male and female population under 18 to total population.",
  
  "age_65_and_over_rate": "Percentage of population 65 years and older from table B01001. Calculated as ratio of male and female population 65+ to total population.",
  
  "working_age_18_64_rate": "Percentage of population ages 18-64 from table B01001. Calculated as ratio of male and female population 18-64 to total population.",
  
  "median_age": "Median age of population from table B01002. Uses B01002_E001 with ACS special values cleaned to null.",
  
  "median_age_male": "Median age of male population from table B01002. Uses B01002_E002 with ACS special values cleaned to null.",
  
  "median_age_female": "Median age of female population from table B01002. Uses B01002_E003 with ACS special values cleaned to null.",
  
  "white_alone_rate": "Percentage of population identifying as White alone from table B02001. Calculated as ratio of White alone (B02001_E002) to total population (B02001_E001).",
  
  "black_african_american_rate": "Percentage of population identifying as Black or African American alone from table B02001. Calculated as ratio of Black alone (B02001_E003) to total population (B02001_E001).",
  
  "american_indian_alaska_native_rate": "Percentage of population identifying as American Indian and Alaska Native alone from table B02001. Calculated as ratio of AIAN alone (B02001_E004) to total population (B02001_E001).",
  
  "asian_rate": "Percentage of population identifying as Asian alone from table B02001. Calculated as ratio of Asian alone (B02001_E005) to total population (B02001_E001).",
  
  "native_hawaiian_pacific_islander_rate": "Percentage of population identifying as Native Hawaiian and Other Pacific Islander alone from table B02001. Calculated as ratio of NHOPI alone (B02001_E006) to total population (B02001_E001).",
  
  "other_race_rate": "Percentage of population identifying as some other race alone from table B02001. Calculated as ratio of other race alone (B02001_E007) to total population (B02001_E001).",
  
  "two_or_more_races_rate": "Percentage of population identifying as two or more races from table B02001. Calculated as ratio of two or more races (B02001_E008) to total population (B02001_E001).",
  
  "hispanic_latino_rate": "Percentage of population of Hispanic or Latino origin from table B03003. Calculated as ratio of Hispanic/Latino (B03003_E003) to total population (B03003_E001).",
  
  "foreign_born_rate": "Percentage of population that is foreign-born from table B05002. Calculated as ratio of foreign-born (B05002_E013) to total population (B05002_E001).",
  
  "naturalized_citizen_rate": "Percentage of population that is naturalized U.S. citizens from table B05002. Calculated as ratio of naturalized citizens (B05002_E014) to total population (B05002_E001).",
  
  "not_us_citizen_rate": "Percentage of population that is not a U.S. citizen from table B05002. Calculated as ratio of non-citizens (B05002_E021) to total population (B05002_E001).",
  
  "speaks_language_other_than_english_rate": "Percentage of population 5+ speaking a language other than English at home from table B16001. Calculated as ratio of non-English speakers to total population 5+.",
  
  "limited_english_proficiency_rate": "Percentage of population 5+ with limited English proficiency from table B16004. Calculated as ratio of those speaking English 'not well' or 'not at all' across all language groups to total population 5+.",
  
  "family_households_rate": "Percentage of households that are family households from table B11001. Calculated as ratio of family households (B11001_E002) to total households (B11001_E001).",
  
  "married_couple_families_rate": "Percentage of households that are married-couple families from table B11001. Calculated as ratio of married-couple families (B11001_E003) to total households (B11001_E001).",
  
  "single_person_households_rate": "Percentage of households with only one person from table B11001. Calculated as ratio of 1-person households (B11001_E008) to total households (B11001_E001).",
  
  "female_headed_households_with_children_rate": "Percentage of households that are female-headed with own children under 18 from table B11005. Calculated as ratio of female householder families with children (B11005_E007) to total households (B11005_E001).",
  
  "male_headed_households_with_children_rate": "Percentage of households that are male-headed with own children under 18 from table B11005. Calculated as ratio of male householder families with children (B11005_E005) to total households (B11005_E001).",
  
  "management_business_science_arts_rate": "Percentage of employed population in management, business, science, and arts occupations from table C24010. Calculated as ratio of male and female workers in these occupations to total employed population.",
  
  "manufacturing_employment_rate": "Percentage of employed population working in manufacturing from table C24030. Calculated as ratio of male and female manufacturing workers to total employed population.",
  
  "retail_trade_rate": "Percentage of employed population working in retail trade from table C24030. Calculated as ratio of male and female retail workers to total employed population.",
  
  "healthcare_social_assistance_rate": "Percentage of employed population working in healthcare and social assistance from table C24030. Calculated as ratio of male and female healthcare/social assistance workers to total employed population.",
  
  "commute_over_30_minutes_rate": "Percentage of workers with commute time over 30 minutes from table B08303. Calculated as ratio of workers commuting 30+ minutes to total workers.",
  
  "commute_over_60_minutes_rate": "Percentage of workers with commute time over 60 minutes from table B08303. Calculated as ratio of workers commuting 60+ minutes to total workers.",
  
  "drove_alone_to_work_rate": "Percentage of workers who drove alone to work from table B08301. Calculated as ratio of workers driving alone (B08301_E003) to total workers (B08301_E001).",
  
  "carpooled_to_work_rate": "Percentage of workers who carpooled to work from table B08301. Calculated as ratio of workers carpooling (B08301_E004) to total workers (B08301_E001).",
  
  "public_transit_use_rate": "Percentage of workers who used public transportation to work from table B08301. Calculated as ratio of public transit users (B08301_E010) to total workers (B08301_E001).",
  
  "walked_to_work_rate": "Percentage of workers who walked to work from table B08301. Calculated as ratio of workers walking (B08301_E019) to total workers (B08301_E001).",
  
  "worked_from_home_rate": "Percentage of workers who worked from home from table B08301. Calculated as ratio of home workers (B08301_E021) to total workers (B08301_E001).",
  
  "no_vehicle_households_rate": "Percentage of households with no vehicle available from table B25044. Calculated as ratio of owner and renter households with no vehicle to total households.",
  
  "disability_rate_all_ages": "Percentage of civilian noninstitutionalized population with a disability from table B18101. Calculated as ratio of population with disability across all age groups to total population.",
  
  "no_health_insurance_rate": "Percentage of civilian noninstitutionalized population without health insurance from table B27001. Calculated as ratio of uninsured across all age groups to total population.",
  
  "broadband_internet_rate": "Percentage of households with broadband internet subscription from table B28002. Calculated as ratio of households with broadband (B28002_E004) to total households (B28002_E001).",
  
  "no_internet_rate": "Percentage of households with no internet access from table B28002. Calculated as ratio of households without internet (B28002_E013) to total households (B28002_E001).",
  
  "has_computer_rate": "Percentage of households with a computer from table B28001. Calculated as ratio of households with computers to total households.",
  
  "no_computer_rate": "Percentage of households without a computer from table B28001. Calculated as ratio of households without computer (B28001_E011) to total households (B28001_E001).",
  
  "single_family_detached_rate": "Percentage of housing units that are single-family detached homes from table B25024. Calculated as ratio of detached units (B25024_E002) to total housing units (B25024_E001).",
  
  "mobile_homes_rate": "Percentage of housing units that are mobile homes from table B25024. Calculated as ratio of mobile homes (B25024_E010) to total housing units (B25024_E001).",
  
  "apartment_5plus_units_rate": "Percentage of housing units in buildings with 5 or more units from table B25024. Calculated as ratio of units in 5-9, 10-19, and 20+ unit buildings to total housing units.",
  
  "built_before_1980_rate": "Percentage of housing units built before 1980 from table B25034. Calculated as ratio of units built 1979 or earlier to total housing units.",
  
  "built_2000_or_later_rate": "Percentage of housing units built in 2000 or later from table B25034. Calculated as ratio of units built 2000-2009, 2010-2013, and 2014+ to total housing units.",
  
  "veterans_rate": "Percentage of civilian population 18+ who are veterans from table B21001. Calculated as ratio of veterans (B21001_E002) to total civilian population 18+ (B21001_E001).",
  
  "grandparents_responsible_for_grandchildren_rate": "Percentage of grandparents responsible for own grandchildren under 18 from table B10051. Calculated as ratio of responsible grandparents (B10051_E002) to total grandparents (B10051_E001).",
  
  "moved_in_past_year_rate": "Percentage of population 1+ who moved in the past year from table B07003. Calculated as ratio of movers (different house, county, state, or abroad) to total population 1+.",
  
  "moved_from_different_state_rate": "Percentage of population 1+ who moved from a different state in the past year from table B07003. Calculated as ratio of different state movers (B07003_E013) to total population 1+ (B07003_E001).",
  
  "medicare_coverage_rate": "Percentage of civilian noninstitutionalized population with Medicare coverage from table B27010. Calculated as ratio of population with Medicare (alone or in combination) across all age groups to total population.",
  
  "medicaid_coverage_rate": "Percentage of civilian noninstitutionalized population with Medicaid coverage from table B27010. Calculated as ratio of population with Medicaid (alone or in combination) across all age groups to total population.",
  
  "medicare_65plus_rate": "Percentage of population 65+ with Medicare coverage from table B27010. Calculated as ratio of 65+ population with Medicare to total 65+ population (B27010_E051).",
  
  "dual_eligible_rate": "Percentage of civilian noninstitutionalized population with both Medicare and Medicaid coverage from table B27010. Calculated as ratio of dual eligible population across all age groups to total population.",
  
  "no_insurance_55_64_rate": "Percentage of population ages 55-64 without health insurance from table B27022. Calculated as ratio of uninsured 55-64 (B27022_E008) to total 55-64 population (B27022_E002).",
  
  "public_insurance_19_64_disability_rate": "Percentage of population 19-64 with disability who have public insurance from table B18135. Calculated as ratio of 19-64 disabled with public coverage (B18135_E017) to total 19-64 disabled (B18135_E014).",
  
  "uninsured_19_64_disability_rate": "Percentage of population 19-64 with disability who are uninsured from table B18135. Calculated as ratio of 19-64 disabled uninsured (B18135_E018) to total 19-64 disabled (B18135_E014).",
  
  "disability_rate_19_64": "Percentage of population 19-64 with a disability from table B18135. Calculated as ratio of 19-64 with disability (B18135_E014) to total 19-64 population (B18135_E013).",
  
  "public_coverage_65plus_disability_rate": "Percentage of population 65+ with disability who have public insurance from table B18135. Calculated as ratio of 65+ disabled with public coverage (B18135_E028) to total 65+ disabled (B18135_E025).",
  
  "hearing_difficulty_rate": "Percentage of civilian noninstitutionalized population with hearing difficulty from table B18102. Calculated as ratio of population with hearing difficulty across all age groups to total population.",
  
  "vision_difficulty_rate": "Percentage of civilian noninstitutionalized population with vision difficulty from table B18103. Calculated as ratio of population with vision difficulty across all age groups to total population.",
  
  "cognitive_difficulty_rate": "Percentage of civilian noninstitutionalized population 5+ with cognitive difficulty from table B18104. Calculated as ratio of male and female population with cognitive difficulty across age groups to total population 5+.",
  
  "ambulatory_difficulty_rate": "Percentage of civilian noninstitutionalized population 5+ with ambulatory difficulty from table B18105. Calculated as ratio of male and female population with ambulatory difficulty across age groups to total population 5+.",
  
  "self_care_difficulty_rate": "Percentage of civilian noninstitutionalized population 5+ with self-care difficulty from table B18106. Calculated as ratio of male and female population with self-care difficulty across age groups to total population 5+.",
  
  "youth_self_care_difficulty_rate": "Percentage of population ages 5-17 with self-care difficulty from table B18106. Calculated as ratio of male and female youth with self-care difficulty to total youth population 5-17.",
  
  "working_age_self_care_difficulty_rate": "Percentage of population ages 18-64 with self-care difficulty from table B18106. Calculated as ratio of male and female working-age population with self-care difficulty to total working-age population 18-64.",
  
  "elderly_self_care_difficulty_rate": "Percentage of population 65+ with self-care difficulty from table B18106. Calculated as ratio of male and female elderly with self-care difficulty to total elderly population 65+.",
  
  "independent_living_difficulty_rate": "Percentage of civilian noninstitutionalized population 18+ with independent living difficulty from table B18107. Calculated as ratio of male and female population 18+ with independent living difficulty to total population 18+.",
  
  "poverty_65_plus_rate": "Percentage of population 65+ below poverty level from table B17024. Calculated as ratio of 65+ population below 100% poverty to total 65+ population.",
  
  "near_poverty_65_plus_rate": "Percentage of population 65+ below 125% of poverty level from table B17024. Calculated as ratio of 65+ population below 125% poverty to total 65+ population.",
  
  "severe_poverty_65_plus_rate": "Percentage of population 65+ below 50% of poverty level from table B17024. Calculated as ratio of 65+ population below 50% poverty to total 65+ population.",
  
  "low_income_not_poverty_65_plus_rate": "Percentage of population 65+ with income 100-199% of poverty level from table B17024. Calculated as ratio of 65+ population at 100-199% poverty to total 65+ population.",
  
  "social_security_income_rate": "Percentage of households with Social Security income from table B19055. Calculated as ratio of households with Social Security (B19055_E002) to total households (B19055_E001).",
  
  "mean_social_security_income": "Mean Social Security income for households receiving Social Security from table B19055. Uses B19055_E003 with ACS special values cleaned to null.",
  
  "retirement_income_rate": "Percentage of households with retirement income from table B19059. Calculated as ratio of households with retirement income (B19059_E002) to total households (B19059_E001).",
  
  "supplemental_security_income_rate": "Percentage of households with Supplemental Security Income (SSI) from table B19056. Calculated as ratio of households with SSI (B19056_E002) to total households (B19056_E001).",
  
  "spanish_speakers_rate": "Percentage of population 5+ speaking Spanish at home from table B16005. Calculated as ratio of native-born and foreign-born Spanish speakers to total population 5+.",
  
  "spanish_limited_english_rate": "Percentage of population 5+ speaking Spanish at home with limited English proficiency from table B16005. Calculated as ratio of Spanish speakers with limited English to total population 5+.",
  
  "asian_language_speakers_rate": "Percentage of population 5+ speaking Asian and Pacific Island languages at home from table B16005. Calculated as ratio of native-born and foreign-born Asian language speakers to total population 5+.",
  
  "living_alone_65_plus_rate": "Percentage of households with a person 65+ living alone from table B11007. Calculated as ratio of 1-person households with 65+ person (B11007_E003) to total households (B11007_E001).",
  
  "households_with_elderly_rate": "Percentage of households with one or more people 65+ from table B11007. Calculated as ratio of households with 65+ (B11007_E002) to total households (B11007_E001).",
  
  "elderly_family_households_rate": "Percentage of households that are family households with someone 65+ from table B11007. Calculated as ratio of family households with 65+ (B11007_E005) to total households (B11007_E001).",
  
  "lacking_complete_plumbing_rate": "Percentage of occupied housing units lacking complete plumbing facilities from table B25047. Calculated as ratio of units lacking plumbing (B25047_E003) to total occupied units (B25047_E001).",
  
  "lacking_complete_kitchen_rate": "Percentage of occupied housing units lacking complete kitchen facilities from table B25051. Calculated as ratio of units lacking kitchen (B25051_E003) to total occupied units (B25051_E001).",
  
  "mimi_ma_market_entry_score": "Composite score (0-1) assessing market attractiveness for new Medicare Advantage plan entry. Combines addressable market size (40%: medicare_65plus_rate, no_insurance_55_64_rate), revenue adequacy (35%: dual_eligible_rate, disability_rate_all_ages), and growth indicators (25%: moved_in_past_year_rate, hispanic_latino_rate). Higher scores indicate more attractive markets for MA entry based on beneficiary density, payment opportunities, and demographic growth.",

  "mimi_dsnp_product_opportunity": "Composite score (0-1) identifying optimal markets for Dual Eligible Special Needs Plan products. Weights target population density (40%: dual_eligible_rate, supplemental_security_income_rate), care complexity payments (30%: minimum of cognitive/self-care difficulty and dual rates), state Medicaid environment (15%: medicaid_coverage_rate), and supplemental benefit needs (15%: SNAP, no vehicle rates). Higher scores indicate stronger D-SNP product viability.",

  "mimi_aco_savings_opportunity": "Composite score (0-1) quantifying potential for ACO shared savings achievement. Combines high-cost populations (40%: average of disability and Medicare rates, dual_eligible_rate), preventable utilization (25%: ambulatory_difficulty_rate, living_alone_65_plus_rate), care coordination gaps (20%: lack of broadband, cognitive_difficulty_rate), and attribution stability (15%: inverse of moved_in_past_year_rate). Higher scores indicate greater ACO savings potential.",

  "mimi_risk_adjustment_opportunity": "Composite score (0-1) measuring potential for improving risk adjustment factor (RAF) scores through better documentation. Weights complex conditions undercoding (45%: disability, cognitive, hearing difficulty rates), primary care access barriers (35%: no_vehicle_households_rate, poverty_65_plus_rate), and health system navigation challenges (20%: LEP, lack of broadband). Higher scores indicate greater opportunity for RAF improvement.",

  "mimi_supplemental_benefits_roi": "Composite score (0-1) identifying populations where supplemental benefits drive best outcomes. Combines transportation impact (30%: no_vehicle_households_rate), food insecurity (25%: SNAP, poverty_65_plus_rate), social isolation (20%: living_alone_65_plus_rate), and functional support needs (25%: self-care, independent living difficulty rates). Higher scores predict greater ROI from supplemental benefit investments.",

  "mimi_network_adequacy_challenge": "Composite score (0-1) quantifying difficulty meeting CMS network adequacy standards. Weights access barriers (30%: no_vehicle_households_rate), specialist needs (30%: disability, cognitive_difficulty_rate), language access requirements (30%: limited_english_proficiency_rate), and geriatric gaps (10%: age_65_and_over_rate). Higher scores indicate greater challenges meeting time/distance standards.",

  "mimi_care_management_intensity": "Composite score (0-1) estimating care management resource requirements. Combines cognitive support needs (25%: cognitive_difficulty_rate), social support gaps (25%: living_alone_65_plus_rate), medical complexity (25%: dual_eligible_rate, disability_rate_all_ages), and communication adaptation needs (25%: LEP, less_than_high_school_rate). Higher scores indicate need for more intensive care management.",

  "mimi_quality_measure_challenge": "Composite score (0-1) predicting difficulty achieving high Star Ratings. Weights medication adherence barriers (40%: poverty, cognitive_difficulty_rate), preventive care gaps (30%: no vehicle, living alone rates), and member experience challenges (30%: LEP, disability rates). Higher scores indicate greater difficulty achieving 4+ Star Ratings based on CMS quality measure specifications.",

  "mimi_total_cost_of_care_risk": "Composite score (0-1) estimating relative medical cost burden. Combines complex chronic conditions (30%: disability_rate_all_ages), behavioral/cognitive costs (25%: cognitive, self-care difficulty rates), social risk multipliers (25%: living alone, no vehicle rates), and age/poverty factors (20%). Higher scores predict higher per-member-per-month medical expenditures.",

  "mimi_ma_growth_potential": "Composite score (0-1) identifying markets with future MA enrollment growth potential. Weights pre-Medicare pipeline (40%: no_insurance_55_64_rate, near_poverty_65_plus_rate), underserved populations (35%: dual_eligible_rate, poverty_65_plus_rate), and demographic dynamics (25%: minimum of moved/Medicare rates, minimum of Hispanic/65+ rates). Higher scores indicate stronger 5-10 year growth trajectory.",

  "mimi_health_equity_index_alignment": "Composite score (0-1) predicting alignment with CMS Health Equity Index priorities starting 2027. Combines social risk factor enrollment (50%: dual, SSI, disability rates) with quality achievement potential (50%: inverse of LEP, no vehicle, cognitive difficulty, poverty rates). Higher scores indicate better positioning for HEI rewards under new CMS quality bonus structure.",

  "mimi_value_based_care_readiness": "Composite score (0-1) assessing market readiness for value-based care models. Weights population stability (25%: inverse of moved_in_past_year_rate), digital enablement (25%: broadband, computer rates), care coordination capability (25%: maximum of language/education competency), and preventive care culture (25%: inverse of uninsured, poverty rates). Higher scores indicate better VBC preparedness."
}

In [0]:
# Publish the descriptions held in `variables` as column comments on the ACS
# summary-file table, one ALTER TABLE statement per documented column.
acs_table = 'mimi_ws_1.census.acs2023_5yr_sf'
# The loop variable is deliberately not named `col`, so a notebook-level
# `col` (e.g. pyspark.sql.functions.col, if a %run notebook imports it --
# not visible from this cell) is not rebound to a plain string.
for column_name in spark.read.table(acs_table).columns:
    # Columns without an entry in `variables` are left untouched.
    if column_name not in variables:
        continue
    print(column_name)
    # Escape backslashes first, then single quotes: both are special inside a
    # Spark SQL single-quoted literal, and doing quotes first would cause the
    # backslashes just added to be escaped a second time.
    comment_sql = variables[column_name].replace("\\", "\\\\").replace("'", "\\'")
    # Backtick-quote the identifier so names that need quoting still parse;
    # an embedded backtick is escaped by doubling it.
    quoted_column = "`" + column_name.replace("`", "``") + "`"
    spark.sql(f"""ALTER TABLE {acs_table} ALTER COLUMN {quoted_column} COMMENT '{comment_sql}'""")