In [17]:
# pc 2/20/2026
# geopandas reads the block-group .dbf; pandas handles the ratio CSV,
# the merge, and the TAZ aggregation.
import geopandas as gpd
import pandas as pd

# -----------------------------
# INPUT FILES
# -----------------------------
bg_file = "acs2024_bg_with_gq_allocated_scaled.dbf"
ratio_file = "bg_taz_ratios_clean.csv"

# -----------------------------
# BLOCK-GROUP TOTALS
# -----------------------------
# Keep only the join key and the four totals that are allocated further
# down, then wrap the selection in a plain pandas DataFrame.
bg_columns = ["GEOID", "TOT_HH", "HH_POP", "TOT_POP", "EMP_RES"]
bg = pd.DataFrame(gpd.read_file(bg_file)[bg_columns].copy())

# -----------------------------
# BG -> TAZ RATIOS
# -----------------------------
# The code below reads the GEOID, TAZ and norm_ratio columns of this table.
ratios = pd.read_csv(ratio_file)

# -----------------------------
# FIX GEOID FORMAT PROPERLY
# (handles Excel float corruption)
# -----------------------------

def _clean_geoid(geoid):
    """Return GEOIDs as zero-padded 12-character strings.

    Each value is cast to text, stripped of surrounding whitespace, cut at
    the first decimal point, and left-padded with zeros to 12 characters,
    so ``60050003042.0`` and ``"060050003042"`` both come out as
    ``"060050003042"``.

    Cutting at the decimal point replaces the earlier unanchored
    ``str.replace(".0", "")``, which removed every ``".0"`` in the value
    rather than only a trailing one (``"60050003042.00"`` turned into
    ``"600500030420"``).

    Parameters
    ----------
    geoid : pandas.Series
        Raw GEOID column, numeric or text.

    Returns
    -------
    pandas.Series
        Cleaned GEOID strings, aligned to the input index.

    NOTE(review): missing GEOIDs are not handled specially here; confirm
    that neither input contains any.
    """
    return (
        geoid
        .astype(str)
        .str.strip()
        .str.split(".").str[0]
        .str.zfill(12)
    )


# Both tables go through the same cleaner so their merge keys are built
# by identical rules.
ratios["GEOID"] = _clean_geoid(ratios["GEOID"])
bg["GEOID"] = _clean_geoid(bg["GEOID"])

# -----------------------------
# MERGE
# -----------------------------
# Left join: every ratio row is kept and picks up its block group's totals.
# validate="many_to_one" makes pandas raise MergeError when a GEOID appears
# more than once in bg; without it a repeated key would duplicate ratio
# rows and double-count that block group in the TAZ sums.
df = ratios.merge(bg, on="GEOID", how="left", validate="many_to_one")

# A null TOT_POP after the join is used as the sign of an unmatched GEOID
# (a matched block group whose TOT_POP is itself null is counted too).
missing = df["TOT_POP"].isna().sum()
print("Missing BG matches:", missing)


# -----------------------------
# IDENTIFY MISSING BGs
# -----------------------------
# Rows whose TOT_POP is null after the merge are treated as unmatched.
unmatched_mask = df["TOT_POP"].isna()
missing_df = df.loc[unmatched_mask].copy()
missing_geoids = missing_df["GEOID"]

print("Number of missing BG rows:", len(missing_df))
print("Unique missing GEOIDs:", missing_geoids.nunique())

print("\nSample missing GEOIDs:")
print(missing_geoids.unique()[:20])

# Write one row per distinct (GEOID, TAZ) pair for manual inspection.
missing_pairs = missing_df[["GEOID", "TAZ"]].drop_duplicates()
missing_pairs.to_csv("missing_bg_matches.csv", index=False)

print("Missing BG list saved to missing_bg_matches.csv")


# -----------------------------
# ALLOCATE
# -----------------------------
# Each output column is the block-group total multiplied by the row's
# norm_ratio. Rows without a BG match hold NaN totals, so their allocated
# values are NaN as well.
alloc_sources = {
    "TOT_HH_alloc": "TOT_HH",
    "HH_POP_alloc": "HH_POP",
    "TOT_POP_alloc": "TOT_POP",
    "EMP_RES_alloc": "EMP_RES",
}
for alloc_col, source_col in alloc_sources.items():
    df[alloc_col] = df[source_col] * df["norm_ratio"]

# -----------------------------
# AGGREGATE TO TAZ
# -----------------------------
# Sum the allocated pieces within each TAZ, then turn the TAZ index back
# into an ordinary column before writing the result.
alloc_cols = [
    "TOT_HH_alloc",
    "HH_POP_alloc",
    "TOT_POP_alloc",
    "EMP_RES_alloc",
]
taz_sums = df.groupby("TAZ")[alloc_cols].sum()
taz = taz_sums.reset_index()

taz.to_csv("TAZ_allocated_clean.csv", index=False)

print("TAZ allocation complete.")


Missing BG matches: 88
Number of missing BG rows: 88
Unique missing GEOIDs: 63

Sample missing GEOIDs:
['060050003042' '060090002211' '060090002222' '060190079032'
 '060190083041' '060190083042' '060190084022' '060310017031'
 '060330011012' '060330011021' '060330012002' '060330013025'
 '060450111021' '060450111023' '060450118001' '060470021001'
 '060670094061' '060670094062' '060670094081' '060670095011']
Missing BG list saved to missing_bg_matches.csv
TAZ allocation complete.


In [18]:
# Compare how many distinct block groups each table carries; a gap between
# the two counts points at GEOIDs present in one table but not the other.
n_ratio_bgs = ratios["GEOID"].nunique()
n_bg_file_bgs = bg["GEOID"].nunique()
print("Unique BGs in ratios:", n_ratio_bgs)
print("Unique BGs in bg file:", n_bg_file_bgs)


Unique BGs in ratios: 6214
Unique BGs in bg file: 6156
