# In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
import json
from textwrap import dedent

# 1) Peek at the user's backlog to surface "row 4" context (0-index vs 1-index can be ambiguous).
backlog_path = "/mnt/data/RDM_Backlog_With_ICE.csv"


def rows_with_cell_text(df, target):
    """Return a boolean Series marking rows where any cell's text equals ``target``.

    Each cell is rendered as text and stripped of surrounding whitespace. A
    trailing ``.0`` is dropped, so a float-typed column holding 40.0 still
    matches the target "40".
    """
    mask = pd.Series(False, index=df.index)
    # DataFrame.items() yields one Series per column, even with duplicate labels.
    for _, col in df.items():
        text = col.astype(str).str.strip().str.replace(r"\.0+$", "", regex=True)
        mask |= text == target
    return mask


def load_backlog(path, score="40"):
    """Read the backlog CSV and return ``(backlog, matching_rows)``.

    ``backlog`` gains a leading 1-indexed "Row#" column. ``matching_rows`` holds
    the rows in which some original cell equals ``score``. The search runs
    before "Row#" is added, so the row number itself can never be a false hit.

    Raises whatever ``pandas.read_csv`` raises for a missing or unreadable file,
    and ``ValueError`` if the CSV already has a "Row#" column.
    """
    df = pd.read_csv(path)
    hits = rows_with_cell_text(df, score)
    # Create a helper column with 1-index row numbers for clarity.
    df.insert(0, "Row#", range(1, len(df) + 1))
    return df, df[hits]


# Always defined, so later code can report why the backlog is missing.
backlog_error = None
try:
    backlog_df, ice_40 = load_backlog(backlog_path)
except Exception as e:  # deliberate best-effort: the backlog is optional context
    backlog_df = None
    ice_40 = None
    backlog_error = str(e)

# 2) Write a ready-to-run Python module that:
#    - Loads ABS county×NAICS3 data
#    - Verifies county totals reconcile to NAICS 00 (All sectors)
#    - Aggregates to CBSA via county→CBSA crosswalk
#    - Filters to "large" CBSAs (by total firms or an optional population column if present)
#    - Saves outputs to CSV
# NOTE: the text below is a regular (non-raw) string, so backslashes and triple
# quotes that must appear in the generated file are escaped here.
script = dedent("""
    # rdm_abs_naics3_cbsa.py
    # -------------------------------------------
    # Extend ABS pull to 3-digit NAICS for CBSAs,
    # verify county totals reconcile to NAICS 00,
    # then aggregate to CBSA and filter to "large" CBSAs.
    #
    # INPUTS (CSV, UTF-8):
    #   1) abs_county_naics3.csv  (county × NAICS3)
    #        Required columns (case-insensitive accepted):
    #          state, county, and one of NAICS2022 / naics / naics3
    #          FIRMPDEMP (firms), EMP, PAYANN (in $1,000s), RCPPDEMP (receipts in $1,000s)
    #          NAICS 00 total rows must also be present for each county (All sectors)
    #        Optional:
    #          year (used by --year)
    #   2) cbsa_county_crosswalk.csv
    #        Required columns:
    #          state_fips (2), county_fips (3), cbsa_code, cbsa_title
    #        Optional:
    #          cbsa_pop (for 'large' definition). If missing, we rank by ABS firms instead.
    #
    # OUTPUTS:
    #   - abs_county_naics3_recon_report.csv  (county-level reconciliation vs NAICS 00)
    #   - abs_cbsa_naics3.csv                 (CBSA × NAICS3 rolled up from counties)
    #   - abs_cbsa_naics3_large.csv           (only 'large' CBSAs)
    #   - abs_cbsa_naics3_discrepancies.csv   (CBSA counties with reconciliation issues)
    #
    # USAGE:
    #   python rdm_abs_naics3_cbsa.py --abs /path/to/abs_county_naics3.csv \\
    #                                 --xwalk /path/to/cbsa_county_crosswalk.csv \\
    #                                 --year 2022 \\
    #                                 --large_by firms --large_threshold 20000
    #
    import argparse
    import pandas as pd
    import numpy as np
    from pathlib import Path
    
    # Measures carried through every roll-up.
    METRICS = ["FIRMPDEMP", "EMP", "PAYANN", "RCPPDEMP"]
    # Internal naics3 value for the NAICS 00 "All sectors" total rows.
    ALL_SECTORS = "000"
    
    def zfill_series(s, n):
        \"\"\"Keep the first run of digits in each value and left-pad it with zeros to width n.\"\"\"
        return s.astype(str).str.extract(r"(\\d+)", expand=False).fillna("").str.zfill(n)
    
    def filter_year(df, year):
        \"\"\"Return the rows of df for `year`; df is returned as-is when year is None
        or df has no 'year' column.\"\"\"
        if year is None or "year" not in df.columns:
            return df
        # The CSV is read with dtype=str, so compare as text (an int-vs-str
        # comparison would silently drop every row).
        return df[df["year"].astype(str).str.strip() == str(year)]
    
    def normalize_abs_columns(df):
        \"\"\"Standardize an ABS county × NAICS frame.
    
        Headers are matched case-insensitively and renamed to: state, county, naics,
        naics3, year, NAICS2022 and the upper-case metric names. Adds/overwrites
        'naics3' (first 3 characters of the NAICS code, with the "All sectors" code
        "00" mapped to "000"), zero-pads FIPS codes, makes the metrics numeric and
        converts PAYANN / RCPPDEMP from $1,000s to $.
    
        Raises ValueError when a required column is missing.\"\"\"
        canonical = {name: name for name in ["state", "county", "naics", "naics3", "year"]}
        canonical["naics2022"] = "NAICS2022"
        canonical.update({m.lower(): m for m in METRICS})
        df = df.rename(columns={c: canonical[c.lower()] for c in df.columns
                                if c.lower() in canonical})
        missing = [c for c in ["state", "county"] + METRICS if c not in df.columns]
        if missing:
            raise ValueError(f"ABS file missing columns: {missing}")
        # Derive NAICS3 from the most specific code column available
        for source in ["NAICS2022", "naics", "naics3"]:
            if source in df.columns:
                code = df[source].astype(str).str.strip()
                break
        else:
            raise ValueError("ABS file must have NAICS2022 or naics column")
        # "All sectors" is published as "00"; give it one fixed 3-character code so
        # every later step can separate totals from industry rows.
        df["naics3"] = code.str[:3].where(~code.isin(["0", "00", "000"]), ALL_SECTORS)
        # Standardize FIPS
        df["state"] = zfill_series(df["state"], 2)
        df["county"] = zfill_series(df["county"], 3)
        # Nullable numerics (suppressed / non-numeric cells become NaN)
        for c in METRICS:
            df[c] = pd.to_numeric(df[c], errors="coerce")
        # Convert $1,000s → $
        for c in ["PAYANN", "RCPPDEMP"]:
            df[c] = df[c] * 1000
        return df
    
    def normalize_crosswalk(df):
        \"\"\"Validate the county→CBSA crosswalk, zero-pad its FIPS columns and make
        cbsa_pop numeric when present. Raises ValueError on missing columns.\"\"\"
        need = ["state_fips","county_fips","cbsa_code","cbsa_title"]
        missing = [c for c in need if c not in df.columns]
        if missing:
            raise ValueError(f"Crosswalk missing columns: {missing}")
        df["state_fips"]  = zfill_series(df["state_fips"], 2)
        df["county_fips"] = zfill_series(df["county_fips"], 3)
        if "cbsa_pop" in df.columns:
            df["cbsa_pop"] = pd.to_numeric(df["cbsa_pop"], errors="coerce")
        return df
    
    def reconcile_county_totals(abs_df, year=None, atol=1.0):
        \"\"\"Check that sum over NAICS3 equals NAICS 00 total at county level.
        Returns a report with absolute and percent deltas for EMP, PAYANN, RCPPDEMP, FIRMPDEMP.
        atol in dollars and counts (post $1,000→$ conversion).
        Expects a frame produced by normalize_abs_columns.\"\"\"
        base = filter_year(abs_df, year)
        is_total = base["naics3"] == ALL_SECTORS
        key = ["state","county"]
        # Roll up by county: industry rows on one side, published totals on the other
        sums = (base[~is_total].groupby(key, as_index=False)[METRICS]
                               .sum(min_count=1)
                               .rename(columns={m: f"sum_{m.lower()}" for m in METRICS}))
        totals = (base[is_total].groupby(key, as_index=False)[METRICS]
                                .sum(min_count=1)
                                .rename(columns={m: f"tot_{m.lower()}" for m in METRICS}))
        report = sums.merge(totals, on=key, how="outer")
        names = [m.lower() for m in METRICS]
        for c in names:
            report[f"delta_{c}"] = report[f"sum_{c}"] - report[f"tot_{c}"]
            report[f"pct_delta_{c}"] = np.where(report[f"tot_{c}"].abs() > 0,
                                                report[f"delta_{c}"] / report[f"tot_{c}"], np.nan)
            # A NaN delta (missing total or fully suppressed parts) compares False,
            # i.e. it is not flagged.
            report[f"flag_{c}"] = report[f"delta_{c}"].abs() > atol
        # Any flag triggered?
        report["recon_ok"] = ~(report[[f"flag_{c}" for c in names]].any(axis=1))
        return report.sort_values(key)
    
    def aggregate_to_cbsa(abs_df, xwalk_df, year=None):
        \"\"\"Roll county × NAICS3 rows up to CBSA × NAICS3 and attach each CBSA's
        "All sectors" totals as cbsa_tot_* columns. Counties without a crosswalk
        match get a missing cbsa_code and drop out of the group-by.\"\"\"
        base = filter_year(abs_df, year).copy()
        base["state_fips"] = base["state"]
        base["county_fips"] = base["county"]
        merged = base.merge(xwalk_df, on=["state_fips","county_fips"], how="left", validate="m:1")
        is_total = merged["naics3"] == ALL_SECTORS
        # Roll up county→CBSA for NAICS3 (excluding NAICS 000 rows)
        key = ["cbsa_code","cbsa_title","naics3"]
        out = (merged[~is_total].groupby(key, as_index=False)[METRICS]
                                .sum(min_count=1))
        # Also keep CBSA totals (All sectors) for reference
        all_cbsa = (merged[is_total]
                    .groupby(["cbsa_code","cbsa_title"], as_index=False)[METRICS]
                    .sum(min_count=1)
                    .rename(columns={
                        "FIRMPDEMP":"cbsa_tot_firms",
                        "EMP":"cbsa_tot_emp",
                        "PAYANN":"cbsa_tot_payroll",
                        "RCPPDEMP":"cbsa_tot_receipts"
                    }))
        out = out.merge(all_cbsa, on=["cbsa_code","cbsa_title"], how="left")
        return out
    
    def filter_large_cbsa(cbsa_df, xwalk_df, large_by="firms", threshold=20000):
        \"\"\"Keep only rows of CBSAs whose size (cbsa_pop or total firms) is >= threshold.\"\"\"
        # If population available, prefer it when large_by=='population'
        if large_by == "population" and "cbsa_pop" in xwalk_df.columns:
            pop = xwalk_df.drop_duplicates(subset=["cbsa_code","cbsa_title"])[["cbsa_code","cbsa_title","cbsa_pop"]]
            cbsa_totals = (cbsa_df.drop_duplicates(subset=["cbsa_code","cbsa_title"])
                                [["cbsa_code","cbsa_title"]].merge(pop, on=["cbsa_code","cbsa_title"], how="left"))
            big_codes = cbsa_totals.loc[cbsa_totals["cbsa_pop"] >= threshold, ["cbsa_code","cbsa_title"]]
        else:
            if large_by == "population":
                # The threshold was chosen for population; make the switch visible.
                print("Note: crosswalk has no cbsa_pop column; sizing CBSAs by ABS firms instead.")
            # Fallback: size by CBSA total firms
            totals = (cbsa_df.groupby(["cbsa_code","cbsa_title"], as_index=False)["cbsa_tot_firms"]
                            .max())
            big_codes = totals.loc[totals["cbsa_tot_firms"] >= threshold, ["cbsa_code","cbsa_title"]]
        large = cbsa_df.merge(big_codes, on=["cbsa_code","cbsa_title"], how="inner")
        return large
    
    def main():
        ap = argparse.ArgumentParser()
        ap.add_argument("--abs", required=True, help="Path to ABS county×NAICS3 CSV")
        ap.add_argument("--xwalk", required=True, help="Path to county→CBSA crosswalk CSV")
        ap.add_argument("--year", type=int, default=None, help="Optional year filter")
        ap.add_argument("--large_by", choices=["firms","population"], default="firms")
        ap.add_argument("--large_threshold", type=int, default=20000)
        ap.add_argument("--recon_atol", type=float, default=1.0, help="Absolute tolerance for reconciliation deltas")
        ap.add_argument("--outdir", default=".", help="Directory to write outputs")
        args = ap.parse_args()
    
        # Read everything as text so FIPS / NAICS codes keep their leading zeros;
        # numerics are cast inside the normalize functions.
        abs_df = pd.read_csv(args.abs, dtype=str)
        abs_df = normalize_abs_columns(abs_df)
        xwalk_df = pd.read_csv(args.xwalk, dtype=str)
        xwalk_df = normalize_crosswalk(xwalk_df)
    
        # 1) Reconciliation at county-level
        rep = reconcile_county_totals(abs_df, year=args.year, atol=args.recon_atol)
        outdir = Path(args.outdir)
        outdir.mkdir(parents=True, exist_ok=True)
        rep_path = outdir / "abs_county_naics3_recon_report.csv"
        rep.to_csv(rep_path, index=False)
    
        # 2) Aggregate to CBSA
        cbsa = aggregate_to_cbsa(abs_df, xwalk_df, year=args.year)
        cbsa_path = outdir / "abs_cbsa_naics3.csv"
        cbsa.to_csv(cbsa_path, index=False)
    
        # 3) Filter to 'large' CBSAs
        large = filter_large_cbsa(cbsa, xwalk_df, large_by=args.large_by, threshold=args.large_threshold)
        large_path = outdir / "abs_cbsa_naics3_large.csv"
        large.to_csv(large_path, index=False)
    
        # 4) Flag CBSA counties with reconciliation issues (join county report back to crosswalk)
        #    Useful to understand whether any CBSA totals might be biased by county-level suppression.
        rep_key = ["state","county"]
        cw_small = xwalk_df.rename(columns={"state_fips":"state","county_fips":"county"})
        bad = rep[~rep["recon_ok"]].merge(cw_small, on=rep_key, how="left")
        bad_path = outdir / "abs_cbsa_naics3_discrepancies.csv"
        bad.to_csv(bad_path, index=False)
    
        print(f"Wrote: {rep_path}")
        print(f"Wrote: {cbsa_path}")
        print(f"Wrote: {large_path}")
        print(f"Wrote: {bad_path}")
    
    if __name__ == "__main__":
        main()
""")

# Persist the generated module so the user can download and run it.
script_path = "/mnt/data/rdm_abs_naics3_cbsa.py"
with open(script_path, mode="w", encoding="utf-8") as script_file:
    script_file.write(script)

# Companion README: input schemas, what the pipeline does, and a quick-start command.
# The text is user-facing output and is written to disk verbatim (after dedent).
readme = dedent("""
    RDM ABS NAICS3 → CBSA Pipeline
    ===============================
    
    Files you need
    --------------
    1) `abs_county_naics3.csv` — ABS data at **county × NAICS3** with NAICS 00 totals.
       - Required columns (case-insensitive accepted):
         state, county, naics or NAICS2022, FIRMPDEMP, EMP, PAYANN, RCPPDEMP, (optional) year
       - NOTE: PAYANN and RCPPDEMP must be in $1,000s (as published) — the script converts to $.
    
    2) `cbsa_county_crosswalk.csv` — County → CBSA mapping.
       - Required: state_fips (2), county_fips (3), cbsa_code, cbsa_title
       - Optional: cbsa_pop (to size “large” CBSAs by population). If absent, we size by ABS firms.
    
    What the script does
    --------------------
    • Reconciles county sums across NAICS3 to the NAICS 00 (All sectors) total.
    • Aggregates counties to CBSA for NAICS3 rows (excludes NAICS “000”). Keeps CBSA totals for reference.
    • Filters to “large” CBSAs using either population or total ABS firms.
    • Writes four CSVs: reconciliation report, CBSA NAICS3, CBSA NAICS3 (large only), and a list of counties with reconciliation flags.
    
    Quick start
    -----------
    python rdm_abs_naics3_cbsa.py --abs abs_county_naics3.csv \\
                                  --xwalk cbsa_county_crosswalk.csv \\
                                  --year 2022 \\
                                  --large_by firms --large_threshold 20000 \\
                                  --outdir outputs
    
    Tip
    ---
    If your NAICS 00 is coded as "00" instead of "000", the script normalizes that internally.
    Expect some reconciliation flags where ABS suppression/rounding occurs.
""")
readme_path = "/mnt/data/README_ABS_NAICS3_CBSA.txt"
with open(readme_path, mode="w", encoding="utf-8") as readme_file:
    readme_file.write(readme)

# Prepare a tiny crosswalk template to guide the user.
# One example county per CBSA, with FIPS codes kept as zero-padded strings:
#   06001 = Alameda County      -> 41860 San Francisco-Oakland-Berkeley
#   06085 = Santa Clara County  -> 41940 San Jose-Sunnyvale-Santa Clara
# (06087 is Santa Cruz County, which is not part of CBSA 41940.)
# cbsa_pop values are approximate and only illustrate the optional column.
crosswalk_template = pd.DataFrame({
    "state_fips": ["06", "06"],
    "county_fips": ["001", "085"],
    "cbsa_code": ["41860", "41940"],
    "cbsa_title": ["San Francisco-Oakland-Berkeley, CA", "San Jose-Sunnyvale-Santa Clara, CA"],
    "cbsa_pop": [4731000, 1996000],
})
# Save the template without the index column so it matches the expected crosswalk schema.
crosswalk_path = "/mnt/data/cbsa_county_crosswalk_template.csv"
crosswalk_template.to_csv(path_or_buf=crosswalk_path, index=False)

# Display helpful tables (if backlog was readable)
from caas_jupyter_tools import display_dataframe_to_user
if backlog_df is not None:
    display_dataframe_to_user("RDM Backlog (with row numbers)", backlog_df)
    if ice_40 is not None and len(ice_40) > 0:
        display_dataframe_to_user("Backlog rows containing ICE score '40' (string match)", ice_40)

# Cell result: where the artifacts were written and whether the backlog loaded.
# When loading failed, include the captured message instead of dropping it.
# `backlog_error` is only read on the failure path, where it is always set.
{
    "script_path": script_path,
    "readme_path": readme_path,
    "crosswalk_template_path": crosswalk_path,
    "backlog_loaded": backlog_df is not None,
    "backlog_error": None if backlog_df is not None else backlog_error,
}
