In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Pull BEA Regional GDP (nominal, current $) by County × Industry (CAGDP2)
- Fetches valid LineCodes for CAGDP2
- Loops through lines to download all industries
- Filters to 5-digit county FIPS
- Maps BEA industry codes to NAICS2 (group codes handled)
- Outputs tidy CSV: county_fips, naics2, year, gdp_current_usd

Usage:
  python bea_cagdp2_pull.py

Set:
  YEAR = 2022 (or another year)
  GEO = "COUNTY" for all counties, or "CA" for California-only
"""

import os
import time
import requests
import pandas as pd

# ====== CONFIG ======
# Endpoint and credentials. The key is read from the BEA_API_KEY environment
# variable; the placeholder default must be replaced before real use.
BASE_URL = "https://apps.bea.gov/api/data/"
BEA_API_KEY = os.environ.get("BEA_API_KEY", "YOUR_BEA_KEY")

# Query scope: the year to request and the geography selector.
YEAR = 2022
GEO = "COUNTY"  # "COUNTY" for all U.S. counties, or state code like "CA" for California-only

# Pause between consecutive API calls, in seconds.
SLEEP_BETWEEN_CALLS = 0.2

# Output CSV path; the requested year is embedded in the file name.
OUTFILE = f"bea_cagdp2_county_naics2_{YEAR}.csv"
# =====================

# BEA industry codes that are not a plain 2-digit NAICS sector → NAICS2 proxy
# (everything that is already 2-digit numeric is passed through by bea_to_naics2).
# Multi-code sectors are keyed both by the grouped form (31G, 44RT, 48TW) and by
# the hyphenated NAICS range form (31-33, 44-45, 48-49); each maps to the first
# sector code of the range so the three sectors are not dropped as "unmapped".
# NOTE(review): the range form is what Regional tables are expected to carry in
# IndustryClassification — confirm against a live CAGDP2 response.
BEA_GROUP_MAP = {
    # Manufacturing
    "31G": "31",
    "31-33": "31",
    # Retail trade
    "44RT": "44",
    "44-45": "44",
    # Transportation & warehousing
    "48TW": "48",
    "48-49": "48",
    # Government → NAICS 92 (public sector). Drop later if you prefer.
    "G": "92",
}

def bea_to_naics2(code: str):
    """Translate a BEA IndustryClassification value into a NAICS2-style code.

    Falsy input (None, empty string, 0) yields None. Otherwise the value is
    stringified and stripped of surrounding whitespace, then:
      - a two-character, all-digit string (e.g. '23', '51') is returned unchanged;
      - anything else is looked up in BEA_GROUP_MAP, giving the mapped proxy
        code or None when the code is unknown (callers filter those rows out).
    """
    if code:
        text = str(code).strip()
        is_two_digit_sector = len(text) == 2 and text.isdigit()
        return text if is_two_digit_sector else BEA_GROUP_MAP.get(text)
    return None

def bea_request(params, timeout=60):
    """Call the BEA API and return the decoded 'BEAAPI' object.

    Args:
        params: Query-string parameters sent with the GET request to BASE_URL.
        timeout: Timeout in seconds handed to requests.get.

    Returns:
        The value stored under the 'BEAAPI' key of the JSON payload, or an
        empty dict when that key is absent.

    Raises:
        requests.HTTPError: If the HTTP status indicates failure.
        RuntimeError: If the payload carries an 'Error' entry, either directly
            under 'BEAAPI' or nested under 'BEAAPI' → 'Results'.
    """
    response = requests.get(BASE_URL, params=params, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    beaapi = payload.get("BEAAPI", {})

    if "Error" in beaapi:
        # Surface the API's own error object to help diagnose parameter issues
        raise RuntimeError(f"BEA API error: {beaapi['Error']}")

    # Errors may also arrive nested under 'Results' (see the BEA API user
    # guide — confirm against a live response). Without this check such a
    # failure looks like an empty result set to every caller.
    results = beaapi.get("Results")
    if isinstance(results, dict) and "Error" in results:
        raise RuntimeError(f"BEA API error: {results['Error']}")

    return beaapi

def get_linecodes_for_cagdp2():
    """Ask the API which LineCode values are valid for table CAGDP2.

    Returns:
        A list of (line_code, description) string tuples, both stripped of
        surrounding whitespace. Entries whose key is empty are skipped.
    """
    beaapi = bea_request(
        {
            "UserID": BEA_API_KEY,
            "method": "GetParameterValuesFiltered",
            "DataSetName": "Regional",
            "TargetParameter": "LineCode",
            "TableName": "CAGDP2",
            "ResultFormat": "JSON",
        }
    )
    entries = beaapi.get("Results", {}).get("ParamValue", [])

    # Normalise every entry first, then keep only those with a non-empty key.
    cleaned = (
        (str(entry.get("Key", "")).strip(), entry.get("Desc", "").strip())
        for entry in entries
    )
    return [(key, label) for key, label in cleaned if key]

def fetch_cagdp2_county_by_linecode(year: int, linecode: str, geo="COUNTY") -> pd.DataFrame:
    """Fetch nominal GDP for a single LineCode (industry) across counties.

    Args:
        year: Year to request; sent to the API as a string.
        linecode: CAGDP2 LineCode to request; sent to the API as a string.
        geo: Value for the GeoFIPS parameter ("COUNTY" or a state code such as "CA").

    Returns:
        A DataFrame with columns county_fips, naics2, year, gdp_current_usd,
        plus geoname, industryclassification and description when the response
        provides them. Rows whose GeoFIPS is not 5 characters long, or whose
        industry code has no NAICS2 mapping, are dropped. An empty DataFrame is
        returned when the API reports no data rows.

    Raises:
        KeyError: If the response lacks a geofips, datavalue or
            industryclassification column (matched case-insensitively).
        RuntimeError / requests exceptions: Propagated from bea_request.
    """
    params = {
        "UserID": BEA_API_KEY,
        "method": "GetData",
        "datasetname": "Regional",
        "TableName": "CAGDP2",   # nominal (current-dollar) GDP by county × industry
        "LineCode": str(linecode),
        "Year": str(year),
        "GeoFIPS": geo,          # "COUNTY" = all U.S. counties; e.g., "CA" for California-only
        "ResultFormat": "JSON",
    }
    beaapi = bea_request(params)
    data = beaapi.get("Results", {}).get("Data", [])
    if not data:
        return pd.DataFrame()

    df = pd.DataFrame(data)
    # Normalize column casing so lookups below do not depend on the API's spelling
    df.columns = [c.lower() for c in df.columns]

    # Fail with a readable message if a column this function relies on is absent.
    # NOTE(review): it is unclear whether Regional GetData rows carry an
    # IndustryClassification field at all — confirm against a live response.
    for required in ("geofips", "datavalue", "industryclassification"):
        if required not in df.columns:
            raise KeyError(f"Expected '{required}' in columns, found: {list(df.columns)}")

    # Keep county rows only (5-digit FIPS); astype(str) guards against non-string cells
    df = df[df["geofips"].astype(str).str.len() == 5].copy()

    # Convert the reported figure (e.g., '1,234') to a number; non-numeric
    # markers become NaN via errors="coerce".
    reported = pd.to_numeric(
        df["datavalue"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )
    # Scale by 10**UNIT_MULT when the API supplies it, so the column really is
    # in dollars (CAGDP2 is expected to be reported in thousands — TODO confirm).
    if "unit_mult" in df.columns:
        exponent = pd.to_numeric(df["unit_mult"], errors="coerce").fillna(0)
        reported = reported * (10.0 ** exponent)
    df["gdp_current_usd"] = reported

    # Minimal tidy columns
    df = df.rename(columns={"geofips": "county_fips", "timeperiod": "year"})
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

    # Map to NAICS2 and drop non-mapped aggregates
    df["naics2"] = df["industryclassification"].apply(bea_to_naics2)
    df = df[df["naics2"].notna()].copy()

    # Keep useful fields (optional ones only when the response included them)
    keep = ["county_fips", "naics2", "year", "gdp_current_usd", "geoname", "industryclassification", "description"]
    df = df[[c for c in keep if c in df.columns]]
    return df

def fetch_cagdp2_all_sectors(year: int, geo="COUNTY", sleep_sec=0.2) -> pd.DataFrame:
    """Download every CAGDP2 line code for one year and stack the results.

    Args:
        year: Year forwarded to each per-line-code fetch.
        geo: Geography selector forwarded to each per-line-code fetch.
        sleep_sec: Seconds to sleep after each line code, successful or not.

    Returns:
        The concatenated, de-duplicated rows from all line codes, or an empty
        DataFrame (after printing a warning) when nothing was retrieved.

    Raises:
        RuntimeError: If the line-code discovery call returns no entries.
    """
    available = get_linecodes_for_cagdp2()
    if not available:
        raise RuntimeError("No LineCodes returned for CAGDP2. Check API key or table name.")

    collected = []
    for lc, desc in available:
        try:
            chunk = fetch_cagdp2_county_by_linecode(year, lc, geo=geo)
        except Exception as e:
            # Best effort: report the failing line code and carry on with the rest
            print(f"[WARN] LineCode {lc} ({desc}) failed: {e}")
        else:
            if not chunk.empty:
                collected.append(chunk)
        time.sleep(sleep_sec)  # throttle requests to the API

    if not collected:
        print("[WARN] No data returned for any LineCode.")
        return pd.DataFrame()

    # Rows identical on these four fields are treated as duplicates
    dedupe_keys = ["county_fips", "naics2", "year", "gdp_current_usd"]
    return pd.concat(collected, ignore_index=True).drop_duplicates(subset=dedupe_keys)

def main():
    """Run the pull for the configured YEAR/GEO and write the tidy CSV to OUTFILE.

    Prints progress and summary counts. When no rows come back, prints a
    warning and returns without writing a file.
    """
    print(f"Fetching BEA CAGDP2 (nominal) for YEAR={YEAR}, GEO={GEO} ...")
    df = fetch_cagdp2_all_sectors(YEAR, geo=GEO, sleep_sec=SLEEP_BETWEEN_CALLS)

    if df.empty:
        print("[WARN] Final DataFrame is empty. Enable warnings above for details.")
        return

    print("Rows:", len(df))
    county_count = df["county_fips"].nunique()
    sector_count = df["naics2"].nunique()
    print("Unique counties:", county_count, "NAICS2:", sector_count)

    # Narrow to the four join columns, then normalise key dtypes:
    # FIPS as zero-padded 5-character text, year as a plain int.
    tidy = df[["county_fips", "naics2", "year", "gdp_current_usd"]].copy()
    tidy = tidy.assign(
        county_fips=tidy["county_fips"].astype(str).str.zfill(5),
        year=tidy["year"].astype(int),
    )

    tidy.to_csv(OUTFILE, index=False)
    print("Wrote:", OUTFILE)

# Run the pull only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
