In [None]:
https://apps.bea.gov/api/data/?UserID=290EFCBB-FAEF-4B15-AED6-BBFB7B290205&method=GetParameterValuesFiltered&DataSetName=Regional&TargetParameter=LineCode&TableName=CAGDP2&ResultFormat=JSON

In [None]:
https://apps.bea.gov/api/data/?UserID=290EFCBB-FAEF-4B15-AED6-BBFB7B290205&method=GetData&datasetname=Regional&TableName=CAGDP2&LineCode=12&Year=2022&GeoFIPS=CA&ResultFormat=JSON

In [9]:
import os
import time
import requests
import pandas as pd

In [10]:
BEA_API_KEY = os.getenv("BEA_API_KEY", "290EFCBB-FAEF-4B15-AED6-BBFB7B290205")
BASE_URL = "https://apps.bea.gov/api/data/"

In [11]:
# 1) Replace the BEA_TO_NAICS2 logic with a helper that also accepts pure 2-digit codes
BEA_GROUP_MAP = {
    "31G": "31",   # Manufacturing group
    "44RT": "44",  # Retail Trade group
    "48TW": "48",  # Transportation & Warehousing group
    "G":  "92",    # Government → NAICS 92 (optional to keep)
}

def bea_to_naics2(code: str) -> str | None:
    """
    Map BEA IndustryClassification to a NAICS2-like code:
    - If it's a 2-digit numeric string (e.g., '23', '51'), return as-is.
    - If it's in the known group map (31G, 44RT, 48TW, G), return mapped.
    - Otherwise, return None (e.g., totals or odd aggregates).
    """
    if not code:
        return None
    code = str(code).strip()
    if code.isdigit() and len(code) == 2:
        return code
    return BEA_GROUP_MAP.get(code)

# 2) In fetch_cagdp2_county_by_linecode(), replace the mapping & drop with:
df["naics2"] = df["industryclassification"].apply(bea_to_naics2)
# keep only rows that mapped to a 2-digit NAICS (or chosen groups)
df = df[df["naics2"].notna()].copy()


NameError: name 'df' is not defined

In [None]:
out = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

if out.empty:
    # Help yourself debug quickly:
    print("[DEBUG] No rows returned after merging all line codes.")
    print("[DEBUG] Sample of line codes retrieved:", linecodes[:10])
    return out

# If not empty, you should have these columns now:
# ['county_fips', 'geoname', 'industryclassification', 'description', 'year', 'gdp_current_usd', 'naics2']
out_cols = ["county_fips", "naics2", "year", "gdp_current_usd"]
missing = [c for c in out_cols if c not in out.columns]
if missing:
    print("[DEBUG] Missing expected columns:", missing)
    print("[DEBUG] Available columns:", list(out.columns))


In [4]:
def bea_request(params, timeout=60):
    """Small helper to call the BEA API with error handling."""
    r = requests.get(BASE_URL, params=params, timeout=timeout)
    r.raise_for_status()
    payload = r.json()
    beaapi = payload.get("BEAAPI", {})
    if "Error" in beaapi:
        raise RuntimeError(f"BEA API error: {beaapi['Error']}")
    return beaapi

In [5]:
def get_linecodes_for_cagdp2():
    """Discover valid LineCodes for CAGDP2 (nominal GDP by county & industry)."""
    params = {
        "UserID": BEA_API_KEY,
        "method": "GetParameterValuesFiltered",
        "DataSetName": "Regional",
        "TargetParameter": "LineCode",
        "TableName": "CAGDP2",
        "ResultFormat": "JSON",
    }
    beaapi = bea_request(params)
    vals = beaapi.get("Results", {}).get("ParamValue", [])
    # Return list of (LineCode, Description)
    out = []
    for v in vals:
        lc = str(v.get("Key", "")).strip()
        desc = v.get("Desc", "").strip()
        if lc:
            out.append((lc, desc))
    return out

In [6]:
def fetch_cagdp2_county_by_linecode(year: int, linecode: str, geo="COUNTY") -> pd.DataFrame:
    """Fetch nominal GDP for a single LineCode (industry) across counties."""
    params = {
        "UserID": BEA_API_KEY,
        "method": "GetData",
        "datasetname": "Regional",
        "TableName": "CAGDP2",   # nominal/current-dollar GDP
        "LineCode": str(linecode),
        "Year": str(year),
        "GeoFIPS": geo,          # "COUNTY" = all U.S. counties; e.g., "CA" to restrict to California
        "ResultFormat": "JSON",
    }
    beaapi = bea_request(params)
    data = beaapi.get("Results", {}).get("Data", [])
    if not data:
        return pd.DataFrame()
    df = pd.DataFrame(data)
    # Normalize column names and clean
    df.columns = [c.lower() for c in df.columns]
    # Keep county rows (5-digit FIPS)
    df = df[df["geofips"].str.len() == 5].copy()
    # DataValue → numeric current-dollar GDP
    df["gdp_current_usd"] = pd.to_numeric(df["datavalue"].str.replace(",", ""), errors="coerce")
    # Keep useful columns
    df = df[["geofips", "geoname", "industryclassification", "description", "timeperiod", "gdp_current_usd"]]
    df = df.rename(columns={"geofips": "county_fips", "timeperiod": "year"})
    df["year"] = df["year"].astype(int)
    # Map BEA industry code → NAICS2
    df["naics2"] = df["industryclassification"].map(BEA_TO_NAICS2)
    # Drop lines that aren't mapped (e.g., totals or special aggregates)
    df = df[~df["naics2"].isna()].copy()
    return df

In [7]:
def fetch_cagdp2_all_sectors(year: int, geo="COUNTY", sleep_sec=0.2) -> pd.DataFrame:
    """Fetch all industries (loop over valid LineCodes) for a given year."""
    linecodes = get_linecodes_for_cagdp2()
    frames = []
    for lc, desc in linecodes:
        try:
            df = fetch_cagdp2_county_by_linecode(year, lc, geo=geo)
            if not df.empty:
                frames.append(df)
        except Exception as e:
            # Non-fatal: print and continue with next line code
            print(f"[WARN] LineCode {lc} ({desc}) failed: {e}")
        time.sleep(sleep_sec)  # be nice to the API
    if not frames:
        return pd.DataFrame()
    out = pd.concat(frames, ignore_index=True)
    # Deduplicate just in case
    out = out.drop_duplicates(subset=["county_fips", "naics2", "year", "gdp_current_usd"])
    return out

In [8]:
if __name__ == "__main__":
    YEAR = 2022

    # A) All U.S. counties
    df_us = fetch_cagdp2_all_sectors(YEAR, geo="COUNTY")

    # (Optional) B) Restrict to California only for testing
    # df_us = fetch_cagdp2_all_sectors(YEAR, geo="CA")

    print(df_us.head())
    print("Rows:", len(df_us), "Unique counties:", df_us['county_fips'].nunique(), "NAICS2:", df_us['naics2'].nunique())

    # Save tidy output for joining with QCEW
    out_cols = ["county_fips", "naics2", "year", "gdp_current_usd"]
    df_us[out_cols].to_csv(f"bea_cagdp2_county_naics2_{YEAR}.csv", index=False)
    print("Wrote:", f"bea_cagdp2_county_naics2_{YEAR}.csv")


[WARN] LineCode 1 ([CAGDP2] Gross Domestic Product (GDP): All industry total) failed: "['industryclassification', 'description'] not in index"
[WARN] LineCode 10 ([CAGDP2] Gross Domestic Product (GDP): Utilities (22)) failed: "['industryclassification', 'description'] not in index"
[WARN] LineCode 11 ([CAGDP2] Gross Domestic Product (GDP): Construction (23)) failed: "['industryclassification', 'description'] not in index"
[WARN] LineCode 12 ([CAGDP2] Gross Domestic Product (GDP): Manufacturing (31-33)) failed: "['industryclassification', 'description'] not in index"
[WARN] LineCode 13 ([CAGDP2] Gross Domestic Product (GDP): Durable goods manufacturing (321,327-339)) failed: "['industryclassification', 'description'] not in index"
[WARN] LineCode 2 ([CAGDP2] Gross Domestic Product (GDP): Private industries) failed: "['industryclassification', 'description'] not in index"
[WARN] LineCode 25 ([CAGDP2] Gross Domestic Product (GDP): Nondurable goods manufacturing (311-316,322-326)) failed: 

KeyError: 'county_fips'