In [1]:
import os
import time

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
# Geographic level at which the labor data is downloaded and saved.
# Supported values: "state" or "metropolitan_area".

labor_aggregation = "state"

In [3]:
# Census API key, read from the CENSUS_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook.
# When the variable is unset or empty, API_KEY is None; requests leaves out query
# parameters whose value is None, so calls are then sent without a key.
# NOTE(review): keyless calls are presumably subject to the Census API's lower
# anonymous quota — confirm before running large jobs without a key.
API_KEY = os.environ.get("CENSUS_API_KEY") or None

## 1. Downloading Labor Data
Access and extract labor data by state or metropolitan statistical area and by NAICS code using the Census Bureau's API.

* Obtained from:
https://www.census.gov/data/developers/data-sets/cbp-zbp/cbp-api.html

* To check:
https://data.bls.gov/cew/apps/table_maker/v4/table_maker.htm#type=0&year=2024&qtr=4&own=5&ind=10&supp=0

In [4]:
def census_session():
    """Build a ``requests.Session`` that retries transient failures on HTTPS.

    The retry policy allows up to 5 retries of GET requests, with a backoff
    factor of 0.5, when the server answers 429, 500, 502, 503 or 504. With
    ``raise_on_status=False`` the last response is handed back once retries
    are exhausted instead of urllib3 raising, so callers decide how to treat
    the status code.

    Returns:
        A new session with the retrying adapter mounted for ``https://`` URLs
        (connection pool size 20).
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False,
    )
    https_adapter = HTTPAdapter(max_retries=retry_policy, pool_maxsize=20)

    session = requests.Session()
    session.mount("https://", https_adapter)
    return session

# Headers sent with every Census request:
#   - "Connection: close" asks the server not to keep the connection alive,
#   - "Accept-Encoding: identity" asks for an uncompressed response body,
#   - "User-Agent" identifies this job.
COMMON_HEADERS = dict([
    ("Connection", "close"),
    ("Accept-Encoding", "identity"),
    ("User-Agent", "requests (data job)"),
])

def fetch_census_json_df(base_url: str, params: dict, timeout=(5, 300)) -> pd.DataFrame:
    """Call a Census API endpoint and return its JSON table as a DataFrame.

    The response body is expected to be a JSON array of rows whose first row
    holds the column names. No explicit dtype conversion is applied here;
    callers convert the columns they need.

    Args:
        base_url: URL of the endpoint to query.
        params: Query parameters. The dict is copied, not mutated; ``key`` is
            filled in from ``API_KEY`` when the caller did not supply one.
        timeout: Timeout forwarded to ``requests`` (a ``(connect, read)``
            tuple by default).

    Returns:
        A DataFrame with one row per data row. When the payload is empty or
        contains only the header row, an empty DataFrame is returned (with
        the header's column names when a header is present).

    Raises:
        requests.HTTPError: If the response has an error status code.
        ValueError: If the response Content-Type is not ``application/json``.
    """
    query = dict(params)
    query.setdefault("key", API_KEY)

    # A fresh session is built per call; using it as a context manager closes
    # its adapters and connection pool once the request is done, instead of
    # leaving them open for the lifetime of the kernel.
    with census_session() as session:
        with session.get(base_url, params=query, headers=COMMON_HEADERS, timeout=timeout) as r:
            r.raise_for_status()
            if "application/json" not in r.headers.get("Content-Type", "").lower():
                raise ValueError(f"Non-JSON response: {r.text[:200]}")
            data = r.json()

    # Nothing but (at most) a header row: return an empty frame.
    if not data or len(data) < 2:
        return pd.DataFrame(columns=data[0] if data else [])
    return pd.DataFrame(data[1:], columns=data[0])

def fetch_cbp(url, geography, get_cols: str = "NAICS2017,EMP,STATE") -> pd.DataFrame:
    """Query a CBP endpoint for one geography and return the rows as a DataFrame.

    Args:
        url: Endpoint URL, passed through to ``fetch_census_json_df``.
        geography: Value of the API's ``for`` parameter (e.g. ``"state:*"``).
        get_cols: Comma-separated variable names sent as the ``get`` parameter.

    Returns:
        The fetched table. If it has an ``EMP`` column, that column is
        converted to nullable ``Int64``; values that cannot be parsed as
        numbers become missing.
    """
    query = {"get": get_cols, "for": geography, "key": API_KEY}
    table = fetch_census_json_df(url, query)

    if "EMP" in table:
        emp_numeric = pd.to_numeric(table["EMP"], errors="coerce")
        table["EMP"] = emp_numeric.astype("Int64")

    return table

In [5]:
# Pipe-delimited state reference file published by the Census Bureau.
url = "https://www2.census.gov/geo/docs/reference/state.txt"

df_states = pd.read_csv(url, sep='|')
df_states.columns = ['STATE_FIPS', 'USPS', 'STATE_NAME', 'STATENS']
# Store FIPS codes as two-character, zero-padded strings.
df_states = df_states.assign(
    STATE_FIPS=df_states['STATE_FIPS'].astype(str).str.zfill(2)
)
df_states.head()

Unnamed: 0,STATE_FIPS,USPS,STATE_NAME,STATENS
0,1,AL,Alabama,1779775
1,2,AK,Alaska,1785533
2,4,AZ,Arizona,1779777
3,5,AR,Arkansas,68085
4,6,CA,California,1779778


In [6]:
# Endpoint of the 2022 County Business Patterns (CBP) dataset.
BASE_URL_CBP_2022 = "https://api.census.gov/data/2022/cbp"

if labor_aggregation == "state":
    DATASETS_DIR = './datasets_labor_states/'
    # Make sure the output folder exists; to_parquet does not create it.
    os.makedirs(DATASETS_DIR, exist_ok=True)

    df_cbp = fetch_cbp(BASE_URL_CBP_2022, "state:*")
    # Attach the state name by matching the STATE column to the FIPS code.
    df_cbp = df_cbp.merge(
        df_states[["STATE_FIPS", "STATE_NAME"]],
        left_on="STATE",
        right_on="STATE_FIPS",
        how="left",
    )
    # Drop the merge key and the "state" column now that STATE/STATE_NAME are present.
    df_cbp = df_cbp.drop(columns=["STATE_FIPS", "state"])
    # Save df in long format to parquet
    df_cbp.to_parquet(DATASETS_DIR + 'df_labor_usa_states.parquet', engine='fastparquet')

elif labor_aggregation == "metropolitan_area":
    DATASETS_DIR = './datasets_labor_metropolitan_area/'
    # Make sure the output folder exists; to_parquet does not create it.
    os.makedirs(DATASETS_DIR, exist_ok=True)

    df_metrop = fetch_cbp(BASE_URL_CBP_2022, "metropolitan statistical area/micropolitan statistical area:*")
    # Save df in long format to parquet
    df_metrop.to_parquet(DATASETS_DIR + 'df_labor_usa_metropolitan_area.parquet', engine='fastparquet')

else:
    # Fail loudly instead of silently downloading nothing.
    raise ValueError(
        f"Unsupported labor_aggregation {labor_aggregation!r}; "
        "expected 'state' or 'metropolitan_area'."
    )