In [1]:
import pandas as pd
import numpy as np

# ---------- Load & standardize ----------
# Single place to repoint the notebook at another copy of the input files.
DATA_DIR = "/Users/michaelwalker/RDM_Datalab"

# The NAICS sector columns mix plain codes ("00", "11") with ranges ("31-33").
# Read them as text explicitly: with inferred dtypes a run of purely numeric
# codes can be parsed as integers, which would turn "00" into 0 and break the
# two-character sector key built later.
abs_df = pd.read_csv(
    f"{DATA_DIR}/ABS_2022_allcounties_NAICS2_from_API.csv",
    dtype={"NAICS2022": str},
)
qcew_df = pd.read_csv(
    f"{DATA_DIR}/qcew_county_naics_sector_2022.csv",
    dtype={"naics_sector": str},
)
# Optional TRI facilities (facility-level)
# tri = pd.read_csv("tri_facilities_2022.csv")

In [2]:
# Preview the first ten rows of the QCEW frame.
qcew_df.iloc[:10]

Unnamed: 0,state_fips,county_fips,naics_sector,year,annual_avg_emplvl,total_annual_wages,avg_weekly_wage
0,1,1,11,2022,97,7271752,1441.663759
1,1,1,21,2022,79,5383592,1310.514119
2,1,1,22,2022,84,12421322,2843.709249
3,1,1,23,2022,527,28459329,1038.510035
4,1,1,31-33,2022,1534,104157352,1305.753585
5,1,1,42,2022,418,28889133,1329.091507
6,1,1,44-45,2022,1689,52853052,601.779068
7,1,1,48-49,2022,145,7332786,972.518037
8,1,1,51,2022,35,2896883,1591.693956
9,1,1,52,2022,306,18320054,1151.335721


In [3]:
# Preview the first ten rows of the ABS frame.
abs_df.iloc[:10]

Unnamed: 0,NAME,GEO_ID,NAICS2022,NAICS2022_LABEL,INDLEVEL,FIRMPDEMP,EMP,PAYANN,RCPPDEMP,INDLEVEL.1,state,county
0,"Autauga County, Alabama",0500000US01001,00,Total for all sectors,2,0,0,0,0,2,1,1
1,"Autauga County, Alabama",0500000US01001,11,"Agriculture, forestry, fishing and hunting",2,0,0,0,0,2,1,1
2,"Autauga County, Alabama",0500000US01001,21,"Mining, quarrying, and oil and gas extraction",2,0,0,0,0,2,1,1
3,"Autauga County, Alabama",0500000US01001,22,Utilities,2,5,133,17854,191100,2,1,1
4,"Autauga County, Alabama",0500000US01001,23,Construction,2,118,387,18215,83338,2,1,1
5,"Autauga County, Alabama",0500000US01001,31-33,Manufacturing,2,16,1173,108226,901319,2,1,1
6,"Autauga County, Alabama",0500000US01001,42,Wholesale trade,2,12,92,6778,244550,2,1,1
7,"Autauga County, Alabama",0500000US01001,44-45,Retail trade,2,0,0,0,0,2,1,1
8,"Autauga County, Alabama",0500000US01001,48-49,Transportation and warehousing,2,0,0,0,0,2,1,1
9,"Autauga County, Alabama",0500000US01001,51,Information,2,14,72,4163,34012,2,1,1


In [5]:
# Standardize FIPS & NAICS2
def zfill_series(s, n):
    """Return the first run of digits in each value of ``s``, zero-padded to width ``n``.

    Values are stringified first, so integer codes (``1``) and float-typed
    codes (``1.0``) both yield ``"1"`` before padding.

    Parameters
    ----------
    s : pandas.Series
        Raw codes (ints, floats or strings).
    n : int
        Target width passed to ``str.zfill``.

    Returns
    -------
    pandas.Series
        Zero-padded code strings. Entries with no digits at all (including
        missing values) are returned as missing.
    """
    digits = s.astype(str).str.extract(r"(\d+)", expand=False)
    # Leave missing entries missing. Filling them with "" before zfill would
    # fabricate an all-zero code ("00"/"000") that looks like a valid key and
    # can silently join against a real row downstream.
    return digits.str.zfill(n)

# Pad the ABS geography codes to fixed widths: 2 characters for state, 3 for county.
for fips_col, width in (("state", 2), ("county", 3)):
    abs_df[fips_col] = zfill_series(abs_df[fips_col], width)

# Sector key: the leading two characters of the NAICS code, so a range such as
# "31-33" is keyed as "31".
abs_df["naics2"] = abs_df["NAICS2022"].astype(str).str.slice(stop=2)

In [6]:
# Give the ABS measures their project-wide names.
abs_measure_names = {
    "FIRMPDEMP": "abs_firms",
    "EMP": "abs_emp",
    "PAYANN": "abs_payroll_usd",
    "RCPPDEMP": "abs_receipts_usd",
}
abs_df = abs_df.rename(columns=abs_measure_names)

# Coerce every measure to numeric; entries that cannot be parsed become NaN.
for measure in abs_measure_names.values():
    abs_df[measure] = pd.to_numeric(abs_df[measure], errors="coerce")

# Convert ABS $1k → dollars (payroll and receipts are the two scaled columns).
for usd_col in ("abs_payroll_usd", "abs_receipts_usd"):
    abs_df[usd_col] = abs_df[usd_col] * 1000

In [15]:
# Minimal QCEW standardization (adapt the source column names to your extract).
# Input columns used here: state_fips, county_fips, naics_sector,
# annual_avg_emplvl, total_annual_wages, avg_weekly_wage
state_code = zfill_series(qcew_df["state_fips"], 2)
county_code = zfill_series(qcew_df["county_fips"], 3)

qcew_df["state"] = state_code
qcew_df["county"] = county_code
# Combined state + county code, built from the two padded pieces.
qcew_df["state_county_fips"] = state_code + county_code
# Sector key: leading two characters of the sector code ("31-33" -> "31").
qcew_df["naics2"] = qcew_df["naics_sector"].astype(str).str.slice(stop=2)

qcew_measure_names = {
    "annual_avg_emplvl": "qcew_emp",
    "total_annual_wages": "qcew_wages_usd",
    "avg_weekly_wage": "qcew_avg_weekly_wage_usd",
}
qcew_df = qcew_df.rename(columns=qcew_measure_names)

# Coerce the renamed measures to numeric; unparseable entries become NaN.
for measure in qcew_measure_names.values():
    qcew_df[measure] = pd.to_numeric(qcew_df[measure], errors="coerce")

In [16]:
# Optional: TRI aggregation to county × NAICS2
# tri["state"]  = zfill_series(tri["FIPS_STATE"], 2)
# tri["county"] = zfill_series(tri["FIPS_COUNTY"], 3)
# tri["naics2"] = tri["PRIMARY_NAICS"].astype(str).str[:2]
# tri_g = (tri.groupby(["state","county","naics2"], as_index=False)["TOTAL_RELEASES_LBS"].sum()
#            .rename(columns={"TOTAL_RELEASES_LBS":"tri_releases_lbs"}))

In [17]:
# ---------- Merge ----------
key = ["state", "county", "naics2"]

# QCEW columns carried into the merged table: the join key plus its measures.
qcew_keep = key + [
    "qcew_emp",
    "qcew_wages_usd",
    "qcew_avg_weekly_wage_usd",
    "year",
    "state_county_fips",
]

# Left join keeps every ABS row; the QCEW fields are NaN where no
# state × county × sector match exists.
merged = pd.merge(abs_df, qcew_df[qcew_keep], how="left", on=key)

In [18]:
# If using TRI:
# merged = merged.merge(tri_g, on=key, how="left")

In [19]:
# ---------- Derived cross-source metrics ----------
# Wage sanity: annual wage per employee from QCEW alongside ABS payroll per
# employee, plus ABS receipts per firm.
# Each entry is (new column, numerator column, denominator column).
ratio_specs = (
    ("qcew_wage_per_emp_usd", "qcew_wages_usd", "qcew_emp"),
    ("abs_wage_per_emp_usd", "abs_payroll_usd", "abs_emp"),
    ("abs_receipts_per_firm_usd", "abs_receipts_usd", "abs_firms"),
)

for ratio_col, numer_col, denom_col in ratio_specs:
    denom = merged[denom_col]
    # Keep the ratio only where the denominator is strictly positive;
    # zero or missing denominators leave NaN.
    merged[ratio_col] = (merged[numer_col] / denom).where(denom > 0)

In [20]:
# If TRI present: intensity metrics
# merged["tri_lbs_per_emp"]  = np.where(merged["abs_emp"]>0, merged["tri_releases_lbs"]/merged["abs_emp"], np.nan)
# merged["tri_lbs_per_firm"] = np.where(merged["abs_firms"]>0, merged["tri_releases_lbs"]/merged["abs_firms"], np.nan)


In [21]:
# ---------- Quality checks ----------
# Exactly one row per state × county × sector key after the merge.
assert not merged.duplicated(subset=key).any()

# Non-negatives where applicable: missing values are ignored, and a column
# that is absent from the merged table is skipped.
non_negative_cols = (
    "abs_firms",
    "abs_emp",
    "abs_payroll_usd",
    "abs_receipts_usd",
    "qcew_emp",
    "qcew_wages_usd",
)
for col in (name for name in non_negative_cols if name in merged.columns):
    assert not merged[col].lt(0).any()


In [25]:
# Row count of the merged table.
merged.shape[0]

56564

In [27]:
# Save
# Write the header row: the merged table holds many numeric columns, and
# without column names the file cannot be read back unambiguously.
# NOTE(review): the filename says "ca", but the ABS input is named
# "allcounties" and the previewed rows are Alabama — confirm the intended name.
merged.to_csv("portfolio_abs_qcew_ca_county_naics2.csv", index=False)