In [2]:
import pandas as pd
import numpy as np

# ---------- Load & standardize ----------
abs_df = pd.read_csv("/Users/michaelwalker/RDM_Datalab/ABS_2022_CA_allcounties_NAICS2.csv")
#qcew_df = pd.read_csv("/qcew_ca_county_naics2_2022.csv")  # make this later
# Optional TRI facilities (facility-level)
# tri = pd.read_csv("tri_facilities_2022.csv"


In [3]:
# Standardize FIPS & NAICS2
def zfill_series(s, n): return s.astype(str).str.extract(r"(\d+)", expand=False).fillna("").str.zfill(n)

abs_df["state"]  = zfill_series(abs_df["state"], 2)
abs_df["county"] = zfill_series(abs_df["county"], 3)
abs_df["naics2"] = abs_df["NAICS2022"].astype(str).str[:2]

In [4]:
# Convert ABS $1k → dollars
for k in ["PAYANN", "RCPPDEMP"]:
    abs_df[k] = pd.to_numeric(abs_df[k], errors="coerce") * 1000

abs_df = abs_df.rename(columns={
    "FIRMPDEMP":"abs_firms",
    "EMP":"abs_emp",
    "PAYANN":"abs_payroll_usd",
    "RCPPDEMP":"abs_receipts_usd"
})
for c in ["abs_firms","abs_emp","abs_payroll_usd","abs_receipts_usd"]:
    abs_df[c] = pd.to_numeric(abs_df[c], errors="coerce")

In [5]:
# Minimal QCEW standardization (you’ll adapt to your column names)
# Expecting: state_fips, county_fips, naics (2-digit), year, annual_avg_emplvl, total_annual_wages, avg_weekly_wage
qcew_df["state"]  = zfill_series(qcew_df["state_fips"], 2)
qcew_df["county"] = zfill_series(qcew_df["county_fips"], 3)
qcew_df["naics2"] = qcew_df["naics"].astype(str).str[:2]
qcew_df = qcew_df.rename(columns={
    "annual_avg_emplvl":"qcew_emp",
    "total_annual_wages":"qcew_wages_usd",
    "avg_weekly_wage":"qcew_avg_weekly_wage_usd"
})
for c in ["qcew_emp","qcew_wages_usd","qcew_avg_weekly_wage_usd"]:
    qcew_df[c] = pd.to_numeric(qcew_df[c], errors="coerce")

NameError: name 'qcew_df' is not defined

In [6]:

# Optional: TRI aggregation to county × NAICS2
# tri["state"]  = zfill_series(tri["FIPS_STATE"], 2)
# tri["county"] = zfill_series(tri["FIPS_COUNTY"], 3)
# tri["naics2"] = tri["PRIMARY_NAICS"].astype(str).str[:2]
# tri_g = (tri.groupby(["state","county","naics2"], as_index=False)["TOTAL_RELEASES_LBS"].sum()
#            .rename(columns={"TOTAL_RELEASES_LBS":"tri_releases_lbs"}))

# ---------- Merge ----------
key = ["state","county","naics2"]
merged = (abs_df.merge(qcew_df[[*key,"qcew_emp","qcew_wages_usd","qcew_avg_weekly_wage_usd","year"]],
                       on=key, how="left"))
# If using TRI:
# merged = merged.merge(tri_g, on=key, how="left")

# ---------- Derived cross-source metrics ----------
# Wage sanity: annualized wage per employee from QCEW vs ABS payroll/emp
merged["qcew_wage_per_emp_usd"] = np.where(merged["qcew_emp"]>0,
                                           merged["qcew_wages_usd"]/merged["qcew_emp"], np.nan)

merged["abs_wage_per_emp_usd"]  = np.where(merged["abs_emp"]>0,
                                           merged["abs_payroll_usd"]/merged["abs_emp"], np.nan)

merged["abs_receipts_per_firm_usd"] = np.where(merged["abs_firms"]>0,
                                               merged["abs_receipts_usd"]/merged["abs_firms"], np.nan)

# If TRI present: intensity metrics
# merged["tri_lbs_per_emp"]  = np.where(merged["abs_emp"]>0, merged["tri_releases_lbs"]/merged["abs_emp"], np.nan)
# merged["tri_lbs_per_firm"] = np.where(merged["abs_firms"]>0, merged["tri_releases_lbs"]/merged["abs_firms"], np.nan)

# ---------- Quality checks ----------
assert merged.duplicated(subset=key).sum() == 0
# Non-negatives where applicable
for c in ["abs_firms","abs_emp","abs_payroll_usd","abs_receipts_usd","qcew_emp","qcew_wages_usd"]:
    if c in merged.columns:
        assert (merged[c].dropna() >= 0).all()

# Save
merged.to_csv("portfolio_abs_qcew_ca_county_naics2.csv", index=False)


NameError: name 'qcew_df' is not defined