In [1]:
import pandas as pd
import numpy as np

# ---------- Load & standardize ----------
abs_df = pd.read_csv("/Users/michaelwalker/RDM_Datalab/ABS_2022_CA_allcounties_NAICS2_from_API.csv")
qcew_df = pd.read_csv("/Users/michaelwalker/RDM_Datalab/qcew_county_naics_sector_2022.csv")  # make this later
# Optional TRI facilities (facility-level)
tri = pd.read_csv("/Users/michaelwalker/RDM_Datalab/tri_release_by_county_naics.csv")

In [2]:
# Standardize FIPS & NAICS2
def zfill_series(s, n): return s.astype(str).str.extract(r"(\d+)", expand=False).fillna("").str.zfill(n)

abs_df["state"]  = zfill_series(abs_df["state"], 2)
abs_df["county"] = zfill_series(abs_df["county"], 3)
abs_df["naics2"] = abs_df["NAICS2022"].astype(str).str[:2]

In [3]:
# Convert ABS $1k → dollars
for k in ["PAYANN", "RCPPDEMP"]:
    abs_df[k] = pd.to_numeric(abs_df[k], errors="coerce") * 1000

abs_df = abs_df.rename(columns={
    "FIRMPDEMP":"abs_firms",
    "EMP":"abs_emp",
    "PAYANN":"abs_payroll_usd",
    "RCPPDEMP":"abs_receipts_usd"
})
for c in ["abs_firms","abs_emp","abs_payroll_usd","abs_receipts_usd"]:
    abs_df[c] = pd.to_numeric(abs_df[c], errors="coerce")

In [4]:
# Minimal QCEW standardization (you’ll adapt to your column names)
# Expecting: state_fips, county_fips, naics (2-digit), year, annual_avg_emplvl, total_annual_wages, avg_weekly_wage
qcew_df["state"]  = zfill_series(qcew_df["state_fips"], 2)
qcew_df["county"] = zfill_series(qcew_df["county_fips"], 3)
qcew_df["naics2"] = qcew_df["naics_sector"].astype(str).str[:2]
qcew_df = qcew_df.rename(columns={
    "annual_avg_emplvl":"qcew_emp",
    "total_annual_wages":"qcew_wages_usd",
    "avg_weekly_wage":"qcew_avg_weekly_wage_usd"
})
for c in ["qcew_emp","qcew_wages_usd","qcew_avg_weekly_wage_usd"]:
    qcew_df[c] = pd.to_numeric(qcew_df[c], errors="coerce")

In [5]:
tri.head(20)

Unnamed: 0,state,county,naics2,tri_releases_lbs
0,AK,ALEUTIANS EAST BOROUGH,31,196277.0
1,AK,ALEUTIANS WEST CENSUS ARE,31,2816.0
2,AK,ALEUTIANS WEST CENSUS ARE,42,256.1228
3,AK,ANCHORAGE MUNICIPALITY,32,255.0
4,AK,ANCHORAGE MUNICIPALITY,42,12038.38
5,AK,ANCHORAGE MUNICIPALITY,92,38164.5
6,AK,BETHEL CENSUS AREA,42,781.45
7,AK,BRISTOL BAY BOROUGH,31,13538.0
8,AK,DENALI BOROUGH,21,183409.3
9,AK,DENALI BOROUGH,22,539460.2


In [6]:
counties = pd.read_csv("/Users/michaelwalker/RDM_Datalab/simplemaps_uscounties_basicv1.91/uscounties.csv")

In [7]:
counties.head(20)

Unnamed: 0,county,county_ascii,county_full,county_fips,state_id,state_name,lat,lng,population
0,Los Angeles,Los Angeles,Los Angeles County,6037,CA,California,34.3219,-118.2247,9848406
1,Cook,Cook,Cook County,17031,IL,Illinois,41.8401,-87.8168,5185812
2,Harris,Harris,Harris County,48201,TX,Texas,29.8578,-95.3938,4758579
3,Maricopa,Maricopa,Maricopa County,4013,AZ,Arizona,33.349,-112.4915,4491987
4,San Diego,San Diego,San Diego County,6073,CA,California,33.0343,-116.735,3282782
5,Orange,Orange,Orange County,6059,CA,California,33.7031,-117.7609,3164063
6,Miami-Dade,Miami-Dade,Miami-Dade County,12086,FL,Florida,25.615,-80.5624,2685296
7,Kings,Kings,Kings County,36047,NY,New York,40.6413,-73.9383,2646306
8,Dallas,Dallas,Dallas County,48113,TX,Texas,32.7666,-96.7778,2603816
9,Riverside,Riverside,Riverside County,6065,CA,California,33.7437,-115.9938,2449909


In [16]:
key_county = ["county"]
tri = tri.merge(counties, on=str.upper(key_county), how="left")


TypeError: descriptor 'upper' for 'str' objects doesn't apply to a 'list' object

In [14]:
mask = tri["county"] == "LOS ANGELES"
tri[mask].head(20)

Unnamed: 0,state,county,naics2,tri_releases_lbs,county_ascii_x,county_full_x,county_fips_x,state_id_x,state_name_x,lat_x,lng_x,population_x,county_ascii_y,county_full_y,county_fips_y,state_id_y,state_name_y,lat_y,lng_y,population_y
416,CA,LOS ANGELES,22,1798.65,,,,,,,,,,,,,,,,
417,CA,LOS ANGELES,31,197927.7,,,,,,,,,,,,,,,,
418,CA,LOS ANGELES,32,5182203.0,,,,,,,,,,,,,,,,
419,CA,LOS ANGELES,33,3215156.0,,,,,,,,,,,,,,,,
420,CA,LOS ANGELES,42,59939.3,,,,,,,,,,,,,,,,
421,CA,LOS ANGELES,56,113364.9,,,,,,,,,,,,,,,,
422,CA,LOS ANGELES,92,7565.602,,,,,,,,,,,,,,,,


In [9]:
# ---------- Merge ----------
key = ["state","county","naics2"]
merged = (abs_df.merge(qcew_df[[*key,"qcew_emp","qcew_wages_usd","qcew_avg_weekly_wage_usd","year"]],
                       on=key, how="left"))

In [None]:
# If using TRI:
# merged = merged.merge(tri_g, on=key, how="left")

In [10]:
# ---------- Derived cross-source metrics ----------
# Wage sanity: annualized wage per employee from QCEW vs ABS payroll/emp
merged["qcew_wage_per_emp_usd"] = np.where(merged["qcew_emp"]>0,
                                           merged["qcew_wages_usd"]/merged["qcew_emp"], np.nan)

merged["abs_wage_per_emp_usd"]  = np.where(merged["abs_emp"]>0,
                                           merged["abs_payroll_usd"]/merged["abs_emp"], np.nan)

merged["abs_receipts_per_firm_usd"] = np.where(merged["abs_firms"]>0,
                                               merged["abs_receipts_usd"]/merged["abs_firms"], np.nan)

In [None]:
# If TRI present: intensity metrics
# merged["tri_lbs_per_emp"]  = np.where(merged["abs_emp"]>0, merged["tri_releases_lbs"]/merged["abs_emp"], np.nan)
# merged["tri_lbs_per_firm"] = np.where(merged["abs_firms"]>0, merged["tri_releases_lbs"]/merged["abs_firms"], np.nan)


In [11]:
# ---------- Quality checks ----------
assert merged.duplicated(subset=key).sum() == 0
# Non-negatives where applicable
for c in ["abs_firms","abs_emp","abs_payroll_usd","abs_receipts_usd","qcew_emp","qcew_wages_usd"]:
    if c in merged.columns:
        assert (merged[c].dropna() >= 0).all()


In [12]:
# Save
merged.to_csv("portfolio_abs_qcew_ca_county_naics2.csv", index=False)
