# QCEW 2022 — county × NAICS2 prep (step‑through)

This notebook prepares **BLS QCEW** wages at the target grain and writes a tidy CSV ready to merge with ABS.

In [15]:
import pandas as pd, re
df = pd.read_csv("/Users/michaelwalker/RDM_Datalab/qcew_annual_raw_2022.csv", dtype=str)   # name may vary after unzip

In [16]:
# standardize columns (names vary slightly across vintages)
df = df.rename(columns={"annual_avg_wkly_wage":"avg_weekly_wage"})

In [17]:
df = df[df["area_fips"].str.len() == 5]                     # counties only
df["state_fips"]  = df["area_fips"].str[:2]
df["county_fips"] = df["area_fips"].str[2:]

In [18]:
df["naics2"] = df["industry_code"].str.extract(r"(\d+)").astype(str).str[:2]
df = df[df["naics2"].str.len() == 2]

AttributeError: 'DataFrame' object has no attribute 'str'

In [None]:
if "own_code" in df.columns: df = df[df["own_code"] == "0"] # Total covered
out = (df.groupby(["state_fips","county_fips","naics2","year"], as_index=False)
         .agg(annual_avg_emplvl=("annual_avg_emplvl","sum"),
              total_annual_wages=("total_annual_wages","sum")))
out["avg_weekly_wage"] = out["total_annual_wages"] / (out["annual_avg_emplvl"] * 52)
out.to_csv("qcew_county_naics2_2022_mgw.csv", index=False)

In [10]:

# Parameters — edit as needed
QCEW_RAW = "/Users/michaelwalker/RDM_Datalab/qcew_county_naics2_2022_mgw.csv"   # your downloaded QCEW annual CSV
YEAR = 2022
OUT = "qcew_county_naics2_2022.csv"


In [11]:

import pandas as pd
import numpy as np

def zfill_series(s, n):
    return s.astype(str).str.extract(r"(\d+)", expand=False).fillna("").str.zfill(n)

def normalize_qcew_columns(df):
    lower = {c.lower(): c for c in df.columns}
    def pick(*opts):
        for o in opts:
            if o in lower:
                return lower[o]
        return None
    area     = pick("area_fips", "area", "fips")
    ind      = pick("industry_code", "naics", "industry")
    year_col = pick("year")
    aemp     = pick("annual_avg_emplvl", "annual_avg_employment", "annualaverageemployment", "annual_avg_emplv")
    twages   = pick("total_annual_wages", "totalannualwages", "annual_total_wages", "tot_annual_wages")
    awage    = pick("annual_avg_wkly_wage", "avg_weekly_wage", "average_weekly_wage")
    own      = pick("own_code", "ownership", "own")
    need = [area, ind, year_col, aemp, twages, awage]
    if any(x is None for x in need):
        missing = [n for n,x in zip(["area_fips","industry_code","year","annual_avg_emplvl","total_annual_wages","avg_weekly_wage"], need) if x is None]
        raise ValueError(f"Missing required columns (or synonyms): {missing}")
    df = df.rename(columns={
        area: "area_fips",
        ind: "industry_code",
        year_col: "year",
        aemp: "annual_avg_emplvl",
        twages: "total_annual_wages",
        awage: "avg_weekly_wage",
    })
    if own:
        df = df.rename(columns={own: "own_code"})
    return df

def prepare_qcew_naics2(qdf, year=None, keep_own_code_zero=True):
    df = qdf.copy()
    if year is not None and "year" in df.columns:
        df = df[df["year"].astype(str) == str(year)]
    if keep_own_code_zero and "own_code" in df.columns:
        df = df[df["own_code"].astype(str) == "0"]
    df["area_fips"] = df["area_fips"].astype(str).str.strip()
    df = df[df["area_fips"].str.len() == 5]
    df["state_fips"]  = df["area_fips"].str[:2]
    df["county_fips"] = df["area_fips"].str[2:]
    df["naics2"] = df["industry_code"].astype(str).str.extract(r"(\\d+)", expand=False)
    df["naics2"] = df["naics2"].str[:2]
    df = df[df["naics2"].str.len() == 2]
    for c in ["annual_avg_emplvl","total_annual_wages","avg_weekly_wage"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce")
    grp = (df.groupby(["state_fips","county_fips","naics2","year"], as_index=False)
             .agg({
                 "annual_avg_emplvl": "sum",
                 "total_annual_wages": "sum"
             }))
    grp["avg_weekly_wage"] = np.where(
        grp["annual_avg_emplvl"] > 0,
        grp["total_annual_wages"] / (grp["annual_avg_emplvl"] * 52.0),
        np.nan
    )
    return grp[["state_fips","county_fips","naics2","year","annual_avg_emplvl","total_annual_wages","avg_weekly_wage"]]


In [12]:

raw = pd.read_csv(QCEW_RAW, dtype=str)
raw = normalize_qcew_columns(raw)
out = prepare_qcew_naics2(raw, year=YEAR, keep_own_code_zero=True)
print(f"Prepared {len(out):,} rows for {YEAR}.")
out.head(10)


Prepared 0 rows for 2022.


Unnamed: 0,state_fips,county_fips,naics2,year,annual_avg_emplvl,total_annual_wages,avg_weekly_wage


In [13]:

out.to_csv(OUT, index=False)
print("Wrote:", OUT)


Wrote: qcew_county_naics2_2022.csv
