
# QCEW 2022 — county × NAICS2 prep (step‑through, **patched**)
This version fixes the `AttributeError: 'DataFrame' object has no attribute 'str'` by
deriving `naics2` with a robust method (strip non‑digits, then take the first two digits).


In [3]:

# Parameters — edit as needed
QCEW_RAW = "/Users/michaelwalker/RDM_Datalab/qcew_annual_raw_2022.csv"  # or your QCEW raw CSV
YEAR = 2022
OUT = "qcew_county_naics2_2022.csv"


In [4]:

import pandas as pd
import numpy as np

def zfill_series(s, n):
    return s.astype(str).str.extract(r"(\d+)", expand=False).fillna("").str.zfill(n)

def normalize_qcew_columns(df):
    # De-duplicate any repeated headers first
    if df.columns.duplicated().any():
        df = df.loc[:, ~df.columns.duplicated()].copy()
    lower = {c.lower(): c for c in df.columns}
    def pick(*opts):
        for o in opts:
            if o in lower:
                return lower[o]
        return None
    area     = pick("area_fips", "area", "fips")
    ind      = pick("industry_code", "naics", "industry")
    year_col = pick("year")
    aemp     = pick("annual_avg_emplvl", "annual_avg_employment", "annualaverageemployment", "annual_avg_emplv")
    twages   = pick("total_annual_wages", "totalannualwages", "annual_total_wages", "tot_annual_wages")
    awage    = pick("avg_wkly_wage", "avg_weekly_wage", "average_weekly_wage", "annual_avg_wkly_wage")
    own      = pick("own_code", "ownership", "own")
    need = [area, ind, year_col, aemp, twages, awage]
    if any(x is None for x in need):
        missing = [n for n,x in zip(["area_fips","industry_code","year","annual_avg_emplvl","total_annual_wages","avg_weekly_wage"], need) if x is None]
        raise ValueError(f"Missing required columns (or synonyms): {missing}")
    df = df.rename(columns={
        area: "area_fips",
        ind: "industry_code",
        year_col: "year",
        aemp: "annual_avg_emplvl",
        twages: "total_annual_wages",
        awage: "avg_weekly_wage",
    })
    if own:
        df = df.rename(columns={own: "own_code"})
    return df

def prepare_qcew_naics2(qdf, year=None, keep_own_code_zero=True):
    df = qdf.copy()

    # Filter year if present
    if year is not None and "year" in df.columns:
        df = df[df["year"].astype(str) == str(year)]

    # Filter to ownership code '0' (Total covered), if present
    if keep_own_code_zero and "own_code" in df.columns:
        df = df[df["own_code"].astype(str) == "0"]

    # Keep only county-level rows: area_fips length == 5
    df["area_fips"] = df["area_fips"].astype(str).str.strip()
    df = df[df["area_fips"].str.len() == 5].copy()
    df["state_fips"]  = df["area_fips"].str[:2]
    df["county_fips"] = df["area_fips"].str[2:]

    # Robust NAICS2 extraction
    df["industry_code"] = df["industry_code"].astype(str)
    df["naics2"] = df["industry_code"].str.replace(r"\D", "", regex=True).str[:2]
    df = df[df["naics2"].str.fullmatch(r"\d{2}")].copy()

    # Cast numerics
    for c in ["annual_avg_emplvl","total_annual_wages","avg_weekly_wage"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce")

    # Aggregate to unique county×naics2×year (sum employment & wages)
    grp = (df.groupby(["state_fips","county_fips","naics2","year"], as_index=False)
             .agg({
                 "annual_avg_emplvl": "sum",
                 "total_annual_wages": "sum"
             }))

    # Recompute avg weekly wage = total_wages / (employment * 52)
    grp["avg_weekly_wage"] = np.where(
        grp["annual_avg_emplvl"] > 0,
        grp["total_annual_wages"] / (grp["annual_avg_emplvl"] * 52.0),
        np.nan
    )

    # Final order
    out = grp[["state_fips","county_fips","naics2","year","annual_avg_emplvl","total_annual_wages","avg_weekly_wage"]]

    # QA checks
    assert out.duplicated(subset=["state_fips","county_fips","naics2","year"]).sum() == 0, "Duplicate keys after aggregation."
    for c in ["annual_avg_emplvl","total_annual_wages"]:
        assert (out[c].dropna() >= 0).all(), f"Negative values found in {c}."
    return out


In [5]:

# Load raw QCEW CSV
raw = pd.read_csv(QCEW_RAW, dtype=str)

# Normalize columns (handles common synonym names)
raw = normalize_qcew_columns(raw)

# Prepare county × NAICS2 × year
out = prepare_qcew_naics2(raw, year=YEAR, keep_own_code_zero=True)
print(f"Prepared {len(out):,} rows for {YEAR}.")
out.head(10)


Prepared 4,429 rows for 2022.


Unnamed: 0,state_fips,county_fips,naics2,year,annual_avg_emplvl,total_annual_wages,avg_weekly_wage
0,1,0,10,2022,2026102,116114501322,1102.102056
1,1,1,10,2022,11535,537282230,895.739105
2,1,3,10,2022,79972,3760240355,904.220409
3,1,5,10,2022,7823,334008300,821.070758
4,1,7,10,2022,4836,234460788,932.353455
5,1,9,10,2022,8726,374288938,824.875566
6,1,11,10,2022,2713,123191264,873.226233
7,1,13,10,2022,6633,281180290,815.21382
8,1,15,10,2022,43846,2025016432,888.168218
9,1,17,10,2022,8533,388077578,874.608033


In [6]:

out.to_csv(OUT, index=False)
print("Wrote:", OUT)


Wrote: qcew_county_naics2_2022.csv


## QA on QCEW output file

In [9]:
import pandas as pd
import numpy as np

# === load & standardize ===
q = pd.read_csv("qcew_county_naics2_2022.csv", dtype=str)
for c in ["state_fips","county_fips","naics2"]:
    q[c] = q[c].str.zfill(2 if c=="state_fips" else (3 if c=="county_fips" else 2))

In [10]:
# cast numerics
for c in ["annual_avg_emplvl","total_annual_wages","avg_weekly_wage","year"]:
    q[c] = pd.to_numeric(q[c], errors="coerce")

print("Rows:", len(q))  # should show 4429



Rows: 4429


In [11]:
# === key uniqueness ===
key = ["state_fips","county_fips","naics2","year"]
dups = q.duplicated(subset=key).sum()
print("Duplicate key rows:", dups)
assert dups == 0, "Unexpected duplicate keys!"



Duplicate key rows: 0


In [13]:
# === basic sanity checks ===
for c in ["annual_avg_emplvl","total_annual_wages","avg_weekly_wage"]:
    assert (q[c].dropna() >= 0).all(), f"Negative values in {c}"

In [14]:
# quick coverage checks
print("NAICS2 counts (top 10):")
display(q["naics2"].value_counts().head(10))
print("State coverage (top 10):")
display(q["state_fips"].value_counts().head(10))

NAICS2 counts (top 10):


naics2
10    4429
Name: count, dtype: int64

State coverage (top 10):


state_fips
48    256
C3    239
C2    236
C1    234
C4    220
CS    169
13    161
51    135
21    122
29    117
Name: count, dtype: int64