# QCEW 2022 — County × NAICS Sector prep (step‑through)

In [None]:

# Parameters — edit as needed
# YEAR drives both default paths below, so changing it keeps the input file,
# the year filter passed to prep_sector, and the output name in step.
# Overwrite QCEW_RAW / OUT directly if your files live somewhere else.
YEAR = 2022
QCEW_RAW = f"data/raw/{YEAR}_annual_singlefile.csv"   # path inside this repo layout
OUT = f"data/processed/qcew_county_naics_sector_{YEAR}.csv"


In [None]:

import pandas as pd
import numpy as np

# Industry codes accepted as NAICS sectors by prep_sector; rows whose
# industry_code is not in this set are dropped. Hyphenated entries are
# kept verbatim because they are matched as exact strings.
VALID_SECTORS = set(
    (
        "11 21 22 23 31-33 42 44-45 48-49 51 52 "
        "53 54 55 56 61 62 71 72 81 92"
    ).split()
)

def normalize_qcew_columns(df):
    """Return *df* with QCEW columns renamed to the canonical names used here.

    Headers are matched against a list of known synonyms, ignoring case and
    surrounding whitespace. If a header occurs more than once, only its first
    occurrence is kept.

    Args:
        df: DataFrame holding raw QCEW rows.

    Returns:
        A new DataFrame (the input is not modified) in which the matched
        columns are named ``area_fips``, ``industry_code``, ``year``,
        ``annual_avg_emplvl``, ``total_annual_wages``, ``avg_weekly_wage``
        and ``agglvl_code``. ``own_code`` and ``qtr`` are renamed too when a
        synonym for them is present. All other columns pass through as-is.

    Raises:
        ValueError: if any of the seven required columns has no match.
    """
    if df.columns.duplicated().any():
        df = df.loc[:, ~df.columns.duplicated()].copy()

    # Normalized header -> actual header. str() guards against non-string
    # labels; strip() tolerates padded headers such as " area_fips ".
    by_key = {str(c).strip().lower(): c for c in df.columns}

    def pick(*opts):
        """Return the actual header for the first synonym present, else None."""
        return next((by_key[o] for o in opts if o in by_key), None)

    # Canonical name -> accepted synonyms, in order of preference.
    required = {
        "area_fips": ("area_fips", "area", "fips"),
        "industry_code": ("industry_code", "naics", "industry"),
        "year": ("year",),
        "annual_avg_emplvl": (
            "annual_avg_emplvl", "annual_avg_employment",
            "annualaverageemployment", "annual_avg_emplv",
        ),
        "total_annual_wages": (
            "total_annual_wages", "totalannualwages",
            "annual_total_wages", "tot_annual_wages",
        ),
        "avg_weekly_wage": (
            "avg_wkly_wage", "avg_weekly_wage",
            "average_weekly_wage", "annual_avg_wkly_wage",
        ),
        "agglvl_code": ("agglvl_code", "agglevel_code", "aggregation_level"),
    }
    optional = {
        "own_code": ("own_code", "ownership", "own"),
        "qtr": ("qtr", "quarter"),
    }

    found = {canon: pick(*syns) for canon, syns in required.items()}
    missing = [canon for canon, col in found.items() if col is None]
    if missing:
        raise ValueError(f"Missing required columns (or synonyms): {missing}")

    for canon, syns in optional.items():
        col = pick(*syns)
        if col is not None:
            found[canon] = col

    # One rename covers required and optional columns alike.
    return df.rename(columns={col: canon for canon, col in found.items()})

def prep_sector(qdf, year=None, prefer_private_if_total_missing=True):
    """Aggregate normalized QCEW rows to one row per county and NAICS sector.

    Expects the canonical column names produced by normalize_qcew_columns.

    Steps, in order:
      1. Keep rows for *year* (when given and a ``year`` column exists).
      2. Keep rows with ``qtr == "A"`` (when a ``qtr`` column exists).
      3. Keep rows with ``agglvl_code == "74"``.
         NOTE(review): 74 is presumably the county-by-NAICS-sector level in
         the BLS aggregation-level coding; confirm against the QCEW docs.
      4. If ``own_code`` exists: keep code "0" when any such row remains,
         otherwise code "5" when *prefer_private_if_total_missing* is true
         and such rows exist; otherwise keep every ownership code.
      5. Keep 5-character ``area_fips`` values and sectors in VALID_SECTORS.
      6. Sum employment and wages per group and derive the weekly wage.

    Args:
        qdf: DataFrame with canonical QCEW columns. It is not modified.
        year: Year to keep, compared as a string; ``None`` keeps all years.
        prefer_private_if_total_missing: Fall back to ``own_code == "5"``
            when no ``own_code == "0"`` rows survive the earlier filters.

    Returns:
        DataFrame with ``state_fips``, ``county_fips``, ``naics_sector``,
        ``year`` (only when the input has that column), the summed
        ``annual_avg_emplvl`` and ``total_annual_wages``, and
        ``avg_weekly_wage`` computed as wages / (employment * 52), which is
        NaN where employment is not positive.
    """
    # Filter first and copy afterwards: this avoids duplicating the whole raw
    # table and guarantees the column assignments below hit an owned frame
    # rather than a slice of the caller's data.
    df = qdf
    if year is not None and "year" in df.columns:
        df = df[df["year"].astype(str) == str(year)]
    if "qtr" in df.columns:
        df = df[df["qtr"].astype(str).str.upper().eq("A")]
    df = df[df["agglvl_code"].astype(str) == "74"]
    if "own_code" in df.columns:
        own = df["own_code"].astype(str)
        codes = set(own.unique())
        if "0" in codes:
            df = df[own == "0"]
        elif prefer_private_if_total_missing and "5" in codes:
            df = df[own == "5"]
    df = df.copy()

    df["area_fips"] = df["area_fips"].astype(str).str.strip()
    df = df[df["area_fips"].str.len() == 5].copy()
    df["state_fips"] = df["area_fips"].str[:2]
    df["county_fips"] = df["area_fips"].str[2:]

    # Strip like area_fips so padded codes still match VALID_SECTORS.
    df["industry_code"] = df["industry_code"].astype(str).str.strip()
    df = df[df["industry_code"].isin(VALID_SECTORS)].copy()
    df["naics_sector"] = df["industry_code"]

    # Unparseable values become NaN, which the sums below skip.
    for c in ["annual_avg_emplvl", "total_annual_wages", "avg_weekly_wage"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    keys = ["state_fips", "county_fips", "naics_sector"]
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce")
        # Group by year only when it exists; grouping on a missing column
        # would raise KeyError.
        keys.append("year")

    out = (df.groupby(keys, as_index=False)
             .agg({"annual_avg_emplvl": "sum", "total_annual_wages": "sum"}))
    # Recompute the weekly wage from the summed totals instead of averaging
    # the per-row values.
    out["avg_weekly_wage"] = np.where(
        out["annual_avg_emplvl"] > 0,
        out["total_annual_wages"] / (out["annual_avg_emplvl"] * 52.0),
        np.nan,
    )
    return out


In [None]:

# Read every column as text (prep_sector compares the code columns as
# strings), then map the headers to the canonical names.
raw = normalize_qcew_columns(pd.read_csv(QCEW_RAW, dtype=str))

# Filter to the requested year and collapse to county x sector rows.
out = prep_sector(raw, year=YEAR, prefer_private_if_total_missing=True)

print("Rows:", len(out))
out.head(10)


In [None]:

# to_csv does not create missing folders, so make sure the destination
# directory exists before writing.
from pathlib import Path

Path(OUT).parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT, index=False)
print("Wrote:", OUT)
