
# EPA TRI 2022 → County × NAICS2 (Step‑through)

This notebook walks through aggregating EPA **Toxics Release Inventory (TRI)** facility‑level data to **county × NAICS2**, and (optionally) merging with your ABS and QCEW datasets for the RDM Datalab portfolio.

**Outputs**
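- `tri_2022_county_naics2_CA.csv` when the state filter is California only; otherwise `tri_2022_county_naics2_multi.csv`
- Optional merged portfolio file `portfolio_abs_qcew_tri_county_naics2.csv` with TRI intensity metrics (written only when an ABS and/or QCEW file is configured).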

> ⚠️ Releases are measured in **pounds (lbs)**. Keys are zero‑padded strings: `state` (2), `county` (3), `naics2` (2).


In [2]:

import pandas as pd
import numpy as np
from pathlib import Path

def zfill_series(s, n):
    """Return the first run of digits in each value of `s`, left-padded with zeros to width `n`.

    Values are converted to text before the digits are extracted. A value that
    contains no digits (including a missing value) becomes a string of `n` zeros.
    Digit runs longer than `n` are returned as-is; nothing is truncated.
    """
    as_text = s.astype(str)
    first_digits = as_text.str.extract(r"(\d+)", expand=False)
    # No digit run found -> empty string, which zfill then pads to all zeros.
    first_digits = first_digits.fillna("")
    return first_digits.str.zfill(n)


In [None]:

# === Configure your file paths ===
TRI_CSV = "tri_facilities_2022.csv"   # Replace with your local file path
ABS_CSV = None                        # e.g., "abs_ca_county_naics2.csv"
QCEW_CSV = None                       # e.g., "qcew_ca_county_naics2_2022.csv"
STATE_FILTER = ["06"]                 # Default: California only

# === Load TRI facility-level ===
# Every column is read as text so code-like fields (FIPS, NAICS) keep leading zeros;
# the release quantities are converted to numbers explicitly below.
tri = pd.read_csv(TRI_CSV, dtype=str, low_memory=False)

# Normalize total release lbs
# Use a ready-made total column when one exists (either spelling, plural first);
# otherwise add up every column whose name ends in _RELEASE_LBS.
total_col = next(
    (c for c in ("TOTAL_RELEASES_LBS", "TOTAL_RELEASE_LBS") if c in tri.columns),
    None,
)
if total_col is not None:
    tri[total_col] = pd.to_numeric(tri[total_col], errors="coerce")
    tri = tri.rename(columns={total_col: "tri_releases_lbs"})
else:
    release_cols = [c for c in tri.columns if c.upper().endswith("_RELEASE_LBS")]
    if not release_cols:
        raise ValueError("Could not locate TRI release columns. Expected TOTAL_RELEASES_LBS or *_RELEASE_LBS fields.")
    # Blank or unparseable components count as 0 so they do not turn the row total into NaN.
    tri[release_cols] = tri[release_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
    tri["tri_releases_lbs"] = tri[release_cols].sum(axis=1)

# Standardize keys
for col, width in [("FIPS_STATE", 2), ("FIPS_COUNTY", 3)]:
    if col not in tri.columns:
        raise ValueError(f"Expected column '{col}' not found in TRI file.")
    tri[col] = zfill_series(tri[col], width)

naics_col = next(
    (c for c in ("PRIMARY_NAICS", "PRIMARY_NAICS_CODE", "NAICS_CODE") if c in tri.columns),
    None,
)
if naics_col is None:
    raise ValueError("Could not find PRIMARY_NAICS / PRIMARY_NAICS_CODE / NAICS_CODE column in TRI file.")

tri["state"]  = tri["FIPS_STATE"]
tri["county"] = tri["FIPS_COUNTY"]
# Take the two leading digits of the NAICS code. Slicing the stringified value would turn a
# missing code into the junk key "na" (from "nan") and keep stray whitespace or letters;
# extracting digits leaves such rows as NaN instead.
tri["naics2"] = tri[naics_col].astype(str).str.extract(r"^\s*(\d{2})", expand=False)

n_missing_naics = int(tri["naics2"].isna().sum())
if n_missing_naics:
    # Be loud about it: pandas groupby excludes rows with a missing key by default.
    print(f"Warning: {n_missing_naics} TRI rows have no usable 2-digit NAICS prefix in "
          f"'{naics_col}'; their naics2 is left missing.")

tri = tri[["state","county","naics2","tri_releases_lbs"]]
tri.head()


In [None]:

# === Aggregate to county × NAICS2 ===
# An empty/None STATE_FILTER means "keep every state"; otherwise the codes are
# normalized to 2-character strings before filtering.
if not STATE_FILTER:
    tri_use = tri.copy()
else:
    STATE_FILTER = [str(code).zfill(2) for code in STATE_FILTER]
    in_scope = tri["state"].isin(STATE_FILTER)
    tri_use = tri.loc[in_scope].copy()

# One row per (state, county, naics2) holding the summed release pounds.
grouped = tri_use.groupby(["state","county","naics2"], as_index=False)
tri_g = grouped["tri_releases_lbs"].sum()

# QA checks
# Releases must be non-negative wherever present, and the key must be unique.
present_totals = tri_g["tri_releases_lbs"].dropna()
assert (present_totals >= 0).all()
assert tri_g.duplicated(subset=["state","county","naics2"]).sum() == 0

tri_g.head()


In [None]:

# === Save aggregated TRI CSV ===
# The "_CA" name is used only for the California-only filter; any other filter
# (including none) writes the "_multi" file.
if STATE_FILTER == ["06"]:
    out_tri = Path("tri_2022_county_naics2_CA.csv")
else:
    out_tri = Path("tri_2022_county_naics2_multi.csv")

tri_g.to_csv(out_tri, index=False)
out_tri, len(tri_g)


In [None]:

# === Optional: merge with ABS and QCEW to compute intensity metrics ===
merge_keys = ["state", "county", "naics2"]


def _pick_column(df, candidates, source):
    """Return the first name in `candidates` that is a column of `df`.

    Raises ValueError naming `source` and the candidates when none is present,
    instead of letting a missing column surface later as an AttributeError on None.
    """
    for name in candidates:
        if name in df.columns:
            return name
    raise ValueError(f"{source}: expected one of the columns {list(candidates)}, found none.")


def _naics2(codes):
    """Two leading digits of each code as text; NaN when a value does not start with two digits.

    Slicing the stringified value instead would map a missing code to the junk key "na".
    """
    return codes.astype(str).str.extract(r"^\s*(\d{2})", expand=False)


abs_df = None
qcew_df = None

if ABS_CSV:
    abs_df = pd.read_csv(ABS_CSV, dtype=str)
    abs_df["state"]  = zfill_series(abs_df[_pick_column(abs_df, ["state", "STATEFP"], "ABS file")], 2)
    abs_df["county"] = zfill_series(abs_df[_pick_column(abs_df, ["county", "COUNTYFP"], "ABS file")], 3)
    abs_df["naics2"] = _naics2(abs_df[_pick_column(abs_df, ["NAICS2022"], "ABS file")])
    # Rows without a usable sector code can never match a TRI key; drop them up front.
    abs_df = abs_df.dropna(subset=["naics2"])
    # Scale the raw PAYANN / RCPPDEMP columns by 1000 before renaming them to *_usd
    # (presumably the source reports them in $1,000 — confirm against the ABS extract).
    for k in ["PAYANN", "RCPPDEMP"]:
        if k in abs_df.columns:
            abs_df[k] = pd.to_numeric(abs_df[k], errors="coerce") * 1000
    abs_df = abs_df.rename(columns={
        "FIRMPDEMP":"abs_firms",
        "EMP":"abs_emp",
        "PAYANN":"abs_payroll_usd",
        "RCPPDEMP":"abs_receipts_usd"
    })
    # Keep only the measures that actually exist, so an extract lacking one of them
    # does not fail on the column selection below.
    abs_measures = [c for c in ["abs_firms","abs_emp","abs_payroll_usd","abs_receipts_usd"]
                    if c in abs_df.columns]
    for c in abs_measures:
        abs_df[c] = pd.to_numeric(abs_df[c], errors="coerce")
    abs_df = abs_df[merge_keys + abs_measures].drop_duplicates()

if QCEW_CSV:
    qcew_df = pd.read_csv(QCEW_CSV, dtype=str)
    qcew_df["state"]  = zfill_series(qcew_df[_pick_column(qcew_df, ["state_fips"], "QCEW file")], 2)
    qcew_df["county"] = zfill_series(qcew_df[_pick_column(qcew_df, ["county_fips"], "QCEW file")], 3)
    qcew_df["naics2"] = _naics2(qcew_df[_pick_column(qcew_df, ["naics"], "QCEW file")])
    qcew_df = qcew_df.dropna(subset=["naics2"])
    qcew_df = qcew_df.rename(columns={
        "annual_avg_emplvl":"qcew_emp",
        "total_annual_wages":"qcew_wages_usd",
        "avg_weekly_wage":"qcew_avg_weekly_wage_usd"
    })
    qcew_measures = [c for c in ["qcew_emp","qcew_wages_usd","qcew_avg_weekly_wage_usd"]
                     if c in qcew_df.columns]
    for c in qcew_measures:
        qcew_df[c] = pd.to_numeric(qcew_df[c], errors="coerce")
    qcew_extra = ["year"] if "year" in qcew_df.columns else []
    qcew_df = qcew_df[merge_keys + qcew_measures + qcew_extra].drop_duplicates()

# Left joins keep every TRI row. Each join is checked immediately so that a source with
# several distinct rows per key (which would fan out the TRI rows) is identified by name.
merged = tri_g.copy()
if abs_df is not None:
    merged = merged.merge(abs_df, on=merge_keys, how="left")
    assert not merged.duplicated(subset=merge_keys).any(), (
        "ABS file has more than one distinct row for some state/county/naics2 key; "
        "reduce it to one row per key before merging."
    )
if qcew_df is not None:
    merged = merged.merge(qcew_df, on=merge_keys, how="left")
    assert not merged.duplicated(subset=merge_keys).any(), (
        "QCEW file has more than one distinct row for some state/county/naics2 key; "
        "reduce it to one row per key before merging."
    )

# Intensity metrics are NaN wherever the denominator is missing or not positive.
if "qcew_emp" in merged.columns:
    merged["tri_lbs_per_emp"]  = np.where(merged["qcew_emp"].fillna(0) > 0,
                                          merged["tri_releases_lbs"]/merged["qcew_emp"], np.nan)
if "abs_firms" in merged.columns:
    merged["tri_lbs_per_firm"] = np.where(merged["abs_firms"].fillna(0) > 0,
                                          merged["tri_releases_lbs"]/merged["abs_firms"], np.nan)

assert merged.duplicated(subset=merge_keys).sum() == 0
merged.head()


In [None]:

# === Save merged (if ABS/QCEW provided) ===
if ABS_CSV or QCEW_CSV:
    out_merged = Path("portfolio_abs_qcew_tri_county_naics2.csv")
    merged.to_csv(out_merged, index=False)
    # A bare expression nested inside an `if` is not echoed as cell output,
    # so report the path and row count explicitly.
    print(f"Wrote {out_merged} ({len(merged)} rows)")
else:
    print("No ABS/QCEW provided; skipping merged output.")
