In [96]:
import pandas as pd
import numpy as np

In [100]:
#df = pd.read_csv("/Users/michaelwalker/RDM_Datalab/US_1a_2022.txt", sep='\t', encoding="latin1", 
#                 engine="python",
#    dtype=str,
#    on_bad_lines="skip"     )

In [135]:
import pandas as pd, csv, io

path = "/Users/michaelwalker/RDM_Datalab/US_1A_2022.txt"  # your 1A file

In [138]:
# 1) Read the header line exactly as-is, normalize it, and get expected width
with open(path, "r", encoding="latin1", newline="") as f:
    raw_header = f.readline().lstrip("\ufeff").rstrip("\r\n")  # strip BOM + newline
header_cols = raw_header.split("\t")
print(header_cols)
len(header_cols)

['1. FORM TYPE', '2. REPORTING YEAR', '3. TRADE SECRET INDICATOR', '4. SANITIZED INDICATOR', '5. TITLE OF CERTIFYING OFFICIAL', '6. NAME OF CERTIFYING OFFICIAL', '7. CERTIFYING OFFICIAL S SIGNATURE INDICATOR', '8. DATE SIGNED', '9. TRIFD', '10. FACILITY NAME', '11. FACILITY STREET', '12. FACILITY CITY', '13. FACILITY COUNTY', '14. FACILITY STATE', '15. FACILITY ZIP CODE', '16. BIA CODE', '17. TRIBE NAME', '18. MAILING NAME', '19. MAILING STREET', '20. MAILING CITY', '21. MAILING STATE', '22. MAILING PROVINCE', '23. MAILING ZIP CODE', '24. ENTIRE FACILITY IND', '25. PARTIAL FACILITY IND', '26. FEDERAL FACILITY IND', '27. GOCO FACILITY IND', '28. ASSIGNED FED FACILITY FLAG', '29. ASSIGNED PARTIAL FACILITY FLAG', '30. PUBLIC CONTACT NAME', '31. PUBLIC CONTACT PHONE', '32. PUBLIC CONTACT PHONE EXT', '33. PUBLIC CONTACT EMAIL', '34. PRIMARY SIC CODE', '35. SIC CODE 2', '36. SIC CODE 3', '37. SIC CODE 4', '38. SIC CODE 5', '39. SIC CODE 6', '40. NAICS ORIGIN', '41. PRIMARY NAICS CODE', '42. 

282

In [137]:
# Some bundles add a trailing run-stamp like '20250917' or an empty last column:
if header_cols and (header_cols[-1]=="" or header_cols[-1].isdigit()):
    header_cols = header_cols[:-1]

N = len(header_cols)
print(len(header_cols))
print (header_cols)

281
['1. FORM TYPE', '2. REPORTING YEAR', '3. TRADE SECRET INDICATOR', '4. SANITIZED INDICATOR', '5. TITLE OF CERTIFYING OFFICIAL', '6. NAME OF CERTIFYING OFFICIAL', '7. CERTIFYING OFFICIAL S SIGNATURE INDICATOR', '8. DATE SIGNED', '9. TRIFD', '10. FACILITY NAME', '11. FACILITY STREET', '12. FACILITY CITY', '13. FACILITY COUNTY', '14. FACILITY STATE', '15. FACILITY ZIP CODE', '16. BIA CODE', '17. TRIBE NAME', '18. MAILING NAME', '19. MAILING STREET', '20. MAILING CITY', '21. MAILING STATE', '22. MAILING PROVINCE', '23. MAILING ZIP CODE', '24. ENTIRE FACILITY IND', '25. PARTIAL FACILITY IND', '26. FEDERAL FACILITY IND', '27. GOCO FACILITY IND', '28. ASSIGNED FED FACILITY FLAG', '29. ASSIGNED PARTIAL FACILITY FLAG', '30. PUBLIC CONTACT NAME', '31. PUBLIC CONTACT PHONE', '32. PUBLIC CONTACT PHONE EXT', '33. PUBLIC CONTACT EMAIL', '34. PRIMARY SIC CODE', '35. SIC CODE 2', '36. SIC CODE 3', '37. SIC CODE 4', '38. SIC CODE 5', '39. SIC CODE 6', '40. NAICS ORIGIN', '41. PRIMARY NAICS CODE', '

In [139]:
# 2) Now read the rest with those names (no automatic header)
df = pd.read_csv(
    path,
    sep="\t",
    encoding="latin1",
    engine="python",
    header=None,
    names=header_cols,
    skiprows=1,
    dtype=str,
    quoting=csv.QUOTE_NONE,   # be strict about tabs
    on_bad_lines="skip"       # skip malformed data rows
)

In [140]:
# Optional: drop any phantom trailing column that slipped in anyway
df = df.loc[:, ~df.columns.astype(str).str.fullmatch(r"Unnamed:.*|^$")]

print(len(df.columns), "columns")
print(df.columns[:8].tolist())
print(df.head(1).T.head(6))  # sanity: '1. FORM TYPE' should be 'R' or 'A'; '2. REPORTING YEAR' should be 2022/2023

282 columns
['1. FORM TYPE', '2. REPORTING YEAR', '3. TRADE SECRET INDICATOR', '4. SANITIZED INDICATOR', '5. TITLE OF CERTIFYING OFFICIAL', '6. NAME OF CERTIFYING OFFICIAL', '7. CERTIFYING OFFICIAL S SIGNATURE INDICATOR', '8. DATE SIGNED']
                                                R
1. FORM TYPE                                 2022
2. REPORTING YEAR                              NO
3. TRADE SECRET INDICATOR                      NO
4. SANITIZED INDICATOR           VP-MANUFACTURING
5. TITLE OF CERTIFYING OFFICIAL      JOHN NUNDAHL
6. NAME OF CERTIFYING OFFICIAL         ELECTRONIC


In [141]:
import csv, io, re
import pandas as pd
from pathlib import Path

path = Path("US_1A_2022.txt")   # <-- your 1A file

# --- 1) Read header exactly as-is (strip BOM, kill stamp/empty tail) ---
with open(path, "r", encoding="latin1", newline="") as f:
    raw_header = f.readline().lstrip("\ufeff").rstrip("\r\n")
header_cols = raw_header.split("\t")
# Some drops include a trailing run-stamp like '20250917' or a blank last field
if header_cols and (header_cols[-1] == "" or header_cols[-1].isdigit()):
    header_cols = header_cols[:-1]
N = len(header_cols)

# --- 2) Robust row normalizer: enforce exactly N fields per row ---
def normalize_row(fields, N):
    """Return a list of exactly N fields: pad if short; glue extras into the last field."""
    if len(fields) < N:
        return fields + [""]*(N - len(fields))
    if len(fields) > N:
        fields[N-1] = fields[N-1] + "\t" + "\t".join(fields[N:])
        return fields[:N]
    return fields

# --- 3) Stream the file with csv (tab-delim, standard quoting), normalize widths ---
rows = []
with open(path, "r", encoding="latin1", newline="") as fh:
    next(fh)  # skip header line (we already captured it)
    reader = csv.reader(fh, delimiter="\t", quotechar='"', doublequote=True, escapechar="\\")
    for r in reader:
        rows.append(normalize_row(r, N))

df = pd.DataFrame(rows, columns=header_cols)

# --- 4) Clean phantom columns, if any slipped in ---
df = df.loc[:, ~df.columns.astype(str).str.fullmatch(r"Unnamed:.*|^$")]

# --- 5) Sanity checks (these should pass) ---
print(len(df.columns), "columns (expected N=", N, ")")
print(df.columns[:8].tolist())
print(df.head(1)[["1. FORM TYPE", "2. REPORTING YEAR"]])

# Expect form type to be R or A; year like 2022/2023:
print("Form type ok:", df["1. FORM TYPE"].str.upper().isin(["R","A"]).mean())
print("Year ok:", df["2. REPORTING YEAR"].str.fullmatch(r"\d{4}").mean())

281 columns (expected N= 281 )
['1. FORM TYPE', '2. REPORTING YEAR', '3. TRADE SECRET INDICATOR', '4. SANITIZED INDICATOR', '5. TITLE OF CERTIFYING OFFICIAL', '6. NAME OF CERTIFYING OFFICIAL', '7. CERTIFYING OFFICIAL S SIGNATURE INDICATOR', '8. DATE SIGNED']
  1. FORM TYPE 2. REPORTING YEAR
0            R              2022
Form type ok: 1.0
Year ok: 1.0


In [142]:
print(df.iloc[0])

1. FORM TYPE                                                R
2. REPORTING YEAR                                        2022
3. TRADE SECRET INDICATOR                                  NO
4. SANITIZED INDICATOR                                     NO
5. TITLE OF CERTIFYING OFFICIAL              VP-MANUFACTURING
                                                   ...       
277. ON-SITE RECYCLING PROCESSES METHOD 3                    
278. ON-SITE RECYCLING PROCESSES METHOD 4                    
279. ON-SITE RECYCLING PROCESSES METHOD 5                    
280. ON-SITE RECYCLING PROCESSES METHOD 6                    
281. ON-SITE RECYCLING PROCESSES METHOD 7               \t \t
Name: 0, Length: 281, dtype: object


In [143]:
df.head(10)


Unnamed: 0,1. FORM TYPE,2. REPORTING YEAR,3. TRADE SECRET INDICATOR,4. SANITIZED INDICATOR,5. TITLE OF CERTIFYING OFFICIAL,6. NAME OF CERTIFYING OFFICIAL,7. CERTIFYING OFFICIAL S SIGNATURE INDICATOR,8. DATE SIGNED,9. TRIFD,10. FACILITY NAME,...,272. ON-SITE ENERGY RECOVERY METHOD 2,273. ON-SITE ENERGY RECOVERY METHOD 3,274. ON-SITE ENERGY RECOVERY METHOD 4,275. ON-SITE RECYCLING PROCESSES METHOD 1,276. ON-SITE RECYCLING PROCESSES METHOD 2,277. ON-SITE RECYCLING PROCESSES METHOD 3,278. ON-SITE RECYCLING PROCESSES METHOD 4,279. ON-SITE RECYCLING PROCESSES METHOD 5,280. ON-SITE RECYCLING PROCESSES METHOD 6,281. ON-SITE RECYCLING PROCESSES METHOD 7
0,R,2022,NO,NO,VP-MANUFACTURING,JOHN NUNDAHL,ELECTRONIC,2024-04-25,54307FRTHW1919S,GEORGIA-PACIFIC BROADWAY LLC,...,,,,,,,,,,\t \t
1,R,2022,NO,NO,EHS MANAGER,NEEL PATEL,ELECTRONIC,2024-05-09,08861NGLRT1200A,ENGLERT INC,...,,,,,,,,,,\t \t
2,R,2022,NO,NO,PLANT MANAGER,TIM MENKE,ELECTRONIC,2024-06-13,46135LNSTRPUTNA,BUZZI UNICEM USA-GREENCASTLE PLANT,...,,,,,,,,,,\t \t
3,R,2022,NO,NO,CHIEF ENGINEER,STEPHEN STAUDINGER,ELECTRONIC,2024-06-26,53207MLWKF1532E,MILWAUKEE FORGE LLC,...,,,,,,,,,,\t \t
4,R,2022,YES,YES,PLANT MANAGER,RENE NERON,ORIGINAL,2024-04-25,36505MTCHMHWY43,ARKEMA INC,...,,,,H20,,,,,,\t \t
5,R,2022,NO,NO,EHS ENGINEER,MARK MEURETTE,ELECTRONIC,2024-07-15,54401MXXXX144RO,3M CO - WAUSAU DOWNTOWN,...,,,,,,,,,,\t \t
6,R,2022,NO,NO,EHS DIRECTOR,JUSTIN TETLOW,ELECTRONIC,2024-07-22,20794WMTBR2112M,WM. T. BURNETT & CO.,...,,,,,,,,,,\t \t
7,R,2022,NO,NO,HSE SPECIALIST,YANETH HUERTA,ELECTRONIC,2024-08-01,77015MRFRG1377I,AFGLOBAL CORP,...,,,,,,,,,,\t \t
8,R,2022,NO,NO,VICE PRESIDENT OF MANUFACTURING,RODNEY DILLON,ELECTRONIC,2024-09-03,78410KCHRFSUNTI,FLINT HILLS RESOURCES CORPUS CHRISTI LLC - WES...,...,U03,,,,,,,,,\t \t
9,R,2022,NO,NO,CORPORATE ENGINEERING & EH&S MANAGER,PAUL DONNDELINGER,ELECTRONIC,2023-03-21,02860CLYNC50EST,COOLEY INC,...,,,,,,,,,,\t \t


In [144]:
# 2. Verify it’s really a DataFrame and see first few column names
print("Rows:", len(df))
print("Columns:", len(df.columns))
print(df.columns[:282].tolist())  # sample 20 columns

Rows: 80003
Columns: 281
['1. FORM TYPE', '2. REPORTING YEAR', '3. TRADE SECRET INDICATOR', '4. SANITIZED INDICATOR', '5. TITLE OF CERTIFYING OFFICIAL', '6. NAME OF CERTIFYING OFFICIAL', '7. CERTIFYING OFFICIAL S SIGNATURE INDICATOR', '8. DATE SIGNED', '9. TRIFD', '10. FACILITY NAME', '11. FACILITY STREET', '12. FACILITY CITY', '13. FACILITY COUNTY', '14. FACILITY STATE', '15. FACILITY ZIP CODE', '16. BIA CODE', '17. TRIBE NAME', '18. MAILING NAME', '19. MAILING STREET', '20. MAILING CITY', '21. MAILING STATE', '22. MAILING PROVINCE', '23. MAILING ZIP CODE', '24. ENTIRE FACILITY IND', '25. PARTIAL FACILITY IND', '26. FEDERAL FACILITY IND', '27. GOCO FACILITY IND', '28. ASSIGNED FED FACILITY FLAG', '29. ASSIGNED PARTIAL FACILITY FLAG', '30. PUBLIC CONTACT NAME', '31. PUBLIC CONTACT PHONE', '32. PUBLIC CONTACT PHONE EXT', '33. PUBLIC CONTACT EMAIL', '34. PRIMARY SIC CODE', '35. SIC CODE 2', '36. SIC CODE 3', '37. SIC CODE 4', '38. SIC CODE 5', '39. SIC CODE 6', '40. NAICS ORIGIN', '41. P

In [145]:
df["naics2"] = df["41. PRIMARY NAICS CODE"].astype(str).str.extract(r"(\d+)", expand=False).str[:2]
df["state"]  = df["14. FACILITY STATE"].astype(str).str.zfill(2)   # if present
df["county"] = df["13. FACILITY COUNTY"].astype(str).str.zfill(3)  # if present

In [146]:
# now df.columns is a valid Index object
rel_cols = [c for c in df.columns if any(k in c.lower()
                for k in ["release", "transfer", "to air", "to water", "to land", "off-site"])]


print("Found", len(rel_cols), "possible release/transfer columns:")
for c in rel_cols:
    print(" •", c)


Found 85 possible release/transfer columns:
 • 110. FUGITIVE AIR EMISSIONS - TOTAL RELEASE POUNDS
 • 111. FUGITIVE AIR EMISSIONS - TOTAL RELEASE RANGE CODE
 • 114. STACK AIR EMISSIONS - RELEASE POUNDS
 • 115. STACK AIR EMISSIONS - RELEASE RANGE CODE
 • 120. DISCHARGES TO STREAM A - RELEASE POUNDS
 • 121. DISCHARGES TO STREAM A - RELEASE RANGE CODE
 • 126. DISCHARGES TO STREAM B - RELEASE POUNDS
 • 127. DISCHARGES TO STREAM B - RELEASE RANGE CODE
 • 132. DISCHARGES TO STREAM C - RELEASE POUNDS
 • 133. DISCHARGES TO STREAM C - RELEASE RANGE CODE
 • 138. DISCHARGES TO STREAM D - RELEASE POUNDS
 • 139. DISCHARGES TO STREAM D - RELEASE RANGE CODE
 • 144. DISCHARGES TO STREAM E - RELEASE POUNDS
 • 145. DISCHARGES TO STREAM E - RELEASE RANGE CODE
 • 150. DISCHARGES TO STREAM F - RELEASE POUNDS
 • 151. DISCHARGES TO STREAM F - RELEASE RANGE CODE
 • 156. DISCHARGES TO STREAM G - RELEASE POUNDS
 • 157. DISCHARGES TO STREAM G - RELEASE RANGE CODE
 • 162. DISCHARGES TO STREAM H - RELEASE POUNDS
 •

In [40]:
#TOTAL TRANSFERRED OFF SITE FOR DISPOSAL
#TOTAL ON-SITE RELEASES

In [147]:
# Prefer EPA-provided totals:
col_on  = next((c for c in df.columns if "TOTAL ON-SITE RELEASES" in c.upper()), None)
col_off = next((c for c in df.columns if "TOTAL TRANSFERRED OFF SITE FOR DISPOSAL" in c.upper()), None)

if col_on and col_off:
    df["TOTAL_RELEASES_LBS"] = (
        pd.to_numeric(df[col_on], errors="coerce").fillna(0) +
        pd.to_numeric(df[col_off], errors="coerce").fillna(0)
    )
else:
    # fallback: sum components carefully (excluding recycling/energy/treatment/POTW)
    pass  # (use the component-summing snippet we discussed earlier)


In [148]:
tri_g = (df.dropna(subset=["state","county","naics2"])
           .groupby(["state","county","naics2"], as_index=False)["TOTAL_RELEASES_LBS"]
           .sum()
           .rename(columns={"TOTAL_RELEASES_LBS":"tri_releases_lbs"}))

In [149]:
len(tri_g)

6027

In [150]:
mask = tri_g["state"] == "CA"

In [151]:
tri_g[mask]

Unnamed: 0,state,county,naics2,tri_releases_lbs
375,CA,ALAMEDA,31,1.283200e+04
376,CA,ALAMEDA,32,1.593656e+05
377,CA,ALAMEDA,33,2.155065e+06
378,CA,ALAMEDA,54,1.100000e+00
379,CA,ALAMEDA,56,5.410000e+00
...,...,...,...,...
529,CA,YOLO,32,0.000000e+00
530,CA,YOLO,33,7.692200e+04
531,CA,YOLO,42,1.900000e+01
532,CA,YUBA,21,1.220000e-01


In [155]:
#df["NAICS CODE 2"]
print(tri_g['naics2'].value_counts())

naics2
32    1924
33    1574
31     934
42     445
22     361
92     295
21     243
56     188
54      16
45      12
81      10
11       5
61       5
71       4
23       3
48       3
49       3
51       2
Name: count, dtype: int64


In [152]:
def looks_like_type4(cols):
    marks = ["TITLE OF CERTIFYING OFFICIAL","SUBMITTED FACILITY NAME","SUBMITTED PRIMARY NAICS CODE"]
    return sum(m in cols for m in marks) >= 2

def looks_like_1a(cols):
    hints = ["CHEMICAL NAME","CAS","TRANSFER","RELEASE"]
    return any(any(h in c for h in hints) for c in map(str.upper, cols))

cols = list(df.columns)
print("Type 4?", looks_like_type4(cols), " |  Type 1A?", looks_like_1a(cols))

Type 4? False  |  Type 1A? True


In [157]:
tri_g.to_csv("/Users/michaelwalker/RDM_Datalab/tri_release_by_county_naics.csv", index=False)