In [1]:
import os
import requests
from zipfile import ZipFile
from io import BytesIO

#make directory to save CSV files
os.makedirs("acs_pums", exist_ok=True)

#ACS 5-Year PUMS data base URL
#only goes back to 2009
dir_url = "https://www2.census.gov/programs-surveys/acs/data/pums/{year}/5-Year/csv_hca.zip"
max_year = 2025
min_year = 2008
#loop through various years to extract CSV files
for year in range(max_year, min_year, -1):
    url = dir_url.format(year=year)
    print(f"Loop for {year}...")

    #create exception if file is not found
    try:
        response = requests.get(url)
    
        #if response works
        if response.status_code == 200:
            print(f"Downloading CSV for {year}.")
    
            #iterate through directory and grab csv
            with ZipFile(BytesIO(response.content)) as zf:
                for file in zf.namelist():
                    if file.endswith(".csv"):
                        file_name = f"acs_pums_hca_{year}.csv"
                        with open(os.path.join("acs_pums", file_name), "wb") as f_out:
                            f_out.write(zf.read(file))
                        print(f"Saved csv as {file_name}")
        else:
            print(f"No file found for {year}")
    except Exception as e:
        print(f"Failed to download file for {year}")

Loop for 2025...
No file found for 2025
Loop for 2024...
No file found for 2024
Loop for 2023...
Downloading CSV for 2023.
Saved csv as acs_pums_hca_2023.csv
Loop for 2022...
Downloading CSV for 2022.
Saved csv as acs_pums_hca_2022.csv
Loop for 2021...
Downloading CSV for 2021.
Saved csv as acs_pums_hca_2021.csv
Loop for 2020...
Downloading CSV for 2020.
Saved csv as acs_pums_hca_2020.csv
Loop for 2019...
Downloading CSV for 2019.
Saved csv as acs_pums_hca_2019.csv
Loop for 2018...
Downloading CSV for 2018.
Saved csv as acs_pums_hca_2018.csv
Loop for 2017...
Downloading CSV for 2017.
Saved csv as acs_pums_hca_2017.csv
Loop for 2016...
Downloading CSV for 2016.
Saved csv as acs_pums_hca_2016.csv
Loop for 2015...
Downloading CSV for 2015.
Saved csv as acs_pums_hca_2015.csv
Loop for 2014...
Downloading CSV for 2014.
Saved csv as acs_pums_hca_2014.csv
Loop for 2013...
Downloading CSV for 2013.
Saved csv as acs_pums_hca_2013.csv
Loop for 2012...
Downloading CSV for 2012.
Saved csv as acs_pu

In [8]:
import os
import pandas as pd
from sqlalchemy import create_engine

# Your Postgres connection string (edit with your credentials)
engine = create_engine("postgresql://u5m4j1jukal6qj:p75db821c082ec1caea5744dba224fdf7efbc014d01550bc5121c5ba3743eb0df@c2hbg00ac72j9d.cluster-czrs8kj4isg7.us-east-1.rds.amazonaws.com:5432/d9gmr441d827r4")

# Input folder with ACS CSVs
input_dir = "acs_pums"

# Loop through each CSV
for file in os.listdir(input_dir):
    if file.endswith(".csv") and "acs_pums_hca" in file:
        file_path = os.path.join(input_dir, file)
        year = file.split("_")[-1].replace(".csv", "")
        print(f"Inserting {file} into database...")

        # Load CSV
        df = pd.read_csv(file_path)

        # Optional: Add year column to track source
        df["acs_year"] = int(year)
        df["acs_file"] = file
        
        # Insert into a table (append if exists)
        df.to_sql("acs_pums", con=engine, if_exists="append", index=False,method='multi',chunksize=10000)

        print(f"Lodaed {file}")

Inserting acs_pums_hca_2009.csv into database...


PendingRollbackError: Can't reconnect until invalid transaction is rolled back.  Please rollback() fully before proceeding (Background on this error at: https://sqlalche.me/e/20/8s2b)

In [5]:
import os
import psycopg2

# --- Connection config ---
conn = psycopg2.connect(
    dbname="d9gmr441d827r4",
    user="u5m4j1jukal6qj",
    password="p75db821c082ec1caea5744dba224fdf7efbc014d01550bc5121c5ba3743eb0df",
    host="c2hbg00ac72j9d.cluster-czrs8kj4isg7.us-east-1.rds.amazonaws.com",
    port="5432"
)
cur = conn.cursor()

input_dir = "acs_pums"
table_name = "acs_pums_hca"

# Loop through files and load them
for file in os.listdir(input_dir):
    if file.endswith(".csv") and "acs_pums_hca" in file:
        file_path = os.path.join(input_dir, file)
        print(f"📥 Loading {file}...")

        with open(file_path, "r", encoding="utf-8") as f:
            cur.copy_expert(f"COPY {table_name} FROM STDIN WITH CSV HEADER", f)

        conn.commit()
        print(f"✅ Loaded {file}")

cur.close()
conn.close()

📥 Loading acs_pums_hca_2009.csv...


NumericValueOutOfRange: value "2005000000005" is out of range for type integer
CONTEXT:  COPY acs_pums_hca, line 2, column SERIALNO: "2005000000005"


In [None]:
import os
import pandas as pd

# Directory where the ACS CSVs are stored
input_dir = "acs_pums"
output_dir = "acs_geo"
os.makedirs(output_dir, exist_ok=True)

# Load ZIP → PUMA crosswalk and ZIP → lat/lon mapping
zip_to_puma = pd.read_excel("geocorr2022_2517206410.xlsx")  # Replace with your filename
zip_coords = pd.read_excel("uszips.xlsx")  # Replace with your ZIP-lat/lon file

# Clean and merge crosswalk
zip_to_puma["zip"] = zip_to_puma["zcta"].astype(str).str.zfill(5)
zip_coords["zip"] = zip_coords["zip"].astype(str).str.zfill(5)
zip_geo = zip_to_puma.merge(zip_coords, on="zip", how="left")

# Loop over all downloaded CSVs
for file_name in os.listdir(input_dir):
    if file_name.endswith(".csv") and "acs_pums_hca" in file_name:
        file_path = os.path.join(input_dir, file_name)
        print(f"Processing {file_name}...")

        # Load ACS data
        df = pd.read_csv(file_path)
        df["PUMA"] = df["PUMA"].astype(str).str.zfill(5)

        # Merge with geo data on PUMA
        merged_df = df.merge(zip_geo, left_on="PUMA", right_on="puma", how="left")

        # Save merged output
        year = file_name.split("_")[-1].replace(".csv", "")
        output_file = os.path.join(output_dir, f"acs_geo_{year}.csv")
        merged_df.to_csv(output_file, index=False)

        print(f"Saved merged file: {output_file}")
