In [2]:
import requests
import time
from dotenv import load_dotenv
import os

In [4]:
# Load secrets from a local env file so the API key never appears in the notebook.
ENV_FILE = "keys.env"
load_dotenv(dotenv_path=ENV_FILE)

# os.getenv also sees variables already exported in the process environment.
API_KEY = os.getenv("GOVINFO_API_KEY")

# Fail fast: every later cell sends this key with its requests.
if not API_KEY:
    raise ValueError(f"GOVINFO_API_KEY not found. Please set it in {ENV_FILE} or the environment.")

In [5]:
# Folder that receives the downloaded hearing files; a no-op when it already exists.
os.makedirs(name="hearings html", exist_ok=True)

In [6]:
# Collection listing endpoint for CHRG packages, with the start timestamp in the path.
base_url = "https://api.govinfo.gov/collections/CHRG/2023-01-02T00:00:00Z"

# The API key is sent as a request header on every call.
headers = dict([("X-Api-Key", API_KEY)])

# Paging window: start at the first record and request 100 records per page.
params = dict(offset=0, pageSize=100)

In [7]:
def fetch_package_ids(base_url, headers, params):
    """Collect the package IDs listed by a paged collection endpoint.

    Requests ``base_url`` repeatedly, advancing ``offset`` by ``pageSize`` after
    each page, and keeps every package whose ``dateIssued`` is on or after
    2023-01-01. Paging stops at the first page that has no ``packages`` or no
    ``nextPage`` entry.

    Args:
        base_url: URL of the collection listing to page through.
        headers: HTTP headers sent with every request.
        params: Query parameters; must contain numeric ``offset`` and
            ``pageSize`` entries. The dict is copied, so the caller's starting
            offset is left untouched and the function can be called again
            with the same dict.

    Returns:
        list: Package IDs in the order the API returned them.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.Timeout: If a page request does not answer within 30 seconds.
    """
    # Work on a private copy: advancing the caller's dict in place would make a
    # second call (e.g. re-running the cell) start from the last offset reached.
    page_params = dict(params)
    all_package_ids = []

    while True:
        response = requests.get(base_url, headers=headers, params=page_params, timeout=30)
        # Fail loudly: an error payload has no "packages" key and would
        # otherwise be mistaken for the end of the listing.
        response.raise_for_status()
        data = response.json()

        packages = data.get("packages", [])
        if not packages:
            break

        for pkg in packages:
            package_id = pkg.get("packageId")
            publish_date = pkg.get("dateIssued")
            # Plain string comparison; correct as long as dateIssued is an
            # ISO-style YYYY-MM-DD string, which sorts chronologically.
            if package_id and (publish_date and publish_date >= "2023-01-01"):
                print(f"{publish_date} — {package_id}")
                all_package_ids.append(package_id)

        if not data.get("nextPage"):
            break

        page_params["offset"] += page_params["pageSize"]
        # Short pause between pages to stay gentle on the API.
        time.sleep(0.5)

    print(f"\nTotal CHRG documents since 2023-01-02: {len(all_package_ids)}")
    return all_package_ids

# Walk the whole collection listing and keep the matching package IDs.
all_package_ids = fetch_package_ids(
    base_url=base_url,
    headers=headers,
    params=params,
)

2023-12-05 — CHRG-118hhrg56746
2024-02-15 — CHRG-118hhrg55185
2025-02-25 — CHRG-119hhrg59366
2024-09-06 — CHRG-118hhrg56663
2025-01-23 — CHRG-119shrg58427
2023-07-26 — CHRG-118shrg57229
2023-07-11 — CHRG-118shrg57196
2023-04-19 — CHRG-118shrg58673
2023-09-27 — CHRG-118hhrg56742
2023-06-22 — CHRG-118shrg57195
2023-11-14 — CHRG-118hhrg56745
2024-09-18 — CHRG-118hhrg57162
2025-02-12 — CHRG-119hhrg58844
2025-02-11 — CHRG-119hhrg58836
2024-02-14 — CHRG-118hhrg55077
2024-05-01 — CHRG-118shrg55728
2024-02-14 — CHRG-118hhrg55078
2024-09-18 — CHRG-118hhrg56669
2024-07-24 — CHRG-118hhrg56139
2024-11-19 — CHRG-118hhrg58949
2024-06-26 — CHRG-118hhrg59458
2024-02-15 — CHRG-118hhrg58808
2023-11-08 — CHRG-118hhrg56744
2024-06-04 — CHRG-118shrg55299
2023-04-27 — CHRG-118shrg56951
2024-06-26 — CHRG-118hhrg59423
2023-04-18 — CHRG-118hhrg58897
2024-12-11 — CHRG-118hhrg58898
2024-12-05 — CHRG-118jhrg57781
2024-07-24 — CHRG-118hhrg56399
2024-07-23 — CHRG-118hhrg56360
2024-12-18 — CHRG-118hhrg57831
2024-12-

In [None]:
def download_hearing(package_id, headers, output_dir="hearings html"):
    """Download the text rendition of one package and save it as ``<package_id>.htm``.

    Looks up the granule summary for ``package_id``, reads its
    ``download.txtLink`` entry, fetches that link and writes the body to
    ``output_dir``. Any failure is reported with ``print`` and the function
    returns without writing a file, so one bad package does not abort a batch.

    Args:
        package_id: Package identifier; also used as the granule identifier
            in the summary URL and as the output file name.
        headers: HTTP headers sent with both requests.
        output_dir: Directory that receives the file; created if missing.

    Returns:
        None.

    Raises:
        requests.Timeout: If either request does not answer within 30 seconds.
    """
    granule_summary_url = f"https://api.govinfo.gov/packages/{package_id}/granules/{package_id}/summary"

    summary_response = requests.get(granule_summary_url, headers=headers, timeout=30)
    if summary_response.status_code != 200:
        print(f"Failed to fetch summary for {package_id}:", summary_response.status_code)
        return

    # Not every summary is guaranteed to carry a text link; skip instead of
    # raising KeyError.
    data = summary_response.json()
    txt_link = (data.get("download") or {}).get("txtLink")
    if not txt_link:
        print(f"No txtLink available for {package_id}; skipping")
        return

    response = requests.get(txt_link, headers=headers, timeout=30)

    if response.status_code == 200:
        # Create the folder on demand so the function does not depend on an
        # earlier cell having been run.
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{package_id}.htm")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"Downloaded and saved {file_path}")
    else:
        print("Failed to download:", response.status_code)


Downloaded and saved hearings html/CHRG-118hhrg56746.htm


In [11]:
# Fetch each hearing in turn, pausing half a second after every download.
for package_id in all_package_ids:
    download_hearing(package_id=package_id, headers=headers)
    time.sleep(0.5)
else:
    # The loop has no break, so this always runs once the list is exhausted.
    print("All hearings downloaded.")

Downloaded and saved hearings html/CHRG-118hhrg56746.htm
Downloaded and saved hearings html/CHRG-118hhrg55185.htm
Downloaded and saved hearings html/CHRG-119hhrg59366.htm
Downloaded and saved hearings html/CHRG-118hhrg56663.htm
Downloaded and saved hearings html/CHRG-119shrg58427.htm
Downloaded and saved hearings html/CHRG-118shrg57229.htm
Downloaded and saved hearings html/CHRG-118shrg57196.htm
Downloaded and saved hearings html/CHRG-118shrg58673.htm
Downloaded and saved hearings html/CHRG-118hhrg56742.htm
Downloaded and saved hearings html/CHRG-118shrg57195.htm
Downloaded and saved hearings html/CHRG-118hhrg56745.htm
Downloaded and saved hearings html/CHRG-118hhrg57162.htm
Downloaded and saved hearings html/CHRG-119hhrg58844.htm
Downloaded and saved hearings html/CHRG-119hhrg58836.htm
Downloaded and saved hearings html/CHRG-118hhrg55077.htm
Downloaded and saved hearings html/CHRG-118shrg55728.htm
Downloaded and saved hearings html/CHRG-118hhrg55078.htm
Downloaded and saved hearings h