In [1]:
import requests
import time
from dotenv import load_dotenv
import os

In [2]:
# Load variables from keys.env into the process environment, then read the
# GovInfo API key from there.
load_dotenv(dotenv_path="keys.env")

API_KEY = os.getenv("GOVINFO_API_KEY")

if not API_KEY:
    # Name the file this cell actually loads (keys.env, not a generic .env)
    # so the message points at the right place to fix the problem.
    raise ValueError(
        "GOVINFO_API_KEY not found. Please set it in keys.env or as an environment variable."
    )

In [3]:
# Make sure the folder that receives the downloaded hearings exists;
# exist_ok avoids an error when the cell is re-run.
os.makedirs(name="hearings html", exist_ok=True)

In [4]:
# Request settings shared by the API calls in the cells below.
# NOTE(review): the timestamp at the end of the URL looks like the start of the
# date window requested from the CHRG collection - confirm against the API docs.
base_url = "https://api.govinfo.gov/collections/CHRG/2024-01-01T00:00:00Z"

# The API key is sent as a request header.
headers = {
    "X-Api-Key": API_KEY,
}

# Paging state: start at the first record and request 100 records per page.
params = dict(offset=0, pageSize=100)

In [5]:
def fetch_package_ids(base_url, headers, params, since="2024-01-01", timeout=30):
    """Collect package IDs from a paginated collection endpoint.

    Pages through ``base_url`` by advancing ``offset`` in steps of ``pageSize``
    and keeps every package whose ``dateIssued`` is on or after ``since``.
    Each kept package is printed as it is found, followed by a final total.

    Args:
        base_url: Collection endpoint to query.
        headers: HTTP headers sent with every request (carries the API key).
        params: Query parameters; must contain integer ``offset`` and
            ``pageSize`` entries. The dict is NOT modified.
        since: Inclusive lower bound for ``dateIssued`` as an ISO ``YYYY-MM-DD``
            string (compared lexicographically, which is correct for ISO dates).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list[str]: Package IDs in the order the API returned them.

    Raises:
        requests.HTTPError: If any page request returns an HTTP error status.
    """
    # Work on a private copy: advancing the caller's "offset" in place meant a
    # second call (e.g. re-running the cell) resumed from the last page
    # instead of starting over from the first one.
    page_params = dict(params)
    all_package_ids = []

    while True:
        response = requests.get(
            base_url, headers=headers, params=page_params, timeout=timeout
        )
        # Fail loudly on HTTP errors (bad key, rate limiting, ...) instead of
        # reading an error payload as "no more packages" and silently
        # returning a truncated list.
        response.raise_for_status()
        data = response.json()

        packages = data.get("packages", [])
        if not packages:
            break

        for pkg in packages:
            package_id = pkg.get("packageId")
            publish_date = pkg.get("dateIssued")
            if package_id and (publish_date and publish_date >= since):
                print(f"{publish_date} — {package_id}")
                all_package_ids.append(package_id)

        # The API signals the last page by omitting/emptying "nextPage".
        if not data.get("nextPage"):
            break

        page_params["offset"] += page_params["pageSize"]
        time.sleep(0.5)  # brief pause between pages to stay polite to the API

    print(f"\nTotal CHRG documents since {since}: {len(all_package_ids)}")
    return all_package_ids

# Run the paginated listing once and keep the resulting IDs for the download step.
all_package_ids = fetch_package_ids(
    base_url=base_url,
    headers=headers,
    params=params,
)

2024-06-27 — CHRG-118hhrg59596
2024-02-15 — CHRG-118hhrg55185
2025-02-25 — CHRG-119hhrg59366
2024-09-06 — CHRG-118hhrg56663
2025-01-23 — CHRG-119shrg58427
2024-09-18 — CHRG-118hhrg57162
2025-02-12 — CHRG-119hhrg58844
2025-02-11 — CHRG-119hhrg58836
2024-02-14 — CHRG-118hhrg55077
2024-05-01 — CHRG-118shrg55728
2024-02-14 — CHRG-118hhrg55078
2024-09-18 — CHRG-118hhrg56669
2024-07-24 — CHRG-118hhrg56139
2024-11-19 — CHRG-118hhrg58949
2024-06-26 — CHRG-118hhrg59458
2024-02-15 — CHRG-118hhrg58808
2024-06-04 — CHRG-118shrg55299
2024-06-26 — CHRG-118hhrg59423
2024-12-11 — CHRG-118hhrg58898
2024-12-05 — CHRG-118jhrg57781
2024-07-24 — CHRG-118hhrg56399
2024-07-23 — CHRG-118hhrg56360
2024-12-18 — CHRG-118hhrg57831
2024-12-18 — CHRG-118hhrg57830
2024-07-23 — CHRG-118hhrg56359
2024-05-16 — CHRG-118hhrg55689
2024-12-04 — CHRG-118hhrg57677
2024-12-17 — CHRG-118hhrg57832
2024-04-09 — CHRG-118shrg55293
2024-04-16 — CHRG-118shrg55308
2025-02-06 — CHRG-119hhrg58804
2024-07-30 — CHRG-118shrg58679
2024-05-

In [6]:
def download_hearing(package_id, headers, output_dir="hearings html", timeout=30):
    """Download one hearing's text rendition and save it as ``<package_id>.htm``.

    Fetches the granule summary for ``package_id``, reads the text download
    link from it, and writes the linked document into ``output_dir``. Problems
    with a single package are reported with ``print`` and skipped so that a
    batch loop over many packages keeps going.

    Args:
        package_id: Package identifier; also used as the granule ID and as the
            output file name.
        headers: HTTP headers sent with both requests (carries the API key).
        output_dir: Directory the file is written to; created if missing.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None.
    """
    granule_summary_url = f"https://api.govinfo.gov/packages/{package_id}/granules/{package_id}/summary"

    summary_response = requests.get(granule_summary_url, headers=headers, timeout=timeout)
    # Check the summary call too: parsing an error response would raise or
    # produce a misleading KeyError further down.
    if summary_response.status_code != 200:
        print(f"Failed to fetch summary for {package_id}:", summary_response.status_code)
        return

    summary = summary_response.json()
    # Some summaries may carry no text rendition; skip those instead of letting
    # a KeyError abort the whole batch.
    txt_link = (summary.get("download") or {}).get("txtLink")
    if not txt_link:
        print(f"No text link available for {package_id}; skipping.")
        return

    response = requests.get(txt_link, headers=headers, timeout=timeout)

    if response.status_code == 200:
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{package_id}.htm")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"Downloaded and saved {file_path}")
    else:
        # Include the package ID so failures in a long run can be retried.
        print(f"Failed to download {package_id}:", response.status_code)


In [7]:
# Fetch every listed hearing in order, reporting progress after each one and
# pausing briefly between requests.
total_documents = len(all_package_ids)
i = 0
for package_id in all_package_ids:
    i += 1
    download_hearing(package_id, headers)
    print(f"Downloaded {i} out of {total_documents} documents.")
    time.sleep(0.5)
print("All hearings downloaded.")

Downloaded and saved hearings html/CHRG-118hhrg59596.htm
Downloaded 1 out of 521 documents.
Downloaded and saved hearings html/CHRG-118hhrg55185.htm
Downloaded 2 out of 521 documents.
Downloaded and saved hearings html/CHRG-119hhrg59366.htm
Downloaded 3 out of 521 documents.
Downloaded and saved hearings html/CHRG-118hhrg56663.htm
Downloaded 4 out of 521 documents.
Downloaded and saved hearings html/CHRG-119shrg58427.htm
Downloaded 5 out of 521 documents.
Downloaded and saved hearings html/CHRG-118hhrg57162.htm
Downloaded 6 out of 521 documents.
Downloaded and saved hearings html/CHRG-119hhrg58844.htm
Downloaded 7 out of 521 documents.
Downloaded and saved hearings html/CHRG-119hhrg58836.htm
Downloaded 8 out of 521 documents.
Downloaded and saved hearings html/CHRG-118hhrg55077.htm
Downloaded 9 out of 521 documents.
Downloaded and saved hearings html/CHRG-118shrg55728.htm
Downloaded 10 out of 521 documents.
Downloaded and saved hearings html/CHRG-118hhrg55078.htm
Downloaded 11 out of 5