In [31]:
import os
import json
import random

# Folder containing the JSON files
folder_path = "/Users/nickgutin/Documents/GitHub/congress-project/general bill info"

# List to hold all JSON data
all_json_data = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):  # Check if the file is a JSON file
        file_path = os.path.join(folder_path, filename)  # Full path to the file
        with open(file_path, "r") as file:
            try:
                data = json.load(file)  # Load the JSON file content
                sample = random.sample(data, min(len(data), 15))
                all_json_data.append(sample)  # Add the content to the list
            except json.JSONDecodeError:
                print(f"Error decoding {filename}")

all_json_data[6]

[{'congress': 116,
  'latestAction': {'actionDate': '2020-01-08',
   'text': 'Read twice and referred to the Committee on Health, Education, Labor, and Pensions.'},
  'number': '3161',
  'originChamber': 'Senate',
  'originChamberCode': 'S',
  'title': 'National Biomedical Research Act',
  'type': 'S',
  'updateDate': '2024-02-16',
  'updateDateIncludingText': '2024-02-16',
  'url': 'https://api.congress.gov/v3/bill/116/s/3161?format=json'},
 {'congress': 116,
  'latestAction': {'actionDate': '2019-11-13',
   'text': 'Referred to the House Committee on Oversight and Reform.'},
  'number': '5070',
  'originChamber': 'House',
  'originChamberCode': 'H',
  'title': 'A Just Society: The Mercy In Re-entry Act',
  'type': 'HR',
  'updateDate': '2024-02-07',
  'updateDateIncludingText': '2024-02-07',
  'url': 'https://api.congress.gov/v3/bill/116/hr/5070?format=json'},
 {'congress': 116,
  'latestAction': {'actionDate': '2019-03-13',
   'text': 'Read twice and referred to the Committee on the

In [39]:
import requests

my_key = os.getenv("congress_apiKey")
con = '116'
code = 'hres'
num = '923'

api_url = f'https://api.congress.gov/v3/bill/{con}/{code}/{num}/text?api_key={my_key}'
test = requests.get(api_url)
test.json()


{'pagination': {'count': 1},
 'request': {'billNumber': '923',
  'billType': 'hres',
  'billUrl': 'https://api.congress.gov/v3/bill/116/hres/923?format=json',
  'congress': '116',
  'contentType': 'application/json',
  'format': 'json'},
 'textVersions': [{'date': '2020-04-14T04:00:00Z',
   'formats': [{'type': 'Formatted Text',
     'url': 'https://www.congress.gov/116/bills/hres923/BILLS-116hres923ih.htm'},
    {'type': 'PDF',
     'url': 'https://www.congress.gov/116/bills/hres923/BILLS-116hres923ih.pdf'},
    {'type': 'Formatted XML',
     'url': 'https://www.congress.gov/116/bills/hres923/BILLS-116hres923ih.xml'}],
   'type': 'Introduced in House'}]}

In [46]:
import asyncio
import aiohttp
import os
import json
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import boto3

load_dotenv()


async def fetch_text(session, url):
    """Fetch bill text asynchronously."""
    try:
        async with session.get(url) as response:
            if response.status == 200:
                return await response.text()  # Fetch text content
            else:
                print(f"Failed to fetch text URL {url} with status {response.status}")
                return None
    except Exception as e:
        print(f"Error fetching text URL {url}: {e}")
        return None

def save_code(code):
    
    if code == 'hr' or code == 'hres':
        return 'h'
    
    else:
        return 's'
    
async def process_bill_texts(congress_data):
    """Process bill metadata and fetch their text versions asynchronously."""
    my_key = os.getenv("congress_apiKey")
    bill_texts = []
    failed_bills = []  # Log for failed bills

    async with aiohttp.ClientSession() as session:
        for bill in congress_data:
            con = str(bill['congress'])
            code = bill['type'].lower()
            num = bill['number']
            save = save_code(code)

            # API call to get text versions
            try:
                #response = await fetch_url(session, api_url)
                
                #Fetch the latest text URL
                text_url = f"https://www.congress.gov/{con}/bills/{code}{num}/BILLS-{con}{code}{num}i{save}.htm"
                text_response = await fetch_text(session, text_url)

                if text_response:
                    soup = BeautifulSoup(text_response, "lxml")
                    bill_texts.append({
                        "id": f"{code}{num}",
                        "congress": con,
                        "text": soup.text
                    })
                    print(f"Saved text for bill {code}{num}")
                else:
                    print(f"Failed to fetch bill text for {code}{num}")
                    failed_bills.append({"id": f"{code}{num}", "congress": con})
    
            except Exception as e:
                print(f"Error processing bill {code}{num}: {e}")
                failed_bills.append({"id": f"{code}{num}", "congress": con})

    return bill_texts, failed_bills


def load_congress_data_from_s3():
    """Load congress data from S3."""
    bucket_name = os.getenv("bucket_name")
    folder_path = os.getenv("folder_path")
    s3 = boto3.client("s3")

    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_path)
    congress_data = []
    full_congress = []

    if "Contents" in response:
        for obj in response["Contents"]:
            key = obj["Key"]
            if key.endswith(".json"):
                print(f"Fetching file: {key}")
                json_obj = s3.get_object(Bucket=bucket_name, Key=key)
                data = json.loads(json_obj["Body"].read().decode("utf-8"))
                congress_data.extend(data)
                
            full_congress.append(congress_data)
    else:
        print(f"No JSON files found in the S3 folder: {folder_path}")

    return full_congress


async def main():
    # Load congress data from S3
    print("Loading congress data from S3...")
    congress_data = all_json_data[6]
    
    print(f"Loaded {len(congress_data)} bills from S3.")

    # Process all bill texts
    bill_texts, failed_bills, file_name = await process_bill_texts(congress_data)
    

    with open("processed_bills.json", "w") as file:
        json.dump(bill_texts, file, indent=2)

asyncio.run(main())

Loading congress data from S3...
Loaded 15 bills from S3.
Saved text for bill s3161
Saved text for bill hr5070
Saved text for bill s766
Saved text for bill hr4812
Saved text for bill hr4779
Saved text for bill hres923
Saved text for bill hr4434
Saved text for bill s450
Saved text for bill sres442
Saved text for bill hr1770
Saved text for bill s494
Saved text for bill hr6256
Saved text for bill s3225
Saved text for bill s5084
Saved text for bill s4663


ValueError: not enough values to unpack (expected 3, got 2)

In [47]:
def load_congress_data_from_s3():
    """Load congress data from S3, with each entry as a full JSON file."""
    bucket_name = os.getenv("bucket_name")
    folder_path = os.getenv("folder_path")
    s3 = boto3.client("s3")

    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_path)
    congress_data = []

    if "Contents" in response:
        for obj in response["Contents"]:
            key = obj["Key"]
            if key.endswith(".json"):
                print(f"Fetching file: {key}")
                json_obj = s3.get_object(Bucket=bucket_name, Key=key)
                data = json.loads(json_obj["Body"].read().decode("utf-8"))
                congress_data.append(data)
    else:
        print(f"No JSON files found in the S3 folder: {folder_path}")

    return congress_data


In [49]:
test = load_congress_data_from_s3()
len(test)
test[0]

Fetching file: baseinfo/c110_allBills.json
Fetching file: baseinfo/c111_allBills.json
Fetching file: baseinfo/c112_allBills.json
Fetching file: baseinfo/c113_allBills.json
Fetching file: baseinfo/c114_allBills.json
Fetching file: baseinfo/c115_allBills.json
Fetching file: baseinfo/c116_allBills.json
Fetching file: baseinfo/c117_allBills.json


[{'congress': 110,
  'latestAction': {'actionDate': '2009-01-03',
   'text': 'Referred to the House Committee on House Administration.'},
  'number': '107',
  'originChamber': 'Senate',
  'originChamberCode': 'S',
  'title': 'A concurrent resolution expressing the sense of Congress regarding the rights of Members of Congress (or any employee of a Member of Congress authorized by that Member) to lead tours of the United States Capitol complex.',
  'type': 'SCONRES',
  'updateDate': '2024-02-07',
  'updateDateIncludingText': '2024-02-07',
  'url': 'https://api.congress.gov/v3/bill/110/sconres/107?format=json'},
 {'congress': 110,
  'latestAction': {'actionDate': '2009-01-02',
   'text': 'Referred to the Subcommittee on Emergency Communications, Preparedness, and Response.'},
  'number': '7329',
  'originChamber': 'House',
  'originChamberCode': 'H',
  'title': 'To amend the Robert T. Stafford Disaster Relief and Emergency Assistance Act to authorize funding for emergency management perfo

In [50]:
test[1]

[{'congress': 111,
  'latestAction': {'actionDate': '2011-01-07',
   'text': 'Became Public Law No: 111-383.'},
  'number': '6523',
  'originChamber': 'House',
  'originChamberCode': 'H',
  'title': 'Ike Skelton National Defense Authorization Act for Fiscal Year 2011',
  'type': 'HR',
  'updateDate': '2024-05-17',
  'updateDateIncludingText': '2024-05-17',
  'url': 'https://api.congress.gov/v3/bill/111/hr/6523?format=json'},
 {'congress': 111,
  'latestAction': {'actionDate': '2011-01-05',
   'text': 'Referred to the Subcommittee on National Parks, Forests and Public Lands.'},
  'number': '6561',
  'originChamber': 'House',
  'originChamberCode': 'H',
  'title': 'History Is Learned from the Living Act',
  'type': 'HR',
  'updateDate': '2024-02-07',
  'updateDateIncludingText': '2024-02-07',
  'url': 'https://api.congress.gov/v3/bill/111/hr/6561?format=json'},
 {'congress': 111,
  'latestAction': {'actionDate': '2011-01-04',
   'text': 'Became Public Law No: 111-373.'},
  'number': '841

In [44]:
path = "/Users/nickgutin/Documents/GitHub/congress-project/notebooks/test_processed_bills.json"
with open(path, 'r') as file:
    data = json.load(file)



In [45]:
data[1]['text'].strip()

"[Congressional Bills 112th Congress]\n[From the U.S. Government Publishing Office]\n[H.R. 1510 Introduced in House (IH)]\n\n112th CONGRESS\n  1st Session\n                                H. R. 1510\n\nTo amend title 49, United States Code, to prohibit a pat down search of \n minor for purposes of air transportation security without the consent \n     and presence of a parent of the minor, and for other purposes.\n\n\n_______________________________________________________________________\n\n\n                    IN THE HOUSE OF REPRESENTATIVES\n\n                             April 13, 2011\n\n Mr. Chaffetz introduced the following bill; which was referred to the \n                     Committee on Homeland Security\n\n_______________________________________________________________________\n\n                                 A BILL\n\n\n \nTo amend title 49, United States Code, to prohibit a pat down search of \n minor for purposes of air transportation security without the consent \n 