In [23]:
import requests
from pymongo import MongoClient
from datetime import datetime, timedelta
import logging
import os
from dotenv import load_dotenv
import PyPDF2
from xml.etree import ElementTree
from bs4 import BeautifulSoup
import re


In [15]:
# Read settings such as MONGO_CONNECTION_STRING from a .env file (python-dotenv default lookup)
load_dotenv()

# Emit INFO-and-above records formatted as "timestamp - LEVEL - message"
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)


In [16]:
# MongoDB Connection
def get_mongo_collection():
    """Return the ``test`` collection of the ``govai`` MongoDB database.

    The connection string is read from the ``MONGO_CONNECTION_STRING``
    environment variable.

    Returns:
        The pymongo collection handle ``client['govai']['test']``.

    Raises:
        ValueError: If ``MONGO_CONNECTION_STRING`` is unset or empty.
        Exception: Any error raised while connecting is logged and re-raised
            unchanged.
    """
    try:
        connection_string = os.getenv('MONGO_CONNECTION_STRING')
        if not connection_string:
            # MongoClient(None) silently falls back to the default localhost
            # server, which would hide a missing or misnamed variable.
            raise ValueError("MONGO_CONNECTION_STRING environment variable is not set.")
        client = MongoClient(connection_string)
        # MongoClient connects lazily: ping so that success is only logged
        # (and the collection only returned) once the server has answered.
        client.admin.command('ping')
        db = client['govai']
        logging.info("Connected to MongoDB successfully.")
        return db['test']
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {e}")
        raise
    

In [17]:
# Documents endpoint of the Federal Register API (v1); fetch_documents() queries it.
BASE_URL = "https://www.federalregister.gov/api/v1/documents"


In [18]:
# Fetch documents from the Federal Register API with pagination
# TODO: consider extracting more metadata (agencies are already requested; summaries?, topics, images, etc.) for richer information
def fetch_documents(start_date, end_date, per_page=100, timeout=30):
    """Fetch every Federal Register document published in a date range.

    Pages through ``BASE_URL`` newest-first, requesting ``per_page`` documents
    at a time, and accumulates the ``results`` list of each page.

    Args:
        start_date: Inclusive lower bound on ``publication_date``
            (``YYYY-MM-DD`` string).
        end_date: Inclusive upper bound on ``publication_date``
            (``YYYY-MM-DD`` string).
        per_page: Number of documents requested per page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        dict: ``{"results": [...]}`` holding the documents of all pages.

    Raises:
        requests.exceptions.RequestException: If any page request fails
            (the error is logged before being re-raised).
    """
    all_documents = []
    page = 1  # Start with the first page

    # The query is the same for every page; only "page" changes per request.
    params = {
        "conditions[publication_date][gte]": start_date,
        "conditions[publication_date][lte]": end_date,
        "per_page": per_page,
        "page": page,
        "order": "newest",
        "fields[]": [
            "document_number",
            "title",
            "abstract",
            "publication_date",
            "type",
            "html_url",
            "pdf_url",
            "full_text_xml_url",
            "raw_text_url",
            "agencies",
        ],
    }

    while True:
        params["page"] = page
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(BASE_URL, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to fetch data from the API: {e}")
            raise

        results = data.get("results", [])
        if not results:
            break  # Exit the loop if no more results

        all_documents.extend(results)
        logging.info(f"Fetched {len(results)} documents from page {page}.")

        # When the response reports the page count, stop on the last page
        # instead of issuing one more request just to receive an empty list.
        total_pages = data.get("total_pages")
        if total_pages is not None and page >= total_pages:
            break
        page += 1  # Move to the next page

    logging.info(f"Total documents fetched: {len(all_documents)}.")
    return {"results": all_documents}


# Start and end dates as zero-padded ISO dates (YYYY-MM-DD)
raw_data = fetch_documents("2024-12-04", "2024-12-05")


2024-12-11 20:38:51,462 - INFO - Fetched 100 documents from page 1.
2024-12-11 20:38:51,851 - INFO - Fetched 63 documents from page 2.
2024-12-11 20:38:52,120 - INFO - Total documents fetched: 163.


In [19]:
# Sanity check: each result should expose the same metadata keys as the one before it
for previous, current in zip(raw_data["results"], raw_data["results"][1:]):
    if current.keys() != previous.keys():
        print("False")


In [20]:
# Show one sample record, then the field names it carries
print(raw_data["results"][5], raw_data["results"][5].keys(), sep="\n")

{'document_number': '2024-28502', 'title': 'Surplus Property; Notice of Additional Property at the Former Pueblo Chemical Depot', 'abstract': 'This Notice amends the Notice published in the Federal Register on December 20, 2013 and provides information regarding the property that has been determined surplus to the United States needs pursuant to section 2854 (Closure and Disposal of the Pueblo Chemical Depot, Pueblo County, Colorado) of the National Defense Authorization Act for Fiscal Year 2024 (NDAA FY24) and in accordance with procedures and authorities for the closure, management, and disposal of property under the appropriate base closure laws, and following screening with Federal agencies and Department of Defense components.', 'publication_date': '2024-12-05', 'type': 'Notice', 'html_url': 'https://www.federalregister.gov/documents/2024/12/05/2024-28502/surplus-property-notice-of-additional-property-at-the-former-pueblo-chemical-depot', 'pdf_url': 'https://www.govinfo.gov/conten

In [21]:
# Raw Text Extraction
# Fetch raw text from the URL
def fetch_raw_text(raw_text_url, timeout=30):
    """Download a document's raw-text page and return it as plain text.

    Args:
        raw_text_url: URL of the raw-text rendition of the document.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        str | None: The response body with HTML markup stripped, or ``None``
        when the request fails (the failure is logged, not raised).
    """
    try:
        # Without a timeout one stalled download would hang the whole batch.
        response = requests.get(raw_text_url, timeout=timeout)
        response.raise_for_status()
        # The response is HTML; keep only its text content.
        return BeautifulSoup(response.text, "html.parser").get_text()
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to fetch raw text from {raw_text_url}: {e}")
        return None

# Fetch and parse text from the full text XML URL
def fetch_full_text(full_text_xml_url, timeout=30):
    """Download a document's full-text XML and return its text content.

    Args:
        full_text_xml_url: URL of the XML rendition of the document.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        str | None: All text fragments of the XML tree joined by single
        spaces, or ``None`` when the download or the XML parse fails (the
        failure is logged, not raised).
    """
    try:
        response = requests.get(full_text_xml_url, timeout=timeout)
        response.raise_for_status()
        root = ElementTree.fromstring(response.content)
        # itertext() yields element.text AND element.tail in document order,
        # so text that follows an inline child element is not lost.
        return " ".join(fragment for fragment in root.itertext() if fragment)
    except (requests.exceptions.RequestException, ElementTree.ParseError) as e:
        logging.error(f"Failed to fetch or parse full text from {full_text_xml_url}: {e}")
        return None

# Extract text from a PDF URL
def extract_text_from_pdf(pdf_url, timeout=30):
    """Download a PDF and return the text of its pages.

    The PDF is parsed from memory, so nothing is written to disk.

    Args:
        pdf_url: URL of the PDF rendition of the document.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        str | None: The text of every page that yields text, joined by single
        spaces (``""`` when no page yields text), or ``None`` when the
        download or the extraction fails (the failure is logged, not raised).
    """
    import io  # local import: only this function needs an in-memory stream

    try:
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()
        # Reading from a BytesIO avoids the shared "temp.pdf" scratch file,
        # which was left behind on parse errors and could clobber an existing
        # file of that name.
        reader = PyPDF2.PdfReader(io.BytesIO(response.content))
        # Extract each page once; extraction is the expensive step.
        page_texts = (page.extract_text() for page in reader.pages)
        return " ".join(page_text for page_text in page_texts if page_text)
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to download PDF from {pdf_url}: {e}")
        return None
    except Exception as e:
        logging.error(f"Failed to extract text from PDF: {e}")
        return None
    
def clean_raw_text(text):
    """
    Cleans raw text by:
    - Removing HTML tags.
    - Replacing literal escape sequences (a backslash followed by ``n`` or by
      ``x`` and two hex digits) with spaces.
    - Replacing non-whitespace control characters (e.g. NUL) with spaces.
    - Normalizing whitespace.
    - Stripping leading/trailing whitespace.

    Empty input (``None`` or ``""``) is returned unchanged. If cleaning fails,
    the error is logged and the original text is returned.
    """
    # Nothing to clean: avoids handing None to the HTML parser, which only
    # "worked" by failing into the except branch and logging a bogus error.
    if not text:
        return text

    try:
        # Remove HTML tags
        cleaned_text = BeautifulSoup(text, "html.parser").get_text()

        # Replace escape sequences that appear literally in the text
        cleaned_text = re.sub(r'\\n|\\x[0-9a-fA-F]{2}', ' ', cleaned_text)

        # Replace real control characters that \s below does not cover
        cleaned_text = re.sub(r'[\x00-\x08\x0e-\x1f\x7f]', ' ', cleaned_text)

        # Normalize excessive whitespace, then trim both ends
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
        return cleaned_text.strip()
    except Exception as e:
        logging.error(f"Failed to clean raw text: {e}")
        return text

In [24]:
# Transform raw API data into a consistent format with text extraction
def transform(raw_data):
    """Convert raw API results into the document records stored downstream.

    For each item in ``raw_data["results"]`` the document text is extracted
    from the first source that yields text (raw text URL, then full-text XML
    URL, then PDF URL) and cleaned with ``clean_raw_text``. A summary of how
    many texts came from each source is logged.

    Args:
        raw_data: dict with a ``"results"`` list of API document dicts.

    Returns:
        list[dict]: One record per result, holding the API metadata,
        ``raw_text`` (``None`` when no source yielded text) and the
        placeholder fields ``summary``, ``chunked``, ``embedded`` and
        ``processed_at``.

    Raises:
        Exception: Any unexpected error is logged and re-raised unchanged.
    """
    try:
        documents = []
        source_counts = {"raw_text_url": 0, "full_text_xml_url": 0, "pdf_url": 0}
        failed_text_count = 0

        for item in raw_data.get("results", []):
            raw_text, source = _extract_document_text(item)
            if raw_text:
                source_counts[source] += 1
                # Only clean real text; there is nothing to clean on failure.
                raw_text = clean_raw_text(raw_text)
            else:
                failed_text_count += 1

            documents.append({
                "document_number": item.get("document_number"),
                "title": item.get("title"),
                "abstract": item.get("abstract"),
                "publication_date": item.get("publication_date"),
                "type": item.get("type"),
                "html_url": item.get("html_url"),
                "pdf_url": item.get("pdf_url"),
                "full_text_xml_url": item.get("full_text_xml_url"),
                "raw_text_url": item.get("raw_text_url"),
                "raw_text": raw_text,  # Store extracted text
                "agencies": item.get("agencies", []),
                "summary": None,  # Placeholder for summarization
                "chunked": False,  # Indicates if the document has been chunked
                "embedded": False,  # Indicates if embeddings are generated
                "processed_at": None,  # Timestamp of the last processing
            })

        raw_text_count = source_counts["raw_text_url"]
        full_text_count = source_counts["full_text_xml_url"]
        pdf_text_count = source_counts["pdf_url"]
        logging.info(f"Text extraction summary: {raw_text_count} from raw_text_url, {full_text_count} from full_text_xml_url, {pdf_text_count} from pdf_url, {failed_text_count} failures.")
        logging.info(f"Transformed {len(documents)} documents with text extraction.")
        return documents
    except Exception as e:
        logging.error(f"Error during transformation: {e}")
        raise


def _extract_document_text(item):
    """Return ``(text, source_field)`` from the first source that yields text.

    Sources are tried in order of preference. A source whose URL is present
    but whose fetch returns nothing falls through to the next one instead of
    ending the search. Returns ``(None, None)`` when every source fails.
    """
    extractors = (
        ("raw_text_url", fetch_raw_text),
        ("full_text_xml_url", fetch_full_text),
        ("pdf_url", extract_text_from_pdf),
    )
    for url_field, extractor in extractors:
        url = item.get(url_field)
        if not url:
            continue
        text = extractor(url)
        if text:
            return text, url_field
    return None, None
    
# Run the transform step on the fetched results (downloads text for each document).
transformed_data = transform(raw_data)


2024-12-11 20:43:49,401 - INFO - Text extraction summary: 163 from raw_text_url, 0 from full_text_xml_url, 0 from pdf_url, 0 failures.
2024-12-11 20:43:49,414 - INFO - Transformed 163 documents with text extraction.


In [25]:
# Spot-check the first transformed record: its field names, then its extracted text
print(transformed_data[0].keys(), transformed_data[0]["raw_text"], sep="\n")

dict_keys(['document_number', 'title', 'abstract', 'publication_date', 'type', 'html_url', 'pdf_url', 'full_text_xml_url', 'raw_text_url', 'raw_text', 'agencies', 'summary', 'chunked', 'embedded', 'processed_at'])
Federal Register, Volume 89 Issue 234 (Thursday, December 5, 2024) [Federal Register Volume 89, Number 234 (Thursday, December 5, 2024)] [Presidential Documents] [Pages 96515-96516] From the Federal Register Online via the Government Publishing Office [www.gpo.gov] [FR Doc No: 2024-28714]       Presidential Documents       Federal Register / Vol. 89, No. 234 / Thursday, December 5, 2024 / Presidential Documents   [[Page 96515]] Proclamation 10867 of November 29, 2024 World AIDS Day, 2024 By the President of the United States of America A Proclamation Our Nation has made enormous strides toward preventing, diagnosing, and treating HIV--a terrible disease that has stolen the precious lives of over 40 million people since the epidemic began in 1981. Despite our progress, over 39

In [26]:
# Load data into MongoDB

def load_into_mongo(data):
    """Upsert documents into MongoDB, keyed by ``document_number``.

    Source fields are overwritten on every load. The processing-state fields
    (``summary``, ``chunked``, ``embedded``, ``processed_at``) are written
    only when a document is first inserted, so re-loading a document does not
    reset state already stored for it.

    Args:
        data: Iterable of document dicts, each with a ``document_number`` key.

    Raises:
        Exception: Any connection or write error is logged and re-raised
            unchanged.
    """
    # Placeholders that should initialise a new record but never overwrite
    # values already present on an existing one.
    insert_only_fields = ("summary", "chunked", "embedded", "processed_at")

    try:
        collection = get_mongo_collection()
        for doc in data:
            refreshed = {key: value for key, value in doc.items() if key not in insert_only_fields}
            insert_defaults = {key: doc[key] for key in insert_only_fields if key in doc}

            update = {"$set": refreshed}
            if insert_defaults:
                # Added only when non-empty, so no empty operator document is sent.
                update["$setOnInsert"] = insert_defaults

            collection.update_one(
                {"document_number": doc["document_number"]},
                update,
                upsert=True
            )
        logging.info(f"Loaded {len(data)} documents into MongoDB.")
    except Exception as e:
        logging.error(f"Failed to load data into MongoDB: {e}")
        raise
    
# Upsert the transformed documents into the MongoDB collection.
load_into_mongo(transformed_data)


2024-12-11 20:44:29,065 - INFO - Connected to MongoDB successfully.
  if response.this_update > now:
  if response.next_update and response.next_update < now:
  if value.next_update is None:
  value.this_update
  < value.next_update
  cached_value.next_update is not None
  and cached_value.next_update < value.next_update
  assert value.this_update is not None
  assert value.next_update is not None
  value.this_update
  < value.next_update
2024-12-11 20:44:43,249 - INFO - Loaded 163 documents into MongoDB.
