In [24]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from datetime import datetime

# Load environment variables
load_dotenv()

# Logging Configuration
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

In [25]:
# MongoDB Connection
def get_mongo_collection():
    try:
        connection_string = os.getenv('MONGO_CONNECTION_STRING')
        client = MongoClient(connection_string)
        db = client['govai']
        logging.info("Connected to MongoDB successfully.")
        return db['test']
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {e}")
        raise

In [26]:
# Get documents from MongoDB that haven't been processed
collection = get_mongo_collection()
documents = list(collection.find({"processed_at": None}))
logging.info(f"Found {len(documents)} unprocessed documents")

2024-12-11 20:45:20,118 - INFO - Connected to MongoDB successfully.
  if response.this_update > now:
  if response.next_update and response.next_update < now:
  if value.next_update is None:
  value.this_update
  < value.next_update
  cached_value.next_update is not None
  and cached_value.next_update < value.next_update
  assert value.this_update is not None
  assert value.next_update is not None
  value.this_update
  < value.next_update
2024-12-11 20:45:21,557 - INFO - Found 163 unprocessed documents


In [27]:
print(documents[0]['raw_text'])

Federal Register, Volume 89 Issue 234 (Thursday, December 5, 2024) [Federal Register Volume 89, Number 234 (Thursday, December 5, 2024)] [Presidential Documents] [Pages 96515-96516] From the Federal Register Online via the Government Publishing Office [www.gpo.gov] [FR Doc No: 2024-28714]       Presidential Documents       Federal Register / Vol. 89, No. 234 / Thursday, December 5, 2024 / Presidential Documents   [[Page 96515]] Proclamation 10867 of November 29, 2024 World AIDS Day, 2024 By the President of the United States of America A Proclamation Our Nation has made enormous strides toward preventing, diagnosing, and treating HIV--a terrible disease that has stolen the precious lives of over 40 million people since the epidemic began in 1981. Despite our progress, over 39 million people worldwide continue to live with it, including over 1 million people in the United States. On World AIDS Day, we honor the memory of all those we tragically lost to HIV around the world. We stand in 

In [28]:
# Initialize LLM for summarization
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")

def generate_summary(text):
    try:
        prompt = f"Craft a one paragraphsummary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness. 
        Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects. 
        Rely strictly on the provided text, without including external information. Format the summary in paragraph form for easy 
        understanding.\n\n{text}"
        response = llm.invoke(prompt)
        return response.content
    except Exception as e:
        logging.error(f"Error generating summary: {e}")
        return None
# Generate summaries and update MongoDB

#print(generate_summary(documents[0]['raw_text']))
print(summary = generate_summary(documents[0]['raw_text']))
"""
for doc in documents:
    if doc.get('raw_text') and not doc.get('summary'):
        summary = generate_summary(doc['raw_text'])
        if summary:
            collection.update_one(
                {"_id": doc["_id"]},
                {"$set": {"summary": summary}}
            )
            print("Updated")
            logging.info(f"Added summary for document {doc.get('document_number')}")
"""

SyntaxError: unterminated f-string literal (detected at line 6) (4289498910.py, line 6)

In [7]:
# Initialize text splitter for chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Initialize embeddings
embeddings = OpenAIEmbeddings()

# Process documents - chunking and embedding
for doc in documents:
    if doc.get('raw_text') and not doc.get('chunked'):
        try:
            # Create chunks
            chunks = text_splitter.create_documents([doc['raw_text']])
            
            # Generate embeddings for each chunk
            chunk_embeddings = []
            for i, chunk in enumerate(chunks):
                embedding = embeddings.embed_query(chunk.page_content)
                chunk_embeddings.append({
                    "chunk_id": i,
                    "text": chunk.page_content,
                    "embedding": embedding
                })
            
            # Update MongoDB with chunks and embeddings
            collection.update_one(
                {"_id": doc["_id"]},
                {
                    "$set": {
                        "chunks": chunk_embeddings,
                        "chunked": True,
                        "embedded": True,
                        "processed_at": datetime.utcnow()
                    }
                }
            )
            logging.info(f"Processed document {doc.get('document_number')} with {len(chunks)} chunks")
            
        except Exception as e:
            logging.error(f"Error processing document {doc.get('document_number')}: {e}")
            continue