In [1]:
import logging
import os
from datetime import datetime
from datetime import timezone

import openai
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from pinecone.grpc import PineconeGRPC as Pinecone
from pymongo import MongoClient


# Load environment variables from a local .env file (if any) into os.environ
load_dotenv()

# The previous `os.environ[K] = os.getenv(K)` round-trip did nothing when K was set and
# raised an opaque "TypeError: str expected, not NoneType" when it was not. Validate the
# keys this notebook reads and fail with an actionable message instead.
_missing_env_vars = [
    name for name in ("OPENAI_API_KEY", "PINECONE_API_KEY") if not os.getenv(name)
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

# Logging Configuration
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

  from tqdm.autonotebook import tqdm


In [2]:
# MongoDB Connection
def get_mongo_collection(db_name='govai', collection_name='test'):
    """Return a MongoDB collection handle after verifying the server is reachable.

    The connection string is read from the MONGO_CONNECTION_STRING environment
    variable.

    Args:
        db_name: Name of the database to open (defaults to 'govai').
        collection_name: Name of the collection to return (defaults to 'test').

    Returns:
        The requested collection object from the connected client.

    Raises:
        ValueError: If MONGO_CONNECTION_STRING is unset or empty.
        Exception: Any error raised while creating the client or pinging the
            server is logged and re-raised unchanged.
    """
    try:
        connection_string = os.getenv('MONGO_CONNECTION_STRING')
        if not connection_string:
            # MongoClient(None) would silently target the default localhost server.
            raise ValueError("MONGO_CONNECTION_STRING is not set")
        client = MongoClient(connection_string)
        # MongoClient connects lazily; issue a cheap command so an unreachable
        # server is reported here rather than on the first query.
        client.admin.command('ping')
        db = client[db_name]
        logging.info("Connected to MongoDB successfully.")
        return db[collection_name]
    except Exception as e:
        logging.error(f"Failed to connect to MongoDB: {e}")
        raise

In [3]:
# Pull every document whose `processed_at` field is still None into memory
collection = get_mongo_collection()
documents = [record for record in collection.find({"processed_at": None})]
logging.info(f"Found {len(documents)} unprocessed documents")

2024-12-12 13:24:47,293 - INFO - Connected to MongoDB successfully.
  if response.next_update and response.next_update < now:
  if response.this_update > now:
  if value.next_update is None:
  value.this_update
  < value.next_update
  cached_value.next_update is not None
  and cached_value.next_update < value.next_update
  assert value.this_update is not None
  assert value.next_update is not None
  value.this_update
  < value.next_update
2024-12-12 13:24:48,876 - INFO - Found 74 unprocessed documents


In [4]:
# Chat model shared by the summarization helper below
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)

def generate_summary(text):
    """Summarize `text` in at most four sentences using the module-level `llm`.

    Args:
        text: The document text to summarize.

    Returns:
        The summary string from the model response, or None when `text` is
        empty/blank or when the model call fails (the error is logged).
    """
    # Without this guard an empty or None value is interpolated into the prompt
    # (None becomes the literal string "None") and a pointless API call is made.
    if not text or not str(text).strip():
        logging.warning("Skipping summary generation: no text provided")
        return None

    try:
        prompt = f"""
        You are a summarization assistant. 
        Summarize the following text in no more than four sentences.
        Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness. 
        Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects. 
        Rely strictly on the provided text, without including external information. 
        The summary will be displayed a quick card, giving users an overview of the document.
        Do not use more than 4 sentences.
        
        {text}"""
        response = llm.invoke(prompt)
        return response.content
    except Exception as e:
        # Best-effort: callers treat None as "no summary available".
        logging.error(f"Error generating summary: {e}")
        return None


In [31]:
# Test the summary generation on the first loaded document.
# `documents` is empty once everything has been processed, and a record may lack
# `raw_text`, so guard both cases instead of raising IndexError/KeyError.
sample_text = documents[0].get('raw_text') if documents else None
if sample_text:
    summary = generate_summary(sample_text)
    print(summary)
else:
    print("No unprocessed document with raw_text available to summarize.")

2024-12-11 21:17:09,748 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


On November 29, 2024, President Biden issued Proclamation 10867, designating December 1, 2024, as World AIDS Day, a day to honor the memory of over 40 million lives lost to HIV since the epidemic began in 1981 and to support those currently living with the disease. The proclamation highlights significant advancements made by the Biden Administration, including the reestablishment of the White House Office of National AIDS Policy, the launch of a new National HIV/AIDS Strategy aimed at ending the epidemic by 2030, and substantial funding through the Ryan White HIV/AIDS Program to ensure access to care for low-income individuals. It also emphasizes efforts to combat stigma, promote understanding of HIV transmission, and reform outdated criminalization laws, while reaffirming the U.S. commitment to global initiatives like PEPFAR, which has saved millions of lives worldwide. The proclamation culminates in a call for unity and support for the HIV community, encouraging participation in acti

In [7]:
# Generate and persist a summary for every loaded document that has text but no summary yet
for doc in documents:
    if not doc.get('raw_text') or doc.get('summary'):
        continue

    summary = generate_summary(doc['raw_text'])
    if not summary:
        # generate_summary already logged the failure; move on to the next document.
        continue

    try:
        collection.update_one(
            {"_id": doc["_id"]},
            {"$set": {"summary": summary}}
        )
    except Exception as e:
        # One failed write should not abort the remaining documents.
        logging.error(f"Error saving summary for document {doc.get('document_number')}: {e}")
        continue

    # Keep the in-memory copy in sync so later cells that inspect `documents`
    # see the summary and re-running this cell does not regenerate it.
    doc['summary'] = summary
    logging.info(f"Added summary for document {doc.get('document_number')}")

1


In [33]:
# Inspect the first loaded document; `documents` may be empty, so avoid an IndexError
print(documents[0] if documents else "No unprocessed documents loaded.")

False


In [None]:
# Initialize text splitter for chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False
    )

# Initialize embeddings and Pinecone
embeddings = OpenAIEmbeddings()
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index_name = "federal-register"  # Replace with your Pinecone index name
# The index handle must be told which index to target; previously `index_name`
# was defined but never passed.
index = pc.Index(index_name)

# Number of vectors sent to Pinecone per upsert request
UPSERT_BATCH_SIZE = 100


def _build_chunk_metadata(doc, chunk_number, text):
    """Build the Pinecone metadata dict for one chunk of `doc`.

    Keys whose value is None are dropped, and values exposing `isoformat()`
    (date/datetime-like objects) are converted to ISO strings, so that only
    plain values are sent as metadata.
    """
    raw_metadata = {
        "document_id": str(doc['_id']),
        "chunk_number": chunk_number,
        "title": doc.get("title", ""),
        "publication_date": doc.get("publication_date", ""),
        "source_url": doc.get("pdf_url"),
        "text": text,  # Store the actual text in metadata
    }
    metadata = {}
    for key, value in raw_metadata.items():
        if value is None:
            continue
        if hasattr(value, "isoformat"):
            value = value.isoformat()
        metadata[key] = value
    return metadata


# Process documents - chunking, embedding, and indexing to Pinecone
for doc in documents:
    if not doc.get('raw_text') or doc.get('chunked'):
        continue
    try:
        # Create chunks
        chunks = text_splitter.create_documents([doc['raw_text']])
        chunk_texts = [chunk.page_content for chunk in chunks]

        # Embed all chunks of the document in one batched call instead of
        # issuing one HTTP request per chunk.
        chunk_vectors = embeddings.embed_documents(chunk_texts) if chunk_texts else []

        # Prepare vectors for Pinecone; the ID is unique per (document, chunk)
        document_id = str(doc['_id'])
        vectors_to_upsert = [
            {
                "id": f"{document_id}_{chunk_number}",
                "values": vector,
                "metadata": _build_chunk_metadata(doc, chunk_number, text),
            }
            for chunk_number, (text, vector) in enumerate(zip(chunk_texts, chunk_vectors))
        ]

        # Batch upsert to Pinecone
        for start in range(0, len(vectors_to_upsert), UPSERT_BATCH_SIZE):
            index.upsert(vectors=vectors_to_upsert[start:start + UPSERT_BATCH_SIZE])

        # Update MongoDB, then mirror the same fields onto the in-memory document
        # so re-running this cell (or later cells) sees it as processed.
        processing_state = {
            "chunked": True,
            "embedded": True,
            # Timezone-aware replacement for the deprecated datetime.utcnow()
            "processed_at": datetime.now(timezone.utc),
        }
        collection.update_one({"_id": doc["_id"]}, {"$set": processing_state})
        doc.update(processing_state)
        logging.info(f"Processed and indexed document {doc.get('document_number')} with {len(chunks)} chunks")

    except Exception as e:
        # Log and keep going so one bad document does not stop the batch.
        logging.error(f"Error processing document {doc.get('document_number')}: {e}")
        continue

2024-12-12 13:30:02,941 - INFO - Discovering subpackages in _NamespacePath(['/Users/nishanshehadeh/opt/anaconda3/envs/WTP/lib/python3.12/site-packages/pinecone_plugins'])
2024-12-12 13:30:02,942 - INFO - Looking for plugins in pinecone_plugins.inference
2024-12-12 13:30:02,962 - INFO - Installing plugin inference into PineconeGRPC
2024-12-12 13:30:16,708 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-12 13:30:17,205 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-12 13:30:17,437 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-12 13:30:17,718 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-12 13:30:17,887 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-12 13:30:18,291 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-12-12 13:30:18,548 -

In [None]:
# Summary of work
# Counts are taken from the in-memory `documents` list. The two checks are
# independent: a document missing both a summary and chunks counts in both totals
# (the previous `elif` skipped the chunking check whenever the summary was missing).
countSummary, countChunked = 0, 0
for doc in documents:
    if not doc.get('raw_text'):
        continue
    if not doc.get('summary'):
        countSummary += 1
    if not doc.get('chunked'):
        countChunked += 1
logging.info(f"Number of documents without summary: {countSummary}")
logging.info(f"Number of documents without chunking: {countChunked}")