In [1]:
# Imports
import os
import nest_asyncio
nest_asyncio.apply()
from dotenv import load_dotenv
load_dotenv()
from whbriefingroom_loader import WhBriefingRoomLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient
from langchain.schema import Document
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore




  from tqdm.autonotebook import tqdm


In [2]:
# Configuration: secrets come from the environment (load_dotenv() above reads .env).
# Validate them up front. Re-assigning os.environ[name] = os.getenv(name) was a
# no-op when the variable was set and raised an opaque
# "TypeError: str expected, not NoneType" when it was not.
REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "PINECONE_API_KEY", "MONGO_CONNECTION_STRING")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# Without this check a missing connection string would be passed to MongoClient as None.
connection_string = os.environ["MONGO_CONNECTION_STRING"]
client = MongoClient(connection_string)

ATLAS_VECTOR_SEARCH_INDEX_NAME = "index_name"
DB_NAME = "WTP"
COLLECTION_NAME = "whbriefingroom"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

In [3]:
# Configure the loader with the MongoDB connection string and the database /
# collection names defined in the configuration cell.
# NOTE(review): WhBriefingRoomLoader is a project-local class; presumably it
# reads the briefing-room articles from that collection — confirm in its module.
loader = WhBriefingRoomLoader(
    connection_string=connection_string,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
)

In [4]:
# Load documents into "docs" list, each document is loaded in as a large string
# The data for one document can be viewed with docs[index].page_content
# and its source fields (title, date_posted, category, ...) with docs[index].metadata
docs = loader.load()
# Last expression of the cell: displays how many documents were loaded.
len(docs)

9570

In [5]:
# Peek at the first loaded document: its metadata dict on one line, followed by
# its full text on the next.
print(docs[0].metadata, docs[0].page_content, sep="\n")

{'database': 'WTP', 'collection': 'whbriefingroom', 'id': '66a8ff13efac37d905d1db54', 'title': 'Remarks by President Biden in Statement to the American People', 'date_posted': 'July 24, 2024', 'category': 'Speeches and Remarks'}
8:01 P.M. EDT THE PRESIDENT: My fellow Americans, I’m speaking to you tonight from behind the Resolute Desk in the Oval Office. In this sacred space, I’m surrounded by portraits of extraordinary American presidents. Thomas Jefferson, who wrote the immortal words that guide this nation. George Washington, who showed us presidents are not kings. Abraham Lincoln, who implored us to reject malice. Franklin Roosevelt, who inspired us to reject fear. I revere this office, but I love my country more. It’s been the honor of my life to serve as your president. But in the defense of democracy, which is at stake, I think it’s more important than any title. I draw strength and I find joy in working for the American people. But this sacred task of perfecting our Union — it’

In [6]:
# Split each article into overlapping chunks for embedding.
# The title / date / category are appended to the text of EVERY chunk, so each
# embedded chunk carries its article context; the remaining metadata fields
# stay in Document.metadata.
# NOTE: add_start_index only takes effect through create_documents() /
# split_documents(); split_text() returns plain strings, so no start index is
# recorded for these chunks.
# TODO experiment with also keeping title/date/category as structured metadata
# (might need to build own loader).
INLINE_METADATA_KEYS = ("title", "date_posted", "category")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
split_documents = []
for doc in docs:
    chunks = text_splitter.split_text(doc.page_content)
    metadata_str = f"Title: {doc.metadata['title']} Date Posted: {doc.metadata['date_posted']} Category: {doc.metadata['category']}"
    new_metadata = {key: value for key, value in doc.metadata.items() if key not in INLINE_METADATA_KEYS}

    for chunk in chunks:
        split_documents.append(Document(
            page_content=f"{chunk} {metadata_str}",
            # Each chunk gets its own dict. Passing the same dict object to every
            # chunk of an article risks one chunk's metadata edits leaking into
            # its siblings (whether Document copies its input is version-dependent).
            metadata=dict(new_metadata)))

len(split_documents)

110222

In [7]:
from pprint import pprint

# Inspect the first chunk of the first article: the metadata kept on the
# Document, then the chunk text.
first_chunk = split_documents[0]
pprint(first_chunk.metadata)
pprint(first_chunk.page_content)


{'collection': 'whbriefingroom',
 'database': 'WTP',
 'id': '66a8ff13efac37d905d1db54'}
('8:01 P.M. EDT THE PRESIDENT: My fellow Americans, I’m speaking to you '
 'tonight from behind the Resolute Desk in the Oval Office. In this sacred '
 'space, I’m surrounded by portraits of extraordinary American presidents. '
 'Thomas Jefferson, who wrote the immortal words that guide this nation. '
 'George Washington, who showed us presidents are not kings. Abraham Lincoln, '
 'who implored us to reject malice. Franklin Roosevelt, who inspired us to '
 'reject fear. I revere this office, but I love my country more. It’s been the '
 'honor of my life to serve as your president. But in the defense of '
 'democracy, which is at stake, I think it’s more important than any title. I '
 'draw strength and I find joy in working for the American people. But this '
 'sacred task of perfecting our Union — it’s not about me. It’s about you, '
 'your families, your futures. It’s about “We the People.” We can

In [8]:
# Connect to Pinecone (the client is constructed without arguments; the API key
# is expected to come from the environment) and make sure the target index exists.
pc = Pinecone()

index_name = "langchain-index"
# Upper bound, in seconds, on how long to wait for a newly created index.
INDEX_READY_TIMEOUT_S = 300

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors being stored
        # (text-embedding-3-small produces 1536-dimensional vectors by default).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the index reports ready, but give up after the timeout instead
    # of spinning forever if provisioning never completes.
    deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)

index = pc.Index(index_name)


In [10]:
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Position in split_documents of the first chunk uploaded by this cell; every
# chunk before it is skipped.
# NOTE(review): presumably the earlier chunks were already uploaded by a previous
# partial run — confirm against the index's vector count. Set to 0 to upload
# everything into an empty index.
UPLOAD_START_INDEX = 90872

#FIXME count on documents, maybe upsert them somehow next time, I think that's a function
#FIXME maybe add back metadata separate of text, src: https://python.langchain.com/v0.2/docs/integrations/retrievers/self_query/pinecone/

# Embeds the selected chunks and writes them to the Pinecone index named above.
vector_store = PineconeVectorStore.from_documents(split_documents[UPLOAD_START_INDEX:], embeddings, index_name=index_name)
