In [1]:
# Imports
import os
import nest_asyncio
nest_asyncio.apply()
from dotenv import load_dotenv
load_dotenv()
from langchain_openai import ChatOpenAI
from pymongo import MongoClient
from whbriefingroom_loader import WhBriefingRoomLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch


In [42]:
load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] =  os.getenv('LANGSMITH_API_KEY')
connection_string = os.getenv('MONGO_CONNECTION_STRING')
client = MongoClient(connection_string)

ATLAS_VECTOR_SEARCH_INDEX_NAME = "index_name"
COLLECTION_NAME = "vector_whbr"
DB_NAME = "WH"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]


In [23]:
loader = WhBriefingRoomLoader(
    connection_string=mongodb_connection_string,
    db_name="WTP",
    collection_name="whbriefingroom",
)

In [24]:
# Load documents into "docs" list, each document is loaded in as a large string
# The data for one document can be viewed with docs[index].page_content
docs = loader.load()
len(docs)

9570

In [26]:
print(docs[0].metadata)
print(docs[0].page_content)

{'database': 'WTP', 'collection': 'whbriefingroom', 'title': 'Remarks by President Biden in Statement to the American People', 'date_posted': 'July 24, 2024', 'category': 'Speeches and Remarks', 'url': 'https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/07/24/remarks-by-president-biden-in-statement-to-the-american-people/', 'source': 'White House Gov Briefing Room'}
8:01 P.M. EDT THE PRESIDENT: My fellow Americans, I’m speaking to you tonight from behind the Resolute Desk in the Oval Office. In this sacred space, I’m surrounded by portraits of extraordinary American presidents. Thomas Jefferson, who wrote the immortal words that guide this nation. George Washington, who showed us presidents are not kings. Abraham Lincoln, who implored us to reject malice. Franklin Roosevelt, who inspired us to reject fear. I revere this office, but I love my country more. It’s been the honor of my life to serve as your president. But in the defense of democracy, which is at stake, I think i

In [37]:
# Split document strings
# TODO better to add metadata back to all documents? Otherwise it'll only be in the first chunk right?
# Experiment with this, might need to build own loader
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200, add_start_index=True)
split_documents = []
for doc in docs:
    chunks = text_splitter.split_text(doc.page_content)
    metadata_str = str(doc.metadata)
    for chunk in chunks:
        split_documents.append(Document(
            page_content=chunk, 
            metadata=doc.metadata))

len(split_documents)

110222

In [38]:
# Show the first two chunks w/ overlap from the same article
print(split_documents[0].metadata)
print(split_documents[1].metadata)
print(split_documents[0].page_content)
print(split_documents[1].page_content)

{'database': 'WTP', 'collection': 'whbriefingroom', 'title': 'Remarks by President Biden in Statement to the American People', 'date_posted': 'July 24, 2024', 'category': 'Speeches and Remarks', 'url': 'https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/07/24/remarks-by-president-biden-in-statement-to-the-american-people/', 'source': 'White House Gov Briefing Room'}
{'database': 'WTP', 'collection': 'whbriefingroom', 'title': 'Remarks by President Biden in Statement to the American People', 'date_posted': 'July 24, 2024', 'category': 'Speeches and Remarks', 'url': 'https://www.whitehouse.gov/briefing-room/speeches-remarks/2024/07/24/remarks-by-president-biden-in-statement-to-the-american-people/', 'source': 'White House Gov Briefing Room'}
8:01 P.M. EDT THE PRESIDENT: My fellow Americans, I’m speaking to you tonight from behind the Resolute Desk in the Oval Office. In this sacred space, I’m surrounded by portraits of extraordinary American presidents. Thomas Jefferson, who 

In [44]:
# Create embeddings and add to a MongoDBVectorStore
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=split_documents,
    embedding=OpenAIEmbeddings(),
    collection=MONGODB_COLLECTION,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

OperationFailure: you are over your space quota, using 536 MB of 512 MB, full error: {'ok': 0, 'errmsg': 'you are over your space quota, using 536 MB of 512 MB', 'code': 8000, 'codeName': 'AtlasError'}