In [49]:
import os
from dotenv import load_dotenv
# Load key/value pairs from a local .env file into the process environment so
# that the os.getenv() lookups in the following cell can find the credentials.
load_dotenv()

True

In [2]:
# Connection settings and API credentials, read from the process environment.
# Each name is bound to None when the corresponding variable is not set.
MONGODB_ATLAS_CLUSTER_URI = os.environ.get("MONGODB_ATLAS_CLUSTER_URI")
MONGODB_PEM_LOCATION = os.environ.get("MONGODB_PEM_LOCATION")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")

In [52]:
from langchain.embeddings import CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.schema.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter

In [53]:
def return_text_document(input_text: str):
    """Wrap a raw string in a LangChain ``Document``.

    Args:
        input_text: Text to store as the document's ``page_content``.

    Returns:
        A ``Document`` whose ``page_content`` is ``input_text``.
    """
    # Use the argument: the previous body read the module-level ``some_text``
    # and silently ignored whatever the caller passed in.
    return Document(page_content=input_text)

# Sample two-paragraph passage. The fragments are joined by implicit string
# concatenation; spelling and spacing are kept exactly as-is because the value
# is runtime data.
some_text = (
    "When writing documents, writers will use document structure to group content. "
    "This can convey to the reader, which idea's are related. For example, closely related ideas "
    "are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  "
    "Paragraphs are often delimited with a carriage return or two carriage returns. "
    'Carriage returns are the "backslash n" you see embedded in this string. '
    "Sentences have a period at the end, but also, have a space."
    "and words are separated by space."
)

In [54]:
# Load the transcript file from a path relative to the kernel's working
# directory. `documents` is what gets passed to the splitter's
# split_documents() in the next cell.
loader = TextLoader("./state_of_the_union.txt")
documents = loader.load()

In [66]:
# Split the loaded documents into chunks using a target chunk_size of 1000 and
# no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    # NOTE(review): the `separators=[...]` list below was left disabled; that
    # keyword looks like it belongs to RecursiveCharacterTextSplitter (imported
    # above) rather than CharacterTextSplitter — confirm before re-enabling.
    # separators=["\n\n", "\n", "\. ", " ", ""]
)
docs = text_splitter.split_documents(documents)

In [63]:
# Display the chunks produced by the splitter.
# NOTE(review): this renders every chunk; a slice such as `docs[:3]` would keep
# the saved output small.
docs

[Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source'

In [60]:
# Cohere embedding client used when inserting the chunks below.
# NOTE(review): the model name is hard-coded here, while
# get_cohere_embeddings_definition() later in the notebook reads it from
# COHERE_MODEL_NAME — presumably both must name the same model so stored and
# query vectors are comparable; confirm.
embeddings = CohereEmbeddings(model="embed-english-light-v2.0", cohere_api_key=COHERE_API_KEY)

In [44]:
from pymongo import MongoClient
from pymongo.server_api import ServerApi

# initialize MongoDB python client
# TLS is enabled, the client certificate/key file comes from
# MONGODB_PEM_LOCATION, and the server API version is pinned to '1'.
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI,
                     tls=True,
                     tlsCertificateKeyFile=MONGODB_PEM_LOCATION,
                     server_api=ServerApi('1'))

# Database and collection that receive the embedded chunks, plus the index
# name handed to MongoDBAtlasVectorSearch in the cells below.
db_name = "langchain_db"
collection_name = "langchain_col"
collection = client[db_name][collection_name]
index_name = "emb"

In [45]:
# Display the collection handle.
# NOTE(review): the saved repr below includes the cluster hostnames and the
# absolute path of the local X.509 certificate file — clear this cell's output
# before sharing or committing the notebook.
collection

Collection(Database(MongoClient(host=['ac-ogbb0jm-shard-00-00.jbfx18n.mongodb.net:27017', 'ac-ogbb0jm-shard-00-01.jbfx18n.mongodb.net:27017', 'ac-ogbb0jm-shard-00-02.jbfx18n.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, authsource='$external', authmechanism='MONGODB-X509', retrywrites=True, w='majority', replicaset='atlas-473jce-shard-0', tls=True, tlscertificatekeyfile='/Users/nikhil.joy/coding_stuff/open_source/ai-phone-caller/src/docloader/X509-cert-7504625587682750196.pem', server_api=<pymongo.server_api.ServerApi object at 0x127738650>), 'langchain_db'), 'langchain_col')

In [46]:
# insert the documents in MongoDB Atlas with their embedding
# NOTE(review): re-running this cell presumably inserts the same chunks again,
# so the collection may accumulate duplicates — confirm and clear the
# collection between runs if so.
docsearch = MongoDBAtlasVectorSearch.from_documents(
    docs, embeddings, collection=collection, index_name=index_name
)

In [47]:
# Query text intended for a similarity search between its embedding and the
# embeddings of the stored documents.
# NOTE(review): this cell only defines the query; no search call is made here.
query = "What did the president say about Ketanji Brown Jackson"


In [48]:
# Print the text of the first element of `docs`.
# NOTE(review): the saved output below is a different passage from the first
# chunk shown when `docs` was displayed earlier, so it was likely produced
# while `docs` held something else — re-run from a fresh kernel to confirm.
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


In [68]:
from langchain.schema.document import Document
from langchain.embeddings import CohereEmbeddings
from pymongo import MongoClient
from pymongo.server_api import ServerApi
from typing import List, Tuple

def get_mongo_client():
    """Build a MongoDB client from environment configuration.

    Reads ``MONGODB_ATLAS_CLUSTER_URI`` (required) and ``MONGODB_PEM_LOCATION``
    from the environment. The client is configured with TLS enabled, the PEM
    file as its certificate/key file, and server API version ``'1'``.

    Returns:
        The ``MongoClient`` instance for the configured cluster URI.

    Raises:
        RuntimeError: If ``MONGODB_ATLAS_CLUSTER_URI`` is unset or empty.
    """
    cluster_uri = os.getenv("MONGODB_ATLAS_CLUSTER_URI")
    if not cluster_uri:
        # Passing None as the host makes MongoClient fall back to localhost,
        # which would hide a missing .env entry behind an unrelated
        # connection failure much later.
        raise RuntimeError(
            "MONGODB_ATLAS_CLUSTER_URI is not set; cannot connect to MongoDB Atlas"
        )
    return MongoClient(
        cluster_uri,
        tls=True,
        tlsCertificateKeyFile=os.getenv("MONGODB_PEM_LOCATION"),
        server_api=ServerApi('1'),
    )

def get_cohere_embeddings_definition():
    """Build the Cohere embeddings object from environment configuration.

    The model name comes from ``COHERE_MODEL_NAME``; when that variable is
    unset or empty, ``"embed-english-light-v2.0"`` is used, which is the model
    the notebook's ``embeddings`` cell passes explicitly. The API key comes
    from ``COHERE_API_KEY``.

    Returns:
        A ``CohereEmbeddings`` instance.
    """
    # Without a fallback an unset COHERE_MODEL_NAME would pass model=None,
    # overriding the library default with a non-string value.
    model_name = os.getenv("COHERE_MODEL_NAME") or "embed-english-light-v2.0"
    return CohereEmbeddings(
        model=model_name,
        cohere_api_key=os.getenv("COHERE_API_KEY"),
    )

def return_relevant_docs(
    query: str,
    mongo_client,
    emb_definition,
    db_name: str,
    collection_name: str,
    index_name: str,
    k: int = 4,
) -> List[Document]:
    """Run a similarity search for ``query`` against a MongoDB Atlas collection.

    Args:
        query: Text to search for.
        mongo_client: Client object indexed as ``mongo_client[db_name][collection_name]``.
        emb_definition: Embedding object passed to the vector store as ``embedding``.
        db_name: Name of the database holding the collection.
        collection_name: Name of the collection to search.
        index_name: Name of the index passed to the vector store.
        k: Number of results requested from ``similarity_search``. The default
            of 4 is intended to match the library's own default, so existing
            callers see the same behaviour.

    Returns:
        Whatever ``MongoDBAtlasVectorSearch.similarity_search`` returns for the
        query (annotated as a list of ``Document``).
    """
    target_collection = mongo_client[db_name][collection_name]
    store = MongoDBAtlasVectorSearch(
        collection=target_collection,
        embedding=emb_definition,
        index_name=index_name,
    )
    return store.similarity_search(query, k=k)

def add_docs(
    docs: List[Document],
    mongo_client,
    emb_definition,
    db_name,
    collection_name,
    index_name,
) -> None:
    """Embed ``docs`` and insert them into a MongoDB Atlas collection.

    Args:
        docs: Documents to store. An empty list is a no-op.
        mongo_client: Client object indexed as ``mongo_client[db_name][collection_name]``.
        emb_definition: Embedding object passed to the vector store as ``embedding``.
        db_name: Name of the target database.
        collection_name: Name of the target collection.
        index_name: Name of the index passed to the vector store.

    Returns:
        None. The function is called for its side effect only.
    """
    if not docs:
        # Nothing to embed or insert, so skip the collection lookup and the
        # vector-store call entirely.
        return
    target_collection = mongo_client[db_name][collection_name]
    # Called for its side effect; the returned vector store is not needed, so
    # it is no longer bound to an unused local.
    MongoDBAtlasVectorSearch.from_documents(
        documents=docs,
        collection=target_collection,
        embedding=emb_definition,
        index_name=index_name,
    )

# Search parameters for this run.
query = "What did the president say about Ketanji Brown Jackson"
db_name = "langchain_db"
collection_name = "langchain_col"
index_name = "emb"

# Build the client and the embedding object once and share them between the
# insert and the search, instead of constructing a second MongoClient and a
# second embeddings object for the second call.
mongo_client = get_mongo_client()
emb_definition = get_cohere_embeddings_definition()

# NOTE(review): this inserts `docs` every time the cell runs, and the earlier
# MongoDBAtlasVectorSearch.from_documents cell inserts the same `docs` as well,
# so the collection presumably accumulates duplicate chunks — confirm and
# clear the collection between runs if so.
add_docs(
    docs,
    mongo_client,
    emb_definition,
    db_name,
    collection_name,
    index_name,
)

# Left as the last expression so the notebook displays the search results.
return_relevant_docs(
    query,
    mongo_client,
    emb_definition,
    db_name,
    collection_name,
    index_name,
)

[Document(page_content='But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n\nDanielle says Heath was a fighter to the very end. \n\nHe didn’t know how to stop fighting, and neither did she. \n\nThrough her pain she found purpose to demand we do better. \n\nTonight, Danielle—we are. \n\nThe VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n\nAnd tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. \n\nI’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. \n\nAnd fourth, let’s end cancer as we know it. \n\nThis is personal to me and Jill, to Kamala, and to so many of you. \n\nCancer is the #2 cause of death in America–second only to heart disease.', metadata={'_id': ObjectId('64ea683d3baaa3eb5d585714'), 'embedding':