In [1]:
# Vector stores

# pip install chromadb

In [4]:
import getpass
import os

# Never paste a real API key into the notebook, not even inside a comment: it
# ends up in version control and in every shared copy. Any key that was ever
# written into this file must be treated as compromised and revoked.
#
# Prompt only when the key is not already present in the environment, so that
# "Restart & Run All" can run unattended once OPENAI_API_KEY has been exported.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma

# Load the document, split it into chunks, embed each chunk and load it into the vector store.
# OPENAI_API_KEY must be set before this cell runs (run the API-key cell first);
# otherwise OpenAIEmbeddings() raises a ValidationError.

# Source text:
# https://github.com/hwchase17/chroma-langchain/blob/master/state_of_the_union.txt
# The file is read from the notebook's working directory; adjust the path if it lives elsewhere.

raw_documents = TextLoader('state_of_the_union.txt').load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

# Keep a single, named embedding model: cells further down refer to
# `embeddings`, so it has to exist as a notebook-level variable.
embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings)

ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)

In [None]:
# Similarity search: query the store with a natural-language question and
# show the text of the first document it returns.

query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
first_doc = docs[0]
print(first_doc.page_content)

In [None]:
# Similarity search by vector: embed the query text explicitly, then search
# the store with the resulting vector instead of the raw string.

embedding_vector = OpenAIEmbeddings().embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
first_doc = docs[0]
print(first_doc.page_content)

In [None]:
# Asynchronous operations

# pip install qdrant-client

from langchain_community.vectorstores import Qdrant

In [None]:
# Create a vector store asynchronously

db = await Qdrant.afrom_documents(documents, embeddings, "http://localhost:6333")

In [None]:
# Similarity search

query = "What did the president say about Ketanji Brown Jackson"
docs = await db.asimilarity_search(query)
print(docs[0].page_content)

In [None]:
# Similarity search by vector

embedding_vector = embeddings.embed_query(query)
docs = await db.asimilarity_search_by_vector(embedding_vector)

In [None]:
# Maximum marginal relevance shearch (MMR)

# La relevancia marginal máxima optimiza la similitud con la consulta y la diversidad entre los documentos seleccionados. También es compatible con API asíncrona.

query = "What did the president say about Ketanji Brown Jackson"
found_docs = await qdrant.amax_marginal_relevance_search(query, k=2, fetch_k=10)
for i, doc in enumerate(found_docs):
    print(f"{i + 1}.", doc.page_content, "\n")