## Vector stores and retrievers


In [1]:
from langchain_core.documents import Document

In [2]:
documents = [
    Document(
        page_content="Dogs are reat companies, known for their loyalty and friendliness",
        metadata={"source": "mamma1-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space",
        metadata={"source": "mamma1-pets-doc"},
    ),
    Document(
        page_content="Goldfish are popular pets for beginners, requiring relatively simple care. ",
        metadata={"source": "fish-pets-doc"},
    ),
    Document(
        page_content="Parrots are intelligent birds capable of mimicking human speech",
        metadata={"source": "bird-pets-doc"},
    )
]

In [4]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_groq import ChatGroq
groq_api_key = os.getenv("GROQ_API_KEY")

os.environ["HF_ACCESS_TOKEN"] = os.getenv("HF_ACCESS_TOKEN")

llm = ChatGroq(groq_api_key=groq_api_key, model="Llama3-8b-8192")
print(llm)

client=<groq.resources.chat.completions.Completions object at 0x10e563cb0> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x10e6f0980> model_name='Llama3-8b-8192' model_kwargs={} groq_api_key=SecretStr('**********')


In [5]:
from tqdm.autonotebook import tqdm, trange
from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [6]:
## vector stores
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(
    documents, embedding=embeddings
)
vectorstore


<langchain_chroma.vectorstores.Chroma at 0x1214a2cf0>

In [7]:
vectorstore.similarity_search("cat")

[Document(id='dc66aacd-9c04-4a6b-b2f7-aa41e832c7f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space'),
 Document(id='17a330f4-efb8-4cbb-9687-a1035a28d8f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Dogs are reat companies, known for their loyalty and friendliness'),
 Document(id='6a424100-9923-4673-95f0-f7dfe6f86996', metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech'),
 Document(id='8c63f321-2d89-4b03-9495-83e2b91b9c4f', metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care. ')]

In [8]:
# Async query
await vectorstore.asimilarity_search("Cat")

[Document(id='dc66aacd-9c04-4a6b-b2f7-aa41e832c7f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space'),
 Document(id='17a330f4-efb8-4cbb-9687-a1035a28d8f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Dogs are reat companies, known for their loyalty and friendliness'),
 Document(id='6a424100-9923-4673-95f0-f7dfe6f86996', metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech'),
 Document(id='8c63f321-2d89-4b03-9495-83e2b91b9c4f', metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care. ')]

In [10]:
vectorstore.similarity_search_with_score("cat")

[(Document(id='dc66aacd-9c04-4a6b-b2f7-aa41e832c7f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space'),
  0.9436444044113159),
 (Document(id='17a330f4-efb8-4cbb-9687-a1035a28d8f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Dogs are reat companies, known for their loyalty and friendliness'),
  1.4431562423706055),
 (Document(id='6a424100-9923-4673-95f0-f7dfe6f86996', metadata={'source': 'bird-pets-doc'}, page_content='Parrots are intelligent birds capable of mimicking human speech'),
  1.628485918045044),
 (Document(id='8c63f321-2d89-4b03-9495-83e2b91b9c4f', metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care. '),
  1.8011280298233032)]

### Retrievers 

Langchain VectorStore objects do not subclass Runnable, and so cannot immediately be integrated into LangChain Expression Language chains

Langchain Retrievers are Runnables, so they implement a standard set of methods (e.g., synchronous and asynchronous invoke and batch operations) and are designed to be incorporated in LCEL chains

We can create a simple version of this ourselves, without subclassing Retriever. If we choose what method we wish to use to retrieve documents, we can create a runnable easily. Below we will build one around the similarity_search method.


In [12]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda

retriever = RunnableLambda(vectorstore.similarity_search).bind(k=1)
retriever.batch(["cat", "dog"])

[[Document(id='dc66aacd-9c04-4a6b-b2f7-aa41e832c7f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space')],
 [Document(id='17a330f4-efb8-4cbb-9687-a1035a28d8f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Dogs are reat companies, known for their loyalty and friendliness')]]

Vectorestore implement an as_retriever method that will generate a Retriever, specifically a VectorStoreRetriever. These retrievers include specific search_type and search_kwargs attributes that identify what methods of the underlying vectors store to call, and how to parameterize them. For instance, we can replicate the above with the following:

In [13]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1}
)
retriever.batch(["cat", "dog"])

[[Document(id='dc66aacd-9c04-4a6b-b2f7-aa41e832c7f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space')],
 [Document(id='17a330f4-efb8-4cbb-9687-a1035a28d8f9', metadata={'source': 'mamma1-pets-doc'}, page_content='Dogs are reat companies, known for their loyalty and friendliness')]]

In [15]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

message = """
Answer the question using the provided context only.
{question}
Context: 
{context}
"""

prompt = ChatPromptTemplate.from_messages([("human", message)])

rag_chain = {
    "context": retriever,
    "question": RunnablePassthrough()
} | prompt | llm | StrOutputParser()

response = rag_chain.invoke("Tell me about dogs")
print(response)

According to the provided context, dogs are "reat companies, known for their loyalty and friendliness".
