In [1]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from rag_utils.vanilla import vectorstore

# Source article, and the CSS classes of the page regions to keep.
SOURCE_URL = "https://lilianweng.github.io/posts/2023-06-23-agent/"
CONTENT_CLASSES = ("post-content", "post-title", "post-header")

# Hand BeautifulSoup a strainer so that only elements carrying one of the
# classes above are parsed; the rest of the page is ignored.
content_filter = bs4.SoupStrainer(class_=CONTENT_CLASSES)
loader = WebBaseLoader(
    web_paths=(SOURCE_URL,),
    bs_kwargs={"parse_only": content_filter},
)

# Fetch the page and wrap the filtered content as LangChain documents.
documents = loader.load()

# Cut the loaded documents into chunks with chunk_size=1000 and no overlap
# between neighbouring chunks; `docs` is what gets indexed later on.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [2]:
# Index the chunks in the vector store imported from rag_utils.vanilla.
# NOTE(review): re-running this cell presumably inserts the same chunks a
# second time — confirm whether the store de-duplicates before re-executing.
vectorstore.add_documents(docs)
# Baseline retriever built directly on the vector store (no arguments passed,
# so whatever defaults the store uses apply).
retriever = vectorstore.as_retriever()

In [3]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from rag_utils.vanilla import format_docs, rag_prompt, llm

# Inputs for the prompt: the incoming question is sent through the baseline
# retriever and `format_docs` to produce "context", and is also forwarded
# unchanged as "question".
vanilla_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Baseline RAG pipeline: inputs -> prompt -> LLM -> plain-string answer.
vanilla_rag_chain = vanilla_inputs | rag_prompt | llm | StrOutputParser()

In [4]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from rag_utils.hyde import HydeRetriever

# HyDE retriever built on the same vector store as the baseline retriever.
# NOTE(review): HydeRetriever lives in rag_utils.hyde and is not shown here;
# per the discussion at the end of this notebook it searches using generated
# hypothetical documents rather than the raw query — confirm in that module.
hyde_retriever = HydeRetriever.from_vectorstore(vectorstore)

# Same input wiring as the baseline chain, with the retriever swapped out.
hyde_inputs = {
    "context": hyde_retriever | format_docs,
    "question": RunnablePassthrough(),
}

# HyDE RAG pipeline: inputs -> prompt -> LLM -> plain-string answer.
hyde_chain = hyde_inputs | rag_prompt | llm | StrOutputParser()

In [5]:
# One question, asked of both pipelines so the answers can be compared.
query = "which vector approximate searching algorithms work in a vector store"

# Baseline chain first, then the HyDE chain.
vanilla_result = vanilla_rag_chain.invoke(query)
hyde_result = hyde_chain.invoke(query)

# Assemble both answers under bracketed labels, then print them in one go.
comparison = f"\n[vanilla_result]:\n{vanilla_result}\n\n[hyde_result]:\n{hyde_result}"
print(comparison)


[vanilla_result]:
The vector approximate searching algorithms that work in a vector store include FAISS (Facebook AI Similarity Search), ScaNN (Scalable Nearest Neighbors), LSH (Locality-Sensitive Hashing), and ANNOY (Approximate Nearest Neighbors Oh Yeah). These algorithms are commonly used for fast Maximum Inner Product Search (MIPS) in a vector store.

[hyde_result]:
The vector approximate searching algorithms that work in a vector store include Locality-Sensitive Hashing (LSH), ANNOY (Approximate Nearest Neighbors Oh Yeah), FAISS (Facebook AI Similarity Search), ScaNN (Scalable Nearest Neighbors), and HNSW (Hierarchical Navigable Small World). These algorithms are commonly used for fast Maximum Inner Product Search (MIPS) in a vector store database.


The [hyde_result] includes the ground-truth item "HNSW", which does not appear in the [vanilla_result].

Let's dive deep into the retrieved results to find the reason.

In [6]:
# Show the documents the baseline vector-store retriever returns for the query.
retriever.invoke(query)

[Document(page_content='FAISS (Facebook AI Similarity Search): It operates on the assumption that in high dimensional space, distances between nodes follow a Gaussian distribution and thus there should exist clustering of data points. FAISS applies vector quantization by partitioning the vector space into clusters and then refining the quantization within clusters. Search first looks for cluster candidates with coarse quantization and then further looks into each cluster with finer quantization.\nScaNN (Scalable Nearest Neighbors): The main innovation in ScaNN is anisotropic vector quantization. It quantizes a data point $x_i$ to $\\tilde{x}_i$ such that the inner product $\\langle q, x_i \\rangle$ is as similar to the original distance of $\\angle q, \\tilde{x}_i$ as possible, instead of picking the closet quantization centroid points.', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'pk': 449915553583988755}),
 Document(page_content='LSH (Locality-Sensiti

In [7]:
# Show the documents the HyDE retriever returns for the same query, so they
# can be compared with the baseline retriever's results.
hyde_retriever.invoke(query)

[Document(page_content='FAISS (Facebook AI Similarity Search): It operates on the assumption that in high dimensional space, distances between nodes follow a Gaussian distribution and thus there should exist clustering of data points. FAISS applies vector quantization by partitioning the vector space into clusters and then refining the quantization within clusters. Search first looks for cluster candidates with coarse quantization and then further looks into each cluster with finer quantization.\nScaNN (Scalable Nearest Neighbors): The main innovation in ScaNN is anisotropic vector quantization. It quantizes a data point $x_i$ to $\\tilde{x}_i$ such that the inner product $\\langle q, x_i \\rangle$ is as similar to the original distance of $\\angle q, \\tilde{x}_i$ as possible, instead of picking the closet quantization centroid points.', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'pk': 449915553583988755}),
 Document(page_content='LSH (Locality-Sensiti

As we can see, the documents retrieved by hyde_retriever contain the "HNSW" item, while those retrieved by the vanilla retriever do not. This is because the generated hypothetical (fake) documents contain content about "HNSW", which helps the HyDE retriever perform better.