## Load docs

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS




# Document to index, fetched directly from arXiv.
pdf_url = "https://arxiv.org/pdf/2401.18059v1.pdf"

# Read the PDF with image extraction switched on.
pdf_loader = PyPDFLoader(file_path=pdf_url, extract_images=True)
pdf_pages = pdf_loader.load()

# Break the loaded pages into small, non-overlapping chunks
# (chunk_size=300, chunk_overlap=0).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=300)
docs = splitter.split_documents(documents=pdf_pages)

# Embedding model built with the library defaults (no model name given).
embedding_model = HuggingFaceEmbeddings()

# Index the same chunks, with the same embeddings, in two vector stores
# so their retrievers can be compared side by side.
chroma_db = Chroma.from_documents(documents=docs, embedding=embedding_model)
faiss_db = FAISS.from_documents(documents=docs, embedding=embedding_model)

In [27]:
# Number of documents each retriever below is configured to return
# (passed as `k` to the vector-store retrievers and set as `.k` on BM25 / TF-IDF).
top_K = 6

# 1. Use a vector store as a retriever

## 1.1 Chroma

In [2]:
# Wrap the Chroma store as a retriever: plain similarity search, top_K hits.
chroma_retriever = chroma_db.as_retriever(
    search_kwargs={"k": top_K},
    search_type="similarity",
)

In [3]:
# Test question; the same `query` is passed to every retriever in the cells below.
query = "What is a RAG system?"

In [4]:
# Run the query through the Chroma-backed retriever; the result is a list of Document objects.
retrieved_docs = chroma_retriever.invoke(query)

In [5]:
# How many documents came back.
len(retrieved_docs)

6

In [6]:
# Show the retrieved documents (page_content plus page/source metadata).
retrieved_docs

[Document(page_content='ral Information Processing Systems , volume 33, pp. 1877–1901. Curran Associates, Inc.,\n10', metadata={'page': 9, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='2018), is to index large quantities of text, after splitting it into chunks (paragraphs), in a separate\ninformation retrieval system. Retrieved information is then presented to the LLM along with the', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='texts of length 100, similar to traditional retrieval augmentation techniques. If a sentence exceeds the\n100-token limit, we move the entire sentence to the next chunk, rather than cutting it mid-sentence.', metadata={'page': 2, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='denses the potentially large volume of retrieved information into a manageable size. We provide\nstatistics on the compression due to the summarization in Appendix C and the

## 1.2 FAISS

In [7]:
# Wrap the FAISS store as a retriever: plain similarity search, top_K hits.
faiss_retriever = faiss_db.as_retriever(
    search_kwargs={"k": top_K},
    search_type="similarity",
)

In [8]:
# Query the FAISS-backed retriever. This cell previously called
# `chroma_retriever`, so the FAISS section only repeated the Chroma results.
retrieved_docs = faiss_retriever.invoke(query)

In [9]:
# How many documents came back.
len(retrieved_docs)

6

In [10]:
# Show the retrieved documents (page_content plus page/source metadata).
retrieved_docs

[Document(page_content='ral Information Processing Systems , volume 33, pp. 1877–1901. Curran Associates, Inc.,\n10', metadata={'page': 9, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='2018), is to index large quantities of text, after splitting it into chunks (paragraphs), in a separate\ninformation retrieval system. Retrieved information is then presented to the LLM along with the', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='texts of length 100, similar to traditional retrieval augmentation techniques. If a sentence exceeds the\n100-token limit, we move the entire sentence to the next chunk, rather than cutting it mid-sentence.', metadata={'page': 2, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='denses the potentially large volume of retrieved information into a manageable size. We provide\nstatistics on the compression due to the summarization in Appendix C and the

# 2. BM25

In [11]:
# Uncomment to install the rank_bm25 package for the BM25 retriever in this section:
# %pip install --upgrade --quiet  rank_bm25

In [12]:
from langchain_community.retrievers import BM25Retriever

In [13]:
# Build a BM25 retriever over the same chunks (no embedding model is passed here),
# then cap the number of returned documents at top_K.
bm25_retriever = BM25Retriever.from_documents(documents=docs)
bm25_retriever.k = top_K

In [14]:
# Query BM25 through `invoke`, the same call used for the other retrievers in
# this notebook; `get_relevant_documents` is deprecated in LangChain.
retrieved_docs = bm25_retriever.invoke(query)

In [15]:
# How many documents came back.
len(retrieved_docs) 

6

In [16]:
# Show the retrieved documents (page_content plus page/source metadata).
retrieved_docs

[Document(page_content='text and then mapping the tokens. This amendment improves the accuracy of the METEOR\ncalculation by taking into account the correct linguistic boundaries of words.\nQuestion: What is the central theme of the story?', metadata={'source': 'https://arxiv.org/pdf/2401.18059v1.pdf', 'page': 20}),
 Document(page_content='total tokens ←0\nfornode in top nodes do\niftotal tokens +node.token size<max tokens then\nresult.append(node)\nend if\ntotal tokens ←total tokens +node.token size\nend for\nreturn result\nend function\nQuestion: What is the central theme of the story?', metadata={'source': 'https://arxiv.org/pdf/2401.18059v1.pdf', 'page': 18}),
 Document(page_content='LM fine-tuned for open-domain question answering; and RAG (Retrieval-Augmented Genera-\ntion) (Lewis et al., 2020), which integrates pre-trained sequence-to-sequence models with a neural\nretriever. Min et al. (2021) introduced Joint Passage Retrieval (JPR) model which uses a tree-', metadata={'source'

# 3. TF-IDF

In [17]:
# Uncomment to install scikit-learn for the TF-IDF retriever in this section:
# %pip install --upgrade --quiet  scikit-learn

In [18]:
from langchain_community.retrievers import TFIDFRetriever

In [19]:
# Build a TF-IDF retriever over the same chunks (no embedding model is passed here),
# then cap the number of returned documents at top_K.
tfidf_retriever = TFIDFRetriever.from_documents(documents=docs)
tfidf_retriever.k = top_K

In [20]:
# Query TF-IDF through `invoke`, the same call used for the other retrievers in
# this notebook; `get_relevant_documents` is deprecated in LangChain.
retrieved_docs = tfidf_retriever.invoke(query)

In [21]:
# How many documents came back.
len(retrieved_docs) 

6

In [22]:
# Show the retrieved documents (page_content plus page/source metadata).
retrieved_docs

[Document(page_content='from the story is present below and the full PDF of this story is linked here. For questions like “What\nis the central theme of the story?”, an upper-level node is retrieved which includes the sentence:\n“This story is about the power of human connection... inspiring and uplifting each other as they', metadata={'source': 'https://arxiv.org/pdf/2401.18059v1.pdf', 'page': 19}),
 Document(page_content='given context, it is not possible to determine how Cinderella finds a happy ending, as the text lacks\ninformation about the story’s conclusion.”\nThe second question we examine is “What is the central theme of the story?”, a thematic question', metadata={'source': 'https://arxiv.org/pdf/2401.18059v1.pdf', 'page': 19}),
 Document(page_content='2018), is to index large quantities of text, after splitting it into chunks (paragraphs), in a separate\ninformation retrieval system. Retrieved information is then presented to the LLM along with the', metadata={'source': 'ht

# 4. Ensemble

In [23]:
from langchain.retrievers import EnsembleRetriever

In [34]:
# Combine all four retrievers into one. The weights are listed in the same
# order as the retrievers and are all equal (0.25 each).
ensemble_retriever = EnsembleRetriever(
    weights=[0.25, 0.25, 0.25, 0.25],
    retrievers=[
        chroma_retriever,
        faiss_retriever,
        bm25_retriever,
        tfidf_retriever,
    ],
)

In [35]:
# Run the same query through the combined retriever.
retrieved_docs = ensemble_retriever.invoke(query)

In [37]:
# How many documents the ensemble returned; this can differ from top_K
# because results from several retrievers are merged.
len(retrieved_docs)

15

In [38]:
# Show the merged list of retrieved documents (page_content plus page/source metadata).
retrieved_docs

[Document(page_content='2018), is to index large quantities of text, after splitting it into chunks (paragraphs), in a separate\ninformation retrieval system. Retrieved information is then presented to the LLM along with the', metadata={'page': 0, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='ral Information Processing Systems , volume 33, pp. 1877–1901. Curran Associates, Inc.,\n10', metadata={'page': 9, 'source': 'https://arxiv.org/pdf/2401.18059v1.pdf'}),
 Document(page_content='given context, it is not possible to determine how Cinderella finds a happy ending, as the text lacks\ninformation about the story’s conclusion.”\nThe second question we examine is “What is the central theme of the story?”, a thematic question', metadata={'source': 'https://arxiv.org/pdf/2401.18059v1.pdf', 'page': 19}),
 Document(page_content='text and then mapping the tokens. This amendment improves the accuracy of the METEOR\ncalculation by taking into account the correct li