In [1]:
2+2


4

## Vector Store Retriever

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

from langchain_openai import OpenAIEmbeddings
from pathlib import Path

BASEDIR=Path.cwd()

PATHDIR=BASEDIR / "data" / "MachineLearning.pdf"

loader=PyPDFLoader(PATHDIR)
docs=loader.load()

rec_splitter=RecursiveCharacterTextSplitter(chunk_size=300 ,chunk_overlap=30)

chunks=rec_splitter.split_documents(docs)
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")

In [3]:
from langchain_community.vectorstores import Chroma

vectorstore = Chroma(
    collection_name="my_docs",
    embedding_function=embeddings,
    persist_directory="./chroma_db"
)
vectorstore.add_documents(chunks)
vectorstore.persist()

  vectorstore = Chroma(
  vectorstore.persist()


In [None]:
vector_retriever=vectorstore.as_retriever(search_kwargs={"k":3})

response=vector_retriever.invoke("machine")
print(len(response))
for d in response:
    print(f"ID:{d.metadata.get('id')}: {d.page_content}")



3
ID:None: tor machine its name.
To make a prediction for a new point, the distance to each of the support vectors is
measured. A classification decision is made based on the distances to the support vec‚Äê
tor, and the importance of the support vectors that was learned during training
ID:None: large machine, or even to rent one from a cloud provider. In most applications, the
data that is used to build a machine learning system is relatively small, though, and
few machine learning datasets consist of hundreds of gigabites of data or more. This
ID:None: put given an input. In particular, the algorithm is able to create an output for an input
it has never seen before without any help from a human. Going back to our example
of spam classification, using machine learning, the user provides the algorithm with a


## BM25 Retriever

BM25 is a lexical (keyword-based) retrieval algorithm.

Matches exact words

Uses term frequency + inverse document frequency

No embeddings

Very fast

üëâ Best for:

IDs, codes, names

Numbers

Error messages

In [None]:

from langchain_community.retrievers import BM25Retriever

bm25Retriever=BM25Retriever.from_documents(documents=docs,k=3)

response=bm25Retriever.invoke("machine")
print(len(response))
for d in response:
    print({d.page_content})

3
{'Preface\nMachine learning is an integral part of many commercial applications and research\nprojects today, in areas ranging from medical diagnosis and treatment to finding your\nfriends on social networks. Many people think that machine learning can only be\napplied by large companies with extensive research teams. In this book, we want to\nshow you how easy it can be to build machine learning solutions yourself, and how to\nbest go about it. With the knowledge in this book, you can build your own system for\nfinding out how people feel on Twitter, or making predictions about global warming.\nThe applications of machine learning are endless and, with the amount of data avail‚Äê\nable today, mostly limited by your imagination.\nWho Should Read This Book\nThis book is for current and aspiring machine learning practitioners looking to\nimplement solutions to real-world machine learning problems. This is an introduc‚Äê\ntory book requiring no previous knowledge of machine learning or 

## Ensemble /Hybrid Retriever

In [None]:
from langchain.retrievers.ensemble import EnsembleRetriever

bm25Retriever=BM25Retriever.from_documents(documents=docs,k=3)
vector_retriever=vectorstore.as_retriever(search_kwargs={"k":3})

#Ensemble
ensemble=EnsembleRetriever(retrievers=[bm25Retriever,vector_retriever], weights=[0.5,0.5])

response=ensemble.invoke("machine")
print(len(response))
for d in response:
    print(d.metadata)

## MultiQuery Retriever

User query: "machine"
‚Üí Retriever searches using exactly "machine"
‚Üí Misses synonyms, intent variations

LLM rewrites the query into multiple variants:
- "machine learning concepts"
- "types of machines in AI"
- "how machines learn"

Each query ‚Üí retriever ‚Üí results merged

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

base_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 3}
)

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm
)

docs = multi_query_retriever.invoke("machine")


## Contextual compression

Problem in normal RAG

Retriever returns long chunks

Only 2‚Äì3 lines are relevant

LLM wastes tokens + hallucinations

Solution

Compress retrieved documents based on the user query

üëâ Keep only query-relevant sentences

In [None]:
from langchain_openai import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# LLM for compression
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

# Base retriever (vector)
base_retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5}
)

# Compressor
compressor = LLMChainExtractor.from_llm(llm)

# Contextual Compression Retriever
compression_retriever = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=compressor
)

docs = compression_retriever.invoke("machine learning types")

print("Compressed docs:", len(docs))
for d in docs:
    print("\n---")
    print(d.page_content)
