# Package Installation and Imports

In [None]:
# Install required packages
!pip install langchain langchain-openai python-dotenv sentence-transformers

In [124]:
# Clone the repository to access helper functions and evaluation modules
!git clone https://github.com/NirDiamant/RAG_TECHNIQUES.git
import sys
sys.path.append('RAG_TECHNIQUES')
# If you need to run with the latest data
# !cp -r RAG_TECHNIQUES/data .

fatal: destination path 'RAG_TECHNIQUES' already exists and is not an empty directory.


In [125]:
import os
import sys
# from langchain.docstore.document import Document
from langchain_core.documents import Document as LangchainDocument
from typing import List, Dict, Any, Tuple
from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
from langchain_core.retrievers import BaseRetriever
from sentence_transformers import CrossEncoder

from google.colab import userdata


# Set the OpenAI API key environment variable
OPENAI_API_KEY = userdata.get('key_openai')

In [126]:
# from helper_functions import *

In [None]:
!pip install langchain-community

In [128]:
from langchain_community.document_loaders.pdf import PyPDFLoader

In [129]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [130]:
from langchain_openai import OpenAIEmbeddings

In [131]:
!pip install langchain[faiss]



In [132]:
from langchain_community.vectorstores import FAISS

# Define the document's path

In [133]:
# Download required data files
import os
os.makedirs('data', exist_ok=True)

# Download the PDF document used in this notebook
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf


--2026-01-22 09:11:00--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206372 (202K) [application/octet-stream]
Saving to: ‘data/Understanding_Climate_Change.pdf’


2026-01-22 09:11:00 (7.89 MB/s) - ‘data/Understanding_Climate_Change.pdf’ saved [206372/206372]

--2026-01-22 09:11:00--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
L

In [134]:
path = "data/Understanding_Climate_Change.pdf"

# Create a vector store

In [135]:
# vectorstore = encode_pdf(path)


In [136]:
from langchain_community.embeddings import HuggingFaceEmbeddings


In [137]:

def create_vectorstore(file_path, chunk_size=1000, chunk_overlap=100, persist=False, persist_dir="vectorstore"):
    """
    Tạo vector store từ file PDF hoặc TXT.

    Args:
        file_path (str): đường dẫn file tài liệu (.pdf hoặc .txt)
        chunk_size (int): số ký tự tối đa mỗi chunk
        chunk_overlap (int): số ký tự overlap giữa các chunk
        persist (bool): có lưu vectorstore ra disk không
        persist_dir (str): thư mục lưu vectorstore nếu persist=True

    Returns:
        vectorstore: object vectorstore (FAISS)
    """

    # Load document
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext == ".txt":
        loader = TextLoader(file_path)
    else:
        raise ValueError("Chỉ hỗ trợ PDF hoặc TXT")

    documents = loader.load()

    # Split thành chunks nhỏ
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    docs = text_splitter.split_documents(documents)

    # Tạo embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)


    # Tạo vectorstore FAISS
    vectorstore = FAISS.from_documents(docs, embeddings)

    # Lưu vectorstore nếu cần
    if persist:
        vectorstore.save_local(persist_dir)

    return vectorstore


In [138]:
!pip install faiss-cpu



In [139]:
!pip install pypdf



In [140]:
vectorstore = create_vectorstore(
    path,
    chunk_size=2000,
    chunk_overlap=200,
    persist=True
)


# Method 1: LLM based function to rerank the retrieved documents

# Create a custom reranking function

In [141]:
from pydantic import BaseModel, Field

In [142]:
from langchain_core.prompts import PromptTemplate

In [143]:
class RatingScore(BaseModel):
    relevance_score: float = Field(..., description="The relevance score of a document to a query.")

def rerank_documents(query: str, docs: List[LangchainDocument], top_n: int = 3) -> List[LangchainDocument]:
    prompt_template = PromptTemplate(
        input_variables=["query", "doc"],
        template="""On a scale of 1-10, rate the relevance of the following document to the query. Consider the specific context and intent of the query, not just keyword matches.
        Query: {query}
        Document: {doc}
        Relevance Score:"""
    )

    llm = ChatOpenAI(
        model="gpt-4.1", # Model's name
        temperature=0,
        max_tokens=4000,
        openai_api_key=userdata.get('key_ptn'), # PTN's key
        base_url="https://llm.ptnglobalcorp.com"
    )
    llm_chain = prompt_template | llm.with_structured_output(RatingScore)

    scored_docs = []
    for doc in docs:
        input_data = {"query": query, "doc": doc.page_content}
        score = llm_chain.invoke(input_data).relevance_score
        try:
            score = float(score)
        except ValueError:
            score = 0  # Default score if parsing fails
        scored_docs.append((doc, score))

    reranked_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in reranked_docs[:top_n]]

# Example usage of the reranking function with a sample query relevant to the document

In [144]:
query = "What are the impacts of climate change on biodiversity?"
initial_docs = vectorstore.similarity_search(query, k=15)
reranked_docs = rerank_documents(query, initial_docs)

# print first 3 initial documents
print("Top initial documents:")
for i, doc in enumerate(initial_docs[:3]):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document


# Print results
print(f"Query: {query}\n")
print("Top reranked documents:")
for i, doc in enumerate(reranked_docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document

Top initial documents:

Document 1:
includes identifying climate refugia, areas less affected by climate change, and prioritizing 
them for protection. Adaptive management practices ensure that conservation efforts remain 
effective und...

Document 2:
experiencing shifts in plant and animal species composition. These changes can lead to a loss 
of biodiversity and disrupt ecological balance. 
Marine Ecosystems 
Marine ecosystems are highly vulnerab...

Document 3:
goals. Policies should promote synergies between biodiversity conservation and climate 
action. 
Chapter 10: Climate Change and Human Health 
Health Impacts 
Heat-Related Illnesses 
Rising temperature...
Query: What are the impacts of climate change on biodiversity?

Top reranked documents:

Document 1:
experiencing shifts in plant and animal species composition. These changes can lead to a loss 
of biodiversity and disrupt ecological balance. 
Marine Ecosystems 
Marine ecosystems are highly vulnerab...

Document 2:
includes

In [145]:
from langchain_core.retrievers import BaseRetriever

In [146]:
# # Create a custom retriever class
# class CustomRetriever(BaseRetriever, BaseModel):

#     vectorstore: Any = Field(description="Vector store for initial retrieval")

#     class Config:
#         arbitrary_types_allowed = True

#     def get_relevant_documents(self, query: str, num_docs=2) -> List[Document]:
#         initial_docs = self.vectorstore.similarity_search(query, k=30)
#         return rerank_documents(query, initial_docs, top_n=num_docs)


# # Create the custom retriever
# custom_retriever = CustomRetriever(vectorstore=vectorstore)

# # Create an LLM for answering questions
# llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

# # Create the RetrievalQA chain with the custom retriever
# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=custom_retriever,
#     return_source_documents=True
# )

""" RetrievalQA cant import, so we replace it with another methods"""


' RetrievalQA cant import, so we replace it with another methods'

In [147]:
class RetrievalQA(BaseModel):
    """
    Minimal replacement for RetrievalQA for LangChain 1.2.6
    Keeps the same interface:
        - from_chain_type
        - return_source_documents
        - run / __call__ interface
    """
    llm: Any
    retriever: BaseRetriever
    return_source_documents: bool = True
    # combine_documents_chain: Any = None  # Stuff chain
    # Minimal prompt
    prompt_template: str= """You are a helpful assistant.
Use ONLY the following context to answer the question.
If the answer is not in the context, say "I don't know".

Context:
{context}

Question:
{input}

Answer:
"""


    @classmethod
    def from_chain_type(cls, llm, retriever, chain_type="stuff", return_source_documents=True):
        if chain_type != "stuff":
            raise NotImplementedError("Only 'stuff' chain is implemented in this custom class")
        return cls(
            llm=llm,
            retriever=retriever,
            return_source_documents=return_source_documents
        )

        # combine_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

        return cls(
            llm=llm,
            retriever=retriever,
            return_source_documents=return_source_documents,
            # combine_documents_chain=combine_chain
        )

    def __call__(self, inputs: Dict[str, str]) -> Dict[str, Any]:
        query = inputs.get("query") or inputs.get("input")
        if not query:
            raise ValueError("You must provide 'query' or 'input' key")

        # Retrieve documents
        docs: List[LangchainDocument] = self.retriever.get_relevant_documents(query)

        # Run combine_documents_chain
        # answer = self.combine_documents_chain.run(docs, question=query)
        context = "\n\n".join([doc.page_content for doc in docs])


        #  Create prompt
        final_prompt = self.prompt_template.format(context=context, input=query)

        # Call directly LLMs
        answer = self.llm.invoke(final_prompt)

        # Return in same format as original RetrievalQA
        result = {"result": answer}
        if self.return_source_documents:
            result["source_documents"] = docs
        return result

    def run(self, query: str) -> str:
        return self.__call__({"query": query})["result"]


In [148]:
# Create a custom retriever class
class CustomRetriever(BaseRetriever, BaseModel):

    vectorstore: Any = Field(description="Vector store for initial retrieval")

    class Config:
        arbitrary_types_allowed = True
    # abstract method
    def _get_relevant_documents(self, query: str, num_docs=2) -> List[LangchainDocument]:
        initial_docs = self.vectorstore.similarity_search(query, k=30)
        return rerank_documents(query, initial_docs, top_n=num_docs)

    # wrapper of BaseRetriever
    def get_relevant_documents(self, query: str, num_docs=2) -> List[LangchainDocument]:
        initial_docs = self.vectorstore.similarity_search(query, k=30)
        return rerank_documents(query, initial_docs, top_n=num_docs)



/tmp/ipython-input-3281216925.py:2: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  class CustomRetriever(BaseRetriever, BaseModel):


In [149]:
custom_retriever = CustomRetriever(vectorstore=vectorstore)

In [150]:
llm =  ChatOpenAI(
        model="gpt-4.1", # Model's name
        temperature=0,
        max_tokens=4000,
        openai_api_key=userdata.get('key_ptn'), # PTN's key
        base_url="https://llm.ptnglobalcorp.com"
    )

In [151]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=custom_retriever,
    chain_type="stuff",
    return_source_documents=True
)

# Example query

In [152]:
result = qa_chain({"query": query})

print(f"\nQuestion: {query}")
print(f"Answer: {result['result']}")
print("\nRelevant source documents:")
print("Sources:", [doc.page_content[:200] for doc in result["source_documents"]]) # Print first 200 characters of each document


Question: What are the impacts of climate change on biodiversity?
Answer: content='Climate change impacts biodiversity by altering terrestrial ecosystems through shifting habitat ranges, changing species distributions, and impacting ecosystem functions. Forests, grasslands, and deserts are experiencing shifts in plant and animal species composition, which can lead to a loss of biodiversity and disrupt ecological balance. In marine ecosystems, rising sea temperatures, ocean acidification, and changing currents affect marine biodiversity, from coral reefs to deep-sea habitats. Species migration and changes in reproductive cycles can disrupt marine food webs and fisheries.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 96, 'prompt_tokens': 483, 'total_tokens': 579, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': None, 'reasoning_tokens': None, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_to

# Example that demonstrates why we should use reranking

In [153]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [154]:
    # load embedding_model
    embedding_model = HuggingFaceEmbeddings(
        model_name="thenlper/gte-small",
        multi_process=True,
        # model_kwargs={"device": "cuda"},
        encode_kwargs={
            "normalize_embeddings": True
        },  # set True to compute cosine similarity
    )

In [155]:
chunks = [
    "The capital of France is great.",
    "The capital of France is huge.",
    "The capital of France is beautiful.",
    """Have you ever visited Paris? It is a beautiful city where you can eat delicious food and see the Eiffel Tower.
    I really enjoyed all the cities in france, but its capital with the Eiffel Tower is my favorite city.""",
    "I really enjoyed my trip to Paris, France. The city is beautiful and the food is delicious. I would love to visit again. Such a great capital city."
]
docs = [LangchainDocument(page_content=sentence) for sentence in chunks]


def compare_rag_techniques(query: str, docs: List[LangchainDocument] = docs) -> None:
    embeddings = embedding_model
    vectorstore = FAISS.from_documents(docs, embeddings)
    vectorstore = FAISS.from_documents(docs, embeddings)

    print("Comparison of Retrieval Techniques")
    print("==================================")
    print(f"Query: {query}\n")

    print("Baseline Retrieval Result:")
    baseline_docs = vectorstore.similarity_search(query, k=2)
    for i, doc in enumerate(baseline_docs):
        print(f"\nDocument {i+1}:")
        print(doc.page_content)

    print("\nAdvanced Retrieval Result:")
    custom_retriever = CustomRetriever(vectorstore=vectorstore)
    advanced_docs = custom_retriever.get_relevant_documents(query)
    for i, doc in enumerate(advanced_docs):
        print(f"\nDocument {i+1}:")
        print(doc.page_content)


query = "what is the capital of france?"
compare_rag_techniques(query, docs)

Comparison of Retrieval Techniques
Query: what is the capital of france?

Baseline Retrieval Result:

Document 1:
The capital of France is great.

Document 2:
The capital of France is huge.

Advanced Retrieval Result:

Document 1:
The capital of France is great.

Document 2:
Have you ever visited Paris? It is a beautiful city where you can eat delicious food and see the Eiffel Tower.
    I really enjoyed all the cities in france, but its capital with the Eiffel Tower is my favorite city.


# Method 2:Cross Encoder models

# Define the cross encoder class

In [159]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

class CrossEncoderRetriever(BaseRetriever, BaseModel):
    vectorstore: Any = Field(description="Vector store for initial retrieval")
    cross_encoder: Any = Field(description="Cross-encoder model for reranking")
    k: int = Field(default=5, description="Number of documents to retrieve initially")
    rerank_top_k: int = Field(default=3, description="Number of documents to return after reranking")

    class Config:
        arbitrary_types_allowed = True

    def _get_relevant_documents(self, query: str) -> List[LangchainDocument]:
        # Initial retrieval
        initial_docs = self.vectorstore.similarity_search(query, k=self.k)

        # Prepare pairs for cross-encoder
        pairs = [[query, doc.page_content] for doc in initial_docs]

        # Get cross-encoder scores
        scores = self.cross_encoder.predict(pairs)

        # Sort documents by score
        scored_docs = sorted(zip(initial_docs, scores), key=lambda x: x[1], reverse=True)

        # Return top reranked documents
        return [doc for doc, _ in scored_docs[:self.rerank_top_k]]


    def get_relevant_documents(self, query: str) -> List[LangchainDocument]:
        # Initial retrieval
        initial_docs = self.vectorstore.similarity_search(query, k=self.k)

        # Prepare pairs for cross-encoder
        pairs = [[query, doc.page_content] for doc in initial_docs]

        # Get cross-encoder scores
        scores = self.cross_encoder.predict(pairs)

        # Sort documents by score
        scored_docs = sorted(zip(initial_docs, scores), key=lambda x: x[1], reverse=True)

        # Return top reranked documents
        return [doc for doc, _ in scored_docs[:self.rerank_top_k]]

    async def aget_relevant_documents(self, query: str) -> List[LangchainDocument]:
        raise NotImplementedError("Async retrieval not implemented")



/tmp/ipython-input-2459284724.py:3: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  class CrossEncoderRetriever(BaseRetriever, BaseModel):


# Create an instance and showcase over an example

In [162]:
# Create the cross-encoder retriever
cross_encoder_retriever = CrossEncoderRetriever(
    vectorstore=vectorstore,
    cross_encoder=cross_encoder,
    k=10,  # Retrieve 10 documents initially
    rerank_top_k=5  # Return top 5 after reranking
)

# Set up the LLM
llm = ChatOpenAI(
    base_url="https://llm.ptnglobalcorp.com",
    model="gpt-4.1",
    temperature=0,
    max_tokens=4000,
    openai_api_key=userdata.get('key_ptn')
)

# Create the RetrievalQA chain with the cross-encoder retriever
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=cross_encoder_retriever,
    return_source_documents=True
)

# Example query
query = "What are the impacts of climate change on biodiversity?"
result = qa_chain({"query": query})

print(f"\nQuestion: {query}")
print(f"Answer: {result['result']}")
print("\nRelevant source documents:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document


Question: What are the impacts of climate change on biodiversity?
Answer: content='Climate change impacts biodiversity by causing shifts in plant and animal species composition, leading to a loss of biodiversity and disrupting ecological balance. In marine ecosystems, rising sea temperatures, ocean acidification, and changing currents affect marine biodiversity, disrupt marine food webs, and impact fisheries. Freshwater ecosystems are affected by changes in precipitation patterns, temperature, and water flow, resulting in altered water quality, habitat loss, and reduced biodiversity. Overall, climate change contributes to ecosystem degradation and decreased availability of natural resources.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 98, 'prompt_tokens': 1279, 'total_tokens': 1377, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': None, 'reasoning_tokens': None, 'rejected_prediction_tokens': 0}, 'prompt_tok