In [1]:
# Install required packages
!pip install langchain langchain-openai python-dotenv sentence-transformers langchain_classic langchain_community langchain_community rank_bm25 fitz tools deepeval pypdf faiss-cpu



In [2]:
# Clone the repository to access helper functions and evaluation modules
!git clone https://github.com/NirDiamant/RAG_TECHNIQUES.git
import sys
sys.path.append('RAG_TECHNIQUES')

Cloning into 'RAG_TECHNIQUES'...
remote: Enumerating objects: 1769, done.[K
remote: Counting objects: 100% (1105/1105), done.[K
remote: Compressing objects: 100% (417/417), done.[K
remote: Total 1769 (delta 735), reused 690 (delta 688), pack-reused 664 (from 4)[K
Receiving objects: 100% (1769/1769), 36.51 MiB | 14.09 MiB/s, done.
Resolving deltas: 100% (1121/1121), done.


In [3]:
import os
import sys
from dotenv import load_dotenv
from langchain_core.documents import Document
from typing import List, Dict, Any, Tuple
from langchain_openai import ChatOpenAI
from langchain_classic.chains import RetrievalQA
from langchain_core.retrievers import BaseRetriever
from sentence_transformers import CrossEncoder
from google.colab import userdata

# Ensure the RAG_TECHNIQUES directory is in the path for this cell
if 'RAG_TECHNIQUES' not in sys.path:
    sys.path.append('RAG_TECHNIQUES')

# Use for collab - Set OpenAI API key environment variables FIRST
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
os.environ["OPENAI_API_BASE_URL"] = userdata.get('OPENAI_API_BASE_URL')

# Original path append replaced for Colab compatibility
from helper_functions import *
from evaluation.evalute_rag import *

# Load environment variables from a .env file
# load_dotenv()

# Set the OpenAI API key environment variable (these are now redundant if using userdata.get)
# os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
# os.environ["OPENAI_API_BASE_URL"] = os.getenv('OPENAI_API_BASE_URL')




In [4]:
# Download required data files
import os
os.makedirs('data', exist_ok=True)

# Download the PDF document used in this notebook
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf


--2026-01-27 07:05:27--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206372 (202K) [application/octet-stream]
Saving to: ‘data/Understanding_Climate_Change.pdf’


2026-01-27 07:05:27 (7.97 MB/s) - ‘data/Understanding_Climate_Change.pdf’ saved [206372/206372]

--2026-01-27 07:05:27--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
L

In [5]:
path = "data/Understanding_Climate_Change.pdf"

In [12]:
import os
# from langchain_openai import OpenAIEmbeddings # Remove OpenAI Embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings # Import HuggingFace Embeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# This function redefines the encode_pdf logic to explicitly pass the OpenAI API base URL to OpenAIEmbeddings,
# addressing potential configuration issues or custom endpoint requirements.
def encode_pdf_remake(path, chunk_size=1000, chunk_overlap=200):
  """
  Encodes a PDF book into a vector store using HuggingFace embeddings.

  Args:
      path: The path to the PDF file.
      chunk_size: The desired size of each text chunk.
      chunk_overlap: The amount of overlap between consecutive chunks.

  Returns:
      A FAISS vector store containing the encoded book content.
  """

  # Load PDF documents
  loader = PyPDFLoader(path)
  documents = loader.load()

  # Split documents into chunks
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
  )
  texts = text_splitter.split_documents(documents)
  cleaned_texts = replace_t_with_space(texts) # Assuming replace_t_with_space is available from helper_functions

  # Create embeddings and vector store using HuggingFace Embeddings
  # A common model is 'sentence-transformers/all-MiniLM-L6-v2'
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  vectorstore = FAISS.from_documents(cleaned_texts, embeddings)

  return vectorstore

In [17]:
vectorstore = encode_pdf_remake(path)

## Method 1: LLM based function to rerank the retrieved documents

In [19]:
class RatingScore(BaseModel):
    relevance_score: float = Field(..., description="The relevance score of a document to a query.")

def rerank_documents(query: str, docs: List[Document], top_n: int = 3) -> List[Document]:
    prompt_template = PromptTemplate(
        input_variables=["query", "doc"],
        template="""On a scale of 1-10, rate the relevance of the following document to the query. Consider the specific context and intent of the query, not just keyword matches.
        Query: {query}
        Document: {doc}
        Relevance Score:"""
    )

    llm = ChatOpenAI(
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_API_BASE_URL"),
        temperature=0,
        model_name="gpt-4.1",
        max_tokens=4000
      )
    llm_chain = prompt_template | llm.with_structured_output(RatingScore)

    scored_docs = []
    for doc in docs:
        input_data = {"query": query, "doc": doc.page_content}
        score = llm_chain.invoke(input_data).relevance_score
        try:
            score = float(score)
        except ValueError:
            score = 0  # Default score if parsing fails
        scored_docs.append((doc, score))

    reranked_docs = sorted(scored_docs, key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in reranked_docs[:top_n]]

In [20]:
query = "What are the impacts of climate change on biodiversity?"
initial_docs = vectorstore.similarity_search(query, k=15)
reranked_docs = rerank_documents(query, initial_docs)

# print first 3 initial documents
print("Top initial documents:")
for i, doc in enumerate(initial_docs[:3]):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document


# Print results
print(f"Query: {query}\n")
print("Top reranked documents:")
for i, doc in enumerate(reranked_docs):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document

Top initial documents:

Document 1:
Climate change is altering terrestrial ecosystems by shifting habitat ranges, changing species 
distributions, and impacting ecosystem functions. Forests, grasslands, and deserts are 
experiencing shi...

Document 2:
protection, and habitat creation. 
Climate-Resilient Conservation 
Conservation strategies must account for climate change impacts to be effective. This 
includes identifying climate refugia, areas le...

Document 3:
The economic costs of climate change include damage to infrastructure, reduced agricultural 
productivity, health care costs, and lost labor productivity. Extreme weather events, such as 
hurricanes a...
Query: What are the impacts of climate change on biodiversity?

Top reranked documents:

Document 1:
Climate change is altering terrestrial ecosystems by shifting habitat ranges, changing species 
distributions, and impacting ecosystem functions. Forests, grasslands, and deserts are 
experiencing shi...

Document 2:
Coral re

In [23]:
# Create a custom retriever class
class CustomRetriever(BaseRetriever, BaseModel):

    vectorstore: Any = Field(description="Vector store for initial retrieval")

    class Config:
        arbitrary_types_allowed = True

    def _get_relevant_documents(self, query: str, num_docs=2) -> List[Document]:
        initial_docs = self.vectorstore.similarity_search(query, k=30)
        return rerank_documents(query, initial_docs, top_n=num_docs)


# Create the custom retriever
custom_retriever = CustomRetriever(vectorstore=vectorstore)

# Create an LLM for answering questions
llm = ChatOpenAI(
  api_key=os.getenv("OPENAI_API_KEY"),
  base_url=os.getenv("OPENAI_API_BASE_URL"),
  temperature=0,
  model_name="gpt-4.1",
  max_tokens=4000
)

# Create the RetrievalQA chain with the custom retriever
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=custom_retriever,
    return_source_documents=True
)

/tmp/ipython-input-4244753332.py:2: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  class CustomRetriever(BaseRetriever, BaseModel):


In [24]:
result = qa_chain({"query": query})

print(f"\nQuestion: {query}")
print(f"Answer: {result['result']}")
print("\nRelevant source documents:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document

  result = qa_chain({"query": query})



Question: What are the impacts of climate change on biodiversity?
Answer: Climate change has significant impacts on biodiversity, including:

1. **Shifting Habitat Ranges:** As temperatures rise and precipitation patterns change, many species are forced to move to new areas where conditions are more suitable. This can lead to changes in the composition of forests, grasslands, and deserts.

2. **Changing Species Distributions:** Some species may expand their range, while others may contract or even disappear from certain areas. This disrupts existing ecological balances and can lead to the loss of native species.

3. **Loss of Biodiversity:** The disruption of habitats and ecosystems can result in a decline in the number and variety of species, threatening overall biodiversity.

4. **Disrupted Ecosystem Functions:** Changes in species composition and abundance can impact ecosystem functions such as pollination, nutrient cycling, and food web dynamics.

5. **Marine Ecosystem Impacts:** 

In [27]:
chunks = [
    "The capital of France is great.",
    "The capital of France is huge.",
    "The capital of France is beautiful.",
    """Have you ever visited Paris? It is a beautiful city where you can eat delicious food and see the Eiffel Tower.
    I really enjoyed all the cities in france, but its capital with the Eiffel Tower is my favorite city.""",
    "I really enjoyed my trip to Paris, France. The city is beautiful and the food is delicious. I would love to visit again. Such a great capital city."
]
docs = [Document(page_content=sentence) for sentence in chunks]


def compare_rag_techniques(query: str, docs: List[Document] = docs) -> None:
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)

    print("Comparison of Retrieval Techniques")
    print("==================================")
    print(f"Query: {query}\n")

    print("Baseline Retrieval Result:")
    baseline_docs = vectorstore.similarity_search(query, k=2)
    for i, doc in enumerate(baseline_docs):
        print(f"\nDocument {i+1}:")
        print(doc.page_content)

    print("\nAdvanced Retrieval Result:")
    custom_retriever = CustomRetriever(vectorstore=vectorstore)
    advanced_docs = custom_retriever._get_relevant_documents(query)
    for i, doc in enumerate(advanced_docs):
        print(f"\nDocument {i+1}:")
        print(doc.page_content)


query = "what is the capital of france?"
compare_rag_techniques(query, docs)

Comparison of Retrieval Techniques
Query: what is the capital of france?

Baseline Retrieval Result:

Document 1:
The capital of France is huge.

Document 2:
The capital of France is great.

Advanced Retrieval Result:

Document 1:
The capital of France is great.

Document 2:
The capital of France is huge.


## Method 2: Cross Encoder models

In [30]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

class CrossEncoderRetriever(BaseRetriever, BaseModel):
    vectorstore: Any = Field(description="Vector store for initial retrieval")
    cross_encoder: Any = Field(description="Cross-encoder model for reranking")
    k: int = Field(default=5, description="Number of documents to retrieve initially")
    rerank_top_k: int = Field(default=3, description="Number of documents to return after reranking")

    class Config:
        arbitrary_types_allowed = True

    def _get_relevant_documents(self, query: str) -> List[Document]:
        # Initial retrieval
        initial_docs = self.vectorstore.similarity_search(query, k=self.k)

        # Prepare pairs for cross-encoder
        pairs = [[query, doc.page_content] for doc in initial_docs]

        # Get cross-encoder scores
        scores = self.cross_encoder.predict(pairs)

        # Sort documents by score
        scored_docs = sorted(zip(initial_docs, scores), key=lambda x: x[1], reverse=True)

        # Return top reranked documents
        return [doc for doc, _ in scored_docs[:self.rerank_top_k]]

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        raise NotImplementedError("Async retrieval not implemented")


/tmp/ipython-input-3795157466.py:3: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  class CrossEncoderRetriever(BaseRetriever, BaseModel):


In [31]:
# Create the cross-encoder retriever
cross_encoder_retriever = CrossEncoderRetriever(
    vectorstore=vectorstore,
    cross_encoder=cross_encoder,
    k=10,  # Retrieve 10 documents initially
    rerank_top_k=5  # Return top 5 after reranking
)

# Set up the LLM
llm = ChatOpenAI(
  api_key=os.getenv("OPENAI_API_KEY"),
  base_url=os.getenv("OPENAI_API_BASE_URL"),
  temperature=0,
  model_name="gpt-4.1",
  max_tokens=4000
)

# Create the RetrievalQA chain with the cross-encoder retriever
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=cross_encoder_retriever,
    return_source_documents=True
)

# Example query
query = "What are the impacts of climate change on biodiversity?"
result = qa_chain({"query": query})

print(f"\nQuestion: {query}")
print(f"Answer: {result['result']}")
print("\nRelevant source documents:")
for i, doc in enumerate(result["source_documents"]):
    print(f"\nDocument {i+1}:")
    print(doc.page_content[:200] + "...")  # Print first 200 characters of each document


Question: What are the impacts of climate change on biodiversity?
Answer: Climate change has significant impacts on biodiversity, affecting both terrestrial and marine ecosystems. Here are the main impacts:

**1. Shifts in Habitat Ranges and Species Distributions:**  
As temperatures rise and precipitation patterns change, many plant and animal species are forced to move to new areas where conditions are more suitable. This can lead to changes in the composition of forests, grasslands, deserts, and other ecosystems.

**2. Loss of Biodiversity:**  
These shifts can result in the loss of species that are unable to adapt or migrate quickly enough, leading to reduced biodiversity and the potential extinction of vulnerable species.

**3. Disruption of Ecosystem Functions:**  
Changes in species composition can disrupt ecological balance and the functions that ecosystems provide, such as pollination, nutrient cycling, and water regulation.

**4. Marine Ecosystem Vulnerability:**  
Rising se