In [11]:
# Install required packages
!pip install langchain-experimental langchain-openai python-dotenv rank_bm25 fitz tools deepeval PyMuPDF faiss-cpu

Collecting langchain-experimental
  Downloading langchain_experimental-0.4.1-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.7-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-community<1.0.0,>=0.4.0 (from langchain-experimental)
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community<1.0.0,>=0.4.0->langchain-experimental)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community<1.0.0,>=0.4.0->langchain-experimental)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community<1.0.0,>=0.4.0->langchain-experimental)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community<1.0.0,>=0.4.0->lan

In [6]:
# Clone the repository to access helper functions and evaluation modules
!git clone https://github.com/NirDiamant/RAG_TECHNIQUES.git

Cloning into 'RAG_TECHNIQUES'...
remote: Enumerating objects: 1769, done.
remote: Counting objects: 100% (1105/1105), done.
remote: Compressing objects: 100% (417/417), done.
remote: Total 1769 (delta 735), reused 690 (delta 688), pack-reused 664 (from 4)
Receiving objects: 100% (1769/1769), 36.51 MiB | 30.62 MiB/s, done.
Resolving deltas: 100% (1121/1121), done.


In [34]:
import os
import sys

# Make the cloned repository importable (helper_functions, evaluation.*).
# The path is resolved from the current working directory, which is where `git clone`
# placed the checkout (/content on Colab), instead of hard-coding the Colab location.
_repo_dir = os.path.abspath('RAG_TECHNIQUES')
# Guard against piling up duplicate entries when the cell is executed more than once
if _repo_dir not in sys.path:
    sys.path.append(_repo_dir)

In [13]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.8/23.8 MB 25.9 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [35]:
import os
import sys
from dotenv import load_dotenv

from langchain_experimental.text_splitter import SemanticChunker

# Configure the OpenAI credentials FIRST, i.e. before the helper modules below are imported.
try:
    # On Google Colab the values come from the notebook's "Secrets" panel.
    from google.colab import userdata
except ImportError:
    # Not running on Colab: fall back to a local .env file (or variables that are already
    # exported) instead of crashing on the missing google.colab module.
    load_dotenv()
else:
    os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
    # NOTE(review): the OpenAI SDK documents OPENAI_BASE_URL as its endpoint variable;
    # confirm that something actually reads OPENAI_API_BASE_URL.
    os.environ["OPENAI_API_BASE_URL"] = userdata.get('OPENAI_API_BASE_URL')

# These modules live in the cloned RAG_TECHNIQUES checkout and are found through the
# sys.path entry added in an earlier cell.
from helper_functions import *
from evaluation.evalute_rag import *

In [8]:
# Download required data files
import os
os.makedirs('data', exist_ok=True)

# Download the PDF document used in this notebook
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf


--2026-01-27 16:01:15--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206372 (202K) [application/octet-stream]
Saving to: ‘data/Understanding_Climate_Change.pdf’


2026-01-27 16:01:15 (7.96 MB/s) - ‘data/Understanding_Climate_Change.pdf’ saved [206372/206372]

--2026-01-27 16:01:15--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
L

In [36]:
# Location of the PDF to process, relative to the working directory (the download cell saves it under data/)
path = "data/Understanding_Climate_Change.pdf"

In [37]:
from langchain_community.embeddings import HuggingFaceEmbeddings # Import HuggingFace Embeddings
import fitz # Import PyMuPDF
def read_pdf_to_string(path):
    """
    Read a PDF document from the specified path and return its content as a string.

    Args:
        path (str): The file path to the PDF document.

    Returns:
        str: The text of all pages in the PDF document, concatenated in page order
        with nothing inserted between pages.

    The document is opened with the 'fitz' library (PyMuPDF) inside a ``with`` block,
    so it is closed as soon as the text has been extracted, including when extracting
    a page raises an exception.
    """
    with fitz.open(path) as doc:
        # Collect the text of every page and join once, rather than growing a string with +=
        return "".join(page.get_text() for page in doc)

In [38]:
# Extract the text of every page of the PDF into one string
content = read_pdf_to_string(path)

In [39]:
from langchain_experimental.text_splitter import SemanticChunker

# Choose which embeddings, breakpoint type and threshold the semantic splitter uses
_splitter_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = SemanticChunker(
    _splitter_embeddings,
    breakpoint_threshold_type='percentile',
    breakpoint_threshold_amount=90,
)

In [40]:
# Split the extracted text with the semantic splitter; create_documents takes a list of texts, hence [content]
docs = text_splitter.create_documents([content])

In [41]:
from langchain_community.vectorstores import FAISS

# Embed the chunks and index them in FAISS
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)

# Retriever that hands back the 2 best-matching chunks per query
chunks_query_retriever = vectorstore.as_retriever(search_kwargs=dict(k=2))

In [42]:
def retrieve_context_per_question_remake(question, chunks_query_retriever):
    """
    Look up the documents relevant to a question and return their text.

    Args:
        question: The question for which to retrieve context.
        chunks_query_retriever: Retriever whose ``invoke(question)`` returns the
            matching documents; each document must expose ``page_content``.

    Returns:
        A list holding the ``page_content`` of every retrieved document, in the
        order the retriever returned them (one entry per document, not a joined string).
    """
    return [hit.page_content for hit in chunks_query_retriever.invoke(question)]

In [43]:
from helper_functions import *

# Sanity check: retrieve the chunks for a sample question and display them
test_query = "What is the main cause of climate change?"
context = retrieve_context_per_question_remake(test_query, chunks_query_retriever)
# show_context is not defined in this notebook; it is pulled in by the helper_functions wildcard import
show_context(context)

Context 1:
These effects include: 
Rising Temperatures 
Global temperatures have risen by about 1.2 degrees Celsius (2.2 degrees Fahrenheit) since 
the late 19th century. This warming is not uniform, with some regions experiencing more 
significant increases than others. Heatwaves 
Heatwaves are becoming more frequent and severe, posing risks to human health, agriculture, 
and infrastructure. Cities are particularly vulnerable due to the "urban heat island" effect. Heatwaves can lead to heat-related illnesses and exacerbate existing health conditions. Changing Seasons 
Climate change is altering the timing and length of seasons, affecting ecosystems and human 
activities. For example, spring is arriving earlier, and winters are becoming shorter and 
milder in many regions. This shift disrupts plant and animal life cycles and agricultural 
practices. Melting Ice and Rising Sea Levels 
Warmer temperatures are causing polar ice caps and glaciers to melt, contributing to rising 
sea levels