# Package Installation and Imports

In [1]:
# Install required packages
!pip install langchain-experimental langchain-openai python-dotenv

Collecting langchain-experimental
  Downloading langchain_experimental-0.4.1-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.7-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-community<1.0.0,>=0.4.0 (from langchain-experimental)
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community<1.0.0,>=0.4.0->langchain-experimental)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community<1.0.0,>=0.4.0->langchain-experimental)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community<1.0.0,>=0.4.0->langchain-experimental)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community<1.0.0,>=0.4.0->lan

In [2]:
# Clone the repository to access helper functions and evaluation modules
!git clone https://github.com/NirDiamant/RAG_TECHNIQUES.git
import sys
sys.path.append('RAG_TECHNIQUES')
# If you need to run with the latest data
# !cp -r RAG_TECHNIQUES/data .

Cloning into 'RAG_TECHNIQUES'...
remote: Enumerating objects: 1765, done.[K
remote: Counting objects: 100% (1101/1101), done.[K
remote: Compressing objects: 100% (414/414), done.[K
remote: Total 1765 (delta 733), reused 688 (delta 687), pack-reused 664 (from 4)[K
Receiving objects: 100% (1765/1765), 36.51 MiB | 10.23 MiB/s, done.
Resolving deltas: 100% (1119/1119), done.


In [3]:
import os
import sys
# from langchain.docstore.document import Document
from langchain_core.documents import Document as LangchainDocument
from typing import List, Dict, Any, Tuple
from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
from langchain_core.retrievers import BaseRetriever
from sentence_transformers import CrossEncoder

from google.colab import userdata


# Set the OpenAI API key environment variable
OPENAI_API_KEY = userdata.get('key_openai')



In [4]:
!pip install langchain-community langchain_experimental



In [5]:
from langchain_experimental.text_splitter import SemanticChunker


In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Define file path

In [7]:
# Download required data files
import os
os.makedirs('data', exist_ok=True)

# Download the PDF document used in this notebook
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
!wget -O data/Understanding_Climate_Change.pdf https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf


--2026-01-22 09:42:17--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206372 (202K) [application/octet-stream]
Saving to: ‘data/Understanding_Climate_Change.pdf’


2026-01-22 09:42:17 (13.4 MB/s) - ‘data/Understanding_Climate_Change.pdf’ saved [206372/206372]

--2026-01-22 09:42:17--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/Understanding_Climate_Change.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
L

In [8]:
path = "data/Understanding_Climate_Change.pdf"

# Read PDF to string

In [10]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-6.6.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.6.0-py3-none-any.whl (328 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m327.7/329.0 kB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.6.0


In [11]:
from pypdf import PdfReader

def read_pdf_to_string(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

In [12]:
content = read_pdf_to_string(path)

### Breakpoint types:
* 'percentile': all differences between sentences are calculated, and then any difference greater than the X percentile is split.
* 'standard_deviation': any difference greater than X standard deviations is split.
* 'interquartile': the interquartile distance is used to split chunks.

In [13]:
model_embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  model_embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
text_splitter = SemanticChunker(model_embedding, breakpoint_threshold_type='percentile', breakpoint_threshold_amount=90)
# chose which embeddings and breakpoint type and threshold to use

# Split original text to semantic chunks

In [15]:
docs = text_splitter.create_documents([content])

### Create vector store and retriever


In [20]:
!pip install faiss-cpu
!pip install langchain[faiss]



In [22]:
from langchain_community.vectorstores import FAISS

In [23]:
embeddings = model_embedding
vectorstore = FAISS.from_documents(docs, embeddings)
chunks_query_retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

### Test the retriever

In [36]:
from pydantic import BaseModel, Field

In [34]:
from langchain_core.retrievers import BaseRetriever

In [44]:
class CustomRetriever(BaseRetriever, BaseModel):
    vectorstore: Any = Field(description="Vector store for initial retrieval")

    class Config:
        arbitrary_types_allowed = True

    # Required by BaseRetriever
    def _get_relevant_documents(self, query: str) -> List[LangchainDocument]:
        # Lấy top 30 docs từ vectorstore
        initial_docs = self.vectorstore.similarity_search(query, k=30)
        return initial_docs

    # Optional: để giữ code cũ
    def get_relevant_documents(self, query: str, num_docs=2):
        docs = self._get_relevant_documents(query)
        return docs[:num_docs]  # slice top N

/tmp/ipython-input-4281801628.py:1: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  class CustomRetriever(BaseRetriever, BaseModel):


In [45]:
def retrieve_context_per_question(query, retriever):
    docs = retriever.get_relevant_documents(query)
    context = "\n\n".join([doc.page_content for doc in docs])
    return context

# Wrap VectorStoreRetriever
chunks_query_retriever_custom = CustomRetriever(vectorstore=chunks_query_retriever.vectorstore)

# Test
test_query = "What is the main cause of climate change?"
context = retrieve_context_per_question(test_query, chunks_query_retriever_custom)
print(context[:500])  # show first 500 chars


These effects include: 
Rising Temperatures 
Global temperatures have risen by about 1.2 degrees Celsius (2.2 degrees Fahrenheit) since 
the late 19th century. This warming is not uniform, with some regions experiencing more 
significant increases than others. Heatwaves 
Heatwaves are becoming more frequent and severe, posing risks to human health, agriculture, 
and infrastructure. Cities are particularly vulnerable due to the "urban heat island" effect. Heatwaves can lead to heat-related illnes
