In [1]:
! pip install langchain_community langchainhub chromadb langchain langchain-classic langchain-chroma langchain-google-genai pypdf rank_bm25 

Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-1.1.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-4.2.0-py3-none-any.whl.metadata (2.7 kB)
Collecting pypdf
  Downloading pypdf-6.6.2-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain_community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain_community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none

In [3]:
import bs4
import os
import time
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.retrievers import EnsembleRetriever
import uuid
import time
from langchain_core.documents import Document

# Khởi tạo brief cho từng chunk

In [35]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash-lite", temperature=0)


In [36]:
def generate_chunk_context(document, chunk):

    chunk_process_prompt = """You are an AI assistant specializing in research paper analysis.
                            Your task is to provide brief, relevant context for a chunk of text
                            based on the following research paper.

                            Here is the research paper:
                            <paper>
                            {paper}
                            </paper>

                            Here is the chunk we want to situate within the whole document:
                            <chunk>
                            {chunk}
                            </chunk>

                            Provide a concise context (3-4 sentences max) for this chunk,
                            considering the following guidelines:

                            - Give a short succinct context to situate this chunk within the overall document
                            for the purposes of improving search retrieval of the chunk.
                            - Answer only with the succinct context and nothing else.
                            - Context should be mentioned like 'Focuses on ....'
                            do not mention 'this chunk or section focuses on...'

                            Context:
                        """

    prompt_template = ChatPromptTemplate.from_template(chunk_process_prompt)

    agentic_chunk_chain = (prompt_template
                                |
                               llm
                                |
                            StrOutputParser())

    context = agentic_chunk_chain.invoke({'paper': document, 'chunk': chunk})

    return context

In [43]:
def create_contextual_chunks(file_path, chunk_size=1000, chunk_overlap=200):

    print('Loading pages:', file_path)
    loader = PyPDFLoader(file_path)
    doc_pages = loader.load()
    print("doc_pages: ", doc_pages)

    print('Chunking pages:', file_path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
    doc_chunks = splitter.split_documents(doc_pages)
    print("doc_chunks: ", doc_chunks)

    print('Generating contextual chunks:', file_path)
    original_doc = '\n'.join([doc.page_content for doc in doc_chunks])

    contextual_chunks = []

    for i, chunk in enumerate(doc_chunks):
        chunk_content = chunk.page_content
        chunk_metadata = chunk.metadata
        chunk_metadata_upd = {
            'id': str(uuid.uuid4()),                # create unique id for the chunk
            'page': chunk_metadata['page'],         # page number of the chunk in the source doc
            'source': chunk_metadata['source'],     #
            'title': chunk_metadata['source'].split('/')[-1]    # source doc title
        }

        context = generate_chunk_context(original_doc, chunk_content)
        contextual_chunks.append(Document(page_content=context+'\n'+chunk_content,      # Prepend Chunk Context to the Chunk & add Metadata
                                          metadata=chunk_metadata_upd))
        print(f"  > Đã xong chunk {i+1}/{len(doc_chunks)}. Đang nghỉ 10s...")

        if i < len(doc_chunks) - 1:
            time.sleep(10)
    print('Finished processing:', file_path)
    print()
    return contextual_chunks

In [44]:
from glob import glob

pdf_files = glob('./data/*.pdf')
pdf_files

['./data/Chinh_Sach_Doi_Tra_Bao_Hanh.pdf',
 './data/Huong_Dan_Chon_Size_Va_Bao_Quan.pdf']

In [45]:
paper_docs = []

for fp in pdf_files:
    paper_docs.extend(create_contextual_chunks(file_path=fp))

Loading pages: ./data/Chinh_Sach_Doi_Tra_Bao_Hanh.pdf
doc_pages:  [Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2026-02-02T21:20:41+07:00', 'author': 'Nguyen Thi Lan Anh  - dd23', 'moddate': '2026-02-02T21:20:41+07:00', 'source': './data/Chinh_Sach_Doi_Tra_Bao_Hanh.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='CHÍNH SÁCH ĐỔI TRẢ VÀ BẢO HÀNH CỦA CỬA HÀNG SHATE SHOP \nNgày ban hành: 02/02/2026  \nÁp dụng cho: Tất cả các sản phẩm giày dép đư ợc bán t ại cửa hàng Shate Shop, bao \ngồm mua trực tiếp tại cửa hàng và mua hàng tr ực tuyến qua website hoặc các nền tảng \nliên kết.  \nMục đích: Shate Shop cam kết mang đến cho khách hàng những sản phẩm chất lượng \ncao và dịch vụ hậu mãi tốt nhất. Chính sách này nhằm bảo vệ quyền lợi của khách hàng, \nđồng thời quy định rõ ràng các điều kiện đổi trả và bảo hành để đảm bảo tính công bằng \nvà minh bạch. \nChính sách này tuân th ủ các quy 

In [46]:
len(paper_docs)

16

# Khởi tạo database (contextual + keyword)

In [47]:
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001")

In [48]:
chroma_db = Chroma.from_documents(documents=paper_docs,
                                  collection_name='my_context_db',
                                  embedding=gemini_embeddings,
                                  collection_metadata={"hnsw:space": "cosine"},
                                  persist_directory="./my_context_db")
chroma_db = Chroma(persist_directory="./my_context_db",
                   collection_name='my_context_db',
                   embedding_function=gemini_embeddings)

In [49]:
### Contextual search ###
similarity_retriever = chroma_db.as_retriever(search_type="similarity",
                                              search_kwargs={"k": 5})

In [50]:
### BM25 search ###
bm25_retriever = BM25Retriever.from_documents(documents=paper_docs,
                                              k=5)

BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7c127539d8e0>, k=5)

In [51]:
# Vector Similarity Search + BM25 Search
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, similarity_retriever],      
    weights=[0.5, 0.5]
)
ensemble_retriever

EnsembleRetriever(retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x7c127539d8e0>, k=5), VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7c1274080fb0>, search_kwargs={'k': 5})], weights=[0.5, 0.5])

# Reranker

In [53]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_classic.retrievers.document_compressors import CrossEncoderReranker
from langchain_classic.retrievers import ContextualCompressionRetriever

# download an open-source Re-Ranker Model - BAAI/bge-reranker-v2-m3
reranker = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
reranker_compressor = CrossEncoderReranker(model=reranker, top_n=5)

# Retriever 2 - Uses a Reranker model to rerank retrieval results from the previous retriever
final_retriever = ContextualCompressionRetriever(
    base_retriever=ensemble_retriever,              # Stage 1: Base Retriever: (Vector Similarity Search + BM25 Search) : Reciprocal Rank Fusion
    base_compressor=reranker_compressor             # Stage 2: Re-ranker
)
final_retriever

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/393 [00:00<?, ?it/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

ContextualCompressionRetriever(base_compressor=CrossEncoderReranker(model=HuggingFaceCrossEncoder(client=CrossEncoder(
  (model): XLMRobertaForSequenceClassification(
    (classifier): XLMRobertaClassificationHead(
      (dense): Linear(in_features=1024, out_features=1024, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (out_proj): Linear(in_features=1024, out_features=1, bias=True)
    )
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (position_embeddings): Embedding(8194, 1024, padding_idx=1)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
 

# Retrieval

In [52]:
rag_prompt = """You are an assistant who is an expert in question-answering tasks.
                Answer the following question using only the following pieces of retrieved context.
                If the answer is not in the context, do not make up answers, just say that you don't know.
                Keep the answer detailed and well formatted based on the information from the context.

                Question:
                {question}

                Context:
                {context}

                Answer:
            """

rag_prompt_template = ChatPromptTemplate.from_template(rag_prompt)

In [54]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [55]:
qa_rag_chain = (
    {
        "context": (final_retriever           # Revokes the Retriever using the user Query
                      |
                    format_docs),             # formats the document content into a structured string
        "question": RunnablePassthrough()     # to directly pass the user Query without any modifications
    }
      |
    rag_prompt_template
      |
     llm
)

In [None]:
from IPython.display import display, Markdown

query = "CEO của cửa hàng là ai?"
result = qa_rag_chain.invoke(query)
display(Markdown(result.content))