In [None]:
! pip install langchain_community tiktoken langchain_text_splitters faiss-cpu qdrant-client protonx dotenv langchain-google-genai pyvi rank_bm25 numpy sentence-transformers ipywidgets

# I. Setup

In [1]:
import os
import json
import sys
import time
import re
from typing import List

import tiktoken
from dotenv import load_dotenv
from langchain.retrievers import EnsembleRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from protonx import ProtonX
from pyvi import ViTokenizer
from tqdm import tqdm


from IPython.display import Markdown, display

load_dotenv()

True

# II. Load Documents

In [2]:
# Enhanced document loading with metadata preservation - TESTING VERSION (50 docs only)

def load_legal_docs_from_folders(root_dir: str, max_docs: int = 50) -> list[Document]:
    """
    Load legal documents stored as one folder per document.

    A folder is treated as a document when it contains both 'metadata.json'
    and 'content.txt'. The title from the JSON file is prepended to the text,
    and a fixed set of fields is copied into the Document metadata.

    Sub-directories are visited in sorted order so that the subset selected by
    `max_docs` and the assigned 'doc_id' values are the same on every run and
    on every filesystem.

    Args:
        root_dir: Directory to load documents from (searched recursively)
        max_docs: Maximum number of documents to load (for testing)

    Returns:
        The loaded documents, at most `max_docs` of them. Folders that fail to
        load are reported on stdout and skipped.
    """
    documents = []
    doc_count = 0

    print(f"\n[INFO] Starting enhanced loading from '{root_dir}'...")

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk lists entries in arbitrary filesystem order; sorting the
        # sub-directory list in place makes the top-down traversal deterministic.
        dirnames.sort()

        if doc_count >= max_docs:
            break

        if "metadata.json" in filenames and "content.txt" in filenames:
            metadata_path = os.path.join(dirpath, "metadata.json")
            content_path = os.path.join(dirpath, "content.txt")

            try:
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata_json = json.load(f)
                with open(content_path, 'r', encoding='utf-8') as f:
                    full_text = f.read()

                title = metadata_json.get('title', '')

                # Enhanced page content with title
                page_content = (
                    f"Ti√™u ƒë·ªÅ: {title}\n\n"
                    f"To√†n vƒÉn: {full_text}"
                )

                # "metadata" / "diagram" may be missing or explicitly null;
                # treat both cases as an empty mapping instead of failing.
                id_metadata = metadata_json.get("metadata") or {}
                diagram_metadata = id_metadata.get("diagram") or {}

                # Create final metadata with only the required fields
                final_metadata = {
                    '_id': id_metadata.get('_id', ''),
                    'so_hieu': diagram_metadata.get('so_hieu', ''),
                    'loai_van_ban': diagram_metadata.get('loai_van_ban', ''),
                    'linh_vuc_nganh': diagram_metadata.get('linh_vuc_nganh', ''),
                    'noi_ban_hanh': diagram_metadata.get('noi_ban_hanh', ''),
                    'nguoi_ky': diagram_metadata.get('nguoi_ky', ''),
                    'ngay_ban_hanh': diagram_metadata.get('ngay_ban_hanh', ''),
                    'ngay_hieu_luc': diagram_metadata.get('ngay_hieu_luc', ''),
                    'tinh_trang': diagram_metadata.get('tinh_trang', ''),
                    'title': title,
                    'source': content_path,
                    'doc_id': doc_count + 1  # 1-based load order, for testing
                }

                documents.append(Document(page_content=page_content, metadata=final_metadata))
                doc_count += 1

                # Dynamic progress indicator ('\r' rewrites the same console line)
                progress_message = f"  -> Progress: {doc_count}/{max_docs} documents loaded."
                sys.stdout.write('\r' + progress_message)
                sys.stdout.flush()

            except Exception as e:
                # Best-effort loading: report the broken folder and keep going.
                # Print on a new line to avoid being overwritten by progress.
                print(f"\n[ERROR] Failed to process document in {dirpath}: {e}")

    # Print a newline to move past the progress indicator line
    print()
    return documents

# --- SCRIPT EXECUTION ---

# TESTING CONFIGURATION - Load only a few documents
root_dir = '../data/raw_data_50/documents'
MAX_DOCS_FOR_TESTING = 50

print("=" * 70)
print("   STARTING DOCUMENT LOADING SCRIPT (TESTING MODE)")
print(f"[INFO] Source Directory: {root_dir}")
print(f"[INFO] Document Limit: {MAX_DOCS_FOR_TESTING}")
print("=" * 70)

# Start from an empty list so `docs` is always defined, whatever fails below.
docs = []

try:
    docs = load_legal_docs_from_folders(root_dir, max_docs=MAX_DOCS_FOR_TESTING)
    print(f"[SUCCESS] Enhanced loading complete. Found {len(docs)} structured documents.")
    if docs:
        print("\n[DEBUG] Sample metadata from the first document:")
        # Pretty print the JSON metadata for readability
        print(json.dumps(docs[0].metadata, indent=2, ensure_ascii=False))

except Exception as e:
    print(f"\n[ERROR] Enhanced loading process failed: {e}")

# The structured loader reports per-folder errors itself and simply returns an
# empty list when nothing matches, so "no documents" (not only an exception)
# is the signal to try the plain-text fallback.
if not docs:
    print("[INFO] Attempting fallback to simple directory loading...")

    try:
        # Fallback to simple loading with limit
        loader = DirectoryLoader(
            root_dir,
            glob='**/*.txt',
            loader_cls=TextLoader,
            show_progress=True # Langchain's built-in progress bar
        )
        all_docs = loader.load()
        docs = all_docs[:MAX_DOCS_FOR_TESTING]
        print(f'[SUCCESS] Simple loading complete. Loaded {len(docs)} documents (capped at {MAX_DOCS_FOR_TESTING}).')
    except Exception as fallback_e:
        print(f"[ERROR] Fallback loading also failed: {fallback_e}")
        docs = [] # Ensure docs is an empty list on total failure

print("\n" + "=" * 70)
print("   FINAL SUMMARY")
if docs:
    print(f"  -> Total documents loaded: {len(docs)}")
    print("  -> Status: Ready for next processing steps.")
    print("\nPreview of first document content:")
    print("-" * 40)
    print(f"{docs[0].page_content[:300]}...")
    print("-" * 40)
else:
    print("  -> No documents were loaded. Please check the source directory and error logs.")
print("=" * 70)

   STARTING DOCUMENT LOADING SCRIPT (TESTING MODE)
[INFO] Source Directory: ../data/raw_data_50/documents
[INFO] Document Limit: 50

[INFO] Starting enhanced loading from '../data/raw_data_50/documents'...
  -> Progress: 50/50 documents loaded.
[SUCCESS] Enhanced loading complete. Found 50 structured documents.

[DEBUG] Sample metadata from the first document:
{
  "_id": "66b9c0513ab9c4ae3d5eec2c",
  "so_hieu": "02/2024/NQ-HƒêND",
  "loai_van_ban": "Ngh·ªã quy·∫øt",
  "linh_vuc_nganh": "Gi√°o d·ª•c",
  "noi_ban_hanh": "Th√†nh ph·ªë H·∫£i Ph√≤ng",
  "nguoi_ky": "Ph·∫°m VƒÉn L·∫≠p",
  "ngay_ban_hanh": "19/07/2024",
  "ngay_hieu_luc": "01/08/2024",
  "tinh_trang": "C√≤n hi·ªáu l·ª±c",
  "title": "Ngh·ªã quy·∫øt 02/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph√≠ ƒë·ªëi v·ªõi c√°c c∆° s·ªü gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng c√¥ng l·∫≠p tr√™n ƒë·ªãa b√†n th√†nh ph·ªë H·∫£i Ph√≤ng t·ª´ nƒÉm h·ªçc 2024-2025",
  "source": "../data/raw_data_50/documents\\Ngh·ªã quy·∫øt 022024NQ-HƒêND

# III. Document Splitting (Parent-Child Architecture)

In [3]:
# Shared tiktoken encoder (cl100k_base) used for every token count in this notebook
tokenizer = tiktoken.get_encoding("cl100k_base")


def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the shared `tokenizer`.

    `disallowed_special=()` is passed through to `encode` unchanged.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [4]:
class IntelligentLegalChunker:
    """
    Token-aware chunking strategy for legal documents that:
    1. Packs paragraphs into chunks of at most `max_tokens` tokens
       (3500 by default, to stay under a 4096-token limit)
    2. Preserves paragraph boundaries where possible
    3. Splits oversized paragraphs on sentence boundaries, and oversized
       sentences on whitespace
    4. Repeats the previous chunk's last paragraph as overlap when it is
       short enough and still fits in the budget

    Token counts are taken per paragraph / sentence / word. The separators
    inserted when pieces are joined are not counted, so a chunk may exceed the
    budget by a few tokens. A single whitespace-free word that is longer than
    the budget is kept intact.
    """

    def __init__(self, max_tokens=3500, overlap_tokens=200):
        """
        Args:
            max_tokens: Token budget per chunk; must be positive.
            overlap_tokens: Largest paragraph (in tokens) that may be repeated
                at the start of the next chunk; must not be negative.

        Raises:
            ValueError: If either limit is out of range.
        """
        if max_tokens <= 0:
            raise ValueError("max_tokens must be a positive integer")
        if overlap_tokens < 0:
            raise ValueError("overlap_tokens must not be negative")
        self.max_tokens = max_tokens
        self.overlap_tokens = overlap_tokens
        # Module-level tiktoken encoder defined earlier in the notebook
        self.tokenizer = tokenizer

    def _count_tokens(self, text):
        """Number of tokens `text` encodes to."""
        return len(self.tokenizer.encode(text, disallowed_special=()))

    def _pack_words(self, words, max_tokens):
        """Greedily join `words` with spaces into pieces of at most `max_tokens` tokens."""
        pieces = []
        current_words = []
        current_tokens = 0

        for word in words:
            word_tokens = self._count_tokens(word)
            if current_words and current_tokens + word_tokens > max_tokens:
                pieces.append(' '.join(current_words))
                current_words = [word]
                current_tokens = word_tokens
            else:
                current_words.append(word)
                current_tokens += word_tokens

        if current_words:
            pieces.append(' '.join(current_words))
        return pieces

    def split_paragraph_intelligently(self, paragraph, max_tokens=None):
        """
        Split a single paragraph that is too large.

        Sentences (text ending in '.', '!' or '?' followed by whitespace) are
        packed greedily. A sentence that alone exceeds the budget is split on
        whitespace instead of being emitted as an oversized chunk. A paragraph
        without sentence breaks is split on whitespace directly.

        Args:
            paragraph: Text to split.
            max_tokens: Budget per piece; defaults to `self.max_tokens`.

        Returns:
            List of text pieces in original order.
        """
        if max_tokens is None:
            max_tokens = self.max_tokens

        # Try to split on sentences first
        sentences = re.split(r'(?<=[.!?])\s+', paragraph)
        if len(sentences) <= 1:
            # If no sentence breaks, split on words
            return self._pack_words(paragraph.split(), max_tokens)

        chunks = []
        current_chunk = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self._count_tokens(sentence)

            if sentence_tokens > max_tokens:
                # Flush what has been collected, then break the long sentence
                # on whitespace so that no piece exceeds the budget.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                chunks.extend(self._pack_words(sentence.split(), max_tokens))
                continue

            if current_chunk and current_tokens + sentence_tokens > max_tokens:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_tokens = sentence_tokens
            else:
                current_chunk.append(sentence)
                current_tokens += sentence_tokens

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    def chunk_document(self, document):
        """
        Chunk a document on paragraph boundaries (blank lines).

        Args:
            document: Object exposing `page_content` (str) and `metadata` (dict).

        Returns:
            List of Document objects. Each carries a copy of the input
            metadata plus 'chunk_id' (0-based) and 'total_chunks'.
        """
        base_metadata = document.metadata.copy()

        chunks = []
        current_chunk = []
        current_tokens = 0

        # Split by double newlines (paragraphs)
        for paragraph in document.page_content.split('\n\n'):
            paragraph = paragraph.strip()
            if not paragraph:
                continue

            paragraph_tokens = self._count_tokens(paragraph)

            # If single paragraph is too large, split it
            if paragraph_tokens > self.max_tokens:
                if current_chunk:
                    chunks.append('\n\n'.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                chunks.extend(self.split_paragraph_intelligently(paragraph))
                continue

            # Check if adding this paragraph would exceed limit
            if current_chunk and current_tokens + paragraph_tokens > self.max_tokens:
                chunks.append('\n\n'.join(current_chunk))

                # Carry the previous paragraph over as context only when it is
                # short enough AND the new chunk still fits in the budget.
                overlap_text = current_chunk[-1]
                overlap_count = self._count_tokens(overlap_text)
                if (overlap_count <= self.overlap_tokens
                        and overlap_count + paragraph_tokens <= self.max_tokens):
                    current_chunk = [overlap_text, paragraph]
                    current_tokens = overlap_count + paragraph_tokens
                else:
                    current_chunk = [paragraph]
                    current_tokens = paragraph_tokens
            else:
                current_chunk.append(paragraph)
                current_tokens += paragraph_tokens

        # Add final chunk
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        # Create Document objects for each chunk
        total_chunks = len(chunks)
        chunk_docs = []
        for i, chunk_content in enumerate(chunks):
            chunk_metadata = base_metadata.copy()
            chunk_metadata['chunk_id'] = i
            chunk_metadata['total_chunks'] = total_chunks
            chunk_docs.append(Document(page_content=chunk_content, metadata=chunk_metadata))

        return chunk_docs

print("Intelligent Legal Chunker initialized successfully!")
print("Features:")
print("- Respects 3500 token limit (safe for 4096 API limit)")
print("- Preserves paragraph boundaries")
print("- Handles oversized paragraphs intelligently")
print("- Maintains context with smart overlap")


Intelligent Legal Chunker initialized successfully!
Features:
- Respects 3500 token limit (safe for 4096 API limit)
- Preserves paragraph boundaries
- Handles oversized paragraphs intelligently
- Maintains context with smart overlap


In [5]:
# Enhanced Parent-Child Architecture with Intelligent Chunking
print("Setting up enhanced parent-child document architecture with intelligent chunking...")

# Token budgets: large parent chunks are returned as LLM context, small child
# chunks are what gets embedded and searched.
PARENT_MAX_TOKENS = 3500
PARENT_OVERLAP_TOKENS = 200
CHILD_MAX_TOKENS = 800
CHILD_OVERLAP_TOKENS = 100
SAFE_TOKEN_LIMIT = 3500  # threshold used by the size report below

# Initialize the chunkers once; they are stateless between documents
intelligent_chunker = IntelligentLegalChunker(
    max_tokens=PARENT_MAX_TOKENS, overlap_tokens=PARENT_OVERLAP_TOKENS
)
child_chunker = IntelligentLegalChunker(
    max_tokens=CHILD_MAX_TOKENS, overlap_tokens=CHILD_OVERLAP_TOKENS
)

# Create parent chunks and store them
docstore = InMemoryStore()
child_chunks = []

print(f"Processing {len(docs)} documents with intelligent chunking...")

for doc_idx, doc in enumerate(docs):
    print(f"Processing document {doc_idx + 1}/{len(docs)}: {doc.metadata.get('title', 'Unknown')[:50]}...")

    # Use intelligent chunking for parent chunks
    parent_chunks = intelligent_chunker.chunk_document(doc)

    for i, parent_chunk in enumerate(parent_chunks):
        # Prefix with the document index: titles are not guaranteed to be
        # unique, and a title-only key would let one document overwrite
        # another document's parent chunk in the docstore.
        parent_id = f"{doc_idx}-{doc.metadata.get('title', 'doc')}-{i}"
        docstore.mset([(parent_id, parent_chunk)])

        # Child chunks inherit a copy of the parent's (and therefore the
        # document's) metadata from chunk_document; only the link is added.
        for sub_chunk in child_chunker.chunk_document(parent_chunk):
            sub_chunk.metadata['parent_id'] = parent_id
            child_chunks.append(sub_chunk)

print(f"[INFO] Created {len(child_chunks)} child chunks from {len(docs)} documents.")
print(f"[INFO] Parent chunks stored in docstore for context retrieval.")

# Verify chunk sizes
chunk_sizes = [tiktoken_len(chunk.page_content) for chunk in child_chunks]

if not chunk_sizes:
    # min()/max() and the average would fail on an empty list
    print("[WARNING] No child chunks were created. Check that documents were loaded.")
else:
    oversized_count = sum(1 for size in chunk_sizes if size > SAFE_TOKEN_LIMIT)
    print(f"[INFO] Chunk size statistics:")
    print(f"   - Min tokens: {min(chunk_sizes)}")
    print(f"   - Max tokens: {max(chunk_sizes)}")
    print(f"   - Avg tokens: {sum(chunk_sizes)//len(chunk_sizes)}")
    print(f"   - Chunks over {SAFE_TOKEN_LIMIT} tokens: {oversized_count}")

    if oversized_count:
        print(f"[WARNING] Some chunks still exceed {SAFE_TOKEN_LIMIT} tokens.")
        print("   This might cause API errors. Consider reducing max_tokens further.")
    else:
        print("[SUCCESS] All chunks are within safe token limits.")

Setting up enhanced parent-child document architecture with intelligent chunking...
Processing 50 documents with intelligent chunking...
Processing document 1/50: Ngh·ªã quy·∫øt 02/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph...
Processing document 2/50: Ngh·ªã quy·∫øt 05/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph...
Processing document 3/50: Ngh·ªã quy·∫øt 06/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph...
Processing document 4/50: Ngh·ªã quy·∫øt 06/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph...
Processing document 5/50: Ngh·ªã quy·∫øt 07/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c chi b·∫£o ƒë·∫£...
Processing document 6/50: Ngh·ªã quy·∫øt 07/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph...
Processing document 7/50: Ngh·ªã quy·∫øt 10/2024/NQ-HƒêND quy ƒë·ªãnh m·ªôt s·ªë ch√≠nh s...
Processing document 8/50: Ngh·ªã quy·∫øt 10/2024/NQ-HƒêND quy ƒë·ªãnh m·ª©c thu h·ªçc ph...
Processing document 9/50: Ngh·ªã quy·∫øt 11/2024/NQ-HƒêND quy ƒë·ªãnh ch√≠nh s√°ch h·ªó ...
Processing document 10/50: Ngh·

# IV. Embedding Generation

In [6]:
# Enhanced ProtonXEmbeddings with Token Limit Handling
class EnhancedProtonXEmbeddings(Embeddings):
    """
    ProtonX embeddings wrapper that keeps requests under a token budget.

    Features:
    - Automatic text truncation for oversized inputs
    - Dynamic, token-aware batching to avoid API token limits
    - Internal progress bar for user feedback
    - Retries with a fixed delay; after the last failed attempt the batch is
      filled with zero vectors so the output stays aligned with the input

    Zero vectors carry no semantic information. Check the error log before
    relying on an index built from a run that reported failures.
    """
    def __init__(self, max_retries: int = 3, delay: int = 5, batch_token_limit: int = 3500,
                 fallback_dim: int = 768):
        """
        Initializes the ProtonX client and sets processing parameters.

        Args:
            max_retries: Attempts per batch (values below 1 are treated as 1).
            delay: Seconds to sleep between attempts.
            batch_token_limit: Token budget per request and per single text.
            fallback_dim: Length of the zero vectors used before any request
                has succeeded. 768 is the dimension reported by this
                notebook's embedding run. Once a request succeeds, the
                observed dimension is used instead.
        """
        self.client = ProtonX()
        self.max_retries = max_retries
        self.delay = delay
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.batch_token_limit = batch_token_limit
        self.embedding_dim = fallback_dim
        print(f"Enhanced ProtonXEmbeddings initialized with a token limit of {self.batch_token_limit} per batch.")

    def _get_token_count(self, text: str) -> int:
        """Calculates the number of tokens in a string."""
        return len(self.tokenizer.encode(text, disallowed_special=()))

    def _truncate_text_if_needed(self, text: str, max_tokens: int) -> str:
        """Return `text` unchanged if it fits in `max_tokens`, else its first `max_tokens` tokens decoded."""
        tokens = self.tokenizer.encode(text, disallowed_special=())
        if len(tokens) <= max_tokens:
            return text

        print(f"[WARNING] A single document was truncated as it exceeds the batch token limit.")
        return self.tokenizer.decode(tokens[:max_tokens])

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generates embeddings for a list of documents using a built-in progress bar and dynamic batching.

        Texts are accumulated until the next one would push the batch over
        `batch_token_limit`; the batch is then sent and a new one is started.

        Returns:
            One vector per input text, in input order.
        """
        all_embeddings = []
        current_batch = []
        current_batch_tokens = 0

        # Wrap the texts iterator with tqdm for a progress bar
        for text in tqdm(texts, desc="Generating Embeddings", unit="chunk"):
            truncated_text = self._truncate_text_if_needed(text, self.batch_token_limit)
            text_tokens = self._get_token_count(truncated_text)

            if current_batch and (current_batch_tokens + text_tokens > self.batch_token_limit):
                all_embeddings.extend(self._process_batch_with_retries(current_batch))
                current_batch = [truncated_text]
                current_batch_tokens = text_tokens
            else:
                current_batch.append(truncated_text)
                current_batch_tokens += text_tokens

        if current_batch:
            all_embeddings.extend(self._process_batch_with_retries(current_batch))

        return all_embeddings

    def _process_batch_with_retries(self, batch: List[str]) -> List[List[float]]:
        """
        Handles the API call with retries for a given batch.

        Returns:
            One vector per text in `batch`. If every attempt fails, zero
            vectors of length `self.embedding_dim` are returned, so the
            fallback matches the dimension of vectors already produced.
        """
        # Always try at least once so the result is never an empty list
        attempts = max(1, self.max_retries)

        for attempt in range(attempts):
            try:
                response = self.client.embeddings.create(input=batch)
                if not response or not response.get("data"):
                    raise ValueError("Invalid response from ProtonX API")

                vectors = [item.get("embedding") for item in response.get("data")]
                # A short or partially empty response would misalign texts and vectors
                if len(vectors) != len(batch) or any(not vector for vector in vectors):
                    raise ValueError("ProtonX API returned an incomplete set of embeddings")

                # Remember the real dimension for any later zero-vector fallback
                self.embedding_dim = len(vectors[0])
                return vectors
            except Exception as e:
                print(f"\n[ERROR] API call failed on attempt {attempt + 1}/{attempts}: {e}")
                if attempt + 1 < attempts:
                    time.sleep(self.delay)

        print(f"[ERROR] All retries failed for batch. Returning zero vectors.")
        return [[0.0] * self.embedding_dim for _ in batch]

    def embed_query(self, text: str) -> List[float]:
        """Generates an embedding for a single query text."""
        truncated_text = self._truncate_text_if_needed(text, self.batch_token_limit)
        return self._process_batch_with_retries([truncated_text])[0]

print("Enhanced ProtonXEmbeddings class updated with robust dynamic batching and internal progress bar.")

Enhanced ProtonXEmbeddings class updated with robust dynamic batching and internal progress bar.


In [7]:
# Step 1: build the embedding client (its constructor prints the configured limit)
embeddings = EnhancedProtonXEmbeddings()
print("Enhanced ProtonX embeddings object created and ready to use.")

# Step 2: split the child chunks into parallel lists of text and metadata
texts = list(map(lambda chunk: chunk.page_content, child_chunks))
metadatas = list(map(lambda chunk: chunk.metadata, child_chunks))

# Opening banner
print("\n" + 70 * "=")
print("   STARTING EMBEDDING GENERATION PROCESS")
print(70 * "=")
print(f"[INFO] Number of text chunks to process: {len(texts)}")
print(70 * "-")

# Step 3: a single call embeds everything.
# Batching and the progress bar are handled inside the embeddings class.
start_time = time.time()
text_embeddings = embeddings.embed_documents(texts)
end_time = time.time()
total_time = end_time - start_time

# Closing report
print("\n" + 70 * "=")
print("   EMBEDDING PROCESS COMPLETE: SUMMARY REPORT")
print(70 * "=")

if not text_embeddings or len(text_embeddings) != len(texts):
    print("[ERROR] Embedding generation failed or was incomplete. Please check the error logs above.")
else:
    print(f"[SUCCESS] Process finished in {total_time:.2f} seconds ({total_time/60:.1f} minutes).")

    print(
        "\n--- Performance Metrics ---",
        f"  -> {'Total chunks processed':<25}: {len(text_embeddings)}",
        f"  -> {'Throughput':<25}: {len(text_embeddings)/total_time:.1f} chunks/sec",
        f"  -> {'Avg. time per chunk':<25}: {(total_time/len(text_embeddings))*1000:.2f} ms",
        sep="\n",
    )

    print(
        "\n--- Embedding Details ---",
        f"  -> {'Embedding dimension':<25}: {len(text_embeddings[0])}",
        sep="\n",
    )

print(70 * "=")

Enhanced ProtonXEmbeddings initialized with a token limit of 3500 per batch.
Enhanced ProtonX embeddings object created and ready to use.

   STARTING EMBEDDING GENERATION PROCESS
[INFO] Number of text chunks to process: 546
----------------------------------------------------------------------


Generating Embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 546/546 [19:00<00:00,  2.09s/chunk]



   EMBEDDING PROCESS COMPLETE: SUMMARY REPORT
[SUCCESS] Process finished in 1146.44 seconds (19.1 minutes).

--- Performance Metrics ---
  -> Total chunks processed   : 546
  -> Throughput               : 0.5 chunks/sec
  -> Avg. time per chunk      : 2099.71 ms

--- Embedding Details ---
  -> Embedding dimension      : 768


# V. Vector Store and Retriever Setup

In [8]:
# Build the FAISS index from the vectors computed in the previous step
print()
print("Creating FAISS vector store...")

# from_embeddings takes (text, vector) pairs, so the texts are not re-embedded here
text_embedding_pairs = [(text, vector) for text, vector in zip(texts, text_embeddings)]
vectorstore_local = FAISS.from_embeddings(
    text_embedding_pairs,
    embeddings,
    metadatas=metadatas,
)
print("FAISS vector store created successfully.")

# Setup BM25 retriever with Vietnamese tokenization
print("\nSetting up BM25 retriever with Vietnamese tokenization...")

def segment_vi_for_bm25(text):
    """Word-segment Vietnamese `text` with ViTokenizer and return the list of tokens.

    Used as the BM25 preprocessing function, so the indexed chunks and the
    incoming queries are segmented the same way and their tokens can match.
    """
    return ViTokenizer.tokenize(text).split()

def tokenize_vi_for_bm25(doc):
    """Return a word-segmented copy of `doc` for BM25; the input document is left untouched."""
    return Document(
        page_content=ViTokenizer.tokenize(doc.page_content),
        metadata=dict(doc.metadata),
    )

# Segmented copies, kept for inspection and for code that expects this name.
# The original child chunks are no longer modified, so re-running this cell is safe.
child_chunks_tokenized = [tokenize_vi_for_bm25(chunk) for chunk in child_chunks]

# Index the original chunks and let the retriever segment both the documents
# and each query through the same preprocessing function.
bm25_retriever = BM25Retriever.from_documents(
    child_chunks,
    preprocess_func=segment_vi_for_bm25,
)
bm25_retriever.k = 15  # Retrieve top 15 for ensemble
print("BM25 retriever with Vietnamese tokenization created successfully.")


Creating FAISS vector store...
FAISS vector store created successfully.

Setting up BM25 retriever with Vietnamese tokenization...
BM25 retriever with Vietnamese tokenization created successfully.


In [9]:
# Hybrid retrieval: combine the lexical (BM25) and dense (FAISS) retrievers
print()
print("Setting up ensemble retrieval...")

# Dense retriever over the FAISS index, returning the 15 closest chunks
faiss_retriever = vectorstore_local.as_retriever(search_kwargs=dict(k=15))

# Weights follow the order of `retrievers`: 0.6 for BM25, 0.4 for FAISS
# (BM25 is favored for Vietnamese lexical matching)
ensemble_retriever = EnsembleRetriever(
    weights=[0.6, 0.4],
    retrievers=[bm25_retriever, faiss_retriever],
)

print("Ensemble retriever (BM25 + FAISS) created successfully.")

# Parent hydration step used by the retrieval chain
def _get_parent_docs(input_dict: dict) -> list[Document]:
    """Map retrieved child chunks to their parent documents.

    Reads `input_dict["child_docs"]` and `input_dict["docstore"]`. Parent ids
    are collected in first-seen order without duplicates, and children without
    a "parent_id" are skipped. The ids are fetched with a single `mget` call
    and entries that come back as None are dropped.
    """
    retrieved_children = input_dict["child_docs"]
    parent_store = input_dict["docstore"]

    ordered_ids = []
    for child in retrieved_children:
        child_meta = child.metadata
        if "parent_id" not in child_meta:
            continue
        candidate = child_meta["parent_id"]
        if candidate in ordered_ids:
            continue
        ordered_ids.append(candidate)

    fetched = parent_store.mget(ordered_ids)
    return list(filter(lambda parent: parent is not None, fetched))

# Retrieval chain: query -> child chunks (ensemble) -> attach docstore -> parent documents
parent_retriever_chain = (
    {"child_docs": ensemble_retriever}
    | RunnablePassthrough.assign(docstore=lambda _inputs: docstore)
    | _get_parent_docs  # plain callables are wrapped as runnables when piped
)

print("Parent document retrieval chain created successfully.")


Setting up ensemble retrieval...
Ensemble retriever (BM25 + FAISS) created successfully.
Parent document retrieval chain created successfully.


# VI. RAG Chain Setup and Testing

In [10]:
# Chat model used to generate answers (Gemini 2.5 Pro, constructed with temperature=0)
llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0)

# Prompt template. The instructions are written in English and require:
#   - an answer in Vietnamese, based only on the supplied context
#   - markdown structure (lists, bold, italics)
#   - source citations (title and document number) after the answer
#   - a fixed Vietnamese refusal sentence when the context is insufficient
# Template variables: {context} and {question}.
template = """Answer the question based ONLY on the following context.
Your answer must be in Vietnamese.
Your answer should be well-structured and easy to read.
- Use bullet points or numbered lists for multiple items or steps.
- Use **bold** for key terms, names, or important numbers and concepts.
- Use *italics* for emphasis or to highlight specific terms.

**After providing the answer, you MUST cite your sources accurately using the metadata from the context.**
For each source used, provide its title and document number (S·ªë hi·ªáu) if available.

Context:
{context}

Question: {question}

If the context does not provide enough information, say "T√¥i kh√¥ng t√¨m th·∫•y th√¥ng tin trong t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p." and do not provide an answer."""

# Wrap the raw template string as a chat prompt
prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    """Render retrieved documents into one context string for the prompt.

    Each document is wrapped in a header/footer pair. The header carries the
    title and, when the 'so_hieu' metadata field is non-empty, the document
    number, so the model can cite sources from the context as the prompt
    requires.

    Args:
        docs: Iterable of objects exposing `page_content` and `metadata`.

    Returns:
        The formatted blocks joined by blank lines ("" for no documents).
    """
    formatted_docs = []
    for i, doc in enumerate(docs):
        # Fall back to a positional label when the document has no title
        title = doc.metadata.get('title', f'Document {i+1}')
        header = f"--- Document: {title}"

        document_number = doc.metadata.get('so_hieu', '')
        if document_number:
            header += f" | Document number: {document_number}"

        formatted_docs.append(f"{header} ---\n{doc.page_content}\n--- End Document ---")
    return "\n\n".join(formatted_docs)

# Full RAG pipeline: the query is passed through as the question and, in
# parallel, used to retrieve parent documents that are rendered as the context.
rag_chain = (
    {
        "context": parent_retriever_chain | RunnableLambda(format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("Enhanced RAG chain with hybrid retrieval and parent documents created successfully.")

Enhanced RAG chain with hybrid retrieval and parent documents created successfully.


In [11]:
query = "th·ªùi gian h·ªçc t·∫≠p nƒÉm 2024-2025 t·ªânh b·∫Øc k·∫°n l√† t·ª´ khi n√†o?"
# Echo the query, then a separator line
print(f"Query: {query}", end="\n\n")
print(80 * "=")

# Run the full RAG chain and render the answer as markdown
response = rag_chain.invoke(query)
display(Markdown("**Response:** " + f"{response}"))

Query: th·ªùi gian h·ªçc t·∫≠p nƒÉm 2024-2025 t·ªânh b·∫Øc k·∫°n l√† t·ª´ khi n√†o?



**Response:** D·ª±a tr√™n c√°c t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p, k·∫ø ho·∫°ch th·ªùi gian h·ªçc t·∫≠p nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n t·∫°i t·ªânh B·∫Øc K·∫°n ƒë∆∞·ª£c quy ƒë·ªãnh nh∆∞ sau:

*   **Ng√†y t·ª±u tr∆∞·ªùng:**
    *   C√°c c·∫•p h·ªçc t·ª±u tr∆∞·ªùng s·ªõm nh·∫•t v√†o **ng√†y 29 th√°ng 8 nƒÉm 2024** (th·ª© NƒÉm).
    *   Ri√™ng ƒë·ªëi v·ªõi l·ªõp 1, t·ª±u tr∆∞·ªùng s·ªõm nh·∫•t v√†o **ng√†y 22 th√°ng 8 nƒÉm 2024** (th·ª© NƒÉm).

*   **Ng√†y khai gi·∫£ng:**
    *   T·ªï ch·ª©c khai gi·∫£ng nƒÉm h·ªçc m·ªõi v√†o **ng√†y 05 th√°ng 9 nƒÉm 2024** (th·ª© NƒÉm).

*   **K·∫ø ho·∫°ch h·ªçc k·ª≥ v√† k·∫øt th√∫c nƒÉm h·ªçc:**
    *   NƒÉm h·ªçc c√≥ **35 tu·∫ßn th·ª±c h·ªçc**, trong ƒë√≥ h·ªçc k·ª≥ I c√≥ 18 tu·∫ßn v√† h·ªçc k·ª≥ II c√≥ 17 tu·∫ßn.
    *   K·∫øt th√∫c h·ªçc k·ª≥ I tr∆∞·ªõc **ng√†y 18 th√°ng 01 nƒÉm 2025**.
    *   Ho√†n th√†nh ch∆∞∆°ng tr√¨nh v√† k·∫øt th√∫c nƒÉm h·ªçc tr∆∞·ªõc **ng√†y 31 th√°ng 5 nƒÉm 2025**.

*   **C√°c m·ªëc th·ªùi gian quan tr·ªçng kh√°c:**
    *   X√©t c√¥ng nh·∫≠n ho√†n th√†nh ch∆∞∆°ng tr√¨nh ti·ªÉu h·ªçc v√† t·ªët nghi·ªáp trung h·ªçc c∆° s·ªü tr∆∞·ªõc **ng√†y 30 th√°ng 6 nƒÉm 2025**.
    *   Ho√†n th√†nh tuy·ªÉn sinh c√°c l·ªõp ƒë·∫ßu c·∫•p tr∆∞·ªõc **ng√†y 31 th√°ng 7 nƒÉm 2025**.
    *   Thi t·ªët nghi·ªáp trung h·ªçc ph·ªï th√¥ng nƒÉm 2025 d·ª± ki·∫øn di·ªÖn ra v√†o **ng√†y 26 v√† 27 th√°ng 6 nƒÉm 2025**.

***

**Ngu·ªìn:**
*   Quy·∫øt ƒë·ªãnh 1370/Qƒê-UBND nƒÉm 2024 v·ªÅ K·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n t·ªânh B·∫Øc K·∫°n (S·ªë hi·ªáu: 1370/Qƒê-UBND).

In [12]:
query = "K·ª≥ thi ch·ªçn h·ªçc sinh gi·ªèi c·∫•p t·ªânh l·ªõp 12 ƒë∆∞·ª£c t·ªï ch·ª©c v√†o ng√†y n√†o?"
# Echo the query, then a separator line
print(f"Query: {query}", end="\n\n")
print(80 * "=")

# Run the full RAG chain and render the answer as markdown
response = rag_chain.invoke(query)
display(Markdown("**Response:** " + f"{response}"))

Query: K·ª≥ thi ch·ªçn h·ªçc sinh gi·ªèi c·∫•p t·ªânh l·ªõp 12 ƒë∆∞·ª£c t·ªï ch·ª©c v√†o ng√†y n√†o?



**Response:** D·ª±a tr√™n c√°c t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p, th·ªùi gian t·ªï ch·ª©c k·ª≥ thi ch·ªçn h·ªçc sinh gi·ªèi c·∫•p t·ªânh l·ªõp 12 nƒÉm h·ªçc 2024-2025 kh√°c nhau t√πy theo t·ª´ng ƒë·ªãa ph∆∞∆°ng. C·ª• th·ªÉ nh∆∞ sau:

*   **T·ªânh ƒê·∫Øk N√¥ng**: K·ª≥ thi h·ªçc sinh gi·ªèi l·ªõp 12 trung h·ªçc ph·ªï th√¥ng c·∫•p t·ªânh ƒë∆∞·ª£c t·ªï ch·ª©c t·ª´ ng√†y **25 th√°ng 3 nƒÉm 2025** ƒë·∫øn ng√†y **26 th√°ng 3 nƒÉm 2025**.
*   **T·ªânh B√¨nh ƒê·ªãnh**: K·ª≥ thi ch·ªçn h·ªçc sinh gi·ªèi c·∫•p t·ªânh l·ªõp 12 ƒë∆∞·ª£c t·ªï ch·ª©c v√†o ng√†y **22/10/2024**.
*   **T·ªânh L√¢m ƒê·ªìng**: Ho√†n th√†nh t·ªï ch·ª©c thi ch·ªçn h·ªçc sinh gi·ªèi c·∫•p t·ªânh l·ªõp 12 tr∆∞·ªõc ng√†y **31/01/2025**.
*   **T·ªânh Y√™n B√°i**: Thi ch·ªçn h·ªçc sinh gi·ªèi trung h·ªçc ph·ªï th√¥ng c·∫•p t·ªânh ph·∫£i ho√†n th√†nh tr∆∞·ªõc ng√†y **31 th√°ng 3 nƒÉm 2025**.
*   **T·ªânh B·∫Øc K·∫°n**: Thi ch·ªçn h·ªçc sinh gi·ªèi vƒÉn h√≥a c·∫•p t·ªânh trung h·ªçc ph·ªï th√¥ng ph·∫£i ho√†n th√†nh tr∆∞·ªõc ng√†y **28 th√°ng 4 nƒÉm 2025**.
*   **T·ªânh S√≥c TrƒÉng**: Th·ªùi gian thi s·∫Ω theo K·∫ø ho·∫°ch c·ªßa S·ªü Gi√°o d·ª•c v√† ƒê√†o t·∫°o.
*   **T·ªânh Qu·∫£ng Ng√£i**: Th·ªùi gian thi s·∫Ω do S·ªü Gi√°o d·ª•c v√† ƒê√†o t·∫°o h∆∞·ªõng d·∫´n c·ª• th·ªÉ.

***

### Ngu·ªìn t√†i li·ªáu:
1.  Quy·∫øt ƒë·ªãnh 944/Qƒê-UBND nƒÉm 2024 v·ªÅ K·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n do t·ªânh ƒê·∫Øk N√¥ng ban h√†nh (S·ªë hi·ªáu: 944/Qƒê-UBND).
2.  Quy·∫øt ƒë·ªãnh 2834/Qƒê-UBND nƒÉm 2024 v·ªÅ K·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n tr√™n ƒë·ªãa b√†n t·ªânh B√¨nh ƒê·ªãnh (S·ªë hi·ªáu: 2834/Qƒê-UBND).
3.  Quy·∫øt ƒë·ªãnh 1287/Qƒê-UBND nƒÉm 2024 v·ªÅ K·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n tr√™n ƒë·ªãa b√†n t·ªânh L√¢m ƒê·ªìng (S·ªë hi·ªáu: 1287/Qƒê-UBND).
4.  Quy·∫øt ƒë·ªãnh 1627/Qƒê-UBND nƒÉm 2024 v·ªÅ Khung k·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n tr√™n ƒë·ªãa b√†n t·ªânh Y√™n B√°i (S·ªë hi·ªáu: 1627/Qƒê-UBND).
5.  Quy·∫øt ƒë·ªãnh 1370/Qƒê-UBND nƒÉm 2024 v·ªÅ K·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n t·ªânh B·∫Øc K·∫°n (S·ªë hi·ªáu: 1370/Qƒê-UBND).
6.  Quy·∫øt ƒë·ªãnh 1828/Qƒê-UBND nƒÉm 2024 v·ªÅ K·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n tr√™n ƒë·ªãa b√†n t·ªânh S√≥c TrƒÉng (S·ªë hi·ªáu: 1828/Qƒê-UBND).
7.  Quy·∫øt ƒë·ªãnh 994/Qƒê-UBND nƒÉm 2024 v·ªÅ Khung k·∫ø ho·∫°ch th·ªùi gian nƒÉm h·ªçc 2024-2025 ƒë·ªëi v·ªõi gi√°o d·ª•c m·∫ßm non, gi√°o d·ª•c ph·ªï th√¥ng v√† gi√°o d·ª•c th∆∞·ªùng xuy√™n t·ªânh Qu·∫£ng Ng√£i (S·ªë hi·ªáu: 994/Qƒê-UBND).

In [13]:
query = "D·ª±a tr√™n h·ªì s∆° do C√¥ng ty TNHH OTES CORPORATION cung c·∫•p, h√£y m√¥ t·∫£ c√°c ƒë·∫∑c t√≠nh k·ªπ thu·∫≠t ch√≠nh c·ªßa s·∫£n ph·∫©m TamSoil Polynite ECO ·ªü tr·∫°ng th√°i dung d·ªãch sau khi pha, v√† n√™u r√µ cƒÉn c·ª© ph√°p l√Ω cao nh·∫•t (Lu·∫≠t) m√† C·ª•c H·∫£i quan ƒë√£ d·ª±a v√†o ƒë·ªÉ ban h√†nh th√¥ng b√°o m√£ s·ªë cho s·∫£n ph·∫©m n√†y."
# Echo the query, then a separator line
print(f"Query: {query}", end="\n\n")
print(80 * "=")

# Run the full RAG chain; show the answer as plain text first
response = rag_chain.invoke(query)
print(f"Response: {response}", end="\n\n")

# ...and again rendered as markdown
display(Markdown("**Response:** " + f"{response}"))

Query: D·ª±a tr√™n h·ªì s∆° do C√¥ng ty TNHH OTES CORPORATION cung c·∫•p, h√£y m√¥ t·∫£ c√°c ƒë·∫∑c t√≠nh k·ªπ thu·∫≠t ch√≠nh c·ªßa s·∫£n ph·∫©m TamSoil Polynite ECO ·ªü tr·∫°ng th√°i dung d·ªãch sau khi pha, v√† n√™u r√µ cƒÉn c·ª© ph√°p l√Ω cao nh·∫•t (Lu·∫≠t) m√† C·ª•c H·∫£i quan ƒë√£ d·ª±a v√†o ƒë·ªÉ ban h√†nh th√¥ng b√°o m√£ s·ªë cho s·∫£n ph·∫©m n√†y.

Response: T√¥i kh√¥ng t√¨m th·∫•y th√¥ng tin trong t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p.



**Response:** T√¥i kh√¥ng t√¨m th·∫•y th√¥ng tin trong t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p.

# VII. Performance Comparison and Analysis

## Enhanced Features Summary

### ✅ **What's New in This Enhanced Version:**

1. **🔍 Hybrid Retrieval**: BM25 (lexical) + FAISS (semantic) ensemble
2. **🇻🇳 Vietnamese Tokenization**: Better BM25 matching for Vietnamese text
3. **📚 Parent-Child Architecture**: Better context preservation
4. **📄 Enhanced Data Loading**: Preserves structured metadata
5. **🎯 Improved Prompting**: Vietnamese instructions with citation requirements
6. **⚡ ProtonX Embeddings**: High-quality Vietnamese embeddings (free)

### **Architecture Flow:**
```
Query → [BM25 + FAISS] → Ensemble → Parent-Child Hydration → Enhanced RAG → Response
```

### **Key Improvements Over Original:**
- **Better Recall**: Hybrid retrieval catches both exact matches and semantic similarity
- **Better Context**: Parent-child architecture provides more complete context
- **Vietnamese-Optimized**: Tokenization and specialized embeddings
- **Production-Ready**: Robust error handling and metadata preservation
- **Citation Support**: Structured metadata for accurate source attribution
