In [None]:
! pip install langchain_community tiktoken langchain_text_splitters faiss-cpu qdrant-client protonx dotenv langchain-google-genai pyvi rank_bm25

# I. Load documents (Enhanced with metadata preservation)

In [1]:
# Enhanced document loading with metadata preservation - TESTING VERSION (50 docs only)
import os
import json
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.documents import Document

def load_legal_docs_from_folders(root_dir: str, max_docs: int = 50) -> list[Document]:
    """
    Enhanced loader that preserves structured metadata from folders containing
    both 'metadata.json' and 'content.txt' files.
    
    Args:
        root_dir: Directory to load documents from
        max_docs: Maximum number of documents to load (for testing)
    """
    documents = []
    doc_count = 0
    
    print(f"ÔøΩÔøΩ Loading documents (max {max_docs} for testing)...")
    
    for dirpath, dirnames, filenames in os.walk(root_dir):
        if doc_count >= max_docs:
            print(f"‚úÖ Reached limit of {max_docs} documents for testing")
            break
            
        if "metadata.json" in filenames and "content.txt" in filenames:
            metadata_path = os.path.join(dirpath, "metadata.json")
            content_path = os.path.join(dirpath, "content.txt")

            try:
                with open(metadata_path, 'r', encoding='utf-8') as f:
                    metadata_json = json.load(f)
                with open(content_path, 'r', encoding='utf-8') as f:
                    full_text = f.read()

                # Enhanced page content with title and summary
                page_content = (
                    f"Ti√™u ƒë·ªÅ: {metadata_json.get('title', '')}\n\n"
                    f"T√≥m t·∫Øt: {metadata_json['metadata'].get('t√≥m t·∫Øt vƒÉn b·∫£n', '')}\n\n"
                    f"To√†n vƒÉn: {full_text}"
                )

                # Preserve important metadata
                final_metadata = metadata_json.get("metadata", {}).get("thu·ªôc t√≠nh", {})
                final_metadata['url'] = metadata_json.get('url', '')
                final_metadata['title'] = metadata_json.get('title', '')
                final_metadata['source'] = content_path
                final_metadata['doc_id'] = doc_count + 1  # Add document ID for testing
                
                doc = Document(page_content=page_content, metadata=final_metadata)
                documents.append(doc)
                doc_count += 1
                
                # Show progress every 10 documents
                if doc_count % 10 == 0:
                    print(f"üìÑ Loaded {doc_count}/{max_docs} documents...")
                    
            except Exception as e:
                print(f"Error processing files in {dirpath}: {e}")
    
    return documents

# TESTING CONFIGURATION - Load only a few documents
root_dir = '../data/raw_data/documents'
MAX_DOCS_FOR_TESTING = 50 

print("üß™ TESTING MODE: Loading limited documents for faster testing")
print(f"ÔøΩÔøΩ Will load maximum {MAX_DOCS_FOR_TESTING} documents")
print("=" * 60)

try:
    docs = load_legal_docs_from_folders(root_dir, max_docs=MAX_DOCS_FOR_TESTING)
    print(f'\n‚úÖ Enhanced loading: Loaded {len(docs)} structured documents with metadata')
    if docs:
        print("üìã Sample metadata:", docs[0].metadata)
        print(f"üìä Document range: 1-{len(docs)}")
except Exception as e:
    print(f"‚ùå Enhanced loading failed: {e}")
    print("üîÑ Falling back to simple loading...")
    
    # Fallback to simple loading with limit
    loader = DirectoryLoader(
        root_dir, 
        glob='**/*.txt', 
        loader_cls=TextLoader, 
        show_progress=True
    )
    all_docs = loader.load()
    docs = all_docs[:MAX_DOCS_FOR_TESTING]  # Take only first 100
    print(f'‚úÖ Simple loading: Loaded {len(docs)} documents (limited to {MAX_DOCS_FOR_TESTING})')

print(f'\nüìÑ First document preview:')
print(docs[0].page_content[:300])

print(f"\nÔøΩÔøΩ TESTING SUMMARY:")
print(f"üìä Total documents loaded: {len(docs)}")
print(f"‚è±Ô∏è  Ready for testing with reduced dataset")
print("=" * 60)

üß™ TESTING MODE: Loading limited documents for faster testing
ÔøΩÔøΩ Will load maximum 50 documents
ÔøΩÔøΩ Loading documents (max 50 for testing)...
üìÑ Loaded 10/50 documents...
üìÑ Loaded 20/50 documents...
üìÑ Loaded 30/50 documents...
üìÑ Loaded 40/50 documents...
üìÑ Loaded 50/50 documents...
‚úÖ Reached limit of 50 documents for testing

‚úÖ Enhanced loading: Loaded 50 structured documents with metadata
üìã Sample metadata: {'S·ªë hi·ªáu': '05/CT-UBND', 'Lo·∫°i vƒÉn b·∫£n': 'Ch·ªâ th·ªã', 'N∆°i ban h√†nh': 'Th√†nh ph·ªë C·∫ßn Th∆°', 'Ng∆∞·ªùi k√Ω': 'V∆∞∆°ng Qu·ªëc Nam', 'Ng√†y ban h√†nh': '26/08/2025', 'Ng√†y hi·ªáu l·ª±c': 'ƒê√£ bi·∫øt', 'Ng√†y c√¥ng b√°o': 'ƒêang c·∫≠p nh·∫≠t', 'S·ªë c√¥ng b√°o': 'ƒêang c·∫≠p nh·∫≠t', 'T√¨nh tr·∫°ng': 'ƒê√£ bi·∫øt', 'url': 'https://thuvienphapluat.vn/van-ban/Bo-may-hanh-chinh/Chi-thi-05-CT-UBND-2025-day-manh-viec-chap-hanh-quy-dinh-phap-luat-ve-gia-Can-Tho-670694.aspx', 'title': 'Ch·ªâ th·ªã 05/CT-UBND 2025 ƒë·∫©y m·∫°nh vi·ªác ch·∫•p h

# II. Enhanced Document Splitting (Parent-Child Architecture)

In [2]:
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore

# Parent-Child Architecture for better context preservation
print("Setting up parent-child document architecture...")

# Parent splitter: larger chunks for context
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, 
    chunk_overlap=300,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# Child splitter: smaller chunks for retrieval
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400, 
    chunk_overlap=100,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

# Create parent chunks and store them
docstore = InMemoryStore()
child_chunks = []

for doc in docs:
    # Split into parent chunks
    parent_chunks = parent_splitter.split_documents([doc])
    
    for i, parent_chunk in enumerate(parent_chunks):
        parent_id = f"{doc.metadata.get('title', 'doc')}-{i}"
        docstore.mset([(parent_id, parent_chunk)])
        
        # Split parent into child chunks
        sub_chunks = child_splitter.split_documents([parent_chunk])
        for sub_chunk in sub_chunks:
            sub_chunk.metadata['parent_id'] = parent_id
            # Preserve original metadata
            sub_chunk.metadata.update(doc.metadata)
            child_chunks.append(sub_chunk)

print(f"Created {len(child_chunks)} child chunks from {len(docs)} documents")
print(f"Parent chunks stored in docstore for context retrieval")

Setting up parent-child document architecture...
Created 2261 child chunks from 50 documents
Parent chunks stored in docstore for context retrieval


In [4]:
print(f'Parent-child architecture setup complete!')
print(f'Child chunks for retrieval: {len(child_chunks)}')
print(f'Average child chunk size: {sum(len(chunk.page_content) for chunk in child_chunks) // len(child_chunks)} characters')

Parent-child architecture setup complete!
Child chunks for retrieval: 2261
Average child chunk size: 608 characters


In [5]:
# Show chunk size distribution
chunk_sizes = [len(chunk.page_content) for chunk in child_chunks]
print(f"Child chunk sizes (first 10): {chunk_sizes[:10]}")
print(f"Min: {min(chunk_sizes)}, Max: {max(chunk_sizes)}, Avg: {sum(chunk_sizes)//len(chunk_sizes)}")

Child chunk sizes (first 10): [547, 664, 747, 635, 393, 485, 455, 489, 656, 623]
Min: 25, Max: 883, Avg: 608


# III. Enhanced Embedding with ProtonX

In [6]:
import os
from dotenv import load_dotenv
from typing import List
from langchain_core.embeddings import Embeddings
from protonx import ProtonX

load_dotenv()  # take environment variables

True

In [7]:
class ProtonXEmbeddings(Embeddings):
    """
    A custom LangChain embeddings class for the ProtonX API.
    This wrapper handles authentication and batching to prevent token limit errors.
    """
    def __init__(self, batch_size: int = 8, max_retries: int = 3, delay: int = 5):
        """Initializes the ProtonX client and sets the batch size."""
        self.client = ProtonX()  # Automatically uses PROTONX_API_KEY from environment
        self.batch_size = batch_size
        self.max_retries = max_retries
        self.delay = delay
        print(f"ProtonXEmbeddings initialized with batch size: {self.batch_size}")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Generates embeddings for a list of documents, processing them in batches
        to stay within the API's token limits.
        """
        all_embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            for attempt in range(self.max_retries):
                try:
                    response = self.client.embeddings.create(input=batch)
                    if not response or not response.get("data"):
                        raise ValueError(f"Failed to get embeddings from ProtonX API for batch starting at index {i}")

                    batch_embeddings = [item.get("embedding") for item in response.get("data")]
                    all_embeddings.extend(batch_embeddings)
                    break

                except ConnectionError as e:
                    print(f"Connection error on attempt {attempt + 1}/{self.max_retries}. Retrying in {self.delay} seconds...")
                    if attempt + 1 == self.max_retries:
                        raise e
                    time.sleep(self.delay)
        return all_embeddings

    def embed_query(self, text: str) -> List[float]:
        """Generates an embedding for a single query text."""
        for attempt in range(self.max_retries):
            try:
                response = self.client.embeddings.create(text)
                if not response or not response.get("data"):
                    raise ValueError("Failed to get embedding from ProtonX API for the query")

                return response.get("data")[0].get("embedding")
            except ConnectionError as e:
                print(f"Connection eror on attempt {attempt + 1}/{self.max_retries}. Retrying in {self.delay} seconds...")
                if attempt + 1 == self.max_retries:
                    raise e
                time.sleep(self.delay)

In [8]:
embeddings = ProtonXEmbeddings(batch_size=15)
print("ProtonX embeddings object created and ready to use.")

ProtonXEmbeddings initialized with batch size: 15
ProtonX embeddings object created and ready to use.


In [None]:
# import os

# # Set your ProtonX API key
# os.environ["PROTONX_API_KEY"] = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJlbWFpbCI6Im5wcWFjZWR1QGdtYWlsLmNvbSIsImlhdCI6MTc1NzkwNTY1NiwiZXhwIjoxNzYwNDk3NjU2fQ.7vSGzeMSRAbdzrWSZikUt5Xftr1BMQb_OEoi577R39s"

# # Then initialize the client
# from protonx import ProtonX
# client = ProtonX()

# # Test the connection
# result = client.embeddings.create("T√¥i y√™u Vi·ªát Nam ƒë·∫Øm say hihi!!!")
# print("Success:", result)

In [9]:
import time
from tqdm import tqdm

# 1. Extract texts and metadatas from child chunks
texts = [chunk.page_content for chunk in child_chunks]
metadatas = [chunk.metadata for chunk in child_chunks]

print(f"üìä Starting embedding process for {len(texts)} child chunks...")
print(f"üîß Using batch size: {embeddings.batch_size}")
print(f"‚è±Ô∏è  Estimated time: {len(texts) / embeddings.batch_size * 2:.1f} seconds (rough estimate)")
print("=" * 60)

# 2. Simple progress bar like the image
start_time = time.time()

# Process embeddings with simple progress bar
text_embeddings = []
total_chunks = len(texts)

# Create a simple progress bar that matches the image style
with tqdm(total=total_chunks, desc="Embedding chunks", unit="chunk") as pbar:
    
    for i in range(0, len(texts), embeddings.batch_size):
        batch = texts[i:i + embeddings.batch_size]
        
        try:
            # Process batch
            batch_embeddings = embeddings.embed_documents(batch)
            text_embeddings.extend(batch_embeddings)
            
            # Update progress bar
            pbar.update(len(batch))
            
        except Exception as e:
            print(f"\n‚ùå Error in batch {i//embeddings.batch_size + 1}: {e}")
            raise e

end_time = time.time()
total_time = end_time - start_time

# Final summary
print("\n" + "=" * 60)
print("üéâ EMBEDDING PROCESS COMPLETED!")
print("=" * 60)
print(f"ÔøΩÔøΩ Total chunks processed: {len(text_embeddings)}")
print(f"‚è±Ô∏è  Total time: {total_time:.2f} seconds ({total_time/60:.1f} minutes)")
print(f"‚ö° Average speed: {total_time/len(text_embeddings):.2f} seconds per chunk")
print(f"ÔøΩÔøΩ Throughput: {len(text_embeddings)/total_time:.1f} chunks per second")
print(f"ÔøΩÔøΩ Embedding dimension: {len(text_embeddings[0]) if text_embeddings else 'N/A'}")
print("=" * 60)

üìä Starting embedding process for 2261 child chunks...
üîß Using batch size: 15
‚è±Ô∏è  Estimated time: 301.5 seconds (rough estimate)


Embedding chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2261/2261 [34:54<00:00,  1.08chunk/s]


üéâ EMBEDDING PROCESS COMPLETED!
ÔøΩÔøΩ Total chunks processed: 2261
‚è±Ô∏è  Total time: 2094.16 seconds (34.9 minutes)
‚ö° Average speed: 0.93 seconds per chunk
ÔøΩÔøΩ Throughput: 1.1 chunks per second
ÔøΩÔøΩ Embedding dimension: 768





In [10]:
# Embed a sample text to find the dimension
sample_embedding = embeddings.embed_query("this is a test")
dimension = len(sample_embedding)
print(f"The embedding dimension is: {dimension}")

The embedding dimension is: 768


# IV. Enhanced Vector Stores (FAISS + BM25)

In [11]:
# Setup FAISS vector store
from langchain_community.vectorstores import FAISS

print("\nCreating FAISS vector store...")
# FAISS requires (text, embedding) pairs
text_embedding_pairs = list(zip(texts, text_embeddings))
vectorstore_local = FAISS.from_embeddings(text_embedding_pairs, embeddings, metadatas=metadatas)
print("FAISS vector store created successfully.")

# Setup BM25 retriever with Vietnamese tokenization
from langchain_community.retrievers import BM25Retriever
from pyvi import ViTokenizer

print("\nSetting up BM25 retriever with Vietnamese tokenization...")

def tokenize_vi_for_bm25(doc):
    """Tokenize Vietnamese text for better BM25 matching"""
    doc.page_content = ViTokenizer.tokenize(doc.page_content)
    return doc

# Create tokenized copies for BM25
child_chunks_tokenized = [tokenize_vi_for_bm25(chunk) for chunk in child_chunks]
bm25_retriever = BM25Retriever.from_documents(child_chunks_tokenized)
bm25_retriever.k = 15  # Retrieve top 15 for ensemble
print("BM25 retriever with Vietnamese tokenization created successfully.")


Creating FAISS vector store...
FAISS vector store created successfully.

Setting up BM25 retriever with Vietnamese tokenization...
BM25 retriever with Vietnamese tokenization created successfully.


In [12]:
# Setup Ensemble Retrieval (BM25 + FAISS)
from langchain.retrievers import EnsembleRetriever

print("\nSetting up ensemble retrieval...")

# Create FAISS retriever
faiss_retriever = vectorstore_local.as_retriever(search_kwargs={'k': 15})

# Create ensemble retriever combining BM25 and FAISS
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.6, 0.4]  # Favor BM25 for Vietnamese lexical matching
)

print("Ensemble retriever (BM25 + FAISS) created successfully.")

# Setup Parent Document Retrieval Chain
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

def _get_parent_docs(input_dict: dict) -> list[Document]:
    """Retrieve parent documents from child documents"""
    child_docs = input_dict["child_docs"]
    store = input_dict["docstore"]
    parent_ids = []
    for doc in child_docs:
        if "parent_id" in doc.metadata and doc.metadata["parent_id"] not in parent_ids:
            parent_ids.append(doc.metadata["parent_id"])
    return [doc for doc in store.mget(parent_ids) if doc is not None]

# Create the parent retrieval chain
parent_retriever_chain = (
    {"child_docs": ensemble_retriever}
    | RunnablePassthrough.assign(docstore=lambda x: docstore)
    | RunnableLambda(_get_parent_docs)
)

print("Parent document retrieval chain created successfully.")


Setting up ensemble retrieval...
Ensemble retriever (BM25 + FAISS) created successfully.
Parent document retrieval chain created successfully.


# V. Enhanced Retrieval Testing

In [13]:
# Test the enhanced hybrid retrieval
query = "nh√† n∆∞·ªõc t·∫∑ng bao nhi√™u ti·ªÅn cho m·ªôt ng∆∞·ªùi?"

print(f"Testing hybrid retrieval for query: '{query}'\n")

# Test ensemble retrieval (child chunks)
print("=== Ensemble Retrieval (Child Chunks) ===")
ensemble_docs = ensemble_retriever.invoke(query)
print(f"Retrieved {len(ensemble_docs)} child chunks")

# Test parent document retrieval
print("\n=== Parent Document Retrieval ===")
parent_docs = parent_retriever_chain.invoke(query)
print(f"Retrieved {len(parent_docs)} parent documents")

# Show sample results
print(f"\n--- Sample Parent Document ---")
if parent_docs:
    print(f"Title: {parent_docs[0].metadata.get('title', 'N/A')}")
    print(f"Content preview: {parent_docs[0].page_content[:300]}...")
    print(f"Source: {parent_docs[0].metadata.get('source', 'N/A')}")

Testing hybrid retrieval for query: 'nh√† n∆∞·ªõc t·∫∑ng bao nhi√™u ti·ªÅn cho m·ªôt ng∆∞·ªùi?'

=== Ensemble Retrieval (Child Chunks) ===
Retrieved 29 child chunks

=== Parent Document Retrieval ===
Retrieved 16 parent documents

--- Sample Parent Document ---
Title: C√¥ng ƒëi·ªán 149/Cƒê-TTg 2025 t·∫∑ng qu√† ng∆∞·ªùi d√¢n d·ªãp k·ª∑ ni·ªám 80 nƒÉm C√°ch m·∫°ng Th√°ng T√°m Qu·ªëc kh√°nh 2/9
Content preview: To√†n vƒÉn: TH·ª¶ T∆Ø·ªöNG CH√çNH PH·ª¶ -------- C·ªòNG H√íA X√É H·ªòI CH·ª¶ NGHƒ®A VI·ªÜT NAM ƒê·ªôc l·∫≠p - T·ª± do - H·∫°nh ph√∫c --------------- S·ªë: 149/Cƒê-TTg H√† N·ªôi, ng√†y 28 th√°ng 8 nƒÉm 2025

TH·ª¶ T∆Ø·ªöNG CH√çNH PH·ª¶ --------

C·ªòNG H√íA X√É H·ªòI CH·ª¶ NGHƒ®A VI·ªÜT NAM ƒê·ªôc l·∫≠p - T·ª± do - H·∫°nh ph√∫c ---------------

S·ªë: 149/Cƒê-TTg

H√† N·ªôi,...
Source: ../data/raw_data/documents\C√¥ng ƒëi·ªán 149Cƒê-TTg 2025 t·∫∑ng qu√† ng∆∞·ªùi d√¢n d·ªãp k·ª∑ ni·ªám 80 nƒÉm C√°ch m·∫°ng Th√°ng T√°m Qu·ªëc kh√°nh 29\content.txt


# VI. Enhanced RAG with Hybrid Retrieval

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temperature=0)

# Enhanced prompt template with Vietnamese instructions and citation requirements
template = """Answer the question based ONLY on the following context.
Your answer must be in Vietnamese.
Your answer should be well-structured and easy to read.
- Use bullet points or numbered lists for multiple items or steps.
- Use **bold** for key terms, names, or important numbers and concepts.
- Use *italics* for emphasis or to highlight specific terms.

**After providing the answer, you MUST cite your sources accurately using the metadata from the context.**
For each source used, provide its title and document number (S·ªë hi·ªáu) if available.

Context:
{context}

Question: {question}

If the context does not provide enough information, say "T√¥i kh√¥ng t√¨m th·∫•y th√¥ng tin trong t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p." and do not provide an answer."""

prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    """Format retrieved documents with metadata for better context"""
    formatted_docs = []
    for i, doc in enumerate(docs):
        # Add document metadata as header
        title = doc.metadata.get('title', f'Document {i+1}')
        source = doc.metadata.get('source', 'Unknown source')
        formatted_doc = f"--- Document: {title} ---\n{doc.page_content}\n--- End Document ---"
        formatted_docs.append(formatted_doc)
    return "\n\n".join(formatted_docs)

# Enhanced RAG chain using parent document retrieval
rag_chain = (
    {"context": parent_retriever_chain | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("Enhanced RAG chain with hybrid retrieval and parent documents created successfully.")

Enhanced RAG chain with hybrid retrieval and parent documents created successfully.


In [16]:
# Test the enhanced RAG system
query = "nh√† n∆∞·ªõc t·∫∑ng bao nhi√™u ti·ªÅn cho m·ªôt ng∆∞·ªùi?"
print(f"Query: {query}\n")
print("=" * 80)

response = rag_chain.invoke(query)
print(f"Response: {response}\n")

Query: nh√† n∆∞·ªõc t·∫∑ng bao nhi√™u ti·ªÅn cho m·ªôt ng∆∞·ªùi?

Response: D·ª±a tr√™n th√¥ng tin t·ª´ c√°c t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p, nh√¢n d·ªãp k·ª∑ ni·ªám **80 nƒÉm C√°ch m·∫°ng Th√°ng T√°m v√† Qu·ªëc kh√°nh 2/9**, nh√† n∆∞·ªõc s·∫Ω t·∫∑ng qu√† cho ng∆∞·ªùi d√¢n v·ªõi m·ª©c c·ª• th·ªÉ nh∆∞ sau:

*   **M·ª©c qu√† t·∫∑ng**: **100.000 ƒë·ªìng/ng∆∞·ªùi d√¢n**.

M√≥n qu√† n√†y ƒë∆∞·ª£c trao *cho to√†n d√¢n ƒÉn T·∫øt ƒê·ªôc l·∫≠p* v√† ph·∫£i ƒë∆∞·ª£c chuy·ªÉn ƒë·∫øn ng∆∞·ªùi d√¢n xong tr∆∞·ªõc ng√†y Qu·ªëc kh√°nh 02/9/2025.

***

**Ngu·ªìn:**
1.  C√¥ng ƒëi·ªán 149/Cƒê-TTg 2025 t·∫∑ng qu√† ng∆∞·ªùi d√¢n d·ªãp k·ª∑ ni·ªám 80 nƒÉm C√°ch m·∫°ng Th√°ng T√°m Qu·ªëc kh√°nh 2/9 (S·ªë hi·ªáu: 149/Cƒê-TTg)
2.  C√¥ng ƒëi·ªán 09/Cƒê-KBNN 2025 qu√°n tri·ªát ch·ªâ ƒë·∫°o th·ª±c hi·ªán K·∫øt lu·∫≠n 183/KT-TW v√† 260/NQ-CP (S·ªë hi·ªáu: 09/Cƒê-KBNN)



In [17]:
# Test with another query
query = "nh√† n∆∞·ªõc t·∫∑ng qu√† cho d√¢n nh√¢n d·ªãp g√¨?"
print(f"Query: {query}\n")
print("=" * 80)

response = rag_chain.invoke(query)
print(f"Response: {response}\n")

Query: nh√† n∆∞·ªõc t·∫∑ng qu√† cho d√¢n nh√¢n d·ªãp g√¨?

Response: D·ª±a tr√™n th√¥ng tin t·ª´ c√°c t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p, Nh√† n∆∞·ªõc t·∫∑ng qu√† cho ng∆∞·ªùi d√¢n nh√¢n d·ªãp:

*   **K·ª∑ ni·ªám 80 nƒÉm C√°ch m·∫°ng Th√°ng T√°m v√† Qu·ªëc kh√°nh 2/9**.

---
**Ngu·ªìn:**
*   C√¥ng ƒëi·ªán 149/Cƒê-TTg 2025 t·∫∑ng qu√† ng∆∞·ªùi d√¢n d·ªãp k·ª∑ ni·ªám 80 nƒÉm C√°ch m·∫°ng Th√°ng T√°m Qu·ªëc kh√°nh 2/9, S·ªë hi·ªáu: 149/Cƒê-TTg.
*   C√¥ng ƒëi·ªán 152/C√ê-TTg 2025 kh·∫©n tr∆∞∆°ng tri·ªÉn khai t·∫∑ng qu√† nh√¢n d·ªãp k·ª∑ ni·ªám 80 nƒÉm Qu·ªëc kh√°nh, S·ªë hi·ªáu: 152/Cƒê-TTg.
*   C√¥ng ƒëi·ªán 154/Cƒê-TTg 2025 ƒë·∫©y nhanh t·∫∑ng qu√† Nh√¢n d√¢n nh√¢n d·ªãp k·ª∑ ni·ªám Qu·ªëc kh√°nh, S·ªë hi·ªáu: 154/Cƒê-TTg.
*   C√¥ng ƒëi·ªán 09/Cƒê-KBNN 2025 qu√°n tri·ªát ch·ªâ ƒë·∫°o th·ª±c hi·ªán K·∫øt lu·∫≠n 183/KT-TW v√† 260/NQ-CP, S·ªë hi·ªáu: 09/Cƒê-KBNN.



In [18]:
from IPython.display import Markdown, display

In [19]:
# Test with complex legal query
query = "D·ª±a tr√™n h·ªì s∆° do C√¥ng ty TNHH OTES CORPORATION cung c·∫•p, h√£y m√¥ t·∫£ c√°c ƒë·∫∑c t√≠nh k·ªπ thu·∫≠t ch√≠nh c·ªßa s·∫£n ph·∫©m TamSoil Polynite ECO ·ªü tr·∫°ng th√°i dung d·ªãch sau khi pha, v√† n√™u r√µ cƒÉn c·ª© ph√°p l√Ω cao nh·∫•t (Lu·∫≠t) m√† C·ª•c H·∫£i quan ƒë√£ d·ª±a v√†o ƒë·ªÉ ban h√†nh th√¥ng b√°o m√£ s·ªë cho s·∫£n ph·∫©m n√†y."
print(f"Query: {query}\n")
print("=" * 80)

response = rag_chain.invoke(query)
print(f"Response: {response}\n")

# Display with markdown formatting
display(Markdown(f"**Response:** {response}"))


Query: D·ª±a tr√™n h·ªì s∆° do C√¥ng ty TNHH OTES CORPORATION cung c·∫•p, h√£y m√¥ t·∫£ c√°c ƒë·∫∑c t√≠nh k·ªπ thu·∫≠t ch√≠nh c·ªßa s·∫£n ph·∫©m TamSoil Polynite ECO ·ªü tr·∫°ng th√°i dung d·ªãch sau khi pha, v√† n√™u r√µ cƒÉn c·ª© ph√°p l√Ω cao nh·∫•t (Lu·∫≠t) m√† C·ª•c H·∫£i quan ƒë√£ d·ª±a v√†o ƒë·ªÉ ban h√†nh th√¥ng b√°o m√£ s·ªë cho s·∫£n ph·∫©m n√†y.

Response: T√¥i kh√¥ng t√¨m th·∫•y th√¥ng tin trong t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p.



**Response:** T√¥i kh√¥ng t√¨m th·∫•y th√¥ng tin trong t√†i li·ªáu ƒë∆∞·ª£c cung c·∫•p.

# VII. Performance Comparison and Analysis

## Enhanced Features Summary

### ‚úÖ **What's New in This Enhanced Version:**

1. **üîç Hybrid Retrieval**: BM25 (lexical) + FAISS (semantic) ensemble
2. **üáªüá≥ Vietnamese Tokenization**: Better BM25 matching for Vietnamese text
3. **üìö Parent-Child Architecture**: Better context preservation
4. **üìÑ Enhanced Data Loading**: Preserves structured metadata
5. **üéØ Improved Prompting**: Vietnamese instructions with citation requirements
6. **‚ö° ProtonX Embeddings**: High-quality Vietnamese embeddings (free)

### **Architecture Flow:**
```
Query ‚Üí [BM25 + FAISS] ‚Üí Ensemble ‚Üí Parent-Child Hydration ‚Üí Enhanced RAG ‚Üí Response
```

### **Key Improvements Over Original:**
- **Better Recall**: Hybrid retrieval catches both exact matches and semantic similarity
- **Better Context**: Parent-child architecture provides more complete context
- **Vietnamese-Optimized**: Tokenization and specialized embeddings
- **Production-Ready**: Robust error handling and metadata preservation
- **Citation Support**: Structured metadata for accurate source attribution


In [25]:
# Optional: Add simple re-ranking for even better precision
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def simple_rerank(docs, query, top_k=5):
    """Simple re-ranking based on query-document similarity"""
    if not docs:
        return docs
    
    # Use the same ProtonX embeddings for consistency
    model = SentenceTransformer('dangvantuan/vietnamese-embedding')
    
    # Get embeddings
    query_embedding = model.encode([query])
    doc_embeddings = model.encode([doc.page_content for doc in docs])
    
    # Calculate similarities
    similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
    
    # Sort by similarity and return top-k
    doc_sim_pairs = list(zip(docs, similarities))
    doc_sim_pairs.sort(key=lambda x: x[1], reverse=True)
    
    return [doc for doc, _ in doc_sim_pairs[:top_k]]

# Test re-ranking
print("Testing simple re-ranking...")
test_query = "nh√† n∆∞·ªõc t·∫∑ng bao nhi√™u ti·ªÅn cho m·ªôt ng∆∞·ªùi?"
retrieved_docs = parent_retriever_chain.invoke(test_query)
reranked_docs = simple_rerank(retrieved_docs, test_query, top_k=3)

print(f"Original retrieval: {len(retrieved_docs)} documents")
print(f"After re-ranking: {len(reranked_docs)} documents")
print("Re-ranking function ready for use if needed.")


Testing simple re-ranking...


Token indices sequence length is longer than the specified maximum sequence length for this model (774 > 128). Running this sequence through the model will result in indexing errors


Original retrieval: 16 documents
After re-ranking: 3 documents
Re-ranking function ready for use if needed.
