In [1]:
import os
import json
import re
import tiktoken
from docx import Document
from PyPDF2 import PdfReader
import nltk
from nltk.tokenize import word_tokenize
import pdfplumber

In [2]:
def remove_law_header(text):
    """
    T√¨m "Ch∆∞∆°ng I" v√† l·∫•y t·ª´ ƒë√≥ tr·ªü ƒëi (ƒêi·ªÅu 1 n·∫±m TRONG Ch∆∞∆°ng I).
    """
    pattern = r"(Ch∆∞∆°ng\s+[I1])"
    match = re.search(pattern, text, flags=re.IGNORECASE)

    if match:
        # L·∫•y t·ª´ "Ch∆∞∆°ng I" tr·ªü ƒëi (ƒêi·ªÅu 1 n·∫±m sau ƒë√≥)
        return text[match.start():]
    else:
        # Kh√¥ng t√¨m th·∫•y Ch∆∞∆°ng I => gi·ªØ nguy√™n
        return text

In [3]:
def extract_text(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

def clean_text(t):
    """Gi·ªØ l·∫°i newline ƒë·ªÉ regex t√°ch ƒëi·ªÅu"""
    t = t.replace("\xa0", " ")
    t = re.sub(r"[ \t]+", " ", t)  
    return t.strip()


In [4]:
#T√°ch ƒëi·ªÅu-kho·∫£n

regex_dieu = re.compile(r"(?m)^(ƒêi·ªÅu\s+\d+)\.")  
regex_khoan = re.compile(r"(?<=\n|\s)(\d{1,2})\.(?=\s)")   
regex_diem = re.compile(r"^[a-z]\)$")  

def split_by_dieu(text):
    parts = regex_dieu.split(text)
    results = []
    
    for i in range(1,len(parts), 2):
        if i + 1 < len(parts):
            dieu_title = parts[i].strip()
            dieu_content = parts[i+1].strip()
            results.append((dieu_title, dieu_content))

    return results

def split_by_khoan(dieu_content):
    parts = regex_khoan.split(dieu_content)
    results = []

    for i in range(1, len(parts), 2):
        khoan_num = parts[i]
        khoan_content = parts[i+1].strip()
        results.append((khoan_num, khoan_content))

    return results


def split_by_diem(khoan_content):
    parts = regex_diem.split(khoan_content)
    results = []

    for i in range(1, len(parts), 2):
        diem_letter = parts[i]
        diem_content = parts[i+1].strip()
        results.append((diem_letter, diem_content))

    return results

In [5]:
#ƒêo token

def count_tokens(text):
    return len(text.split())

In [6]:
def create_chunks(law_name, text):
    chunks = []
    ds_dieu = split_by_dieu(text)

    for dieu_title, dieu_body in ds_dieu:
        nums = re.findall(r"\d+", dieu_title)
        if not nums:
            continue
        dieu_num = int(nums[0])  

        ds_khoan = split_by_khoan(dieu_body)

        if not ds_khoan or len(ds_khoan) == 0:
            chunk_id = f"{law_name}_d{dieu_num}"
            chunks.append({
                "id": chunk_id,
                "law": law_name,
                "dieu": dieu_num, 
                "khoan": None,
                "diem": None,
                "text": f"[ƒêi·ªÅu {dieu_num}]: {dieu_body.strip()}"
            })
            continue

        for khoan_num, khoan_body in ds_khoan:
            chunk_id = f"{law_name}_d{dieu_num}_k{khoan_num}"
            
            chunks.append({
                "id": chunk_id,
                "law": law_name,
                "dieu": dieu_num,  
                "khoan": int(khoan_num),
                "diem": None,
                "text": f"[ƒêi·ªÅu {dieu_num}] [Kho·∫£n {khoan_num}]: {khoan_body.strip()}"
            })

    return chunks

In [7]:
if __name__ == "__main__":
    PDF_PATH = "luatgtdb.pdf"        
    LAW_NAME = "168/2024/Nƒê-CP"

    raw_text = extract_text(PDF_PATH)
    clean = clean_text(raw_text)

    chunks = create_chunks(LAW_NAME, clean)

    print("T·ªïng chunk t·∫°o ƒë∆∞·ª£c:", len(chunks))

    with open("luatgtdb_chunks.json", "w", encoding="utf-8") as f:
        json.dump(chunks, f, ensure_ascii=False, indent=2)

    print("ƒê√£ l∆∞u file luatgtdb_chunks.json ‚úî")

T·ªïng chunk t·∫°o ƒë∆∞·ª£c: 338
ƒê√£ l∆∞u file luatgtdb_chunks.json ‚úî


In [8]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import os
from operator import itemgetter
import gradio as gr
import json

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings  
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

In [10]:
# C·∫§U H√åNH API KEY GEMINI
os.environ["GOOGLE_API_KEY"] = "AIzaSyDjjNTAujGgYUPgLk644qIyV6xxgGSwwYc"  

In [11]:
from sentence_transformers import SentenceTransformer
from langchain.embeddings.base import Embeddings

class STEmbedder(Embeddings):
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        texts = [f"passage: {t}" for t in texts]
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        text = f"query: {text}"
        return self.model.encode([text], convert_to_numpy=True)[0].tolist()


In [12]:
# KH·ªûI T·∫†O LLM & EMBEDDING
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,
)

embeddings = STEmbedder("intfloat/multilingual-e5-large")


In [13]:
# LOAD JSON & T·∫†O DOCUMENTS
JSON_PATH = "luatgtdb_chunks.json"   

with open(JSON_PATH, "r", encoding="utf-8") as f:
    raw_chunks = json.load(f)   # list[dict]

documents = [
    Document(
        page_content=item["text"],
        metadata={
            "id": item.get("id"),
            "law": item.get("law"),
            "dieu": item.get("dieu"),
            "khoan": item.get("khoan"),
            "diem": item.get("diem"),
        },
    )
    for item in raw_chunks
]

print(f"ƒê√£ load {len(documents)} chunks t·ª´ JSON.")

ƒê√£ load 338 chunks t·ª´ JSON.


In [14]:
# T·∫†O VECTORSTORE & RETRIEVER

vectorstore = DocArrayInMemorySearch.from_documents(
    documents=documents,
    embedding=embeddings,
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 30})
print("Vectorstore ƒë√£ kh·ªüi t·∫°o xong.")


Vectorstore ƒë√£ kh·ªüi t·∫°o xong.


## So s√°nh c√°c Retriever

In [15]:
# 1. VECTOR SEARCH RETRIEVER (Hi·ªán t·∫°i)

vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 30})


In [16]:
# 2. BM25 RETRIEVER (Keyword-based)
from langchain_community.retrievers import BM25Retriever

bm25_retriever = BM25Retriever.from_documents(documents)

In [17]:
# 3. HYBRID RETRIEVER (K·∫øt h·ª£p Vector + BM25)
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from typing import List

class HybridRetriever(BaseRetriever):
    """Hybrid retriever k·∫øt h·ª£p Vector Search + BM25"""
    vector_retriever: BaseRetriever
    bm25_retriever: BaseRetriever
    
    def _get_relevant_documents(self, query: str) -> List[Document]:
        vector_docs = self.vector_retriever.invoke(query)
        bm25_docs = self.bm25_retriever.invoke(query)
        
        # K·∫øt h·ª£p k·∫øt qu·∫£ t·ª´ c·∫£ hai retriever
        # Lo·∫°i b·ªè tr√πng l·∫∑p d·ª±a tr√™n metadata['id']
        doc_dict = {}
        for doc in vector_docs:
            doc_id = doc.metadata.get("id")
            doc_dict[doc_id] = doc
        for doc in bm25_docs:
            doc_id = doc.metadata.get("id")
            if doc_id not in doc_dict:
                doc_dict[doc_id] = doc
        
        return list(doc_dict.values())
    
    async def _aget_relevant_documents(self, query: str) -> List[Document]:
        return self._get_relevant_documents(query)

# Kh·ªüi t·∫°o Hybrid Retriever
hybrid_retriever = HybridRetriever(
    vector_retriever=vector_retriever,
    bm25_retriever=bm25_retriever
)


In [21]:
import json
from collections import defaultdict

# Load eval dataset
eval_file = "evaluation.json"
with open(eval_file, "r", encoding="utf-8") as f:
    eval_data = json.load(f)


# H√†m ki·ªÉm tra n·∫øu retrieved documents ch·ª©a gold document
def check_if_relevant(retrieved_docs, gold_id):
    """Ki·ªÉm tra xem gold_id c√≥ trong retrieved documents kh√¥ng"""
    for doc in retrieved_docs:
        if doc.metadata.get("id") == gold_id:
            return True
    return False

# H√†m t√≠nh rank c·ªßa gold document (position n√≥ xu·∫•t hi·ªán)
def get_rank(retrieved_docs, gold_id):
    """L·∫•y v·ªã tr√≠ (rank) c·ªßa gold document trong retrieved list"""
    for i, doc in enumerate(retrieved_docs):
        if doc.metadata.get("id") == gold_id:
            return i + 1  # 1-indexed
    return None


total_queries = len(eval_data)  


In [22]:
# 6b. T√çNH TH√äM METRIC: PRECISION, NDCG, HIT RATE

import numpy as np
import pandas as pd

def calculate_ndcg(rank, k):
    """T√≠nh NDCG (Normalized Discounted Cumulative Gain)"""
    if rank is None or rank > k:
        return 0
    # DCG: 1 / log2(rank+1)
    dcg = 1.0 / np.log2(rank + 1)
    # Ideal DCG: lu√¥n l√† 1 (best rank l√† 1)
    idcg = 1.0
    return dcg / idcg

# Recalculate v·ªõi th√™m metrics
results_extended = {
    "vector_search": {
        "hit": 0, "mrr": 0, "recall_5": 0, "recall_10": 0,
        "precision_5": 0, "precision_10": 0,
        "ndcg_5": 0, "ndcg_10": 0,
        "ranks": []
    },
    "bm25": {
        "hit": 0, "mrr": 0, "recall_5": 0, "recall_10": 0,
        "precision_5": 0, "precision_10": 0,
        "ndcg_5": 0, "ndcg_10": 0,
        "ranks": []
    },
    "hybrid": {
        "hit": 0, "mrr": 0, "recall_5": 0, "recall_10": 0,
        "precision_5": 0, "precision_10": 0,
        "ndcg_5": 0, "ndcg_10": 0,
        "ranks": []
    }
}


for idx, item in enumerate(eval_data):
    query = item["query"]
    gold_id = item["gold_id"]
    
    # Vector Search
    vs_docs = vector_retriever.invoke(query)
    vs_rank = get_rank(vs_docs, gold_id)
    
    # BM25
    bm25_docs = bm25_retriever.invoke(query)
    bm25_rank = get_rank(bm25_docs, gold_id)
    
    # Hybrid
    hybrid_docs = hybrid_retriever.invoke(query)
    hybrid_rank = get_rank(hybrid_docs, gold_id)
    
    for retriever_name, rank in [("vector_search", vs_rank), ("bm25", bm25_rank), ("hybrid", hybrid_rank)]:
        if rank is not None:
            results_extended[retriever_name]["hit"] += 1
            results_extended[retriever_name]["mrr"] += 1 / rank
            results_extended[retriever_name]["ranks"].append(rank)
            
            # Precision@5 v√† @10
            if rank <= 5:
                results_extended[retriever_name]["precision_5"] += 1
                results_extended[retriever_name]["recall_5"] += 1
            if rank <= 10:
                results_extended[retriever_name]["precision_10"] += 1
                results_extended[retriever_name]["recall_10"] += 1
            
            # NDCG@5 v√† @10
            results_extended[retriever_name]["ndcg_5"] += calculate_ndcg(rank, 5)
            results_extended[retriever_name]["ndcg_10"] += calculate_ndcg(rank, 10)


metrics_comprehensive = []

for retriever_name in ["vector_search", "bm25", "hybrid"]:
    data = results_extended[retriever_name]
    
    hit_rate = (data["hit"] / total_queries * 100) if total_queries > 0 else 0
    mrr = (data["mrr"] / total_queries) if total_queries > 0 else 0
    precision_5 = (data["precision_5"] / total_queries * 100) if total_queries > 0 else 0
    precision_10 = (data["precision_10"] / total_queries * 100) if total_queries > 0 else 0
    recall_5 = (data["recall_5"] / total_queries * 100) if total_queries > 0 else 0
    recall_10 = (data["recall_10"] / total_queries * 100) if total_queries > 0 else 0
    ndcg_5 = (data["ndcg_5"] / total_queries) if total_queries > 0 else 0
    ndcg_10 = (data["ndcg_10"] / total_queries) if total_queries > 0 else 0
    mean_rank = (np.mean(data["ranks"]) if data["ranks"] else float('inf'))
    
    metrics_comprehensive.append({
        "Retriever": retriever_name.upper().replace("_", " "),
        "Hit Rate (%)": f"{hit_rate:.1f}%",
        "MRR": f"{mrr:.4f}",
        "Precision@5 (%)": f"{precision_5:.1f}%",
        "Recall@5 (%)": f"{recall_5:.1f}%",
        "NDCG@5": f"{ndcg_5:.4f}",
        "Precision@10 (%)": f"{precision_10:.1f}%",
        "Recall@10 (%)": f"{recall_10:.1f}%",
        "NDCG@10": f"{ndcg_10:.4f}",
        "Mean Rank": f"{mean_rank:.1f}" if mean_rank != float('inf') else "N/A"
    })

df_comprehensive = pd.DataFrame(metrics_comprehensive)
print("\n")
print(df_comprehensive.to_string(index=False))




    Retriever Hit Rate (%)    MRR Precision@5 (%) Recall@5 (%) NDCG@5 Precision@10 (%) Recall@10 (%) NDCG@10 Mean Rank
VECTOR SEARCH        96.4% 0.7610           86.2%        86.2% 0.7782            92.4%         92.4%  0.7982       2.4
         BM25        87.4% 0.7727           87.4%        87.4% 0.7982            87.4%         87.4%  0.7982       1.3
       HYBRID        98.0% 0.7615           86.2%        86.2% 0.7782            92.4%         92.4%  0.7982       2.9


## Khuy·∫øn ngh·ªã s·ª≠ d·ª•ng

### File hi·ªán t·∫°i ƒëang d√πng: **Vector Search**
- T·ªët cho chatbot h·ªèi ƒë√°p ph√°p lu·∫≠t v√¨ c·∫ßn hi·ªÉu ng·ªØ c·∫£nh v√† √Ω nghƒ©a c·ªßa c√¢u h·ªèi
- Tuy ch·∫≠y h∆°n BM25 nh∆∞ng k·∫øt qu·∫£ ch√≠nh x√°c h∆°n

### ƒê·ªÉ thay ƒë·ªïi retriever cho RAG chain:
1. **D√πng BM25**: Thay `vector_retriever` b·∫±ng `bm25_retriever` ·ªü cell "X√ÇY D·ª∞NG RAG CHAIN"
2. **D√πng Hybrid**: Thay `vector_retriever` b·∫±ng `hybrid_retriever` (khuy·∫øn ngh·ªã nh·∫•t)
3. **D√πng Vector Search**: Gi·ªØ nguy√™n (m·∫∑c ƒë·ªãnh hi·ªán t·∫°i)


In [23]:
# ƒê·ªäNH NGHƒ®A H√ÄM FORMAT CONTEXT

def format_docs(docs):
    parts = []
    for d in docs:
        law = d.metadata.get("law")
        dieu = d.metadata.get("dieu")
        khoan = d.metadata.get("khoan")
        diem = d.metadata.get("diem")
        header = f"[Lu·∫≠t {law} - ƒêi·ªÅu {dieu}, Kho·∫£n {khoan}"
        if diem:
            header += f", ƒêi·ªÉm {diem}]"
        else:
            header += "]"
        parts.append(f"{header}\n{d.page_content}")
    return "\n\n---\n\n".join(parts)

In [24]:
# PROMPT CHO RAG

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "B·∫°n l√† tr·ª£ l√Ω ph√°p l√Ω, tr·∫£ l·ªùi d·ª±a tr√™n c√°c ƒëi·ªÅu kho·∫£n lu·∫≠t trong ph·∫ßn context. "
            "N·∫øu c√≥ th·ªÉ, h√£y n√™u r√µ ƒêi·ªÅu/Kho·∫£n/ƒêi·ªÉm."
        ),
        MessagesPlaceholder("chat_history"),  # l·ªãch s·ª≠ h·ªôi tho·∫°i nhi·ªÅu turn
        (
            "human",
            "C√¢u h·ªèi: {question}\n\n"
            "VƒÉn b·∫£n lu·∫≠t li√™n quan:\n{context}"
        ),
    ]
)

In [25]:
#X√ÇY D·ª∞NG RAG CHAIN

# Chain: question -> retriever -> format_docs -> prompt -> llm

base_rag_chain = (
    {
        "question": itemgetter("question"),
        "chat_history": itemgetter("chat_history"),
        "context": itemgetter("question")
        | RunnableLambda(lambda q: retriever.invoke(q))
        | RunnableLambda(format_docs),
    }
    | prompt
    | llm
)

In [26]:
_store = {} 
def get_session_history(session_id: str) -> InMemoryChatMessageHistory:
    if session_id not in _store:
        _store[session_id] = InMemoryChatMessageHistory()
    return _store[session_id]


rag_with_history = RunnableWithMessageHistory(
    base_rag_chain,
    get_session_history,
    input_messages_key="question",      
    history_messages_key="chat_history"
)

In [27]:
def chat(message, history):
    """
    message: c√¢u m·ªõi user g·ª≠i
    history: l·ªãch s·ª≠ h·ªôi tho·∫°i c·ªßa Gradio (kh√¥ng c·∫ßn t·ª± x·ª≠ l√Ω, v√¨ ta d√πng InMemoryChatMessageHistory ri√™ng)
    """
    # D√πng 1 session_id c·ªë ƒë·ªãnh cho Gradio. N·∫øu mu·ªën multi-user th√¨ map theo user id/cookie.
    session_id = "gradio-session"

    result = rag_with_history.invoke(
        {"question": message},
        config={"configurable": {"session_id": session_id}},
    )

    # ChatGoogleGenerativeAI tr·∫£ v·ªÅ AIMessage -> d√πng .content
    return result.content

In [38]:
custom_css = """
:root { --radius: 14px; }
body { background: linear-gradient(135deg, #f6f9ff, #eef2ff); }
.gradio-container {max-width: none !important; width: min(1400px, calc(100% - 48px)); margin: 32px auto; font-family: 'Inter', system-ui, -apple-system, sans-serif;}
#chatbot {height: 700px !important; border-radius: var(--radius); border: 1px solid #e5e7eb; box-shadow: 0 14px 36px rgba(0,0,0,0.08);}
.gr-chatbot-message { border-radius: 12px; font-size: 15px; line-height: 1.5; }
.gr-text-input input, .gr-text-input textarea { font-size: 15px; }
footer {display: none !important;}
"""

with gr.Blocks(css=custom_css) as view:
    gr.ChatInterface(
        fn=chat,
        title="üö¶ CHATBOT H·ªñ TR·ª¢ H·ªéI ƒê√ÅP GIAO TH√îNG",
        description="H·ªèi ƒë√°p nhanh v·ªÅ quy ƒë·ªãnh giao th√¥ng, tr√≠ch d·∫´n ƒêi·ªÅu/Kho·∫£n/ƒêi·ªÉm li√™n quan.",
        chatbot=gr.Chatbot(
            elem_id="chatbot",
            height=700,
            avatar_images=(
                "https://img.icons8.com/color/96/traffic-light.png",
                "https://img.icons8.com/fluency/96/police-badge.png"
            ),
        ),
        textbox=gr.Textbox(
            placeholder="Nh·∫≠p c√¢u h·ªèi c·ªßa b·∫°n (vd: M·ª©c ph·∫°t v∆∞·ª£t ƒë√®n ƒë·ªè?)",
            autofocus=True,
            lines=2,
            submit_btn="G·ª≠i",
        ),
        examples=[
            ["M·ª©c ph·∫°t khi kh√¥ng ƒë·ªôi m≈© b·∫£o hi·ªÉm?"],
            ["Gi·ªõi h·∫°n n·ªìng ƒë·ªô c·ªìn v·ªõi ng∆∞·ªùi ƒëi·ªÅu khi·ªÉn xe m√°y?"],
            ["Quy ƒë·ªãnh v·ªÅ t·ªëc ƒë·ªô t·ªëi ƒëa trong khu v·ª±c ƒë√¥ng d√¢n c∆∞?"],
        ],
        submit_btn="G·ª≠i",
    )

view.launch(inbrowser=True)

  with gr.Blocks(css=custom_css) as view:


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


