In [5]:
!pip install -q ipywidgets
!pip install -q torch langchain langgraph pydantic typing_extensions
!pip install -q langchain-huggingface langchain_community langchain_ollama
!pip install -q sentence_transformers faiss-cpu

In [None]:
import os
import asyncio

# Install Ollama
!curl -fsSL https://ollama.com/install.sh | sh

# Run Ollama in the background
os.environ['OLLAMA_HOST'] = '0.0.0.0'
!nohup ollama serve &

# Give it a moment to start up
await asyncio.sleep(5)

# Model for the RAG application
!ollama pull qwen3:8b

# Evaluator model
!ollama pull qwen3:30b

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
nohup: appending output to 'nohup.out'
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G

In [7]:
import html
import json
import re
from typing import List, Any, Union

import torch
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output
from typing_extensions import TypedDict

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.schema import Document, StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from sentence_transformers.cross_encoder import CrossEncoder
from langgraph.graph import StateGraph, END, START

# --- Global Variables ---
# Shared handles, all assigned by load_and_prepare_rag(); they stay None until
# that function has run, so the node functions below must not be called earlier.
retriever = None  # queried by retrieve_node
reranker = None  # scored against by rerank_documents_node
rag_app = None  # compiled graph, invoked by the UI callback and the evaluation loop

# --- Graph State Definition ---
class GraphState(TypedDict):
    """State dictionary handed from node to node in the RAG graph."""
    # The user's question; supplied by the caller of rag_app.invoke().
    question: str
    # Final answer text; written by generate_node.
    generation: str
    # Document chunks while retrieving/filtering/reranking; generate_node replaces
    # them with the chunks' "source_record" metadata values (presumably dicts).
    documents: Union[List[Document], List[dict]]
    # The LLM used for grading and generation; supplied by the caller of rag_app.invoke().
    llm: Any

# --- Node Functions (Retrieve, Filter, Generate) ---

def retrieve_node(state):
    """Look up candidate chunks for the question in the module-level retriever.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        A state update ``{"documents": <retrieved chunks>}``.
    """
    hits = retriever.invoke(state["question"])

    # Progress log so each graph step is visible in the cell output.
    print("---NODE: RETRIEVE (Completed)---")
    print(f"  Retrieved {len(hits)} document chunks.")

    return dict(documents=hits)


def _is_relevant_verdict(response_str):
    """Return True when the grader's last word is 'y' (or 'yes').

    Any ``<think>...</think>`` block is removed first. The verdict is the final
    run of letters in what remains, so surrounding quotes, asterisks or
    punctuation do not matter, and a longer last word that merely contains the
    letter "y" (e.g. "only") is not mistaken for a positive grade.
    """
    verdict_text = re.sub(r"<think>.*?</think>", "", response_str, flags=re.DOTALL).lower()
    words = re.findall(r"[^\W\d_]+", verdict_text)
    return bool(words) and words[-1] in ("y", "yes")


def documents_filter_node(state):
    """Ask the LLM to grade each retrieved chunk and keep only the relevant ones.

    Each chunk is graded on its ``source_record`` metadata (not on the chunk
    text); chunks without that metadata are dropped without being graded.

    Args:
        state: Graph state; reads ``question``, ``documents`` and ``llm``.

    Returns:
        A state update ``{"documents": <chunks graded as relevant>}``.
    """
    question = state["question"]
    documents = state["documents"]
    llm = state["llm"]

    # Nothing to grade: skip building the grader chain altogether.
    if not documents:
        print("---NODE: GRADE DOCUMENTS (Completed)---")
        print("")
        return {"documents": []}

    system = """You are a document relevance grader for a chatbot that assists in Vietnamese traditional medicine. Your goal is to filter out document that is not relevant to the user's question.

    Instructions:
    1. A document is considered RELEVANT ('y') if it contains information that is relevant to the user's question, even if it does not cover every single detail in the question.
    2. A document is NOT RELEVANT ('n') only if it is completely off-topic.
    3. Your job is to filter out entirely irrelevant documents, not to answer the question. Do not be overly strict.

    Conclude with a single character: 'y' if relevant or 'n' if not relevant. /no_think"""

    prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("human", "Document:\n\n{document}\n\nQuestion: {question}"),
    ])
    grader = prompt | llm | StrOutputParser()

    filtered_docs = []
    grading_results_log = []

    for d in documents:
        doc_content = d.metadata.get("source_record")
        if not doc_content:
            continue
        response_str = grader.invoke({"question": question, "document": doc_content})

        grade = "no"
        if _is_relevant_verdict(response_str):
            grade = "yes"
            filtered_docs.append(d)

        # A record may carry an explicit null name; slicing None would raise.
        raw_name = doc_content.get('name', 'Unknown')
        doc_name = ('Unknown' if raw_name is None else str(raw_name))[:40]
        grading_results_log.append(f"  - Doc: '{doc_name}...' -> Grade: {grade.upper()}")

    print("---NODE: GRADE DOCUMENTS (Completed)---")
    print("\n".join(grading_results_log))

    return {"documents": filtered_docs}


def rerank_documents_node(state):
    """Order the surviving chunks with the cross-encoder and keep the best ones.

    Chunks are scored against the question on their ``page_content``, sorted by
    score (highest first), and kept while their score is above 0.3, up to a
    maximum of five.

    Args:
        state: Graph state; reads ``question`` and ``documents``.

    Returns:
        A state update ``{"documents": <kept chunks, best first>}``.
    """
    print("---NODE: RERANK DOCUMENTS---")
    question = state["question"]
    candidates = state["documents"]

    if not candidates:
        return {"documents": []}

    # The reranker takes [question, text] pairs, one per candidate chunk.
    pairs = [[question, doc.page_content] for doc in candidates]
    scores = reranker.predict(pairs)

    # Highest score first.
    ranked = sorted(zip(scores, candidates), key=lambda pair: pair[0], reverse=True)

    kept = []
    log_lines = []
    print("  Reranking results:")
    for score, doc in ranked:
        record = doc.metadata.get("source_record", {})
        label = record.get('name', 'Unknown')[:40]
        log_lines.append(f"  - Doc: '{label}...' -> Score: {score:.4f}")
        if score > 0.3:
            kept.append(doc)
            # Stop as soon as the top five relevant chunks are collected.
            if len(kept) >= 5:
                break

    print("\n".join(log_lines))
    print(f"  Passing {len(kept)} documents to the generator.")
    return {"documents": kept}


def generate_node(state):
    """Produce the final Vietnamese answer from the reranked chunks.

    The prompt context is the list of ``source_record`` metadata values of the
    chunks, not the chunk text itself.

    Args:
        state: Graph state; reads ``question``, ``documents`` and ``llm``.

    Returns:
        A state update with ``generation`` (the raw model output) and
        ``documents`` replaced by the source records used as context.
    """
    llm = state["llm"]
    question = state["question"]
    documents = state["documents"]

    # Keep only entries that expose metadata with a non-empty source record.
    source_records = []
    for doc in documents:
        if not hasattr(doc, 'metadata'):
            continue
        record = doc.metadata.get("source_record")
        if record:
            source_records.append(record)

    prompt_text = """You are an assistant in Vietnamese traditional medicine. Your task is to synthesize information from all provided documents to give a comprehensive answer to the user's question.

    Instructions:
    1.  Carefully read the user's question and analyze **every document** provided in the context.
    2.  Identify all documents from the context that are relevant to the user's question. If all the documents are not relevant, tell the user that no relevant document could be found.
    3.  Construct a final answer that begins with a clear introductory sentence.
    4.  Then, present the relevant medicine in a numbered list. For each medicine, state its name and clearly explain how the properties and its used is helpful. If the medicine has side-effect please tell it to the user.
    5.  Conclude with a summary statement if appropriate.
    6.  Do not make up information. Base your entire answer ONLY on the provided context.
    7.  Your final, synthesized answer must be in **Vietnamese**.

    Context:
    {context}

    Question: {question}
    Synthesized Answer (in Vietnamese):"""

    rag_chain = PromptTemplate.from_template(prompt_text) | llm | StrOutputParser()
    payload = {"context": source_records, "question": question}
    generation = rag_chain.invoke(payload)
    print("---NODE: GENERATE (Completed)---")

    return {"generation": generation, "documents": source_records}


def load_and_prepare_rag():
    """Build the retriever, the reranker and the compiled RAG graph.

    Reads ``./merged_data.json``, turns every record into a Document whose
    metadata keeps the full record, splits and embeds the documents into a
    FAISS index, loads the cross-encoder reranker, and wires the graph
    retrieve -> documents_filter_node -> rerank_documents -> generate.

    Side effects:
        Assigns the module-level ``retriever``, ``reranker`` and ``rag_app``.
    """
    global retriever, rag_app, reranker

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"--- Using device: {device} ---")

    with open('./merged_data.json', 'r', encoding='utf-8') as f:
        records = json.load(f)

    # One Document per record; the record itself travels along in the metadata
    # so later nodes can work on the structured data rather than the flat text.
    documents = [
        Document(
            page_content=(
                f"Tên vị thuốc: {record.get('name', '')}. "
                f"Chi tiết: {record.get('detail', '')}. "
                f"Tóm tắt: {record.get('summaried', '')}"
            ),
            metadata={'source_record': record},
        )
        for record in records
    ]
    splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=40)
    chunks = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        model_kwargs={"device": device, "trust_remote_code": True},
    )
    retriever = FAISS.from_documents(chunks, embeddings).as_retriever(search_kwargs={"k": 30})
    print("✅ Retriever đã sẵn sàng!")

    reranker = CrossEncoder(
        'BAAI/bge-reranker-v2-m3',
        max_length=8192,
        device=device,
        trust_remote_code=True,
    )
    print("✅ Reranker đã sẵn sàng!")

    workflow = StateGraph(GraphState)
    node_table = (
        ("retrieve", retrieve_node),
        ("documents_filter_node", documents_filter_node),
        ("rerank_documents", rerank_documents_node),
        ("generate", generate_node),
    )
    for node_name, node_fn in node_table:
        workflow.add_node(node_name, node_fn)

    # Linear pipeline. To bypass the LLM grading step, connect "retrieve"
    # straight to "rerank_documents" instead.
    pipeline = [START, "retrieve", "documents_filter_node", "rerank_documents", "generate", END]
    for source, target in zip(pipeline, pipeline[1:]):
        workflow.add_edge(source, target)

    rag_app = workflow.compile()
    print("✅ Graph đã được biên dịch và sẵn sàng!")

# Build the retriever, reranker and compiled graph; this populates the
# module-level retriever / reranker / rag_app used by the cells below.
load_and_prepare_rag()

--- Using device: cpu ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

✅ Retriever đã sẵn sàng!


config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

✅ Reranker đã sẵn sàng!
✅ Graph đã được biên dịch và sẵn sàng!


## Chatbot UI

In [8]:
import re

# Question entry box.
input_box = widgets.Textarea(
    placeholder='Hãy nhập câu hỏi của bạn...',
    layout={'width': '90%', 'height': '100px'},
)

# Button that submits the question to the RAG graph.
submit_button = widgets.Button(
    description='Gửi câu hỏi',
    button_style='success',
    icon='paper-plane',
)

# Bordered area where the answer, reasoning and sources are rendered.
output_area = widgets.Output(
    layout={'border': '1px solid black', 'padding': '10px'},
)

def on_button_click(b):
    """Run the RAG graph for the typed question and render the result.

    Shows the answer as Markdown, the model's ``<think>`` block (if any) in a
    collapsed accordion, and the source records used as context in a second
    collapsed accordion. Any exception is reported inside the output area
    instead of propagating.

    Args:
        b: The button that was clicked; it is disabled while the request runs.
    """
    query = input_box.value.strip()

    with output_area:
        clear_output()
        # Validate first so an empty submission never shows the "processing" message.
        if not query:
            print("⚠️ Vui lòng nhập câu hỏi.")
            return
        print("🤔 Đang xử lý, vui lòng chờ...")

        # Prevent a second click from queueing another run while this one is busy.
        b.disabled = True
        try:
            # Reasoning is left at the model default so a <think> block, when
            # present in the output, can be shown to the user below.
            llm = OllamaLLM(model="qwen3:8b", num_ctx=10000)
            final_state = rag_app.invoke({"question": query, "llm": llm})

            generation = final_state.get('generation', "Không thể tạo câu trả lời.")
            documents = final_state.get('documents', [])

            clear_output()

            # Parse for <think> tags and create dropdown for it
            match = re.search(r"<think>(.*?)</think>", generation, re.DOTALL)

            if match:
                thinking_process = match.group(1).strip()
                final_answer = re.sub(r"<think>.*?</think>", "", generation, flags=re.DOTALL).strip()

                # Escape the model text: it is plain text, and raw '<' or '&'
                # would otherwise be interpreted as markup by the HTML widget.
                think_html = f"<pre><code>{html.escape(thinking_process)}</code></pre>"
                think_accordion = widgets.Accordion(children=[widgets.HTML(think_html)])
                think_accordion.set_title(0, 'Xem quá trình suy luận 🤔')
                think_accordion.selected_index = None  # Start collapsed

                display(think_accordion)
                display(Markdown(f"### 💬 Câu trả lời của Bot:\n{final_answer}"))
            else:
                display(Markdown(f"### 💬 Câu trả lời của Bot:\n{generation}"))

            # Display sources in a collapsible dropdown
            if documents:
                source_items = []
                for i, doc in enumerate(documents):
                    doc_json = json.dumps(doc, indent=2, ensure_ascii=False, default=str)
                    # <pre> keeps the JSON indentation; escaping keeps record
                    # text from being parsed as HTML.
                    source_items.append(
                        f"<li><b>Nguồn {i+1}:</b> <pre><code>{html.escape(doc_json)}</code></pre></li>"
                    )
                sources_html = "<ul>" + "".join(source_items) + "</ul>"

                sources_accordion = widgets.Accordion(children=[widgets.HTML(sources_html)])
                sources_accordion.set_title(0, 'Xem các nguồn tài liệu đã sử dụng 📚')
                sources_accordion.selected_index = None  # Start collapsed

                display(sources_accordion)

        except Exception as e:
            clear_output()
            display(Markdown(f"**Lỗi:** Gặp sự cố trong quá trình xử lý: `{e}`"))
        finally:
            b.disabled = False

# Register the click handler, then render the input box, button and output
# area stacked vertically.
submit_button.on_click(on_button_click)
display(widgets.VBox([input_box, submit_button, output_area]))

VBox(children=(Textarea(value='', layout=Layout(height='100px', width='90%'), placeholder='Hãy nhập câu hỏi củ…

## Đánh giá mô hình

In [9]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables import RunnableLambda
import pandas as pd
import re

In [10]:
# Judge model used by the three evaluation chains defined below.
# NOTE(review): reasoning=None presumably leaves the model's default thinking
# behaviour on, which is what yields the <think> block parsed later — confirm
# against the installed langchain_ollama version.
judge_llm = OllamaLLM(model="qwen3:30b", num_ctx=10000, reasoning=None)

In [11]:
# Custom parser function to extract <think> tags and the JSON object
def _coerce_score(value):
    """Convert a raw JSON score to float; return None when it is not numeric."""
    if value is None or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def parse_think_and_json(llm_output: str) -> dict:
    """
    Parses the raw LLM output to extract reasoning from <think> tags and a score from a JSON object.

    The JSON verdict is searched only in the text outside the <think> block, so
    braces in the model's reasoning cannot corrupt the match. Every decodable
    JSON object is inspected (nested objects and surrounding code fences are
    fine); the last one that has a "score" key wins.

    Args:
        llm_output: Raw text returned by the judge model.

    Returns:
        ``{"score": float | None, "reasoning": str}``. ``score`` is None when no
        object with a numeric "score" was found. ``reasoning`` is the content of
        the first <think> block, or a fixed placeholder when there is none; it
        is kept even when the score cannot be parsed.
    """
    # Find content within <think> tags
    think_match = re.search(r'<think>(.*?)</think>', llm_output, re.DOTALL)
    reasoning = think_match.group(1).strip() if think_match else "No reasoning found in <think> tags."

    # Look for the verdict outside the reasoning block only.
    verdict_text = re.sub(r'<think>.*?</think>', '', llm_output, flags=re.DOTALL)

    decoder = json.JSONDecoder()
    raw_score = None
    brace = verdict_text.find('{')
    while brace != -1:
        try:
            payload, end = decoder.raw_decode(verdict_text, brace)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next opening brace.
            brace = verdict_text.find('{', brace + 1)
            continue
        if isinstance(payload, dict) and 'score' in payload:
            raw_score = payload['score']
        # Continue after the decoded object so a later verdict overrides it.
        brace = verdict_text.find('{', end)

    return {"score": _coerce_score(raw_score), "reasoning": reasoning}

In [12]:
# --- Answer Relevance ---
# Judge prompt: rates on a 0.0-1.0 scale how well the generated answer addresses
# the question, explicitly ignoring factual correctness.
# Template variables: {question}, {answer}.
# The model is asked to end with a JSON object holding a single "score" key.
answer_relevance_prompt = ChatPromptTemplate.from_template(
    """You are an expert evaluator of a RAG system. Your single task is to assess if a generated answer is relevant to a user's question.

    **--- CRITICAL INSTRUCTION 1: IGNORE FACTUALITY ---**
    You must completely IGNORE whether the information in the answer is factually correct. Your evaluation must be based *only* on whether the answer's topic and content directly address the user's question. Fact-checking is handled by a different evaluator.

    **--- CRITICAL INSTRUCTION 2: HANDLING ADDITIONAL INFORMATION ---**
    The answer may list several medicines. For each medicine, it might describe multiple uses. **If a medicine is included because at least one of its uses is relevant to the question, you should NOT penalize the answer for also listing its other, non-relevant uses.** This additional information is considered acceptable context. Only penalize the answer if it includes medicines whose properties are *entirely* unrelated to the user's question.

    **Scoring Guide (continuous scale from 0.0 to 1.0):**
    - 1.0: The answer is perfectly relevant and directly addresses all aspects of the question, following the rules above.
    - 0.75: The answer is relevant and addresses the main points of the question.
    - 0.5: The answer is partially relevant but incomplete or tangential.
    - 0.25: The answer has minor relevance but does not address the main point of the question.
    - 0.0: The answer is completely irrelevant to the question.

    **User Question:**
    {question}

    **Generated Answer:**
    {answer}

    Provide your final evaluation as a JSON object with a single key: "score".

    JSON Response:
    """
)

# --- Context Relevance ---
# Judge prompt: rates on a 0.0-1.0 scale whether the retrieved context contains
# the information needed to answer the question; extra detail is tolerated.
# Template variables: {question}, {context}.
# The model is asked to end with a JSON object holding a single "score" key.
context_relevance_prompt = ChatPromptTemplate.from_template(
    """You are an expert evaluator assessing the usefulness of retrieved documents for a RAG system.

    **--- CRITICAL INSTRUCTION: Understand the Goal ---**
    The "Retrieved Context" is **NOT** the final answer. It is the raw information a separate AI will use to generate an answer. Your task is to judge if the context **CONTAINS** the necessary information for that AI to *create* a good answer. The context can contain extra, irrelevant details; this is acceptable as long as the core, useful information is present.

    **Scoring Guide (continuous scale from 0.0 to 1.0):**
    - 0.0: The context is completely irrelevant to the question.
    - 0.25: The context has minor relevance but is missing key information.
    - 0.5: The context is partially relevant but contains a lot of noise or irrelevant details.
    - 0.75: The context is highly relevant and contains the necessary information to answer the question.
    - 1.0: The context is perfectly relevant, and contains everything needed to give a complete answer.

    **User Question:**
    {question}

    **Retrieved Context:**
    {context}

    Provide your final evaluation as a JSON object with a single key: "score".

    JSON Response:
    """
)

# --- Groundedness and Completeness (Faithfulness) ---
# Judge prompt: a single 0.0-1.0 score combining groundedness (no claims beyond
# the context) and completeness (no critical omissions such as warnings).
# Template variables: {context}, {answer}. The question is not part of this template.
# The model is asked to end with a JSON object holding a single "score" key.
faithfulness_score_prompt = ChatPromptTemplate.from_template(
    """You are an expert evaluator. Your task is to assess the quality of a generated answer based on a provided context.
    You must evaluate TWO aspects in a single score:
    1.  **Groundedness**: Is the answer factually supported by the context? Penalize any claims in the answer that are not found in the context.
    2.  **Completeness**: Does the answer omit any critical information from the context, especially warnings, side effects, or contraindications?

    **Scoring Guide (continuous scale from 0.0 to 1.0):**
    - 1.0: The answer is both fully supported by the context (no hallucinations) AND includes all critical details.
    - 0.75: The answer is fully supported but has a minor omission, OR it is mostly supported with no critical omissions.
    - 0.5: The answer has some unsupported claims OR omits important information (e.g., a common side effect).
    - 0.25: The answer has significant unsupported claims OR omits critical safety information (e.g., a severe warning).
    - 0.0: The answer is a complete hallucination or is dangerously misleading due to severe omissions.

    **Provided Context:**
    {context}

    **Generated Answer:**
    {answer}

    Provide your final evaluation as a JSON object with a single key: "score".

    JSON Response:
    """
)

# Each evaluation chain formats its prompt, sends it to the judge model, and
# reduces the raw text to {"score": ..., "reasoning": ...} via parse_think_and_json.
eval_answer_relevance_chain = answer_relevance_prompt | judge_llm | RunnableLambda(parse_think_and_json)
eval_context_relevance_chain = context_relevance_prompt | judge_llm | RunnableLambda(parse_think_and_json)
eval_faithfulness_score_chain = faithfulness_score_prompt | judge_llm | RunnableLambda(parse_think_and_json)

In [14]:
# Generator model under evaluation (same model the chatbot UI uses).
llm = OllamaLLM(model="qwen3:8b", num_ctx=10000, reasoning=None)

# One question per line. Blank lines are skipped: an empty question would
# otherwise run through the whole pipeline and be counted in the averages.
with open('./evaluated_questions.txt', 'r', encoding='utf-8') as f:
    eval_questions = [line.strip() for line in f if line.strip()]
evaluation_results = []

print("Starting evaluation...")

for i, question in enumerate(eval_questions):
    print(f"  Evaluating question {i+1}/{len(eval_questions)}: \"{question}\"")
    # 1. Get the response from RAG chain
    inputs = {"question": question, "llm": llm}
    rag_response = rag_app.invoke(inputs)
    documents = rag_response.get('documents', [])
    raw_output = rag_response.get('generation', "Không thể tạo câu trả lời.")
    # Parses the raw output from the generator LLM to separate the <think> tags from the final answer.
    think_match = re.search(r'<think>(.*?)</think>', raw_output, re.DOTALL)
    if think_match:
        reasoning = think_match.group(1).strip()
        # The final answer is everything *after* the closing </think> tag
        answer = raw_output[think_match.end():].strip()
    else:
        # If no think tag is found, the whole output is the answer
        reasoning = "No <think> tag found in generator output."
        answer = raw_output.strip()

    # 2. Run the evaluation chains
    ar_result = eval_answer_relevance_chain.invoke({"question": question, "answer": answer})
    cr_result = eval_context_relevance_chain.invoke({"question": question, "context": documents})
    f_result = eval_faithfulness_score_chain.invoke({"question": question, "context": documents, "answer": answer})

    # 3. Store the results
    evaluation_results.append({
        "question": question,
        "answer": answer,
        "documents": documents,
        "llm_reasoning": reasoning,
        "answer_relevance_score": ar_result.get('score'),
        "answer_relevance_reasoning": ar_result.get('reasoning'),
        "context_relevance_score": cr_result.get('score'),
        "context_relevance_reasoning": cr_result.get('reasoning'),
        "faithfulness_score": f_result.get('score'),
        "faithfulness_reasoning": f_result.get('reasoning')
    })

print("Evaluation complete!")

Starting evaluation...
  Evaluating question 1/60: "Vị thuốc nào trong tài liệu có tác dụng bổ huyết?"
---NODE: RETRIEVE (Completed)---
  Retrieved 30 document chunks.
---NODE: GRADE DOCUMENTS (Completed)---
  - Doc: 'Không có tên khác được liệt kê....' -> Grade: YES
  - Doc: '...' -> Grade: YES
  - Doc: 'KÊ HUYẾT ĐẰNG...' -> Grade: YES
  - Doc: 'Lộc Giác Giao...' -> Grade: YES
  - Doc: 'VÔ TÂM THẢO...' -> Grade: YES
  - Doc: 'TANG HOA...' -> Grade: YES
  - Doc: 'Thủy lật căn, Thủy lật tử, Xuyên cốt, Cố...' -> Grade: YES
  - Doc: 'Thịt con hạt đán...' -> Grade: YES
  - Doc: 'LỘC THẬN...' -> Grade: NO
  - Doc: 'Hà-xuân...' -> Grade: YES
  - Doc: 'Khô cận, Mã-cận, So-qui, Thủy-anh...' -> Grade: YES
  - Doc: 'Không có tên khác được liệt kê....' -> Grade: YES
  - Doc: 'Cốt-nột, A-từ-bột-tha, Nột-thú, On-nột-t...' -> Grade: NO
  - Doc: 'TRƯ HUYẾT...' -> Grade: YES
  - Doc: 'Cam-kỷ-tử, Kỷ-tử, Táo-kỷ-vương, Thiên-ti...' -> Grade: YES
  - Doc: 'Khoáng mạch...' -> Grade: NO
  - Doc: 'Địa hoàng,

In [15]:
results_df = pd.DataFrame(evaluation_results)

# The judge may return a score as a string, or none at all. Coerce to numbers
# for the averages (unusable values become NaN and are excluded from the mean);
# the stored records in results_df are left exactly as collected.
score_columns = ['answer_relevance_score', 'context_relevance_score', 'faithfulness_score']
numeric_scores = results_df[score_columns].apply(pd.to_numeric, errors='coerce')

# Calculate average scores
avg_ar_score = numeric_scores['answer_relevance_score'].mean()
avg_cr_score = numeric_scores['context_relevance_score'].mean()
avg_f_score = numeric_scores['faithfulness_score'].mean()

print("Evaluation Summary (Scale: 0.0 to 1.0):")
print(f"  - Average Answer Relevance: {avg_ar_score:.2f}")
print(f"  - Average Context Relevance: {avg_cr_score:.2f}")
print(f"  - Average Faithfulness: {avg_f_score:.2f}")

# Make silent gaps visible: an average over fewer responses than questions
# should not be read as covering the whole question set.
unscored_counts = numeric_scores.isna().sum()
if unscored_counts.any():
    print("  - Responses without a usable score (excluded from the averages):")
    for column, count in unscored_counts[unscored_counts > 0].items():
        print(f"      {column}: {count}")

results_df.to_csv('evaluation_records.csv', index=False, encoding='utf-8-sig')

Evaluation Summary (Scale: 0.0 to 1.0):
  - Average Answer Relevance: 1.00
  - Average Context Relevance: 0.97
  - Average Faithfulness: 0.67
