In [None]:
! pip install langchain langchain-community langchain-core faiss-cpu gpt4all ctransformers unstructured bitsandbytes rouge nltk

Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting gpt4all
  Downloading gpt4all-2.8.2-py3-none-manylinux1_x86_64.whl.metadata (4.8 kB)
Collecting ctransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl.metadata (17 kB)
Collecting unstructured
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.61-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3

In [None]:
import os
import re
import json
import time
import random
import torch
from pprint import pprint
from sklearn.metrics import precision_recall_fscore_support
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.llms import CTransformers

In [None]:
DATA_PATH = '/kaggle/input/uet-rag/data'
QA_FILE = '/kaggle/input/uet-rag/qa.json'
VECTOR_DB_PATH = 'vectorstores/my_db'
EMBEDDING_MODEL_NAME = 'intfloat/multilingual-e5-large'
LLM_MODEL_PATH = '/kaggle/input/uet-rag/models/vinallama-7b-chat_q5_0.gguf'
LLM_MODEL_TYPE = 'llama'
TOP_K = 3
NUM_QUESTIONS = 50
RESULTS_FILE = 'results/results.json'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# VECTOR STORE CREATION
def create_vector_store():
    all_chunks = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)

    for file in os.listdir(DATA_PATH):
        if not file.endswith(".txt"):
            continue
        with open(os.path.join(DATA_PATH, file), 'r', encoding='utf-8') as f:
            lines = f.readlines()
            if len(lines) < 3:
                continue
            url = lines[0].replace("URL:", "").strip()
            title = lines[1].replace("Title:", "").strip()
            content = "".join(lines[2:]).strip()
            doc = Document(page_content=content, metadata={"source": file, "url": url, "title": title})
            chunks = splitter.split_documents([doc])
            all_chunks.extend(chunks)

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    db = FAISS.from_documents(all_chunks, embeddings)
    db.save_local(VECTOR_DB_PATH)
    return db


# LOAD MODELS
def load_vector_db():
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    return FAISS.load_local(VECTOR_DB_PATH, embeddings, allow_dangerous_deserialization=True)

def load_llm():
    return CTransformers(
        model=LLM_MODEL_PATH,
        model_type=LLM_MODEL_TYPE,
        config={
            "max_new_tokens": 512,
            "temperature": 0.2,
            "context_length": 2048,
            "repetition_penalty": 1.3,
            "top_k": 20,
            "top_p": 0.7,
            "stream": False,
            "threads": 4
        }
    )


# RETRIEVAL-AUGMENTED GENERATION
def get_answer(query, db, llm, top_k=TOP_K):
    docs = db.similarity_search(query, k=top_k)
    context = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"""Trả lời trực tiếp và ngắn gọn câu hỏi sau, không quá 50 từ, không cần thêm câu từ dẫn dắt, không dùng ký tự đặc biệt vào đáp án:
{query}.
Dựa vào thông tin sau:
{context}
"""
    return llm.invoke(prompt).strip()


# EVALUATION METRICS
def normalize_text(s):
    s = s.lower()
    s = re.sub(r'[^\w\s]', '', s)
    return re.sub(r'\s+', ' ', s).strip()

def compute_f1(pred, gold):
    pred_tokens = normalize_text(pred).split()
    gold_tokens = normalize_text(gold).split()
    common = set(pred_tokens) & set(gold_tokens)
    if not common:
        return 0.0
    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def evaluate(results):
    em_scores, f1_scores, bleu_scores, rouge_scores = [], [], [], []
    rouge = Rouge()
    smoothie = SmoothingFunction().method4

    for item in results:
        gold = item.get("answer", "")
        pred = item.get("llm_answer", "")
        norm_gold = normalize_text(gold)
        norm_pred = normalize_text(pred)

        em_scores.append(int(norm_gold == norm_pred))
        f1_scores.append(compute_f1(pred, gold))
        bleu_scores.append(sentence_bleu([norm_gold.split()], norm_pred.split(), smoothing_function=smoothie))
        try:
            rouge_score = rouge.get_scores(pred, gold)[0]['rouge-l']['f']
        except:
            rouge_score = 0.0
        rouge_scores.append(rouge_score)

    print("\n=== Evaluation Metrics ===")
    print(f"Exact Match (EM): {sum(em_scores)/len(em_scores):.4f}")
    print(f"F1 Score:         {sum(f1_scores)/len(f1_scores):.4f}")
    print(f"BLEU Score:       {sum(bleu_scores)/len(bleu_scores):.4f}")
    print(f"ROUGE-L Score:    {sum(rouge_scores)/len(rouge_scores):.4f}")


# TESTING ON Q/A SET
def test_qa_set(qa_file, db, llm, results_file='results.json'):
    with open(QA_FILE, 'r', encoding='utf-8') as f:
        qa_pairs = json.load(f)

    results = []
    for i in range(len(qa_pairs)):
        qa = qa_pairs[i]
        if isinstance(qa, dict):
            print(f'\nQuestion {i} / {len(qa_pairs)}')
            start = time.time()
            answer = get_answer(qa['question'], db, llm)
            qa['llm_answer'] = answer
            results.append(qa)
            # print(f'Time: {time.time() - start:.2f}s')
            print(f'Question: {qa["question"]}')
            print(f'LLM answer: {answer}')
    
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return results
    

In [5]:
create_vector_store()

  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
2025-05-23 18:34:49.429930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748025289.699279      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748025289.771040      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/160k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

<langchain_community.vectorstores.faiss.FAISS at 0x7bd90f0345d0>

In [6]:
db = load_vector_db()
llm = load_llm()

results = test_qa_set(QA_FILE, db, llm, RESULTS_FILE)


Question 0 / 63
Question: Học bổng Annex HKII năm học 2024-2025 được thông báo vào ngày nào?
LLM answer: Ngày 03 tháng 03 năm 2025.


Question 1 / 63
Question: Thông tin về học bổng K-T năm học 2024-2025 được thông báo vào ngày nào?
LLM answer: Ngày 28/02/2025.


Question 2 / 63
Question: Lễ trao học bổng Mitsubishi năm học 2024-2025 diễn ra khi nào?
LLM answer: Lễ trao học bổng Mitsubishi năm học 2024-2025 diễn ra vào ngày 21 tháng 2 năm 2025.


Question 3 / 63
Question: Trường Đại học Công nghệ tặng bao nhiêu suất học bổng Vietcombank năm học 2024-2025?
LLM answer: Trường Đại học Công nghệ tặng 20 suất học bổng Vietcombank năm học 2024-2025.


Question 4 / 63
Question: Tổng giá trị học bổng Vietcombank năm học 2024-2025 là bao nhiêu?
LLM answer: Tổng giá trị học bổng Vietcombank năm học 2024-2025 là 200.000.000đ. Mỗi suất học bổng trị giá 5.000.000đ.


Question 5 / 63
Question: Học bổng Vietcombank sẽ được chuyển cho sinh viên bằng hình thức nào?
LLM answer: Không có thông tin về hì

In [7]:
evaluate(results)


=== Evaluation Metrics ===
Exact Match (EM): 0.0952
F1 Score:         0.5124
BLEU Score:       0.3314
ROUGE-L Score:    0.4725
