In [1]:
!pip install -q \
  "torch>=2.0.0" \
  "transformers>=4.40.0" \
  "accelerate>=0.30.0" \
  "huggingface-hub>=0.23.0" \
  "sentence-transformers>=2.7.0" \
  "langchain>=0.2.0" \
  "langchain-core>=0.2.0" \
  "langchain-community>=0.1.0" \
  "langchain-text-splitters>=0.2.0" \
  "chromadb>=0.5.0" \
  "langchain-chroma>=0.2.0" \
  "pypdf>=4.2.0" \
  "langserve[all]>=0.1.0" \
  "fastapi>=0.115.0" \
  "uvicorn>=0.30.0" \
  "gradio>=5.0.0" \
  "langchain-huggingface"\
  "wget"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.9/475.9 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m122.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import sys

# Root folder of the project inside the notebook filesystem.
PROJECT_ROOT = "/content/rag_langchain"

# Create the data folder and the two package folders; exist_ok=True keeps
# the cell safe to re-run.
for relative_parts in (
    ("data_source", "generative_ai"),
    ("src", "base"),
    ("src", "rag"),
):
    os.makedirs(os.path.join(PROJECT_ROOT, *relative_parts), exist_ok=True)

# Work from the project root and put it on the import path (added only once).
os.chdir(PROJECT_ROOT)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Set before any tokenizer is created.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
%%bash
# Create empty __init__.py marker files in src/, src/base/ and src/rag/.
# `touch` leaves an already existing file's content untouched, so this cell can be re-run.
touch /content/rag_langchain/src/__init__.py
touch /content/rag_langchain/src/base/__init__.py
touch /content/rag_langchain/src/rag/__init__.py

In [4]:
import os
import wget

# Folder the PDF files are saved into.
DATA_DIR = "/content/rag_langchain/data_source/generative_ai"
# exist_ok=True: do not fail when the folder already exists (cell can be re-run).
os.makedirs(DATA_DIR, exist_ok=True)

# PDFs to download: each entry has a human-readable `title` (also used as the
# file name on disk) and a direct-download `url`.
pdf_links = [
    dict(
        title="Vòng lặp for và ứng dụng",
        url="https://docs.google.com/uc?export=download&id=1_zJnj5qORwzMH6vgftGzkYu4fFew6Pq4",
    ),
    dict(
        title="Giám sát hệ thống AI với Grafana và Prometheus",
        url="https://docs.google.com/uc?export=download&id=1gZWLJddiuLd-ZZ8j_Nmfu_r7fISGRW6J",
    ),
    dict(
        title="Các Thước Đo Đánh Giá Mô Hình Hồi Quy",
        url="https://docs.google.com/uc?export=download&id=1C-f9pNW0mkMxaakDcpliN3isTVqRQtR3",
    ),
    dict(
        title="A simple, strong baseline for Long-Term Forecasts",
        url="https://docs.google.com/uc?export=download&id=16KFeWi0ONqV3ZJYAgC_20y7bxUNGd9hU",
    ),
]

# Download every PDF that is not on disk yet. Files that already exist are
# skipped, so re-running the cell does not download them again.
for pdf_info in pdf_links:
    save_path = os.path.join(DATA_DIR, f"{pdf_info['title']}.pdf")
    if os.path.exists(save_path):
        continue
    try:
        wget.download(pdf_info["url"], out=save_path)
    except Exception as e:
        # Best effort: keep going with the remaining files, but say which
        # download failed instead of hiding it - a missing PDF otherwise only
        # shows up later as missing knowledge in the answers.
        print(f"Failed to download '{pdf_info['title']}' from {pdf_info['url']}: {e}")

In [5]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain_huggingface import HuggingFacePipeline

def get_hf_llm(
    model_name: str = "Qwen/Qwen2.5-3B-Instruct",
    temperature: float = 0.2,
    max_new_tokens: int = 450,
    **kwargs
):
    """Load a causal language model and wrap it as a LangChain LLM.

    Args:
        model_name: Hugging Face model id passed to ``from_pretrained`` for
            both the model and the tokenizer.
        temperature: Sampling temperature. A positive value enables sampling
            (with ``top_p=0.75``); ``0``, a negative value or ``None`` switches
            to greedy decoding, because sampling requires a strictly positive
            temperature.
        max_new_tokens: Upper bound on the number of generated tokens.
        **kwargs: Forwarded unchanged as ``model_kwargs`` to
            ``HuggingFacePipeline``.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline.
    """
    # `torch_dtype` is kept (the newer `dtype` spelling is only suggested by a
    # deprecation warning) so the call still works with the older transformers
    # releases allowed by the install cell.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        # Reuse EOS as the padding token id for generation.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.75)
    else:
        # Sampling parameters are omitted entirely: with do_sample=False they
        # are unused, and a non-positive temperature is not valid for sampling.
        generation_kwargs["do_sample"] = False

    model_pipeline = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **generation_kwargs
    )

    llm = HuggingFacePipeline(pipeline=model_pipeline, model_kwargs=kwargs)
    return llm

In [6]:
import re
import unicodedata
from typing import List

def clean_vietnamese_text(text: str) -> str:
    """Normalize extracted text while keeping its line structure.

    Steps:
      1. Normalize to Unicode NFC so composed and decomposed forms of the same
         accented letter become identical.
      2. Drop every character in a Unicode "C" category (control, format,
         surrogate, private use, unassigned) except newline and tab.
      3. Collapse each run of whitespace other than newline into one space.
      4. Remove spaces adjacent to newlines and collapse consecutive newlines
         into a single newline.

    Previously step 3 used ``\\s+``, which also replaced every newline with a
    space; the blank-line collapsing that followed could then never match and
    the returned text was always a single line.

    Args:
        text: Raw text, e.g. the content of one PDF page.

    Returns:
        The cleaned text without leading/trailing whitespace.
    """
    text = unicodedata.normalize('NFC', text)
    text = "".join(
        char for char in text
        if char in '\n\t' or not unicodedata.category(char).startswith('C')
    )
    # [^\S\n] = "whitespace that is not a newline" (spaces, tabs, NBSP, ...).
    text = re.sub(r'[^\S\n]+', ' ', text)
    # After the previous step at most one space can sit on each side of a newline.
    text = re.sub(r' ?\n ?', '\n', text)
    text = re.sub(r'\n{2,}', '\n', text)
    return text.strip()

In [7]:
import glob
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

class SimpleLoader:
    """Loads PDF files into documents whose page text has been cleaned."""

    def load_pdf(self, pdf_file: str):
        """Load one PDF and clean the text of every returned document.

        Args:
            pdf_file: Path of the PDF file.

        Returns:
            The documents produced by ``PyPDFLoader(...).load()``, with
            ``page_content`` replaced by ``clean_vietnamese_text(page_content)``.
        """
        docs = PyPDFLoader(pdf_file, extract_images=True).load()
        for doc in docs:
            doc.page_content = clean_vietnamese_text(doc.page_content)
        return docs

    def load_dir(self, dir_path: str) -> List:
        """Load every ``*.pdf`` file directly inside ``dir_path``.

        A file that fails to load is skipped, but the failure is reported
        through ``tqdm.write`` so that missing content is visible instead of
        silently producing a smaller (or empty) document list.

        Args:
            dir_path: Directory to scan (not recursive).

        Returns:
            All documents from the files that loaded successfully; may be
            empty when every file failed.

        Raises:
            ValueError: If the directory contains no ``*.pdf`` file.
        """
        # glob returns files in arbitrary order; sorting makes the document
        # order reproducible between runs.
        pdf_files = sorted(glob.glob(f"{dir_path}/*.pdf"))
        if not pdf_files:
            raise ValueError(f"No PDF files found in {dir_path}")

        all_docs = []
        for pdf_file in tqdm(pdf_files, desc="Loading PDFs"):
            try:
                all_docs.extend(self.load_pdf(pdf_file))
            except Exception as e:
                # Best effort: continue with the other files, but report it.
                # tqdm.write prints without breaking the progress bar.
                tqdm.write(f"Skipping {pdf_file}: {type(e).__name__}: {e}")
        return all_docs

class TextSplitter:
    """Small wrapper that configures a RecursiveCharacterTextSplitter."""

    def __init__(
        self,
        chunk_size: int = 400,
        chunk_overlap: int = 120,
    ):
        """Create the underlying splitter.

        Args:
            chunk_size: Passed through as ``chunk_size``; measured with ``len``.
            chunk_overlap: Passed through as ``chunk_overlap``.
        """
        splitter_options = dict(
            # Candidate separators, listed from paragraph break down to "".
            separators=["\n\n", "\n", " ", ""],
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def split(self, documents):
        """Return the result of ``split_documents`` for ``documents``."""
        chunks = self.splitter.split_documents(documents)
        return chunks

In [8]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

class VectorDB:
    """Chroma vector store backed by a Hugging Face embedding model.

    When documents are given they are written to the persisted collection;
    otherwise the collection stored in ``persist_dir`` is opened as-is.
    """

    def __init__(
        self,
        documents=None,
        embedding_model: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        collection_name: str = "vietnamese_docs",
        persist_dir: str = "/content/chroma_data",
    ):
        """Create the embedding function and build/open the collection.

        Args:
            documents: Documents to index, or ``None``/empty to only open the
                existing collection.
            embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
            collection_name: Name of the Chroma collection.
            persist_dir: Directory Chroma persists the collection in.
        """
        self.persist_dir = persist_dir
        self.collection_name = collection_name

        self.embedding = HuggingFaceEmbeddings(model_name=embedding_model)
        self.db = self._build_db(documents)

    @staticmethod
    def _stable_id(document) -> str:
        """Return a deterministic id derived from a document's text and metadata."""
        import hashlib  # local import: used only here

        metadata = getattr(document, "metadata", None) or {}
        fingerprint = repr(sorted(metadata.items(), key=lambda item: str(item[0])))
        payload = f"{document.page_content}\x00{fingerprint}"
        return hashlib.sha256(payload.encode("utf-8", "backslashreplace")).hexdigest()

    def _build_db(self, documents):
        """Open the collection, indexing ``documents`` first when provided.

        Every document gets an id derived from its content. Without explicit
        ids each run stores the same chunks again under fresh random ids, so a
        re-run of the notebook fills the persisted collection with duplicates
        and the retriever returns several copies of one chunk. With stable ids
        a re-run overwrites the existing entries instead (the LangChain Chroma
        wrapper is expected to upsert by id - see its API reference).
        """
        if not documents:
            return Chroma(
                collection_name=self.collection_name,
                embedding_function=self.embedding,
                persist_directory=self.persist_dir,
            )

        # Ids must be unique within one write, so identical chunks are
        # collapsed here; the first occurrence wins and order is preserved.
        unique_docs = {}
        for document in documents:
            unique_docs.setdefault(self._stable_id(document), document)

        return Chroma.from_documents(
            documents=list(unique_docs.values()),
            embedding=self.embedding,
            ids=list(unique_docs.keys()),
            collection_name=self.collection_name,
            persist_directory=self.persist_dir,
        )

    def get_retriever(self, search_kwargs: dict = None):
        """Return a similarity retriever over the collection.

        Args:
            search_kwargs: Search options for the retriever; defaults to
                ``{"k": 4}`` when omitted.
        """
        if search_kwargs is None:
            search_kwargs = {"k": 4}

        return self.db.as_retriever(
            search_type="similarity",
            search_kwargs=search_kwargs,
        )

In [9]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

class FocusedAnswerParser(StrOutputParser):
    """Output parser that keeps only a short, cleaned answer from the LLM text."""

    def parse(self, text: str) -> str:
        """Extract at most five sentences of the answer.

        Steps:
          1. Keep only what follows the last ``[TRẢ LỜI]:`` marker, if the
             marker is present (e.g. when the prompt is echoed in the output).
          2. Strip leading bullet characters from each line and join all lines
             into one line.
          3. Split on ``". "``, drop fragments of five characters or fewer, and
             keep the first five.
          4. Join the fragments with ``". "`` and make sure the result ends
             with sentence punctuation.

        The final period is only added when it is missing. Previously it was
        always appended, so an answer that already ended with ``.``, ``!`` or
        ``?`` came back with ``..``, ``!.`` or ``?.``.

        Args:
            text: Raw text produced by the LLM.

        Returns:
            The cleaned answer, or ``""`` when no usable sentence remains.
        """
        marker = "[TRẢ LỜI]:"
        text = text.strip()
        if marker in text:
            answer = text.split(marker)[-1].strip()
        else:
            answer = text

        answer = re.sub(r'^\s*[•\-\*]\s*', '', answer, flags=re.MULTILINE)
        answer = re.sub(r'\n+', ' ', answer)
        lines = [line.strip() for line in answer.split('. ') if line.strip() and len(line.strip()) > 5]
        if not lines:
            return ''

        result = '. '.join(lines[:5])
        if not result.endswith(('.', '!', '?')):
            result += '.'
        return result

class OfflineRAG:
    """Builds the retrieval-augmented QA chain around a given LLM."""

    def __init__(self, llm):
        """Store the LLM and prepare the prompt template and answer parser.

        Args:
            llm: The language model placed after the prompt in the chain.
        """
        # The template has two placeholders: {context} and {question}.
        prompt_text = """
Bạn là trợ lý AI phân tích tài liệu tiếng Việt.

[TÀI LIỆU]:
{context}

[CÂU HỎI]:
{question}

Hãy trả lời dựa trên tài liệu. Nếu tài liệu không có thông tin, nói rõ "Không có thông tin".
Trả lời đầy đủ thông tin (3-5 câu chi tiết), không thêm bất kỳ chi tiết nào ngoài tài liệu.
[TRẢ LỜI]:"""

        self.llm = llm
        self.prompt = PromptTemplate.from_template(prompt_text)
        self.answer_parser = FocusedAnswerParser()

    def get_chain(self, retriever):
        """Compose retriever -> prompt -> LLM -> answer parser.

        Args:
            retriever: Object piped into the context formatter with ``|``.

        Returns:
            The composed chain; the value it is invoked with is used both as
            the retrieval query and as the ``question`` of the prompt.
        """
        def format_docs(docs):
            # Keep each distinct chunk longer than 40 characters once, in
            # first-seen order (a dict preserves insertion order).
            unique_chunks = {}
            for doc in docs:
                chunk = doc.page_content.strip()
                if len(chunk) > 40:
                    unique_chunks.setdefault(chunk, None)
            return "\n\n".join(unique_chunks)

        chain_inputs = {
            "context": retriever | format_docs,
            "question": RunnablePassthrough(),
        }
        return chain_inputs | self.prompt | self.llm | self.answer_parser

In [None]:
# Wire the whole pipeline together: LLM -> documents -> vector store -> chain.
# The names defined here (notably `rag_chain`) are notebook globals that
# `answer_question` reads at call time.
os.chdir("/content/rag_langchain")

# Load the default model with the default generation settings.
llm = get_hf_llm()

data_dir = "/content/rag_langchain/data_source/generative_ai"

loader = SimpleLoader()
text_splitter = TextSplitter(chunk_size=400, chunk_overlap=120)

# Load every PDF in data_dir, then cut the pages into overlapping chunks.
raw_docs = loader.load_dir(data_dir)
split_docs = text_splitter.split(raw_docs)

# Index the chunks and expose a retriever that is configured with k=4.
vdb = VectorDB(documents=split_docs)
retriever = vdb.get_retriever(search_kwargs={"k": 4})

rag = OfflineRAG(llm)
rag_chain = rag.get_chain(retriever)

def answer_question(question: str) -> str:
    """Answer ``question`` with the global ``rag_chain``.

    Any exception raised by the chain is converted into the string
    ``"Error: <message>"`` instead of propagating.

    Args:
        question: The question passed to ``rag_chain.invoke``.

    Returns:
        The chain's result, or the error text when invocation fails.
    """
    try:
        result = rag_chain.invoke(question)
    except Exception as exc:
        return "Error: " + str(exc)
    return result

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

In [None]:
import gradio as gr

# CSS overrides for the two text areas; the selectors match the `elem_id`
# values given to the textboxes below.
custom_css = """
#question-area textarea {
    font-size: 35px;
    line-height: 1.5;
}
#answer-area textarea {
    font-size: 31px;
    line-height: 1.5;
}
"""

# Layout: one row with the question column (scale=1) next to the answer
# column (scale=2).
with gr.Blocks(title="RAG Vietnamese QA", css=custom_css) as demo:
    gr.Markdown("# RAG - Hỏi Đáp về Tài Liệu")

    with gr.Row():
        with gr.Column(scale=1):
            question_input = gr.Textbox(
                label="Câu hỏi",
                placeholder="Ví dụ: Vì sao classification lại không thể chỉ nhìn accuracy để đánh giá?",
                lines=3,
                elem_id="question-area"
            )
            submit_btn = gr.Button("Gửi", variant="primary")

        with gr.Column(scale=2):
            # Read-only box that displays the returned answer.
            answer_output = gr.Textbox(
                label="Câu trả lời",
                lines=6,
                interactive=False,
                elem_id="answer-area"
            )

    # Clicking the button calls answer_question(question) and writes the
    # returned string into the answer box.
    submit_btn.click(
        fn=answer_question,
        inputs=question_input,
        outputs=answer_output,
    )

# share=True additionally serves the app on a public gradio.live URL.
demo.launch(share=True)

  with gr.Blocks(title="RAG Vietnamese QA", css=custom_css) as demo:


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7359ab2ad818c52cb1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


