In [1]:
# Attach Google Drive to the Colab runtime so later cells can read and write
# files under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install -q PyMuPDF
! pip install -q pdfplumber

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m77.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import json
import re
from itertools import zip_longest

import fitz  # PyMuPDF
import pdfplumber

# 1. Chunk QUY CHẾ TỔ CHỨC VÀ QUẢN LÝ ĐÀO TẠO TRÌNH ĐỘ ĐẠI HỌC

In [4]:
def split_text_with_overlap(text: str, chunk_size: int = 1000, overlap: int = 200) -> list:
    """
    Split ``text`` into character-based sliding-window chunks with overlap.

    The text is stripped first; each window covers at most ``chunk_size``
    characters and consecutive windows start ``chunk_size - overlap``
    characters apart. Every chunk is stripped again, so a chunk may be
    shorter than ``chunk_size``, and whitespace-only windows are dropped.

    Args:
        text: the text to split.
        chunk_size: maximum length of each chunk, in characters.
        overlap: number of characters shared by two consecutive windows.

    Returns:
        A list of non-empty chunk strings (empty list for blank input).

    Raises:
        ValueError: if ``overlap`` is negative or ``chunk_size <= overlap``.
    """
    if overlap < 0:
        # A negative overlap would make the step larger than the window and
        # silently skip text between chunks.
        raise ValueError("overlap phải >= 0")
    if chunk_size <= overlap:
        raise ValueError("chunk_size phải > overlap")

    text = text.strip()
    if not text:
        return []

    chunks = []
    step = chunk_size - overlap
    start = 0
    text_len = len(text)

    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end == text_len:
            # The window already reached the end of the text; any further
            # window would be fully contained in this one (a redundant tail).
            break
        start += step

    return chunks


In [5]:
def chunk_qd1830_for_rag(json_data,
                         chunk_size: int = 1000,
                         overlap: int = 200,
                         include_tables: bool = True):
    """
    Turn a parsed regulation document into a flat list of chunk dicts.

    The document is walked as ``chapters -> articles``. Each article's
    ``text`` is split with ``split_text_with_overlap`` into ``"text"`` chunks,
    and (optionally) every table row becomes one ``"table_row"`` chunk whose
    text is ``"<caption>. <header>: <cell> | ..."``.

    Missing keys and explicit ``null`` values (chapters, articles, tables,
    headers, rows, titles, caption, text) are treated as empty.

    Args:
        json_data: dict with optional keys ``doc_id``, ``title`` and
            ``chapters``; each chapter has ``title`` and ``articles``; each
            article has ``title``, ``text`` and ``tables``; each table has
            ``caption``, ``headers`` and ``rows``.
        chunk_size: sliding-window size passed to ``split_text_with_overlap``.
        overlap: sliding-window overlap passed to ``split_text_with_overlap``.
        include_tables: when True, also emit one chunk per table row.

    Returns:
        A list of dicts. Every dict has ``chunk_id``, ``doc_id``,
        ``doc_title``, ``chapter_title``, ``article_title``,
        ``section_title``, ``chunk_index``, ``chunk_type`` and ``text``;
        table-row chunks additionally carry ``table_caption``.
        ``chunk_id`` uses one counter shared by text and table chunks, so ids
        are unique within the returned list.
    """
    doc_id = json_data.get("doc_id", "qd_1830_2021")
    doc_title = json_data.get("title", "")

    chunks = []
    chunk_id_counter = 0

    # `or []` (rather than a .get default) also covers keys explicitly set to null.
    for chapter in json_data.get("chapters") or []:
        chapter_title = chapter.get("title") or ""

        for article in chapter.get("articles") or []:
            article_title = article.get("title") or ""
            # strip(" -") removes the dangling separator when either title is empty.
            full_section_title = f"{chapter_title} - {article_title}".strip(" -")

            # --- Text chunks of the article ---
            text = (article.get("text") or "").strip()
            if text:
                text_chunks = split_text_with_overlap(
                    text, chunk_size=chunk_size, overlap=overlap
                )
                for i, chunk_text in enumerate(text_chunks):
                    chunks.append({
                        "chunk_id": f"{doc_id}_art_{chunk_id_counter}",
                        "doc_id": doc_id,
                        "doc_title": doc_title,
                        "chapter_title": chapter_title,
                        "article_title": article_title,
                        "section_title": full_section_title,
                        "chunk_index": i,
                        "chunk_type": "text",
                        "text": chunk_text
                    })
                    chunk_id_counter += 1

            # --- (Optional) one chunk per table row ---
            if not include_tables:
                continue

            for table in article.get("tables") or []:
                caption = table.get("caption") or ""
                headers = table.get("headers") or []
                rows = table.get("rows") or []

                for row_idx, row in enumerate(rows):
                    row_text = _format_table_row(headers, row or [])
                    if not row_text:
                        # A row without any cell content would only repeat
                        # the caption/headers; skip it. chunk_index still
                        # reflects the row's position in the table.
                        continue

                    # Avoid a leading ". " when the table has no caption.
                    table_chunk_text = f"{caption}. {row_text}" if caption else row_text

                    chunks.append({
                        "chunk_id": f"{doc_id}_tbl_{chunk_id_counter}",
                        "doc_id": doc_id,
                        "doc_title": doc_title,
                        "chapter_title": chapter_title,
                        "article_title": article_title,
                        "section_title": full_section_title,
                        "chunk_index": row_idx,
                        "chunk_type": "table_row",
                        "table_caption": caption,
                        "text": table_chunk_text
                    })
                    chunk_id_counter += 1

    return chunks


def _clean_cell(value) -> str:
    """Return a table header/cell as stripped text; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def _format_table_row(headers, row) -> str:
    """Join one table row into ``"header: cell | ..."`` text, or ``""`` if no cell has content."""
    pairs = []
    has_content = False
    # zip_longest pads the shorter side with None, so cells without a matching
    # header (or tables without headers at all) are kept instead of dropped.
    for raw_header, raw_cell in zip_longest(headers, row):
        header = _clean_cell(raw_header)
        cell = _clean_cell(raw_cell)
        if cell:
            has_content = True
        if header:
            pairs.append(f"{header}: {cell}")
        elif cell:
            pairs.append(cell)
    return " | ".join(pairs) if has_content else ""


In [9]:
# Locations on the mounted Drive: the parsed regulation JSON (input) and the
# chunk list written for the embedding step (output).
PARSED_JSON_PATH = "/content/drive/MyDrive/NLP/RAG_final/Parsed_pdfs/qd_1830.json"
CHUNKS_JSON_PATH = "/content/drive/MyDrive/NLP/RAG_final/Chunking/qd_1830_chunks_debug.json"

# 1) Read the previously extracted document structure.
with open(PARSED_JSON_PATH, mode="r", encoding="utf-8") as parsed_file:
    data = json.load(parsed_file)

# 2) Build the chunks. The author's estimate for these settings is roughly
#    250 tokens per chunk with about 50 tokens (20%) of overlap.
chunks = chunk_qd1830_for_rag(data, chunk_size=1000, overlap=200)

# 3) Persist the chunks as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
with open(CHUNKS_JSON_PATH, mode="w", encoding="utf-8") as chunks_file:
    json.dump(chunks, chunks_file, ensure_ascii=False, indent=2)