In [None]:
# Mount Google Drive into the Colab filesystem; the PDF read and the JSON
# written later in this notebook both live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install -q PyMuPDF
! pip install -q pdfplumber

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import re
from pathlib import Path

import fitz  # PyMuPDF
import pdfplumber

# 1. Parse QUY CHẾ TỔ CHỨC VÀ QUẢN LÝ ĐÀO TẠO TRÌNH ĐỘ ĐẠI HỌC

In [None]:
# Chapter heading such as "Chương I" or "Chương II. <title>": group 1 is the
# Roman numeral, group 2 the (possibly empty) title found on the same line.
# The \b after the numeral matters because IGNORECASE lets [IVXLC] match
# lowercase letters as well; without it an ordinary line such as
# "Chương cuối ..." would be parsed as a chapter numbered "c".
CHAPTER_RE = re.compile(r"^Chương\s+([IVXLC]+)\b\.?\s*(.*)$", re.IGNORECASE)
# Article heading "Điều <number>. <title>": the dot and a non-empty title are required.
ARTICLE_RE = re.compile(r"^Điều\s+(\d+)\.\s*(.+)$")
# Table caption "Bảng <number>[.] [<title>]": group 2 may be empty.
TABLE_CAPTION_RE = re.compile(r"^Bảng\s+(\d+)\.?\s*(.*)$", re.IGNORECASE)

In [None]:
def _collect_page_tables(page) -> list:
    """Return every non-empty table of a pdfplumber page as a {"headers", "rows"} dict."""
    tables = []
    for raw_table in page.extract_tables() or []:
        if not raw_table:
            continue
        # A raw table is a list of rows, each row a list of cells; the first
        # row is treated as the header row.
        tables.append({"headers": raw_table[0], "rows": raw_table[1:]})
    return tables


def extract_qd1830_to_json(
    pdf_path: str,
    doc_id: str = "qd_1830_2021",
    title: str = "QUY CHẾ TỔ CHỨC VÀ QUẢN LÝ ĐÀO TẠO TRÌNH ĐỘ ĐẠI HỌC",
) -> dict:
    """Parse a regulation PDF into a chapter -> article -> table structure.

    The text of every page is scanned line by line:

    * a line matching ``CHAPTER_RE`` opens a new chapter;
    * a line matching ``ARTICLE_RE`` opens a new article in the current
      chapter (a placeholder chapter titled ``"Chương ?"`` is created when
      an article appears before any chapter heading);
    * a line matching ``TABLE_CAPTION_RE`` inside an article attaches the
      next not-yet-used table extracted from the same page, in extraction
      order; if the page has no table left, the caption is stored with
      empty ``headers`` and ``rows``;
    * lines that directly follow a chapter heading, before its first
      article, are appended to the chapter title, because the title is
      often printed on the line below "Chương <n>";
    * every other line is appended, space-separated, to the text of the
      current article. Lines seen before the first chapter or article are
      discarded.

    NOTE(review): text inside tables is presumably also returned by
    ``page.extract_text()`` and therefore also lands in the article text;
    confirm against a real page.

    Args:
        pdf_path: Path of the PDF file to parse.
        doc_id: Identifier stored under ``"doc_id"`` in the result.
        title: Document title stored under ``"title"`` in the result.

    Returns:
        ``{"doc_id", "title", "chapters": [{"title", "articles":
        [{"title", "text", "tables": [{"caption", "headers", "rows"}]}]}]}``
    """
    result = {
        "doc_id": doc_id,
        "title": title,
        "chapters": []
    }

    current_chapter = None
    current_article = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # 1. Page text, split into stripped, non-empty lines.
            text = page.extract_text() or ""
            lines = [line.strip() for line in text.splitlines() if line.strip()]

            # 2. Tables found on this page; captions consume them in order.
            page_tables = _collect_page_tables(page)
            table_idx = 0

            # 3. Classify each line.
            for line in lines:
                # Chapter heading
                m_chap = CHAPTER_RE.match(line)
                if m_chap:
                    roman, rest_title = m_chap.groups()
                    full_title = f"Chương {roman}."
                    if rest_title:
                        full_title += f" {rest_title}"
                    current_chapter = {
                        "title": full_title.strip(),
                        "articles": []
                    }
                    result["chapters"].append(current_chapter)
                    # Text following a chapter heading must not leak into
                    # the last article of the previous chapter.
                    current_article = None
                    continue

                # Article heading
                m_art = ARTICLE_RE.match(line)
                if m_art:
                    art_no, art_title = m_art.groups()
                    current_article = {
                        "title": f"Điều {art_no}. {art_title}".strip(),
                        "text": "",
                        "tables": []
                    }
                    if current_chapter is None:
                        # Article seen before any chapter heading.
                        current_chapter = {
                            "title": "Chương ?",
                            "articles": []
                        }
                        result["chapters"].append(current_chapter)
                    current_chapter["articles"].append(current_article)
                    continue

                # Table caption (only meaningful inside an article)
                m_tbl = TABLE_CAPTION_RE.match(line)
                if m_tbl and current_article is not None:
                    tbl_no, tbl_title = m_tbl.groups()
                    caption = f"Bảng {tbl_no}."
                    if tbl_title:
                        caption += f" {tbl_title}"

                    headers, rows = [], []
                    if table_idx < len(page_tables):
                        headers = page_tables[table_idx]["headers"]
                        rows = page_tables[table_idx]["rows"]
                        table_idx += 1

                    current_article["tables"].append({
                        "caption": caption.strip(),
                        "headers": headers,
                        "rows": rows
                    })
                    # The caption itself is not added to the article text.
                    continue

                # Ordinary line
                if current_article is not None:
                    if current_article["text"]:
                        current_article["text"] += " " + line
                    else:
                        current_article["text"] = line
                elif (
                    current_chapter is not None
                    and not current_chapter["articles"]
                    and m_tbl is None
                    and not line.isdigit()
                ):
                    # Continuation of the chapter heading (title printed on
                    # its own line). Digit-only lines are skipped so a page
                    # number between heading and title is not absorbed.
                    current_chapter["title"] += " " + line

    return result

In [None]:
# Input PDF and output JSON, both on the mounted Google Drive.
QD1830_PDF_PATH = Path("/content/drive/MyDrive/NLP/RAG_final/raw_data/QD_1830_Quy_che_to_chuc_va_quan_ly_dao_tao_tuyen_sinh_2021.pdf")
QD1830_JSON_PATH = Path("/content/drive/MyDrive/NLP/RAG_final/parsed_pdfs/qd_1830.json")

data = extract_qd1830_to_json(str(QD1830_PDF_PATH))

# Create the output folder first: open(..., "w") raises FileNotFoundError when
# the directory is missing, and that would only surface after the parse has run.
QD1830_JSON_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(QD1830_JSON_PATH, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Vietnamese text readable in the JSON file.
    json.dump(data, f, ensure_ascii=False, indent=2)
