In [4]:
import fitz  # PyMuPDF
import os
import re
from pathlib import Path

# --- Config ---
# Folder scanned for *.pdf files, and folder that receives one sub-folder per PDF.
input_dir = 'training_input'
output_dir = 'training_output'

# Heuristics for header/logo detection.
# A word seen in the top band of at least this fraction of the pages (and on at
# least 2 pages) is treated as repeated header text.
REPEAT_PAGES_THRESHOLD = 0.5
# The top band inspected on each page is min(MAX_TOP_PX, page_height * TOP_FRAC),
# so MAX_TOP_PX is an absolute cap on the band height.
# NOTE(review): the value is compared against PyMuPDF page coordinates, which
# are presumably points rather than pixels - confirm.
MAX_TOP_PX = 120
TOP_FRAC = 0.12  # fraction of page height; applies when it is smaller than MAX_TOP_PX

def ensure_dirs(path):
    """Create directory *path* together with any missing parent directories.

    Calling it for a directory that already exists is not an error.
    """
    target = Path(path)
    target.mkdir(exist_ok=True, parents=True)

# Compiled once: process_text is called for every extracted line of every page.
_NUMBERED_HEADING_RE = re.compile(r'^(\d+(\.\d+)+)\.?\s+(.*)')
_BULLET_RE = re.compile(r'^([ \t]*)([-•\*\u2022])\s+(.*)')
_CHAPTER_RE = re.compile(r'^(Chương|CHƯƠNG) \d+:?', re.IGNORECASE)
_MAX_HEADING_LEVEL = 6  # Markdown only defines heading levels 1 to 6


def process_text(text):
    """Convert plain-text lines to Markdown headings, bullets and paragraphs.

    Every line of *text* is converted independently:

    * A multi-part number such as ``1.2`` or ``1.2.3.`` followed by a title
      becomes a heading. The number itself is dropped and the heading level
      is the count of dots inside it (``1.2`` -> level 1, ``1.2.1`` ->
      level 2), capped at 6. A single number such as ``1. Title`` does not
      match and stays a paragraph.
    * A line starting with ``-``, ``*`` or a bullet character becomes a ``-``
      list item, nested one level per two leading spaces (a tab counts as
      four spaces).
    * A ``Chương <n>`` line (any letter case) becomes a level-1 heading whose
      title is the text after the first colon. Without a colon the whole
      line is the title; with nothing after the colon the text before it is
      used, so an empty heading is never produced.
    * Blank lines are kept; any other line is passed through with trailing
      whitespace removed.

    Headings are wrapped in two newlines on each side so they stay visually
    separated from the surrounding text.

    Returns the converted lines joined with newline characters.
    """
    out = []

    for raw in text.split('\n'):
        line = raw.rstrip()
        if not line:
            out.append('')
            continue

        # Numbered headings: 1.2, 1.2.1., 1.2.1.2 etc. (at least two number groups)
        numbered = _NUMBERED_HEADING_RE.match(line)
        if numbered:
            # The number of dots inside the numbering is the heading level.
            level = min(numbered.group(1).count('.'), _MAX_HEADING_LEVEL)
            out.append(f'\n\n{"#" * level} {numbered.group(3)}\n\n')
            continue

        # Bullet list
        bullet = _BULLET_RE.match(line)
        if bullet:
            indent_spaces = len(bullet.group(1).replace('\t', '    '))
            level = indent_spaces // 2
            out.append(f'{"  " * level}- {bullet.group(3)}')
            continue

        # Chapter heading ("Chương <n>")
        if _CHAPTER_RE.match(line):
            # str.partition avoids re.split's positional maxsplit argument,
            # which is deprecated since Python 3.13.
            head, _, tail = line.partition(':')
            # Fall back to the text before the colon so that "Chương 2:"
            # does not turn into an empty "# " heading.
            title = tail.strip() or head.strip()
            out.append(f'\n\n# {title}\n\n')
            continue

        # Default: paragraph
        out.append(line)

    return '\n'.join(out)

def _group_words_into_lines(words, tol=3):
    """Group PyMuPDF word tuples into lines of text in reading order.

    words: tuples whose first five items are (x0, y0, x1, y1, text).
    tol: largest allowed difference between a word's top edge and the top
        edge of the first word of the current line for both to be treated
        as the same line.

    Returns a list of strings, one per line from top to bottom, with the
    words of each line ordered left to right.
    """
    lines = []
    current = []        # (x0, text) pairs of the line being assembled
    current_top = None  # y0 of the first word placed in `current`

    for word in sorted(words, key=lambda w: (w[1], w[0])):
        x0, y0, word_text = word[0], word[1], word[4]
        if current and abs(y0 - current_top) > tol:
            # Words were visited in top-edge order, so restore left-to-right
            # order before emitting the finished line.
            current.sort(key=lambda item: item[0])
            lines.append(' '.join(text for _, text in current))
            current = []
        if not current:
            current_top = y0
        current.append((x0, word_text))

    if current:
        current.sort(key=lambda item: item[0])
        lines.append(' '.join(text for _, text in current))
    return lines


def _image_payload(doc, block):
    """Return (image_bytes, extension) for an image block, or None.

    In the output of page.get_text('dict') an image block carries the raw
    image bytes under 'image' and the file extension under 'ext'. A dict
    holding an 'xref' is still accepted and resolved through
    doc.extract_image() for backward compatibility.
    """
    payload = block.get('image')
    if isinstance(payload, (bytes, bytearray)) and payload:
        return bytes(payload), block.get('ext') or 'png'
    if isinstance(payload, dict) and payload.get('xref') is not None:
        extracted = doc.extract_image(payload['xref'])
        return extracted['image'], extracted.get('ext', 'png')
    return None


ensure_dirs(output_dir)
ensure_dirs(input_dir)

for pdf_file in os.listdir(input_dir):
    if not pdf_file.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(input_dir, pdf_file)
    pdf_name = os.path.splitext(pdf_file)[0]
    out_folder = os.path.join(output_dir, pdf_name)
    img_folder = os.path.join(out_folder, 'images')
    ensure_dirs(out_folder)
    ensure_dirs(img_folder)

    # The document starts with a heading made of the original file name
    # (e.g. 'tai_lieu.pdf'); the surrounding newlines keep it apart from
    # the first line of content.
    md_lines = [f'\n\n#  {pdf_file}\n\n']
    image_count = 1

    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as doc:
        pages = [doc.load_page(i) for i in range(len(doc))]
        num_pages = len(pages)

        # --- PASS 1: detect repeated header text across pages ---
        page_top_words = []
        for page in pages:
            top_limit = min(MAX_TOP_PX, page.rect.height * TOP_FRAC)
            top_words = []
            for w in page.get_text('words') or []:
                if len(w) < 5:
                    continue
                y0, y1, word_text = w[1], w[3], w[4].strip()
                if y0 <= top_limit and word_text:
                    top_words.append((word_text, y0, y1))
            page_top_words.append(top_words)

        # Count on how many pages each top-band word occurs (once per page).
        text_counts = {}
        for top_words in page_top_words:
            for t in {t for t, _, _ in top_words}:
                text_counts[t] = text_counts.get(t, 0) + 1
        min_pages = max(2, int(REPEAT_PAGES_THRESHOLD * num_pages))
        repeated_texts = {t for t, c in text_counts.items() if c >= min_pages}

        # --- PASS 2: extract text and images, skip header/logo ---
        for page_number, page in enumerate(pages, start=1):
            page_height = page.rect.height
            header_bottoms = [
                bottom
                for t, _, bottom in page_top_words[page_number - 1]
                if t in repeated_texts
            ]
            # NOTE: when the page has no repeated header word, the whole top
            # band is still skipped (fallback kept from the original logic).
            header_bottom = max(
                header_bottoms,
                default=min(MAX_TOP_PX, page_height * TOP_FRAC),
            ) + 2

            # text
            words = page.get_text('words') or []
            body_words = [w for w in words if len(w) >= 5 and w[1] >= header_bottom]

            # process lines to markdown
            for line in _group_words_into_lines(body_words, tol=3):
                processed_chunk = process_text(line)
                # Only keep chunks that are not pure whitespace.
                if processed_chunk.strip() or ('\n\n' in processed_chunk):
                    md_lines.append(processed_chunk)

            # images
            for block in page.get_text('dict').get('blocks', []):
                if block.get('type') != 1:
                    continue
                bbox = block.get('bbox', [])
                if len(bbox) != 4:
                    continue
                # Images starting inside the header band are treated as logos.
                if bbox[1] < header_bottom:
                    continue
                try:
                    payload = _image_payload(doc, block)
                    if payload is None:
                        continue
                    img_bytes, img_ext = payload
                    img_path = os.path.join(img_folder, f'image_{image_count}.{img_ext}')
                    with open(img_path, 'wb') as imgf:
                        imgf.write(img_bytes)
                    md_lines.append(f'|<image_{image_count}>|')
                    image_count += 1
                except Exception as e:
                    # Best effort: a broken image must not abort the whole PDF.
                    print(f'Warning: failed extracting image on {pdf_file} page {page_number}: {e}')

    # save markdown
    md_path = os.path.join(out_folder, 'main.md')
    with open(md_path, 'w', encoding='utf-8') as f:
        # Headings already carry their own blank lines, so a single newline
        # between chunks is enough.
        f.write('\n'.join(md_lines))
