# Markdown Cleaner 工作流程

這份 Notebook 以單一 Markdown 檔為來源，依序執行下列步驟：
- （可選）套用移除粗體、註解或統一標題等文字處理
- （可選）輸出整理後的 Markdown 檔案
- 藉由 Pandoc 產生 Word 檔（範本以 `--reference-doc` 套用），再用 python-docx 做後製（插入目錄欄位、選擇性分頁、套用範本中的目錄樣式）

> 💡 Notebook 平常在 Docker 容器內執行：容器負責轉檔，但若要在 Word 中更新欄位（展開自動目錄），必須在主機上額外執行腳本。下方有相關說明。


In [1]:
from pathlib import Path

# ---- Settings: adjust as needed ----

# Source Markdown file. A relative path is looked up under several candidate
# root directories by the processing cell.
source_relative = Path('mkdocs/My_Notes/1017黃茂榮/柯老師1017講稿.md')

# source_relative = Path('mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿.md')

# To override the output paths, uncomment the lines below and set them.
# output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿_無粗體.md')
# word_output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251016.docx')

# None means "derive the output path automatically from the source path".
output_relative = None
word_output_relative = None
# Word template handed to Pandoc as the reference document.
word_template_relative = Path('mkdocs/My_Notes/word樣式文件夾/我的word樣式檔案.dotx')
toc_marker = r'\tableofcontents'  # marker in the source Markdown that fixes where the TOC goes

# Export switches: only the Word file is exported by default; adjust as needed.
export_markdown = False
export_word = True
insert_page_breaks_for_h2 = False  # whether to start a new page before every level-2 heading


# Text clean-up switches applied before export: strip **bold** markers,
# strip HTML comments, and flatten heading levels.
remove_bold = False
remove_comments = True
normalize_headings = False

# Automatic output naming: when no custom path is given, the rules below
# determine the generated file name and folder.
auto_markdown_suffix = '_無粗體'
auto_word_suffix = ''
auto_word_folder_name = None  # e.g. '給老師的'
auto_filtered_prefix = 'Filtered_'

# --------------------------------


## 設定區一覽

- `source_relative`：來源 Markdown 的相對路徑。
- `output_relative` / `word_output_relative`：如省略，會依命名規則自動產生對應路徑。
- `word_template_relative`：Word 範本，可沿用樣式（例如目錄字型）。
- `export_markdown` / `export_word`：控制是否輸出對應格式。
- `insert_page_breaks_for_h2`：是否在每個 `##` 前插入換頁，預設關閉。
- `toc_marker`：在 Markdown 中放置的目錄標記（預設 `\tableofcontents`）。Notebook 會把它換成 placeholder，後製時插入真正的目錄欄位。
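- `remove_bold` / `remove_comments` / `normalize_headings`：分別控制是否移除 `**粗體**` 標記、清除 HTML 註解（`<!-- ... -->`）、統一標題層級（一級標題保留為 `#`，其餘層級一律改為 `##`）。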


In [2]:
from pathlib import Path
import re
import shutil
import subprocess
import tempfile
try:
    from docx import Document
    from docx.enum.text import WD_BREAK
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
except ImportError:
    Document = None
    WD_BREAK = None
    OxmlElement = None
    qn = None

import xml.etree.ElementTree as ET
from zipfile import ZipFile

# Working directory of the kernel; relative look-ups start from here.
BASE_DIR = Path.cwd()
# Search order for relative paths: the working directory, its "work"
# sub-folder, then every ancestor of each of those two.
possible_roots = [
    BASE_DIR,
    BASE_DIR / "work",
    *BASE_DIR.parents,
    *(BASE_DIR / "work").parents,
]

# WordprocessingML namespace map, keyed by the conventional "w" prefix.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
# Literal text written into the Markdown where the TOC field must later go.
TOC_PLACEHOLDER = "[[DOCX_TOC_PLACEHOLDER]]"


def ensure_docx_package():
    """Raise ``ImportError`` when python-docx could not be imported.

    The guarded import earlier in this cell binds ``Document``, ``WD_BREAK``,
    ``OxmlElement`` and ``qn`` to ``None`` on failure, so a single ``None``
    among them means the package is unavailable.
    """
    for symbol in (Document, WD_BREAK, OxmlElement, qn):
        if symbol is None:
            raise ImportError("python-docx 未安裝，請先在此 notebook 執行 `%pip install python-docx`")


def resolve_path(relative):
    """Return an existing path for *relative*, or ``None`` if there is none.

    An absolute path is returned unchanged when it exists. A relative path is
    joined onto each entry of ``possible_roots`` in order, and the first
    candidate that exists is returned in resolved form.
    """
    target = Path(relative)
    if target.is_absolute():
        if target.exists():
            return target
        return None

    # Lazily resolve one candidate per search root and stop at the first hit.
    resolved_candidates = ((Path(root) / target).resolve() for root in possible_roots)
    return next((hit for hit in resolved_candidates if hit.exists()), None)


def export_to_docx(text: str, destination: Path, reference_doc: Path | None = None) -> None:
    import subprocess, tempfile, shutil, re

    # 清除可能破壞 Pandoc 的隱藏符號與非印字元（確保輸出合法）。
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\u200B\uFEFF]', '', text)
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # Pandoc 僅能處理合法 UTF-8
    text.encode('utf-8', 'strict')

    if shutil.which("pandoc") is None:
        raise RuntimeError("pandoc 未安裝")

    with tempfile.NamedTemporaryFile("w", suffix=".md", encoding="utf-8", delete=False) as tmp:
        tmp.write(text)
        md = tmp.name

    try:
        cmd = [
            "pandoc", md,
            "-f", "markdown+pipe_tables+lists_without_preceding_blankline+hard_line_breaks",
            "-t", "docx",
            "-o", str(destination),
            "--wrap=none",
        ]
        if reference_doc is not None:
            cmd.extend(["--reference-doc", str(reference_doc)])
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print("⚠️ Pandoc 錯誤訊息：")
            print(result.stderr or result.stdout)
            raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)

        print(f"已輸出 Word（pandoc）：{destination}")
    finally:
        Path(md).unlink(missing_ok=True)

# Locate the source Markdown; nothing else can run without it.
source_path = resolve_path(source_relative)
if not source_path:
    raise FileNotFoundError(f"找不到來源檔案：{source_relative}")

# Repository root: the closest ancestor of the source that contains a
# "mkdocs" folder, else the first search root that contains one, else the
# working directory.
repo_root = next(
    (ancestor for ancestor in source_path.parents if (ancestor / "mkdocs").exists()),
    None,
)
if repo_root is None:
    repo_root = next(
        (Path(root) for root in possible_roots if (Path(root) / "mkdocs").exists()),
        BASE_DIR,
    )

# Read the settings through globals() so that any name the settings cell did
# not define falls back to a default instead of raising NameError.
output_override = globals().get('output_relative')
word_output_override = globals().get('word_output_relative')
word_template_setting = globals().get('word_template_relative')

export_markdown = globals().get('export_markdown', False)
export_word = globals().get('export_word', True)
insert_page_breaks_for_h2 = globals().get('insert_page_breaks_for_h2', False)

auto_markdown_suffix = globals().get('auto_markdown_suffix', '_cleaned')
auto_word_suffix = globals().get('auto_word_suffix', '')
auto_word_folder_name = globals().get('auto_word_folder_name')
auto_filtered_prefix = globals().get('auto_filtered_prefix', 'Filtered_')


def resolve_output_path(candidate):
    """Turn a user-supplied output path into a concrete path.

    ``None`` is passed through unchanged. An absolute path is kept as given;
    a relative one is joined onto ``repo_root`` and resolved. The parent
    directory of the result is created if it does not exist yet.
    """
    if candidate is None:
        return None

    target = Path(candidate)
    if not target.is_absolute():
        target = (repo_root / target).resolve()

    target.parent.mkdir(parents=True, exist_ok=True)
    return target


def infer_markdown_output_path():
    """Derive the Markdown output path from the source path.

    The file goes next to the source, unless ``auto_filtered_prefix`` is set,
    the source folder does not already start with that prefix, and a sibling
    folder named ``<prefix><source folder name>`` already exists; then that
    sibling is used. The file name is the source stem plus
    ``auto_markdown_suffix`` plus the source extension.
    """
    source_dir = source_path.parent
    destination_dir = source_dir

    if auto_filtered_prefix and not source_dir.name.startswith(auto_filtered_prefix):
        filtered_dir = source_dir.parent / f"{auto_filtered_prefix}{source_dir.name}"
        # Only redirect into the prefixed sibling when it is already there.
        if filtered_dir.exists():
            destination_dir = filtered_dir

    destination_dir.mkdir(parents=True, exist_ok=True)
    name_suffix = auto_markdown_suffix or ''
    return destination_dir / f"{source_path.stem}{name_suffix}{source_path.suffix}"


def infer_word_output_path():
    """Derive the ``.docx`` output path from the source path.

    The file goes next to the source, or into the sub-folder
    ``auto_word_folder_name`` of the source folder when that setting is given
    and the sub-folder already exists. The file name is the source stem plus
    ``auto_word_suffix`` with a ``.docx`` extension.
    """
    source_dir = source_path.parent
    destination_dir = source_dir

    if auto_word_folder_name:
        subfolder = source_dir / auto_word_folder_name
        # A missing sub-folder is not created; the source folder is used.
        if subfolder.exists():
            destination_dir = subfolder

    destination_dir.mkdir(parents=True, exist_ok=True)
    name_suffix = auto_word_suffix or ''
    return destination_dir / f"{source_path.stem}{name_suffix}.docx"


def resolve_word_template():
    """Locate the configured Word template, if any.

    Returns ``None`` when no template is configured. Otherwise returns the
    result of ``resolve_path`` for the configured value, printing a warning
    (and still returning ``None``) when the template cannot be found.
    """
    if word_template_setting is None:
        return None

    if isinstance(word_template_setting, Path):
        template = word_template_setting
    else:
        template = Path(word_template_setting)

    located = resolve_path(template)
    if located is None:
        print(f"⚠️ 找不到 Word 樣式檔：{template}")
    return located

def resolve_style(doc, candidates, fallback='Normal'):
    """Pick the first usable style from *candidates*, else the fallback.

    Lookup happens in two passes:

    1. Each candidate is tried as an exact key of ``doc.styles``, in the order
       given.
    2. If none matched, the document's styles are scanned in their own order
       and the first one whose name ends with any candidate is returned.

    Args:
        doc: Document object exposing ``styles`` (indexable by name, iterable).
        candidates: Style names to try, most preferred first.
        fallback: Style name used when nothing matches.

    Returns:
        The matching style object, or ``doc.styles[fallback]``.

    Raises:
        KeyError: If nothing matches and *fallback* is not in ``doc.styles``.
    """
    for name in candidates:
        try:
            return doc.styles[name]
        except KeyError:
            continue

    suffixes = tuple(candidates)
    for style in doc.styles:
        # A style's name can be None (style element without a name); treat it
        # as an empty name instead of crashing on ``None.endswith``.
        style_name = getattr(style, 'name', None) or ''
        # ``endswith`` also covers exact equality.
        if style_name.endswith(suffixes):
            return style

    return doc.styles[fallback]

def post_process_docx(docx_path: Path):
    """Insert a TOC field and optional page breaks into a .docx, in place.

    Steps, in order:

    1. Remove paragraphs whose style name starts with ``TOC`` or whose text
       contains ``Table of Contents`` (Pandoc's default TOC output).
    2. Insert a TOC heading and a ``TOC`` field before the placeholder
       paragraph (which is then removed), else before the first paragraph
       styled ``Heading 1``, else before the first paragraph of the document.
    3. When ``insert_page_breaks_for_h2`` is enabled, add a page-break
       paragraph before every ``Heading 2`` paragraph that is not already
       preceded by one.
    4. Remove any remaining placeholder paragraphs and save to *docx_path*.

    Args:
        docx_path: Path of the Word file to modify; it is overwritten.

    Raises:
        ImportError: If python-docx is not installed.
    """
    ensure_docx_package()
    doc = Document(docx_path)
    if not doc.paragraphs:
        doc.save(docx_path)
        return

    def style_name_of(paragraph):
        # Paragraphs without a style count as having an empty style name.
        return paragraph.style.name if paragraph.style else ''

    def drop(paragraph):
        # Detach the paragraph's XML element from its parent.
        element = paragraph._element
        element.getparent().remove(element)

    def is_placeholder(paragraph):
        return paragraph.text.strip() == TOC_PLACEHOLDER

    def make_fld_char(kind):
        # Build a <w:fldChar> element of the given fldCharType.
        node = OxmlElement('w:fldChar')
        node.set(qn('w:fldCharType'), kind)
        return node

    # Step 1: strip Pandoc's default TOC paragraphs (English heading or TOC entries).
    for paragraph in list(doc.paragraphs):
        if style_name_of(paragraph).startswith('TOC') or 'Table of Contents' in paragraph.text:
            drop(paragraph)

    apply_page_breaks = globals().get('insert_page_breaks_for_h2', False)

    toc_heading_style = resolve_style(doc, ['TOC Heading', '\u76ee\u9304', '\u76ee\u9304\u6a19\u984c'], 'Normal')
    toc_entry_style = resolve_style(doc, ['TOC 1', '\u76ee\u9304 1'], 'Normal')
    toc_heading_text = '\u76ee\u9304'

    def insert_toc(target_para):
        # Heading paragraph first, then the field paragraph, both placed
        # directly before the target.
        title_para = target_para.insert_paragraph_before()
        title_para.text = toc_heading_text
        title_para.style = toc_heading_style

        field_para = target_para.insert_paragraph_before()
        field_para.style = toc_entry_style
        run_element = field_para.add_run()._r

        instruction = OxmlElement('w:instrText')
        instruction.set(qn('xml:space'), 'preserve')
        instruction.text = r'TOC \o "1-4" \h \z \u'

        # A field is begin / instruction / separate / end, all in one run here.
        field_parts = (
            make_fld_char('begin'),
            instruction,
            make_fld_char('separate'),
            make_fld_char('end'),
        )
        for part in field_parts:
            run_element.append(part)

    # Step 2: choose the anchor paragraph for the TOC.
    anchor = next((p for p in doc.paragraphs if is_placeholder(p)), None)
    if anchor is not None:
        insert_toc(anchor)
        drop(anchor)
    else:
        anchor = next(
            (p for p in doc.paragraphs if style_name_of(p).startswith('Heading 1')),
            None,
        )
        if anchor is None:
            anchor = doc.paragraphs[0] if doc.paragraphs else doc.add_paragraph()
        insert_toc(anchor)

    # Step 3: page break before each Heading 2, unless the preceding element
    # already contains a page break or the feature is disabled.
    if apply_page_breaks:
        for paragraph in doc.paragraphs:
            if not style_name_of(paragraph).startswith('Heading 2'):
                continue
            previous = paragraph._element.getprevious()
            if previous is not None and any(
                br.get(qn('w:type')) == 'page'
                for br in previous.findall('.//w:br', namespaces=paragraph._element.nsmap)
            ):
                continue
            break_para = paragraph.insert_paragraph_before()
            break_para.style = doc.styles['Normal']
            break_para.add_run().add_break(WD_BREAK.PAGE)

    # Step 4: remove any placeholder text that is still in the document.
    for paragraph in list(doc.paragraphs):
        if is_placeholder(paragraph):
            drop(paragraph)

    doc.save(docx_path)


# Markdown destination: only needed when Markdown export is enabled. An
# explicit override wins over the automatically derived path.
output_path = None
if export_markdown:
    output_path = (
        infer_markdown_output_path()
        if output_override is None
        else resolve_output_path(output_override)
    )

# Word destination: same rule as above.
word_output_path = None
if export_word:
    word_output_path = (
        infer_word_output_path()
        if word_output_override is None
        else resolve_output_path(word_output_override)
    )

# The template is only looked up when a Word file will be produced.
word_template_path = None
if export_word:
    word_template_path = resolve_word_template()

# Load the source Markdown and report its size.
text = source_path.read_text(encoding="utf-8")
print(f"來源檔案：{source_path}")
print(f"原始字元數：{len(text):,}")

processed = text


if remove_bold:
    # Unwrap **bold** spans that sit on one line and are not part of a longer
    # run of asterisks (e.g. ***bold italic***).
    processed = re.sub(r'(?<!\*)\*\*([^\n*]+)\*\*(?!\*)', r'\1', processed)


if remove_comments:
    # DOTALL lets a single HTML comment span several lines.
    processed = re.sub(r"<!--.*?-->", "", processed, flags=re.DOTALL)

if normalize_headings:
    def normalize(match):
        """Rewrite one heading line: level 1 stays ``#``, deeper becomes ``##``."""
        hashes, title = match.group(1), match.group(2).strip()
        if len(hashes) <= 1:
            return "# " + title
        return "## " + title

    # Only spaces/tabs may separate the hashes from the title. ``\s*`` would
    # also match newlines, so a bare "#" line would swallow the following
    # blank lines and absorb the next line of text as its title.
    # NOTE: lines starting with "#" inside fenced code blocks match as well.
    processed = re.sub(r"^(#+)[ \t]*(.*)$", normalize, processed, flags=re.MULTILINE)

# Markdown export: write the cleaned text, or explain why nothing is written.
if not export_markdown:
    if output_override is None:
        print('ℹ️ 略過 Markdown 匯出（export_markdown=False）。')
    else:
        # An explicit output path was configured but the export is switched off.
        print('ℹ️ 已設定 output_relative，但 export_markdown=False，略過 Markdown 匯出。')
elif output_path is None:
    print('⚠️ export_markdown=True，但未能決定 Markdown 輸出路徑。')
else:
    output_path.write_text(processed, encoding="utf-8")
    print(f"已輸出 Markdown：{output_path}")
    print(f"新檔字元數：{len(processed):,}")

# Word export: convert with Pandoc, then post-process with python-docx.
if not export_word:
    if word_output_override is None:
        print('ℹ️ 略過 Word 匯出（export_word=False）。')
    else:
        # An explicit output path was configured but the export is switched off.
        print('ℹ️ 已設定 word_output_relative，但 export_word=False，略過 Word 匯出。')
elif word_output_path is None:
    print('⚠️ export_word=True，但未能決定 Word 匯出路徑。')
else:
    # Strip runs of two or more trailing spaces/tabs at the end of each line.
    docx_ready_text = re.sub(r'[ \t]{2,}$', '', processed, flags=re.MULTILINE)
    toc_marker = globals().get('toc_marker', r'\tableofcontents')

    placeholder_used = bool(toc_marker) and toc_marker in docx_ready_text
    if placeholder_used:
        # The first marker becomes the placeholder; further markers are dropped.
        docx_ready_text = docx_ready_text.replace(toc_marker, TOC_PLACEHOLDER, 1)
        docx_ready_text = docx_ready_text.replace(toc_marker, '')
    else:
        # No marker in the source: remove any literal placeholder text.
        docx_ready_text = docx_ready_text.replace(TOC_PLACEHOLDER, '')

    export_to_docx(docx_ready_text, word_output_path, reference_doc=word_template_path)
    post_process_docx(word_output_path)


來源檔案：/home/jovyan/work/mkdocs/My_Notes/1017黃茂榮/柯老師1017講稿.md
原始字元數：5,048
ℹ️ 略過 Markdown 匯出（export_markdown=False）。
已輸出 Word（pandoc）：/home/jovyan/work/mkdocs/My_Notes/1017黃茂榮/柯老師1017講稿.docx


## 執行與後續自動化

1. 依需要調整上方設定，依序執行兩個程式碼格。
2. Notebook 會：
   - 確認輸出路徑、套用文字處理；
   - 以 Pandoc 產生 Word；
   - 透過 python-docx 移除 Pandoc 預設目錄、插入中文目錄欄位並套用範本中的目錄樣式；若 `insert_page_breaks_for_h2=True`，另於每個二級標題前新增分頁。
3. 目錄欄位會生成，但在 Docker 內無法叫用 Microsoft Word，因此**欄位不會自動展開**。請在主機上使用 AppleScript/Automator 對輸出的 `.docx` 執行「更新欄位」。建議做法：
   - 將輸出資料夾掛載為 Docker 與主機共享目錄；
   - 在主機撰寫簡單的監控腳本（例如 `fswatch` + AppleScript），偵測到新檔案後呼叫 Word 更新欄位；
   - 完成後再開啟 Word，即可直接看到完整目錄。

> 若將 Notebook 移出容器或在具備 GUI 的環境執行，可在這裡補上 `osascript` 自動化流程，但本專案預設保留為容器內純轉檔流程。
