# Markdown Export Toolkit：運維導向流程

此 Notebook 專注在兩件事：
1. 讓 Markdown → Word 轉換具可預測性（固定路徑、固定範本、可重複執行）。
2. 讓後續自動化容易接手（在 Docker 內只做純轉檔，需要 GUI 的動作交給主機腳本）。

### 依賴與環境
- Pandoc（系統層）：負責基礎 Markdown → DOCX。
- `python-docx`（Python）：負責插入目錄、套用 Word 範本、處理分頁。
- macOS 主機可用 `osascript` 叫用 Microsoft Word 更新欄位；若只在 Docker 內執行，會自動略過此步驟。

> ✅ 建議：將輸出目錄掛載成 Docker ↔ 主機共用資料夾，Notebook 跑完立刻能被主機上的自動化（或人工）接續。


In [1]:
%pip install python-docx


Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

# ---- Settings: adjust as needed ----
# Every name assigned in this cell is read from the notebook globals by
# ConfigLoader.load() in the main-flow cell.

# Source Markdown path. Alternative sources are kept commented out; enable
# exactly one assignment.
# source_relative = Path('mkdocs/My_Notes/1017黃茂榮/柯老師1017講稿.md')
source_relative = Path('mkdocs/My_Notes/_老師要的其他文件/借名登記/借名登記法源文獻清單.md')



# source_relative = Path('mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿.md')

# To override the output paths, uncomment and set the lines below.
# output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿_無粗體.md')
# word_output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251016.docx')

# None means "derive the output path automatically" (see the auto_* settings).
output_relative = None
word_output_relative = None
# Word template handed to pandoc as the reference document.
word_template_relative = Path('mkdocs/My_Notes/word樣式文件夾/我的word樣式檔案.dotx')
toc_marker = r'\tableofcontents'  # marker in the source Markdown that fixes the TOC position
toc_max_level = 4  # deepest heading level listed in the TOC

# Export switches: only Word output is enabled here; adjust as needed.
export_markdown = False
export_word = True
auto_update_word_fields = True  # whether to call Word automatically to update fields
insert_page_breaks_for_h2 = True  # whether to start a new page before every level-2 heading


# Text clean-up switches applied to the Markdown before export.
remove_bold = False
remove_comments = True
normalize_headings = False

# Automatic output naming: when no explicit path is given, these rules
# produce the file name and folder.
auto_markdown_suffix = '_無粗體'
auto_word_suffix = ''
auto_word_folder_name = None  # e.g. '給老師的'
auto_filtered_prefix = 'Filtered_'

# --------------------------------


## 設定區重點

| 參數 | 說明 | 預設 |
| --- | --- | --- |
| `source_relative` | 單一來源 Markdown 路徑。 | — |
| `output_relative` / `word_output_relative` | 指定輸出檔名；若為 `None` 會依規則自動命名。 | `None` |
| `word_template_relative` | Word 範本，可沿用樣式與段落。 | 專案內範本 |
| `export_markdown` / `export_word` | 是否輸出對應格式。 | `False` / `True` |
| `insert_page_breaks_for_h2` | 是否在每個 `##` 前新增換頁。 | `False`（本 Notebook 的設定區目前設為 `True`） |
| `toc_marker` | Markdown 中放置目錄的標記，轉換時會替換成 placeholder。 | `\tableofcontents` |
| 其他布林設定 | `remove_bold`、`remove_comments`、`normalize_headings`。 | 見設定區 |
| `auto_update_word_fields` | 若主機可執行 Word，是否自動展開 TOC 欄位。 | `True` |

> 所有設定都會被讀入 `Config` 物件，為後續流程提供明確的輸入。保持設定簡單，讓轉檔腳本更容易托管/排程。


In [3]:
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
import re
import shutil
import subprocess
import tempfile
import textwrap
from typing import Optional

try:
    from docx import Document
    from docx.enum.text import WD_BREAK
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
except ImportError:
    Document = None
    WD_BREAK = None
    OxmlElement = None
    qn = None

# Directory the notebook kernel was started in; anchor for relative lookups.
BASE_DIR = Path.cwd()
# Candidate roots, in lookup order: the working directory, its "work"
# sub-directory, then every ancestor of each of them.
possible_roots = [
    BASE_DIR,
    BASE_DIR / "work",
    *BASE_DIR.parents,
    *(BASE_DIR / "work").parents,
]
# Token substituted for the TOC marker before pandoc runs and looked up
# again in the generated DOCX.
TOC_PLACEHOLDER = "[[DOCX_TOC_PLACEHOLDER]]"


@dataclass
class Config:
    """All settings consumed by the export flow, gathered in one object.

    Instances are built by ``ConfigLoader.load()`` from the notebook globals.
    """

    # Source Markdown path; looked up with resolve_path().
    source_relative: Path
    # Explicit Markdown output path; None lets resolve_paths() derive one
    # when export_markdown is enabled.
    output_relative: Optional[Path]
    # Explicit DOCX output path; None lets resolve_paths() derive one when
    # export_word is enabled.
    word_output_relative: Optional[Path]
    # Word template passed to pandoc via --reference-doc; None/empty skips it.
    word_template_relative: Optional[Path]
    # Whether the processed Markdown is written out.
    export_markdown: bool
    # Whether a DOCX is produced.
    export_word: bool
    # Whether post_process_docx() inserts a page break before each paragraph
    # whose style name starts with 'Heading 2'.
    insert_page_breaks_for_h2: bool
    # Text in the Markdown that is swapped for TOC_PLACEHOLDER before pandoc
    # runs; falsy disables the substitution.
    toc_marker: Optional[str]
    # Strip **bold** markers (transform_text).
    remove_bold: bool
    # Strip <!-- ... --> comments (transform_text).
    remove_comments: bool
    # Rewrite every heading to '#' or '##' (transform_text).
    normalize_headings: bool
    # Appended to the source stem for the auto-named Markdown output.
    auto_markdown_suffix: str
    # Appended to the source stem for the auto-named DOCX output.
    auto_word_suffix: str
    # Optional sub-folder of the source directory for the auto-named DOCX.
    auto_word_folder_name: Optional[str]
    # Prefix of a sibling directory that, when it exists, receives the
    # auto-named outputs instead of the source directory.
    auto_filtered_prefix: str
    # Forwarded to update_docx_fields_with_word() as its `enabled` flag.
    auto_update_word_fields: bool
    # Upper bound of the heading range ("1-N") written into the TOC field.
    toc_max_level: int


class ConfigLoader:
    """Collect the notebook's module-level settings into one ``Config``."""

    @staticmethod
    def load() -> Config:
        """Build a ``Config`` from this module's globals.

        ``source_relative`` is mandatory and raises ``KeyError`` when it has
        not been defined. Every other setting falls back to the default in
        the table below, and ``toc_max_level`` is coerced to ``int``.
        """
        namespace = globals()
        # Look up the mandatory setting first so a missing source is reported
        # before anything else is touched.
        source = namespace['source_relative']

        fallbacks = {
            'output_relative': None,
            'word_output_relative': None,
            'word_template_relative': None,
            'export_markdown': False,
            'export_word': True,
            'insert_page_breaks_for_h2': False,
            'toc_marker': r'\tableofcontents',
            'remove_bold': False,
            'remove_comments': True,
            'normalize_headings': False,
            'auto_markdown_suffix': '_cleaned',
            'auto_word_suffix': '',
            'auto_word_folder_name': None,
            'auto_filtered_prefix': 'Filtered_',
            'auto_update_word_fields': True,
            'toc_max_level': 4,
        }
        settings = {
            name: namespace.get(name, default)
            for name, default in fallbacks.items()
        }
        settings['toc_max_level'] = int(settings['toc_max_level'])
        return Config(source_relative=source, **settings)


def ensure_docx_package() -> None:
    """Raise ``ImportError`` if any python-docx name failed to import.

    The guarded import at the top of the cell sets each name to ``None``
    when python-docx is missing; this check turns that into a clear error.
    """
    for dependency in (Document, WD_BREAK, OxmlElement, qn):
        if dependency is None:
            raise ImportError("python-docx 未安裝，請先在此 notebook 執行 `%pip install python-docx`")


def resolve_path(relative: Path) -> Optional[Path]:
    """Locate an existing file or directory for *relative*.

    An absolute path is returned unchanged when it exists. A relative path
    is tried against each entry of ``possible_roots`` in order, and the
    first existing candidate (fully resolved) is returned.

    Returns:
        The existing path, or ``None`` when nothing matches.
    """
    target = Path(relative)
    if not target.is_absolute():
        hits = (
            found
            for found in ((Path(root) / target).resolve() for root in possible_roots)
            if found.exists()
        )
        return next(hits, None)
    if target.exists():
        return target
    return None


def resolve_paths(config: Config) -> tuple[Path, Optional[Path], Optional[Path], Optional[Path]]:
    """Resolve every path the export flow needs.

    Args:
        config: Settings object; only its path, export and ``auto_*``
            attributes are read.

    Returns:
        ``(source_path, markdown_path, word_path, template_path)``. The two
        output paths are ``None`` when no explicit path was configured and
        the matching export switch is off. ``template_path`` is ``None``
        when no template is configured.

    Raises:
        FileNotFoundError: If the source file cannot be located, or if Word
            export is enabled and the configured template does not exist.

    Side effects:
        Creates the parent directories of the output paths. The template is
        an input, so no directory is created for it.
    """
    source_path = resolve_path(config.source_relative)
    if not source_path:
        raise FileNotFoundError(f"找不到來源檔案：{config.source_relative}")

    # The repository root is the nearest ancestor that contains "mkdocs";
    # configured relative paths are anchored there.
    repo_root = BASE_DIR
    for parent in source_path.parents:
        if (parent / 'mkdocs').exists():
            repo_root = parent
            break

    def anchored(relative: Path) -> Path:
        """Return *relative* as an absolute path anchored at the repo root."""
        rel_path = Path(relative)
        return rel_path if rel_path.is_absolute() else (repo_root / rel_path).resolve()

    def resolved_output(relative: Optional[Path]) -> Optional[Path]:
        """Anchor an explicit output path and make sure its folder exists."""
        if relative is None:
            return None
        target = anchored(relative)
        target.parent.mkdir(parents=True, exist_ok=True)
        return target

    def auto_output_dir() -> Path:
        """Pick the folder for auto-named outputs.

        Uses the sibling "<prefix><source folder>" directory when it already
        exists, otherwise the source folder itself.
        """
        base_dir = source_path.parent
        prefix = config.auto_filtered_prefix
        if prefix and not base_dir.name.startswith(prefix):
            candidate = base_dir.parent / f"{prefix}{base_dir.name}"
            if candidate.exists():
                base_dir = candidate
        base_dir.mkdir(parents=True, exist_ok=True)
        return base_dir

    def auto_markdown_path() -> Path:
        """Derive the Markdown output path from the source file name."""
        return auto_output_dir() / f"{source_path.stem}{config.auto_markdown_suffix}{source_path.suffix}"

    def auto_word_path() -> Path:
        """Derive the DOCX output path from the source file name."""
        suffix = config.auto_word_suffix or ''
        if config.auto_word_folder_name:
            target_dir = source_path.parent / config.auto_word_folder_name
            target_dir.mkdir(parents=True, exist_ok=True)
        else:
            target_dir = auto_output_dir()
        return target_dir / f"{source_path.stem}{suffix}.docx"

    markdown_path = resolved_output(config.output_relative)
    if markdown_path is None and config.export_markdown:
        markdown_path = auto_markdown_path()

    word_path = resolved_output(config.word_output_relative)
    if word_path is None and config.export_word:
        word_path = auto_word_path()

    # The template is read, never written: do not create directories for it,
    # and fail early with a clear message instead of letting pandoc fail.
    template_path: Optional[Path] = None
    if config.word_template_relative:
        template_path = anchored(config.word_template_relative)
        if config.export_word and not template_path.is_file():
            raise FileNotFoundError(f"找不到 Word 範本：{template_path}")

    return source_path, markdown_path, word_path, template_path


def load_text(path: Path) -> str:
    """Return the full contents of *path*, decoded as UTF-8."""
    with path.open('r', encoding='utf-8') as handle:
        return handle.read()


def transform_text(text: str, config: Config) -> str:
    """Apply the optional Markdown clean-up steps selected in *config*.

    Steps, in order:
      1. ``remove_bold``: unwrap ``**bold**`` spans that sit on one line.
      2. ``remove_comments``: delete ``<!-- ... -->`` comments, including
         multi-line ones.
      3. ``normalize_headings``: rewrite every line starting with ``#`` to a
         level-1 heading (one ``#``) or a level-2 heading (two or more).

    Args:
        text: Markdown source.
        config: Object exposing the three boolean switches above.

    Returns:
        The transformed text; *text* unchanged when all switches are off.
    """
    result = text
    if config.remove_bold:
        # The lookarounds keep runs of three or more asterisks untouched.
        result = re.sub(r'(?<!\*)\*\*([^\n*]+)\*\*(?!\*)', r'\1', result)
    if config.remove_comments:
        result = re.sub(r"<!--.*?-->", "", result, flags=re.DOTALL)
    if config.normalize_headings:
        def normalize(match: re.Match[str]) -> str:
            hashes, title = match.group(1), match.group(2).strip()
            return '# ' + title if len(hashes) <= 1 else '## ' + title
        # Use [ \t]* rather than \s*: \s also matches newlines, which made an
        # empty heading line swallow the following line as its title.
        result = re.sub(r"^(#+)[ \t]*(.*)$", normalize, result, flags=re.MULTILINE)
    return result


def write_markdown(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, then print the path and character count."""
    with path.open('w', encoding='utf-8') as handle:
        handle.write(text)
    char_count = len(text)
    print(f"已輸出 Markdown：{path}")
    print(f"新檔字元數：{char_count:,}")


def run_pandoc(text: str, destination: Path, template: Optional[Path]) -> None:
    """Convert Markdown *text* to a DOCX file at *destination* with pandoc.

    The text is written to a temporary ``.md`` file, which is always removed
    afterwards, including when writing it fails.

    Args:
        text: Markdown to convert.
        destination: Path of the DOCX file to create.
        template: Optional reference document passed as ``--reference-doc``.

    Raises:
        RuntimeError: If the ``pandoc`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status;
            its output is printed first.
    """
    if shutil.which('pandoc') is None:
        raise RuntimeError('pandoc 未安裝')

    # Drop runs of two or more trailing spaces/tabs at line ends. Presumably
    # redundant Markdown hard-break markers, since hard_line_breaks is enabled.
    cleaned = re.sub(r'[ \t]{2,}$', '', text, flags=re.MULTILINE)

    tmp = tempfile.NamedTemporaryFile('w', suffix='.md', encoding='utf-8', delete=False)
    tmp_path = Path(tmp.name)
    try:
        # Writing happens inside the try so a failed write cannot leak the
        # temp file; the handle is closed before pandoc reads it.
        with tmp:
            tmp.write(cleaned)
        cmd = [
            'pandoc', str(tmp_path),
            '-f', 'markdown+pipe_tables+lists_without_preceding_blankline+hard_line_breaks',
            '-t', 'docx',
            '-o', str(destination),
            '--wrap=none',
        ]
        if template:
            cmd.extend(['--reference-doc', str(template)])
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print('⚠️ Pandoc 錯誤訊息：')
            print(result.stderr or result.stdout)
            raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)
        print(f"已輸出 Word（pandoc）：{destination}")
    finally:
        tmp_path.unlink(missing_ok=True)


def resolve_style(doc: Document, candidates: list[str], fallback: str = 'Normal'):
    """Pick the first usable style from *candidates* in *doc*.

    Lookup order:
      1. Exact lookup ``doc.styles[name]`` for each candidate, in order.
      2. The first style in ``doc.styles`` whose name ends with (or equals)
         any candidate.
      3. ``doc.styles[fallback]``.

    Args:
        doc: Document whose ``styles`` collection is searched.
        candidates: Style names to try, most preferred first.
        fallback: Style name used when nothing matches.

    Returns:
        The matching style object.

    Raises:
        KeyError: If no candidate matches and *fallback* is missing as well.
    """
    for name in candidates:
        try:
            return doc.styles[name]
        except KeyError:
            continue
    suffixes = tuple(candidates)
    for style in doc.styles:
        # A style's name can be None; treat it as an empty name instead of
        # failing on None.endswith(...).
        style_name = getattr(style, 'name', None) or ''
        if style_name.endswith(suffixes):
            return style
    return doc.styles[fallback]


def post_process_docx(docx_path: Path, config: Config) -> None:
    """Insert a TOC field (and optional page breaks) into a DOCX, in place.

    Steps:
      1. Remove paragraphs whose style name starts with 'TOC' or whose text
         contains 'Table of Contents'.
      2. Insert a heading paragraph plus a TOC field paragraph before the
         first match of: the paragraph equal to ``TOC_PLACEHOLDER`` (which
         is then removed), the first paragraph whose style name starts with
         'Heading 1', or the first paragraph of the document.
      3. If ``config.insert_page_breaks_for_h2`` is set, insert a page-break
         paragraph before each 'Heading 2' paragraph that is not already
         preceded by one.
      4. Remove any remaining placeholder paragraphs and save to *docx_path*.

    The TOC field is written without a cached result (nothing is placed
    between its 'separate' and 'end' markers), so this function itself
    produces no TOC entries.

    Args:
        docx_path: DOCX file to modify; it is overwritten.
        config: Reads ``toc_max_level`` and ``insert_page_breaks_for_h2``.

    Raises:
        ImportError: Via ``ensure_docx_package()`` when python-docx is missing.
    """
    ensure_docx_package()
    doc = Document(docx_path)
    if not doc.paragraphs:
        # Nothing to anchor a TOC to; re-save the document unchanged.
        doc.save(docx_path)
        return

    # Drop any TOC-like paragraphs already present so the field inserted
    # below is the only table of contents. Iterate over a copy because
    # elements are removed during the loop.
    for para in list(doc.paragraphs):
        style_name = para.style.name if para.style else ''
        if style_name.startswith('TOC') or 'Table of Contents' in para.text:
            p = para._element
            p.getparent().remove(p)

    # Prefer TOC styles (English or Chinese names); fall back to 'Normal'.
    toc_heading_style = resolve_style(doc, ['TOC Heading', '目錄', '目錄標題'], 'Normal')
    toc_entry_style = resolve_style(doc, ['TOC 1', '目錄 1'], 'Normal')
    toc_heading_text = '目錄'
    # Heading level range placed in the field's \o switch, e.g. "1-4".
    toc_range = f"1-{config.toc_max_level}"

    def insert_toc(target_para):
        # Insert "heading paragraph + TOC field paragraph" directly before
        # target_para. The field is assembled by hand inside a single run:
        # fldChar 'begin', the instruction text, fldChar 'separate', fldChar 'end'.
        heading_para = target_para.insert_paragraph_before()
        heading_para.text = toc_heading_text
        heading_para.style = toc_heading_style
        field_para = target_para.insert_paragraph_before()
        field_para.style = toc_entry_style
        run = field_para.add_run()
        fld_begin = OxmlElement('w:fldChar')
        fld_begin.set(qn('w:fldCharType'), 'begin')
        run._r.append(fld_begin)
        instr = OxmlElement('w:instrText')
        # Keep the whitespace inside the instruction text intact.
        instr.set(qn('xml:space'), 'preserve')
        # NOTE(review): switch meanings follow Word's TOC field syntax;
        # confirm against the Word field-code reference before changing.
        instr.text = fr'TOC \o "{toc_range}" \h \z \u'
        run._r.append(instr)
        fld_sep = OxmlElement('w:fldChar')
        fld_sep.set(qn('w:fldCharType'), 'separate')
        run._r.append(fld_sep)
        fld_end = OxmlElement('w:fldChar')
        fld_end.set(qn('w:fldCharType'), 'end')
        run._r.append(fld_end)

    # First choice: the paragraph that consists solely of the placeholder.
    toc_inserted = False
    for para in doc.paragraphs:
        if para.text.strip() == TOC_PLACEHOLDER:
            insert_toc(para)
            para._element.getparent().remove(para._element)
            toc_inserted = True
            break

    # Second choice: directly before the first level-1 heading.
    if not toc_inserted:
        for para in doc.paragraphs:
            style_name = para.style.name if para.style else ''
            if style_name.startswith('Heading 1'):
                insert_toc(para)
                toc_inserted = True
                break

    # Last resort: the very top of the document. A paragraph is appended
    # first when the clean-up above removed every paragraph.
    if not toc_inserted:
        insert_toc(doc.paragraphs[0] if doc.paragraphs else doc.add_paragraph())

    if config.insert_page_breaks_for_h2:
        for para in doc.paragraphs:
            style_name = para.style.name if para.style else ''
            if style_name.startswith('Heading 2'):
                # Skip headings whose preceding sibling element already
                # contains a page-type <w:br>, so no second break is added.
                prev = para._element.getprevious()
                has_page_break = False
                if prev is not None:
                    for br in prev.findall('.//w:br', namespaces=para._element.nsmap):
                        if br.get(qn('w:type')) == 'page':
                            has_page_break = True
                            break
                if has_page_break:
                    continue
                # The break lives in its own 'Normal' paragraph before the heading.
                break_para = para.insert_paragraph_before()
                break_para.style = doc.styles['Normal']
                break_para.add_run().add_break(WD_BREAK.PAGE)

    # Remove any placeholder paragraphs still present.
    for para in list(doc.paragraphs):
        if para.text.strip() == TOC_PLACEHOLDER:
            p = para._element
            p.getparent().remove(p)

    doc.save(docx_path)







def update_docx_fields_with_word(docx_path: Path, enabled: bool = True) -> None:
    """Ask Microsoft Word (through ``osascript``) to update all fields in a DOCX.

    Word opens the document, updates every field, saves and closes it.

    Args:
        docx_path: DOCX file to update in place.
        enabled: When false the function returns immediately.

    Raises:
        FileNotFoundError: If ``osascript`` is available but *docx_path*
            does not exist.
        RuntimeError: If the AppleScript exits with a non-zero status.

    When ``osascript`` is not on ``PATH`` a warning is printed and the step
    is skipped.
    """
    if not enabled:
        return
    if shutil.which("osascript") is None:
        print("⚠️ 找不到 osascript，略過 Word 欄位更新。")
        return
    if not docx_path.exists():
        raise FileNotFoundError(f"找不到檔案：{docx_path}")

    # The path is embedded in an AppleScript string literal, where backslash
    # is the escape character. Escape backslashes first, then double quotes,
    # so a path containing either cannot break out of the literal.
    docx_posix = docx_path.as_posix().replace('\\', '\\\\').replace('"', '\\"')

    script = textwrap.dedent(f'''
    tell application "Microsoft Word"
        activate
        set docPath to "{docx_posix}"
        set theDoc to open file name docPath
        try
            tell theDoc
                set fieldCount to count of fields
                if fieldCount > 0 then
                    repeat with idx from 1 to fieldCount
                        update field (field idx)
                    end repeat
                end if
                save
            end tell
            close theDoc saving yes
        on error errMsg number errNum
            try
                close theDoc saving no
            end try
            error errMsg number errNum
        end try
    end tell
    ''')

    result = subprocess.run(["osascript", "-e", script], capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError("AppleScript 更新欄位失敗：" + (result.stderr or result.stdout))
    print("已透過 Word 更新欄位。")


    

# ---- Main flow: load settings, transform the Markdown, export ----
config = ConfigLoader.load()
source_path, markdown_path, word_path, template_path = resolve_paths(config)

text = load_text(source_path)
print(f"來源檔案：{source_path}")
print(f"原始字元數：{len(text):,}")

processed = transform_text(text, config)

# Markdown export (optional).
if config.export_markdown:
    if markdown_path is None:
        print('⚠️ export_markdown=True，但未能決定 Markdown 輸出路徑。')
    else:
        write_markdown(markdown_path, processed)
else:
    if config.output_relative is not None:
        print('ℹ️ 已設定 output_relative，但 export_markdown=False，略過 Markdown 匯出。')

# Word export: pandoc conversion, python-docx post-processing, then an
# optional field refresh through Microsoft Word.
if config.export_word:
    if word_path is None:
        print('⚠️ export_word=True，但未能決定 Word 匯出路徑。')
    else:
        docx_ready_text = processed
        if config.toc_marker and config.toc_marker in docx_ready_text:
            # post_process_docx() only recognises the placeholder when it is
            # a paragraph of its own. Surround it with blank lines so pandoc
            # cannot merge it into neighbouring text (hard_line_breaks is on).
            # Only the first marker becomes the TOC; any others are dropped.
            docx_ready_text = docx_ready_text.replace(
                config.toc_marker, f"\n\n{TOC_PLACEHOLDER}\n\n", 1
            )
            docx_ready_text = docx_ready_text.replace(config.toc_marker, '')
        else:
            # No marker in use: strip any literal placeholder text from the
            # source so it cannot be mistaken for a TOC position.
            docx_ready_text = docx_ready_text.replace(TOC_PLACEHOLDER, '')
        run_pandoc(docx_ready_text, word_path, template_path)
        post_process_docx(word_path, config)
        update_docx_fields_with_word(word_path, config.auto_update_word_fields)
else:
    if config.word_output_relative is not None:
        print('ℹ️ 已設定 word_output_relative，但 export_word=False，略過 Word 匯出。')


來源檔案：/Users/iw/Documents/NTU/1141/1141_Tax_Ko/mkdocs/My_Notes/_老師要的其他文件/借名登記/借名登記法源文獻清單.md
原始字元數：2,300
已輸出 Word（pandoc）：/Users/iw/Documents/NTU/1141/1141_Tax_Ko/mkdocs/My_Notes/_老師要的其他文件/借名登記/借名登記法源文獻清單.docx
已透過 Word 更新欄位。


## 操作手冊（主機、容器雙流程）

1. **準備環境**
   - 安裝 Pandoc。
   - 在 Notebook 環境執行 `%pip install python-docx`。
   - 如要在主機自動展開 TOC，需 macOS + Microsoft Word + `osascript`。Notebook 會自動檢查：找不到 `osascript` 就列出警示並略過。
2. **設定與執行**
   - 調整「設定區」code cell（緊接在 `%pip install` 之後的那一個）的輸入參數。
   - 依序執行「設定區」與「主流程」兩個 code cell。流程會：
     1. 解析設定、決定輸出路徑；
     2. 讀取 Markdown、套用文本處理；
     3. 呼叫 Pandoc 產生 DOCX；
     4. 用 python-docx 插入中文目錄、沿用範本樣式、依設定處理分頁；
     5. 若 `auto_update_word_fields=True` 且主機支援，透過 AppleScript 更新 Word 欄位。
3. **與主機自動化合作**
   - 建議把輸出資料夾掛載到宿主機，Notebook 完成後即可由主機腳本檢查 DOCX。
   - 若需要在容器外另行處理（例如備份、寄送），請以產生的 DOCX 作為唯一輸入來源。
   - 若換到沒有 GUI 的環境，可把 `auto_update_word_fields` 改為 `False`，轉檔流程仍可重複執行。

> **維運提示**：保持輸入/輸出路徑固定、流程無副作用（僅寫入目標檔案），才能放心交給排程器或 CI；一旦 Docker/主機切換造成自動欄位更新失敗，只需重新執行「主流程」code cell（最後一個 code cell），即可重建輸出。
