# Markdown Cleaner

處理流程：
- 讀取來源 Markdown（保留原標題層級 `#`）
- 依設定移除粗體、HTML 註解，並可統一標題層級（一級標題保留 `#`，其餘層級一律改為 `##`）
- 另存為設定區 `output_relative` 指定的 Markdown 檔案（檔名完全由設定決定）
- 可選擇同時輸出 Word（透過 pandoc 轉換；若系統未安裝 pandoc 會拋出錯誤，不會自動改用其他轉換方式）

可在下方「設定區」控制各功能是否啟用，包括是否輸出 Word。
完成設定後執行主程式格即可產生新檔。


In [1]:
from pathlib import Path

# ---- Settings: adjust as needed ----
# Source Markdown file. A relative path is looked up by resolve_path() against
# the candidate roots built in the main cell.
source_relative = Path('mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿.md')

# Destination of the cleaned Markdown and of the optional Word document.
# Relative paths are joined onto the detected repository root in the main cell.
output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.md')
word_output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.docx')

# Feature switches read by the main cell:
#   remove_bold        - unwrap **bold** spans, keeping the inner text
#   remove_comments    - delete <!-- ... --> HTML comments
#   normalize_headings - keep single-# headings as "#", rewrite deeper ones to "##"
#   export_word        - additionally convert the cleaned text to .docx via pandoc
remove_bold = True
remove_comments = True
normalize_headings = False
export_word = True

# --------------------------------


In [2]:
from pathlib import Path
import re
import shutil
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# Working directory at the time the cell runs; every lookup starts from here.
BASE_DIR = Path.cwd()

# Directories tried, in order, when resolving a relative path: the working
# directory, its "work" subdirectory, then every ancestor of each of those.
possible_roots = [
    BASE_DIR,
    BASE_DIR / "work",
    *BASE_DIR.parents,
    *(BASE_DIR / "work").parents,
]

# Prefix map for the WordprocessingML main namespace.
# NOTE(review): not referenced by the code visible in this cell.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}


def resolve_path(relative):
    """Return an existing filesystem path for *relative*, or ``None``.

    An absolute path is returned unchanged when it exists. A relative path
    is joined onto each entry of ``possible_roots`` in order, resolved, and
    the first candidate that exists is returned.
    """
    target = Path(relative)
    if target.is_absolute():
        if target.exists():
            return target
        return None

    # Lazy generator: candidates are resolved one at a time, so the search
    # stops at the first hit just like an explicit loop with an early return.
    resolved_candidates = (
        (Path(root) / target).resolve() for root in possible_roots
    )
    return next((c for c in resolved_candidates if c.exists()), None)



# def export_to_docx(text: str, destination: Path) -> None:
#     import subprocess, tempfile, shutil

#     # 先清除不可列印字元
#     import re
#     text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\u200B\uFEFF]', '', text)
#     text = text.replace('\r\n', '\n').replace('\r', '\n')

#     # 若有 pandoc 就只用 pandoc，不要 fallback
#     if shutil.which("pandoc") is None:
#         raise RuntimeError("pandoc 未安裝")

#     with tempfile.NamedTemporaryFile("w", suffix=".md", encoding="utf-8", delete=False) as tmp:
#         tmp.write(text)
#         md = tmp.name
#     try:
#         subprocess.run([
#             "pandoc", md,
#             "-f", "gfm+pipe_tables+lists_without_preceding_blankline",
#             "-o", str(destination),
#             "--wrap=none"
#         ], check=True)
#         print(f"已輸出 Word（pandoc）：{destination}")
#     finally:
#         Path(md).unlink(missing_ok=True)


def export_to_docx(text: str, destination: Path) -> None:
    """Convert Markdown *text* into a Word document at *destination* via pandoc.

    The text is sanitised, written to a temporary ``.md`` file, and handed to
    the ``pandoc`` executable. The temporary file is removed afterwards
    whether or not the conversion succeeded.

    Args:
        text: Markdown content to convert.
        destination: Path of the ``.docx`` file pandoc should write.

    Raises:
        UnicodeEncodeError: If the sanitised text cannot be encoded as strict
            UTF-8.
        RuntimeError: If no ``pandoc`` executable is found on ``PATH``.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status;
            its output is printed first.
    """
    import re
    import shutil
    import subprocess
    import tempfile

    # Drop control characters, zero-width spaces and BOMs that could upset
    # pandoc, then normalise every line ending to "\n".
    cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\u200B\uFEFF]', '', text)
    cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')

    # Validation only: the encoded bytes are discarded, but text that is not
    # representable as UTF-8 fails here, before anything touches the disk.
    cleaned.encode('utf-8', 'strict')

    if shutil.which("pandoc") is None:
        raise RuntimeError("pandoc 未安裝")

    # delete=False so the file outlives the handle and pandoc can open it.
    with tempfile.NamedTemporaryFile("w", suffix=".md", encoding="utf-8", delete=False) as handle:
        handle.write(cleaned)
        temp_markdown = handle.name

    try:
        command = [
            "pandoc", temp_markdown,
            "-f", "markdown+pipe_tables+lists_without_preceding_blankline+hard_line_breaks",
            "-t", "docx",
            "-o", str(destination),
            "--wrap=none"
        ]
        completed = subprocess.run(command, capture_output=True, text=True)
        if completed.returncode == 0:
            print(f"已輸出 Word（pandoc）：{destination}")
            return

        # Show pandoc's own diagnostics before raising so the cause is visible.
        print("⚠️ Pandoc 錯誤訊息：")
        print(completed.stderr or completed.stdout)
        raise subprocess.CalledProcessError(
            completed.returncode, command, completed.stdout, completed.stderr
        )
    finally:
        Path(temp_markdown).unlink(missing_ok=True)

# Locate the source document; abort early when it cannot be found.
source_path = resolve_path(source_relative)
if source_path is None:
    raise FileNotFoundError(f"找不到來源檔案：{source_relative}")

# Repository root = nearest ancestor of the source file that contains an
# "mkdocs" directory.
repo_root = next(
    (ancestor for ancestor in source_path.parents if (ancestor / "mkdocs").exists()),
    None,
)
if repo_root is None:
    # Otherwise take the first candidate root that contains "mkdocs", and
    # fall back to the working directory when none does.
    repo_root = next(
        (Path(root) for root in possible_roots if (Path(root) / "mkdocs").exists()),
        BASE_DIR,
    )

# Markdown destination: absolute paths are used as given, relative ones are
# anchored at the repository root. The parent directory is created on demand.
output_path = (
    output_relative
    if output_relative.is_absolute()
    else (repo_root / output_relative).resolve()
)
output_path.parent.mkdir(parents=True, exist_ok=True)

# Word destination is resolved only when export is enabled and a path is
# configured; otherwise it stays None.
word_output_path = None
if export_word and word_output_relative:
    word_output_path = (
        word_output_relative
        if word_output_relative.is_absolute()
        else (repo_root / word_output_relative).resolve()
    )
    word_output_path.parent.mkdir(parents=True, exist_ok=True)

# Load the source document and report its size before any cleaning.
text = source_path.read_text(encoding="utf-8")
print(f"來源檔案：{source_path}")
print(f"原始字元數：{len(text):,}")

processed = text


if remove_bold:
    # Unwrap **bold** spans that sit on a single line, keeping the inner
    # text. The lookarounds skip runs of three or more asterisks.
    processed = re.sub(r'(?<!\*)\*\*([^\n*]+)\*\*(?!\*)', r'\1', processed)


if remove_comments:
    # DOTALL lets one comment span several lines; the non-greedy body stops
    # at the first closing "-->".
    processed = re.sub(r"<!--.*?-->", "", processed, flags=re.DOTALL)

if normalize_headings:
    def normalize(match):
        """Return the heading rewritten as "# title" or "## title"."""
        hashes, title = match.group(1), match.group(2).strip()
        if len(hashes) <= 1:
            return "# " + title
        return "## " + title
    # Separator is "[ \t]*" rather than "\s*": "\s" also matches newlines, so
    # an empty heading line ("##" followed by a line break) used to pull the
    # next line into the heading. Other whitespace around the title is still
    # removed by strip() in normalize().
    processed = re.sub(r"^(#+)[ \t]*(.*)$", normalize, processed, flags=re.MULTILINE)

# Persist the cleaned Markdown and report the new size.
output_path.write_text(processed, encoding="utf-8")
print(f"已輸出：{output_path}")
print(f"新檔字元數：{len(processed):,}")

if export_word and word_output_path is None:
    print('⚠️ 未設定 word_output_relative，因此略過 Word 匯出。')
elif export_word:
    # Strip runs of two or more trailing spaces/tabs at line ends, leaving
    # other spaces and blank lines alone. This runs after the Markdown file
    # was written, so only the text handed to the Word export is affected.
    processed = re.sub(r'[ \t]{2,}$', '', processed, flags=re.MULTILINE)
    export_to_docx(processed, word_output_path)


來源檔案：/home/jovyan/work/mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿.md
原始字元數：29,752
已輸出：/home/jovyan/work/mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.md
新檔字元數：29,138
已輸出 Word（pandoc）：/home/jovyan/work/mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.docx
