# Markdown Cleaner

流程分成兩步：
1. **Markdown 清理**：讀取來源檔、依設定移除粗體標記／註解、輸出新的 `.md`，同時把內容存入 `processed_text`。
2. **Word 匯出（可選）**：執行第二個程式格產生 `.docx`；若 `use_style_template = True` 且 `style_template_relative` 指向樣式檔/資料夾，會套用該樣式，並保留標題層級、第二個 `##` 起換頁。

先在「設定區」調整參數，再依序執行「Markdown 清理」與「Word 匯出」。


In [1]:
!pip install python-docx




In [2]:
from pathlib import Path

# ---- Settings: adjust as needed ----
# Markdown file to clean; a relative path is looked up under several candidate roots.
source_relative = Path('mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿.md')

# Destination of the cleaned Markdown and of the optional Word export;
# relative paths are resolved against the detected repository root.
output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.md')
word_output_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.docx')

# Word style template: a file or a folder; only consulted when use_style_template is True.
style_template_relative = Path('mkdocs/My_Notes/113憲判字11號判決/給老師的/word樣式文件夾/我的word樣式檔案')

# Apply the style template above during the Word export.
use_style_template = False

# Strip **bold** / __bold__ markers while keeping the enclosed text.
remove_bold = True
# Delete HTML comments (<!-- ... -->).
remove_comments = True
# Rewrite single-# headings as "# " and every deeper heading as "## ".
normalize_headings = False
# Produce the .docx in the Word export cell.
export_word = True

# --------------------------------


In [3]:
from pathlib import Path
import re
from typing import Optional

# Current working directory of the running kernel.
BASE_DIR = Path.cwd()
# Candidate base directories for relative paths, in lookup order: the working
# directory, its 'work' subfolder, then the ancestors of each.
possible_roots = [BASE_DIR, BASE_DIR / 'work'] + list(BASE_DIR.parents) + list((BASE_DIR / 'work').parents)


def resolve_path(relative) -> Optional[Path]:
    """Locate an existing filesystem entry for ``relative``.

    Args:
        relative: A path-like value or ``None``.

    Returns:
        ``None`` when ``relative`` is ``None`` or nothing exists at the
        location. An absolute path is returned unchanged when it exists.
        A relative path is joined to each entry of ``possible_roots`` in
        order, and the first resolved candidate that exists is returned.
    """
    if relative is None:
        return None
    target = Path(relative)
    if target.is_absolute():
        if target.exists():
            return target
        return None
    # Lazily resolve against each root so the first existing hit wins.
    resolved = ((Path(root) / target).resolve() for root in possible_roots)
    return next((hit for hit in resolved if hit.exists()), None)

# Locate the source Markdown; abort early when it cannot be found.
source_path = resolve_path(source_relative)
if not source_path:
    raise FileNotFoundError(f"找不到來源檔案：{source_relative}")

# Detect the repository root: the nearest ancestor of the source file that
# contains an 'mkdocs' directory, else the first candidate root that has one,
# else the current working directory.
repo_root = None
for parent in source_path.parents:
    candidate = parent / 'mkdocs'
    if candidate.exists():
        repo_root = parent
        break
if repo_root is None:
    for base in possible_roots:
        base = Path(base)
        if (base / 'mkdocs').exists():
            repo_root = base
            break
if repo_root is None:
    repo_root = BASE_DIR

# Resolve the Markdown output path and make sure its folder exists.
if output_relative.is_absolute():
    output_path = output_relative
else:
    output_path = (repo_root / output_relative).resolve()
output_path.parent.mkdir(parents=True, exist_ok=True)

# Resolve the Word output path only when the export is enabled and configured.
word_output_path = None
if export_word and word_output_relative:
    if word_output_relative.is_absolute():
        word_output_path = word_output_relative
    else:
        word_output_path = (repo_root / word_output_relative).resolve()
    word_output_path.parent.mkdir(parents=True, exist_ok=True)

# Resolve the style template. When lookup fails the configured path is kept
# as-is (resolved only if it exists), so it may point at a missing location.
style_template_resolved = None
if use_style_template and style_template_relative:
    candidate = resolve_path(style_template_relative)
    if candidate is None:
        candidate = Path(style_template_relative)
        if candidate.exists():
            candidate = candidate.resolve()
    style_template_resolved = candidate

# Load the source Markdown and report its size.
text = source_path.read_text(encoding='utf-8')
print(f"來源檔案：{source_path}")
print(f"原始字元數：{len(text):,}")

# processed_text is the cleaned result; the Word export cell reads it later.
processed_text = text
if remove_bold:
    # Unwrap **bold** and __bold__ spans, keeping only the inner text.
    # DOTALL lets a span continue across line breaks.
    processed_text = re.sub(r"\*\*(.+?)\*\*", lambda m: m.group(1), processed_text, flags=re.DOTALL)
    processed_text = re.sub(r"__(.+?)__", lambda m: m.group(1), processed_text, flags=re.DOTALL)

if remove_comments:
    # Drop HTML comments, including multi-line ones.
    processed_text = re.sub(r"<!--.*?-->", "", processed_text, flags=re.DOTALL)

if normalize_headings:
    def normalize(match):
        """Flatten a heading: one '#' stays level 1, anything deeper becomes level 2."""
        hashes, title = match.group(1), match.group(2).strip()
        if len(hashes) <= 1:
            return '# ' + title
        return '## ' + title
    # Only spaces/tabs may separate the hashes from the title. The previous
    # `\s*` also matched newlines, so a bare "##" line absorbed the next line
    # of text into the heading.
    processed_text = re.sub(r"^(#+)[ \t]*(.*)$", normalize, processed_text, flags=re.MULTILINE)

# Write the cleaned Markdown and report the new size.
output_path.write_text(processed_text, encoding='utf-8')
print(f"已輸出：{output_path}")
print(f"新檔字元數：{len(processed_text):,}")


來源檔案：/home/jovyan/work/mkdocs/My_Notes/113憲判字11號判決/法官學院書面稿.md
原始字元數：29,752
已輸出：/home/jovyan/work/mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.md
新檔字元數：29,138


In [4]:
import re
import shutil
import tempfile
from pathlib import Path
from typing import List, Optional, Tuple
from zipfile import ZipFile

try:
    from docx import Document
    from docx.enum.style import WD_STYLE_TYPE
    from docx.enum.text import WD_ALIGN_PARAGRAPH
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    from docx.shared import Pt, RGBColor
except ImportError:
    Document = None

# File suffixes accepted directly as Word style templates.
ALLOWED_TEMPLATE_SUFFIXES = {'.docx', '.dotx'}

# One list-item line: leading spaces/tabs, a bullet (*, +, -) or a numbered
# marker ("1." / "1)"), at least one whitespace character, then the content.
LIST_ITEM_RE = re.compile(
    r'^(?P<indent>[ \t]*)'
    r'(?P<marker>(?:[*+-])|(?:\d+[.)]))'
    r'\s+(?P<content>.*)$'
)
# Table divider row such as "| --- | :---: |": every cell is three or more
# dashes with optional alignment colons.
TABLE_DIVIDER_RE = re.compile(
    r'^\s*\|?\s*:?-{3,}:?\s*'
    r'(?:\|\s*:?-{3,}:?\s*)*'
    r'\|?\s*$'
)
# Cell separator: a pipe that is not preceded by a backslash.
TABLE_ROW_SPLIT_RE = re.compile(r'(?<!\\)\|')

def _indent_width(text: str) -> int:
    """Return the width of ``text``, counting a tab as 4 and any other character as 1."""
    return sum(4 if ch == '\t' else 1 for ch in text)

def _collapse_content(lines: List[str]) -> str:
    """Merge raw lines into paragraphs separated by single newlines.

    Consecutive non-blank lines are stripped and joined with one space;
    blank lines separate paragraphs. The final result is stripped.
    """
    paragraphs: List[str] = []
    pending: List[str] = []

    def flush() -> None:
        # Emit the paragraph gathered so far, if there is one.
        if pending:
            paragraphs.append(' '.join(pending).strip())
            pending.clear()

    for raw in lines:
        piece = raw.strip()
        if piece:
            pending.append(piece)
        else:
            flush()
    flush()
    return '\n'.join(paragraphs).strip()

def _split_table_row(line: str) -> List[str]:
    """Split one Markdown table row into stripped cell texts.

    One leading and one trailing border pipe are dropped. Cells are separated
    at pipes not preceded by a backslash, and every escaped pipe (``\\|``) is
    turned back into a literal ``|``.

    Returns:
        The list of cell strings, or ``[]`` for a blank line.
    """
    trimmed = line.strip()
    if not trimmed:
        return []
    if trimmed.startswith('|'):
        trimmed = trimmed[1:]
    # A trailing "\|" is an escaped pipe that belongs to the last cell,
    # not a row border, so it must not be stripped.
    if trimmed.endswith('|') and not trimmed.endswith('\\|'):
        trimmed = trimmed[:-1]
    parts = TABLE_ROW_SPLIT_RE.split(trimmed)
    # '\\|' spells backslash + pipe explicitly; the former '\|' relied on an
    # invalid escape sequence.
    return [part.replace('\\|', '|').strip() for part in parts]

def _is_table_divider(line: str) -> bool:
    """Return True when the stripped line matches ``TABLE_DIVIDER_RE``."""
    candidate = line.strip()
    return TABLE_DIVIDER_RE.match(candidate) is not None

def _is_table_start(lines: List[str], index: int) -> bool:
    """Return True when ``lines[index]`` contains a pipe and the next line is a table divider."""
    follower = index + 1
    if follower >= len(lines):
        return False
    if '|' not in lines[index]:
        return False
    return _is_table_divider(lines[follower])

def _parse_alignment(spec: str) -> str:
    """Map a divider cell to 'center', 'right' or 'left'.

    Colons on both ends mean 'center'; a colon only at the end means
    'right'; everything else is 'left'.
    """
    marker = spec.strip()
    colon_flags = (marker.startswith(':'), marker.endswith(':'))
    by_flags = {(True, True): 'center', (False, True): 'right'}
    return by_flags.get(colon_flags, 'left')

def _normalize_row_length(values: List[str], count: int, fill: str = '') -> List[str]:
    """Return a copy of ``values`` cut or padded with ``fill`` to ``count`` entries.

    The input list is never modified.
    """
    sized = list(values)[:count]
    shortfall = count - len(sized)
    if shortfall > 0:
        sized += [fill] * shortfall
    return sized

def _parse_table(lines: List[str], index: int) -> Tuple[dict, int]:
    """Parse the table whose header row is ``lines[index]``.

    The following line is read as the alignment divider. Body rows are
    consumed until a blank line, a line without a pipe, or a row that splits
    into no cells. Header, alignments and rows are all cut or padded to the
    widest row (at least one column).

    Returns:
        ``(block, next_index)`` where ``block`` has the keys ``type``
        ('table'), ``header``, ``align`` and ``rows``, and ``next_index`` is
        the first line not consumed.
    """
    header_cells = _split_table_row(lines[index])
    divider_at = index + 1
    spec_cells = _split_table_row(lines[divider_at]) if divider_at < len(lines) else []
    column_alignments = [_parse_alignment(cell) for cell in spec_cells]

    body: List[List[str]] = []
    cursor = index + 2
    while cursor < len(lines):
        candidate = lines[cursor]
        if not candidate.strip() or '|' not in candidate:
            break
        cells = _split_table_row(candidate)
        if not cells:
            break
        body.append(cells)
        cursor += 1

    # Widest row decides the column count; never fall below a single column.
    width = max([len(header_cells)] + [len(cells) for cells in body]) or 1

    table = {
        'type': 'table',
        'header': _normalize_row_length(header_cells, width),
        'align': _normalize_row_length(column_alignments, width, fill='left'),
        'rows': [_normalize_row_length(cells, width) for cells in body],
    }
    return (table, cursor)

def _parse_paragraph(lines: List[str], start: int) -> Tuple[dict, int]:
    """Collect a paragraph starting at ``lines[start]``.

    Lines are gathered until a blank line, a table start, a list item, a line
    beginning with '#', or a '---' line. If the very first line already ends
    the paragraph it is consumed anyway, so the returned index always moves
    past ``start`` while input remains.

    Returns:
        ``(block, next_index)`` with ``block`` holding ``type`` ('paragraph')
        and the collapsed ``text``.
    """
    def ends_paragraph(position: int) -> bool:
        raw = lines[position]
        bare = raw.strip()
        return bool(
            not bare
            or _is_table_start(lines, position)
            or LIST_ITEM_RE.match(raw)
            or bare.startswith('#')
            or bare == '---'
        )

    gathered: List[str] = []
    cursor = start
    while cursor < len(lines) and not ends_paragraph(cursor):
        gathered.append(lines[cursor])
        cursor += 1
    if not gathered and cursor < len(lines):
        # Take the offending line regardless, so the caller makes progress.
        gathered.append(lines[cursor])
        cursor += 1
    return ({'type': 'paragraph', 'text': _collapse_content(gathered)}, cursor)

def _parse_list(lines: List[str], start: int, base_indent: Optional[int] = None) -> Tuple[dict, int]:
    """Parse a (possibly nested) Markdown list beginning at ``lines[start]``.

    Args:
        lines: All lines of the document.
        start: Index of the first line to examine.
        base_indent: Indent width of this list's items; when ``None`` it is
            taken from the first item encountered.

    Returns:
        ``(block, next_index)``. ``block`` has ``type`` ('list'), ``ordered``
        and ``items``; each item carries its collapsed ``text`` and a list of
        non-empty nested list blocks under ``children``. ``next_index`` is the
        first line not consumed.
    """
    def _leading_width(raw: str) -> int:
        # Measure leading spaces AND tabs. LIST_ITEM_RE and _indent_width both
        # treat tabs as indentation, so continuation lines must as well.
        return _indent_width(raw[:len(raw) - len(raw.lstrip(' \t'))])

    items: List[dict] = []
    ordered: Optional[bool] = None
    i = start
    while i < len(lines):
        line = lines[i]
        match = LIST_ITEM_RE.match(line)
        if not match:
            # Non-item text indented past the list belongs to the last item.
            if items and line.strip():
                indent = _leading_width(line)
                if indent > (base_indent or 0):
                    items[-1]['content'].append(line.strip())
                    i += 1
                    continue
            # Blank lines are kept as paragraph separators inside the item.
            if items and not line.strip():
                items[-1]['content'].append('')
                i += 1
                continue
            break
        indent = _indent_width(match.group('indent'))
        marker = match.group('marker')
        content = match.group('content').rstrip()
        if base_indent is None:
            base_indent = indent
        if indent < base_indent:
            # Dedent: this item belongs to an enclosing list.
            break
        if indent > base_indent:
            if not items:
                break
            nested_block, new_index = _parse_list(lines, i, indent)
            items[-1].setdefault('children', []).append(nested_block)
            i = new_index
            continue
        # A switch between bullet and numbered markers ends this list.
        current_ordered = marker[0].isdigit()
        if ordered is None:
            ordered = current_ordered
        elif ordered != current_ordered:
            break
        items.append({'content': [content], 'children': []})
        i += 1
        # Absorb everything that belongs to the item just added.
        while i < len(lines):
            next_line = lines[i]
            if not next_line.strip():
                items[-1]['content'].append('')
                i += 1
                continue
            next_match = LIST_ITEM_RE.match(next_line)
            if next_match:
                next_indent = _indent_width(next_match.group('indent'))
                if next_indent <= base_indent:
                    break
                nested_block, new_index = _parse_list(lines, i, next_indent)
                items[-1].setdefault('children', []).append(nested_block)
                i = new_index
                continue
            next_indent = _leading_width(next_line)
            if next_indent > base_indent:
                items[-1]['content'].append(next_line.strip())
                i += 1
                continue
            break
    normalized_items: List[dict] = []
    for item in items:
        text = _collapse_content(item['content'])
        children = [child for child in item.get('children', []) if child.get('items')]
        normalized_items.append({'text': text, 'children': children})
    return ({'type': 'list', 'ordered': bool(ordered), 'items': normalized_items}, i)

def _parse_markdown_blocks(text: str) -> List[dict]:
    """Split Markdown text into a flat list of block dictionaries.

    Recognised block types, tried in this order for every non-blank line:
    table, list, heading (line starting with '#'), horizontal rule (a line
    that is exactly '---'), and paragraph as the fallback. Paragraphs with
    empty text are dropped.
    """
    source_lines = text.splitlines()
    total = len(source_lines)
    parsed: List[dict] = []
    cursor = 0
    while cursor < total:
        raw = source_lines[cursor]
        bare = raw.strip()
        if bare == '':
            cursor += 1
            continue

        if _is_table_start(source_lines, cursor):
            table, cursor = _parse_table(source_lines, cursor)
            parsed.append(table)
            continue

        item = LIST_ITEM_RE.match(raw)
        if item:
            listing, after = _parse_list(source_lines, cursor, _indent_width(item.group('indent')))
            # An empty list falls through to the remaining block types.
            if listing['items']:
                parsed.append(listing)
                cursor = after
                continue

        if bare.startswith('#'):
            depth = len(bare) - len(bare.lstrip('#'))
            parsed.append({'type': 'heading', 'level': depth, 'text': bare[depth:].strip()})
            cursor += 1
            continue

        if bare == '---':
            parsed.append({'type': 'horizontal_rule'})
            cursor += 1
            continue

        paragraph, after = _parse_paragraph(source_lines, cursor)
        if paragraph['text']:
            parsed.append(paragraph)
        # Always advance by at least one line.
        cursor = after if after > cursor + 1 else cursor + 1
    return parsed

def is_docx_archive(path: Path) -> bool:
    """Return True when ``path`` opens as a zip archive containing 'word/document.xml'.

    Any failure to open or list the archive yields False.
    """
    try:
        with ZipFile(path) as archive:
            members = archive.namelist()
    except Exception:
        return False
    return 'word/document.xml' in members


def select_template(path: Optional[Path]) -> Optional[Path]:
    """Pick the Word template file that ``path`` refers to.

    * ``None`` yields ``None``.
    * A directory yields its alphabetically first ``*.dotx`` file, or failing
      that its first ``*.docx`` file.
    * A path whose suffix is not an allowed template suffix is retried with
      '.dotx' and then '.docx'; if neither exists but the path itself is a
      docx archive, it is returned as-is.

    Returns:
        The existing template path, or ``None`` (after printing a warning)
        when nothing usable is found.
    """
    if path is None:
        return None
    chosen: Optional[Path] = Path(path)
    if chosen.is_dir():
        folder = chosen
        chosen = None
        for pattern in ('*.dotx', '*.docx'):
            matches = sorted(folder.glob(pattern))
            if matches:
                chosen = matches[0]
                break
    elif chosen.suffix.lower() not in ALLOWED_TEMPLATE_SUFFIXES:
        dotx_variant = chosen.with_suffix('.dotx')
        if dotx_variant.exists():
            chosen = dotx_variant
        else:
            docx_variant = chosen.with_suffix('.docx')
            if docx_variant.exists():
                chosen = docx_variant
            elif chosen.exists() and is_docx_archive(chosen):
                # Suffix-less (or oddly named) file that is still a Word archive.
                return chosen
            else:
                chosen = None
    if chosen is not None and chosen.exists():
        return chosen
    print(f"⚠️ 找不到樣式集：{path}")
    return None


def apply_template_styles(doc_path: Path, template_path: Path) -> None:
    """Overwrite the style-related parts of a .docx with those from a template.

    The parts 'word/styles.xml', 'word/numbering.xml' and
    'word/theme/theme1.xml' are taken from ``template_path`` whenever the
    template contains them; every other entry of ``doc_path`` is kept. The
    archive is rewritten next to the document and then moved over it.

    Args:
        doc_path: The .docx file to update in place.
        template_path: The .docx/.dotx archive supplying the style parts.

    Raises:
        OSError, zipfile.BadZipFile: when either archive cannot be read or the
            result cannot be written; ``doc_path`` is left untouched then.
    """
    # Local import: the file-level import only brings in ZipFile.
    from zipfile import ZIP_DEFLATED

    doc_path = Path(doc_path)
    template_path = Path(template_path)
    style_parts = ('word/styles.xml', 'word/numbering.xml', 'word/theme/theme1.xml')

    # Read just the parts we need instead of unpacking the whole template.
    with ZipFile(template_path, 'r') as ztpl:
        present = set(ztpl.namelist())
        replacements = {name: ztpl.read(name) for name in style_parts if name in present}

    tmp_output = doc_path.with_suffix('.tmp.docx')
    try:
        # Stream entries in their original order (keeps '[Content_Types].xml'
        # where the writer put it) and compress them; ZipFile's default would
        # store everything uncompressed.
        with ZipFile(doc_path, 'r') as zin, ZipFile(tmp_output, 'w', compression=ZIP_DEFLATED) as zout:
            written = set()
            for info in zin.infolist():
                name = info.filename
                if info.is_dir() or name in written:
                    continue
                payload = replacements[name] if name in replacements else zin.read(info)
                zout.writestr(name, payload)
                written.add(name)
            # Parts the document lacked but the template provides.
            for name, payload in replacements.items():
                if name not in written:
                    zout.writestr(name, payload)
        tmp_output.replace(doc_path)
    finally:
        # After a successful replace this is a no-op; on failure it removes
        # the partially written temporary file.
        tmp_output.unlink(missing_ok=True)


def export_to_docx(processed: str, destination: Path, template_path: Optional[Path]) -> None:
    """Render cleaned Markdown text into a Word document with python-docx.

    Headings, paragraphs, (nested) lists, tables and '---' rules are written
    in document order; a page break is inserted before every level-2 heading
    except the first. Text in the Normal/Heading styles and in all written
    runs is forced to black.

    Args:
        processed: The Markdown text to export.
        destination: Path of the .docx to create (parent folders are created).
        template_path: Optional style template file or folder; only used when
            the module-level ``use_style_template`` flag is true.

    Failures are reported with a printed message rather than raised, and a
    partially written output file is removed.
    """
    if Document is None:
        print('⚠️ 未安裝 python-docx，無法輸出 Word。請執行 `pip install python-docx` 後重試。')
        return
    dest_path = Path(destination)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    template = select_template(template_path) if use_style_template else None
    temp_template: Optional[Path] = None
    try:
        # A template without a recognised suffix is copied next to the output
        # under a '.dotx' name for the duration of the export.
        # NOTE(review): python-docx appears to reject packages whose main part
        # has the template (.dotx) content type — confirm that .dotx templates
        # can actually be opened by Document() here.
        if template and template.suffix.lower() not in ALLOWED_TEMPLATE_SUFFIXES and template.exists():
            temp_template = dest_path.with_suffix('.template.dotx')
            shutil.copyfile(template, temp_template)
            template = temp_template
        if template and template.exists():
            document = Document(str(template))
            body = document._element.body
            section_tag = qn('w:sectPr')
            for child in list(body):
                # Keep the body-level section properties: they hold the
                # template's page setup, and without any section python-docx
                # cannot compute the block width that add_table() needs.
                if child.tag == section_tag:
                    continue
                body.remove(child)
        else:
            document = Document()
        # Neutralise colour and indents on the base paragraph styles.
        target_styles = {'Normal', 'Heading 1', 'Heading 2', 'Heading 3', 'Heading 4', 'Heading 5', 'Heading 6'}
        if document.styles:
            for style in document.styles:
                if style.type == WD_STYLE_TYPE.PARAGRAPH and style.name in target_styles:
                    style.font.color.rgb = RGBColor(0, 0, 0)
                    style.paragraph_format.left_indent = None
                    style.paragraph_format.first_line_indent = None
        blocks = _parse_markdown_blocks(processed)
        heading2_seen = False
        alignment_map = {
            'left': WD_ALIGN_PARAGRAPH.LEFT,
            'center': WD_ALIGN_PARAGRAPH.CENTER,
            'right': WD_ALIGN_PARAGRAPH.RIGHT,
        }

        def _set_runs_black(paragraph) -> None:
            """Force every run of the paragraph to black."""
            for run in paragraph.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)

        def _write_text(paragraph, content: str, bold: bool = False) -> None:
            """Replace the paragraph's runs with ``content``; '\\n' becomes a line break."""
            for run in paragraph.runs[::-1]:
                paragraph._p.remove(run._element)
            if not content:
                # Empty content still gets a single-space run.
                run = paragraph.add_run(' ')
                if bold:
                    run.font.bold = True
            else:
                parts = content.split('\n')
                for idx, part in enumerate(parts):
                    if idx:
                        paragraph.add_run().add_break()
                    run = paragraph.add_run(part)
                    if bold:
                        run.font.bold = True
            _set_runs_black(paragraph)

        def _find_style_name(candidates: List[str], expected_type) -> Optional[str]:
            """Return the first candidate style name that exists with the expected type."""
            for name in candidates:
                try:
                    style = document.styles[name]
                except KeyError:
                    continue
                if style.type == expected_type:
                    return name
            return None

        bullet_style = _find_style_name(['List Bullet', 'Bulleted List', 'List Paragraph'], WD_STYLE_TYPE.PARAGRAPH)
        number_style = _find_style_name(['List Number', 'Numbered List', 'List Paragraph'], WD_STYLE_TYPE.PARAGRAPH)
        table_style_name = _find_style_name(['Table Grid', 'Table Normal', 'Light Grid'], WD_STYLE_TYPE.TABLE)

        def render_list(block: dict, level: int = 0) -> None:
            """Write a list block; nested lists are indented 18pt per level."""
            style_name = number_style if block.get('ordered') else bullet_style
            for item in block.get('items', []):
                text = item.get('text', '')
                para = document.add_paragraph(style=style_name) if style_name else document.add_paragraph()
                para.paragraph_format.first_line_indent = None
                if level:
                    para.paragraph_format.left_indent = Pt(18 * level)
                _write_text(para, text)
                for child in item.get('children', []):
                    if child.get('type') == 'list':
                        render_list(child, level + 1)

        def render_table(block: dict) -> None:
            """Write a table block with a bold header row and per-column alignment."""
            header = block.get('header', [])
            rows = block.get('rows', [])
            alignments = block.get('align', [])
            columns = len(header) if header else (len(rows[0]) if rows else 0)
            if columns == 0:
                return
            total_rows = len(rows) + (1 if header else 0)
            if total_rows == 0:
                return
            table = document.add_table(rows=total_rows, cols=columns)
            if table_style_name:
                table.style = table_style_name
            current_row = 0
            if header:
                for col_idx in range(columns):
                    cell = table.cell(0, col_idx)
                    paragraph = cell.paragraphs[0]
                    alignment_key = alignments[col_idx] if col_idx < len(alignments) else 'left'
                    paragraph.paragraph_format.alignment = alignment_map.get(alignment_key, WD_ALIGN_PARAGRAPH.LEFT)
                    _write_text(paragraph, header[col_idx] if col_idx < len(header) else '', bold=True)
                current_row = 1
            for row_idx, data in enumerate(rows):
                table_row = table.rows[current_row + row_idx]
                for col_idx in range(columns):
                    cell = table_row.cells[col_idx]
                    paragraph = cell.paragraphs[0]
                    alignment_key = alignments[col_idx] if col_idx < len(alignments) else 'left'
                    paragraph.paragraph_format.alignment = alignment_map.get(alignment_key, WD_ALIGN_PARAGRAPH.LEFT)
                    cell_text = data[col_idx] if col_idx < len(data) else ''
                    _write_text(paragraph, cell_text)

        for block in blocks:
            block_type = block.get('type')
            if block_type == 'horizontal_rule':
                # An empty paragraph with a bottom border stands in for the rule.
                para = document.add_paragraph('')
                p = para._p
                pPr = p.get_or_add_pPr()
                pBdr = OxmlElement('w:pBdr')
                bottom = OxmlElement('w:bottom')
                bottom.set(qn('w:val'), 'single')
                bottom.set(qn('w:sz'), '6')
                bottom.set(qn('w:space'), '1')
                bottom.set(qn('w:color'), '000000')
                pBdr.append(bottom)
                pPr.append(pBdr)
            elif block_type == 'heading':
                level = block.get('level', 1)
                doc_level = max(1, min(level, 9))
                if doc_level == 2:
                    # Page break before every level-2 heading except the first.
                    if heading2_seen:
                        document.add_page_break()
                    heading2_seen = True
                para = document.add_heading(block.get('text') or ' ', level=doc_level)
                _set_runs_black(para)
            elif block_type == 'list':
                if block.get('items'):
                    render_list(block, 0)
            elif block_type == 'table':
                render_table(block)
            elif block_type == 'paragraph':
                text = block.get('text', '').strip()
                if text:
                    para = document.add_paragraph()
                    _write_text(para, text)

        document.save(dest_path)
        if template and template.exists():
            apply_template_styles(dest_path, template)
        print(f"已輸出 Word（python-docx）：{dest_path}")
    except Exception as err:
        # Top-level boundary of the export: report and discard partial output.
        print(f'⚠️ Word 匯出失敗：{err}')
        if dest_path.exists():
            dest_path.unlink(missing_ok=True)
    finally:
        # Remove the temporary '.template.dotx' copy, if one was made.
        if temp_template and temp_template.exists():
            temp_template.unlink(missing_ok=True)

# Run the Word export only when it is enabled, an output path was resolved,
# and the Markdown cleaning cell has already produced processed_text.
if not export_word:
    print('Word 匯出已停用，未產生 .docx。')
elif word_output_path is None:
    print('⚠️ 未設定 word_output_relative，因此略過 Word 匯出。')
elif 'processed_text' not in globals():
    print('⚠️ 找不到 processed_text，請先執行 Markdown 清理程式格。')
else:
    export_to_docx(processed_text, word_output_path, style_template_resolved)


已輸出 Word（python-docx）：/home/jovyan/work/mkdocs/My_Notes/113憲判字11號判決/給老師的/遺產稅案例研討_法官學院書面稿_20251011.docx
