# 租稅法總論稿件合併

這個 notebook 會將講義資料夾（預設為 `mkdocs/My_Notes/課_二89_租稅法總論`，可透過 `CONFIG_INPUT_PATH` 或 `SOURCE_CANDIDATES` 調整）中的逐次講義整合成單一的 Word 檔案，並依據每份檔案的 YAML 前言建立段落標題。


In [4]:
# 如環境尚未安裝相關套件，可以先執行這個區塊。
%pip install -q python-docx pyyaml
# %pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [5]:
from pathlib import Path
from typing import List, Optional, Union

# === Basic environment setup ===
# Absolute path of the current working directory; it seeds the list of
# search roots used to resolve project-relative paths.
BASE_DIR = Path.cwd().resolve()


def build_possible_roots(*paths: Path) -> List[Path]:
    """Return every given path plus all of its ancestors, resolved and de-duplicated.

    Args:
        *paths: Starting locations. ``None`` entries are ignored.

    Returns:
        Resolved paths in discovery order: each starting path first, followed
        by its parents from nearest to farthest. A path that was already
        collected is not added a second time.
    """
    ordered: List[Path] = []
    for start in paths:
        if start is None:
            continue
        expanded = start.expanduser()
        for ancestor in (expanded, *expanded.parents):
            absolute = ancestor.resolve()
            if absolute in ordered:
                continue
            ordered.append(absolute)
    return ordered


# Directories that relative paths are tried against: BASE_DIR, BASE_DIR/work,
# and every ancestor of either.
possible_roots = build_possible_roots(BASE_DIR, BASE_DIR / 'work')


def ensure_path(value: Union[Path, str, None]) -> Optional[Path]:
    """Coerce ``value`` to a ``Path``.

    Args:
        value: A ``Path``, a string, or ``None``.

    Returns:
        ``None`` for ``None`` or the empty string, the same object when a
        ``Path`` is passed in, otherwise ``Path(value)``.
    """
    if isinstance(value, Path):
        return value
    nothing_given = value is None or value == ""
    return None if nothing_given else Path(value)


def resolve_existing(target: Union[Path, str, None]) -> Optional[Path]:
    """Resolve ``target`` to an existing filesystem path.

    Args:
        target: Absolute or relative location; ``None`` or ``""`` means "not set".

    Returns:
        The resolved path when it exists. An absolute target is checked as-is;
        a relative target is tried against each entry of ``possible_roots`` in
        order and the first existing hit wins. ``None`` when nothing exists or
        no target was given.
    """
    wanted = ensure_path(target)
    if wanted is None:
        return None
    wanted = wanted.expanduser()

    if wanted.is_absolute():
        absolute = wanted.resolve()
        if absolute.exists():
            return absolute
        return None

    # Lazily walk the search roots so probing stops at the first existing hit.
    hits = (
        located
        for located in ((root / wanted).resolve() for root in possible_roots)
        if located.exists()
    )
    return next(hits, None)


# === User-adjustable parameters ===
CONFIG_INPUT_PATH: Union[Path, str, None] = None  # Source folder to use (relative or absolute path)
CONFIG_OUTPUT_PATH: Union[Path, str, None] = None  # Output file name or path
OUTPUT_SUFFIX = '_合併測試.docx'
WORD_TEMPLATE_PATH: Union[Path, str, None] = Path('mkdocs/My_Notes/word樣式文件夾/Doc1.dotx')
EAST_ASIA_FONT_OVERRIDE: Optional[str] = None  # None keeps the template's font
AUTO_INSERT_TOC = True
TOC_TITLE = '目錄'
TOC_MAX_LEVEL = 6  # Deepest heading level included in the automatic table of contents
TOC_LEVEL_RANGE = f'1-{TOC_MAX_LEVEL}'
TOC_HEADING_LEVEL = 1
TOC_PAGE_BREAK_AFTER = True
AUTO_UPDATE_WORD_FIELDS = True


# Folders probed, in order, when CONFIG_INPUT_PATH is unset or does not exist.
SOURCE_CANDIDATES = [
    Path('mkdocs/My_Notes/課_二89_租稅法總論'),
    # Path('mkdocs/My_Notes/_1121_租稅法總論'),
]


# 

# Resolve the lecture-notes folder: CONFIG_INPUT_PATH wins when it exists,
# otherwise the first existing entry of SOURCE_CANDIDATES is used.
SOURCE_RELATIVE = None
SOURCE_DIR = None

preferred_input = resolve_existing(CONFIG_INPUT_PATH)
if preferred_input is not None:
    SOURCE_RELATIVE = ensure_path(CONFIG_INPUT_PATH)
    SOURCE_DIR = preferred_input
else:
    for candidate in SOURCE_CANDIDATES:
        resolved = resolve_existing(candidate)
        if resolved is not None:
            SOURCE_RELATIVE = candidate
            SOURCE_DIR = resolved
            break

if SOURCE_DIR is None:
    # Report every location that was actually probed, including a configured
    # CONFIG_INPUT_PATH that failed to resolve, so the message is not misleading.
    attempted = list(SOURCE_CANDIDATES)
    configured_input = ensure_path(CONFIG_INPUT_PATH)
    if configured_input is not None:
        attempted.insert(0, configured_input)
    joined = " - ".join(str(c) for c in attempted)
    raise FileNotFoundError('找不到講義資料夾，已嘗試: - ' + joined)


# Resolve the optional Word reference template. A template that is configured
# but cannot be found raises instead of being silently skipped.
TEMPLATE_RELATIVE: Optional[Path] = None
TEMPLATE_PATH: Optional[Path] = None
if WORD_TEMPLATE_PATH is not None:
    template_resolved = resolve_existing(WORD_TEMPLATE_PATH)
    if template_resolved is None:
        raise FileNotFoundError(f"找不到 Word 範本：{WORD_TEMPLATE_PATH}")
    TEMPLATE_RELATIVE = ensure_path(WORD_TEMPLATE_PATH)
    TEMPLATE_PATH = template_resolved


# Placeholder; the real value is assigned after derive_output_path() is defined.
OUTPUT_PATH = None

def derive_output_path(source_dir: Path, override: Union[Path, str, None] = None, suffix: str = OUTPUT_SUFFIX) -> Path:
    """Work out where the merged ``.docx`` should be written.

    Args:
        source_dir: Folder holding the lecture notes; also the base for
            relative output locations.
        override: Explicit output file name or path. A missing extension is
            filled in with ``.docx``; a relative value is placed under
            ``source_dir``.
        suffix: Text appended to the folder name when no override is given.

    Returns:
        The resolved output path.
    """
    explicit = ensure_path(override)

    if explicit is None:
        # Default name: the folder name (minus a leading "Filtered_") plus the suffix.
        stem = (source_dir.name or '合併').removeprefix('Filtered_')
        candidate_name = f'{stem}{suffix}'
        if not candidate_name.lower().endswith('.docx'):
            candidate_name = f'{candidate_name}.docx'
        return (source_dir / candidate_name).resolve()

    explicit = explicit.expanduser()
    if not explicit.suffix:
        explicit = explicit.with_suffix('.docx')
    destination = explicit if explicit.is_absolute() else source_dir / explicit
    return destination.resolve()


# Compute the final output location, then echo the effective configuration.
OUTPUT_PATH = derive_output_path(SOURCE_DIR, CONFIG_OUTPUT_PATH)

print(f'Working directory : {BASE_DIR}')
print(f'使用資料夾        : {SOURCE_RELATIVE}')
print(f'Source directory  : {SOURCE_DIR}')
print(f'Word template     : {TEMPLATE_RELATIVE}')
print(f'Output file       : {OUTPUT_PATH}')
print(f'Auto TOC enabled  : {AUTO_INSERT_TOC}')
print(f'East Asia font    : {EAST_ASIA_FONT_OVERRIDE}')
print(f'Auto field update : {AUTO_UPDATE_WORD_FIELDS}')


Working directory : /Users/iw/Documents/NTU/1141/1141_Tax_Ko/notebooks
使用資料夾        : mkdocs/My_Notes/課_二89_租稅法總論
Source directory  : /Users/iw/Documents/NTU/1141/1141_Tax_Ko/mkdocs/My_Notes/課_二89_租稅法總論
Word template     : mkdocs/My_Notes/word樣式文件夾/Doc1.dotx
Output file       : /Users/iw/Documents/NTU/1141/1141_Tax_Ko/mkdocs/My_Notes/課_二89_租稅法總論/課_二89_租稅法總論_合併測試.docx
Auto TOC enabled  : True
East Asia font    : None
Auto field update : True


In [6]:
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional
import sys
import re
import shutil
import subprocess
import tempfile
import importlib
import inspect

from docx import Document

import importlib.util

# Re-read the override from the notebook namespace; it falls back to None
# when the name has not been defined yet.
EAST_ASIA_FONT_OVERRIDE = globals().get('EAST_ASIA_FONT_OVERRIDE', None)

# Make the notebooks/lib and lib packages importable (e.g. inside a container)
# by adding the working directory, its ancestors, and their "notebooks"
# sub-folders to sys.path.
for _search_root in [Path.cwd()] + list(Path.cwd().parents):
    # A tuple (not a set) keeps the insertion order, and therefore the import
    # precedence, identical on every run regardless of hash randomisation.
    for _package_root in (_search_root, _search_root / 'notebooks'):
        if _package_root.is_dir() and str(_package_root) not in sys.path:
            sys.path.insert(0, str(_package_root))


def _import_word_module_from(path: Path):
    """Load the file at ``path`` as a module named ``word_doc_pipeline``.

    Args:
        path: Location of the Python source file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for ``path``
            (for example, the file does not have a Python source suffix).
    """
    module_name = 'word_doc_pipeline'
    spec = importlib.util.spec_from_file_location(module_name, path)
    # spec_from_file_location returns None for unsupported files; fail with a
    # clear ImportError instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f'無法載入 word_doc_pipeline 模組：{path}', path=str(path))

    module = importlib.util.module_from_spec(spec)
    # Register before executing, as the importlib recipe for loading a source
    # file does, so code that looks the module up by name can find it.
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module


def _load_word_pipeline_module():
    """Locate ``word_doc_pipeline.py`` on disk and load it.

    The working directory and each of its ancestors are searched twice: first
    for a few well-known relative locations, then recursively for any file
    with that name.

    Returns:
        The module produced by ``_import_word_module_from``.

    Raises:
        ModuleNotFoundError: If no matching file is found.
    """
    search_roots = [Path.cwd(), *Path.cwd().parents]
    known_locations = (
        Path('lib/word_doc_pipeline.py'),
        Path('notebooks/lib/word_doc_pipeline.py'),
        Path('scripts/word_doc_pipeline.py'),
    )

    # Pass 1: cheap checks of the conventional locations.
    for base in search_roots:
        for relative in known_locations:
            located = (base / relative).resolve()
            if located.is_file():
                return _import_word_module_from(located)

    # Pass 2: recursive search, taking the first hit under each root.
    # NOTE(review): the roots include every ancestor directory, so this walk
    # can be very broad — confirm that is acceptable.
    for base in search_roots:
        first_hit = next(iter(base.rglob('word_doc_pipeline.py')), None)
        if first_hit is not None:
            return _import_word_module_from(first_hit)

    raise ModuleNotFoundError(
        "找不到 word_doc_pipeline.py，請確認 notebooks/lib 或 scripts 目錄存在並已掛載。"
    )


# Dotted module names tried, in order, through the regular import system.
MODULE_CANDIDATES = [
    'notebooks.lib.word_doc_pipeline',
    'lib.word_doc_pipeline',
    'scripts.word_doc_pipeline',
]

_loaded_module = None
for module_name in MODULE_CANDIDATES:
    try:
        module = __import__(module_name, fromlist=['insert_table_of_contents'])
        # Re-execute the module; presumably so edits to the pipeline file are
        # picked up when this cell is run again.
        _loaded_module = importlib.reload(module)
        break
    except ModuleNotFoundError:
        continue

# None of the dotted names could be imported: search for the file on disk.
if _loaded_module is None:
    _loaded_module = _load_word_pipeline_module()

# Push the notebook's font override into the pipeline module, then bind the
# helpers used by the rest of this cell.
_loaded_module.DEFAULT_EAST_ASIA_FONT = EAST_ASIA_FONT_OVERRIDE
insert_table_of_contents = _loaded_module.insert_table_of_contents
update_docx_fields_with_word = _loaded_module.update_docx_fields_with_word
DEFAULT_EAST_ASIA_FONT = _loaded_module.DEFAULT_EAST_ASIA_FONT


def get_east_asia_font_override() -> Optional[str]:
    """Return the effective East Asian font override and sync it to the pipeline module.

    The value is read from the module-level ``EAST_ASIA_FONT_OVERRIDE`` (with
    ``DEFAULT_EAST_ASIA_FONT`` as the fallback when that name is missing). An
    empty string or ``False`` is normalised to ``None``.

    Returns:
        The font name, or ``None`` when no override applies.
    """
    configured = globals().get('EAST_ASIA_FONT_OVERRIDE', DEFAULT_EAST_ASIA_FONT)
    font = None if (configured == "" or configured is False) else configured
    # Keep the pipeline module's default in step with what is returned.
    _loaded_module.DEFAULT_EAST_ASIA_FONT = font
    return font


# === Global settings ===
# NOTE(review): this rebinds the DEFAULT_EAST_ASIA_FONT name that was assigned
# from the pipeline module earlier in this cell — confirm the shadowing is intended.
DEFAULT_EAST_ASIA_FONT = '標楷體'
PREFERRED_METADATA_KEYS = ['課程', '日期', '周次', '節次']  # Keys that format_metadata lists first
SUPPORTED_EXTENSIONS = ('.md',)  # File suffixes collected from the source folder
HEADING_PATTERN = re.compile(r'^(#{1,6})\s+(.*)$')  # ATX heading: 1-6 '#', whitespace, then the title
CODE_FENCE_PATTERN = re.compile(r'^(```+|~~~+)')  # Line starting with 3+ backticks or 3+ tildes
SHOW_METADATA_AFTER_HEADING = False  # When True, a metadata bullet list follows each lecture heading
TOC_PLACEHOLDER = '[[DOCX_TOC_PLACEHOLDER]]'  # Marker paragraph later replaced by the table of contents
COMPILED_TITLE = '租稅法總論講義彙編'  # Level-1 title of the merged document


@dataclass
class Entry:
    """One lecture-note file after its front matter has been split off."""

    # File the entry was read from.
    path: Path
    # Key/value pairs parsed from the front matter (empty when there is none).
    meta: Dict[str, str]
    # Markdown body with the front matter removed.
    content: str


def parse_front_matter(raw_text: str) -> tuple[Dict[str, str], str]:
    """Split a Markdown document into front-matter metadata and body.

    The front matter is a leading block delimited by ``---`` lines and is read
    as simple ``key: value`` lines (ASCII ``:`` or full-width ``：`` as the
    separator). Blank lines, ``#`` comment lines and lines without a separator
    are ignored, as is any key containing ``課程``.

    Args:
        raw_text: Full file content; a leading BOM is tolerated.

    Returns:
        ``(metadata, body)``. ``metadata`` is empty when no front matter is
        present; ``body`` is stripped of surrounding whitespace.
    """
    raw_text = raw_text.lstrip('\ufeff')
    match = re.match(r'^---\s*\n(.*?)\n---\s*\n?', raw_text, flags=re.DOTALL)
    if not match:
        return {}, raw_text.strip()

    meta: Dict[str, str] = {}
    for line in match.group(1).splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # Split at whichever colon comes first. Rewriting the first full-width
        # colon beforehand would corrupt values such as "備註: 下午：三點".
        pieces = re.split(r'[:：]', line, maxsplit=1)
        if len(pieces) != 2:
            continue
        key, value = pieces[0].strip(), pieces[1].strip()
        if '課程' in key:
            continue
        meta[key] = value

    return meta, raw_text[match.end():].strip()


def build_heading(metadata: Dict[str, str], fallback_label: str) -> str:
    """Compose the per-lecture heading text from front-matter metadata.

    Args:
        metadata: Parsed front matter (course-related keys already removed).
        fallback_label: Text used when the metadata yields nothing usable.

    Returns:
        Week, session and date joined with ``｜`` when any of them is present;
        otherwise all non-blank metadata values joined the same way; otherwise
        ``fallback_label``.
    """
    if not metadata:
        return fallback_label

    segments: List[str] = []
    # Week and session get a "第…週" / "第…節" wrapper unless already prefixed.
    for field_name, unit in (('周次', '週'), ('節次', '節')):
        raw = metadata.get(field_name)
        if not raw:
            continue
        text = str(raw)
        segments.append(text if text.startswith('第') else f"第{text}{unit}")

    date_value = metadata.get('日期')
    if date_value:
        segments.append(str(date_value))

    if segments:
        return '｜'.join(segments)

    leftovers = [str(item).strip() for item in metadata.values() if item and str(item).strip()]
    if leftovers:
        return '｜'.join(leftovers)
    return fallback_label


def collect_entries(source_dir: Path) -> List[Entry]:
    """Read every supported file directly inside ``source_dir`` into ``Entry`` objects.

    Args:
        source_dir: Folder containing the lecture notes.

    Returns:
        One entry per file, ordered by sorted path.

    Raises:
        FileNotFoundError: If ``source_dir`` does not exist.
        RuntimeError: If the folder holds no supported file.
    """
    if not source_dir.exists():
        raise FileNotFoundError(f'找不到資料夾: {source_dir}')

    matched: List[Path] = []
    for extension in SUPPORTED_EXTENSIONS:
        matched.extend(source_dir.glob(f'*{extension}'))
    matched.sort()
    if not matched:
        raise RuntimeError('資料夾中沒有找到任何支援的檔案 (.md)')

    loaded: List[Entry] = []
    for note_file in matched:
        front_matter, body_text = parse_front_matter(note_file.read_text(encoding='utf-8'))
        loaded.append(Entry(path=note_file, meta=front_matter or {}, content=body_text))
    return loaded


def format_metadata(metadata: Dict[str, str]) -> str:
    """Render metadata as a Markdown bullet list.

    Keys containing ``課程`` (after removing colons and surrounding
    whitespace) are dropped. Keys named in ``PREFERRED_METADATA_KEYS`` come
    first, in that list's order; the remaining keys follow in their original
    order.

    Args:
        metadata: Parsed front matter.

    Returns:
        One ``- **key**：value`` line per item, or ``''`` when nothing is left.
    """
    visible = {}
    for key, value in metadata.items():
        normalized_key = str(key).replace('：', '').replace(':', '').strip()
        if '課程' in normalized_key:
            continue
        visible[key] = value
    if not visible:
        return ''

    leading = [key for key in PREFERRED_METADATA_KEYS if key in visible]
    trailing = [key for key in visible if key not in PREFERRED_METADATA_KEYS]
    return '\n'.join(f'- **{key}**：{visible[key]}' for key in leading + trailing)


def shift_heading_levels(markdown: str, offset: int = 2) -> str:
    """Demote (or promote) every ATX heading in ``markdown`` by ``offset`` levels.

    Lines inside fenced code blocks are left untouched. A fence is closed only
    by a line that uses the same fence character, is at least as long as the
    opening run, and carries no trailing text — so a ``~~~`` line inside a
    backtick block does not end the block.

    Args:
        markdown: Markdown source.
        offset: Levels to add to each heading; ``0`` returns the input unchanged.

    Returns:
        The adjusted Markdown with heading levels clamped to 1-6. Lines are
        re-joined with ``\\n``, so a trailing newline is not preserved.
    """
    if offset == 0:
        return markdown

    adjusted: List[str] = []
    open_fence: Optional[str] = None  # fence run that opened the current code block

    for line in markdown.splitlines():
        fence = CODE_FENCE_PATTERN.match(line)

        if open_fence is not None:
            closes_block = (
                fence is not None
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not line[fence.end():].strip()
            )
            if closes_block:
                open_fence = None
            adjusted.append(line)
            continue

        if fence:
            open_fence = fence.group(1)
            adjusted.append(line)
            continue

        match = HEADING_PATTERN.match(line)
        if not match:
            adjusted.append(line)
            continue

        # Clamp on both sides so a negative offset cannot produce a heading
        # with zero '#' characters.
        new_level = max(1, min(len(match.group(1)) + offset, 6))
        adjusted.append('#' * new_level + ' ' + match.group(2).strip())
    return '\n'.join(adjusted)


def assemble_markdown(entries: List[Entry]) -> str:
    """Merge all entries into a single Markdown document.

    Layout: the compiled title, then either the TOC placeholder or a
    ``\\newpage`` marker, then each entry as a level-2 heading followed by its
    body (headings demoted by two levels). Entries are separated by
    ``\\newpage``.

    Args:
        entries: Lecture entries in output order.

    Returns:
        The merged Markdown, stripped and terminated by one newline.
    """
    lines: List[str] = [f'# {COMPILED_TITLE}', '']
    lines += [TOC_PLACEHOLDER, ''] if AUTO_INSERT_TOC else ['\\newpage', '']

    last_position = len(entries) - 1
    for position, item in enumerate(entries):
        lines += [f'## {build_heading(item.meta, item.path.stem)}', '']

        if SHOW_METADATA_AFTER_HEADING:
            rendered_meta = format_metadata(item.meta)
            if rendered_meta:
                lines += [rendered_meta, '']

        if item.content:
            lines.append(shift_heading_levels(item.content, offset=2))

        # Page break between entries, but not after the final one.
        if position < last_position:
            lines += ['', '\\newpage', '']

    return '\n'.join(lines).strip() + '\n'


def render_markdown_to_docx(markdown_text: str, destination: Path, template_path: Optional[Path]) -> None:
    """Convert Markdown text to a ``.docx`` file by invoking pandoc.

    Args:
        markdown_text: Markdown source to convert.
        destination: Output file; missing parent folders are created.
        template_path: Optional reference document passed to pandoc via
            ``--reference-doc``.

    Raises:
        RuntimeError: If the ``pandoc`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    if shutil.which('pandoc') is None:
        raise RuntimeError('pandoc 未安裝')

    destination.parent.mkdir(parents=True, exist_ok=True)
    # Drop runs of two or more trailing spaces/tabs at line ends.
    cleaned = re.sub(r'[ \t]{2,}$', '', markdown_text, flags=re.MULTILINE)

    tmp = tempfile.NamedTemporaryFile('w', suffix='.md', encoding='utf-8', delete=False)
    tmp_path = Path(tmp.name)
    try:
        # Writing happens inside the try block so the temporary file is removed
        # even when the write itself fails; the file is closed before pandoc reads it.
        with tmp:
            tmp.write(cleaned)

        cmd = [
            'pandoc', str(tmp_path),
            '-f', 'markdown+pipe_tables+lists_without_preceding_blankline+hard_line_breaks+raw_tex+yaml_metadata_block',
            '-t', 'docx',
            '-o', str(destination),
            '--wrap=none',
        ]
        if template_path:
            cmd.extend(['--reference-doc', str(template_path)])
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print('⚠️ Pandoc 錯誤訊息：')
            print(result.stderr or result.stdout)
            raise subprocess.CalledProcessError(result.returncode, cmd, result.stdout, result.stderr)
        print(f"已輸出 Word（pandoc）：{destination}")
    finally:
        tmp_path.unlink(missing_ok=True)


def finalize_docx(docx_path: Path, *, auto_update: bool) -> None:
    """Post-process the pandoc output: insert the TOC, drop placeholders, save.

    Args:
        docx_path: Document to modify in place.
        auto_update: Forwarded as ``enabled`` to ``update_docx_fields_with_word``.
    """
    document = Document(docx_path)
    toc_markers = [
        paragraph for paragraph in document.paragraphs
        if paragraph.text.strip() == TOC_PLACEHOLDER
    ]

    east_asia_font = get_east_asia_font_override()
    if AUTO_INSERT_TOC:
        # Anchor: the first placeholder, else the first paragraph, else a new one.
        if toc_markers:
            anchor = toc_markers[0]
        elif document.paragraphs:
            anchor = document.paragraphs[0]
        else:
            anchor = document.add_paragraph()

        options = {
            'title': TOC_TITLE,
            'level_range': TOC_LEVEL_RANGE,
            'heading_level': TOC_HEADING_LEVEL,
            'east_asia_font': east_asia_font,
            'page_break_after': TOC_PAGE_BREAK_AFTER,
        }
        # Pass the anchor only when the loaded helper accepts that parameter.
        if 'target_paragraph' in inspect.signature(insert_table_of_contents).parameters:
            options['target_paragraph'] = anchor
        insert_table_of_contents(document, **options)

    # Remove every placeholder paragraph from the document body.
    for marker in toc_markers:
        element = marker._element
        container = element.getparent()
        if container is None:
            continue
        container.remove(element)

    document.save(docx_path)
    update_docx_fields_with_word(docx_path, enabled=auto_update)


def compile_word_document():
    """Run the whole pipeline: collect notes, merge, render with pandoc, finalize."""
    merged_markdown = assemble_markdown(collect_entries(SOURCE_DIR))
    render_markdown_to_docx(merged_markdown, OUTPUT_PATH, TEMPLATE_PATH)
    finalize_docx(OUTPUT_PATH, auto_update=AUTO_UPDATE_WORD_FIELDS)
    print(f'完成：{OUTPUT_PATH}')


# Build the merged document when the cell is executed.
compile_word_document()



已輸出 Word（pandoc）：/Users/iw/Documents/NTU/1141/1141_Tax_Ko/mkdocs/My_Notes/課_二89_租稅法總論/課_二89_租稅法總論_合併測試.docx
已透過 Word 更新欄位。
完成：/Users/iw/Documents/NTU/1141/1141_Tax_Ko/mkdocs/My_Notes/課_二89_租稅法總論/課_二89_租稅法總論_合併測試.docx
