In [1]:
import re
import subprocess
from datetime import datetime
from functools import lru_cache
from pathlib import Path

# === User settings ===
# Source BibTeX file that the main flow reads and splits into entries.
INPUT_BIB_PATH = Path('lawbank_runs/稅_借名_借用/papers.bib')
# Destination for the deduplicated BibTeX output.
DEDUP_BIB_PATH = Path('lawbank_runs/稅_借名_借用/papers_dedup.bib')
# Destination DOCX for the exported list; a sibling .md file with the same
# stem is written first and handed to pandoc.
HOWPUBLISH_DOCX_PATH = Path('【稅_借名】文獻清單匯出.docx')
HOWPUBLISH_SORT_MODE = 'stroke'  # 'stroke' or 'date'
DATE_SORT_DESCENDING = True      # only takes effect when HOWPUBLISH_SORT_MODE='date'
# Unihan data file from which load_stroke_map() reads kTotalStrokes records.
STROKE_DATA_PATH = Path('reference/Unihan_IRGSources.txt')

# === 工具函式 ===
def clean_entry(entry: str) -> str:
    """Blank out placeholder fields in a BibTeX entry.

    A field line is removed when its braced value is empty or holds only
    the placeholder 未知 ("unknown") or 無 ("none"). The matched text is
    replaced by an empty string; the newline that ends the line is left in
    place, so a removed field leaves an empty line behind.
    """
    placeholder_field = re.compile(
        r'^\s*\w+\s*=\s*\{\s*(?:未知|無)?\s*\},?,?\s*$',
        re.MULTILINE,
    )
    return placeholder_field.sub('', entry)

def parse_entry(entry: str):
    """Parse one BibTeX entry (the text that follows the leading ``@``).

    Args:
        entry: Entry text such as ``article{key, title = {...}, ...}``.

    Returns:
        A 5-tuple ``(typ, title, author, year, cleaned_entry)``. ``typ`` is
        the word before the first ``{`` or ``'unknown'``. Missing fields fall
        back to the placeholders 未知標題 / 未知作者 / 未知年份.
        ``cleaned_entry`` is ``clean_entry(entry)``.

    Field values are read up to the first closing brace, so a value that
    itself contains ``{...}`` is truncated at the inner ``}``.
    """
    m_type = re.match(r'(\w+)\s*\{', entry)
    typ = m_type.group(1) if m_type else 'unknown'

    def field(name: str, default: str, flags: int = 0) -> str:
        # (?<!\w) stops 'title' from matching inside 'booktitle' (and
        # likewise for any longer field name ending in the wanted one).
        # BibTeX field names are case-insensitive, hence IGNORECASE, which
        # also keeps this in line with extract_field().
        found = re.search(
            rf'(?<!\w){name}\s*=\s*\{{(.*?)\}}',
            entry,
            flags | re.IGNORECASE,
        )
        return found.group(1).strip() if found else default

    # Titles and author lists may wrap across lines; year is kept single-line.
    title = field('title', '未知標題', re.DOTALL)
    author = field('author', '未知作者', re.DOTALL)
    year = field('year', '未知年份')
    cleaned_entry = clean_entry(entry)
    return typ, title, author, year, cleaned_entry

def extract_field(entry: str, field_name: str):
    """Return the braced value of ``field_name`` in a BibTeX entry.

    Args:
        entry: Text of a single BibTeX entry.
        field_name: Literal field name to look up (matched case-insensitively).

    Returns:
        The stripped value, or ``None`` when the field is absent or its value
        is empty. The value ends at the first ``}``, so nested braces are
        truncated.
    """
    # re.escape: the name is a literal, not a pattern.
    # (?<!\w): do not match the tail of a longer name (e.g. 'title' inside
    # 'booktitle').
    pattern = rf'(?<!\w){re.escape(field_name)}\s*=\s*\{{(.*?)\}}'
    match = re.search(pattern, entry, flags=re.DOTALL | re.IGNORECASE)
    if match is None:
        return None
    return match.group(1).strip() or None

def parse_date_from_text(text: str) -> datetime:
    """Extract the first ``YYYY年[M月][D日]`` date found in ``text``.

    Missing month or day default to 1. Digits directly after 年 are read as
    the month even if 月 is absent, because 月 is optional in the pattern.

    Returns:
        The parsed date. If the day is invalid the first of that month is
        used; if the month is invalid too, 1 January of that year; if no
        date is found or the year itself is unusable, ``datetime.min``.
    """
    match = re.search(r'(\d{4})年\s*(\d{1,2})?月?\s*(\d{1,2})?日?', text)
    if not match:
        return datetime.min
    year = int(match.group(1))
    month = int(match.group(2)) if match.group(2) else 1
    day = int(match.group(3)) if match.group(3) else 1
    # Degrade step by step instead of letting a bad month (e.g. 13月, 0月) or
    # year 0000 raise out of the fallback itself.
    for candidate in ((year, month, day), (year, month, 1), (year, 1, 1)):
        try:
            return datetime(*candidate)
        except ValueError:
            continue
    return datetime.min

@lru_cache(maxsize=1)
def load_stroke_map():
    """Load a ``character -> total stroke count`` map from STROKE_DATA_PATH.

    Only tab-separated ``U+XXXX<TAB>kTotalStrokes<TAB>value`` records are
    used; every other line, and any malformed record, is skipped.

    Returns:
        A dict mapping single characters to ints. When the data file does
        not exist a warning is printed and an empty dict is returned. The
        result (including that empty dict) is cached by ``lru_cache`` for
        later calls.
    """
    stroke_map = {}
    path = STROKE_DATA_PATH
    if not path.exists():
        print(f'⚠️ 找不到字根筆畫檔案：{path}，將以字典順序排序作者。')
        return stroke_map
    with path.open('r', encoding='utf-8') as f:
        for line in f:
            if '\tkTotalStrokes\t' not in line:
                continue
            parts = line.strip().split('\t')
            # Skip records with an unexpected number of columns instead of
            # crashing on tuple unpacking.
            if len(parts) != 3 or parts[1] != 'kTotalStrokes':
                continue
            code, _, value = parts
            counts = value.split()
            try:
                cp = int(code[2:], 16)  # drop the 'U+' prefix
                # UAX #38: when two counts are listed, the first is preferred
                # for zh-Hans and the second for zh-Hant. The bibliographies
                # handled here appear to be Traditional Chinese, so take the
                # last value (identical to the first when only one is given).
                stroke_map[chr(cp)] = int(counts[-1])
            except (ValueError, IndexError, OverflowError):
                continue
    return stroke_map

def stroke_key(name: str):
    """Build a sort key for ``name`` from per-character stroke counts.

    Whitespace is ignored and characters missing from the stroke table count
    as 100. With an empty stroke table the key is ``(name,)`` so that sorting
    falls back to plain string order; a name with no countable characters
    yields ``(100,)``.
    """
    table = load_stroke_map()
    if not table:
        return (name,)
    strokes = tuple(table.get(char, 100) for char in name if not char.isspace())
    return strokes or (100,)

def parse_howpublish_entry(text: str):
    """Turn one howpublished string into a sortable record.

    The text before the first full-width comma (，) is treated as the author
    list, and its first 、-separated name becomes ``primary_author`` (or
    未知作者 when that leading segment is empty).

    Returns:
        A dict with keys ``raw`` (the input text), ``primary_author`` and
        ``date`` (from ``parse_date_from_text``).
    """
    leading_segment = text.partition('，')[0]
    if leading_segment:
        first_author = leading_segment.split('、')[0].strip()
    else:
        first_author = '未知作者'
    return {
        'raw': text,
        'primary_author': first_author,
        'date': parse_date_from_text(text),
    }

def collect_howpublish(entries):
    """Collect parsed howpublished records from 4-tuples of entry data.

    Each element of ``entries`` is unpacked as four values, the last being
    the entry text. The ``howpublished`` field is looked up first, then
    ``howpublish``; entries with neither are skipped.
    """
    located = (
        extract_field(bib_text, 'howpublished') or extract_field(bib_text, 'howpublish')
        for _, _, _, bib_text in entries
    )
    return [parse_howpublish_entry(found) for found in located if found]

def sort_howpublish(entries):
    """Return ``entries`` sorted according to HOWPUBLISH_SORT_MODE.

    * ``'date'``: by date (descending when DATE_SORT_DESCENDING is true),
      ties broken by primary-author stroke key and then raw text, both
      always ascending.
    * anything else: by primary-author stroke key, then date, then raw
      text, all ascending.

    Args:
        entries: Records as produced by ``parse_howpublish_entry``.

    Returns:
        A new sorted list; the input is not modified.
    """
    mode = HOWPUBLISH_SORT_MODE.lower()
    if mode == 'date':
        # Two-pass stable sort: passing reverse= with a composite key would
        # also flip the tie-breakers. sorted() stays stable under reverse=True,
        # so the ascending author/text order survives among equal dates.
        by_author = sorted(
            entries,
            key=lambda item: (stroke_key(item['primary_author']), item['raw']),
        )
        return sorted(
            by_author,
            key=lambda item: item['date'],
            reverse=DATE_SORT_DESCENDING,
        )
    return sorted(
        entries,
        key=lambda item: (stroke_key(item['primary_author']), item['date'], item['raw'])
    )

def write_howpublish_docx(items, output_path: Path):
    """Write the sorted howpublished list as Markdown and convert it to DOCX.

    A Markdown file with the same stem as ``output_path`` (suffix ``.md``) is
    written first, then ``pandoc`` converts it to ``output_path``.

    Args:
        items: Records as produced by ``parse_howpublish_entry``. When empty,
            a warning is printed and nothing is written.
        output_path: Destination ``.docx`` path; its parent directory is
            created if missing.

    Raises:
        FileNotFoundError: ``pandoc`` could not be launched (the Markdown
            file has already been written at that point).
        subprocess.CalledProcessError: ``pandoc`` exited with a non-zero status.
    """
    if not items:
        print()
        print('⚠️ 去重後條目未找到 howpublish 欄位，未產生 DOCX。')
        return
    md_path = output_path.with_suffix('.md')
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    sorted_items = sort_howpublish(items)
    # Mirror sort_howpublish(): only 'date' selects date order, every other
    # value falls back to stroke order, so the label must do the same.
    sort_label = '出版時間' if HOWPUBLISH_SORT_MODE.lower() == 'date' else '作者筆畫'
    total_count = len(sorted_items)
    lines = [
        '# 文獻清單 匯出',
        f'- 產出時間：{timestamp}',
        f'- 排序方式：{sort_label}',
        f'- 條目總數：{total_count}',
        '',
        '## 條目清單',
        ''
    ] + [f"{idx_val}. {item['raw']}" for idx_val, item in enumerate(sorted_items, 1)]

    # Both the .md and the .docx live next to each other; make sure the
    # directory exists before writing either.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    md_path.write_text('\n'.join(lines), encoding='utf-8')

    try:
        subprocess.run([
            'pandoc',
            str(md_path),
            '-f', 'markdown',
            '-t', 'docx',
            '-o', str(output_path)
        ], check=True)
    except FileNotFoundError as err:
        # Same exception type as before, but say what is missing and what
        # was still produced.
        raise FileNotFoundError(
            f'找不到 pandoc 執行檔；Markdown 已寫入 {md_path}，但未產生 DOCX。'
        ) from err

    print()
    print(f"✅ 已輸出 howpublish DOCX（共 {total_count} 筆）：{output_path}")

# === Main flow ===
# 'utf-8-sig' decodes plain UTF-8 as well, but drops a leading byte-order
# mark so it cannot end up inside the first chunk.
text = INPUT_BIB_PATH.read_text(encoding='utf-8-sig')
# Split in front of every '@type{' header; the '@' itself is consumed and is
# re-added when the entries are written back out.
entries = re.split(r'@(?=\w+\s*\{)', text)
# Keep only chunks that begin with an entry header. This drops the empty
# first chunk and any text that precedes the first entry, which would
# otherwise be parsed as a bogus 'unknown' entry.
entries = [e.strip() for e in entries if re.match(r'\w+\s*\{', e.strip())]
parsed = [parse_entry(e) for e in entries]

# === Deduplicate ===
# Entries are duplicates when their titles match after collapsing runs of
# whitespace (titles may wrap across lines) and case-folding. The first
# occurrence is kept; later ones are recorded in `duplicates`.
seen = {}
duplicates = []
for position, (typ, title, author, year, entry) in enumerate(parsed):
    if title == '未知標題':
        # Placeholder assigned by parse_entry() when no title was found.
        # Entries without a title cannot be compared, so give each its own
        # key instead of collapsing them all into one.
        key = ('untitled', position)
    else:
        key = ' '.join(title.split()).casefold()
    if key in seen:
        duplicates.append((title, author, year))
    else:
        seen[key] = (title, author, year, entry)
unique_entries = list(seen.values())

# === Summary report ===
# Counts of parsed entries, dropped duplicates and surviving entries.
total_count, dup_count, unique_count = map(len, (parsed, duplicates, unique_entries))
print(f'原始篇數：{total_count}')
print(f'重複篇數：{dup_count}')
if duplicates:
    # List every entry that was dropped as a duplicate.
    print()
    print('重複條目：')
    for title, author, year in duplicates:
        print(f' - {title} / {author} / {year}')
print()
print(f'去重後篇數：{unique_count}')
print('去重後條目：')
# List the entries that were kept (fourth element, the entry text, is unused).
for title, author, year, _ in unique_entries:
    print(f' - {title} / {author} / {year}')

# === Write the deduplicated Bib file ===
# Re-attach the '@' that was consumed when the input was split, and separate
# entries with one blank line.
dedup_text = '\n\n'.join('@' + bib_text for _, _, _, bib_text in unique_entries)
DEDUP_BIB_PATH.write_text(dedup_text, encoding='utf-8')
print()
print(f"✅ 已輸出去重後檔案：{DEDUP_BIB_PATH}")

# === Export the howpublish DOCX ===
howpublish_items = collect_howpublish(unique_entries)
write_howpublish_docx(howpublish_items, output_path=HOWPUBLISH_DOCX_PATH)


原始篇數：37
重複篇數：4

重複條目：
 - 稅捐稽徵協力義務、推計課稅與協力義務違反的制裁－以納稅者權利保護法第 14 條規定討論與條文修正建議為中心 / 柯格鐘 / 2019
 - 論菸稅與菸品健康福利捐之財政民主統制 / 廖欽福 / 2016
 - 實質課稅與稅捐規避行為之舉證責任 / 黃源浩 / 2014
 - 實質課稅原則之適用界限 / 陳清秀 / 2011

去重後篇數：33
去重後條目：
 - 稅捐假扣押制度之裁判分析（中） / 黃俊杰 / 2025
 - 跨國稅捐事務的思考流程與審查步驟 / 陳衍任 / 2024
 - 稅法上借牌營業課稅問題之探討－從體系正義觀點出發 / 陳清秀 / 2024
 - 再論租稅刑罰與租稅行政罰之關係 / 柯格鐘 / 2022
 - 租稅核課處分存續力之突破與退稅請求權 / 盛子龍 / 2022
 - 納保法實施後稅務專庭對於納稅者權利保障的實踐 / 謝如蘭 / 2022
 - 夫妻財產制之法理探析－兼論稅務及土地實務運作 / 羅裕翔 and 趙逸凡 / 2021
 - 打房的第二支箭－評析支持囤房稅正反雙方論點 / 趙逸凡 / 2021
 - 借用他人名義在特銷稅上的評價－最高行政法院 109 年度上字第 5886 號判決簡評 / 柯格鐘 / 2021
 - 臺灣傳奇「青果大王」欠稅案 / 盧秀虹 / 2021
 - 租稅協定之解釋適用（下） / 陳清秀 / 2021
 - 租稅協定之適用問題－以臺英租稅協定之適用為例 / 陳清秀 / 2020
 - 事實認定錯誤之溢繳稅款的返還－最高行政法院 107 年度判字第 340 號判決評釋 / 柯格鐘 / 2020
 - 稅捐稽徵協力義務、推計課稅與協力義務違反的制裁－以納稅者權利保護法第 14 條規定討論與條文修正建議為中心 / 柯格鐘 / 2019
 - 營業稅法信託課稅問題之研討－以德國法為比較中心（下） / 江彥佐 / 2018
 - 營業稅法信託課稅問題之研討－以德國法為比較中心（上） / 江彥佐 / 2018
 - 從私法自治權之保障，談稅法對於私法秩序之融合原則 / 陳清秀 / 2017
 - 論菸稅與菸品健康福利捐之財政民主統制 / 廖欽福 / 2016
 - 稅法上之正當法律程序 / 陳清秀 / 2016
 - 論欠稅限制出境之合憲性 / 范文清 / 2