In [11]:
import re
from pathlib import Path
import unicodedata
from datetime import datetime
import pandas as pd

# === Tunable parameters ===
root_dir = Path.cwd()             # root directory that is searched recursively for .bib files
reference_file = Path("reference/Unihan_IRGSources.txt")  # Unihan data file read for kTotalStrokes records
output_dir = Path("match_bib_output_md")  # directory that receives the generated files
mode = 3                          # 1 = input order; 2 = stroke order; 3 = write both
prefix_digits = 3                 # zero-padded width of the running sequence number
TOP_N = None                       # emit only the first N items (None means all)
CSV_PATH = Path("filtered_keywords_with_score.csv")  # used automatically when no manual list is pasted

# === Manual list (leave blank to fall back to the CSV) ===
# One file name per line; blank lines are skipped when the list is parsed.
file_names_text = """ """

# === 清單來源 ===
# Build `file_names`: a non-blank manual list wins; otherwise the names come
# from the CSV, ordered by descending `custom_score`.
if file_names_text.strip():
    print("📄 使用手動貼上的檔名清單。")
    file_names = [x.strip() for x in file_names_text.splitlines() if x.strip()]
else:
    print("📊 自動從 CSV 生成清單（依 custom_score 降序）。")
    df = pd.read_csv(CSV_PATH)
    # Both columns are needed below; check them together so a missing
    # `custom_score` produces the same clear error as a missing `paper_name`
    # instead of a bare KeyError raised from sort_values().
    for required_column in ("paper_name", "custom_score"):
        if required_column not in df.columns:
            raise ValueError(f"CSV 檔缺少欄位 '{required_column}'")
    df_sorted = df.sort_values("custom_score", ascending=False)
    if TOP_N is not None:
        df_sorted = df_sorted.head(TOP_N)
        print(f"⚙️ 僅輸出前 {TOP_N} 篇（依 custom_score 排序）")
    file_names = list(df_sorted["paper_name"])

print(f"共載入 {len(file_names)} 個檔名。")

# === 載入筆畫表 ===
def load_stroke_table(path):
    """Build a character -> stroke-count table from a Unihan data file.

    Only lines holding a ``U+XXXX  kTotalStrokes  N`` record contribute.
    When a record lists several counts, the first one is used.

    Args:
        path: ``pathlib.Path`` of the Unihan text file.

    Returns:
        dict mapping a single character to its stroke count. The dict is
        empty when ``path`` does not exist; a warning is emitted in that
        case so the fallback is not silent.
    """
    import warnings

    table = {}
    if not path.exists():
        # Keep the best-effort behaviour (empty table) but make it visible:
        # without the table every lookup misses and callers fall back to
        # whatever default they use for unknown characters.
        warnings.warn(f"找不到筆畫表檔案：{path}，將回傳空的筆畫表。", stacklevel=2)
        return table
    pattern = re.compile(r"U\+([0-9A-F]+)\s+kTotalStrokes\s+(\d+)")
    # Stream the file instead of loading it whole; undecodable bytes are
    # dropped, as before.
    with path.open(encoding="utf-8", errors="ignore") as handle:
        for line in handle:
            m = pattern.search(line)
            if m:
                table[chr(int(m.group(1), 16))] = int(m.group(2))
    return table

# Module-level stroke lookup consulted by stroke_count(); it is an empty dict
# when the reference file does not exist.
strokes_table = load_stroke_table(reference_file)

# === 正規化 ===
def normalize(s: str) -> str:
    """Canonicalise spacing, dashes and colons in a string.

    Applied in order: the space characters of the first class (including
    U+3000) become a plain space, the listed dash variants become ``-``,
    the listed colon variants become ``:``, and every run of whitespace
    collapses to one space. The result is stripped at both ends.

    Args:
        s: Text to clean; any falsy value yields an empty string.

    Returns:
        The cleaned string.
    """
    if not s:
        return ""
    # Order matters: whitespace is collapsed last, after the replacements.
    substitutions = (
        (r"[ \u3000]", " "),
        (r"[–—－]", "-"),
        (r"[：﹕]", ":"),
        (r"\s+", " "),
    )
    cleaned = s
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# === 筆畫數 ===
def stroke_count(name: str) -> int:
    """Return the summed stroke count of every character in ``name``.

    Characters found in the module-level ``strokes_table`` contribute their
    recorded count. Characters missing from the table contribute 10 when
    their Unicode category is ``Lo``, and nothing otherwise.
    """

    def strokes_for(ch: str) -> int:
        if ch in strokes_table:
            return strokes_table[ch]
        # Unknown character: assume 10 strokes for "Lo" letters, else 0.
        return 10 if unicodedata.category(ch).startswith("Lo") else 0

    return sum(strokes_for(ch) for ch in name)

# === 解析 Bib ===
def parse_bib_entry(text):
    """Extract the title, author and howpublished fields from one Bib entry.

    Field names are matched case-insensitively and must start at a word
    boundary, so ``booktitle``, ``subtitle`` or ``shorttitle`` are no longer
    mistaken for ``title``. A value is the text between the opening brace
    and the first closing brace; nested braces are not handled.

    Args:
        text: Raw text of a single entry.

    Returns:
        ``(title, author, howpublished)``, each passed through
        ``normalize``; author and howpublished are ``""`` when absent.
        ``(None, None, None)`` when the entry has no title field.
    """

    def field(name):
        # \b keeps `title` from matching as the tail of a longer field name.
        found = re.search(
            r"\b" + name + r"\s*=\s*\{([^}]+)\}", text, flags=re.IGNORECASE
        )
        return None if found is None else normalize(found.group(1))

    title = field("title")
    if title is None:
        return None, None, None
    author = field("author")
    how = field("howpublished")
    return title, author or "", how or ""

# === 掃描 Bib ===
# Collect every .bib file below root_dir and parse its entries.
bib_files = list(root_dir.rglob("*.bib"))
# Entries are split on these three markers only; the text before the first
# marker in each file is discarded.
entry_splitter = re.compile(r"@article|@book|@misc")
entries = []
for bib_path in bib_files:
    chunks = entry_splitter.split(
        bib_path.read_text(encoding="utf-8", errors="ignore")
    )
    parsed = (parse_bib_entry(chunk) for chunk in chunks[1:])
    # Keep only entries with a non-empty title.
    entries.extend(record for record in parsed if record[0])
print(f"📚 已載入 {len(entries)} 筆 Bib 條目。")

# === 比對檔名 ===
# Pair each file name with the first Bib entry whose title occurs inside it.
# Comparison keys are the normalised text with all spaces removed. They are
# computed once per entry and once per file name instead of once per
# (file name, entry) pair.
entry_keys = [
    (normalize(title).replace(" ", ""), title, author, how)
    for title, author, how in entries
]
matches = []
for fname in file_names:
    fname_key = normalize(fname).replace(" ", "")
    for title_key, title, author, how in entry_keys:
        if title_key in fname_key:
            matches.append((fname, title, author, how))
            break  # first matching entry wins

# === 匯出結果 ===
# Create the output directory, including missing parents, so a nested
# `output_dir` setting works; an existing directory is left alone.
output_dir.mkdir(parents=True, exist_ok=True)
# One timestamp shared by every file written in this run.
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def export_markdown(data, filename, title):
    """Write a numbered Markdown citation list into ``output_dir``.

    Args:
        data: Sequence of 4-tuples; only the last element (the citation
            text) is written out.
        filename: Name of the file created inside ``output_dir``.
        title: Text of the top-level heading.

    The file holds a fixed header (heading, generation timestamp, item
    count) followed by one line per item, numbered from 1 and zero-padded
    to ``prefix_digits`` digits. A summary line is printed afterwards.
    """
    total = len(data)
    header = [
        f"# {title}",
        "",
        f"**生成時間**：{timestamp}",
        "",
        f"此檔案列出自動比對所得文獻引用清單（共 {total} 筆）。",
        "",
        "以下依指定順序編號：",
        "",
    ]
    numbered = [
        f"{index:0{prefix_digits}d}. {how}"
        for index, (_, _, _, how) in enumerate(data, start=1)
    ]
    target = output_dir / filename
    target.write_text("\n".join(header + numbered), encoding="utf-8")
    print(f"✅ 輸出 {total} 筆 → {filename}")

# === 主輸出 ===
def _author_stroke_key(match):
    """Sort key: stroke count of the match's author field, 0 when it is empty."""
    author = match[2]
    return stroke_count(author) if author else 0


# Mode 1 writes the input-order list, mode 2 the stroke-order list, mode 3 both.
if mode == 1 or mode == 3:
    export_markdown(matches, "按輸入順序.md", "文獻清單（按輸入順序）")

if mode == 2 or mode == 3:
    # NOTE(review): the key is the summed stroke count of the whole author
    # string, not a character-by-character comparison - confirm this is the
    # intended ordering.
    sorted_matches = list(matches)
    sorted_matches.sort(key=_author_stroke_key)
    export_markdown(sorted_matches, "按筆畫排序.md", "文獻清單（按作者筆畫排序）")

# === 找出未匹配項 ===
# Report the file names that found no Bib entry.
matched_fnames = {fname for fname, *_ in matches}
unmatched = [name for name in file_names if name not in matched_fnames]

if not unmatched:
    print("🎯 所有檔案皆成功匹配。")
else:
    # One unmatched name per line, written next to the Markdown outputs.
    miss_path = output_dir / "未匹配清單.txt"
    miss_path.write_text("\n".join(unmatched), encoding="utf-8")
    print(f"⚠️ 未匹配 {len(unmatched)} 筆，已輸出 → {miss_path.resolve()}")

📊 自動從 CSV 生成清單（依 custom_score 降序）。
共載入 271 個檔名。
📚 已載入 593 筆 Bib 條目。
✅ 輸出 271 筆 → 按輸入順序.md
✅ 輸出 271 筆 → 按筆畫排序.md
🎯 所有檔案皆成功匹配。
