In [1]:
# === 匯入套件 ===
!pip install pdfminer.six tqdm

import re
from pathlib import Path
from tqdm import tqdm
from pdfminer.high_level import extract_text

# === 基本設定 ===
from pathlib import Path

input_dir = Path("papers")
output_all = Path("all_texts.txt")
output_verdicts = Path("判決整理/verdicts.txt")

# 自動建立輸出資料夾
output_verdicts.parent.mkdir(parents=True, exist_ok=True)


# === 輔助函數 ===
def pdf_to_text(pdf_path: Path) -> str:
    """將單一 PDF 轉為文字；若失敗則回傳空字串"""
    try:
        text = extract_text(pdf_path)
        text = re.sub(r'\s+', ' ', text)  # 去除多餘空白
        return text.strip()
    except Exception as e:
        print(f"⚠️ 無法解析：{pdf_path.name} ({e})")
        return ""

def find_verdicts(text: str) -> list[str]:
    """找出含「判決」的句子或段落"""
    # 切句（粗略）
    sentences = re.split(r'[。！？\n]', text)
    return [s.strip() for s in sentences if '判決' in s]

# === 主流程 ===
all_texts = []
verdicts = []

pdf_files = sorted(input_dir.glob("*.pdf"))

for pdf in tqdm(pdf_files, desc="Parsing PDFs"):
    text = pdf_to_text(pdf)
    if not text:
        continue
    all_texts.append(f"\n\n=== {pdf.name} ===\n{text}")
    verdicts_in_pdf = find_verdicts(text)
    if verdicts_in_pdf:
        verdicts.append(f"\n\n=== {pdf.name} ===\n" + "\n".join(verdicts_in_pdf))

# === 輸出整合檔案 ===
output_all.write_text("\n".join(all_texts), encoding="utf-8")
output_verdicts.write_text("\n".join(verdicts), encoding="utf-8")

print(f"\n✅ 全文整合：{output_all}")
print(f"✅ 含「判決」段落：{output_verdicts}")
print(f"共 {len(pdf_files)} 份 PDF，成功解析 {len(all_texts)} 份。")



Parsing PDFs:   0%|          | 0/30 [00:00<?, ?it/s]The PDF <_io.BufferedReader name='papers/不動產借名登記契約之負擔行為與處分行為.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Parsing PDFs:   3%|▎         | 1/30 [00:00<00:23,  1.24it/s]The PDF <_io.BufferedReader name='papers/不動產借名登記契約有效性的檢討.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Parsing PDFs:   7%|▋         | 2/30 [00:02<00:31,  1.12s/it]The PDF <_io.BufferedReader name='papers/不動產借名登記相關民事法律關係－以評析近年最高法院裁判為主.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
Parsing PDFs:  10%|█         | 3/30 [00:02<00:25,  1.05it/s]The PDF <


✅ 全文整合：all_texts.txt
✅ 含「判決」段落：判決整理/verdicts.txt
共 30 份 PDF，成功解析 29 份。





In [11]:
from pathlib import Path
import re, json, csv

# === 基本設定 ===
base_dir = Path("判決整理")
base_dir.mkdir(parents=True, exist_ok=True)
input_path = base_dir / "verdicts.txt"
normalized_output = base_dir / "verdicts_normalized.txt"
reason_output = base_dir / "verdicts_with_reason.txt"
stats_csv = base_dir / "verdicts_stats.csv"
stats_json = base_dir / "verdicts_stats.json"

text = input_path.read_text(encoding="utf-8", errors="ignore")

# === 清洗亂碼與控制字元 ===
text = re.sub(r'$begin:math:text$cid:\\d+$end:math:text$', '', text)
text = re.sub(r'[\x00-\x1f\u200b\u3000]', '', text)

# === 中文數字轉阿拉伯 ===
num_map = {"〇":0,"○":0,"零":0,"一":1,"二":2,"三":3,"四":4,"五":5,"六":6,"七":7,"八":8,"九":9,"十":10,"百":100,"千":1000}
def chinese_to_arabic(s):
    s = s.strip()
    if not s: return s
    if re.match(r'^\d+$', s): return str(int(s))
    total, num, unit = 0, 0, 1
    for ch in reversed(s):
        if ch not in num_map: continue
        val = num_map[ch]
        if val >= 10:
            unit = val
            if num == 0: num = 1
            total += num * unit
            num = 0; unit = 1
        else:
            num = num * 10 + val
    total += num * unit
    return str(total)

# === 判決字號 pattern ===
verdict_pattern = re.compile(
    r"""
    (最高法院|[\u4e00-\u9fa5]{2,6}高等法院|[\u4e00-\u9fa5]{2,6}地方法院)   # 法院
    [\s\S]{0,5}?([〇○零一二三四五六七八九十百千\d]{1,6})\s*年(?:度)?       # 年
    [\s\S]{0,6}?([^\s第：:]{1,8}?字)\s*第\s*([〇○零一二三四五六七八九十百千\d]{1,6}) # 字第號
    \s*號(?:民事|刑事|行政)?判決
    """, re.VERBOSE | re.DOTALL
)

# === 判決見解（謂、指出、認為）pattern ===
reason_pattern = re.compile(
    r"""
    (?P<verdict>(?:台|臺)?(?:最高法院|[\u4e00-\u9fa5]{2,6}(?:高等|地方法)院)[\s\S]{0,50}?判決)
    (?P<reason>[\s\S]{0,80}?(?:謂|指出|認為)[\s\S]{0,200}?[。；\n])
    """, re.VERBOSE
)

records = {}
reasons = []
current_pdf = None

for line in text.splitlines():
    if line.strip().startswith("==="):
        current_pdf = line.strip("= ").replace(".pdf", "")
        continue
    for m in verdict_pattern.finditer(line):
        court, year, word, num = m.group(1), m.group(2), m.group(3), m.group(4)
        year_ar = chinese_to_arabic(year)
        num_ar = chinese_to_arabic(num)
        key = f"{court} {year_ar} 年 {word}第 {num_ar} 號判決"
        if key not in records:
            records[key] = {"count":0, "sources":set()}
        records[key]["count"] += 1
        if current_pdf:
            records[key]["sources"].add(current_pdf)

for m in reason_pattern.finditer(text):
    v = re.sub(r'\s+', ' ', m.group('verdict')).strip()
    r = re.sub(r'\s+', ' ', m.group('reason')).strip()
    reasons.append(f"{v}：{r}")

# === 輸出 ===
sorted_items = sorted(records.items(), key=lambda x: (x[0].startswith("最高法院")==False, x[0]))
normalized_output.write_text("\n".join(k for k,_ in sorted_items), encoding="utf-8")
reason_output.write_text("\n".join(reasons), encoding="utf-8")

with stats_csv.open("w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["判決字號", "出現次數", "出現來源"])
    for k, v in sorted_items:
        src = "、".join(sorted(v["sources"])) if v["sources"] else ""
        writer.writerow([k, v["count"], src])

json.dump(
    {k: {"count": v["count"], "sources": sorted(v["sources"])} for k,v in records.items()},
    stats_json.open("w", encoding="utf-8"),
    ensure_ascii=False, indent=2
)

print(f"✅ 抓取到判決 {len(records)} 筆")
print(f"✅ 含見解句 {len(reasons)} 筆")
print(f"✅ 去重清單：{normalized_output}")
print(f"✅ 含見解：{reason_output}")
print(f"✅ 統計 CSV：{stats_csv}")

✅ 抓取到判決 0 筆
✅ 含見解句 32 筆
✅ 去重清單：判決整理/verdicts_normalized.txt
✅ 含見解：判決整理/verdicts_with_reason.txt
✅ 統計 CSV：判決整理/verdicts_stats.csv
