In [9]:
!pip install pandas



In [10]:
# === 匯入套件 ===
!pip install pdfminer.six tqdm

import re
import json
from pathlib import Path
from tqdm import tqdm
from pdfminer.high_level import extract_text

# === Basic settings ===
input_dir = Path.cwd()                     # top-level folder; searched recursively for PDFs
output_json = Path("all_pdfs_parsed.json") # JSON file the parsed results are exported to
# Make sure the output file's parent directory exists before anything is written.
output_json.parent.mkdir(parents=True, exist_ok=True)

# === 輔助函數 ===
def pdf_to_text(pdf_path: Path) -> str:
    """Extract the text of one PDF as a single whitespace-collapsed string.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        The extracted text with every run of whitespace replaced by one
        space and the ends trimmed, or an empty string if anything goes
        wrong while extracting (a warning naming the file is printed).
    """
    try:
        raw = extract_text(pdf_path)
        collapsed = re.sub(r'\s+', ' ', raw)  # squeeze whitespace runs
    except Exception as e:
        # Best effort: one unreadable PDF must not abort the whole batch.
        print(f"⚠️ 無法解析：{pdf_path.name} ({e})")
        return ""
    else:
        return collapsed.strip()

def find_verdicts(text: str) -> list[str]:
    """Return the fragments of *text* that contain the word 「判決」.

    The text is cut at the sentence terminators 。！？ and at newlines;
    each fragment containing 「判決」 is returned with surrounding
    whitespace stripped, in order of appearance.
    """
    matches = []
    for fragment in re.split(r'[。！？\n]', text):
        if '判決' in fragment:
            matches.append(fragment.strip())
    return matches

# === Load the existing JSON (if any) ===
# Results of earlier runs are read back so that PDFs parsed before can be skipped.
existing = []
if output_json.exists():
    try:
        existing = json.loads(output_json.read_text(encoding="utf-8"))
        print(f"🔹 已載入既有資料，共 {len(existing)} 筆。")
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as empty,
        # so the output is rebuilt from scratch.
        print(f"⚠️ 既有 JSON 讀取失敗，將重新建立 ({e})")
        existing = []

# Set of already-processed files, keyed by the stored "path" string;
# entries that have no "path" key are ignored.
existing_paths = {entry["path"] for entry in existing if "path" in entry}

# === Main flow ===
pdf_files = sorted(input_dir.rglob("*.pdf"))  # recursively collect every PDF
new_results = []

for pdf in tqdm(pdf_files, desc="Parsing PDFs"):
    if str(pdf) in existing_paths:
        # Already processed in an earlier run; skip it
        continue

    text = pdf_to_text(pdf)
    if not text:
        # Extraction failed or yielded no text; nothing is recorded for this file
        continue
    verdicts = find_verdicts(text)
    # One record per PDF: where it lives, its full text, and the 判決 fragments found.
    new_results.append({
        "path": str(pdf),
        "filename": pdf.name,
        "text": text,
        "verdicts": verdicts,
        "verdict_count": len(verdicts)
    })

# === Merge old and new results ===
# Plain concatenation also covers the "nothing new" case, so no branch is needed.
all_results = existing + new_results

# === Write the JSON ===
# Dump to a sibling temporary file first and swap it into place afterwards.
# Writing directly over output_json would leave a truncated file if the dump
# is interrupted, and the next run would then discard every cached result
# because the file no longer parses.
tmp_json = output_json.with_name(output_json.name + ".tmp")
with open(tmp_json, "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)
tmp_json.replace(output_json)

print(f"\n✅ 已輸出 JSON：{output_json}")
print(f"📄 原有 {len(existing)} 筆，新解析 {len(new_results)} 筆，總計 {len(all_results)} 筆。")

🔹 已載入既有資料，共 184 筆。


Parsing PDFs:   0%|          | 0/311 [00:00<?, ?it/s]The PDF <_io.BufferedReader name='/Users/iw/Documents/NTU/1141/1141_Nominee_Data/lawbank_runs/借牌/downloads/施茂林 - 當前政風核心工作－展現存在價值、再創績效高峰（二）.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/iw/Documents/NTU/1141/1141_Nominee_Data/lawbank_runs/借牌/downloads/王志誠 - 日本金融資產證券化之法制架構與啟發－兼論我國金融資產證券化之立法取向.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='/Users/iw/Documents/NTU/1141/1141_Nominee_Data/lawbank_runs/利用他人名義/downloads/戴銘昇 - 論證券交易法之取得股份申報－評司法院大法官會議釋字第五八六號解釋.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the c


✅ 已輸出 JSON：all_pdfs_parsed.json
📄 原有 184 筆，新解析 116 筆，總計 300 筆。





In [11]:
# === 高容錯 JSON 檢索系統 + 條件滿足說明 + 正則可選 ===
# 可決定是否啟用正則規則檢查（例如「稅法第\d+條」）

import json
import re
import pandas as pd
from pathlib import Path

# === Load the JSON ===
data = json.loads(Path("all_pdfs_parsed.json").read_text(encoding="utf-8"))
df = pd.DataFrame(data)

# === Adjustable conditions (leave a list/dict empty to skip that condition) ===
# NOTE: the search code below references every one of these names
# unconditionally, so they must stay defined even when unused.
kw_include_all = ["借", "稅"]   # every keyword must occur in the document text
kw_include_any = ["借用", "借名", "股票", "投標", "借用名義", "營業", "借牌", "借名登記", "執業", "靠行", "帳戶", "利用"]  # at least one must occur in the text
kw_exclude = ["草稿", "測試"]   # a document is dropped if any of these occurs in its text
title_include = []              # every keyword must occur in the filename
title_exclude = []              # a document is dropped if any of these occurs in its filename
count_thresholds = {"借": 1, "稅": 1}  # keyword -> minimum number of occurrences in the text

# === Regex settings ===
use_regex_search = False               # True -> check regex_patterns; False -> ignore them
regex_patterns = [r"稅法第\d+條"]     # several patterns may be listed

use_case_sensitive = False            # False -> keyword matching uses re.IGNORECASE
use_regex_mode = False                # True -> keywords are treated as regular expressions
min_text_length = 1000                # lower bound on the normalized text length; falsy disables it
max_text_length = None                # upper bound on the normalized text length; falsy disables it
sort_by = "filename"                  # column the result table is sorted by
ascending = True                      # sort direction

# === 正規化：去除空格與雜字 ===
def normalize_text(s: str) -> str:
    """Return *s* with all whitespace removed.

    Args:
        s: Text to normalize. Anything that is not a string (``None``, or the
            NaN that pandas inserts for records lacking the field) is treated
            as missing.

    Returns:
        The input without any whitespace characters, or an empty string when
        the input is empty or not a string.
    """
    # NaN is truthy, so a plain ``not s`` check would let it through to
    # re.sub and raise TypeError; test the type explicitly instead.
    if not isinstance(s, str):
        return ""
    # For str patterns ``\s`` matches every Unicode whitespace character
    # (ASCII space, full-width space, no-break space, ...), so no separate
    # space-removal pass is required afterwards.
    return re.sub(r"\s+", "", s)

# Add whitespace-free copies of the body text and of the filename; the search
# loop reads these two columns rather than the raw ones.
df["text_norm"] = df["text"].apply(normalize_text)
df["title_norm"] = df["filename"].apply(normalize_text)

# === 檢索函數 ===
def count_kw(text, kw):
    """Count the non-overlapping occurrences of *kw* in *text*.

    Reads two module-level switches: ``use_case_sensitive`` (when false the
    match ignores case) and ``use_regex_mode`` (when true *kw* is used as a
    regular expression, otherwise it is matched literally).
    """
    flags = re.IGNORECASE if not use_case_sensitive else 0
    if use_regex_mode:
        pattern = kw
    else:
        pattern = re.escape(kw)
    return sum(1 for _ in re.finditer(pattern, text, flags))

def match_kw(text, kws, mode="and"):
    """Check *text* against a list of keywords.

    Args:
        text: String to search.
        kws: Keywords to look for; an empty/falsy value means "no condition".
        mode: ``"and"`` requires every keyword to be present, ``"or"``
            requires at least one.

    Returns:
        ``(ok, hits)``. With no keywords the result is ``(True, [])``. In
        ``"and"`` mode the search stops at the first missing keyword and
        *hits* holds only those found before it. In ``"or"`` mode a total
        miss yields ``(False, [])``.

    Case handling and literal-vs-regex matching follow the module-level
    ``use_case_sensitive`` and ``use_regex_mode`` switches.
    """
    if not kws:
        return True, []

    re_flags = 0 if use_case_sensitive else re.IGNORECASE
    found = []
    for keyword in kws:
        needle = keyword if use_regex_mode else re.escape(keyword)
        if re.search(needle, text, re_flags):
            found.append(keyword)
            continue
        if mode == "and":
            # One miss is enough to fail an all-of match.
            return False, found

    if mode == "or" and not found:
        return False, []
    return True, found

def passes_thresholds(text):
    """Check the per-keyword minimum counts in ``count_thresholds``.

    Returns:
        ``(ok, unmet)`` where *unmet* maps each keyword that occurs fewer
        times than its threshold to the count actually found, and *ok* is
        true when that mapping is empty.
    """
    shortfalls = {
        kw: c
        for kw, th in count_thresholds.items()
        if (c := count_kw(text, kw)) < th
    }
    return not shortfalls, shortfalls

def passes_regex(text):
    """Check *text* against every pattern in ``regex_patterns``.

    Returns:
        ``(ok, failed)`` where *failed* lists the patterns with no match in
        *text*. When ``use_regex_search`` is off or no patterns are
        configured, the check is treated as passed: ``(True, [])``.
    """
    regex_enabled = use_regex_search and regex_patterns
    if not regex_enabled:
        return True, []  # check disabled -> counts as passed
    failed = [pat for pat in regex_patterns if not re.search(pat, text)]
    return not failed, failed

# === Main search ===
# Apply every configured filter to each document's normalized text and title,
# and collect the rows that pass together with a report of why they matched.
results = []
for _, row in df.iterrows():
    text = row["text_norm"]
    title = row["title_norm"]

    # Length gates: skip empty documents and those outside the configured
    # bounds (a falsy bound disables that check).
    if not text:
        continue
    if min_text_length and len(text) < min_text_length:
        continue
    if max_text_length and len(text) > max_text_length:
        continue

    ok_all, hits_all = match_kw(text, kw_include_all, "and")
    ok_any, hits_any = match_kw(text, kw_include_any, "or")
    ok_title, hits_title = match_kw(title, title_include, "and")
    # failed_regex / unmet are not used for matching rows; they are kept as
    # names so they can still be inspected from the notebook.
    ok_regex, failed_regex = passes_regex(text)
    ok_threshold, unmet = passes_thresholds(text)

    # Exclusions go through count_kw so they honour use_case_sensitive and
    # use_regex_mode the same way the inclusion keywords do.
    excluded = any(count_kw(text, ex) for ex in kw_exclude) or any(
        count_kw(title, ex) for ex in title_exclude
    )

    if excluded or not all([ok_all, ok_any, ok_title, ok_regex, ok_threshold]):
        continue

    # Every check passed, so each report line below is a success line.
    report = [
        f"✅ 內文同時含有：{hits_all or '無'}",
        f"✅ 內文至少含任一：{hits_any or '無'}",
    ]
    for kw, th in count_thresholds.items():
        c = count_kw(text, kw)
        report.append(f"✅ 關鍵詞「{kw}」出現 {c} 次（門檻 {th}）")
    if use_regex_search and regex_patterns:
        report.append(f"✅ 通過正則：{regex_patterns}")
    if title_include:
        report.append(f"✅ 標題命中：{hits_title or '無'}")

    row_dict = row.to_dict()
    row_dict["match_report"] = "\n".join(report)
    results.append(row_dict)

# === Print the results ===
filtered_df = pd.DataFrame(results)
# An empty result list yields a frame with no columns at all, and sorting it
# by a column name would raise KeyError; only sort when there are rows.
if not filtered_df.empty:
    filtered_df = filtered_df.sort_values(sort_by, ascending=ascending)
print(f"共 {len(filtered_df)} 筆符合條件：\n")

for _, r in filtered_df.iterrows():
    print(f"- {r['filename']}｜字數 {len(r['text'])}")
    print(r["match_report"])
    print("-" * 50)

# === Write the results ===
# The same table is exported twice: as JSON records and as CSV with a BOM
# (utf-8-sig).
out_json = Path("filtered_results_with_report.json")
out_csv = Path("filtered_results_with_report.csv")
filtered_df.to_json(out_json, orient="records", force_ascii=False, indent=2)
filtered_df.to_csv(out_csv, index=False, encoding="utf-8-sig")

print(f"\n✅ 已輸出 JSON：{out_json.resolve()}")
print(f"✅ 已輸出 CSV：{out_csv.resolve()}")

共 219 筆符合條件：

- 余萬能 - 藥師調劑處方跨行政區交付藥品行為之適法性.pdf｜字數 13504
✅ 內文同時含有：['借', '稅']
✅ 內文至少含任一：['營業', '借牌', '執業']
✅ 關鍵詞「借」出現 1 次（門檻 1）
✅ 關鍵詞「稅」出現 1 次（門檻 1）
--------------------------------------------------
- 劉建宏 - 稅捐爭訟實務上之「重審復查決定」－一○三年台財訴字第一○三一三九四○五一○號訴願決定評析.pdf｜字數 4868
✅ 內文同時含有：['借', '稅']
✅ 內文至少含任一：['投標', '營業', '借牌']
✅ 關鍵詞「借」出現 3 次（門檻 1）
✅ 關鍵詞「稅」出現 60 次（門檻 1）
--------------------------------------------------
- 吳天雲 - 兩岸利用地下通匯洗錢之現狀與預防.pdf｜字數 24975
✅ 內文同時含有：['借', '稅']
✅ 內文至少含任一：['借名', '營業', '帳戶']
✅ 關鍵詞「借」出現 2 次（門檻 1）
✅ 關鍵詞「稅」出現 2 次（門檻 1）
--------------------------------------------------
- 吳從周 - 我國不動產借名登記契約之發展現狀－特別著重觀察內部效力與外部效力演變之互動.pdf｜字數 23090
✅ 內文同時含有：['借', '稅']
✅ 內文至少含任一：['借用', '借名', '借名登記', '利用']
✅ 關鍵詞「借」出現 281 次（門檻 1）
✅ 關鍵詞「稅」出現 10 次（門檻 1）
--------------------------------------------------
- 吳志正 - 醫療契約之當事人.pdf｜字數 28064
✅ 內文同時含有：['借', '稅']
✅ 內文至少含任一：['營業', '借牌', '執業']
✅ 關鍵詞「借」出現 8 次（門檻 1）
✅ 關鍵詞「稅」出現 1 次（門檻 1）
--------------------------------------------------
- 呂秉翰 - 各國保全人員值勤規範之比較分析.pdf｜字數