In [7]:
import os, json, pathlib

# Directory that holds the three correction resources used by this notebook.
RES_DIR = "/home/jovyan/work/resources"
# JSON confusion table (loaded by load_resources as a dict).
CONF_PATH = f"{RES_DIR}/confusion.json"
# JSON map of manual wrong -> right replacements.
MAP_PATH  = f"{RES_DIR}/manual_map.json"
# Plain-text domain vocabulary, one term per line.
DOM_PATH  = f"{RES_DIR}/domain_terms.txt"

# Create the resource directory up front so the files below can be written.
os.makedirs(RES_DIR, exist_ok=True)

def _ensure_file(path, default_text):
    if not os.path.exists(path):
        with open(path, "w", encoding="utf-8") as f: f.write(default_text)

# Seed each resource with empty content; entries can be accumulated later.
_ensure_file(CONF_PATH, "{}\n")
_ensure_file(MAP_PATH, "{}\n")
_ensure_file(DOM_PATH, "\n")

def load_resources():
    """Read the three resource files and return ``(confusion, manual_map, domain)``.

    - confusion:  parsed JSON from CONF_PATH (intended shape: dict[str, list[str]])
    - manual_map: parsed JSON from MAP_PATH  (intended shape: dict[str, str])
    - domain:     set of the non-blank, stripped lines of DOM_PATH
    """
    def _read_json(path):
        with open(path, "r", encoding="utf-8") as fh:
            return json.load(fh)

    confusion = _read_json(CONF_PATH)
    manual_map = _read_json(MAP_PATH)
    with open(DOM_PATH, "r", encoding="utf-8") as fh:
        domain = {line.strip() for line in fh if line.strip()}
    return confusion, manual_map, domain

# Load the resources into module-level names used by later cells.
CONFUSION, MANUAL_MAP, DOMAIN = load_resources()
# Bare last expression: the cell displays the number of entries in each resource.
len(CONFUSION), len(MANUAL_MAP), len(DOMAIN)

(0, 0, 0)

In [None]:
# 需要：pip install -q pymupdf regex
import os, re, fitz
from collections import Counter

# Same resource directory and domain-term file as the first cell (redefined here).
RES_DIR  = "/home/jovyan/work/resources"
DOM_PATH = f"{RES_DIR}/domain_terms.txt"
os.makedirs(RES_DIR, exist_ok=True)

PDF_DIR = "/home/jovyan/work/pdfs"   # <- change this to the root directory of your PDFs
MIN_LEN = 2                          # minimum term length
TOP_K   = 4000                       # keep the TOP_K most frequent terms

def list_pdfs(root: str):
    """Return the sorted paths of every ``.pdf`` file (case-insensitive) under ``root``.

    The tree is walked recursively with ``os.walk``; a missing ``root`` yields
    an empty list.
    """
    found = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root)
        for name in names
        if name.lower().endswith(".pdf")
    ]
    found.sort()
    return found

def extract_terms_from_pdfs(paths, min_len=2, top_k=4000):
    """Extract the most frequent terms from the text layer of the given PDFs.

    A "term" is a maximal run of at least ``min_len`` CJK ideographs
    (U+4E00-U+9FFF), ASCII letters or ASCII digits. Every occurrence on every
    page is counted; terms containing one of the customs/regulation keywords
    below have their count tripled before ranking.

    Args:
        paths: iterable of PDF file paths.
        min_len: minimum run length; values below 1 are treated as 1 so the
            pattern can never match the empty string.
        top_k: maximum number of terms to return.

    Returns:
        List of terms ordered by weighted frequency, highest first.

    PDFs that cannot be opened or read are reported on stdout and skipped;
    pages counted before the failure stay in the tally.
    """
    min_len = max(1, min_len)
    pat = re.compile(rf'[\u4e00-\u9fffA-Za-z0-9]{{{min_len},}}')
    freq = Counter()
    for p in paths:
        try:
            # Context manager closes the document even if a page fails to
            # parse, so no file handle is leaked per PDF.
            with fitz.open(p) as doc:
                for pg in doc:
                    freq.update(pat.findall(pg.get_text("text") or ""))
        except Exception as e:
            print("略過無法讀取：", p, "|", e)
    # Boost terms that contain common customs/regulation keywords so that
    # domain-specific terms are more likely to make the cut.
    bonus_kw = ("關稅","估價","關稅估價","完稅","價格","協定","報關","稅捐","條","第","款","章","項")
    for k in list(freq):
        if any(b in k for b in bonus_kw):
            freq[k] *= 3
    return [w for w, _ in freq.most_common(top_k)]

# Collect every PDF under PDF_DIR and report how many were found.
pdfs = list_pdfs(PDF_DIR)
print("PDF 檔數：", len(pdfs))

# Extract the top weighted terms from those PDFs.
terms = extract_terms_from_pdfs(pdfs, min_len=MIN_LEN, top_k=TOP_K)

# Merge into the existing DOMAIN term file (if the file exists).
domain = set()
if os.path.exists(DOM_PATH):
    with open(DOM_PATH, "r", encoding="utf-8") as f:
        domain |= {x.strip() for x in f if x.strip()}

domain |= set(terms)

# Rewrite the term file with the merged, sorted vocabulary (one term per line).
with open(DOM_PATH, "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(domain)))

print("已更新 DOMAIN 條目數：", len(domain))

In [None]:
# pip install -q pymupdf regex
import fitz, re

# Explicit list of PDF paths to mine for terms; empty by default.
PDFS = [
    # Example: put the paths of your customs/regulation PDFs here
    # "/home/jovyan/work/pdfs/關稅法.pdf",
]

def extract_terms_from_pdfs(paths, min_len=2, top_k=2000):
    """Extract the most frequent terms from the text layer of the given PDFs.

    A "term" is a maximal run of at least ``min_len`` CJK ideographs
    (U+4E00-U+9FFF), ASCII letters or ASCII digits. Terms containing one of
    the bonus keywords have their count tripled before ranking.

    NOTE(review): a function with this same name is defined in an earlier
    cell (different default ``top_k`` and keyword list); running this cell
    rebinds the name to this definition.

    Args:
        paths: iterable of PDF file paths.
        min_len: minimum run length; values below 1 are treated as 1 so the
            pattern can never match the empty string.
        top_k: maximum number of terms to return.

    Returns:
        List of terms ordered by weighted frequency, highest first; ties keep
        first-seen order.

    Unreadable PDFs are reported on stdout and skipped.
    """
    freq = {}
    pat = re.compile(r'[\u4e00-\u9fffA-Za-z0-9]{%d,}' % max(1, min_len))
    for p in paths:
        try:
            # Context manager closes the document even when a page fails to
            # parse, so no file handle is leaked per PDF.
            with fitz.open(p) as doc:
                for pg in doc:
                    t = pg.get_text("text") or ""
                    for w in pat.findall(t):
                        freq[w] = freq.get(w, 0) + 1
        except Exception as e:
            print("skip:", p, e)
    # Simple prioritisation: terms containing article/paragraph markers or
    # customs vocabulary get a 3x weight.
    bonus = ("關稅","估價","條","第","款","稅捐","貨物","完稅","價格","協定","報關")
    for k in list(freq):
        if any(b in k for b in bonus):
            freq[k] *= 3
    ranked = sorted(freq.items(), key=lambda kv: -kv[1])
    return [w for w, _ in ranked[:top_k]]

# Only run extraction when at least one PDF path was listed above.
if PDFS:
    new_terms = extract_terms_from_pdfs(PDFS, top_k=4000)
    # DOMAIN and DOM_PATH come from earlier cells; DOMAIN is mutated in place.
    DOMAIN.update(new_terms)
    # Persist the enlarged vocabulary, sorted, one term per line.
    with open(DOM_PATH, "w", encoding="utf-8") as f:
        f.write("\n".join(sorted(DOMAIN)))
    print("DOMAIN 條目數：", len(DOMAIN))
else:
    print("未提供 PDF，跳過抽詞。")

In [None]:
# pip install -q transformers torch rapidfuzz regex pangu zhon
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch, re, difflib, pangu
from rapidfuzz import process, fuzz

# Tokenizer and masked-language model used for character-level scoring.
tok = AutoTokenizer.from_pretrained("bert-base-chinese")
# .eval() puts the model in inference mode (e.g. disables dropout).
mlm = AutoModelForMaskedLM.from_pretrained("bert-base-chinese").eval()

# ASCII punctuation -> full-width Chinese punctuation.
ZH_PUNC = {
    ",": "，",
    ".": "。",
    "?": "？",
    "!": "！",
    ":": "：",
    ";": "；",
}


def norm_punc(s):
    """Normalise punctuation and spacing of ``s``.

    1. Every ASCII ``, . ? ! : ;`` is mapped to its full-width counterpart.
    2. A single space directly in front of a full-width mark is removed.
    3. The result is passed through ``pangu.spacing_text``.

    NOTE(review): step 1 is unconditional, so ASCII dots/commas inside numbers
    (e.g. "3.5") are converted as well.
    """
    converted = s.translate(str.maketrans(ZH_PUNC))
    for mark in "，。？！：；":
        converted = converted.replace(" " + mark, mark)
    return pangu.spacing_text(converted)

def mlm_topk_char(sent_chars, i, k=5, max_chars=510):
    """Return the ``k`` most likely tokens for position ``i`` of ``sent_chars``.

    The item at index ``i`` is replaced by the mask token and the masked
    language model is asked to fill it in.

    Args:
        sent_chars: list of single-character strings (the sentence).
        i: index into ``sent_chars`` of the character to score.
        k: number of candidates to return.
        max_chars: maximum number of characters fed to the model. Longer
            inputs are cut to a window around ``i`` so that, together with
            [CLS]/[SEP], the sequence stays within BERT's 512 positions
            (assumes at most one token per character - TODO confirm for
            unusual characters).

    Returns:
        List of token strings, best first. Empty when the character at ``i``
        produced no token (e.g. whitespace), since there is nothing to mask.
    """
    n = len(sent_chars)
    if n > max_chars:
        # Keep a window roughly centred on i; without this, long unpunctuated
        # lines overflow the model's position embeddings.
        start = min(max(0, i - max_chars // 2), n - max_chars)
        sent_chars = sent_chars[start:start + max_chars]
        i -= start

    enc = tok(sent_chars, is_split_into_words=True, return_tensors="pt")
    ids = enc["input_ids"][0].clone()

    if getattr(tok, "is_fast", False):
        # Items such as spaces yield no token, so "i + 1" can point at the
        # wrong token; ask the tokenizer which token belongs to item i.
        word_ids = enc.word_ids(0)
        if i not in word_ids:
            return []
        pos = word_ids.index(i)
    else:
        pos = i + 1  # [CLS] offset; slow tokenizers expose no word mapping

    ids[pos] = tok.mask_token_id
    with torch.inference_mode():
        logits = mlm(input_ids=ids.unsqueeze(0)).logits[0, pos]
    topk = torch.topk(logits, k)
    return [tok.convert_ids_to_tokens(int(x)) for x in topk.indices]

def protect_terms(text, terms):
    """Wrap every occurrence of each term in ``text`` with ``⟦`` ... ``⟧``.

    Terms are applied longest first; terms of equal length are applied in
    lexicographic order so the result does not depend on set iteration order.
    Empty terms are ignored (``str.replace("", ...)`` would otherwise insert
    markers between every character) and duplicates are applied once.

    A shorter term that occurs inside an already wrapped longer term is
    wrapped again, producing nested markers such as ``⟦⟦關稅⟧估價⟧``.

    Args:
        text: input string.
        terms: iterable of strings to protect.

    Returns:
        ``text`` with marker characters inserted around the matched terms.
    """
    unique_terms = {t for t in terms if t}
    for t in sorted(unique_terms, key=lambda t: (-len(t), t)):
        text = text.replace(t, f"⟦{t}⟧")
    return text

def unprotect_terms(text):
    """Strip every ``⟦`` and ``⟧`` marker character from ``text``."""
    for marker in ("⟦", "⟧"):
        text = text.replace(marker, "")
    return text

def correct_sentence(sent, CONFUSION, MANUAL_MAP, DOMAIN):
    """Auto-correct one sentence and return the normalised result.

    Pipeline:
      1. Word level: apply every MANUAL_MAP ``wrong -> right`` pair, longest
         key first, as a literal replacement.
      2. Character level: for each character outside a protected DOMAIN term
         that has alternatives in CONFUSION, take the masked-LM top-6 and
         replace the character with the best-ranked candidate that is in
         ``{character} + CONFUSION[character]``.
      3. Optional pull-back: alphanumeric/CJK runs not in DOMAIN are replaced
         by their closest DOMAIN term when the WRatio score is >= 93.
      4. Punctuation/spacing normalisation via ``norm_punc``.

    Args:
        sent: sentence text.
        CONFUSION: dict mapping a character to a list of look-alike/sound-alike
            candidates.
        MANUAL_MAP: dict mapping a wrong phrase to its correction.
        DOMAIN: collection of domain terms to protect and to pull back to.

    Returns:
        The corrected sentence.
    """
    # Word level first (manual corrections accumulate in MANUAL_MAP over time).
    # str.replace keeps the replacement literal; re.sub would interpret
    # backslashes and group references in the corrected text.
    for wrong, right in sorted(MANUAL_MAP.items(), key=lambda kv: len(kv[0]), reverse=True):
        if wrong:
            sent = sent.replace(wrong, right)

    # Mark DOMAIN terms, then turn the markers into a per-character "locked"
    # flag. The markers themselves are dropped so they are neither corrected
    # nor fed to the language model as context.
    chars, locked = [], []
    depth = 0
    for ch in protect_terms(sent, DOMAIN):
        if ch == "⟦":
            depth += 1
        elif ch == "⟧":
            depth = max(0, depth - 1)  # tolerate a stray closing marker
        else:
            chars.append(ch)
            locked.append(depth > 0)

    for i, ch in enumerate(chars):
        if locked[i]:
            continue  # inside a protected domain term: leave untouched
        alternatives = [c for c in CONFUSION.get(ch, ()) if c != ch]
        if not alternatives:
            # Only the original character would be allowed, so the model
            # cannot change anything; skip the forward pass.
            continue
        cand_pool = [ch] + alternatives
        # Score with the MLM and keep the best-ranked candidate from the pool.
        topk = mlm_topk_char(chars, i, k=6)
        best = next((c for c in topk if c in cand_pool), None)
        if best is not None and best != ch:
            chars[i] = best
    out = "".join(chars)

    # Pull near-miss words back to their closest DOMAIN term (optional).
    if DOMAIN:
        words = set(re.findall(r'[\u4e00-\u9fffA-Za-z0-9]{2,}', out))
        # Fixed order (longest first) keeps the result reproducible.
        for w in sorted(words, key=lambda w: (-len(w), w)):
            if w not in DOMAIN:
                hit = process.extractOne(w, DOMAIN, scorer=fuzz.WRatio)
                if hit and hit[1] >= 93:
                    out = out.replace(w, hit[0])
    return norm_punc(out)

def split_sents(text):
    """Split ``text`` into non-blank pieces.

    A split happens at whitespace that follows ``。``, ``！``, ``？`` or ``?``,
    and at any run of newlines. Pieces that are empty or whitespace-only are
    dropped; kept pieces are returned unstripped.
    """
    pieces = re.split(r'(?<=[。！？\?])\s+|\n+', text)
    kept = []
    for piece in pieces:
        if piece.strip():
            kept.append(piece)
    return kept

def correct_text(text, CONFUSION, MANUAL_MAP, DOMAIN):
    """Correct ``text`` sentence by sentence.

    The text is split with ``split_sents``, each piece is passed to
    ``correct_sentence`` and the results are joined with newlines.
    """
    fixed_lines = []
    for piece in split_sents(text):
        fixed_lines.append(correct_sentence(piece, CONFUSION, MANUAL_MAP, DOMAIN))
    return "\n".join(fixed_lines)

def show_diff(a, b):
    """Return a unified diff of ``a`` (orig.txt) against ``b`` (fixed.txt).

    Both texts are split into lines without their line endings, and the diff
    lines are joined with ``"\\n"``. This keeps the ``---`` / ``+++`` / ``@@``
    header lines on their own lines; mixing ``lineterm=""`` with
    newline-terminated input lines would glue the headers together.

    Returns an empty string when the two texts have identical lines.
    """
    diff_lines = difflib.unified_diff(
        a.splitlines(),
        b.splitlines(),
        fromfile="orig.txt",
        tofile="fixed.txt",
        lineterm="",
    )
    return "\n".join(diff_lines)

In [None]:
IN_PATH  = "/home/jovyan/work/transcripts/raw_trad.txt"   # change this to your input file
OUT_PATH = "/home/jovyan/work/transcripts/auto_fixed.txt"

# Read the raw transcript; the context manager closes the handle promptly.
with open(IN_PATH, "r", encoding="utf-8") as f:
    raw = f.read()

# Reload the resources so edits made to the files since the first cell are used.
CONFUSION, MANUAL_MAP, DOMAIN = load_resources()
fixed = correct_text(raw, CONFUSION, MANUAL_MAP, DOMAIN)

# Write the corrected transcript, creating the output directory if needed.
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
with open(OUT_PATH, "w", encoding="utf-8") as f:
    f.write(fixed)

print(show_diff(raw, fixed))
print("輸出：", OUT_PATH)

In [None]:
def update_manual_map(pairs):
    """Merge manual corrections into the JSON file at MAP_PATH.

    Args:
        pairs: iterable of ``(wrong, right)`` string tuples. A pair is ignored
            when either side is empty or both sides are equal. A later pair
            with the same ``wrong`` key overwrites an earlier one.

    The file is rewritten as pretty-printed, non-ASCII-escaped JSON, and the
    resulting number of entries is printed. In-memory copies of the map
    loaded earlier are not updated.
    """
    # Read inside a context manager so the handle is closed before the file
    # is reopened for writing.
    with open(MAP_PATH, "r", encoding="utf-8") as f:
        mp = json.load(f)
    for a, b in pairs:
        if a and b and a != b:
            mp[a] = b
    with open(MAP_PATH, "w", encoding="utf-8") as f:
        json.dump(mp, f, ensure_ascii=False, indent=2)
    print("已更新 manual_map.json，共", len(mp), "條")

# 範例：把你這輪人工更正回寫
# update_manual_map([("上至", "上字"), ("完稅價", "完稅價格")])