In [84]:
import json
import random

# Input and output files
ground_truth_file = "C:/Users/jguo/Documents/PURE/data/bert_simple_ground_truth.jsonl"
train_file = "C:/Users/jguo/Documents/PURE/data/new_train.jsonl"
valid_file = "C:/Users/jguo/Documents/PURE/data/new_valid.jsonl"

# Fix the random seed so the shuffle below is reproducible
random.seed(42)

# Load the ground-truth records, one JSON object per line.
# Blank lines (for example a trailing empty line) are skipped rather than
# handed to json.loads, which would raise on them.
with open(ground_truth_file, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"Nombre total: {len(data)}")

# Shuffle in place, then split 80/20 into train / validation
random.shuffle(data)
split = int(0.8 * len(data))
train_data = data[:split]
valid_data = data[split:]

print(f"Résultat de la division: train={len(train_data)}, valid={len(valid_data)}")

# Write both subsets back out, one JSON object per line
with open(train_file, "w", encoding="utf-8") as f:
    for obj in train_data:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

with open(valid_file, "w", encoding="utf-8") as f:
    for obj in valid_data:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print("Terminé ✅")


Nombre total: 853
Résultat de la division: train=682, valid=171
Terminé ✅


In [95]:
import json

# Source file paths
file_a = "C:/Users/jguo/Documents/PURE/data/bert_simple_ground_truth.jsonl"  # holds the ids to compare against
file_b = "C:/Users/jguo/Documents/PURE/data/output.jsonl"  # file to be cleaned
output_file = "C:/Users/jguo/Documents/PURE/data/test_pure.jsonl"  # where the result is saved

# Step 1: collect every id present in file_a
ids_in_a = set()
with open(file_a, "r", encoding="utf-8") as fa:
    for line in fa:
        if not line.strip():
            # skip blank lines instead of letting json.loads raise on them
            continue
        data = json.loads(line)
        ids_in_a.add(data["id"])

# Step 2: keep only the file_b records whose id is NOT in file_a
with open(file_b, "r", encoding="utf-8") as fb, open(output_file, "w", encoding="utf-8") as fout:
    for line in fb:
        if not line.strip():
            continue
        data = json.loads(line)
        if data["id"] not in ids_in_a:
            fout.write(json.dumps(data, ensure_ascii=False) + "\n")


In [66]:
!py -m pip install Unidecode dateparser
# or: python -m pip install Unidecode dateparser





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\jguo\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [85]:
import re
import json
import dateparser

# French month names, listed both with and without accents so either
# spelling is recognised.
MONTHS = [
    "janvier", "février", "fevrier", "mars", "avril", "mai", "juin", "juillet",
    "août", "aout", "septembre", "octobre", "novembre", "décembre", "decembre",
]

# Regex alternation built from the month names above.
MONTHS_ALT = "|".join(MONTHS)

# Matches "<day> <month> <4-digit year>" where <day> is "1er" or one/two
# digits; matching is case-insensitive.
DATE_PATTERN = re.compile(
    r"\b(?:1er|\d{1,2})\s+(?:" + MONTHS_ALT + r")\s+\d{4}\b",
    re.IGNORECASE,
)

def extract_full_dates(text: str):
    """
    Collect the complete date expressions found in ``text``
    (for example '1er février 1877').

    Each candidate matched by ``DATE_PATTERN`` is kept only if ``dateparser``
    can parse it as French. The returned list holds the fragments exactly as
    written in the sentence (accents and case preserved), in order of
    appearance. Non-string or blank input yields an empty list.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    found = []
    for match in DATE_PATTERN.finditer(text):
        snippet = match.group(0)
        parsed = dateparser.parse(
            snippet,
            languages=["fr"],
            settings={"DATE_ORDER": "DMY"},
        )
        if parsed:
            found.append(snippet.strip())
    return found

def replace_incomplete_dates(sent, triples):
    """
    Replace ISO-style dates in ``triples`` with the date phrase written in ``sent``.

    A triple whose ``obj`` is 'YYYY-MM' or 'YYYY-MM-DD' is rewritten to the
    original fragment from the sentence (for example '1er février 1877') when
    the sentence contains a full date for the same year and month. For
    'YYYY-MM-DD' the day must match as well.

    Parameters:
        sent: the sentence; if it is not a string nothing is replaced.
        triples: list of dicts; anything that is not a list is treated as [].

    Returns:
        The same list object, with matching ``obj`` values replaced in place.
    """
    if not isinstance(triples, list):
        triples = []
    if not isinstance(sent, str):
        return triples

    # 1) Map (year, month) -> [(day, phrase), ...] in sentence order.
    #    The parsed day is kept so 'YYYY-MM-DD' can be compared numerically;
    #    a substring test such as "7" in "12 mars 1877" would also hit the
    #    digits of the year.
    phrases_by_month = {}
    for phr in extract_full_dates(sent):
        dt = dateparser.parse(
            phr, languages=["fr"],
            settings={"DATE_ORDER": "DMY"}
        )
        if dt:
            # zero-pad the year so it compares equal to the 4-digit regex group
            key = (f"{dt.year:04d}", f"{dt.month:02d}")
            phrases_by_month.setdefault(key, []).append((dt.day, phr))

    # 2) Write the phrases back into the triples.
    for triple in triples:
        if not isinstance(triple, dict):
            continue
        obj = triple.get("obj", "")
        if not isinstance(obj, str):
            continue

        m = re.fullmatch(r"(\d{4})-(\d{2})(?:-(\d{2}))?", obj)
        if not m:
            continue
        candidates = phrases_by_month.get((m.group(1), m.group(2)))
        if not candidates:
            continue

        if m.group(3) is None:
            # Case A: YYYY-MM. If several phrases share the month, the last one wins.
            triple["obj"] = candidates[-1][1]
        else:
            # Case B: YYYY-MM-DD. Only accept a phrase whose parsed day matches.
            day = int(m.group(3))
            same_day = [phr for d, phr in candidates if d == day]
            if same_day:
                triple["obj"] = same_day[-1]

    return triples

# ====== Batch-process the file ======
# Reads new_train.jsonl, rewrites ISO-style dates in each record's triples via
# replace_incomplete_dates, and writes the result to train_fixed.jsonl.
input_file = "C:/Users/jguo/Documents/PURE/data/new_train.jsonl"
output_file = "C:/Users/jguo/Documents/PURE/data/train_fixed.jsonl"

# Counters: records written, and records whose triples were changed
n_total = 0
n_replaced = 0

with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:
        # Skip blank lines
        if not line.strip():
            continue
        d = json.loads(line)

        sent = d.get("sent")  # some records may have no 'sent', or a None value
        triples = d.get("triples", [])

        # Snapshot BEFORE the call: replace_incomplete_dates mutates the triples in place,
        # so the comparison has to be made on serialized copies.
        before = json.dumps(triples, ensure_ascii=False, sort_keys=True)
        triples_fixed = replace_incomplete_dates(sent, triples)
        after = json.dumps(triples_fixed, ensure_ascii=False, sort_keys=True)

        if before != after:
            n_replaced += 1

        d["triples"] = triples_fixed
        fout.write(json.dumps(d, ensure_ascii=False) + "\n")
        n_total += 1

print(f"Traitement terminé ! Lignes: {n_total}, Modifiées: {n_replaced}")


Traitement terminé ! Lignes: 682, Modifiées: 232


In [68]:
import json
import dateparser
import re

def fr_date_to_iso(date_str):
    """Convert a French date ('1er février 1877', ...) to ISO form ('1877-02-01').

    The input is returned unchanged when ``dateparser`` cannot parse it.
    """
    parsed = dateparser.parse(date_str, languages=['fr'])
    return parsed.strftime("%Y-%m-%d") if parsed else date_str

def update_triples_date_obj(triples):
    """Rewrite full French dates found in triple objects to ISO format, in place.

    For every triple whose ``obj`` is a string of the form
    '<1er | 1-2 digits> <word> <4 digits>' (e.g. '1er février 1877',
    '21 mai 1956'), ``obj`` is replaced by ``fr_date_to_iso(obj)``.
    Triples that are not dicts, or whose ``obj`` is missing or not a string,
    are left untouched instead of raising inside ``re.fullmatch``.

    Returns the ``triples`` argument itself.
    """
    pattern = re.compile(r"\b(?:1er|\d{1,2})\s+[a-zéû]+(?:\s+\d{4})", re.IGNORECASE)
    for triple in triples or []:
        if not isinstance(triple, dict):
            continue
        obj = triple.get("obj", "")
        if not isinstance(obj, str):
            continue
        # Only a whole-string match counts: the object must be nothing but the date
        if pattern.fullmatch(obj):
            triple["obj"] = fr_date_to_iso(obj)
    return triples

# ========== Batch processing of the JSONL file ==========
# Reads train_fixed.jsonl, converts full French dates in each record's triples
# to ISO format, and writes the result to new_train.jsonl.
# NOTE(review): new_train.jsonl is also the name written by the train/valid split
# cell, so running this cell overwrites that file — confirm this is intended.
input_file = "C:/Users/jguo/Documents/PURE/data/train_fixed.jsonl"
output_file = "C:/Users/jguo/Documents/PURE/data/new_train.jsonl"

with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:
        d = json.loads(line)
        # Every record is expected to carry a "triples" key (a missing key raises KeyError)
        d["triples"] = update_triples_date_obj(d["triples"])
        fout.write(json.dumps(d, ensure_ascii=False) + "\n")

print("Traitement terminé !")


Traitement terminé !


In [89]:
import json
import unicodedata
import re

# ===== 工具：不区分大小写/重音的包含判断 =====
def _norm(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    # 去重音
    s = "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")
    # 统一空白
    s = re.sub(r"\s+", " ", s)
    return s

def _strip_voie_if_matches_sentence(text: str, sentence: str) -> str:
    """
    如果 text 以 'voie ' 开头，且去掉前缀的部分能在句子中找到（宽松匹配），
    则返回去掉 'voie ' 的部分；否则返回原 text。
    """
    if not isinstance(text, str):
        return text
    m = re.match(r"^\s*voie\s+(.+)$", text, flags=re.IGNORECASE)
    if not m:
        return text
    tail = m.group(1).strip()
    if not tail:
        return text
    if _norm(tail) in _norm(sentence):
        return tail
    return text

# ===== 关系侧：哪些字段应当是“地名（整串）”，哪些字段是“类型（caractéristique géographique）” =====
def sides_expect_names(rel: str):
    """
    Tell which sides of a relation hold a place name.

    Returns ``(sub_is_name, obj_is_name)``:
      - isLandmarkTypeOf / isLandmarkTypeOF: sub is a type, obj is a place name -> (False, True)
      - hasOldName, hasNewName, touches, within: both sides are place names -> (True, True)
      - everything else -> (True, False). This covers isLandmarkType (sub is
        the place name, obj the type), the time relations (hasGeometryChangeOn,
        hasNameChangeOn, isNumberedOn, isClassifiedOn, disappearsOn, appearsOn,
        hasAppearedRelationOn: obj is a time) and unknown relations, where
        only sub is assumed to be a place name.
    """
    key = (rel or "").strip()
    if key in ("isLandmarkTypeOf", "isLandmarkTypeOF"):
        return False, True
    if key in ("hasOldName", "hasNewName", "touches", "within"):
        return True, True
    return True, False

# ====== Run ======
# Reads new_valid.jsonl, drops unwanted triples and strips a leading 'voie ' from
# place-name fields, then writes every record to valid_filtered.jsonl.
input_file = "C:/Users/jguo/Documents/PURE/data/new_valid.jsonl"
output_file = "C:/Users/jguo/Documents/PURE/data/valid_filtered.jsonl"

# Counters: non-blank records read / records written
n_in, n_out = 0, 0
with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        n_in += 1
        data = json.loads(line)

        sent = data.get("sent", "")  # original sentence, used for matching
        triples = data.get("triples", [])
        new_triples = []

        for triple in triples:
            rel = triple.get("rel", "")
            sub = triple.get("sub", "")
            obj = triple.get("obj", "")

            # 1) Drop triples whose obj == "noTime"
            if obj == "noTime":
                continue

            # 2) Drop redundant hasNewName/hasOldName triples where sub and obj are the same
            #    (compared case- and accent-insensitively via _norm)
            if rel in ("hasNewName", "hasOldName") and _norm(sub) == _norm(obj):
                continue

            # Drop triples whose subject is the literal string "municipality"
            if sub == "municipality":
                continue

            # 3) Rewrite 'voie XXX' -> 'XXX', only on the side(s) the relation expects to be a
            #    full place name, and only when 'XXX' can be found in the sentence
            sub_is_name, obj_is_name = sides_expect_names(rel)

            if sub_is_name:
                sub = _strip_voie_if_matches_sentence(sub, sent)
            if obj_is_name:
                obj = _strip_voie_if_matches_sentence(obj, sent)

            # Write the (possibly corrected) values back into the triple
            triple["sub"] = sub
            triple["obj"] = obj

            new_triples.append(triple)

        data["triples"] = new_triples
        fout.write(json.dumps(data, ensure_ascii=False) + "\n")
        n_out += 1

print(f"Tous les filtres appliqués. Entrées lues={n_in}, sorties écrites={n_out}. Fichier : {output_file}")


Tous les filtres appliqués. Entrées lues=171, sorties écrites=171. Fichier : C:/Users/jguo/Documents/PURE/data/valid_filtered.jsonl


In [70]:
!pip install transformers




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [71]:
!pip install unidecode




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [72]:
!pip install dateparser





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [94]:
import json
import re
import difflib
from unidecode import unidecode
import dateparser
from transformers import AutoTokenizer

# Tokenizer loaded from a local model directory (no download); shared by the functions below.
tokenizer = AutoTokenizer.from_pretrained("C:/Users/jguo/Documents/PURE/camembert-base")

def normalize(text: str) -> str:
    """Return ``text`` transliterated with ``unidecode``, lower-cased, and with
    spaces and apostrophes (straight and curly) removed."""
    folded = unidecode(text).lower()
    for unwanted in (" ", "’", "'"):
        folded = folded.replace(unwanted, "")
    return folded

# 在 normalize 下方加：
def normalize_loose(text: str) -> str:
    """Looser variant of ``normalize``: transliterate, lower-case, then keep only
    ASCII letters, digits and '/' (so codes such as J/16 survive).

    Non-string input gives the empty string.
    """
    if isinstance(text, str):
        return re.sub(r"[^a-z0-9/]+", "", unidecode(text).lower())
    return ""

def char_to_token_span(tokens, text, char_start, char_end):
    """Map the character range [char_start, char_end) of ``text`` to token indices.

    The tokens are walked left to right. Each token's surface form is the
    token with '▁' replaced by a space and then stripped, and it is assumed to
    sit at the next non-whitespace position of ``text``. Returns the inclusive
    ``(start_token, end_token)`` pair, or ``(None, None)`` when either boundary
    does not fall inside a token.
    """
    first_tok = last_tok = None
    cursor = 0
    text_len = len(text)
    for position, piece in enumerate(tokens):
        surface = piece.replace("▁", " ").strip()
        # Skip over any run of whitespace in the original text
        while cursor < text_len and text[cursor].isspace():
            cursor += 1
        piece_begin, piece_end = cursor, cursor + len(surface)
        if first_tok is None and piece_begin <= char_start < piece_end:
            first_tok = position
        if piece_begin < char_end <= piece_end:
            last_tok = position
        cursor = piece_end
    if first_tok is None or last_tok is None:
        return None, None
    return first_tok, last_tok

def find_sublist_ci(lst, sub):
    """Case-insensitive search for the token sequence ``sub`` inside ``lst``.

    Returns the inclusive ``(start, end)`` indices of the first occurrence, or
    ``(-1, -1)`` when there is none. An empty ``sub`` is reported as not found;
    otherwise it would yield the bogus span ``(0, -1)``, which callers that only
    test ``start != -1`` would take for a hit.
    """
    if not sub or len(sub) > len(lst):
        return -1, -1
    # Lower-case both sides once instead of once per candidate window
    needle = [t.lower() for t in sub]
    haystack = [t.lower() for t in lst]
    width = len(needle)
    for i in range(len(haystack) - width + 1):
        if haystack[i:i + width] == needle:
            return i, i + width - 1
    return -1, -1


# ---------- 事件类型与时间抽取 ----------
def tag_event_type(sent: str):
    """Extract the event label written between the first pair of '||' markers.

    Returns ``(event, text_before_marker, match_start, match_end)`` where the
    offsets cover the whole '|| ... ||' match, or ``(None, None, -1, -1)`` when
    the sentence has no such marker pair.
    """
    found = re.search(r"\|\|\s*([^\|]+?)\s*\|\|", sent)
    if found is None:
        return None, None, -1, -1
    begin, finish = found.span()
    return found.group(1).strip(), sent[:begin], begin, finish

def extract_time(sent: str):
    """
    Simple time extraction; returns the leftmost match of one of:
    'dd mois yyyy' (the day may also be written '1er'), 'yyyy-mm-dd', 'yyyy',
    or a century code 'C' + 1-2 digits with an optional 'e'.

    Returns ``(matched_text, start, end)`` as character offsets into ``sent``,
    or ``(None, -1, -1)`` when nothing matches or ``sent`` is not a string.
    """
    if not isinstance(sent, str):
        return None, -1, -1
    # '1er' is accepted as a day so that '1er février 1877' is returned whole
    # instead of being reduced to its year.
    m = re.search(r"(?:1er|\d{1,2}) [a-zéû]+ \d{4}|\d{4}-\d{2}-\d{2}|\d{4}|C\d{1,2}(e)?", sent)
    if m:
        return m.group(), m.start(), m.end()
    return None, -1, -1

# ---------- 主找 span 函数 ----------
def _find_normalized_window(text, target, norm_fn, max_len):
    """Return the first ``(start, end)`` slice of ``text`` whose ``norm_fn`` form equals ``target``.

    Start positions whose single character normalises to the empty string
    (spaces, apostrophes, punctuation, depending on ``norm_fn``) are skipped,
    so the window never begins on a character that normalisation drops.
    For each start the shortest matching window, at most ``max_len``
    characters long, is taken. Returns ``None`` when nothing matches.
    """
    text_len = len(text)
    for start in range(text_len):
        if not norm_fn(text[start]):
            continue
        for end in range(start + 1, min(text_len, start + max_len) + 1):
            if norm_fn(text[start:end]) == target:
                return start, end
    return None


def find_token_span(tokens, text, entity, entity_type=None):
    """Locate ``entity`` in ``text`` and return its inclusive ``(start, end)`` token span.

    Strategies are tried in order; the first that finds a character range
    hands it to ``char_to_token_span``:

      * ``entity_type`` "thoroughfare" / "municipality": skipped, ``(None, None)``.
      * "landmark": a lane code such as "AH/15", then the last two words of the
        entity, then the last word preceded by du/de/des/la/le. If none hits,
        the generic strategies below are used.
      * "date": century codes ("C19", "C19e") are looked up as "19e siècle" /
        "XIXe siècle" (with or without the 'e'); ISO dates (YYYY, YYYY-MM,
        YYYY-MM-DD) are looked up as French date phrases in the transliterated,
        lower-cased text; finally a verbatim match. Date entities never reach
        the generic strategies.
      * generic: verbatim match, then a match after ``normalize``, then after
        ``normalize_loose``, then a fuzzy match (similarity ratio above 0.82).

    Returns ``(None, None)`` when nothing is found. A message is printed for a
    fuzzy match, and for a miss when ``entity_type`` is given.
    """
    # Entities of these types are skipped without any output
    if entity_type in ("thoroughfare", "municipality"):
        return None, None

    # Loose matching for lane codes: "voie AH/15", "AH/15", etc.
    if entity_type == "landmark":
        code_pattern = r"[A-Z]{1,3}/\d{1,3}"
        entity_code = re.findall(code_pattern, entity)
        if entity_code:
            c = entity_code[0]
            idx = text.find(c)
            if idx != -1:
                return char_to_token_span(tokens, text, idx, idx + len(c))
        entity_words = entity.strip().split()
        if len(entity_words) >= 2:
            last_words = " ".join(entity_words[-2:])
            idx = text.lower().find(last_words.lower())
            if idx != -1:
                return char_to_token_span(tokens, text, idx, idx + len(last_words))
            last_word = entity_words[-1]
            idx2 = text.lower().find(last_word.lower())
            if idx2 != -1:
                # Look a few characters to the left for a leading article/preposition
                left = max(0, idx2 - 6)
                match = re.search(
                    r"(du|de|des|la|le)\s+" + re.escape(last_word),
                    text[left:idx2 + len(last_word)],
                    re.IGNORECASE,
                )
                if match:
                    span_start = left + match.start()
                    span_end = left + match.end()
                    return char_to_token_span(tokens, text, span_start, span_end)

    if entity_type == "date":
        century_match = re.fullmatch(r"C(\d{1,2})(e)?", entity, re.IGNORECASE)
        if century_match:
            century_num = int(century_match.group(1))
            roman_map = {
                1: "I", 2: "II", 3: "III", 4: "IV", 5: "V", 6: "VI", 7: "VII", 8: "VIII", 9: "IX", 10: "X",
                11: "XI", 12: "XII", 13: "XIII", 14: "XIV", 15: "XV", 16: "XVI", 17: "XVII", 18: "XVIII",
                19: "XIX", 20: "XX", 21: "XXI",
            }
            roman = roman_map.get(century_num)
            patterns = [f"{century_num}e siècle", f"{century_num} siècle"]
            # Without a roman numeral the pattern would shrink to "e siècle"
            # and match the tail of any century expression.
            if roman:
                patterns += [f"{roman}e siècle", f"{roman} siècle"]
            lowered_text = text.lower()
            for pat in patterns:
                idx = lowered_text.find(pat.lower())
                if idx != -1:
                    return char_to_token_span(tokens, text, idx, idx + len(pat))

        mois_fr = ["janvier", "fevrier", "mars", "avril", "mai", "juin", "juillet",
                   "aout", "septembre", "octobre", "novembre", "decembre"]
        mois_fr_accent = ["janvier", "février", "mars", "avril", "mai", "juin", "juillet",
                          "août", "septembre", "octobre", "novembre", "décembre"]
        m_full = re.fullmatch(r"(\d{4})-(\d{2})-(\d{2})", entity)
        m_month = re.fullmatch(r"(\d{4})-(\d{2})", entity)
        m_year = re.fullmatch(r"(\d{4})", entity)
        year, month_idx, day = None, None, None
        if m_full:
            year = m_full.group(1)
            month_idx = int(m_full.group(2)) - 1
            day = int(m_full.group(3))
        elif m_month:
            year = m_month.group(1)
            month_idx = int(m_month.group(2)) - 1
        elif m_year:
            year = m_year.group(1)
        # A month outside 01..12 cannot be looked up (13 would raise IndexError,
        # 00 would silently mean December): fall back to a year-only search.
        if month_idx is not None and not 0 <= month_idx < 12:
            month_idx, day = None, None

        txt_norm = unidecode(text).lower()
        candidates = []
        if year and month_idx is not None:
            for mois in [mois_fr[month_idx], mois_fr_accent[month_idx]]:
                if day:
                    regexs = [
                        rf"\b{day}\s+{mois}\s+{year}\b",
                        rf"\b{str(day).zfill(2)}\s+{mois}\s+{year}\b",
                        rf"\b1er\s+{mois}\s+{year}\b" if day == 1 else "",
                    ]
                    for rgx in regexs:
                        if not rgx:
                            continue
                        mobj = re.search(rgx, txt_norm)
                        if mobj:
                            candidates.append((mobj.start(), mobj.end()))
                # Month + year alone, in case the day is written differently
                mobj = re.search(rf"\b{mois}\s+{year}\b", txt_norm)
                if mobj:
                    candidates.append((mobj.start(), mobj.end()))
        elif year:
            mobj = re.search(rf"\b{year}\b", txt_norm)
            if mobj:
                candidates.append((mobj.start(), mobj.end()))
        if candidates:
            # Earliest start wins; the offsets refer to the transliterated text
            candidates.sort()
            char_start, char_end = candidates[0]
            return char_to_token_span(tokens, unidecode(text), char_start, char_end)
        if entity in text:
            char_start = text.index(entity)
            return char_to_token_span(tokens, text, char_start, char_start + len(entity))
        print(f"Entity [{entity}] not found in [{text}]")
        return None, None

    # --- generic strategies ---
    # 1) verbatim match
    idx_raw = text.find(entity)
    if idx_raw != -1:
        return char_to_token_span(tokens, text, idx_raw, idx_raw + len(entity))

    # 2) match after normalisation (case, accents, spaces, apostrophes)
    entity_norm = normalize(entity)
    if entity_norm and entity_norm in normalize(text):
        window = _find_normalized_window(text, entity_norm, normalize, len(entity) + 8)
        if window is not None:
            return char_to_token_span(tokens, text, window[0], window[1])
        if entity_type is not None:
            print(f"[AVERTISSEMENT] L'entité normalisée « {entity} » n'a pas été retrouvée dans le texte original « {text} ».")

    # 3) loose match (punctuation, hyphens, ... ignored)
    ent_loose = normalize_loose(entity)
    if ent_loose and ent_loose in normalize_loose(text):
        window = _find_normalized_window(text, ent_loose, normalize_loose, len(entity) + 20)
        if window is not None:
            return char_to_token_span(tokens, text, window[0], window[1])

    # 4) fuzzy match over windows of roughly the entity's length
    max_ratio = 0
    best_start, best_end = None, None
    text_len = len(text)
    for start in range(text_len):
        if not normalize(text[start]):
            # never start on a character that normalisation drops (space, apostrophe)
            continue
        first_end = start + max(2, len(entity) - 4)
        # "+ 1" so that a window may extend to the very end of the text
        last_end = min(text_len, start + len(entity) + 7)
        for end in range(first_end, last_end + 1):
            frag_norm = normalize(text[start:end])
            if len(frag_norm) < 3:
                continue
            ratio = difflib.SequenceMatcher(None, frag_norm, entity_norm).ratio()
            if ratio > 0.82 and ratio > max_ratio:
                max_ratio = ratio
                best_start, best_end = start, end
    if best_start is not None and best_end is not None:
        print(f"[INFO] Appariement flou '{entity}' ≈ '{text[best_start:best_end]}', similarité={max_ratio:.2f}")
        return char_to_token_span(tokens, text, best_start, best_end)
    if entity_type is not None:
        print(f"Entity [{entity}] not found in [{text}]")
    return None, None

def looks_like_date(s: str) -> bool:
    """Tell whether ``s`` (after stripping) has one of the date shapes used in this file.

    Accepted shapes: 'YYYY', 'YYYY-MM', 'YYYY-MM-DD', '<day> <month word> YYYY'
    where the day is 1-2 digits or '1er' (case-insensitive), and century codes
    'C' + 1-2 digits with an optional 'e' (case-insensitive).
    Non-string input is never a date.
    """
    if not isinstance(s, str):
        return False
    s = s.strip()
    # ISO-style: year alone, year-month, or year-month-day
    if re.fullmatch(r"\d{4}(?:-\d{2}){0,2}", s):
        return True
    # French written date; '1er' is accepted like everywhere else in this file
    if re.fullmatch(r"(?:1er|\d{1,2}) [a-zéû]+ \d{4}", s.lower()):
        return True
    return bool(re.fullmatch(r"C\d{1,2}(e)?", s, re.IGNORECASE))

def find_sublist(lst, sub):
    """Find the first occurrence of the sequence ``sub`` inside ``lst`` (exact comparison).

    Returns the inclusive ``(start, end)`` indices, or ``(-1, -1)`` when ``sub``
    does not occur. An empty ``sub`` is reported as not found rather than as
    the bogus span ``(0, -1)``.
    """
    width = len(sub)
    if width == 0:
        return -1, -1
    for i in range(len(lst) - width + 1):
        if lst[i:i + width] == sub:
            return i, i + width - 1
    return -1, -1

def _first_delim_left_end(tokens):
    for i, t in enumerate(tokens):
        if t in ("|", "▁|"):
            return max(0, i - 1)
    return len(tokens) - 1

GEO_PREFIXES_LOWER = {
    "rue", "place", "boulevard", "quai", "impasse", "passage",
    "allée", "avenue", "cours", "chemin", "square", "parvis"   
}

def _is_geo_prefix_token(tok):
    base = tok.replace("▁", "").lower()
    return base in GEO_PREFIXES_LOWER

def _is_punct_token(tok):
    base = tok.replace("▁", "")
    return base in {",", "-", "–", "—"}


def build_ner(sent, tokens, triples=None):
    """Build the NER spans of one sentence as ``[start, end, label]`` lists.

    ``start`` and ``end`` are inclusive indices into ``tokens``. Labels produced:

      * "nom géographique": a whole place name,
      * "caractéristique géographique": its street-type prefix (Rue, Place, ...),
      * "nom": the proper-name part, after the prefix and leading function words,
      * "EventType": the text between '||' markers (see ``tag_event_type``),
      * "Time": the first time expression (see ``extract_time``).

    Place names come from ``triples``; which side of a triple is a place name
    depends on its relation. If no "nom géographique" starts at token 0 and the
    sentence opens with a prefix word, everything up to the first '|' token is
    treated as a place name. Finally exact duplicates are removed and nested
    "nom géographique" / "nom" spans are collapsed.
    """
    NG = "nom géographique"
    CG = "caractéristique géographique"
    NOM = "nom"
    # Function words skipped when they directly follow the prefix
    INITIAL_STOPS = {"de", "du", "des", "la", "le"}
    # Prefixes looked for at the start of a matched place name.
    # "Parvis" is included so this list agrees with GEO_PREFIXES_LOWER.
    INNER_PREFIXES = ("Rue", "Place", "Boulevard", "Quai", "Impasse", "Passage",
                      "Allée", "Avenue", "Cours", "Chemin", "Square", "Parvis")

    ner = []

    def _prefer_primary_nom(spans):
        """
        Collapse the 'nom' spans:
        - the primary 'nom' is the one with the smallest start; on a tie, the longest;
        - every other 'nom' fully contained in the primary one is dropped.
        Other labels are left untouched.
        """
        noms = [(s, e) for s, e, l in spans if l == NOM]
        if not noms:
            return spans
        s_keep, e_keep = min(noms, key=lambda se: (se[0], -(se[1] - se[0])))
        kept = []
        for s, e, l in spans:
            is_other_nom = l == NOM and not (s == s_keep and e == e_keep)
            if is_other_nom and s >= s_keep and e <= e_keep:
                continue
            kept.append([s, e, l])
        return kept

    def _dedup_spans(spans):
        """Remove exact (start, end, label) duplicates, keeping first-seen order."""
        seen = set()
        out = []
        for s, e, l in spans:
            key = (s, e, l)
            if key in seen:
                continue
            seen.add(key)
            out.append([s, e, l])
        return out

    def _prefer_longest_nom_geo(spans):
        """
        Collapse the 'nom géographique' (NG) spans:
        - if some NG starts at token 0, keep the longest of those;
        - otherwise keep the longest NG overall;
        - drop every other NG fully contained in the kept one.
        Other labels are left untouched.
        """
        ngs = [(s, e) for s, e, l in spans if l == NG]
        if not ngs:
            return spans
        pool = [se for se in ngs if se[0] == 0] or ngs
        keep_s, keep_e = max(pool, key=lambda se: (se[1] - se[0], -se[0]))
        kept = []
        for s, e, l in spans:
            is_other_ng = l == NG and not (s == keep_s and e == keep_e)
            if is_other_ng and s >= keep_s and e <= keep_e:
                continue
            kept.append([s, e, l])
        return kept

    def _skip_leading_stops(start, limit):
        """Advance ``start`` past the function words that open the name.

        Skips de/du/des/la/le and the split elisions l' / d' (token "l" or "d"
        followed by an apostrophe token), and stops at the first other token
        or once ``limit`` is passed.
        """
        while start <= limit:
            tok = tokens[start].replace("▁", "").lower()
            if tok in {"l", "d"} and start + 1 <= limit and tokens[start + 1] == "'":
                start += 2
                continue
            if tok in INITIAL_STOPS:
                start += 1
                continue
            break
        return start

    # 1) Collect from the triples the strings to treat as whole place names,
    #    according to the direction of each relation.
    landmark_texts = []
    if triples:
        for t in triples:
            rel = t.get("rel", "")
            sub = t.get("sub", "")
            obj = t.get("obj", "")
            if rel in ("isLandmarkTypeOf", "isLandmarkTypeOF"):
                # obj is the place name
                if obj:
                    landmark_texts.append(obj)
            elif rel in ("hasOldName", "hasNewName", "touches", "within"):
                # both sides are place names
                if sub:
                    landmark_texts.append(sub)
                if obj:
                    landmark_texts.append(obj)
            else:
                # isLandmarkType, the time relations and unknown relations: sub is the place name
                if sub:
                    landmark_texts.append(sub)

    # De-duplicate while keeping first-seen order
    lm_list = list(dict.fromkeys(landmark_texts))

    # 2) Tokenize the candidate prefixes once (lower-cased) instead of once per place name
    prefix_token_lists = [[t.lower() for t in tokenizer.tokenize(pref)] for pref in INNER_PREFIXES]

    # 3) Emit the three labels for every place name
    for loc_text in lm_list:
        loc_tokens = tokenizer.tokenize(loc_text)
        s, e = find_sublist_ci(tokens, loc_tokens)
        if s == -1:
            # Token sequences differ (case / accents / tokenization):
            # fall back to a character-level search mapped back to tokens.
            s, e = find_token_span(tokens, sent, loc_text, entity_type="landmark")
        if s in (None, -1) or e in (None, -1):
            continue

        # If the hit is the proper-name part only and the token to its left is a
        # prefix, extend the whole name leftwards to include that prefix.
        # Example: tokens = ["▁rue","▁valent","in","▁ha","ü","y",...],
        # loc_text = "Valentin Haüy" hits [1,5]; tokens[0] is "rue", so the name becomes [0,5].
        if s - 1 >= 0 and _is_geo_prefix_token(tokens[s - 1]):
            ner.append([s - 1, s - 1, CG])
            ner.append([s - 1, e, NG])
            ner.append([s, e, NOM])  # the proper name still starts at the original s
            continue

        # Otherwise look for the longest prefix at the start of the matched span
        ner.append([s, e, NG])
        best = None
        for p_tok in prefix_token_lists:
            L = len(p_tok)
            if L and s + L - 1 <= e and [t.lower() for t in tokens[s:s + L]] == p_tok:
                if best is None or L > best[2]:
                    best = (s, s + L - 1, L)
        if best:
            ner.append([best[0], best[1], CG])
            nom_start = best[1] + 1
        else:
            nom_start = s

        nom_start = _skip_leading_stops(nom_start, e)
        if nom_start <= e:
            ner.append([nom_start, e, NOM])

    # Sentence-start fallback: no NG starts at token 0 yet, but the sentence
    # opens with a prefix word, so the title area is taken as a place name.
    if not any(lbl == NG and s == 0 for s, e, lbl in ner):
        if len(tokens) > 1 and _is_geo_prefix_token(tokens[0]):
            end0 = _first_delim_left_end(tokens)
            if end0 > 0:
                # Whole name: the title area
                ner.append([0, end0, NG])

                # Every prefix word in the title area is labelled as a prefix
                prefix_positions = []
                for i in range(0, end0 + 1):
                    if _is_geo_prefix_token(tokens[i]):
                        ner.append([i, i, CG])
                        prefix_positions.append(i)

                # 'nom' runs from the first content word after the last prefix
                # to the end of the title area, without surrounding punctuation.
                nom_start = max(prefix_positions) + 1 if prefix_positions else 0
                while nom_start <= end0 and _is_punct_token(tokens[nom_start]):
                    nom_start += 1
                nom_start = _skip_leading_stops(nom_start, end0)

                nom_end = end0
                while nom_end >= nom_start and _is_punct_token(tokens[nom_end]):
                    nom_end -= 1

                if nom_start <= nom_end:
                    ner.append([nom_start, nom_end, NOM])

    # 4) EventType and Time
    event, _, _, _ = tag_event_type(sent)
    if event:
        ev_tokens = tokenizer.tokenize(event)
        s2, e2 = find_sublist_ci(tokens, ev_tokens)
        if s2 != -1:
            ner.append([s2, e2, "EventType"])

    time_str, t_start, _ = extract_time(sent)
    if time_str:
        # NOTE(review): the Time span is derived by tokenizing the text before the
        # match separately; this assumes that tokenization lines up with `tokens`.
        before = sent[:t_start]
        b_tok = tokenizer.tokenize(before)
        t_tok = tokenizer.tokenize(time_str)
        ner.append([len(b_tok), len(b_tok) + len(t_tok) - 1, "Time"])

    ner = _dedup_spans(ner)
    ner = _prefer_longest_nom_geo(ner)
    ner = _prefer_primary_nom(ner)
    return ner



def build_relations(sent, tokens, ner, triples):
    """Project annotated triples onto token-level relation spans.

    Args:
        sent: Raw sentence text; only forwarded to ``find_token_span``.
        tokens: Tokenizer output for ``sent`` ("▁" marks a word start).
        ner: Iterable of ``[start, end, label]`` entity spans with inclusive
            token indices.
        triples: Iterable of dicts read through the keys ``sub``, ``obj``,
            ``rel``, ``sub_type`` and ``obj_type``; ``None`` means no triples.

    Returns:
        A list of ``[sub_start, sub_end, obj_start, obj_end, relation]``
        entries, one per triple whose subject and object were both located.
        ``relation`` is the normalized label, i.e. the misspelling
        ``"isLandmarkTypeOF"`` is emitted as ``"isLandmarkTypeOf"``.
    """
    NAME_LABEL = "nom géographique"
    FEATURE_LABEL = "caractéristique géographique"
    TIME_LABEL = "Time"
    SKIPPED_TYPES = ("thoroughfare", "municipality")
    TEMPORAL_RELS = ("hasGeometryChangeOn", "hasNameChangeOn", "isNumberedOn",
                     "isClassifiedOn", "disappearsOn", "appearsOn", "hasAppearedRelationOn")

    relations = []

    def _first_delim_token_index(tokens):
        """Index of the first '|' token, or len(tokens) when there is none."""
        for i, t in enumerate(tokens):
            if t in ("|", "▁|"):
                return i
        return len(tokens)  # no delimiter: the whole sentence counts as the title

    def _span_text(start, end):
        """Surface text of an inclusive token span, with '▁' turned into spaces."""
        return "".join(tokens[start:end + 1]).replace("▁", " ").strip()

    def _widest(spans):
        """Widest span of a non-empty list; ties go to the leftmost start."""
        return max(spans, key=lambda se: (se[1] - se[0], -se[0]))

    def get_span_from_ner_pref(entity, label, prefer_before_idx=None):
        """Match `entity` against NER spans of `label`, preferring the title side.

        Spans whose normalized text equals the normalized entity are the
        candidates; without any exact match every span of `label` is a
        candidate. Candidates starting before `prefer_before_idx` win when
        present, and the widest remaining candidate is returned.
        """
        ent_norm = normalize(entity or "")
        labelled = [(s, e) for (s, e, l) in ner if l == label]
        candidates = [(s, e) for (s, e) in labelled if normalize(_span_text(s, e)) == ent_norm]
        if not candidates:
            candidates = labelled
        if not candidates:
            return None
        if prefer_before_idx is not None:
            title_side = [(s, e) for (s, e) in candidates if s < prefer_before_idx]
            if title_side:
                candidates = title_side
        return _widest(candidates)

    def _pick_longest_span(ner, label):
        """Widest NER span carrying `label`, or None when there is none."""
        labelled = [(s, e) for (s, e, l) in ner if l == label]
        return _widest(labelled) if labelled else None

    def _leading_name_span(tokens, ner):
        """Sentence-initial place name: the widest NER name if it starts at
        token 0, otherwise tokens 0..end0 as reported by
        `_first_delim_left_end` (a helper defined elsewhere in the notebook)."""
        span = _pick_longest_span(ner, NAME_LABEL)
        if span and span[0] == 0:
            return span
        end0 = _first_delim_left_end(tokens)
        if end0 > 0:
            return (0, end0)
        return None

    def get_span_from_ner(entity, label):
        """First NER span of `label` whose text matches `entity`; otherwise the
        first-encountered widest span of that label (None if there is none)."""
        best_span = None
        ent_norm = normalize(entity or "")
        for start, end, ner_label in ner:
            if ner_label != label:
                continue
            if normalize(_span_text(start, end)) == ent_norm:
                return (start, end)
            if best_span is None or (end - start) > (best_span[1] - best_span[0]):
                best_span = (start, end)
        return best_span

    for triple in (triples or []):
        sub_text = triple.get("sub")
        obj_text = triple.get("obj")
        obj_is_date = looks_like_date(obj_text)

        sub_type = triple.get("sub_type", "landmark")
        obj_type = "date" if obj_is_date else triple.get("obj_type", "landmark")
        if sub_type in SKIPPED_TYPES or obj_type in SKIPPED_TYPES:
            continue

        rel = triple.get("rel")
        # Normalize the known misspelling BEFORE dispatching. Previously this
        # check was the head of the if/elif chain, so a misspelled triple never
        # reached the "isLandmarkTypeOf" branch and was silently dropped.
        rel_norm = "isLandmarkTypeOf" if rel == "isLandmarkTypeOF" else rel

        sub_span = obj_span = None

        if rel_norm == "isLandmarkType" or rel_norm == "isLandmarkTypeOf":
            # isLandmarkType:   sub = place name,   obj = feature type
            # isLandmarkTypeOf: sub = feature type, obj = place name
            # In both cases spans located in the title (before the first '|') win.
            if rel_norm == "isLandmarkType":
                sub_label, obj_label = NAME_LABEL, FEATURE_LABEL
            else:
                sub_label, obj_label = FEATURE_LABEL, NAME_LABEL
            delim_idx = _first_delim_token_index(tokens)
            sub_span = get_span_from_ner_pref(sub_text, sub_label, prefer_before_idx=delim_idx)
            obj_span = get_span_from_ner_pref(obj_text, obj_label, prefer_before_idx=delim_idx)

            # Last resort: generic text search mapped back to token indices.
            if sub_span is None:
                sub_span = find_token_span(tokens, sent, sub_text, entity_type="landmark")
            if obj_span is None:
                obj_span = find_token_span(tokens, sent, obj_text, entity_type="landmark")

        elif rel_norm in TEMPORAL_RELS:
            sub_span = get_span_from_ner(sub_text, NAME_LABEL)
            obj_span = get_span_from_ner(obj_text, TIME_LABEL)
            if sub_span is None:
                sub_span = find_token_span(tokens, sent, sub_text, entity_type="landmark")
            if obj_span is None:
                obj_span = find_token_span(tokens, sent, obj_text, entity_type="date")

        elif rel_norm in ("hasOldName", "hasNewName"):
            # sub = main (sentence-initial) place name, obj = old/new name,
            # which is itself a place-name string.
            sub_span = get_span_from_ner(sub_text, NAME_LABEL)
            obj_span = get_span_from_ner(obj_text, NAME_LABEL)

            # Subject not found: fall back to the leading place name.
            if sub_span is None:
                sub_span = _leading_name_span(tokens, ner)

            # Object not found: generic finder first, then the widest NER name.
            if obj_span is None:
                obj_span = find_token_span(tokens, sent, obj_text, entity_type="landmark")
            if obj_span is None:
                obj_span = _pick_longest_span(ner, NAME_LABEL)

        elif rel_norm in ("touches", "within"):
            sub_span = get_span_from_ner(sub_text, NAME_LABEL)
            obj_span = get_span_from_ner(obj_text, NAME_LABEL)
            if sub_span is None:
                sub_span = find_token_span(tokens, sent, sub_text, entity_type="landmark")
            if obj_span is None:
                obj_span = find_token_span(tokens, sent, obj_text, entity_type="landmark")

        else:
            # Unknown relation: treat it as place-name -> time by default.
            sub_span = get_span_from_ner(sub_text, NAME_LABEL)
            obj_span = get_span_from_ner(obj_text, TIME_LABEL)
            if sub_span is None:
                sub_span = find_token_span(tokens, sent, sub_text, entity_type="landmark")
            if obj_span is None:
                # An object that is not a date is searched for as a place name.
                etype = "date" if obj_is_date else "landmark"
                obj_span = find_token_span(tokens, sent, obj_text, entity_type=etype)

        # A span whose first element is None is treated as "not found".
        if sub_span and sub_span[0] is not None and obj_span and obj_span[0] is not None:
            relations.append([sub_span[0], sub_span[1], obj_span[0], obj_span[1], rel_norm])

    return relations


def process_one_line(d):
    """Convert one input record into a PURE-style example.

    Args:
        d: Dict read through the keys ``sent``, ``id`` and ``triples``.

    Returns:
        ``None`` when ``sent`` is not a non-blank string (so the tokenizer is
        never handed an invalid value); otherwise a dict with the keys
        ``doc_key``, ``dataset``, ``sentences``, ``ner`` and ``relations``,
        the last three each wrapping a single sentence.
    """
    sent = d.get("sent")
    usable = isinstance(sent, str) and bool(sent.strip())
    if not usable:
        return None

    example = {"doc_key": d.get("id", ""), "dataset": "évolutions d'événements"}

    pieces = tokenizer.tokenize(sent)
    # The triples are looked up separately for each call, so the two callees
    # never share a default list.
    entity_spans = build_ner(sent, pieces, d.get("triples", []))
    relation_spans = build_relations(sent, pieces, entity_spans, d.get("triples", []))

    example["sentences"] = [pieces]
    example["ner"] = [entity_spans]
    example["relations"] = [relation_spans]
    return example

# ====== Batch processing ======
# Convert every non-blank JSONL line of the input file into one PURE example.
input_file = "C:/Users/jguo/Documents/PURE/data/train_filtered.jsonl"
output_file = "C:/Users/jguo/Documents/PURE/data/train_pure.jsonl"

n_in = 0   # non-blank lines read
n_out = 0  # examples written
with open(input_file, "r", encoding="utf-8") as f, \
        open(output_file, "w", encoding="utf-8") as fout:
    for line in map(str.strip, f):
        if line:
            n_in += 1
            data = json.loads(line)

            # Report records that process_one_line is about to reject.
            if not (isinstance(data.get("sent"), str) and data.get("sent").strip()):
                print("[SKIP]", data.get("id"), "sent=", repr(data.get("sent")))

            ex = process_one_line(data)
            if ex is not None:  # None marks an invalid record: nothing is written
                fout.write(json.dumps(ex, ensure_ascii=False) + "\n")
                n_out += 1

print(f"Traitement terminé ! lus={n_in}, écrits={n_out}")


Entity [Rue Cochin] not found in [boulevard de port-royal || Historique || Les rues des Trois Couronnes Saint-Marcel, Cochin, des Bourguignons, des Capucins, de Port-Royal, des Charbonniers Saint-Marcel, des Cendriers, du Champs des Capucins, l'impasse Hautefort et le Champ des Capucins ont été absorbés ou supprimés par ce boulevard]
[INFO] Appariement flou 'rue Rouvet' ≈ 'rue roue', similarité=0.88
Traitement terminé ! lus=682, écrits=682
