In [None]:
# Install Google packages if needed

!pip install google
!pip install google-genai

In [None]:
import os
import logging
import json
import random
import time
import re
from google import genai
import openai

In [None]:
# Project root on the local machine.
# Forward slashes are used on purpose: in a normal string literal the
# backslash form is parsed as escape sequences ("\U..." is an invalid unicode
# escape and a trailing "\" escapes the closing quote), which made this cell a
# SyntaxError.
work_dir = "C:/Users/jguo/Desktop/text2kgbench/"
os.chdir(work_dir)

In [None]:
# File locations: raw LLM responses, cleaned copy, and the evaluation sample.
# NOTE(review): the sampled-output path used to contain one more
# "text2kgbench" level than the input/cleaned paths; it now points to the same
# llm_responses directory as the other two files — confirm this is the
# intended location.
input_file = os.path.join(work_dir, "text2kgbench/text2kgbench/text2kgbench/llm_responses/output_llm_responses_part1.jsonl")
output_cleaned_file = os.path.join(work_dir, "text2kgbench/text2kgbench/text2kgbench/llm_responses/cleaned_output_llm_responses_part1.jsonl")
output_sampled_file = os.path.join(work_dir, "text2kgbench/text2kgbench/text2kgbench/llm_responses/sampled_100_output.jsonl")


def _write_jsonl(path, items):
    """Write `items` to `path` as JSONL (one JSON object per line, UTF-8, non-ASCII kept)."""
    with open(path, 'w', encoding='utf-8') as fout:
        for item in items:
            json.dump(item, fout, ensure_ascii=False)
            fout.write('\n')


# Step 1: clean the data, keeping only the id, response and triples fields.
cleaned_data = []

with open(input_file, 'r', encoding='utf-8') as fin:
    for line in fin:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline) in the JSONL file
        obj = json.loads(line)
        # When a record has no "response", fall back to its "sent" text.
        response = obj.get("response", obj.get("sent", ""))
        triples = obj.get("triples", [])
        uid = obj.get("id", "")
        cleaned_data.append({"id": uid, "response": response, "triples": triples})

# Save the cleaned file.
_write_jsonl(output_cleaned_file, cleaned_data)

# Step 2: randomly draw up to `sample_size` records for evaluation.
sample_size = 100
# Fixed seed so the evaluation sample is reproducible across runs;
# set to None to draw a different sample on every run.
sample_seed = 42
sampled_data = random.Random(sample_seed).sample(cleaned_data, min(sample_size, len(cleaned_data)))

# Save the sampled file.
_write_jsonl(output_sampled_file, sampled_data)

print(f"✅ 已生成 {output_cleaned_file} 和 {output_sampled_file}")

In [None]:
import pandas as pd
import json
import re


# Read the CSV; every column is loaded as text and fields are ';'-separated.
df = pd.read_csv('/users/jguo/Desktop/text2kgbench/denominations-emprises-voies-actuelles.csv', dtype=str, sep=';')

id_col = "Identifiant"
denom_col = "Dénomination complète minuscule"
info_cols = ["Historique", "Dénomination", "Classement", "Ouverture", "Numérotation"]

# Literal rewrites applied before sentence splitting so that the '.' / ';'
# inside these abbreviations and fixed phrases is not treated as a sentence
# boundary. They are applied in insertion order, so earlier entries can
# pre-empt later ones. Defined once here because it does not depend on the row.
exceptions = {
    "dr.":"dr",
    "larg.":"larg",
    "arr. mun.":"arr mun",
    "M.": "M",
    "jan." : "jan",
    "févr.": "févr",
    "avr.": "avr",
    "juil.": "juil",
    "sept.": "sept",
    "oct.": "oct",
    "nov.": "nov",
    "déc.": "déc",
    "Arr.":  "arr",
    "; place Cabanis ;":  ", place Cabanis,",
    ". (ponts)": " (ponts)",
    "Ord.": "Ord",
    "préf.": "préf",
    "ord.": "ord",
    "arr.": "arr",
    "(U. P.)": "U P",
    " ; rue Puteaux": ", rue Puteaux",
    " L'arr.": " L'arr",
    ". mun." : " mun",
    "(sup.)": " (sup)"
}

# String forms that a missing cell takes once passed through str().
missing_markers = ('nan', 'none')

jsonl_lines = []

for idx, row in df.iterrows():
    identifiant = str(row.get(id_col, '')).strip()
    denom = str(row.get(denom_col, '')).strip().lower()
    # Skip invalid rows. A missing cell becomes the text 'nan' after str(),
    # so it has to be rejected explicitly, not only the empty string.
    if (not identifiant or not denom
            or identifiant.lower() in missing_markers
            or denom in missing_markers):
        continue

    for col in info_cols:
        info = str(row.get(col, '')).strip()
        if not info or info.lower() in missing_markers:
            continue  # skip empty content

        # Neutralise abbreviations so their punctuation does not split the text.
        for abbr, full in exceptions.items():
            info = info.replace(abbr, full)

        # Split on full stop ".", semicolon ";" or newline.
        # NOTE(review): any remaining '.' (e.g. inside a decimal number) also
        # splits the text — confirm this is acceptable for the data.
        infos = re.split(r'[;.\n]', info)

        # Drop surrounding whitespace and empty fragments.
        infos = [x.strip() for x in infos if x.strip()]

        for i, single_info in enumerate(infos):
            # A numeric suffix is added only when the cell yields several fragments.
            if len(infos) == 1:
                id_str = f"{identifiant}_{col.lower()}"
            else:
                id_str = f"{identifiant}_{col.lower()}_{i+1}"
            sent = f"{denom} || {col} || {single_info}"
            jsonl_lines.append(json.dumps({"id": id_str, "sent": sent}, ensure_ascii=False))

# Write the JSONL file (relative to the current working directory).
with open('output.jsonl', 'w', encoding='utf-8') as f:
    for line in jsonl_lines:
        f.write(line + '\n')

print(f"已生成 {len(jsonl_lines)} 条数据，保存在 output.jsonl")

In [None]:
import os
import logging
import json
import time
import re
from google import genai

# ==================== CONFIGURATION ====================
# Input/output locations (absolute paths). Despite the "dossier" prefix,
# dossier_train and dossier_test are JSONL *files*; dossier_sortie is a directory.
# NOTE(review): dossier_test uses a "C:/Users/..." root while the other paths
# use "/Users/..." — confirm all of them resolve on the machine running this.
dossier_train   = "/Users/jguo/Desktop/text2kgbench/text2kgbench/text2kgbench/text2kgbench/ground_truth/ground_truth.jsonl"
dossier_test = "C:/Users/jguo/Desktop/text2kgbench/text2kgbench/text2kgbench/text2kgbench/output.jsonl"
dossier_sortie  = "/Users/jguo/Desktop/text2kgbench/text2kgbench/text2kgbench/text2kgbench/llm_responses/"

# Text file holding the instruction part of the prompt, and the model name
# passed to generate_content().
prompt_file = "/Users/jguo/Desktop/text2kgbench/text2kgbench/text2kgbench/text2kgbench/event_json_to_ttl/prompts/promptSimple.txt"
nom_modele = "gemini-2.5-pro-preview-05-06"

# Gen AI client created in Vertex AI mode with an explicit project and location.
client = genai.Client(
    vertexai=True,
    project="ignf-simv-inference",
    location="us-central1"
)

# Make sure the output directory exists before results are appended to it.
os.makedirs(dossier_sortie, exist_ok=True)

# Load the prompt template once and echo it for inspection.
with open(prompt_file, "r", encoding="utf-8") as prompt:
    template_prompt = prompt.read()
print(template_prompt)

def load_train_examples(filepath, max_total=60):
    """Load few-shot examples from a JSON-lines file.

    Each non-empty line is expected to hold one JSON value. Blank lines and
    lines consisting of a lone ',' are ignored, and a single trailing ',' is
    removed before parsing, so a file written as the body of a JSON array
    (one element per line) is also accepted. Lines that fail to parse are
    reported and skipped.

    Args:
        filepath: Path of the UTF-8 encoded file to read.
        max_total: Maximum number of examples to return. ``None`` means no
            limit; ``0`` (or a negative value) returns an empty list.

    Returns:
        A list of the parsed JSON values, in file order.
    """
    examples = []
    # Without this guard a limit of 0 would still return the first example,
    # because the limit is only checked after an example has been appended.
    if max_total is not None and max_total <= 0:
        return examples

    with open(filepath, "r", encoding="utf-8") as f:
        for ligne in f:
            ligne = ligne.strip()
            if not ligne or ligne == ',':
                continue
            if ligne.endswith(','):
                ligne = ligne[:-1]
            try:
                examples.append(json.loads(ligne))
            except ValueError as e:  # json.JSONDecodeError is a ValueError
                print("跳过异常行:", e, ligne)
                continue
            if max_total is not None and len(examples) >= max_total:
                break
    return examples

def examples_to_prompt_str(exemples):
    """Render examples as JSON text, one per paragraph.

    Every example is serialised with ``json.dumps`` (non-ASCII characters are
    kept as-is) and the pieces are separated by one blank line.
    """
    serialised = []
    for exemple in exemples:
        serialised.append(json.dumps(exemple, ensure_ascii=False))
    return "\n\n".join(serialised)

def _scan_json_objects(text):
    """Return every JSON object (dict) that can be decoded from `text`, in order.

    The text is scanned for '{'; at each one a real JSON decode is attempted,
    so nesting depth and braces inside string values are handled by the JSON
    parser itself. When decoding fails at a '{', scanning resumes at the next
    '{', which still recovers well-formed objects embedded in prose, in a JSON
    array, or inside a malformed outer object.
    """
    decoder = json.JSONDecoder()
    found = []
    pos = text.find("{")
    while pos != -1:
        try:
            value, end = decoder.raw_decode(text, pos)
        except (ValueError, RecursionError):
            pos = text.find("{", pos + 1)
            continue
        found.append(value)
        pos = text.find("{", end)
    return found


def parse_llm_output(raw_output):
    """Extract JSON objects from a raw LLM reply.

    Strategy:
      1. If the reply contains Markdown code fences (with or without a
         ``json``/``jsonl`` tag), the objects inside the fences are used.
         A fence may hold one object, several objects (JSONL) or an array.
      2. Otherwise, or when the fenced content yields nothing, the whole
         reply is scanned for objects (bare objects, one per line, arrays).

    Args:
        raw_output: The text returned by the model; ``None`` or an empty
            string yields an empty list.

    Returns:
        A list of dicts in order of appearance; ``[]`` when nothing could be
        parsed.
    """
    if not raw_output:
        return []

    # Content of every fenced block; an unterminated fence simply does not
    # match and the reply is then handled by the whole-text scan below.
    fenced = re.findall(r'```(?:jsonl?)?\s*(.*?)```', raw_output, re.DOTALL | re.IGNORECASE)

    candidates = ["\n".join(fenced), raw_output] if fenced else [raw_output]
    for candidate in candidates:
        results = _scan_json_objects(candidate)
        if results:
            return results

    print("整体解析失败:", raw_output)
    return []

def try_batch(batch, batch_prompt, batch_size, chemin_sortie, i):
    """Send one prompt to the model and append the parsed objects to a file.

    The reply is first decoded as a whole JSON document; if that fails it is
    handed to ``parse_llm_output``. Entries that are falsy or have no truthy
    "id" are dropped.

    Args:
        batch: The input records covered by the prompt (each has an "id").
        batch_prompt: Full prompt text sent to the model.
        batch_size: Requested batch size (used in log messages only).
        chemin_sortie: JSONL file the results are appended to.
        i: Index of the first record of the batch (used in log messages only).

    Returns:
        True when at least ``len(batch)`` entries were kept and written,
        False when too few were parsed or when any exception occurred.
    """
    print("本批次输入ID：", [rec["id"] for rec in batch])
    try:
        llm_reply = client.models.generate_content(
            model=nom_modele,
            contents=batch_prompt,
        )
        raw_output = llm_reply.text
        print(f"\n--- LLM Raw Output for Batch {i} (size={batch_size}) ---\n{raw_output}\n--- END ---\n")

        # Whole-document decode first; the tolerant parser is the fallback.
        try:
            decoded = json.loads(raw_output)
        except Exception:
            decoded = parse_llm_output(raw_output)
        else:
            if isinstance(decoded, dict):
                decoded = [decoded]

        # Drop empty entries and entries without an id.
        kept = [entry for entry in decoded if entry and entry.get("id")]
        print("解析后输出ID：", [entry.get("id") for entry in kept])

        enough = bool(kept) and len(kept) >= len(batch)
        if not enough:
            print(f"⚠️ 批量输出解析失败或条数不足，batch_size={batch_size}，应有{len(batch)}条，实际{len(kept)}条，原始内容：\n", raw_output)
            return False

        with open(chemin_sortie, "a", encoding="utf-8") as fout:
            for entry in kept:
                print(f"写入: {entry.get('id', '[NO_ID]')}")
                fout.write(json.dumps(entry, ensure_ascii=False) + "\n")
        print(f"✅ Batch {i} ~ {i+len(batch)-1} 已写入 {len(kept)} 条 (batch_size={batch_size})")
        return True
    except Exception as e:
        print(f"❌ Erreur lors du traitement du batch {i}-{i+batch_size}: {e}")
        return False

# ========== TRAITEMENT PRINCIPAL ==========

# Number of input records covered by each output "part" file.
RECORDS_PER_FILE = 10000
# Batch sizes tried in order for each position; smaller sizes are fallbacks.
BATCH_SIZES = [20, 5, 1]


def build_batch_prompt(batch):
    """Assemble the full prompt for `batch`: template, few-shot examples,
    one "Phrase (id=...)" line per record, then the output-format instruction."""
    batch_prompt = template_prompt + "\n\nExemples :\n" + few_shot_text + "\n\n"
    for rec in batch:
        batch_prompt += f'Phrase (id={rec["id"]}): {rec["sent"]}\n'
    batch_prompt += "\nGénère pour chaque phrase un objet JSON sur une ligne, au format JSONL (une ligne par objet JSON, pas de liste, pas de crochets).\n"
    return batch_prompt


few_shot_examples = load_train_examples(dossier_train)
few_shot_text     = examples_to_prompt_str(few_shot_examples)

with open(dossier_test, "r", encoding="utf-8") as fin:
    records = [json.loads(ligne) for ligne in fin if ligne.strip()]

base = os.path.splitext(os.path.basename(dossier_test))[0]
i = 0
file_idx = 1
chemin_sortie = os.path.join(
    dossier_sortie, f"{base}_llm_responses_part{file_idx}.jsonl"
)

while i < len(records):
    # Switch to the next output file every RECORDS_PER_FILE records.
    # The part number is derived from the position itself: `i` advances by
    # 20, 5 or 1 depending on which batch size succeeded, so it is not
    # guaranteed to ever be an exact multiple of RECORDS_PER_FILE.
    target_idx = i // RECORDS_PER_FILE + 1
    if target_idx != file_idx:
        print("---- 切换到下一个输出文件 ----")
        file_idx = target_idx
        chemin_sortie = os.path.join(
            dossier_sortie, f"{base}_llm_responses_part{file_idx}.jsonl"
        )

    for batch_size in BATCH_SIZES:
        batch = records[i:i+batch_size]
        if not batch:
            break
        batch_prompt = build_batch_prompt(batch)
        # Rough estimate only: about four characters per token.
        token_count = len(batch_prompt) // 4
        print(f"Batch {i} ~ {i+len(batch)-1} (size={batch_size}) 估算token数: {token_count}")

        print(f"\n=== Batch {i} ~ {i+len(batch)-1} Prompt (size={batch_size}) ===\n{batch_prompt}\n")

        ok = try_batch(batch, batch_prompt, batch_size, chemin_sortie, i)
        if ok:
            # Advance by the number of records actually sent (the last batch
            # of the file can be shorter than batch_size).
            i += len(batch)
            break
        else:
            print(f"⚠️ Batch size {batch_size} 失败，尝试更小的 batch...")
            time.sleep(2)
    else:
        # Even batch_size=1 failed: retry this single record until it succeeds.
        # NOTE(review): this never gives up, so a record the model can never
        # answer will block the run — confirm this is the desired behaviour.
        print(f"❌ 连 batch_size=1 都失败，开始无限重试第{i}条数据")
        while True:
            batch = records[i:i+1]
            batch_prompt = build_batch_prompt(batch)
            token_count = len(batch_prompt) // 4
            print(f"重试 Batch {i} (size=1) 估算token数: {token_count}")
            ok = try_batch(batch, batch_prompt, 1, chemin_sortie, i)
            if ok:
                i += 1
                break
            else:
                print("⚠️ 单条重试失败，3秒后再试...")
                time.sleep(3)

    # Pause between positions to stay gentle on the API.
    time.sleep(5)

In [None]:
# Hypothetical example: IDs written to the output vs. IDs sent as input.
output_ids = ['11815_dénomination', '11815_ouverture', '12621_historique', '9529_historique', '9529_classement', '14371_historique', '9673_historique', '9673_classement', '9673_ouverture', '9673_numérotation', '11703_classement', '11703_ouverture', '9651_dénomination', '9651_ouverture', '14312_historique', '14414_historique', '14494_historique', '14566_historique', '14574_historique', '13914_classement', '12250_historique', '12250_ouverture', '15466_dénomination', '12925_historique', '12925_ouverture', '14037_historique', '14037_classement', '13936_classement', '10518_ouverture', '14561_historique', '11029_historique', '11029_dénomination', '11029_ouverture', '13957_historique', '13957_classement', '12669_historique', '12669_classement', '11306_historique', '11306_ouverture', '12573_historique_1', '12573_historique_2', '12573_dénomination', '12573_numérotation', '11972_historique', '11972_ouverture', '9240_historique', '9240_dénomination', '9240_ouverture', '13902_classement', '13902_ouverture']
input_ids = ['11815_dénomination', '11815_ouverture', '12621_historique', '9529_historique', '9529_classement', '14371_historique', '9673_historique', '9673_classement', '9673_ouverture', '9673_numérotation', '11703_classement', '11703_ouverture', '9651_dénomination', '9651_ouverture', '14312_historique', '14414_historique', '14494_historique', '14566_historique', '14574_historique', '13914_classement', '12250_historique', '12250_ouverture', '15466_dénomination', '12925_historique', '12925_ouverture', '14037_historique', '14037_classement', '13936_classement', '10518_ouverture', '14561_historique', '11029_historique', '11029_dénomination', '11029_ouverture', '13957_historique', '13957_classement', '12669_historique', '12669_classement', '11306_historique', '11306_ouverture', '12573_historique_1', '12573_historique_2', '12573_dénomination', '12573_numérotation', '11972_historique', '11972_ouverture', '9240_historique', '9240_dénomination', '9240_ouverture', '13902_classement', '13902_ouverture']

# Input IDs that never showed up in the output, in input order.
produced = set(output_ids)
missing_ids = [wanted for wanted in input_ids if wanted not in produced]
print("被跳过/遗漏的ID：", missing_ids)
print("共遗漏：", len(missing_ids), "条")

# Generating LLM responses

In [None]:
# NOTE(review): neither `get_llm_response` nor `prompt_example` is defined in
# the cells visible above — confirm they exist before running this cell.
response = get_llm_response(prompt_example)

In [None]:
# Show the current value of `response`.
print(response)

# Parsing responses

In [None]:
# TODO: go through the saved LLM responses and parse each one, e.g.:
# parse_llm_response(response)
#

# Evaluating the triples with the ground truth

In [None]:
# Ground-truth JSONL file for each ontology, keyed by ontology name.
# This must be a dict: the previous literal opened with '[' and closed with
# '}', which is a SyntaxError.
# NOTE(review): the music entry pointed at "ont_1_music_..." although its key
# is "onto_2_music"; it now uses "ont_2_music_..." — confirm the file name.
ground_truth_files = {
    "onto_1_movie": "data/ground_truth/ont_1_movie_ground_truth.jsonl",
    "onto_2_music": "data/ground_truth/ont_2_music_ground_truth.jsonl",
}

In [None]:


import json
# Peek at the first three records of the LLM response file: print each
# record's id and its "triples" value (None when the key is absent).
# `rec` keeps the last record read after the loop ends.
with open("/Users/jguo/Desktop/text2kgbench/text2kgbench/text2kgbench/text2kgbench/llm_responses/ont_1_rue_mizon_test_llm_responses.jsonl",encoding="utf-8") as f:
    for i,line in enumerate(f):
        rec = json.loads(line)
        print("id=", rec["id"], "; triples=", rec.get("triples"))
        if i>=2: break  # stop after the third line


In [None]:
# Display the "triples" value of `rec` (None when the key is absent).
# `rec` is whatever record was loaded last by a previously executed cell.
rec.get("triples")


In [None]:
import json
# Peek at the first three records of the ground-truth file: print each
# record's id and its "triples" value (None when the key is absent).
with open("/Users/jguo/Desktop/text2kgbench/text2kgbench/text2kgbench/text2kgbench/ground_truth/ont_1_rue_mizon_ground_truth.jsonl","r",encoding="utf-8") as f:
    for i,line in enumerate(f):
        rec = json.loads(line)
        print(rec["id"], rec.get("triples"))
        if i>=2: break  # stop after the third line
