In [None]:
## add id to original label guided dataset

import json
from tqdm import tqdm
from pathlib import Path

file_a_path = "../VariErr-Label-Guided-longest.json"
file_b_path = "../varierr.json"
output_path = "../VariErr-Label-Guided-longest-with-ID.json"

with open(file_b_path, "r") as f:
    full_dataset = [json.loads(line) for line in f]
    pair_to_id = {
        (sample["context"].strip(), sample["statement"].strip()): sample["id"]
        for sample in full_dataset
    }

print(f"We have {len(pair_to_id)} pairs.")

with open(file_a_path, "r") as f:
    data = [json.loads(line) for line in f]

with open(output_path, "w") as f_out:
    for sample in tqdm(data):
        premise = sample["premise"].strip()
        hypothesis = sample["hypothesis"].strip()
        key = (premise, hypothesis)

        if key in pair_to_id:
            sample["id"] = pair_to_id[key]
        else:
            print(f"Not found: premise='{premise}...', hypothesis='{hypothesis}...'")
            sample["id"] = None

        f_out.write(json.dumps(sample, ensure_ascii=False) + "\n")

print("Done.")


In [1]:
# for manual checking

import os
import re
import pandas as pd
from pathlib import Path

BASE_DIR = Path("/Users/phoebeeeee/ongoing/LLM_AED/generation/qwen_72b_generation_raw")
OUTPUT_CSV = BASE_DIR / "manual_check.csv"
FILE_TYPES = ["E", "N", "C"]
VALIDATION = True 
STRIP_NUMBERING = True 

_CLEAN_PREFIX_RE = re.compile(r"^\s*(?:[\d]+[\.\)]|[-•*]|[a-zA-Z][\.\)]|\(\w+\))\s*")

def clean_explanation(text: str) -> str:
    return _CLEAN_PREFIX_RE.sub("", text).strip()

def read_file_lines(file_path: Path, strip_numbering: bool = True):
    lines = []
    if not file_path.is_file():
        return lines
    with file_path.open("r", encoding="utf-8") as f:
        for raw in f:
            s = raw.strip()
            if not s:
                continue  # 跳过空行
            if strip_numbering:
                s = clean_explanation(s)
                if not s:     # 清理后为空则跳过（避免写入 ""）
                    continue
            lines.append(s)
    return lines

def aggregate_to_csv(base_dir: Path, output_csv: Path):
    records = []
    for sub in sorted([p for p in base_dir.iterdir() if p.is_dir()]):
        subfolder_name = sub.name
        for ft in FILE_TYPES:
            fpath = sub / ft  # 期望文件名就是 'E' / 'N' / 'C'
            if not fpath.exists():
                continue
            lines = read_file_lines(fpath, strip_numbering=STRIP_NUMBERING)
            for idx, content in enumerate(lines, start=1):
                records.append({
                    "validation": VALIDATION,
                    "subfolder": subfolder_name,
                    "file_type": ft,
                    "line_no": idx,
                    "content": content
                })

    if not records:
        print("未收集到任何记录，请检查目录与文件命名是否正确。")
        return

    df = pd.DataFrame.from_records(
        records,
        columns=["validation", "subfolder", "file_type", "line_no", "content"]
    )
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"已保存：{output_csv}（共 {len(df)} 行）")

if __name__ == "__main__":
    aggregate_to_csv(BASE_DIR, OUTPUT_CSV)


已保存：/Users/phoebeeeee/ongoing/LLM_AED/generation/qwen_72b_generation_raw/manual_check.csv（共 3920 行）


In [14]:
# integrate explanations generated by LLMs to a singel file

import json, re
from pathlib import Path
from tqdm import tqdm

explanation_root = Path("/Users/phoebeeeee/ongoing/LLM_AED/generation/llama_70b_generation_raw")
input_jsonl = Path("/Users/phoebeeeee/ongoing/LLM_AED/dataset/varierr/varierr.json")
output_jsonl = Path("/Users/phoebeeeee/ongoing/LLM_AED/new_processing/llama_70b_generation_raw.jsonl")

# suffix = ".txt"

def clean_explanation(text: str) -> str:
    return re.sub(r"^\s*(?:[\d]+[\.\)]|[-•*]|[a-zA-Z][\.\)]|\(\w+\))\s*", "", text).strip()

label_map = {"E": "e", "N": "n", "C": "c"}

with open(input_jsonl, "r", encoding="utf-8") as f:
    instances = [json.loads(line) for line in f]

with open(output_jsonl, "w", encoding="utf-8") as fout:
    for instance in tqdm(instances, desc="Inject explanations"):
        sample_id = str(instance["id"])
        subfolder = explanation_root / sample_id
        new_comments = []

        if not subfolder.exists():
            print(f"missing folder: {subfolder}")
        else:
            for label in ["E", "N", "C"]:
                tried_files = [
                    # f"{label}_third.txt"
                    # f"{label}_second.txt",
                    # f"{label}_first.txt",
                    label
                ]
        
                file_found = False
                for fname in tried_files:
                    file_path = subfolder / f"{fname}"
                    if file_path.exists():
                        with open(file_path, "r", encoding="utf-8") as f:
                            for raw in f:
                                if not raw.strip():
                                    continue  
                                exp = clean_explanation(raw)
                                if exp: 
                                    new_comments.append([exp, label_map[label]])
                        file_found = True

                if not file_found:
                    print(f"No file found for {label} in {subfolder}")
        new_instance = {
            "id": instance["id"],
            "premise": instance["context"],
            "hypothesis": instance["statement"],
            "generated_explanations": new_comments
        }
        if new_comments:
            fout.write(json.dumps(new_instance, ensure_ascii=False) + "\n")

print("Done.")


Inject explanations:   0%|          | 0/500 [00:00<?, ?it/s]

Inject explanations: 100%|██████████| 500/500 [00:00<00:00, 1383.45it/s]

Done.





In [19]:
## double check explanation number in _raw.jsonl

import pandas as pd

in_path = "/Users/phoebeeeee/ongoing/LLM_AED/no_preprocessing/qwen_72b_generation_raw.jsonl"  # 改成你的文件路径
df = pd.read_json(in_path, lines=True)

labels_series = df["generated_explanations"].apply(
    lambda lst: [x[1] for x in (lst or []) if isinstance(x, (list, tuple)) and len(x) >= 2]
)

labels_exploded = labels_series.explode().dropna().astype(str)
total = labels_exploded.shape[0]
label_counts = labels_exploded.value_counts()

print(f"Total generated_explanations: {total}")
for lbl, cnt in label_counts.items():
    print(f"label={lbl}: {cnt} ({cnt/total:.2%})")

print("Unique labels:", sorted(label_counts.index.tolist()))


Total generated_explanations: 3920
label=n: 1382 (35.26%)
label=e: 1337 (34.11%)
label=c: 1201 (30.64%)
Unique labels: ['c', 'e', 'n']


In [43]:
# count number

import os
import re
import pandas as pd

ROOT_FOLDER = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw"

def count_generations(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for line in f if line.strip())

def count_all_generations():
    records = []
    total = {"E": 0, "N": 0, "C": 0}

    for subfolder in os.listdir(ROOT_FOLDER):
        sub_path = os.path.join(ROOT_FOLDER, subfolder)
        if not os.path.isdir(sub_path):
            continue

        row = {"folder": subfolder}
        for label in ["E", "N", "C"]:
            file_path = os.path.join(sub_path, f"{label}_third.txt")
            if os.path.isfile(file_path):
                count = count_generations(file_path)
            else:
                count = 0
            row[label] = count
            total[label] += count

        records.append(row)

    df = pd.DataFrame(records)
    df.loc["TOTAL"] = ["TOTAL"] + [total["E"], total["N"], total["C"]]
    # print(df)

    print(f"E={total['E']}，N={total['N']}，C={total['C']}，total: {total['E'] + total['N'] + total['C']}")

count_all_generations()


E=1198，N=1407，C=1415，total: 4020


In [None]:
# get avg score for instance

import json
from collections import defaultdict

with open('../scores.json', 'r') as f:
    data = json.load(f)

groups = defaultdict(list)
for key, value in data.items():
    try:
        id_label, _ = key.rsplit('-', 1)
        groups[id_label].append(value)
    except ValueError:
        print(f"'{key}' does not match.")
        continue

averaged_data = {k: sum(v) / len(v) for k, v in groups.items()}
with open('../avg_llama3.1_scores.jsonn', 'w') as f:
    json.dump(averaged_data, f, indent=2)


In [None]:
## Thresholding for ChaosNLI
import json

def process_label_distribution(label_probs, threshold=0.2):
    valid_indices = [i for i, p in enumerate(label_probs) if p >= threshold]
    count = len(valid_indices)
    if count == 0:
        return [0.0, 0.0, 0.0]
    return [1.0 / count if i in valid_indices else 0.0 for i in range(3)]

input_file = '../dev_cleaned.json'
output_file = '../dev_cleaned_20.json'

with open(input_file, 'r', encoding='utf-8') as fin, open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        item = json.loads(line)
        raw_label = item['label']
        new_label = process_label_distribution(raw_label)
        item['label'] = new_label
        fout.write(json.dumps(item, ensure_ascii=False) + '\n')


In [227]:
# processing peer results

import json
import os
from collections import defaultdict

# 输入输出路径
input_file = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original_peer/qwen_72b_original_peer/scores.json"
output_dir = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original_peer/qwen_72b_original_peer"
os.makedirs(output_dir, exist_ok=True)

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# {model_id: {instance_id: [(label_code, old_idx, value), ...]}}
grouped = defaultdict(lambda: defaultdict(list))

for key, value in data.items():
    try:
        model_id, instance_id, rest = key.split("_", 2)
        label_code, old_idx = rest.split("-")
        old_idx = int(old_idx)
    except ValueError:
        print(f"跳过非法 key: {key}")
        continue

    grouped[model_id][instance_id].append((label_code, old_idx, value))

# 重新编号并写文件
for model_id, instances in grouped.items():
    new_dict = {}
    for instance_id, items in instances.items():
        # 保持原始顺序：先按 old_idx 排好
        items.sort(key=lambda x: x[1])
        for new_idx, (label_code, _, value) in enumerate(items):
            new_key = f"{instance_id}_{label_code}-{new_idx}"
            new_dict[new_key] = value

    output_file = os.path.join(output_dir, f"{model_id}.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(new_dict, f, ensure_ascii=False, indent=2)

    print(f"已写出 {output_file}")


已写出 /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original_peer/qwen_72b_original_peer/llama8b.json
已写出 /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original_peer/qwen_72b_original_peer/llama70b.json
已写出 /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original_peer/qwen_72b_original_peer/qwen7b.json
已写出 /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original_peer/qwen_72b_original_peer/qwen72b.json


In [68]:
import json

file1 = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/chase_peer/qwen_72b_chase_peer/qwen72b.json"
file2 = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/qwen_72b_all/scores.json"

with open(file1, "r", encoding="utf-8") as f:
    data1 = json.load(f)
with open(file2, "r", encoding="utf-8") as f:
    data2 = json.load(f)

keys1 = set(data1.keys())
keys2 = set(data2.keys())

if keys1 == keys2:
    print("两个文件的 key 完全相同 ✅")
else:
    print("两个文件的 key 不完全相同 ❌")
    print("只在 file1 中存在的 key:", keys1 - keys2)
    print("只在 file2 中存在的 key:", keys2 - keys1)


两个文件的 key 完全相同 ✅


In [223]:
# count error number + overlap per label
import pandas as pd
import ast
from collections import Counter

path = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/llama_8b_original/kld_jsd/llama_8b_original_after_0.9_merged_errors.csv"  # 改成你的文件路径
df = pd.read_csv(path)

# 把单元格里的 '["c"]'、'[]' 等转为 Python 列表；兼容引号异常
def parse_list_cell(x):
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    try:
        return ast.literal_eval(s)
    except Exception:
        # 某些 CSV 里会出现双引号翻倍的情况
        try:
            return ast.literal_eval(s.replace('""', '"'))
        except Exception:
            return []

# 映射为 e/n/c
map_long = {"entailment": "e", "neutral": "n", "contradiction": "c",
            "e": "e", "n": "n", "c": "c"}

cnt2 = Counter()       # llm_error 计数（按标签逐项）
cnt3 = Counter()       # varierr_error 计数（按标签逐项）
overlap_cnt = Counter()  # 两列里相同的标签计数（逐行一次）

# 统计第二列（通常是 'e'/'n'/'c'）
for items in df["llm_error"].apply(parse_list_cell):
    for it in items:
        k = map_long.get(it, None)
        if k in ("e", "n", "c"):
            cnt2[k] += 1

# 统计第三列（可能是完整单词）
for items in df["varierr_error"].apply(parse_list_cell):
    for it in items:
        k = map_long.get(it, None)
        if k in ("e", "n", "c"):
            cnt3[k] += 1

# 逐行统计两列相同的标签（同一行里只计一次该标签）
for llm_items, var_items in zip(
    df["llm_error"].apply(parse_list_cell),
    df["varierr_error"].apply(parse_list_cell)
):
    llm_set = {map_long.get(it) for it in llm_items if map_long.get(it) in ("e", "n", "c")}
    var_set = {map_long.get(it) for it in var_items if map_long.get(it) in ("e", "n", "c")}
    inter = llm_set & var_set
    for lab in inter:
        overlap_cnt[lab] += 1

print("第二列 llm_not_calidated_error：")
print(f"  e: {cnt2.get('e',0)}  n: {cnt2.get('n',0)}  c: {cnt2.get('c',0)}")

print("第三列 varierr_error：")
print(f"  e: {cnt3.get('e',0)}  n: {cnt3.get('n',0)}  c: {cnt3.get('c',0)}")

print("两列相同的 error（逐行交集计数）：")
print(f"  e: {overlap_cnt.get('e',0)}  n: {overlap_cnt.get('n',0)}  c: {overlap_cnt.get('c',0)}")


第二列 llm_not_calidated_error：
  e: 32  n: 195  c: 90
第三列 varierr_error：
  e: 53  n: 23  c: 53
两列相同的 error（逐行交集计数）：
  e: 1  n: 10  c: 7


In [475]:
# count error number + overlap per label
import pandas as pd
import ast
from collections import Counter

path = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/llama_8b_all/validated_overlap/with_validation_0.1_merged_validation.csv"  # 改成你的文件路径
df = pd.read_csv(path)

# 把单元格里的 '["c"]'、'[]' 等转为 Python 列表；兼容引号异常
def parse_list_cell(x):
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    return ast.literal_eval(s)
    # except Exception:
        # 某些 CSV 里会出现双引号翻倍的情况
        # try:
        #     return ast.literal_eval(s.replace('""', '"'))
        # except Exception:
        #     return []

# 映射为 e/n/c
map_long = {"entailment": "e", "neutral": "n", "contradiction": "c", "e": "e", "n": "n", "c": "c"}

cnt2 = Counter()       # llm 计数（按标签逐项）
cnt3 = Counter()       # varierr计数（按标签逐项）
overlap_cnt = Counter()  # 两列里相同的标签计数（逐行一次）

# 统计第二列（通常是 'e'/'n'/'c'）
for items in df["llm_validated"].apply(parse_list_cell):
    for it in items:
        k = map_long.get(it, None)
        if k in ("e", "n", "c"):
            cnt2[k] += 1

# 统计第三列（可能是完整单词）
for items in df["varierr_validated"].apply(parse_list_cell):
    for it in items:
        k = map_long.get(it, None)
        if k in ("e", "n", "c"):
            cnt3[k] += 1

# 逐行统计两列相同的标签（同一行里只计一次该标签）
for llm_items, var_items in zip(
    df["llm_validated"].apply(parse_list_cell),
    df["varierr_validated"].apply(parse_list_cell)
):
    llm_set = {map_long.get(it) for it in llm_items if map_long.get(it) in ("e", "n", "c")}
    var_set = {map_long.get(it) for it in var_items if map_long.get(it) in ("e", "n", "c")}
    inter = llm_set & var_set
    for lab in inter:
        overlap_cnt[lab] += 1

print("llm_validated：")
print(f"  e: {cnt2.get('e',0)}  n: {cnt2.get('n',0)}  c: {cnt2.get('c',0)}")

print("varierr_validated：")
print(f"  e: {cnt3.get('e',0)}  n: {cnt3.get('n',0)}  c: {cnt3.get('c',0)}")

print("两列相同的")
print(f"  e: {overlap_cnt.get('e',0)}  n: {overlap_cnt.get('n',0)}  c: {overlap_cnt.get('c',0)}")


llm_validated：
  e: 466  n: 418  c: 353
varierr_validated：
  e: 210  n: 380  c: 159
两列相同的
  e: 200  n: 318  c: 112


In [113]:
# average number of space-separated words per explanation


from pathlib import Path
import re

# 根目录（按你的路径）
BASE_DIR = Path("/Users/phoebeeeee/ongoing/LLM_AED/generation/llama_70b_generation_raw")

# 行首序号清理正则：匹配 "1."、"2)"、"3、"、"4-"、"5:"、"6：" 等
LEADING_NUM_RE = re.compile(r"^\s*\d+\s*[\.\)\-、:：]?\s*")

def clean_line(line: str) -> str:
    """去首尾空白、去掉行首序号，返回清理后的行。"""
    s = line.strip()
    s = LEADING_NUM_RE.sub("", s)
    return s.strip()

def read_file_lines(p: Path):
    """读取文件并逐行清洗，返回非空解释列表。"""
    if not p.is_file():
        return []
    cleaned = []
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            s = clean_line(line)
            if s:  # 过滤清洗后为空的行
                cleaned.append(s)
    return cleaned

def collect_explanations(base: Path):
    """
    遍历 base 下所有子文件夹，读取 E/N/C 三个文件，
    返回所有 explanation 的列表（来自 E/N/C 的总和）。
    """
    all_exps = []
    for sub in sorted(base.iterdir()):
        if not sub.is_dir():
            continue
        # 每个子目录下的 E/N/C（无扩展名）
        for name in ["E", "N", "C"]:
            fp = sub / name
            exps = read_file_lines(fp)
            all_exps.extend(exps)
    return all_exps

def main():
    all_exps = collect_explanations(BASE_DIR)
    total_count = len(all_exps)
    avg_len = (sum(len(x.split()) for x in all_exps) / total_count)

    print(f"Total explanations from E/N/C: {total_count}")
    print(f"Average length (characters): {avg_len:.2f}")
    print(total_count)

    # 如需查看一个样例：
    # for i, e in enumerate(all_exps[:10], 1):
    #     print(f"{i}. {e}")

if __name__ == "__main__":
    main()


Total explanations from E/N/C: 4022
Average length (characters): 22.35
4022


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
import re
from pathlib import Path
from typing import List, Dict, Any

LEADING_NUM_RE = re.compile(r"^\s*\d+\s*[\.\)\-、:：]?\s*")

def clean_line(line: str) -> str:
    """去首尾空白、去掉行首序号，返回清理后的行。"""
    s = line.strip()
    s = LEADING_NUM_RE.sub("", s)
    return s.strip()

def read_jsonl(path: Path) -> List[Dict[str, Any]]:
    items = []
    with path.open("r", encoding="utf-8", errors="ignore") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                items.append(obj)
            except json.JSONDecodeError as e:
                # 不中断，报行号方便你去捶数据
                print(f"[WARN] JSON 解析失败 {path}:{i}: {e}")
    return items

def collect_reasons(sample: Dict[str, Any], key: str):
    """返回 (该类的标注项总数, 清洗后非空 reason 列表)"""
    arr = sample.get(key, [])
    if not isinstance(arr, list):
        return 0, []
    reasons = []
    for ann in arr:
        if not isinstance(ann, dict):
            continue
        r = clean_line(str(ann.get("reason", "")))
        if r:
            reasons.append(r)
    return len(reasons), reasons

def main(paths: List[Path]):
    overall_label_items = 0
    overall_reasons: List[str] = []

    # 按类别的细分统计
    per_cat = {
        "entailment": {"label_items": 0, "reasons": []},
        "neutral": {"label_items": 0, "reasons": []},
        "contradiction": {"label_items": 0, "reasons": []},
    }

    for p in paths:
        for obj in read_jsonl(p):
            for cat in ["entailment", "neutral", "contradiction"]:
                n_items, reasons = collect_reasons(obj, cat)
                per_cat[cat]["label_items"] += n_items
                per_cat[cat]["reasons"].extend(reasons)
                overall_label_items += n_items
                overall_reasons.extend(reasons)

    labels_with_reason = len(overall_reasons)  # 带非空 reason 的标注项个数
    total_reasons = len(overall_reasons)
    num_label_items = overall_label_items
    avg_expl_per_label = (total_reasons / num_label_items) if num_label_items else 0.0

    # 计算 explanation 平均长度（按 split 后 token 数）
    def avg_len(strs: List[str]) -> float:
        if not strs:
            return 0.0
        lengths = [len(s.split()) for s in strs]
        return sum(lengths) / len(lengths)

    avg_tokens_per_expl = avg_len(overall_reasons)

    # 输出
    print("=== Overall ===")
    print(f"total_reasons: {total_reasons}")
    print(f"labels_with_reason: {labels_with_reason}")
    print(f"num_label_items: {num_label_items}")
    print(f"Avg. Expl./Label: {avg_expl_per_label:.4f}")
    print(f"Avg. tokens/expl (cleaned, split): {avg_tokens_per_expl:.2f}")

    print("\n=== By Category ===")
    for cat in ["entailment", "neutral", "contradiction"]:
        tr = len(per_cat[cat]["reasons"])
        ti = per_cat[cat]["label_items"]
        avg_cat = (tr / ti) if ti else 0.0
        avg_tok = avg_len(per_cat[cat]["reasons"])
        print(f"[{cat}]")
        print(f"  total_reasons: {tr}")
        print(f"  labels_with_reason: {tr}")
        print(f"  num_label_items: {ti}")
        print(f"  Avg. Expl./Label: {avg_cat:.4f}")
        print(f"  Avg. tokens/expl: {avg_tok:.2f}")


path = [Path("/Users/phoebeeeee/ongoing/LLM_AED/dataset/varierr/varierr.json")]
main(path)


=== Overall ===
total_reasons: 1933
labels_with_reason: 1933
num_label_items: 1933
Avg. Expl./Label: 1.0000
Avg. tokens/expl (cleaned, split): 13.89

=== By Category ===
[entailment]
  total_reasons: 554
  labels_with_reason: 554
  num_label_items: 554
  Avg. Expl./Label: 1.0000
  Avg. tokens/expl: 13.92
[neutral]
  total_reasons: 977
  labels_with_reason: 977
  num_label_items: 977
  Avg. Expl./Label: 1.0000
  Avg. tokens/expl: 13.97
[contradiction]
  total_reasons: 402
  labels_with_reason: 402
  num_label_items: 402
  Avg. Expl./Label: 1.0000
  Avg. tokens/expl: 13.65


In [288]:
import json
import csv

input_file = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/qwen_72b_all/varierr_without_qwen_72b.json"   # 你的jsonl文件路径
output_file = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/qwen_72b_all/error_overlap.csv" # 输出csv文件路径

abb_dict = {"entailment":"e", "neutral":"n", "contradiction":"c"}

with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8", newline="") as fout:
    writer = csv.writer(fout)
    # 表头
    writer.writerow(["id", "error_labels", "error_llm", "chaosnli_low_labels"])

    for line in fin:
        data = json.loads(line)

        error_labels = data.get("error_labels", [])
        error_labels = [abb_dict[error] for error in error_labels]
        error_llm = data.get("error_llm", [])
        error_llm = [abb_dict[error] for error in error_llm]
        chaosnli_labels = data.get("chaosnli_labels", {})

        # 找出 chaosnli_labels 里 < 20 的 label 名称
        low_labels = [k for k, v in chaosnli_labels.items() if v is not None and v < 20]

        writer.writerow([
            data.get("id"),
            ";".join(error_labels) if error_labels else "",
            ";".join(error_llm) if error_llm else "",
            ";".join(low_labels) if low_labels else ""
        ])

print(f"筛选结果已保存到 {output_file}")

筛选结果已保存到 /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/qwen_72b_all/error_overlap.csv


In [528]:
import pandas as pd
import ast
from collections import Counter
from pathlib import Path

def parse_list_cell(x):
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    # CSV 里存的是 JSON 风格列表，如 ["e","n"]，用 literal_eval 解析
    return ast.literal_eval(s)

# 将各种写法映射为 e/n/c
map_long = {
    "entailment": "e", "neutral": "n", "contradiction": "c",
    "e": "e", "n": "n", "c": "c"
}

def process_csv(path: Path):
    df = pd.read_csv(path)

    overlap_23 = Counter()  # LLM ∩ VariErr
    overlap_24 = Counter()  # LLM ∩ Chaos
    total_col2 = Counter()  # LLM
    total_col3 = Counter()  # VariErr
    total_col4 = Counter()  # Chaos

    for items2, items3, items4 in zip(
        df["llm_validated"].apply(parse_list_cell),
        df["varierr_validated"].apply(parse_list_cell),
        df["chaos_validated"].apply(parse_list_cell)
    ):
        set2 = {map_long.get(it) for it in items2 if map_long.get(it) in ("e", "n", "c")}
        set3 = {map_long.get(it) for it in items3 if map_long.get(it) in ("e", "n", "c")}
        set4 = {map_long.get(it) for it in items4 if map_long.get(it) in ("e", "n", "c")}

        for lab in (set2 & set3):
            overlap_23[lab] += 1
        for lab in (set2 & set4):
            overlap_24[lab] += 1

        for lab in set2:
            total_col2[lab] += 1
        for lab in set3:
            total_col3[lab] += 1
        for lab in set4:
            total_col4[lab] += 1

    # 打印该文件结果
    print(f"\n=== {path.name} ===")
    print("LLM")
    print(f"  e: {total_col2.get('e',0)}  n: {total_col2.get('n',0)}  c: {total_col2.get('c',0)}")

    print("llm和varierr：")
    print(f"  e: {overlap_23.get('e',0)}  n: {overlap_23.get('n',0)}  c: {overlap_23.get('c',0)}")

    print("llm和chaos：")
    print(f"  e: {overlap_24.get('e',0)}  n: {overlap_24.get('n',0)}  c: {overlap_24.get('c',0)}")

    # print("VariErr：")
    # print(f"  e: {total_col3.get('e',0)}  n: {total_col3.get('n',0)}  c: {total_col3.get('c',0)}")

    # print("Chaos")
    # print(f"  e: {total_col4.get('e',0)}  n: {total_col4.get('n',0)}  c: {total_col4.get('c',0)}")

    # 返回计数，便于全局汇总
    return {
        "overlap_23": overlap_23,
        "overlap_24": overlap_24,
        "total_col2": total_col2,
        "total_col3": total_col3,
        "total_col4": total_col4,
    }

def process_folder(folder: str, pattern: str = "*.csv"):
    folder_path = Path(folder)
    files = sorted(folder_path.glob(pattern))
    if not files:
        print(f"No CSV matched in: {folder}")
        return
    
    for csv_path in files:
        try:
            res = process_csv(csv_path)
        except Exception as e:
            print(f"[ERROR] {csv_path.name}: {e}")
            continue


if __name__ == "__main__":
    # 替换为你的目录路径；可选 pattern 调整筛选
    process_folder(
        folder="/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap",
        pattern="*.csv"
    )



=== with_validation_0.1_merged_validation.csv ===
LLM
  e: 500  n: 500  c: 500
llm和varierr：
  e: 210  n: 380  c: 159
llm和chaos：
  e: 327  n: 389  c: 190

=== with_validation_0.2_merged_validation.csv ===
LLM
  e: 500  n: 500  c: 500
llm和varierr：
  e: 210  n: 380  c: 159
llm和chaos：
  e: 327  n: 389  c: 190

=== with_validation_0.3_merged_validation.csv ===
LLM
  e: 495  n: 500  c: 496
llm和varierr：
  e: 210  n: 380  c: 159
llm和chaos：
  e: 327  n: 389  c: 190

=== with_validation_0.4_merged_validation.csv ===
LLM
  e: 491  n: 500  c: 492
llm和varierr：
  e: 210  n: 380  c: 157
llm和chaos：
  e: 326  n: 389  c: 189

=== with_validation_0.5_merged_validation.csv ===
LLM
  e: 491  n: 500  c: 492
llm和varierr：
  e: 210  n: 380  c: 157
llm和chaos：
  e: 326  n: 389  c: 189

=== with_validation_0.6_merged_validation.csv ===
LLM
  e: 491  n: 500  c: 492
llm和varierr：
  e: 210  n: 380  c: 157
llm和chaos：
  e: 326  n: 389  c: 189

=== with_validation_0.7_merged_validation.csv ===
LLM
  e: 489  n: 500  c: 

In [515]:
# count overlap between col2 with col3 & col4 (row-wise, label-wise)
import pandas as pd
import ast
from collections import Counter

path = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/llama_8b_all/validated_overlap/with_validation_0.1_merged_validation.csv"  # 改成你的新CSV路径
df = pd.read_csv(path)

# def parse_list_cell(x):
#     if pd.isna(x):
#         return []
#     s = str(x).strip()
#     if not s:
#         return []
#     # 分号分隔
#     if ";" in s:
#         return [t.strip() for t in s.split(";") if t.strip()]
#     # 单个标签
#     return [s]
def parse_list_cell(x):
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []
    return ast.literal_eval(s)
    
def to_enc_set(items):
    out = set()
    for it in items:
        # key = map_long.get(str(it).strip().lower())
        key = str(it).strip().lower()
        if key in ("e", "n", "c"):
            out.add(key)
    return out

# 将各种写法映射为 e/n/c
map_long = {
    "entailment": "e", "neutral": "n", "contradiction": "c",
    "e": "e", "n": "n", "c": "c"
}


# cnt2 = Counter()     
# cnt3 = Counter()     
# cnt4 = Counter()
overlap_23 = Counter()  # 两列里相同的标签计数（逐行一次）
overlap_24= Counter() 

total_col2 = Counter()
total_col3 = Counter()
total_col4 = Counter()

for items2, items3, items4 in zip(
    df["llm_validated"].apply(parse_list_cell),
    df["varierr_validated"].apply(parse_list_cell),
    df["chaos_validated"].apply(parse_list_cell)
):
    # set2 = to_enc_set(items2)
    # set3 = to_enc_set(items3)
    # set4 = to_enc_set(items4)
    # print(items2)
    set2 = {map_long.get(it) for it in items2 if map_long.get(it) in ("e", "n", "c")}
    set3 = {map_long.get(it) for it in items3 if map_long.get(it) in ("e", "n", "c")}
    set4 = {map_long.get(it) for it in items4 if map_long.get(it) in ("e", "n", "c")}

    for lab in (set2 & set3):
        overlap_23[lab] += 1

    for lab in (set2 & set4):
        overlap_24[lab] += 1

    for lab in set2:
        total_col2[lab] += 1
    for lab in set3:
        total_col3[lab] += 1
    for lab in set4:
        total_col4[lab] += 1

print("LLM")
print(f"  e: {total_col2.get('e',0)}  n: {total_col2.get('n',0)}  c: {total_col2.get('c',0)}")

print("llm和varierr：")
print(f"  e: {overlap_23.get('e',0)}  n: {overlap_23.get('n',0)}  c: {overlap_23.get('c',0)}")

print("llm和chaos：")
print(f"  e: {overlap_24.get('e',0)}  n: {overlap_24.get('n',0)}  c: {overlap_24.get('c',0)}")


print("VariErr：")
print(f"  e: {total_col3.get('e',0)}  n: {total_col3.get('n',0)}  c: {total_col3.get('c',0)}")

print("Chaos")
print(f"  e: {total_col4.get('e',0)}  n: {total_col4.get('n',0)}  c: {total_col4.get('c',0)}")



LLM
  e: 466  n: 418  c: 353
llm和varierr：
  e: 200  n: 318  c: 112
llm和chaos：
  e: 312  n: 332  c: 135
VariErr：
  e: 210  n: 380  c: 159
Chaos
  e: 327  n: 389  c: 190


In [None]:
## how many of the E/N/C labels have explanation

import json

def check_jsonl(file_path):
    sum = 0
    required_labels = {"e", "n", "c"}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line.strip())
            explanations = data.get("generated_explanations", [])
            # 取出每个解释的 label（在列表最后一个元素）
            labels = {item[-1] for item in explanations if isinstance(item, list) and item}
            missing = required_labels - labels
            if missing:
                print(f"ID: {data.get('id')} 缺少: {', '.join(missing)}")
            sum += len(labels)
        print("sum is", sum)

check_jsonl("/Users/phoebeeeee/ongoing/LLM_AED/new_processing/qwen_72b_generation_raw.jsonl")


sum is 1500


In [440]:
## how many are validated

import json
from collections import Counter

def check_jsonl_validated(file_path, required_labels=("e", "n", "c")):
    required_labels = set(required_labels)
    sample_label_counter = Counter()   # 按样本统计多少个instance包含某label
    complete_count = 0
    total_samples = 0
    missing_list = []                  # [(id, [missing labels])]

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            # total_samples += 1
            data = json.loads(line)
            exps = data.get("generated_explanations", [])

            # 每个 instance 的 validated label set
            validated_labels = set()
            for item in exps:
                if isinstance(item, list) and len(item) >= 3:
                    label = item[1]
                    status = str(item[2]).strip().lower()
                    if status == "validated":
                        validated_labels.add(label)

            # 按样本统计覆盖情况
            for lbl in validated_labels:
                sample_label_counter[lbl] += 1
            total_samples += len(validated_labels)

            # 判断是否完整
            missing = sorted(required_labels - validated_labels)
            if missing:
                missing_list.append((data.get("id"), missing))
            else:
                complete_count += 1

    # 打印汇总
    print("\n===== Summary (validated only, sample-level) =====")
    print("总样本数:", total_samples/500)
    print("完整样本数 (含 e/n/c 且均为 validated):", complete_count)
    print("多少个样本包含各label:", dict(sample_label_counter))

    return missing_list


if __name__ == "__main__":
    path = "/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/with_validation_0.8.jsonl"
    missing = check_jsonl_validated(path)
    # print(missing)  # 如果需要看缺失详情


===== Summary (validated only, sample-level) =====
总样本数: 2.834
完整样本数 (含 e/n/c 且均为 validated): 419
多少个样本包含各label: {'c': 462, 'n': 493, 'e': 462}


{"id": "73260n", "context": "The disputes among nobles were not the first concern of ordinary French citizens.", "statement": "Ordinary French citizens were not concerned with the disputes among nobles.", "entailment": [{"annotator": 1, "id": "1456-entailment-1", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": false}, {"annotator": 2, "makes_sense": false}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "In the context, \"The first concern\" can be read as a pars pro toto which would mean that it was really no concern at all.", "self_corrected": true}], "neutral": [{"annotator": 0, "id": "1456-neutral-1", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "\"not the first concern\" doesn't mean not the concern. The statement can be true or false.", "self_corrected": false}, {"annotator": 1, "id": "1456-neutral-2", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "It might not be the most important concern to the French citizens, but maybe an important concern after all.", "self_corrected": false}, {"annotator": 2, "id": "1456-neutral-3", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "The disputes among nobles could be second concern of ordinary French citizens.", "self_corrected": false}], "contradiction": [{"annotator": 3, "id": "1456-contradiction-1", "judgments": [{"annotator": 0, "makes_sense": false}, {"annotator": 1, "makes_sense": false}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "They could be concerned. But it is not their first concern", "self_corrected": false}], "idk": [], "label_count_round_1": {"contradiction": 1.0, "entailment": 1.0, "neutral": 3.0}, "label_count_round_2": {"contradiction": 1.0, "entailment": null, "neutral": 3.0}, "label_set_round_1": ["contradiction", "neutral", "entailment"], "label_set_round_2": ["neutral", "contradiction"], "error_labels": ["entailment"], "has_ambiguity": true, "chaosnli_labels": {"n": 18, "e": 72, "c": 10}}


In [443]:
import json
from collections import Counter

def count_label_sets(file_path):
    round1_counter = 0
    round2_counter = 0
    total = 0

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            total += 1
            data = json.loads(line)

            # 统计 round_1
            for lbl in data.get("label_set_round_1", []):
                round1_counter += 1

            # 统计 round_2
            for lbl in data.get("label_set_round_2", []):
                round2_counter += 1

    print("\n===== Summary =====")
    print("总样本数:", total)
    print("Round 1 label 分布:", round1_counter)
    print("Round 2 label 分布:", round2_counter)

    return round1_counter, round2_counter


if __name__ == "__main__":
    path = "/Users/phoebeeeee/ongoing/LLM_AED/dataset/varierr/varierr.json"
    r1, r2 = count_label_sets(path)



===== Summary =====
总样本数: 500
Round 1 label 分布: 878
Round 2 label 分布: 749


In [456]:
def write_merged_errors(model_jsonl, varierr_json, out_csv):
    data_a = {}
    with open(model_jsonl, "r", encoding="utf-8") as f:
        data_a = {json.loads(line)["id"]: json.loads(line) for line in f}

    data_b = {}
    with open(varierr_json, "r", encoding="utf-8") as f:
        data_b = {json.loads(line)["id"]: json.loads(line) for line in f}

    all_ids = list(data_a.keys())

    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["id", "llm_validated", "varierr_validated"])
        writer.writeheader()

        for id_ in all_ids:
            if id_ not in data_b:
                print(f"ID {id_} not found in VariErr dataset.")

            row = {
                "id": id_,
                "llm_validated": json.dumps(data_a.get(id_, {}).get("label_set_round_2", []), ensure_ascii=False),
                "varierr_validated": json.dumps(data_b.get(id_, {}).get("label_set_round_2", []), ensure_ascii=False),
            }
            writer.writerow(row)


In [477]:
import json
import csv
from pathlib import Path

def write_merged_errors_batch(model_dir, varierr_json, out_dir=None, suffix="_merged_validation.csv"):
    model_dir = Path(model_dir)
    varierr_json = Path(varierr_json)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    abb_dict = {"entailment": "e", "contradiction": "c", "neutral": "n"}

    # 读取 VariErr
    with varierr_json.open("r", encoding="utf-8") as f:
        data_b = {json.loads(line)["id"]: json.loads(line) for line in f}

    # 遍历 model_dir 下所有 .jsonl
    for model_jsonl in sorted(model_dir.glob("*.jsonl")):
        out_csv = out_dir / f"{model_jsonl.stem}{suffix}"

        # 读取 model_jsonl
        with model_jsonl.open("r", encoding="utf-8") as f:
            data_a = {json.loads(line)["id"]: json.loads(line) for line in f}

        all_ids = list(data_a.keys())

        with out_csv.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(
                f, fieldnames=["id", "llm_validated", "varierr_validated", "chaos_validated"]
            )
            writer.writeheader()

            for id_ in all_ids:
                if id_ not in data_b:
                    print(f"[{model_jsonl.name}] ID {id_} not found in VariErr dataset.")

                # VariErr 的 Round 2 labels（映射为 e/n/c）
                raw_labels = data_b.get(id_, {}).get("label_set_round_2", [])
                mapped_labels = [abb_dict.get(lbl, lbl) for lbl in raw_labels]

                # ChaosNLI validated （阈值 >=20）
                chaos_dict = data_b.get(id_, {}).get("chaosnli_labels", {})
                chaos_validated = [lbl for lbl, val in chaos_dict.items() if val >= 20]

                row = {
                    "id": id_,
                    "llm_validated": json.dumps(
                        data_a.get(id_, {}).get("label_set_round_2", []), ensure_ascii=False
                    ),
                    "varierr_validated": json.dumps(mapped_labels, ensure_ascii=False),
                    "chaos_validated": json.dumps(chaos_validated, ensure_ascii=False),
                }
                writer.writerow(row)

        print(f"Written: {out_csv}")


In [489]:
write_merged_errors_batch(
    model_dir="/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/threshold",
    varierr_json="/Users/phoebeeeee/ongoing/LLM_AED/dataset/varierr/varierr.json",
    out_dir="/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap"  # 或者指定输出目录；None 则写到 model_dir/merged_csv/
)

Written: /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap/with_validation_0.1_merged_validation.csv
Written: /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap/with_validation_0.2_merged_validation.csv
Written: /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap/with_validation_0.3_merged_validation.csv
Written: /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap/with_validation_0.4_merged_validation.csv
Written: /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap/with_validation_0.5_merged_validation.csv
Written: /Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/validated_overlap/with_validation_0.6_merged_validation.csv
Written: /Users/phoebe

{"id": "61429c", "context": "In this enclosed but airy building, you'll find ladies with large machetes expertly chopping off hunks of kingfish, tuna, or shark for eager buyers.", "statement": "You'll find small lepers chopping of chunks of tuna, its the only place they can work.", "entailment": [], "neutral": [{"annotator": 0, "id": "1265-neutral-1", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "The context doesn't mention whether the ladies are small lepers and whether its the only place they can work.", "self_corrected": false}, {"annotator": 1, "id": "1265-neutral-2", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "The context does not say anything about lepers or where they could work.", "self_corrected": false}, {"annotator": 2, "id": "1265-neutral-3", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "\"Small lepers\" don't have to be \"ladies\"; we don't know whether \"small lepers\" can find other jobs.", "self_corrected": false}, {"annotator": 3, "id": "1265-neutral-4", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "Lepers and the only place to work at are not mentioned", "self_corrected": false}], "contradiction": [], "idk": [], "label_count_round_1": {"contradiction": null, "entailment": null, "neutral": 4.0}, "label_count_round_2": {"contradiction": null, "entailment": null, "neutral": 4.0}, "label_set_round_1": ["neutral"], "label_set_round_2": ["neutral"], "error_labels": [], "has_ambiguity": false, "chaosnli_labels": {"n": 41, "c": 57, "e": 2}}


In [476]:
write_merged_errors(
    model_jsonl="/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/chase/llama_8b_chase/with_validation_0.7.jsonl",
    varierr_json="/Users/phoebeeeee/ongoing/LLM_AED/dataset/varierr/varierr.json",
    out_csv="/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/all/qwen_72b_all/validated_overlap/with_validation_0.7_merged_validation.csv"
)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/chase/llama_8b_chase/with_validation_0.7.jsonl'

In [None]:
# remove LLM-detected errors from original VariErr

import json

varierr_file = '/Users/phoebeeeee/ongoing/LLM_AED/dataset/varierr/varierr.json'
model_file = '/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/with_validation_0.8.jsonl'
output_file = '/Users/phoebeeeee/ongoing/LLM_AED/new_processing/validation_result/original/qwen_72b_original/varierr_without_qwen_72b.json'

with open(varierr_file, 'r', encoding='utf-8') as f:
    varierr_data = {json.loads(line)['id']: json.loads(line) for line in f}

with open(model_file, 'r', encoding='utf-8') as f:
    model_data = {json.loads(line)['id']: json.loads(line) for line in f}
label_map = {'e': 'entailment', 'n': 'neutral', 'c': 'contradiction'}

merged = []

for uid, var_entry in varierr_data.items():
    model_entry = model_data.get(uid, {})

    var_entry.pop('entailment', None)
    var_entry.pop('contradiction', None)
    var_entry.pop('neutral', None)
    var_entry.pop('idk', None)

    if 'error' in model_entry:
        error_raw = model_entry['error']
        error_mapped = [label_map.get(lbl, lbl) for lbl in error_raw]
        var_entry['error_llm'] = error_mapped
    if 'not_validated_exp' in model_entry:
        var_entry['not_validated_exp_llm'] = model_entry['not_validated_exp']

    original_labels = set(var_entry.get('label_set_round_1', []))
    error_labels = set(var_entry.get('error_llm', []))
    label_set_llm = sorted(original_labels - error_labels)

    var_entry['label_set_llm'] = label_set_llm
    merged.append(var_entry)

with open(output_file, 'w', encoding='utf-8') as out:
    for item in merged:
        json.dump(item, out, ensure_ascii=False)
        out.write('\n')

print(f"Done.")


{"id": "49807n", "context": "The next year, he built himself a palace, Iolani, which can still be toured in Honolulu.", "statement": "Lolani was built in only 1 year.", "entailment": [{"annotator": 1, "id": "362-entailment-1", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "\"The next year\" can be interpreted as indicating that the building of Lolani was concluded in the same year.", "self_corrected": false}, {"annotator": 3, "id": "362-entailment-2", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "It was built \"the next year\".", "self_corrected": false}], "neutral": [{"annotator": 0, "id": "362-neutral-1", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "The context makes no mention of how long it took to build lolani.", "self_corrected": false}, {"annotator": 1, "id": "362-neutral-2", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": true}], "label_correction": false, "reason": "\"The next year\" can be interpreted as indicating that the building of Lolani was started in the next year.", "self_corrected": false}, {"annotator": 2, "id": "362-neutral-3", "judgments": [{"annotator": 0, "makes_sense": true}, {"annotator": 1, "makes_sense": true}, {"annotator": 2, "makes_sense": true}, {"annotator": 3, "makes_sense": false}], "label_correction": false, "reason": "We don't know when did Lolani start to be built.", "self_corrected": false}], "contradiction": [], "idk": [], "label_count_round_1": {"contradiction": null, "entailment": 2.0, "neutral": 3.0}, "label_count_round_2": {"contradiction": null, "entailment": 2.0, "neutral": 3.0}, "label_set_round_1": ["neutral", "entailment"], "label_set_round_2": ["neutral", "entailment"], "error_labels": [], "has_ambiguity": true, "chaosnli_labels": {"n": 67, "e": 28, "c": 5}}
