In [None]:
## add id to original label guided dataset

import json
from tqdm import tqdm
from pathlib import Path

file_a_path = "../VariErr-Label-Guided-longest.json"
file_b_path = "../varierr.json"
output_path = "../VariErr-Label-Guided-longest-with-ID.json"

# Index the full dataset by its whitespace-stripped (context, statement) pair so
# that the label-guided samples can be matched back to an id.
# All files are opened as UTF-8 explicitly: the output is written with
# ensure_ascii=False, so relying on the platform default encoding can fail on
# (or corrupt) non-ASCII text.
with open(file_b_path, "r", encoding="utf-8") as f:
    # Blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    full_dataset = [json.loads(line) for line in f if line.strip()]

pair_to_id = {
    (sample["context"].strip(), sample["statement"].strip()): sample["id"]
    for sample in full_dataset
}

print(f"We have {len(pair_to_id)} pairs.")
if len(pair_to_id) != len(full_dataset):
    # Identical pairs share one dictionary key, so only the last id survives.
    print(
        f"Warning: {len(full_dataset) - len(pair_to_id)} duplicate pairs "
        "collapsed; the last id wins for each."
    )

with open(file_a_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Write one JSON object per line, adding an "id" field to every sample.
# Samples whose (premise, hypothesis) pair is unknown get id = None and are
# reported on stdout.
with open(output_path, "w", encoding="utf-8") as f_out:
    for sample in tqdm(data):
        premise = sample["premise"].strip()
        hypothesis = sample["hypothesis"].strip()
        key = (premise, hypothesis)

        if key in pair_to_id:
            sample["id"] = pair_to_id[key]
        else:
            print(f"Not found: premise='{premise}...', hypothesis='{hypothesis}...'")
            sample["id"] = None

        f_out.write(json.dumps(sample, ensure_ascii=False) + "\n")

print("Done.")


In [None]:
# integrate explanations generated by LLMs into a single file

import json, re
from pathlib import Path
from tqdm import tqdm

# Locations used by this cell (absolute paths on the author's machine).
# explanation_root: directory searched for one subfolder per sample id.
explanation_root = Path("/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw")
# input_jsonl: read line by line as JSON objects (one instance per line).
input_jsonl = Path("/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/dataset/varierr/varierr.json")
# output_jsonl: destination file, overwritten on every run.
output_jsonl = Path("/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw.jsonl")

# suffix = ".txt"

def clean_explanation(text: str) -> str:
    """Remove one leading list marker from ``text`` and trim the result.

    A marker is recognised only at the start of the string (after optional
    whitespace) and only when it is followed by whitespace. Accepted markers
    are: digits followed by ``.`` or ``)``; one of ``-``, ``•``, ``*``; a
    single ASCII letter followed by ``.`` or ``)``; or a parenthesised word
    such as ``(a)``.

    Args:
        text: One raw line of generated text.

    Returns:
        The text without the marker, stripped of surrounding whitespace.
    """
    marker = re.compile(r"^\s*(?:[\d]+[\.\)]|[-•*]|[a-zA-Z][\.\)]|\(\w+\))\s+")
    without_marker = marker.sub("", text)
    return without_marker.strip()

# File label (upper case) -> tag stored next to each explanation in the output.
label_map = {"E": "e", "N": "n", "C": "c"}

with open(input_jsonl, "r", encoding="utf-8") as f:
    instances = [json.loads(line) for line in f]

# For every instance, collect the explanations found in its per-id subfolder
# and write one JSON object per line to the output file.
with open(output_jsonl, "w", encoding="utf-8") as fout:
    for instance in tqdm(instances, desc="Inject explanations"):
        sample_id = str(instance["id"])
        subfolder = explanation_root / sample_id
        new_comments = []

        if subfolder.exists():
            for label in ["E", "N", "C"]:
                # Candidate file names in priority order. Only the bare label is
                # tried at the moment; suffixed names such as f"{label}_third.txt"
                # could be listed here as well.
                tried_files = [label]

                for fname in tried_files:
                    file_path = subfolder / f"{fname}"
                    if not file_path.exists():
                        continue
                    # Every non-blank line is one explanation; strip its list marker.
                    with open(file_path, "r", encoding="utf-8") as f:
                        for line in f:
                            if line.strip():
                                new_comments.append(
                                    [clean_explanation(line), label_map[label]]
                                )
                    break
                else:
                    # Reached only when no candidate file existed.
                    print(f"No file found for {label} in {subfolder}")
        else:
            print(f"missing folder: {subfolder}")

        new_instance = {
            "id": instance["id"],
            "premise": instance["context"],
            "hypothesis": instance["statement"],
            "generated_explanations": new_comments
        }

        fout.write(json.dumps(new_instance, ensure_ascii=False) + "\n")

print("Done.")


Inject explanations: 100%|██████████| 500/500 [00:00<00:00, 1360.50it/s]

Done.





In [43]:
# count number

import os
import re
import pandas as pd

# Directory whose immediate subfolders are scanned for generation files.
ROOT_FOLDER = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/qwen_7b_generation_raw"

def count_generations(file_path):
    """Return the number of non-blank lines in the UTF-8 text file ``file_path``.

    A line counts as blank when it is empty or contains only whitespace.
    """
    non_blank = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            if raw_line.strip():
                non_blank += 1
    return non_blank

def count_all_generations(root_folder=None, suffix="_third.txt", show_table=False):
    """Count generated lines per label over every subfolder and print the totals.

    For each immediate subdirectory of the root, the files ``E<suffix>``,
    ``N<suffix>`` and ``C<suffix>`` are counted with ``count_generations``;
    a missing file contributes 0.

    Args:
        root_folder: Directory to scan. Defaults to the module-level
            ``ROOT_FOLDER`` when ``None``.
        suffix: File-name suffix appended to each label.
        show_table: When true, also print a per-folder table with a final
            ``TOTAL`` row.

    Returns:
        None. The summary line is printed to stdout.
    """
    if root_folder is None:
        root_folder = ROOT_FOLDER

    labels = ["E", "N", "C"]
    records = []
    total = {label: 0 for label in labels}

    # Sorted so the optional table has a stable row order; totals are unaffected.
    for subfolder in sorted(os.listdir(root_folder)):
        sub_path = os.path.join(root_folder, subfolder)
        if not os.path.isdir(sub_path):
            continue

        row = {"folder": subfolder}
        for label in labels:
            file_path = os.path.join(sub_path, f"{label}{suffix}")
            count = count_generations(file_path) if os.path.isfile(file_path) else 0
            row[label] = count
            total[label] += count

        records.append(row)

    if show_table:
        # Explicit columns keep the TOTAL-row assignment valid even when no
        # subfolder was found; a frame without columns rejects it with an error.
        df = pd.DataFrame(records, columns=["folder"] + labels)
        df.loc["TOTAL"] = ["TOTAL"] + [total["E"], total["N"], total["C"]]
        print(df)

    print(f"E={total['E']}，N={total['N']}，C={total['C']}，total: {total['E'] + total['N'] + total['C']}")

count_all_generations()


E=1198，N=1407，C=1415，total: 4020


In [None]:
# get avg score for instance

import json
from collections import defaultdict

with open('../scores.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Group the scores by everything in front of the last '-' of each key, so that
# all keys sharing that prefix are averaged together.
groups = defaultdict(list)
for key, value in data.items():
    id_label, separator, _ = key.rpartition('-')
    if not separator:
        # The key contains no '-' at all, so it cannot be split into a group.
        print(f"'{key}' does not match.")
        continue
    groups[id_label].append(value)

# Every group holds at least one value, so the division is always defined.
averaged_data = {k: sum(v) / len(v) for k, v in groups.items()}

# NOTE: the extension used to be misspelled ".jsonn"; update any reader that
# still points at the old file name.
with open('../avg_llama3.1_scores.json', 'w', encoding='utf-8') as f:
    json.dump(averaged_data, f, indent=2)


In [None]:
## Thresholding for ChaosNLI
import json

def process_label_distribution(label_probs, threshold=0.2):
    """Turn a label distribution into a uniform one over the labels that pass a threshold.

    Args:
        label_probs: Iterable of probabilities, one per label position.
        threshold: Minimum probability (inclusive) for a label to be kept.

    Returns:
        A list of exactly three floats. Each kept position among the first
        three receives ``1 / (number of kept positions)``; every other
        position is ``0.0``. When nothing passes the threshold the result is
        ``[0.0, 0.0, 0.0]``.
    """
    selected = set()
    for position, prob in enumerate(label_probs):
        if prob >= threshold:
            selected.add(position)

    if not selected:
        return [0.0, 0.0, 0.0]

    share = 1.0 / len(selected)
    return [share if slot in selected else 0.0 for slot in range(3)]

input_file = '../dev_cleaned.json'
output_file = '../dev_cleaned_20.json'

# Stream the input line by line: each line is one JSON object whose 'label'
# field is replaced by its thresholded distribution before being written out.
with open(input_file, 'r', encoding='utf-8') as fin:
    with open(output_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            item = json.loads(line)
            item['label'] = process_label_distribution(item['label'])
            serialized = json.dumps(item, ensure_ascii=False)
            fout.write(serialized + '\n')


In [14]:
import pandas as pd
import ast
from collections import Counter

# Absolute path to the CSV that is loaded into `df` for the counts below.
path = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/llama-70b-all/merged_errors.csv"  # change this to your file path
df = pd.read_csv(path)

# Convert cells such as '["c"]' or '[]' into Python lists, tolerating
# doubled double-quotes.
def parse_list_cell(x):
    """Parse one CSV cell holding a list literal and always return a list.

    Args:
        x: The raw cell value: a string such as ``'["c"]'``, a missing value
            (NaN/None), or an already materialised list/tuple/set.

    Returns:
        A list of the parsed elements. Missing, blank or unparsable cells
        give ``[]``. A bare string literal such as ``'"entailment"'`` becomes
        a one-element list instead of being returned as a string, which
        callers would then iterate character by character. Any other
        non-sequence literal (for example a number) gives ``[]``.
    """
    # Already a collection: nothing to parse, and pd.isna would return an array.
    if isinstance(x, (list, tuple, set, frozenset)):
        return list(x)
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []

    # Exceptions ast.literal_eval is documented to raise on malformed input.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    # Second attempt: some CSV exports double every double-quote character.
    for candidate in (s, s.replace('""', '"')):
        try:
            parsed = ast.literal_eval(candidate)
        except literal_errors:
            continue
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        if isinstance(parsed, str):
            return [parsed]
        return []
    return []

# Map both the long label names and the short codes onto e/n/c.
map_long = {"entailment": "e", "neutral": "n", "contradiction": "c",
            "e": "e", "n": "n", "c": "c"}

cnt2 = Counter()
cnt3 = Counter()

# Tally the labels of each column in turn: first the second column (usually
# 'e'/'n'/'c'), then the third one (which may hold full words). Entries that
# are not keys of map_long are ignored.
for column, tally in (("llm_not_calidated_error", cnt2), ("varierr_error", cnt3)):
    for items in df[column].apply(parse_list_cell):
        tally.update(map_long[it] for it in items if it in map_long)

# Report the per-label counts; a Counter yields 0 for labels never seen.
for header, tally in (("第二列 llm_not_calidated_error：", cnt2),
                      ("第三列 varierr_error：", cnt3)):
    print(header)
    print(f"  e: {tally['e']}  n: {tally['n']}  c: {tally['c']}")


第二列 llm_not_calidated_error：
  e: 76  n: 69  c: 247
第三列 varierr_error：
  e: 53  n: 23  c: 53


In [15]:
import pandas as pd
import ast
from collections import Counter

# Absolute path to the CSV that is loaded into `df` for the overlap analysis below.
path = "/Users/phoebeeeee/ongoing/Beyond-noise-MA-Zuo/EACL/llama-70b-all/merged_errors.csv"  # change this to your CSV path
df = pd.read_csv(path)

def parse_list_cell(x):
    """Parse one CSV cell holding a list literal and always return a list.

    Args:
        x: The raw cell value: a string such as ``'["c"]'`` or ``'[]'``, a
            missing value (NaN/None), or an already materialised
            list/tuple/set.

    Returns:
        A list of the parsed elements. Missing, blank or unparsable cells
        give ``[]``. A bare string literal such as ``'"entailment"'`` becomes
        a one-element list instead of being returned as a string, which
        callers would then iterate character by character. Any other
        non-sequence literal (for example a number) gives ``[]``.
    """
    # Already a collection: nothing to parse, and pd.isna would return an array.
    if isinstance(x, (list, tuple, set, frozenset)):
        return list(x)
    if pd.isna(x):
        return []
    s = str(x).strip()
    if not s:
        return []

    # Exceptions ast.literal_eval is documented to raise on malformed input.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    # Second attempt repairs double-quote characters that were doubled by CSV escaping.
    for candidate in (s, s.replace('""', '"')):
        try:
            parsed = ast.literal_eval(candidate)
        except literal_errors:
            continue
        if isinstance(parsed, (list, tuple, set, frozenset)):
            return list(parsed)
        if isinstance(parsed, str):
            return [parsed]
        return []
    return []

# Canonical short code (e/n/c) for every accepted spelling of a label.
MAP = {"entailment": "e", "neutral": "n", "contradiction": "c",
       "e": "e", "n": "n", "c": "c"}

def normalize(lst):
    """Map each label in ``lst`` to its e/n/c code, dropping unknown entries.

    Order and repetitions of the recognised labels are preserved.
    """
    return [MAP[it] for it in lst if it in MAP]

def dupes(lst):
    """Return the distinct items occurring more than once in ``lst``.

    Items are listed in order of their first appearance.
    """
    tally = Counter(lst)
    repeated = []
    for item, times in tally.items():
        if times > 1:
            repeated.append(item)
    return repeated

overlap_rows = 0
overlap_label_counter = Counter()        # overlapping labels in total, keyed by e/n/c
overlap_per_row_counter = Counter()      # how many rows overlap in 1/2/3 labels

for idx, row in df.iterrows():
    rid = row.get("id", idx)  # fall back to the row index when there is no id column
    col2_raw = parse_list_cell(row["llm_not_calidated_error"])
    col3_raw = parse_list_cell(row["varierr_error"])

    col2, col3 = normalize(col2_raw), normalize(col3_raw)

    # Repetitions inside a single column (reported only, never counted).
    d2, d3 = dupes(col2), dupes(col3)

    # Labels present in both columns, compared as sets.
    overlap = sorted(set(col2).intersection(col3))

    # Rows with nothing to report are skipped entirely.
    if not (d2 or d3 or overlap):
        continue

    print(f"Row {idx} | id={rid}")
    if d2:
        print(f"  重复(第二列 llm_not_calidated_error): {d2}  原始: {col2_raw}")
    if d3:
        print(f"  重复(第三列 varierr_error): {d3}       原始: {col3_raw}")
    if not overlap:
        continue

    print(f"  重叠标签个数：{len(overlap)}")
    overlap_rows += 1
    overlap_label_counter.update(overlap)          # per-label tally (e/n/c)
    overlap_per_row_counter[len(overlap)] += 1     # size of this row's overlap

# ---- Summary ----
total_rows = len(df)
print("\n===== 汇总 =====")
print(f"存在两列重叠标签的行数：{overlap_rows} / {total_rows}")

print("重叠标签总计（按 e/n/c）：")
for k in "enc":
    print(f"  {k}: {overlap_label_counter[k]}")

print("每行重叠个数分布：")
for n, row_count in sorted(overlap_per_row_counter.items()):
    print(f"  重叠 {n} 个标签的行数：{row_count}")


Row 10 | id=48454c
  重叠标签个数：1
Row 27 | id=16996e
  重叠标签个数：1
Row 60 | id=65130n
  重叠标签个数：1
Row 84 | id=49462c
  重叠标签个数：1
Row 98 | id=52542n
  重叠标签个数：1
Row 115 | id=82415n
  重叠标签个数：1
Row 138 | id=80630e
  重叠标签个数：1
Row 152 | id=106013c
  重叠标签个数：1
Row 176 | id=89995c
  重叠标签个数：1
Row 186 | id=17576n
  重叠标签个数：1
Row 188 | id=64123c
  重叠标签个数：1
Row 206 | id=84055n
  重叠标签个数：1
Row 234 | id=124853e
  重叠标签个数：1
Row 241 | id=87332c
  重叠标签个数：1
Row 260 | id=73191n
  重叠标签个数：1
Row 289 | id=75259c
  重叠标签个数：1
Row 300 | id=102857n
  重叠标签个数：1
Row 303 | id=104412e
  重叠标签个数：1
Row 311 | id=21340n
  重叠标签个数：1
Row 319 | id=34043c
  重叠标签个数：1
Row 358 | id=32754n
  重叠标签个数：1
Row 366 | id=34573n
  重叠标签个数：1
Row 369 | id=118460n
  重叠标签个数：1
Row 379 | id=138530e
  重叠标签个数：1
Row 394 | id=76037n
  重叠标签个数：1
Row 431 | id=81579e
  重叠标签个数：1
Row 443 | id=46059c
  重叠标签个数：1
Row 454 | id=82510c
  重叠标签个数：1
Row 481 | id=142604e
  重叠标签个数：1
Row 484 | id=13765c
  重叠标签个数：1
Row 486 | id=116176c
  重叠标签个数：1
Row 487 | id=110234e
  重叠标签个数：1
Row 

In [None]:
# combine multiple generations by ID    

import argparse
import json
from collections import OrderedDict

# Keys every input record must carry; records lacking any of them are skipped.
REQ_FIELDS = ["id", "premise", "hypothesis", "generated_explanations"]

def read_jsonl(path):
    """Yield one parsed JSON object per non-blank line of the UTF-8 file ``path``."""
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

def _explanation_key(entry):
    """Return the hashable (text, label) key used to de-duplicate explanations."""
    if isinstance(entry, list) and len(entry) >= 2:
        return (entry[0], entry[1])
    return tuple(entry)

def merge_by_id(files):
    """Merge the ``generated_explanations`` of several JSONL files by record id.

    Args:
        files: Iterable of JSONL paths, processed in the given order.

    Returns:
        An ``OrderedDict`` mapping each id (in order of first appearance) to
        a record with the keys ``id``, ``premise``, ``hypothesis`` and
        ``generated_explanations``. When the same id occurs several times,
        the premise/hypothesis of its first occurrence are kept and the
        explanations are concatenated in reading order, dropping entries
        whose (text, label) pair was already collected for that id.
    """
    merged = OrderedDict()
    # Per-id set of explanation keys already stored, so the de-duplication
    # check does not rebuild a set for every record.
    seen = {}
    for fp in files:
        for obj in read_jsonl(fp):
            if not all(k in obj for k in REQ_FIELDS):
                continue

            _id = obj["id"]
            if _id not in merged:
                merged[_id] = {
                    "id": _id,
                    "premise": obj["premise"],
                    "hypothesis": obj["hypothesis"],
                    "generated_explanations": []
                }
                seen[_id] = set()

            # Append this record's explanations, skipping (text, label)
            # duplicates. A null list is treated as empty.
            collected = merged[_id]["generated_explanations"]
            for e in obj["generated_explanations"] or []:
                key = _explanation_key(e)
                if key not in seen[_id]:
                    collected.append(e)
                    seen[_id].add(key)
    return merged

def write_jsonl(merged, out_path):
    """Write every value of ``merged`` to ``out_path`` as one JSON object per line.

    The file is overwritten, encoded as UTF-8, and non-ASCII characters are
    written verbatim rather than escaped.
    """
    with open(out_path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in merged.values()
        )

def main(argv=None):
    """Command-line entry point: merge JSONL files by id and write the result.

    Args:
        argv: Optional list of argument strings. When ``None`` the arguments
            are taken from ``sys.argv`` (the usual command-line behaviour).
            Passing a list, e.g. ``main(["a.jsonl", "b.jsonl", "-o", "out.jsonl"])``,
            lets the function be called from a notebook, where ``sys.argv``
            holds the kernel's own arguments and makes argument parsing fail.
    """
    parser = argparse.ArgumentParser(description="合并多个 JSONL（绝对路径），按 id 合并 generated_explanations。")
    parser.add_argument("files", nargs="+", help="一个或多个 JSONL 文件的绝对路径")
    parser.add_argument("-o", "--output", default="generation_all.jsonl", help="输出文件路径")
    args = parser.parse_args(argv)

    merged = merge_by_id(args.files)
    write_jsonl(merged, args.output)
    print(f"完成：合并 {len(merged)} 个 id，已保存到 {args.output}")

if __name__ == "__main__":
    main()
