In [None]:
# Count the peer-validated explanations per label and write the counts to ``label_count_round_3``

import json

input_path = "../varierr.json"
output_path = "../varierr_with_peer_validated.json"
labels = ["entailment", "neutral", "contradiction"]

# For every record (one JSON object per line) count, per label, how many
# explanations were peer-validated, and store the result under
# ``label_count_round_3``.
#
# Both files are opened as UTF-8 explicitly: records are written with
# ``ensure_ascii=False``, so relying on the platform default encoding could
# garble or reject non-ASCII text.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        data = json.loads(line)
        peer_validated_count = {}

        for label in labels:
            # ``or []`` also covers an explicit JSON ``null`` for the label.
            explanations = data.get(label) or []
            count = 0

            for entry in explanations:
                if not isinstance(entry, dict):
                    continue
                # Annotator recorded on the explanation (presumably its author).
                ak = entry.get("annotator")

                # Only judgments from *other* annotators with
                # ``makes_sense`` exactly ``True`` count as a yes vote.
                peer_yes_votes = sum(
                    1
                    for j in entry.get("judgments") or []
                    if isinstance(j, dict)
                    and j.get("annotator") != ak
                    and j.get("makes_sense") is True
                )

                # An explanation is peer-validated with at least two yes votes.
                if peer_yes_votes >= 2:
                    count += 1

            # Labels without any validated explanation are stored as null.
            peer_validated_count[label] = count if count > 0 else None

        data["label_count_round_3"] = peer_validated_count
        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")


In [None]:
# aggregated --> label_set_round_3

import json

input_path = "../varierr_with_peer_validated.json"
output_path = "../varierr_with_peer.json"
labels = ["entailment", "neutral", "contradiction"]

# Derive ``label_set_round_3``: the labels (in the order of ``labels``) whose
# ``label_count_round_3`` entry is a number of at least 1.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for raw_line in infile:
        data = json.loads(raw_line)
        round_3_counts = data.get("label_count_round_3", {})

        kept_labels = []
        for name in labels:
            votes = round_3_counts.get(name)
            if votes is None:
                # null means no peer-validated explanation for this label
                continue
            if votes >= 1:
                kept_labels.append(name)

        data["label_set_round_3"] = kept_labels
        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")


In [None]:
## label_set_round_1 - label_set_round_3 = error_peer
import json

input_path = "../varierr_with_peer.json"
# NOTE: this is the same path the first cell of the notebook writes to, so
# that file is overwritten here with records that also carry ``error_peer``.
output_path = "../varierr_with_peer_validated.json"

all_passed = True
error_records = []

# Parse the input once; the same records feed the sanity check and the export.
# The input was written as UTF-8, so it is read back as UTF-8 explicitly.
with open(input_path, "r", encoding="utf-8") as infile:
    records = [json.loads(line) for line in infile]

# Sanity check on the annotations already present in the data:
# round 1 minus round 2 must equal the stored ``error_labels``.
for data in records:
    label_set_round_1 = set(data.get("label_set_round_1", []))
    label_set_round_2 = set(data.get("label_set_round_2", []))
    error_labels = set(data.get("error_labels", []))

    if label_set_round_1 - label_set_round_2 != error_labels:
        all_passed = False
        error_records.append(data.get("id", "unknown_id"))

if all_passed:
    print("All correct")

    with open(output_path, "w", encoding="utf-8") as outfile:
        for data in records:
            label_set_round_3 = set(data.get("label_set_round_3", []))

            # error_peer = round 1 minus round 3. The round-1 order is kept
            # (duplicates dropped) so the output is identical across runs;
            # the iteration order of a set difference is not guaranteed.
            error_peer = [
                label
                for label in dict.fromkeys(data.get("label_set_round_1", []))
                if label not in label_set_round_3
            ]
            data["error_peer"] = error_peer

            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

    print("New file generated.")

else:
    # Nothing is written when the check fails; list the offending ids instead.
    print("Error records:")
    for eid in error_records:
        print(eid)



In [None]:
## random noise

import json
import random

def inject_multiple_noises(input_path, output_path, seed, label_choices=["entailment", "neutral", "contradiction"]):
    """Add uniformly sampled noise labels to every record that has peer errors.

    Reads ``input_path`` as JSON lines. For each record whose ``error_peer``
    list is non-empty, one label is drawn with ``random.choice`` from
    ``label_choices`` per entry of ``error_peer`` and the draws are stored
    under ``random_noise_peer``. Every record is written to ``output_path``,
    including those left unchanged.

    Args:
        input_path: JSON-lines file to read.
        output_path: JSON-lines file to write (overwritten).
        seed: Seed passed to ``random.seed``; note this reseeds the global
            ``random`` generator.
        label_choices: Sequence of labels to sample from. The default list is
            never mutated by this function.
    """
    random.seed(seed)
    # Explicit UTF-8 on both sides: the file is written with
    # ``ensure_ascii=False`` (as the other cells of this notebook do), so the
    # platform default encoding must not be relied upon.
    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing in json.loads.
                continue
            data = json.loads(line)
            # ``or []`` also covers an explicit JSON null.
            error_labels = data.get("error_peer") or []

            if error_labels:
                # Only the number of errors matters: one draw per error label.
                noise_labels = [random.choice(label_choices) for _ in error_labels]
                data["random_noise_peer"] = noise_labels

            fout.write(json.dumps(data, ensure_ascii=False) + "\n")
    print(f"Injected noise into {output_path}")

# Build the uniform-random noise variant of the peer-validated file (seed 44).
inject_multiple_noises(
    input_path="../varierr_with_peer_validated.json",
    output_path="../varierr_peer_random_44.json",
    seed=44,
)


Injected noise into /Users/phoebeeeee/ongoing/Beyond-noise/dataset/varierr_peer_random_noise/varierr_peer_random_44.json


In [None]:
# new count 

import json
from collections import Counter

input_path = "../varierr_peer_random_44.json"
output_path = "../varierr_peer_random_count_44.json"


labels = ["entailment", "neutral", "contradiction"]

# Per label: 1 if the label is in ``label_set_round_3`` plus the number of
# times it occurs in ``random_noise_peer``; stored as a float, or null when
# the total is zero.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for raw_line in infile:
        data = json.loads(raw_line)

        validated = set(data.get("label_set_round_3", []))
        noise_tally = Counter(data.get("random_noise_peer", []))

        per_label = {}
        for name in labels:
            # A Counter yields 0 for labels that never occur in the noise list.
            votes = (1 if name in validated else 0) + noise_tally[name]
            per_label[name] = float(votes) if votes > 0 else None

        data["random_noise_labels_count"] = per_label

        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")



In [None]:
## generate distribution

import json

# Turn the random-noise label counts written by the preceding "new count"
# cell into a probability distribution stored under ``label``.
#
# The paths and the count key point at this pipeline's own files: the
# following "clean" cell reads ``../varierr_peer_random_dist_44.json``, and
# the counts live under ``random_noise_labels_count``. (``label_set_round_2``
# is handled as a list of labels elsewhere in this notebook, so it cannot be
# passed to ``convert_label_count_to_dist``.)
input_path = "../varierr_peer_random_count_44.json"
output_path = "../varierr_peer_random_dist_44.json"


def convert_label_count_to_dist(label_count_dict):
    """Normalise per-label counts into ``[entailment, neutral, contradiction]``.

    Args:
        label_count_dict: Mapping from label name to a count; missing, ``None``
            or zero entries count as 0.

    Returns:
        A list of three floats in the order entailment, neutral,
        contradiction. All zeros when the counts sum to 0.
    """
    c = label_count_dict.get("contradiction") or 0
    e = label_count_dict.get("entailment") or 0
    n = label_count_dict.get("neutral") or 0

    total = c + e + n
    if total == 0:
        return [0.0, 0.0, 0.0]
    return [e / total, n / total, c / total]


with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:

    for line in infile:
        item = json.loads(line)

        ## change here "random_noise_labels_count"/"dist_noise_labels_count"
        label_counts = item.get("random_noise_labels_count")
        # Records without a count dict are passed through without ``label``.
        if label_counts:
            item["label"] = convert_label_count_to_dist(label_counts)

        json.dump(item, outfile)
        outfile.write("\n")

print("Done")


In [None]:
## clean

import json
from pathlib import Path
from tqdm import tqdm

def normalize_label_dist(chaos_dict):
    """Scale the three label weights of ``chaos_dict`` so that they sum to 1.

    The result is ordered entailment, neutral, contradiction. Missing or
    falsy entries (``None``, 0) are treated as 0.0, and three zeros are
    returned when the weights sum to 0. Not called by ``process_file`` in
    this cell.
    """
    label_order = ["entailment", "neutral", "contradiction"]
    values = []
    for key in label_order:
        weight = chaos_dict.get(key)
        values.append(weight if weight else 0.0)

    total = sum(values)
    if total == 0:
        return [0.0] * 3
    return [weight / total for weight in values]


def process_file(input_path, output_path):
    """Reduce every JSON-lines record to ``uid``/``premise``/``hypothesis``/``label``.

    Field sources per record:
      * ``uid``: ``id``, falling back to ``uid``.
      * ``premise``: ``context`` if truthy, else ``example.premise``.
      * ``hypothesis``: ``statement`` if truthy, else ``example.hypothesis``.
      * ``label``: ``label`` (``None`` when absent).

    Args:
        input_path: JSON-lines file to read.
        output_path: JSON-lines file to write (overwritten).

    Raises:
        KeyError: If a record has neither the flat field nor the ``example``
            fallback for premise or hypothesis.
    """
    # Explicit UTF-8: the output is written with ``ensure_ascii=False``, so the
    # platform default encoding must not be relied upon.
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        for line in tqdm(fin, desc=f"Processing {input_path}"):
            if not line.strip():
                # Skip blank lines instead of failing in json.loads.
                continue
            raw = json.loads(line)
            out = {
                "uid": raw.get("id", raw.get("uid")),
                "premise": raw.get("context") or raw["example"]["premise"],
                "hypothesis": raw.get("statement") or raw["example"]["hypothesis"],
                "label": raw.get("label")}
            fout.write(json.dumps(out, ensure_ascii=False) + "\n")

if __name__ == "__main__":
    # Reduce the random-noise distribution file to uid/premise/hypothesis/label.
    input_path, output_path = (
        "../varierr_peer_random_dist_44.json",
        "../varierr_peer_random_cleaned_44.json",
    )
    process_file(input_path, output_path)


# distributional

In [None]:
## distributional

import json
import random

def inject_noise_with_distribution(input_file, output_file, seed, label_pool=None, label_weights=None):
    """Add noise labels drawn from a weighted label distribution.

    Reads ``input_file`` as JSON lines. For each record with a non-empty
    ``error_peer`` list, ``len(error_peer)`` labels are drawn with
    ``random.choices`` and stored under ``distribution_noise_label``. Every
    record is written to ``output_file``, including those left unchanged.

    Args:
        input_file: JSON-lines file to read.
        output_file: JSON-lines file to write (overwritten).
        seed: Seed passed to ``random.seed``; note this reseeds the global
            ``random`` generator.
        label_pool: Labels to sample from. Defaults to
            ``['entailment', 'neutral', 'contradiction']``.
        label_weights: Relative weights aligned with ``label_pool``. Defaults
            to ``[87, 69, 82]`` (presumably observed label frequencies;
            confirm against the data source).
    """
    if label_pool is None:
        label_pool = ['entailment', 'neutral', 'contradiction']
    if label_weights is None:
        label_weights = [87, 69, 82]
    random.seed(seed)

    with open(input_file, 'r', encoding='utf-8') as fin, \
         open(output_file, 'w', encoding='utf-8') as fout:

        for line in fin:
            line = line.strip()
            if not line:
                # Skip blank lines instead of failing in json.loads.
                continue
            data = json.loads(line)

            # ``or []`` also covers an explicit JSON null.
            error_count = len(data.get("error_peer") or [])
            if error_count > 0:
                # One weighted draw per peer error.
                noise_labels = random.choices(label_pool, weights=label_weights, k=error_count)
                data["distribution_noise_label"] = noise_labels
            fout.write(json.dumps(data, ensure_ascii=False) + "\n")

# Build the distribution-weighted noise variant of the peer-validated file (seed 44).
inject_noise_with_distribution(
    input_file="../varierr_with_peer_validated.json",
    output_file="../varierr_peer_dist_44.json",
    seed=44,
)


In [None]:
# new count 

import json
from collections import Counter

input_path = "../varierr_peer_dist_44.json"
output_path = "../varierr_peer_dist_count_44.json"


labels = ["entailment", "neutral", "contradiction"]

# Per label: 1 if the label is in ``label_set_round_3`` plus the number of
# times it occurs in ``distribution_noise_label``; stored as a float, or null
# when the total is zero.
#
# The input file is written as UTF-8 by ``inject_noise_with_distribution`` and
# the output uses ``ensure_ascii=False``, so both are opened as UTF-8
# explicitly rather than with the platform default encoding.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        data = json.loads(line)

        label_set_round_3 = set(data.get("label_set_round_3", []))
        # ``or []`` also covers an explicit JSON null.
        noise_peer = data.get("distribution_noise_label") or []

        noise_counter = Counter(noise_peer)
        dist_noise_labels_count = {}
        for label in labels:
            count = 0

            if label in label_set_round_3:
                count += 1
            count += noise_counter.get(label, 0)
            dist_noise_labels_count[label] = float(count) if count > 0 else None

        data["dist_noise_labels_count"] = dist_noise_labels_count

        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")

In [None]:
## generate distribution

import json

# Source (per-label noise counts) and destination (records with ``label``).
input_path, output_path = (
    "../varierr_peer_dist_count_44.json",
    "../varierr_peer_dist_dist_44.json",
)


def convert_label_count_to_dist(label_count_dict):
    """Normalise per-label counts into ``[entailment, neutral, contradiction]``.

    Missing, ``None`` or zero counts are treated as 0. When all three counts
    sum to 0 the result is ``[0.0, 0.0, 0.0]``.
    """

    def _count(name):
        value = label_count_dict.get(name)
        return value if value else 0

    contradiction = _count("contradiction")
    entailment = _count("entailment")
    neutral = _count("neutral")

    denominator = contradiction + entailment + neutral
    if denominator == 0:
        return [0.0, 0.0, 0.0]
    # Output order differs from the lookup order above on purpose.
    return [part / denominator for part in (entailment, neutral, contradiction)]

# Add a ``label`` distribution to every record that has a count dict.
# Upstream cells write records with ``ensure_ascii=False``, so the input may
# contain raw non-ASCII text; both files are therefore opened as UTF-8
# explicitly instead of with the platform default encoding.
with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:

    for line in infile:
        item = json.loads(line)

        ## change here "random_noise_labels_count"/"dist_noise_labels_count"
        label_counts = item.get("dist_noise_labels_count")
        # Records without a count dict are passed through without ``label``.
        if label_counts:
            item["label"] = convert_label_count_to_dist(label_counts)

        json.dump(item, outfile)
        outfile.write("\n")

print("Done")


In [None]:
## clean

import json
from pathlib import Path
from tqdm import tqdm

def normalize_label_dist(chaos_dict):
    """Return the entailment/neutral/contradiction weights scaled to sum to 1.

    Entries that are missing or falsy count as 0.0; if everything sums to 0
    the result is three zeros. Not called by ``process_file`` in this cell.
    """
    weights = [
        chaos_dict.get(name) or 0.0
        for name in ("entailment", "neutral", "contradiction")
    ]
    denominator = sum(weights)
    if denominator != 0:
        return [weight / denominator for weight in weights]
    return [0.0, 0.0, 0.0]


def process_file(input_path, output_path):
    """Reduce every JSON-lines record to ``uid``/``premise``/``hypothesis``/``label``.

    ``uid`` comes from ``id`` (falling back to ``uid``), ``premise`` from
    ``context`` or ``example.premise``, ``hypothesis`` from ``statement`` or
    ``example.hypothesis``, and ``label`` from ``label`` (``None`` if absent).

    Args:
        input_path: JSON-lines file to read.
        output_path: JSON-lines file to write (overwritten).

    Raises:
        KeyError: If a record has neither the flat field nor the ``example``
            fallback for premise or hypothesis.
    """
    # Explicit UTF-8: the output is written with ``ensure_ascii=False``, so the
    # platform default encoding must not be relied upon.
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        for line in tqdm(fin, desc=f"Processing {input_path}"):
            if not line.strip():
                # Skip blank lines instead of failing in json.loads.
                continue
            raw = json.loads(line)
            out = {
                "uid": raw.get("id", raw.get("uid")),
                "premise": raw.get("context") or raw["example"]["premise"],
                "hypothesis": raw.get("statement") or raw["example"]["hypothesis"],
                "label": raw.get("label")}
            fout.write(json.dumps(out, ensure_ascii=False) + "\n")

# Example invocation
if __name__ == "__main__":
    # Reduce the distribution-noise file to uid/premise/hypothesis/label.
    input_path, output_path = (
        "../varierr_peer_dist_dist_44.json",
        "../varierr_peer_dist_cleaned_44.json",
    )
    process_file(input_path, output_path)


## peer-validated without noise

In [None]:
import json
from collections import Counter

input_path = "../varierr_with_peer_validated.json"
output_path = "../varierr_peer_no_error_dist.json"

labels_order = ["entailment", "neutral", "contradiction"]

# Normalise ``label_count_round_3`` into a distribution ordered as
# ``labels_order`` and store it under ``labels`` (plural).
with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        data = json.loads(line)
        # Fall back to an empty dict (not a list): ``.get`` is called on the
        # value below, so a missing or null entry must still be a mapping.
        label_count_round = data.get("label_count_round_3") or {}

        cleaned_counts = {}
        for label in labels_order:
            value = label_count_round.get(label)
            # null counts mean "no validated explanation" and count as 0.
            cleaned_counts[label] = 0.0 if value is None else value

        total = sum(cleaned_counts.values())

        if total == 0:
            label_distribution = [0.0, 0.0, 0.0]
        else:
            label_distribution = [cleaned_counts[label] / total for label in labels_order]

        data["labels"] = label_distribution
        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")


In [None]:
## clean no error

import json
from pathlib import Path
from tqdm import tqdm

def normalize_label_dist(chaos_dict):
    """Normalise label weights to proportions (entailment, neutral, contradiction).

    A missing or falsy weight counts as 0.0. When the weights sum to 0 the
    function returns ``[0.0, 0.0, 0.0]``. Not called by ``process_file`` in
    this cell.
    """
    label_order = ["entailment", "neutral", "contradiction"]
    values = list(map(lambda name: chaos_dict.get(name) or 0.0, label_order))
    total = sum(values)
    return [0.0] * 3 if total == 0 else [share / total for share in values]


def process_file(input_path, output_path):
    """Reduce every JSON-lines record to ``uid``/``premise``/``hypothesis``/``label``.

    ``uid`` comes from ``id`` (falling back to ``uid``), ``premise`` from
    ``context`` or ``example.premise``, ``hypothesis`` from ``statement`` or
    ``example.hypothesis``. Unlike the other cleaning cells, the output
    ``label`` is read from the input key ``labels`` (plural).

    Args:
        input_path: JSON-lines file to read.
        output_path: JSON-lines file to write (overwritten).

    Raises:
        KeyError: If a record has neither the flat field nor the ``example``
            fallback for premise or hypothesis.
    """
    # Explicit UTF-8: the output is written with ``ensure_ascii=False``, so the
    # platform default encoding must not be relied upon.
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        for line in tqdm(fin, desc=f"Processing {input_path}"):
            if not line.strip():
                # Skip blank lines instead of failing in json.loads.
                continue
            raw = json.loads(line)
            out = {
                "uid": raw.get("id", raw.get("uid")),
                "premise": raw.get("context") or raw["example"]["premise"],
                "hypothesis": raw.get("statement") or raw["example"]["hypothesis"],
                "label": raw.get("labels")
            }
            fout.write(json.dumps(out, ensure_ascii=False) + "\n")

if __name__ == "__main__":
    # Reduce the noise-free distribution file to uid/premise/hypothesis/label.
    input_path, output_path = (
        "../varierr_peer_no_error_dist.json",
        "../varierr_peer_no_error_cleaned.json",
    )
    process_file(input_path, output_path)
