In [None]:
## random noise

import json
import random

def inject_multiple_noises(input_path, output_path, seed,
                           label_choices=("entailment", "neutral", "contradiction")):
    """Copy a JSON-lines file, attaching uniformly sampled noise labels.

    For every record whose ``error_labels`` value is non-empty, one label is
    drawn with ``random.choice`` from ``label_choices`` per entry in
    ``error_labels``; the draws are stored under ``random_noise_labels``.
    Records without error labels are written through unchanged.

    Args:
        input_path: JSON-lines file to read (one JSON object per line).
        output_path: JSON-lines file to write (overwritten if it exists).
        seed: Value passed to ``random.seed`` so repeated runs give the same noise.
        label_choices: Sequence of labels to sample from. The default is a
            tuple so the shared default object cannot be mutated between calls.

    Side effects:
        Reseeds the module-level ``random`` generator and prints a
        confirmation message naming ``output_path``.
    """
    random.seed(seed)
    with open(input_path, 'r', encoding='utf-8') as fin, open(output_path, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # record; skip them instead of letting json.loads raise.
                continue
            data = json.loads(line)
            error_labels = data.get("error_labels", [])

            if error_labels:
                # One independent draw per existing error label.
                noise_labels = [random.choice(label_choices) for _ in error_labels]
                data["random_noise_labels"] = noise_labels

            fout.write(json.dumps(data) + "\n")

    print(f"Injected noise into {output_path}")

# Both paths are required: the function takes (input_path, output_path, seed).
# The output is the file the following "new count" cell reads.
# NOTE(review): the input is assumed to be ../varierr.json, the same source the
# distributional pipeline reads — confirm.
inject_multiple_noises("../varierr.json", "../varierr_random_noise-44.json", seed=44)


In [None]:
# new count 

import json
from collections import Counter

# Reads records that may carry `random_noise_labels` and writes them back out
# with a per-label tally stored under `random_noise_labels_count`.
input_path = "../varierr_random_noise-44.json"
output_path = "../varierr_random_noise_count-44.json"

# Labels that receive an entry in the tally, in this order.
labels = ["entailment", "neutral", "contradiction"]

with open(input_path, "r", encoding="utf-8") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            data = json.loads(line)

            round_2_votes = set(data.get("label_set_round_2", []))
            noise_tally = Counter(data.get("random_noise_labels", []))

            tally = {}
            for name in labels:
                # One vote if the label is in the round-2 set, plus one per
                # noise label of the same name (Counter yields 0 when absent).
                votes = int(name in round_2_votes) + noise_tally[name]
                # Labels with no votes are stored as None rather than 0.0.
                tally[name] = float(votes) if votes > 0 else None

            data["random_noise_labels_count"] = tally
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")



In [None]:
## generate distribution

import json

# Input: JSON-lines records that may carry `random_noise_labels_count`.
# Output: the same records, with `label` replaced by a distribution wherever
# those counts are present.
input_path = "../varierr_random_noise_count-44.json"
output_path = "../varierr_random_noise_dist-44.json"

def convert_label_count_to_dist(label_count_dict):
    """Convert per-label counts into a normalized three-element list.

    Args:
        label_count_dict: Mapping that may contain "entailment", "neutral" and
            "contradiction" counts. Missing or falsy values (e.g. None) count
            as 0.

    Returns:
        ``[entailment, neutral, contradiction]`` shares of the total count, or
        ``[0.0, 0.0, 0.0]`` when the total is zero.
    """
    def _count(key):
        raw = label_count_dict.get(key)
        return raw if raw else 0

    contradiction = _count("contradiction")
    entailment = _count("entailment")
    neutral = _count("neutral")

    denominator = contradiction + entailment + neutral
    if denominator != 0:
        # Output order differs from the lookup order above: E, N, C.
        return [share / denominator for share in (entailment, neutral, contradiction)]
    return [0.0, 0.0, 0.0]

with open(input_path, "r", encoding="utf-8") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            record = json.loads(line)

            # Change the key here to convert a different count field, e.g.
            # "random_noise_labels_count" / "distribution_noise_label_count" / "peer_with_noise".
            counts = record.get("random_noise_labels_count")
            if counts:
                # Overwrites any existing `label` with the normalized distribution.
                record["label"] = convert_label_count_to_dist(counts)

            outfile.write(json.dumps(record) + "\n")

print("Done")


In [None]:
## clean

import json
from pathlib import Path
from tqdm import tqdm

def normalize_label_dist(chaos_dict):
    """Normalize label scores into an [entailment, neutral, contradiction] list.

    Missing or falsy entries (including None) are treated as 0.0. When the
    scores sum to zero, a list of three zeros is returned instead of dividing.
    """
    scores = []
    for key in ("entailment", "neutral", "contradiction"):
        score = chaos_dict.get(key)
        scores.append(score if score else 0.0)  # None --> 0.0

    denominator = sum(scores)
    if denominator == 0:
        return [0.0, 0.0, 0.0]
    return [score / denominator for score in scores]


def process_file(input_path, output_path):
    """Rewrite a JSON-lines file keeping only uid, premise, hypothesis and label.

    Args:
        input_path: JSON-lines file to read.
        output_path: JSON-lines file to write (overwritten if it exists).

    Field sources, per record:
        uid: the "id" value when that key exists, otherwise "uid" (or None).
        premise: "context" when truthy, otherwise ``record["example"]["premise"]``.
        hypothesis: "statement" when truthy, otherwise
            ``record["example"]["hypothesis"]``.
        label: the "label" value, or None when absent.

    Raises:
        KeyError: if a fallback under "example" is needed but missing.
    """
    with open(input_path, "r", encoding="utf-8") as fin:
        with open(output_path, "w", encoding="utf-8") as fout:
            progress = tqdm(fin, desc=f"Processing {input_path}")
            for line in progress:
                record = json.loads(line)

                uid = record["id"] if "id" in record else record.get("uid")

                premise = record.get("context")
                if not premise:
                    premise = record["example"]["premise"]

                hypothesis = record.get("statement")
                if not hypothesis:
                    hypothesis = record["example"]["hypothesis"]

                cleaned = {
                    "uid": uid,
                    "premise": premise,
                    "hypothesis": hypothesis,
                    "label": record.get("label"),
                }
                fout.write(json.dumps(cleaned, ensure_ascii=False) + "\n")

if __name__ == "__main__":
    # Use the seed-44 distribution file written by the "generate distribution"
    # cell above; every upstream random-noise cell in this notebook produces
    # "-44" files, so a "-42" input would not exist on a fresh run.
    input_path = "../varierr_random_noise_dist-44.json"
    output_path = "../varierr_random_noise_cleaned-44.json"
    process_file(input_path, output_path)


## distributional

In [None]:
## distributional

import json
import random

def inject_noise_with_distribution(input_file, output_file, seed,
                                   label_pool=('contradiction', 'neutral', 'entailment'),
                                   label_weights=(53, 23, 53)):
    """Copy a JSON-lines file, attaching noise labels drawn from a weighted pool.

    For every record with a non-empty ``error_labels`` value, as many labels as
    there are error labels are drawn with ``random.choices`` and stored under
    ``distribution_noise_label``. Other records are written through unchanged.

    Args:
        input_file: JSON-lines file to read (one JSON object per line).
        output_file: JSON-lines file to write (overwritten if it exists).
        seed: Value passed to ``random.seed`` so repeated runs give the same noise.
        label_pool: Labels to sample from.
        label_weights: Relative sampling weights, aligned index-by-index with
            ``label_pool``.
            NOTE(review): the defaults give 'contradiction' and 'entailment'
            the same weight (53) — confirm this is intended.

    Side effects:
        Reseeds the module-level ``random`` generator.
    """
    random.seed(seed)

    with open(input_file, 'r', encoding='utf-8') as fin, \
         open(output_file, 'w', encoding='utf-8') as fout:

        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            data = json.loads(line)

            # `or []` also covers an explicit null, which would otherwise make
            # len() raise; this matches how inject_multiple_noises treats it.
            error_count = len(data.get("error_labels") or [])
            if error_count > 0:
                noise_labels = random.choices(label_pool, weights=label_weights, k=error_count)
                data["distribution_noise_label"] = noise_labels
            fout.write(json.dumps(data, ensure_ascii=False) + "\n")

# Seed-44 run of the weighted noise injection.
inject_noise_with_distribution(
    input_file="../varierr.json",
    output_file="../varierr_distributional_error-44.json",
    seed=44,
)


In [None]:
# new count 

import json
from collections import Counter

# Reads records that may carry `distribution_noise_label` and writes them back
# out with a per-label tally stored under `distributional_noise_labels_count`.
input_path = "../varierr_distributional_error-44.json"
output_path = "../varierr_distributional_error_count-44.json"

# Labels that receive an entry in the tally, in this order.
labels = ["entailment", "neutral", "contradiction"]

with open(input_path, "r", encoding="utf-8") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            data = json.loads(line)

            round_2_votes = set(data.get("label_set_round_2", []))
            noise_tally = Counter(data.get("distribution_noise_label", []))

            distributional_tally = {}
            for name in labels:
                # One vote if the label is in the round-2 set, plus one per
                # noise label of the same name (Counter yields 0 when absent).
                votes = int(name in round_2_votes) + noise_tally[name]
                # Labels with no votes are stored as None rather than 0.0.
                distributional_tally[name] = float(votes) if votes > 0 else None

            data["distributional_noise_labels_count"] = distributional_tally
            outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")



In [None]:
## generate distribution

import json

# Input: JSON-lines records that may carry `distributional_noise_labels_count`.
# Output: the same records, with `label` replaced by a distribution wherever
# those counts are present.
input_path = "../varierr_distributional_error_count-44.json"
output_path = "../varierr_distributional_error_dist-44.json"

def convert_label_count_to_dist(label_count_dict):
    """Normalize label counts to ``[entailment, neutral, contradiction]`` shares.

    Args:
        label_count_dict: Mapping that may hold "contradiction", "entailment"
            and "neutral" counts; absent or falsy values (e.g. None) count as 0.

    Returns:
        A three-element list of each count divided by the total, or three
        zeros when the total is zero.
    """
    contradiction = label_count_dict.get("contradiction")
    if not contradiction:
        contradiction = 0
    entailment = label_count_dict.get("entailment")
    if not entailment:
        entailment = 0
    neutral = label_count_dict.get("neutral")
    if not neutral:
        neutral = 0

    denominator = contradiction + entailment + neutral
    if denominator == 0:
        # Nothing to normalize; avoid dividing by zero.
        return [0.0, 0.0, 0.0]

    # Output order is E, N, C.
    ordered = (entailment, neutral, contradiction)
    return [value / denominator for value in ordered]

with open(input_path, "r", encoding="utf-8") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for line in infile:
            record = json.loads(line)

            # Change the key here to convert a different count field, e.g.
            # "random_noise_labels_count" / "distributional_noise_labels_count".
            counts = record.get("distributional_noise_labels_count")
            if counts:
                # Overwrites any existing `label` with the normalized distribution.
                record["label"] = convert_label_count_to_dist(counts)

            outfile.write(json.dumps(record) + "\n")

print("Done")


处理完成，已写入新文件：/Users/phoebeeeee/ongoing/Beyond-noise/dataset/varierr_self_distributional_noise/varierr_distributional_error_dist-44.json


In [None]:
## clean

import json
from pathlib import Path
from tqdm import tqdm

def normalize_label_dist(chaos_dict):
    """Return the entailment/neutral/contradiction scores scaled to sum to 1.

    Entries that are missing or falsy (None included) contribute 0.0. If every
    score is zero the result is ``[0.0, 0.0, 0.0]``.
    """
    entailment = chaos_dict.get("entailment") or 0.0
    neutral = chaos_dict.get("neutral") or 0.0
    contradiction = chaos_dict.get("contradiction") or 0.0

    scores = [entailment, neutral, contradiction]
    denominator = sum(scores)
    if denominator != 0:
        return [score / denominator for score in scores]
    return [0.0, 0.0, 0.0]


def process_file(input_path, output_path):
    """Write a slimmed copy of a JSON-lines file with four fields per record.

    Each output record holds:
        uid: the "id" value when that key exists, else "uid" (None if neither).
        premise: "context" when truthy, else ``record["example"]["premise"]``.
        hypothesis: "statement" when truthy, else
            ``record["example"]["hypothesis"]``.
        label: the "label" value, or None when absent.

    Args:
        input_path: JSON-lines file to read.
        output_path: JSON-lines file to write (overwritten if it exists).

    Raises:
        KeyError: if a fallback under "example" is needed but missing.
    """
    with open(input_path, "r", encoding="utf-8") as fin:
        with open(output_path, "w", encoding="utf-8") as fout:
            for line in tqdm(fin, desc=f"Processing {input_path}"):
                record = json.loads(line)

                if "id" in record:
                    uid = record["id"]
                else:
                    uid = record.get("uid")

                # Flat keys win; the nested "example" entry is only consulted
                # when the flat value is missing or empty.
                premise = record.get("context") or record["example"]["premise"]
                hypothesis = record.get("statement") or record["example"]["hypothesis"]

                cleaned = dict(
                    uid=uid,
                    premise=premise,
                    hypothesis=hypothesis,
                    label=record.get("label"),
                )
                fout.write(json.dumps(cleaned, ensure_ascii=False) + "\n")

if __name__ == "__main__":
    # Reduce the seed-44 distributional file to uid/premise/hypothesis/label.
    input_path = "../varierr_distributional_error_dist-44.json"
    output_path = "../varierr_distributional_error_cleaned-44.json"
    process_file(input_path=input_path, output_path=output_path)
