In [None]:
import json
import random

# Input file for this stage; it is read one line at a time and each line is
# parsed as a JSON object.
input_path = "../varierr_with_peer_validated.json"
# Output file for this stage; every input record is written back as one JSON
# object per line, with "chaosnli_dist_noise" added where it could be sampled.
output_path = "../varierr_with_chaosnli_dist_noise.json"

# Expands the single-letter label keys to full label names; used to translate
# the key sampled in sample_from_inverse_distribution.
index_to_label = {"e": "entailment", "n": "neutral", "c": "contradiction"}

def sample_from_inverse_distribution(freq_dict, label_map=None):
    """Sample one label with probability inversely proportional to its count.

    Every key of ``freq_dict`` whose count is strictly positive gets the
    relative weight ``1 / count``, so rarer labels are drawn more often.
    Keys whose count is zero or negative are never drawn.

    Args:
        freq_dict: Mapping from label key to a numeric count.
        label_map: Optional mapping from label key to the value that is
            returned. Defaults to the module-level ``index_to_label``.

    Returns:
        The ``label_map`` entry of the sampled key.

    Raises:
        ValueError: If ``freq_dict`` contains no key with a positive count.
        KeyError: If the sampled key is missing from ``label_map``.
    """
    if label_map is None:
        label_map = index_to_label

    inverse_weights = {
        key: 1 / count for key, count in freq_dict.items() if count > 0
    }
    if not inverse_weights:
        # Fail with an explicit message instead of a tuple-unpacking error.
        raise ValueError(
            "freq_dict must contain at least one label with a positive count"
        )

    # random.choices accepts relative weights, so no manual normalisation to
    # probabilities is needed.
    sampled_key = random.choices(
        list(inverse_weights),
        weights=list(inverse_weights.values()),
        k=1,
    )[0]
    return label_map[sampled_key]

# Fixed seed so that re-running this cell on a fresh kernel reproduces the
# same sampled noise labels.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        record = json.loads(line)
        chaosnli_counts = record.get("chaosnli_labels", {})
        error_labels = record.get("error_labels", [])

        # Noise is only sampled when the record has both ChaosNLI counts and
        # error labels; one noise label is drawn per error label, so both
        # lists end up with the same length.
        if chaosnli_counts and error_labels:
            record["chaosnli_dist_noise"] = [
                sample_from_inverse_distribution(chaosnli_counts)
                for _ in error_labels
            ]

        # Every record is written back, whether or not noise was added.
        outfile.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Done")


In [None]:
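# Count labels per record: occurrences in the sampled "chaosnli_dist_noise" list, plus one for each label present in "label_set_round_3".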

import json
from collections import Counter

input_path = "../varierr_with_chaosnli_dist_noise.json"
output_path = "../varierr_with_chaosnli_dist_noise_count.json"


# Labels for which a count is emitted, in this order.
labels = ["entailment", "neutral", "contradiction"]

# The stage that produces input_path writes UTF-8 with ensure_ascii=False, and
# this stage writes non-ASCII text as well, so both files are opened as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        data = json.loads(line)

        # NOTE(review): this previously lived in a variable named
        # `label_set_round_2` although the key read is "label_set_round_3";
        # the key is kept unchanged - confirm that round 3 is intended.
        # `or []` tolerates records where the field is present but null.
        round_3_labels = set(data.get("label_set_round_3") or [])
        noise_counter = Counter(data.get("chaosnli_dist_noise") or [])

        noise_label_counts = {}
        for label in labels:
            # One vote if the label is in the round-3 label set, plus one per
            # sampled noise label of the same name.
            count = int(label in round_3_labels) + noise_counter[label]
            # Labels with no votes are stored as null rather than 0.0.
            noise_label_counts[label] = float(count) if count > 0 else None
        data["chaosnli_dist_noise_count"] = noise_label_counts

        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print("Done")


In [None]:
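## Generate distribution: turn "chaosnli_dist_noise_count" into a normalized [entailment, neutral, contradiction] list stored under "label".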

import json

# Input file for this stage; it is read one line at a time and each line is
# parsed as a JSON object.
input_path = "../varierr_with_chaosnli_dist_noise_count.json"
# Output file for this stage; every record is written back as one JSON object
# per line, with "label" set where per-label counts were available.
output_path = "../varierr_with_chaosnli_dist_noise_dist.json"

def convert_label_count_to_dist(label_count_dict):
    """Turn per-label counts into a 3-element distribution.

    Args:
        label_count_dict: Mapping that may hold counts under the keys
            "contradiction", "entailment" and "neutral". A missing key or a
            falsy value (such as None) counts as 0.

    Returns:
        A list ordered as [entailment, neutral, contradiction], each count
        divided by the sum of the three. When the sum is 0 the result is
        [0.0, 0.0, 0.0].
    """
    def _count(name):
        value = label_count_dict.get(name)
        return value if value else 0

    contradiction = _count("contradiction")
    entailment = _count("entailment")
    neutral = _count("neutral")

    denominator = contradiction + entailment + neutral
    if denominator != 0:
        # Output order differs from the lookup order above on purpose.
        return [
            share / denominator
            for share in (entailment, neutral, contradiction)
        ]
    return [0.0, 0.0, 0.0]

# Stream the counts file record by record and attach the distribution.
with open(input_path, "r", encoding="utf-8") as infile:
    with open(output_path, "w", encoding="utf-8") as outfile:
        for raw_line in infile:
            record = json.loads(raw_line)

            # "label" is only set when the record carries a non-empty
            # "chaosnli_dist_noise_count"; other records pass through as is.
            counts = record.get("chaosnli_dist_noise_count")
            if counts:
                record["label"] = convert_label_count_to_dist(counts)

            # One JSON object per output line.
            json.dump(record, outfile)
            outfile.write("\n")

print("Done")


In [None]:
## Clean: keep only the uid, premise, hypothesis and label fields of each record.

import json
from pathlib import Path
from tqdm import tqdm

def normalize_label_dist(chaos_dict):
    """Normalize label values to a list that sums to 1.

    Args:
        chaos_dict: Mapping that may hold values under "entailment",
            "neutral" and "contradiction". A missing key or a falsy value
            (such as None) counts as 0.0.

    Returns:
        A list ordered as [entailment, neutral, contradiction], each value
        divided by their sum. When the sum is 0 the result is
        [0.0, 0.0, 0.0].
    """
    ordered_values = []
    for name in ("entailment", "neutral", "contradiction"):
        entry = chaos_dict.get(name)
        ordered_values.append(entry if entry else 0.0)

    mass = sum(ordered_values)
    if mass == 0:
        return [0.0, 0.0, 0.0]
    return [entry / mass for entry in ordered_values]


def process_file(input_path, output_path):
    """Reduce each record of a JSON-lines file to four fields.

    Reads ``input_path`` line by line, parses each line as JSON and writes
    one JSON object per line to ``output_path`` with the keys "uid",
    "premise", "hypothesis" and "label".

    Both files are opened as UTF-8 explicitly: the output is serialized with
    ``ensure_ascii=False``, so relying on the platform's default encoding
    could fail or corrupt non-ASCII text.

    Args:
        input_path: Path of the JSON-lines file to read.
        output_path: Path of the JSON-lines file to write (overwritten).

    Raises:
        KeyError: If a record has no truthy "context" (or "statement") and
            also lacks the nested ``example`` fallback field.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        for line in tqdm(fin, desc=f"Processing {input_path}"):
            raw = json.loads(line)
            out = {
                # Prefer "id"; fall back to "uid" (None if both are absent).
                "uid": raw.get("id", raw.get("uid")),
                # Flat keys win; otherwise use the nested "example" object.
                "premise": raw.get("context") or raw["example"]["premise"],
                "hypothesis": raw.get("statement") or raw["example"]["hypothesis"],
                "label": raw.get("label")
            }
            fout.write(json.dumps(out, ensure_ascii=False) + "\n")

# Run the cleaning step when this code is executed as the main module: read
# the file holding the generated distributions and write the reduced records.
if __name__ == "__main__":
    input_path = "../varierr_with_chaosnli_dist_noise_dist.json"
    output_path = "../varierr_with_chaosnli_dist_noise_cleaned.json"
    process_file(input_path, output_path)
