In [12]:
import json
from collections import Counter, defaultdict

In [3]:
def read_data(filepath: str) -> dict:
    """Load a JSONL file of anaphora examples and group them by document.

    Every non-blank line must be a JSON object with the string fields
    ``"example_id"``, ``"context"`` and ``"anaphor"``. The document id is the
    part of ``"example_id"`` before the first ``"-"``.

    Args:
        filepath: Path to the JSONL file (read as UTF-8).

    Returns:
        A dict mapping each document id to a dict with two keys:

        * ``"context"``: the longest context string seen for that document
          (the first one seen wins on ties).
        * ``"gold_anaphors"``: a list of ``(anaphor, index)`` tuples in file
          order, where ``index`` is the position of the last occurrence of
          the anaphor inside *that example's own* context, or ``-1`` when the
          anaphor does not occur in it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required fields.
    """
    data = {}
    # JSON text is UTF-8 by specification; do not depend on the platform locale.
    with open(filepath, "r", encoding="utf-8") as f:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in f:
            # Blank / whitespace-only lines (e.g. a trailing empty line) are not
            # records; json.loads would raise on them.
            if not line.strip():
                continue

            example = json.loads(line)
            doc_id = example["example_id"].split("-")[0]
            context = example["context"]
            anaphor = example["anaphor"]

            # Search from the right so that the last occurrence of the anaphor
            # in this example's context is the one that gets recorded.
            # NOTE(review): the index refers to this example's context, not
            # necessarily to the (longest) context stored for the document;
            # the two agree only if shorter contexts are prefixes of the
            # longest one -- confirm against the data format.
            anaphor_index = context.rfind(anaphor)

            entry = data.get(doc_id)
            if entry is None:
                data[doc_id] = {
                    "context": context,
                    "gold_anaphors": [(anaphor, anaphor_index)],
                }
            else:
                # Keep only the longest context seen so far for the document.
                if len(context) > len(entry["context"]):
                    entry["context"] = context
                entry["gold_anaphors"].append((anaphor, anaphor_index))

    return data


In [8]:
# Input selection: exactly one `input_data_path` assignment should be active.
# The commented-out line below is an alternative input file under the same
# `lm_trimmed_data/train` directory (`k32/trial0.jsonl`); the active line
# points at `kfull.jsonl`.
# NOTE(review): both are absolute paths on one machine -- the notebook will not
# run elsewhere without editing them.
# input_data_path = "/data/fbai31/in_context_anaphora/data/ChemuRef/lm_trimmed_data/train/k32/trial0.jsonl"
input_data_path = "/data/fbai31/in_context_anaphora/data/ChemuRef/lm_trimmed_data/train/kfull.jsonl"
# read_data groups the examples by document id, so `data` has one entry per
# document and the displayed length is the number of distinct documents.
data = read_data(input_data_path)
len(data)

867

In [19]:
# Distribution of anaphor counts per document:
# key   = number of gold anaphors recorded for a document,
# value = number of documents that have exactly that many.

# Uncomment to list the per-document counts instead:
# for doc_id, doc_data in data.items():
#     print(doc_id, len(doc_data["gold_anaphors"]))

ana_num_dist = Counter(len(entry["gold_anaphors"]) for entry in data.values())

In [20]:
# (anaphor count, number of documents) pairs, most frequent count first;
# ties keep the order in which the counts were first encountered.
ana_num_dist.most_common()

[(4, 147),
 (5, 146),
 (6, 121),
 (7, 83),
 (3, 78),
 (1, 77),
 (2, 60),
 (8, 56),
 (9, 26),
 (10, 22),
 (11, 19),
 (14, 5),
 (12, 4),
 (19, 3),
 (15, 3),
 (18, 2),
 (13, 2),
 (35, 2),
 (17, 2),
 (47, 1),
 (23, 1),
 (16, 1),
 (34, 1),
 (24, 1),
 (33, 1),
 (53, 1),
 (27, 1),
 (20, 1)]

In [26]:
# How many documents contain between 1 and 8 gold anaphors (inclusive),
# how many documents there are in total, and the resulting fraction.
top_eight_ana_num = sum(
    doc_count for ana_num, doc_count in ana_num_dist.items() if 1 <= ana_num <= 8
)
total_ana_num = sum(ana_num_dist.values())
print(top_eight_ana_num, total_ana_num, top_eight_ana_num / total_ana_num)

768 867 0.8858131487889274
