# Merge Sample to Train

This notebook merges the annotated sample into the training dataset of the previous iteration to produce the fine-tuning dataset for the next DOREMI iteration.

In [None]:
import json
import pandas as pd

In [None]:
# Load the annotated sample (a list of documents) produced by the annotation step
sample_file = "ANNOTATED_sample_agreement_logsum_iter1"
sample_path = f"annotations/{sample_file}.json"
# Use a context manager so the file handle is closed deterministically
with open(sample_path, "r", encoding="utf-8") as f:
    sample = json.load(f)

# Load training file
dataset = "docred" # Dataset used for the iterations (either docred or redocred)
train_file = "train_annotated.json"
train_path = f"../data/{dataset}/{train_file}"
with open(train_path, "r", encoding="utf-8") as f:
    train = json.load(f)

# Name of the training file for the next iteration
new_train_filename = "train_iteration1_agreement_logsum.json"

In [None]:
# Index both datasets by document title.
# If a title occurs more than once, the last document with that title wins.
title2train = {doc["title"]: doc for doc in train}
title2sample = {doc["title"]: doc for doc in sample}

In [None]:
# Report the size of the training file: document count and total number of labels
print(f"Number of documents in {train_file}: {len(train)}")
sample_triples = sum(len(doc["labels"]) for doc in train)

print(f"Number of positive examples in {train_file}: {sample_triples}")


In [None]:
# Report the size of the annotated sample: document count and total number of labels
print(f"Number of documents in {sample_file}: {len(sample)}")
sample_triples = sum(len(doc["labels"]) for doc in sample)

print(f"Number of positive examples in {sample_file}: {sample_triples}")

In [None]:
# Merging the train and sample file
# Counters updated by the merge loop: documents added, documents merged, and
# labels found on cross-entity pairs.
cross_ents = 0
new_doc = already_doc = 0
# For every training document, remember (as strings) the entity IDs that were
# present before the merge; vertices without an "ent_id" field are skipped.
title2old_ents = {
    doc_title: [
        str(vertex[0]["ent_id"])
        for vertex in doc["vertexSet"]
        if "ent_id" in vertex[0]
    ]
    for doc_title, doc in title2train.items()
}
# Merge each annotated sample document into the training set.
#
# * A document whose title is not yet in `title2train` is added as-is.
# * Otherwise its entities are merged into the training document's vertexSet
#   (reusing the position recorded in "old2new" when the entity ID is already
#   known), entity pairs that mix a sample entity with a non-sample entity are
#   recorded in "pairs_to_exclude" unless the sample annotated them, and the
#   sample labels are appended with their indices translated to training
#   vertexSet positions.
#
# Raises ValueError if a sample label targets an entity pair that already had a
# label in the training document before this merge.
for d in sample:
    title = d["title"]
    print(f"*** Considering document {title} ***")
    if title not in title2train:
        new_doc += 1
        print(f"Document NOT in training, adding it")
        # Add new document
        title2train[title] = d
        continue

    # Document already in the training dataset
    already_doc += 1
    train_doc = title2train[title]
    train_vertices = train_doc["vertexSet"]
    old2new = train_doc["old2new"]
    print(f"Document already in training, merge entities and labels")
    print(f"Entities already present: {old2new.keys()}")

    # Snapshot of the entity-ID pairs that are labelled in training *before* the
    # merge. Taking it up front means labels appended below (e.g. several
    # relations for the same sample pair) are not mistaken for pre-existing ones.
    existing_pairs = {
        (
            str(train_vertices[tl["h"]][0]["ent_id"]),
            str(train_vertices[tl["t"]][0]["ent_id"]),
        )
        for tl in train_doc["labels"]
    }

    # --- Merge entities -------------------------------------------------------
    ent2pos = {}      # sample vertexSet position -> training vertexSet position
    new_ents = set()  # IDs (as strings) of every entity appearing in the sample
    # NOTE(review): this reset discards any "pairs_to_exclude" already stored on
    # the training document; the original code had an unreachable
    # "if not present" guard after it - confirm which behaviour is intended.
    train_doc["pairs_to_exclude"] = []
    for ix, ent in enumerate(d["vertexSet"]):
        ent_id = str(ent[0]["ent_id"])
        ent_name = ent[0]["name"]
        print(f"Considering entity {ent_name} (ID: {ent_id} (type: {type(ent_id)}), vertexSet pos: {ix})")
        if ent_id in old2new:
            train_pos = old2new[ent_id]
            print(f"Entity {ent_name} already in training dataset at vertexSet position {train_pos}")
            ent2pos[ix] = train_pos
        else:
            new_pos = len(train_vertices)
            print(f"Entity {ent_name} not in training dataset, adding it at position {new_pos}")
            old2new[ent_id] = new_pos
            ent2pos[ix] = new_pos
            train_vertices.append(ent)
        new_ents.add(ent_id)

    # --- Creating list of pairs to exclude ------------------------------------
    old_ents = set(title2old_ents[title])
    # Sample labels expressed in training vertexSet positions, so they can be
    # compared with h_ix / t_ix below (sample indices live in a different space).
    sample_label_pairs = [
        (ent2pos[label["h"]], ent2pos[label["t"]]) for label in d["labels"]
    ]
    for h_ix, h in enumerate(train_vertices):
        for t_ix, t in enumerate(train_vertices):
            if h_ix == t_ix:
                continue
            h_id = str(h[0]["ent_id"])
            t_id = str(t[0]["ent_id"])
            # Only pairs where exactly one of the two entities is in the sample
            if (h_id in new_ents) == (t_id in new_ents):
                continue
            print(f"Candidate pairs to exclude: {h_id}, {t_id}")
            if h_id in new_ents and t_id not in new_ents:
                print(f"{h_id} in new_ents BUT {t_id} not in new_ents")
            if h_id not in new_ents and t_id in new_ents:
                print(f"{h_id} not in new_ents BUT {t_id} in new_ents")
            if h_id in old_ents and t_id in old_ents:
                print(f"Both entities ({h_id}, {t_id}) were previously in the training data, not excluding them")
                continue
            # Cross-entities, one was already there while the other wasn't.
            # Count the sample labels on this pair, in either direction.
            matches = sum(
                1
                for pair in sample_label_pairs
                if pair == (h_ix, t_ix) or pair == (t_ix, h_ix)
            )
            cross_ents += matches
            if not matches:
                print(f"Excluding entity pair ({h_ix}, {t_ix}) for document {title}")
                train_doc["pairs_to_exclude"].append((h_ix, t_ix))

    # --- Merging labels -------------------------------------------------------
    for label in d["labels"]:
        h, t = label["h"], label["t"]
        # Label indices refer to the vertexSet of the sample document itself
        h_ent = d["vertexSet"][h][0]
        t_ent = d["vertexSet"][t][0]
        h_id, t_id = h_ent["ent_id"], t_ent["ent_id"]
        h_name, t_name = h_ent["name"], t_ent["name"]
        print(f"Consider label for entity pair ({h}, {t}); ({h_name} (ID: {h_id}), {t_name} (ID: {t_id}))")
        if (str(h_id), str(t_id)) in existing_pairs:
            raise ValueError(f"Label for entities ({h_id}, {t_id}) already in training for document {title}")
        train_doc["labels"].append({"h": ent2pos[h], "t": ent2pos[t], "r": label["r"], "evidence": []})

In [None]:
# Check the number of cross_entities is zero.
# cross_ents is incremented by the merge cell each time a sample label is found
# on a candidate cross-entity pair; the bare expression displays its value.
cross_ents

In [None]:
# Summary of the merge: total number of documents now held in title2train, and
# how many sample documents were added as new versus merged into existing ones.
print(f"Number of documents in the new training file: {len(title2train)}")
print(f"New documents added: {new_doc}, Documents merged: {already_doc}")

In [None]:
# Flatten the title-indexed documents back into a list, in insertion order
new_train = list(title2train.values())

# Sanity check: this count must match len(title2train) printed in the cell above
print(f"Number of documents in the new training file: {len(new_train)}")

In [None]:
# Total number of labels across all documents of the merged training set
num_triples = sum(len(doc["labels"]) for doc in new_train)

print(f"Number of positive examples in the new training file: {num_triples}")

In [None]:
# Saving the new training file next to the original one.
# The context manager guarantees the data is flushed and the handle closed.
with open(f"../data/{dataset}/{new_train_filename}", "w", encoding="utf-8") as f:
    json.dump(new_train, f)