# Annotate Sample

This notebook provides a user-friendly interface to visualize the documents and triples to be annotated. Annotations must be stored in a CSV file, which is then converted into a JSON file to be merged with the training file of the previous DOREMI iteration. 

In [None]:
# Import needed libraries
import json
import csv
import ast
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
##### READ SAMPLE #####
filename = "sample_agreement_logsum_iter1"
dataset = "docred" # Dataset used for the iterations (either docred or redocred)
sample_path = f"../data/{dataset}/{filename}.json"
# Use a context manager so the file handle is closed as soon as the sample is parsed
with open(sample_path, "r", encoding="utf-8") as f:
    sample = json.load(f)

In [None]:
# Index the sampled documents by their title for direct lookup
title2sample = {doc["title"]: doc for doc in sample}

In [None]:
# Upload models' predictions
# Edit with the directory containing the checkpoints for the considered iteration
predicitions_directory = "pretrain_redocred"
models = ["CNN3", "ContextAware", "LSTM", "BiLSTM", "BERT"]

# title2preds[model][title][(h_idx, t_idx)] -> list of {"r": ..., "score": ...}
title2preds = {}
for m in models:
    title2preds[m] = {}
    results_path = f"../data/checkpoints/{m}/{predicitions_directory}/train_distant_results.json"
    with open(results_path, "r", encoding="utf-8") as f:
        predictions = json.load(f)
    for p in predictions:
        # Keep only the predictions about documents that belong to the sample
        doc = title2sample.get(p["title"])
        if doc is None:
            continue
        # Keep only the pairs whose entities both have an entry in the document's
        # "old2new" index mapping (its keys are stringified indices)
        old2new = doc["old2new"]
        h_key, t_key = str(p["h_idx"]), str(p["t_idx"])
        if h_key not in old2new or t_key not in old2new:
            continue
        pair = (old2new[h_key], old2new[t_key])
        # Group per model, then per title, then per entity pair. The lookup must be
        # done in title2preds[m] (not in title2preds, whose keys are model names),
        # otherwise previously collected pairs of the same document are discarded.
        title2preds[m].setdefault(p["title"], {}).setdefault(pair, []).append(
            {"r": p["r"], "score": p["score"]}
        )

                

In [None]:
# Initialization of the csv file storing the annotations
# RUN ONLY THE FIRST TIME -- OTHERWISE IT ERASES ALL ANNOTATIONS
csv_file = f"{filename}_annotations.csv"
# Open the file once: the header and every row go through the same writer,
# instead of reopening the file in append mode for each pair.
with open(f"annotations/{csv_file}", mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=";")
    writer.writerow(["Title", "h_idx", "t_idx", "rels"])

    # Storing the triples to be annotated: one row per ordered entity pair that is
    # listed in the document's "include_pairs", with an empty list of relations
    for d in sample:
        for h_idx, h in enumerate(d["vertexSet"]):
            for t_idx, t in enumerate(d["vertexSet"]):
                if h != t and [h_idx, t_idx] in d["include_pairs"]:
                    writer.writerow([d["title"], h_idx, t_idx, []])

In [None]:
# Widgets used to browse the sample: an output area and two navigation buttons
output = widgets.Output()
back_button = widgets.Button(description="Back")
next_button = widgets.Button(description="Next")

# Position, within the sample, of the document currently shown
current_index = 0

# Function to update the display
def update_output():
    """Render the document at ``current_index`` inside the ``output`` widget.

    Prints the document title and its sentences, then, for every ordered entity
    pair listed in the document's "include_pairs", the pair itself followed by
    the predictions of each model (an empty list when a model has none).
    """
    with output:
        clear_output(wait=True)  # Clear the previous output
        d = sample[current_index]
        title = d["title"]
        print(f"DOCUMENT: {title}")
        for s in d["sents"]:
            print(" ".join(s))
        print()
        for h_idx, h in enumerate(d["vertexSet"]):
            for t_idx, t in enumerate(d["vertexSet"]):
                if h == t or [h_idx, t_idx] not in d["include_pairs"]:
                    continue
                # Name of the first mention of each entity, as a plain string
                # (wrapping it in braces would build a set and print "{'name'}")
                h_name = h[0]["name"]
                t_name = t[0]["name"]
                print(f"Pair to consider: ({h_idx}) {h_name}; ({t_idx}) {t_name}")
                for m in models:
                    # Fall back to an empty list when the model has no prediction
                    # for this document or for this pair
                    preds = title2preds[m].get(title, {}).get((h_idx, t_idx), [])
                    print(f"\t{m}: {preds}")

# Function to handle "Next" button click
def on_next_button_clicked(b):
    """Move to the next document, if there is one, and refresh the button states.

    Args:
        b: The button instance passed to the click handler (unused).
    """
    global current_index
    last_index = len(sample) - 1
    if current_index < last_index:  # Check if we can move forward
        current_index += 1
        update_output()
    # Derive both states from the position, so that "Back" is not enabled while
    # the first document is still shown (e.g. with a single-document sample)
    next_button.disabled = current_index >= last_index
    back_button.disabled = current_index <= 0

# Function to handle "Back" button click
def on_back_button_clicked(b):
    """Move to the previous document, if there is one, and refresh the button states.

    Args:
        b: The button instance passed to the click handler (unused).
    """
    global current_index
    if current_index > 0:  # Check if we can move backward
        current_index -= 1
        update_output()
    # Derive both states from the position, so that "Next" is not enabled when
    # there is no following document (e.g. with a single-document sample)
    back_button.disabled = current_index <= 0
    next_button.disabled = current_index >= len(sample) - 1

# Register each click handler on its button
for _button, _handler in (
    (next_button, on_next_button_clicked),
    (back_button, on_back_button_clicked),
):
    _button.on_click(_handler)

In [None]:
# Initialize buttons and display the current item
# The states are derived from current_index so that re-running this cell after
# some navigation, or using a single-document sample, leaves the buttons consistent
back_button.disabled = current_index <= 0
next_button.disabled = current_index >= len(sample) - 1
update_output()

# Display the buttons and output
display(widgets.HBox([back_button, next_button]), output)

# Creation of the annotated sample JSON file

This section generates a JSON file containing the annotated sample. This file will be merged with the training file to produce the finetuning dataset for the next iteration.

In [None]:
# Load the annotation CSV into a DataFrame and print a summary of its columns
csv_file = f"{filename}_annotations.csv"
annotations_path = f"annotations/{csv_file}"
annotations = pd.read_csv(annotations_path, sep=";")
annotations.info()

In [None]:
# Load relations information
# Use a context manager so the file handle is closed once the JSON is parsed
with open("../data/docred/rel_info.json", "r", encoding="utf-8") as f:
    rel_info = json.load(f)
# Accepted labels: "NA" followed by every relation identifier (the keys of rel_info)
relations = ["NA", *rel_info]

In [None]:
# Create a dictionary with the positive examples
# title2labels[title] -> list of {"h": ..., "t": ..., "r": ..., "evidence": []}
title2labels = {}
num_rels = 0
for _, col in annotations.iterrows():
    title = col["Title"]
    raw_rels = col["rels"]
    if pd.isna(raw_rels):
        # A cell left completely empty is read back as NaN: no relation annotated
        list_rels = []
    else:
        # The cell looks like "[P17,P131]"; tolerate spaces and quotes around each
        # identifier (e.g. "['P17', 'P131']") and ignore empty entries such as "[]"
        tokens = str(raw_rels).strip().strip("[]").split(",")
        list_rels = [tok.strip().strip("'\"") for tok in tokens]
        list_rels = [rel for rel in list_rels if rel]
    print(f"Number of relations annotated: {len(list_rels)}; {list_rels}")
    for rel in list_rels:
        if rel not in relations:
            raise ValueError(f"Relation {rel} of document {title} not in predefined list of relations")
        num_rels += 1
        # Cast the indices to plain ints so the labels are JSON-serializable
        # regardless of the numeric type pandas used for the columns
        title2labels.setdefault(title, []).append(
            {"h": int(col["h_idx"]), "t": int(col["t_idx"]), "r": rel, "evidence": []}
        )

In [None]:
# Checking the number of positive examples in the sample
# (bare expression: the notebook shows its value as the output of the cell)
num_rels

In [None]:
# Creating the JSON file
# Attach the annotated labels to each document; documents without any annotated
# relation get an empty list
for d in sample:
    title = d["title"]
    labels = title2labels.get(title)
    if labels is None:
        print(f"Only negative examples for {title}")
        labels = []
    d["labels"] = labels

# Use a context manager so the output is flushed and the file closed deterministically
with open(f"annotations/ANNOTATED_{filename}.json", "w", encoding="utf-8") as f:
    json.dump(sample, f)