# Annotate HPLT Alignment

Small notebook to compare the original UCDP documents to the ones found in HPLT.

In [1]:
%gui asyncio

from __future__ import annotations

import html
import json
import pathlib
import random
import subprocess
import tempfile
import urllib
import urllib.request
from typing import Any

from IPython.display import DisplayHandle, display, update_display
import ipywidgets as widgets

In [2]:
# URL of the JSON-lines dataset of samples to annotate; it is downloaded by the data-loading cell.
DATA_PATH = "https://recurrent.network/AEC/annotate-me.jsonl"
# Local JSON-lines file the annotations are appended to and re-read from.
# The path is relative, so it is resolved against the directory jupyter runs in.
ANNOTATIONS_PATH = pathlib.Path("./align_annotations.jsonl")

##### Data Loading

In [3]:
# Download the JSON-lines dataset and index every sample by its "id" field.
print("(it'll take a minute or two) Retrieving the dataset… ", flush=True, end="")
dataset: dict[int, dict[str, Any]] = {}
with urllib.request.urlopen(DATA_PATH) as data_file:
    for line in data_file:
        # Tolerate blank lines (e.g. an empty trailing line), which json.loads rejects.
        if not line.strip():
            continue
        sample = json.loads(line)
        dataset[sample["id"]] = sample
print("done")

(it'll take a minute or two) Retrieving the dataset… done


## Annotations

In [4]:
class Annotator:
    """Interactive widget to annotate the alignment between UCDP and HPLT documents.

    The annotator first asks for a name, then repeatedly picks a random sample of
    the module-level ``dataset`` that has no annotation yet, displays a word-level
    diff of the two documents (computed with ``git diff``) next to the UCDP event
    fields, and appends the annotator's verdict to ``ANNOTATIONS_PATH``.
    """

    # (label shown in the table, key in the sample) for the UCDP event fields.
    _EVENT_FIELDS: tuple[tuple[str, str], ...] = (
        ("Side A", "side_a_name"),
        ("Side B", "side_b_name"),
        ("Start Date", "start_date"),
        ("End Date", "end_date"),
        ("Location Country", "location_root_name"),
        ("Location ADM1", "location_adm1_name"),
        ("Location ADM2", "location_adm2_name"),
        ("Location Where", "location_where_name"),
        ("Deaths Side A", "deaths_side_a"),
        ("Deaths Side B", "deaths_side_b"),
        ("Deaths Civilian", "deaths_civilian"),
        ("Deaths Unknown", "deaths_unknown"),
        ("Deaths Low", "deaths_low"),
        ("Deaths High", "deaths_high"),
    )

    def __init__(self: Annotator) -> None:
        """Create the display area, show the name form and build the annotation controls."""
        self.annotations: dict[int, dict[str, Any]] = {}
        self.annotator_name: str = ""
        # Id of the sample currently displayed, None when no sample is displayed.
        self.to_annotate: None | int = None
        self.handle: DisplayHandle = display(widgets.HTML("<p>Loading…</p>"), display_id=True)
        self.display_annotator_name_form()
        self.init_display_buttons()

    def read_annotations(self: Annotator) -> None:
        """Reload ``self.annotations`` (indexed by sample id) from ``ANNOTATIONS_PATH``.

        Raises:
            RuntimeError: if the annotation file does not exist.
        """
        annotations: dict[int, dict[str, Any]] = {}
        try:
            with ANNOTATIONS_PATH.open("r", encoding="utf-8") as file:
                for line in file:
                    # Blank lines (e.g. left by a manual edit or a merge) are not valid JSON.
                    if not line.strip():
                        continue
                    annotation: dict[str, Any] = json.loads(line)
                    annotations[annotation["id"]] = annotation
        except FileNotFoundError as error:
            raise RuntimeError(
                f"Annotation file {ANNOTATIONS_PATH} not found, that's not normal! "
                "Are you running jupyter from the directory containing the notebook and the annotations?"
            ) from error
        self.annotations = annotations

    def display_annotator_name_form(self: Annotator) -> None:
        """Show the number of existing annotations and a form asking for the annotator's name."""
        self.annotator_name_field = widgets.Text(placeholder="Your name")
        button = widgets.Button(description="Start")
        button.on_click(self.start_annotating)
        self.read_annotations()
        count = len(self.annotations)
        self.handle.update(widgets.VBox([
            widgets.HTML(f"Annotation file {ANNOTATIONS_PATH} contains {count} annotation{'s' if count != 1 else ''}. Don't forget to push."),
            widgets.HBox([widgets.HTML("Annotator's name: "), self.annotator_name_field, button])
        ]))

    def start_annotating(self: Annotator, _: widgets.Button) -> None:
        """Callback of the "Start" button: record the annotator's name and show a first sample."""
        name = self.annotator_name_field.value.strip()
        if not name:
            # An annotation without an annotator cannot be attributed: keep the form displayed.
            self.annotator_name_field.placeholder = "Your name (required)"
            return
        self.handle.update(widgets.HTML("<p>Loading sample…</p>"))
        self.annotator_name = name
        self.to_annotate = None
        self.next_sample()

    def init_display_buttons(self: Annotator) -> None:
        """Build the checkboxes, buttons and comment box displayed under every sample."""
        self.actor_checkbox = widgets.Checkbox(value=False, description="Wrong Actor")
        self.date_checkbox = widgets.Checkbox(value=False, description="Wrong Date")
        self.location_checkbox = widgets.Checkbox(value=False, description="Wrong Location")
        self.deaths_checkbox = widgets.Checkbox(value=False, description="Wrong Deaths")
        self.correct_button = widgets.Button(description="Similar Documents", layout=widgets.Layout(width="250px"))
        self.mismatch_button = widgets.Button(description="Completly Different", layout=widgets.Layout(width="250px"))
        self.comment_box = widgets.Text(placeholder="Comments")

        self.correct_button.on_click(self.on_click)
        self.mismatch_button.on_click(self.on_click)
        self.annotation_buttons = widgets.HBox([self.actor_checkbox, self.date_checkbox, self.location_checkbox, self.deaths_checkbox, self.correct_button, self.mismatch_button, self.comment_box])

    def on_click(self: Annotator, target: widgets.Button) -> None:
        """Callback of the two verdict buttons: save the annotation and show the next sample.

        ``similar`` is True for the "Similar Documents" button, False for the
        "Completly Different" button and None for any other target.
        """
        if self.to_annotate is None:
            # No sample is displayed, there is nothing to annotate.
            return

        similar = None
        if target is self.correct_button:
            similar = True
        elif target is self.mismatch_button:
            similar = False

        annotation: dict[str, Any] = {
            "annotator": self.annotator_name,
            "id": self.to_annotate,
            "similar": similar,
            "wrong_actor": self.actor_checkbox.value,
            "wrong_date": self.date_checkbox.value,
            "wrong_location": self.location_checkbox.value,
            "wrong_deaths": self.deaths_checkbox.value,
            "comments": self.comment_box.value,
        }
        with ANNOTATIONS_PATH.open("a", encoding="utf-8") as annotation_file:
            print(json.dumps(annotation), file=annotation_file)
        self.annotations[self.to_annotate] = annotation

        # Reset the controls for the next sample.
        self.actor_checkbox.value = False
        self.date_checkbox.value = False
        self.location_checkbox.value = False
        self.deaths_checkbox.value = False
        self.comment_box.value = ""
        self.next_sample()

    @staticmethod
    def _escape(value: Any) -> str:
        """Return ``value`` as HTML-safe text; ``$`` is replaced by the ``&dollar;`` entity."""
        return html.escape(str(value)).replace("$", "&dollar;")

    @staticmethod
    def _parse_word_diff(porcelain: str) -> list[tuple[str, str]]:
        """Convert the output of ``git diff --word-diff=porcelain`` to HTML fragments.

        Returns a list of ``(state, html)`` pairs where state is "+" for text only
        in the second file, "-" for text only in the first file and " " otherwise.
        """
        fragments: list[tuple[str, str]] = []
        in_header = True
        for line in porcelain.split("\n"):
            if not line:
                continue
            marker, content = line[0], line[1:]
            if in_header:
                # Everything up to and including the first hunk header is metadata.
                if marker == "@":
                    in_header = False
                continue
            if marker == "~":
                # "~" marks the end of a line of the compared documents.
                fragments.append((" ", "<br>"))
            elif marker == "@":
                # Header of a later hunk: text was skipped between the two hunks.
                fragments.append((" ", "<br>[…]<br>"))
            elif marker == "+":
                fragments.append(("+", f"<ins>{Annotator._escape(content)}</ins>"))
            elif marker == "-":
                fragments.append(("-", f"<del>{Annotator._escape(content)}</del>"))
            elif marker == " ":
                fragments.append((" ", Annotator._escape(content)))
            # Lines with any other marker are not document text and are ignored.
        return fragments

    @staticmethod
    def _plain_fragments(text: str) -> list[tuple[str, str]]:
        """Return the ``(state, html)`` fragments of a text without any difference."""
        fragments: list[tuple[str, str]] = []
        for line in text.split("\n"):
            fragments.append((" ", Annotator._escape(line)))
            fragments.append((" ", "<br>"))
        return fragments

    @staticmethod
    def _render_html(event_id: int, sample: dict[str, Any], differences: list[tuple[str, str]]) -> str:
        """Build the HTML shown for a sample: merged diff, split diff and UCDP event fields."""
        merged_diff: str = "".join(text for _, text in differences)
        ucdp_diff: str = "".join(text for state, text in differences if state != "+")
        hplt_diff: str = "".join(text for state, text in differences if state != "-")
        # The fields come from the downloaded dataset, hence the escaping.
        event_rows: str = "\n".join(
            f"                    <tr><th>{label}</th><td>{Annotator._escape(sample[key])}</td></tr>"
            for label, key in Annotator._EVENT_FIELDS
        )
        return f"""
            <section>
            <h3>Event #{event_id}</h3>
            <style>
                ins {{ background-color: #afa; text-decoration: none; }}
                del {{ background-color: #fcc; text-decoration: none; }}
                table.diff {{ width: 100%; }}
                td, th {{ width: 50%; text-align: start !important; vertical-align: top !important; }}
            </style>

            <section>
                <h4>Merged Diff</h4>
                <p>{merged_diff}</p>
            </section>
            <section>
                <h4>Split Diff</h4>
                <table class="diff">
                    <tr><th>UCDP</th><th>HPLT</th></tr>
                    <tr><td>{ucdp_diff}</td><td>{hplt_diff}</td></tr>
                </table>
            </section>
            <section>
                <h4>UCDP Event Annotations</h4>
                <table>
{event_rows}
                </table>
            </section>
            </section>
            """

    def next_sample(self: Annotator) -> None:
        """Pick a random sample without annotation and display it with the annotation controls.

        Raises:
            RuntimeError: if the annotation file is missing or if ``git diff`` fails.
        """
        self.read_annotations()
        candidates: set[int] = set(dataset.keys()) - self.annotations.keys()
        if not candidates:
            self.to_annotate = None
            self.handle.update(widgets.HTML("<p>All the samples have been annotated, thank you! Don't forget to push.</p>"))
            return
        self.to_annotate = random.choice(tuple(candidates))
        sample: dict[str, Any] = dataset[self.to_annotate]

        ucdp_text = f"{sample['source_date']}\n{sample['source_article_ucdp']}"
        hplt_text = f"{sample['source_date']}\n{sample['source_article']}"
        # The files are written as UTF-8 since the output of git is decoded as UTF-8.
        with (
            tempfile.NamedTemporaryFile(delete_on_close=False, mode="w", encoding="utf-8") as ucdp_file,
            tempfile.NamedTemporaryFile(delete_on_close=False, mode="w", encoding="utf-8") as hplt_file,
        ):
            ucdp_file.write(ucdp_text)
            hplt_file.write(hplt_text)
            # Close (flush) the files so that git can read them; they are deleted when leaving the block.
            ucdp_file.close()
            hplt_file.close()
            result = subprocess.run(
                ["git", "diff", "--no-index", "--word-diff=porcelain", "--", ucdp_file.name, hplt_file.name],
                capture_output=True,
                encoding="utf-8",
            )

        # With --no-index, git exits with 0 for identical files and 1 when they differ.
        if result.returncode not in (0, 1):
            raise RuntimeError(f"git diff failed with exit code {result.returncode}: {result.stderr.strip()}")
        if result.returncode == 0:
            # git prints nothing for identical files: display the document itself.
            differences = self._plain_fragments(ucdp_text)
        else:
            differences = self._parse_word_diff(result.stdout)

        html_diff = self._render_html(self.to_annotate, sample, differences)
        self.handle.update(widgets.VBox([widgets.HTML(html_diff), self.annotation_buttons]))

### Procedure

If the articles are somewhat similar, but some pieces of information are missing or wrong, check the boxes corresponding to the wrong/missing information:
- Who (actors)
- When (date)
- Where (location)
- What (number of deaths)

If the articles are completely different, you can click "Completly Different" without selecting the checkboxes.
If the articles are the same, you can click "Similar Documents".

If it's a complicated case you'd like to discuss, write a comment and post a message on Mattermost.

**Don't forget to commit the `align_annotations.jsonl` file every once in a while.** It's saved and re-read after every annotation you do.

In [5]:
# Create the annotation tool; the constructor displays the annotator name form in this cell's output.
annotator = Annotator()

VBox(children=(HTML(value='\n            <section>\n            <h3>Event #235759</h3>\n            <style>\n …