# Data analysis for annotation task between 2 annotators

In [1]:
import os
from brat_parser import get_entities_relations_attributes_groups
from collections import Counter
import pandas as pd

In [2]:
# Study IDs excluded from the per-annotator file lists: study 1 and 100-108.
# NOTE(review): "non_indep" presumably means non-independent studies - confirm.
non_indep_studies = [1, *range(100, 109)]
# Study IDs that are read from both annotators' directories for comparison.
common_studies = [
    2, 3, 4, 11, 12, 13, 14, 15, 16, 19,
    23, 24, 26, 34, 37, 43, 45, 116, 117, 126,
]
# Directory entries skipped when listing the annotation folders.
ignored_files = [".stats_cache"]

#### Text contents standardization

In [5]:
# Collect, for each annotator, the IDs of the non-empty .ann files whose study
# is neither in common_studies nor in non_indep_studies.
# NOTE: only annotator1's directory is listed, so a file that exists solely
# under annotator2 is never considered.
a1_annotations = []
a2_annotations = []
for f in os.listdir('brat_annotations/annotator1'):
    # Check the extension before parsing the ID, so that a non-.ann entry with
    # a non-numeric name cannot make int() raise ValueError.
    if f in ignored_files or not f.endswith(".ann"):
        continue
    file_id = int(f.split(".")[0])
    if file_id in common_studies or file_id in non_indep_studies:
        continue
    if os.path.getsize(os.path.join('brat_annotations/annotator1', f)) != 0:
        a1_annotations.append(file_id)
    # A file missing from annotator2 is treated like an empty one instead of
    # aborting the whole scan with FileNotFoundError.
    a2_path = os.path.join('brat_annotations/annotator2', f)
    if os.path.isfile(a2_path) and os.path.getsize(a2_path) != 0:
        a2_annotations.append(file_id)

In [6]:
# Number of retained annotation files for annotator 1 and for annotator 2.
len(a1_annotations), len(a2_annotations)

(29, 27)

#### Entity counts per annotator

In [11]:
def get_entities_total_count(annotation_dir):
    """Tally entity types over every ``.ann`` file in ``annotation_dir``.

    Args:
        annotation_dir: Directory whose direct entries are scanned.

    Returns:
        A ``Counter`` mapping each entity type to the number of entities of
        that type found across all parsed files.
    """
    type_counts = Counter()
    for name in os.listdir(annotation_dir):
        if name in ignored_files or not name.endswith(".ann"):
            continue
        parsed = get_entities_relations_attributes_groups(
            os.path.join(annotation_dir, name)
        )
        # The first element of the parser's result holds the entities.
        found = parsed[0]
        if not found:
            continue
        type_counts.update(ent.type for ent in found.values())
    return type_counts

In [12]:
# Count entity types in each annotator's directory, then place the two tallies
# side by side: one row per entity type, one column per annotator.
ent_count1, ent_count2 = map(
    get_entities_total_count,
    ('brat_annotations/annotator1', 'brat_annotations/annotator2'),
)
df = pd.DataFrame({"annotator1" : ent_count1, "annotator2": ent_count2})
df

Unnamed: 0,annotator1,annotator2
PrimaryOutcome,82,81
TimeFrame,146,138
OutcomeDefinition,104,88
OtherOutcome,171,133
SecondaryOutcome,223,229


In [13]:
# Column-wise total: overall number of entities counted for each annotator.
df.sum()

annotator1    726
annotator2    669
dtype: int64

#### Entities overlapping several entities of the other annotator

In [14]:
def is_included(span1, span2):
    """Check whether span2 begins inside span1.

    This is true both when span2 lies entirely within span1 and when it
    starts inside span1 and runs past its end. Spans that share a start or an
    end offset with span1 are detected as well; the previous strict
    comparisons missed them (e.g. ``(0, 10)`` vs ``(0, 5)`` or ``(3, 10)``).

    Args:
        span1: ``(start, end)`` offsets of the reference span.
        span2: ``(start, end)`` offsets of the span being tested.

    Returns:
        True if span2 starts at or after span1's start and before span1's
        end, False otherwise.

    Note:
        Assumes non-empty spans with ``start < end`` and an exclusive end
        offset, so spans that merely touch, such as ``(0, 5)`` and
        ``(5, 10)``, are not reported.
    """
    return span1[0] <= span2[0] < span1[1]

def is_overlapping(span1, span2):
    """Check whether two ``(start, end)`` spans overlap.

    The intersection test is done directly on the offsets: two spans overlap
    when each one starts before the other ends. This also covers spans that
    share a start or an end offset, e.g. ``(0, 10)`` and ``(0, 5)``.

    Args:
        span1: ``(start, end)`` offsets of the first span.
        span2: ``(start, end)`` offsets of the second span.

    Returns:
        True if the spans are equal or share at least one position, False
        otherwise. Spans that merely touch, such as ``(0, 5)`` and
        ``(5, 10)``, do not overlap (end offsets are treated as exclusive).
    """
    # Equal spans always match, including degenerate zero-length ones.
    if span1 == span2:
        return True
    return span1[0] < span2[1] and span2[0] < span1[1]

def find_overlaps(entity, entity_list):
    """Return the spans of ``entity_list`` that overlap ``entity``.

    Args:
        entity: Span to compare against.
        entity_list: Iterable of candidate spans.

    Returns:
        A list of the candidates for which ``is_overlapping`` is true, in
        their original order.
    """
    return [candidate for candidate in entity_list
            if is_overlapping(entity, candidate)]

def get_entities_spans(filepath):
    """Return the first span fragment of every entity in an annotation file.

    Args:
        filepath: Path of the ``.ann`` file to parse.

    Returns:
        A list with one item per entity: the first element of that entity's
        ``span``. Any further fragments of an entity are ignored.
    """
    entities = get_entities_relations_attributes_groups(filepath)[0]
    first_fragments = []
    for entity in entities.values():
        first_fragments.append(entity.span[0])
    return first_fragments

In [15]:
# For each commonly annotated study, print the first annotator-1 span that
# overlaps more than one annotator-2 span, if there is one.
for s in common_studies:
    a1 = f'brat_annotations/annotator1/{str(s)}.ann'
    a2 = f'brat_annotations/annotator2/{str(s)}.ann'
    ent1, ent2 = get_entities_spans(a1), get_entities_spans(a2)
    for e1 in ent1:
        matches = find_overlaps(e1, ent2)
        if len(matches) <= 1:
            continue
        print("Study : ", s)
        print("Entity compared : ", e1)
        print("Matches  : ", matches)
        # Report a single example per study, then move on to the next one.
        break

Study :  14
Entity compared :  (147, 1076)
Matches  :  [(145, 284), (286, 912)]
Study :  16
Entity compared :  (201, 344)
Matches  :  [(199, 231), (233, 298), (299, 322)]
Study :  24
Entity compared :  (1088, 1217)
Matches  :  [(1113, 1139), (1157, 1190)]
