In [1]:
import pandas as pd
import glob
import os
from pathlib import Path
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score

# Create csv with annotations from all annotators

In [40]:
# Flatten every annotator's TSV export into one table, one row per body line,
# and save it as a single CSV.
# Expected layout on disk: <corpus>/annotation/<document>/<annotator>.tsv
csv_columns = ['File', 'Annotator', 'Token ID', 'Token position', 'Token', 'Entity', 'Relation', 'Relation ID']
main_files = sorted(glob.glob('../data/Bete_main/annotation/*/*.tsv'))
nutrition_files = sorted(glob.glob('../data/Bete_nutrition/annotation/*/*.tsv'))
all_files = {'main': main_files, 'nutrition': nutrition_files}
new_csv_file = []
for data_name, data_files in all_files.items():
    print(f'Number of annotated documents: {len(data_files)}')
    for cur_file in data_files:
        cur_path = Path(cur_file)
        # The annotator is identified by the TSV file name (extension included),
        # the document by the directory that contains it.
        annotator_name = cur_path.name
        document_name = cur_path.parent.name
        with open(cur_file, encoding="utf8") as f:
            # Everything up to and including the first '#Text' line is header.
            # After that every line becomes a row, including blank lines and
            # any later '#Text' lines.
            body = False
            for line in f:
                if body:
                    line_list = line.strip('\n\t ').split('\t')
                    cur_csv_list = [f'{data_name}_{document_name}', annotator_name] + line_list
                    # Pad short rows with the '_' placeholder up to the full
                    # column count, so that no trailing column (Relation ID)
                    # is left empty for rows with fewer fields.
                    cur_csv_list.extend(['_'] * (len(csv_columns) - len(cur_csv_list)))
                    new_csv_file.append(cur_csv_list)
                if line.startswith('#Text'):
                    body = True
df = pd.DataFrame(new_csv_file, columns=csv_columns)
df.to_csv('../data/all_annotated_dataset_2023_01_18.csv')

Number of annotated documents: 665
Number of annotated documents: 652


# Calculate inter-annotator agreement

## Compute entity agreement

In [34]:
def parse_ent(ent: str):
    """Return the entity label with a trailing bracketed suffix removed.

    When ``ent`` ends with ``]`` everything from the first ``[`` onwards is
    dropped (``'Food[12]'`` -> ``'Food'``); any other value is returned as is.
    """
    if not ent.endswith(']'):
        return ent
    label, _, _ = ent.partition('[')
    return label

In [40]:
# Entity agreement: for every document, compare the Entity column of the two
# annotators who tagged the most tokens and pool the label pairs over all
# documents into ent1_full / ent2_full.
entity_labels = ['Complication',
 'DiabetesType',
 'Dose',
 'Duration',
 'Food',
 'GlucoseValue',
 'Insulin',
 'Medication',
 'NonMedicalTreatment',
 'Set',
 'Symptom',
 'Test',
 'Time']
main_files = sorted(glob.glob('../data/Bete_main/annotation/*'))
nutrition_files = sorted(glob.glob('../data/Bete_nutrition/annotation/*'))
all_files = {'main': main_files, 'nutrition': nutrition_files}
df = pd.read_csv('../data/all_annotated_dataset_2023_01_18.csv')
ent1_full, ent2_full = [], []
entity_map, entity_to_token_count, relation_map = {}, {}, {}
for data_name, data_files in all_files.items():
    print(f"Number of {data_name} documents: {len(data_files)}")
    for cur_file in data_files:
        document_name = cur_file.split(os.sep)[-1]
        file_df = df[df['File'] == f'{data_name}_{document_name}']
        # Rank annotators by the number of rows whose Entity is not '_'
        # (descending); ties are broken alphabetically by annotator name.
        ranking = [
            (who, sum(1 for tag in file_df.loc[file_df['Annotator'] == who, 'Entity'] if tag != '_'))
            for who in set(file_df['Annotator'])
        ]
        ranking.sort(key=lambda item: (-item[1], item[0]))
        if len(ranking) < 2:
            # Agreement needs at least two annotators for the document.
            continue
        tags_a = file_df.loc[file_df['Annotator'] == ranking[0][0], 'Entity']
        tags_b = file_df.loc[file_df['Annotator'] == ranking[1][0], 'Entity']
        kept_a, kept_b = [], []
        # Rows of the two annotators are paired purely by position.
        for raw_a, raw_b in zip(tags_a, tags_b):
            label_a = parse_ent(raw_a)
            if label_a == '_':
                continue
            label_b = parse_ent(raw_b)
            # Keep a pair only when both sides carry a real label
            # (neither the '_' placeholder nor '*').
            if label_b == '_' or label_a == '*' or label_b == '*':
                continue
            kept_a.append(label_a)
            kept_b.append(label_b)
        ent1_full.extend(kept_a)
        ent2_full.extend(kept_b)
print(classification_report(ent1_full, ent2_full, labels=entity_labels))

Number of main documents: 304
Number of nutrition documents: 258
                     precision    recall  f1-score   support

       Complication       0.75      0.76      0.76       246
       DiabetesType       0.93      0.95      0.94        58
               Dose       0.72      0.78      0.75        50
           Duration       0.73      0.73      0.73        15
               Food       0.99      1.00      0.99      1588
       GlucoseValue       0.80      0.82      0.81       240
            Insulin       0.82      0.50      0.62        36
         Medication       0.85      0.85      0.85        34
NonMedicalTreatment       0.94      0.95      0.95       691
                Set       0.86      0.88      0.87        64
            Symptom       0.95      0.92      0.93       687
               Test       0.94      0.75      0.83        40
               Time       0.92      0.97      0.95       127

          micro avg       0.93      0.93      0.93      3876
          macro av

In [41]:
# Cohen's kappa over the pooled entity label pairs; ent1_full / ent2_full only
# contain positions where both compared annotators assigned a label.
cohen_kappa_score(ent1_full, ent2_full)

0.9119127934162823

## Compute relation agreement

In [16]:
def parse_rel(rel: str):
    """Return the first relation label of a ``|``-separated relation value.

    ``'causes|has'`` -> ``'causes'``; a value without ``|`` is returned as is.
    """
    return rel.split('|', 1)[0] if '|' in rel else rel

In [42]:
# Relation agreement: for every document, compare the Relation column of the
# two annotators who marked the most relations and pool the label pairs over
# all documents into rel1_full / rel2_full.
relation_labels = ['causes', 'prevents', 'treats', 'has', 'diagnoses', 'complicates']
main_files = sorted(glob.glob('../data/Bete_main/annotation/*'))
nutrition_files = sorted(glob.glob('../data/Bete_nutrition/annotation/*'))
all_files = {'main': main_files, 'nutrition': nutrition_files}
df = pd.read_csv('../data/all_annotated_dataset_2023_01_18.csv')
rel1_full, rel2_full = [], []
entity_map, entity_to_token_count, relation_map = {}, {}, {}
for data_name, data_files in all_files.items():
    print(f"Number of {data_name} documents: {len(data_files)}")
    for cur_file in data_files:
        document_name = cur_file.split(os.sep)[-1]
        file_df = df[df['File'] == f'{data_name}_{document_name}']
        # Rank annotators by the number of rows whose Relation is not '_'
        # (descending); ties are broken alphabetically by annotator name.
        ranking = [
            (who, sum(1 for tag in file_df.loc[file_df['Annotator'] == who, 'Relation'] if tag != '_'))
            for who in set(file_df['Annotator'])
        ]
        ranking.sort(key=lambda item: (-item[1], item[0]))
        if len(ranking) < 2:
            # Agreement needs at least two annotators for the document.
            continue
        tags_a = file_df.loc[file_df['Annotator'] == ranking[0][0], 'Relation']
        tags_b = file_df.loc[file_df['Annotator'] == ranking[1][0], 'Relation']
        kept_a, kept_b = [], []
        # Rows of the two annotators are paired purely by position.
        for raw_a, raw_b in zip(tags_a, tags_b):
            label_a = parse_rel(raw_a)
            if label_a == '_':
                continue
            label_b = parse_rel(raw_b)
            # Keep a pair only when both sides carry a real label
            # (neither the '_' placeholder nor '*').
            if label_b == '_' or label_a == '*' or label_b == '*':
                continue
            kept_a.append(label_a)
            kept_b.append(label_b)
        rel1_full.extend(kept_a)
        rel2_full.extend(kept_b)
print(classification_report(rel1_full, rel2_full, labels=relation_labels))

Number of main documents: 304
Number of nutrition documents: 258
              precision    recall  f1-score   support

      causes       0.70      0.49      0.58       166
    prevents       0.56      0.71      0.63         7
      treats       0.92      0.88      0.90        26
         has       0.39      0.60      0.47        85
   diagnoses       1.00      0.86      0.92         7
 complicates       0.17      0.50      0.25         2

   micro avg       0.57      0.57      0.57       293
   macro avg       0.62      0.67      0.62       293
weighted avg       0.63      0.57      0.58       293



In [43]:
# Cohen's kappa over the pooled relation label pairs; rel1_full / rel2_full only
# contain positions where both compared annotators marked a relation.
cohen_kappa_score(rel1_full, rel2_full)

0.32552972416409254

# Create csv with all curated annotations

In [3]:
# Flatten the curated TSV exports into one table, one row per body line,
# and save it as a single CSV.
# Expected layout on disk: <corpus>/curation/<document>/<file>
csv_columns = ['Data', 'File', 'Token ID', 'Token position', 'Token', 'Entity', 'Relation', 'Relation ID']
main_files = sorted(glob.glob('../data/Bete_main/curation/*/*'))
nutrition_files = sorted(glob.glob('../data/Bete_nutrition/curation/*/*'))
all_files = {'main': main_files, 'nutrition': nutrition_files}
new_csv_file = []
for data_name, data_files in all_files.items():
    print(f'Number of annotated documents: {len(data_files)}')
    for cur_file in data_files:
        # The document is named after the directory holding the curated file.
        # Path handles both '/' and the platform separator, whereas splitting
        # on '/' alone picks the wrong component for paths glob returns on
        # Windows.
        document_name = Path(cur_file).parent.name
        with open(cur_file, encoding="utf8") as f:
            # Everything up to and including the first '#Text' line is header.
            # After that every line becomes a row, including blank lines and
            # any later '#Text' lines.
            body = False
            for line in f:
                if body:
                    line_list = line.strip('\n\t ').split('\t')
                    cur_csv_list = [data_name, document_name] + line_list
                    # Pad short rows with the '_' placeholder up to the full
                    # column count, so that no trailing column (Relation ID)
                    # is left empty for rows with fewer fields.
                    cur_csv_list.extend(['_'] * (len(csv_columns) - len(cur_csv_list)))
                    new_csv_file.append(cur_csv_list)
                if line.startswith('#Text'):
                    body = True
df = pd.DataFrame(new_csv_file, columns=csv_columns)
df.to_csv('../data/curated_dataset_2022_01_07.csv')

Number of annotated documents: 304
Number of annotated documents: 201


# Parse the annotations into this format
'''
{
        "keyphrases":{
            "1":{
                "attributes":[],
                "error":false,
                "id":1,
                "idxs":[
                    2,
                    3,
                    4,
                    5,
                    6
                ],
                "label":"Concept",
                "spans":[
                    [
                        4,
                        12
                    ],
                    [
                        13,
                        20
                    ]
                ],
                "text":"gl\u00f3bulos blancos",
                "tokens":[
                    "gl\u00f3bulos",
                    "blancos"
                ]
            },
            "2":{
                "attributes":[],
                "error":false,
                "id":2,
                "idxs":[
                    7,
                    8
                ],
                "label":"Action",
                "spans":[
                    [
                        21,
                        27
                    ]
                ],
                "text":"ayudan",
                "tokens":[
                    "ayudan"
                ]
            },
            "3":{
                "attributes":[],
                "error":false,
                "id":3,
                "idxs":[
                    11
                ],
                "label":"Concept",
                "spans":[
                    [
                        33,
                        42
                    ]
                ],
                "text":"organismo",
                "tokens":[
                    "organismo"
                ]
            },
            "4":{
                "attributes":[],
                "error":false,
                "id":4,
                "idxs":[
                    13,
                    14
                ],
                "label":"Action",
                "spans":[
                    [
                        45,
                        53
                    ]
                ],
                "text":"combatir",
                "tokens":[
                    "combatir"
                ]
            },
            "5":{
                "attributes":[],
                "error":false,
                "id":5,
                "idxs":[
                    15,
                    16,
                    17
                ],
                "label":"Concept",
                "spans":[
                    [
                        54,
                        65
                    ]
                ],
                "text":"infecciones",
                "tokens":[
                    "infecciones"
                ]
            }
        },
        "relations":[
            {
                "arg1":4,
                "arg2":3,
                "label":"subject"
            },
            {
                "arg1":4,
                "arg2":5,
                "label":"target"
            },
            {
                "arg1":2,
                "arg2":1,
                "label":"subject"
            },
            {
                "arg1":2,
                "arg2":4,
                "label":"target"
            }
        ],
        "text":"Los gl\u00f3bulos blancos ayudan a su organismo a combatir infecciones.",
        "tokens":[
            "[CLS]",
            "Los",
            "g",
            "##l\u00f3",
            "##bulo",
            "##s",
            "blancos",
            "ayuda",
            "##n",
            "a",
            "su",
            "organismo",
            "a",
            "combat",
            "##ir",
            "in",
            "##fe",
            "##cciones",
            ".",
            "[SEP]"
        ]
    }
'''