## Imports

In [1]:
import json
import pandas as pd
import numpy as np
from prettytable import PrettyTable
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from stop_words import get_stop_words
import re

## Globals Variables

In [2]:
## Customer Variables
annotations_ousmane_200_json_path = "inputs\\annotations-ousmane-200.json" 
annotations_rossana_200_json_path = "inputs\\rossana_annotations.json" 
annotations_enrico_100_json_path = "inputs\\enrico_annotations.json" 
annotations_ousmane_200_csv_path = "inputs\\annotations-ousmane-200.csv"
annotations_marco_1000_csv_path = "inputs\\1000annotazioni.csv"
annotations_marco_200_csv_path = "inputs\\annotations-marco-200.csv"
annotations_marco_200_2_csv_path = "inputs\\annotations-marco-200-2.csv"
annotations_marco_100_csv_path = "inputs\\annotations-marco-100.csv"
annotations_rossana_200_csv_path = "inputs\\annotations-rossana-200.csv"
annotations_enrico_100_csv_path = "inputs\\annotations-enrico-100.csv"
annotations_matched_200_csv_path = "outputs\\annotations-matched-200.csv"
annotations_contains_matched_200_csv_path = "outputs\\annotations_contains-matched-200.csv"
states_quantitative_analysis_csv_path = "outputs\\states_quantitative_analysis.csv"
events_quantitative_analysis_csv_path = "outputs\\events_quantitative_analysis.csv"
states_quantitative_analysis_groupby_lemmas = "outputs\\states_quantitative_analysis_groupby_lemmas.csv"
events_quantitative_analysis_groupby_lemmas = "outputs\\events_quantitative_analysis_groupby_lemmas.csv"
test_csv_path = "outputs\\test.csv"
temp_computing_path = "outputs\\temp_computing.csv"
exact_agreement_columns = ["author", "sent_id", "text", "TIME", "WRITER-AG", "EVENT", "ORG", "LOC", "ASP-EVENT", "STATE", "WRITER-PA", "REP-EVENT"]
soft_agreement_columns = ["author", "sent_id", "text", "EVENT", "STATE"]
merge_by_columns = ["author", "sent_id", "text"]
element_types_without_events_and_states = ["WRITER-AG", "ORG", "LOC", "ASP-EVENT", "WRITER-PA", "REP-EVENT"]
element_types = ["TIME", "WRITER-AG", "EVENT", "ORG", "LOC", "ASP-EVENT", "STATE", "WRITER-PA", "REP-EVENT"]

## Libraries Variables
stop_words = list(get_stop_words('en'))         #About 900 stopwords
nltk_words = list(stopwords.words('english')) #About 150 stopwords
stop_words.extend(nltk_words)
wn_lemmatizer = WordNetLemmatizer()


## Common Functions

In [3]:
## This function convert the annotation file json in a annotation file csv.
def annotations_json_to_csv(path_file_json, annotator_name, path_file_csv) :
    df = pd.DataFrame()
    with open(path_file_json, "r", encoding='utf-8') as f:
        file_contents = json.load(f)
        for item in file_contents:
            for annotation in item['annotations']:
                container = dict()
                container['annotator'] = annotator_name
                container['author'] = item['data']['author']
                container['sent_id'] = item['data']['sent_id']
                container['text'] = item['data']['text']
                for v in annotation['result']:
                    container[v['value']['labels'][0]] = v['value']['text']
                df = df.append(container, ignore_index=True)
    df.to_csv(path_file_csv, index=False)

# This function verify if string elements of two list are "contains relations".
# "contains relations": for example we have two string str1 and str2, if str1 is sub strinf of str2
# or str2 is sub string of str1
def checkContainsRelations(items_1, items_2) :
    computed = True
    if len(items_1) != len(items_2) : return False
    for index in range(len(items_1)) : 
        #computed = computed and (str(items_1[index]) in str(items_2[index]) or str(items_2[index]) in str(items_1[index]))
        sub_compute = True
        lx = str(items_1[index]).split()
        ly = str(items_2[index]).split()
        if len(lx) > len(ly) :
            lx, ly = swapedItems(lx, ly)
        for i in range(len(lx)) :
            sub_compute = sub_compute and (lx[i] in ly)
            if not sub_compute :
                break
        computed = computed and sub_compute
        if not computed : return False
    return computed

# Contain relation between two string
def checkContainRelation(item_1, item_2) :
    computed = True
    lx = str(item_1).split()
    ly = str(item_2).split()
    if len(lx) > len(ly) :
        lx, ly = swapedItems(lx, ly)
    for i in range(len(lx)) :
        computed = computed and (lx[i] in ly)
        if not computed :
            return False
    return computed


## This function merge two csv with Pandas
def merged_two_csv(path_csv1, path_csv2, how_mode, onColumns) :
    df1 = pd.read_csv(path_csv1)
    df2 = pd.read_csv(path_csv2)
    df = pd.merge(df1, df2, how=how_mode, on=onColumns) 
    return df

def concateItems(item_1, item_2) :
    item = "Value1: "
    item += str(item_1) if str(item_1) != "nan" else "MISSING_VALUE"
    item += " & Value2: "
    item += str(item_2) if str(item_2) != "nan" else "MISSING_VALUE"
    return item

def swapedItems(item_1, item_2) :
    temp = item_1
    item_1 = item_2
    item_2 = temp
    return item_1,item_2

## 
def computeContainsRelationMatchByEventOrState(df_annotations1, df_annotations2, output_file_name="") :
    df_set_pivot = df_annotations1[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])

    # Create a matched Dictionnary container
    matched_container = {
        'Author': [],
        'SentenceId': [],
        'EVENT': [],
        'STATE': [],
        'MatchedInfo': [],
        'Text': []
    }

    # Iterate Pivot df and Processing soft matching on Some columns 
    for _, row in df_set_pivot.iterrows():
        author = row["author"]
        sent_id = row["sent_id"]
        text = row["text"]
        df_x = df_annotations1.loc[(df_annotations1['author'] == author) & (df_annotations1['sent_id'] == sent_id)]
        df_y = df_annotations2.loc[(df_annotations2['author'] == author) & (df_annotations2['sent_id'] == sent_id)]
        if (df_x.shape[0] < df_y.shape[0]) :
            df_x, df_y = swapedItems(df_x, df_y)
        for _, row_x in df_x.iterrows() :
            items_1 = [row_x["EVENT"], row_x["STATE"]]
            for index_y, row_y in df_y.iterrows() :
                items_2 = [row_y["EVENT"], row_y["STATE"]]
                if items_1 == items_2 : 
                    matched_container['Author'].append(author)
                    matched_container['SentenceId'].append(sent_id)
                    matched_container['Text'].append(text)
                    matched_container['EVENT'].append(row_x["EVENT"])
                    matched_container['STATE'].append(row_x["STATE"])
                    matched_container['MatchedInfo'].append("Exact Match")
                    df_y = df_y.drop(index_y)
                    break
                elif checkContainsRelations(items_1, items_2) :
                    matched_container['Author'].append(author)
                    matched_container['SentenceId'].append(sent_id)
                    matched_container['Text'].append(text)
                    if str(row_x["EVENT"]) != "nan" or str(row_y["EVENT"]) != "nan" : 
                        matched_container['EVENT'].append(concateItems(row_x["EVENT"],row_y["EVENT"]))
                    else : matched_container['EVENT'].append("")
                    if str(row_x["STATE"]) != "nan" or str(row_y["STATE"]) != "nan" : 
                        matched_container['STATE'].append(concateItems(row_x["STATE"], row_y["STATE"]))
                    else : matched_container['STATE'].append("")
                    matched_container['MatchedInfo'].append("Contains Relation Match")
                    df_y = df_y.drop(index_y)
                    break

    # Convert the dictionary into DataFrame
    df_contains_matched = pd.DataFrame(matched_container, columns = ['EVENT', 'STATE', 'MatchedInfo', 'SentenceId', 'Text'])
    df_contains_matched.replace(np.nan, '', regex=True)
    if output_file_name != "" and isinstance(output_file_name, str) : 
        df_contains_matched.to_csv(output_file_name, index=False)
    return df_contains_matched

## This function take two annotations Dataframe and compute the Agreement by Category
def computeContainsRelationMatchByElementType(df_annotations1, df_annotations2, element_type, output_file_name) : 
    
    # Create a matched Dictionnary container
    matched_container = {
        'Author': [],
        'SentenceId': [],
        element_type: [],
        'MatchedInfo': [],
        'Text': []
    }

    if element_type not in element_types_without_events_and_states :
        raise Exception("Element Type not exist or has type EVENT or STATE!")

    # Compute Pivot Set
    df_set_pivot = df_annotations1[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])

    for _, row in df_set_pivot.iterrows() :
        author = row["author"]
        sent_id = row["sent_id"]
        text = row["text"]
        df_x = df_annotations1.loc[(df_annotations1['author'] == author) & (df_annotations1['sent_id'] == sent_id) & (df_annotations1['text'] == text)]
        df_y = df_annotations2.loc[(df_annotations2['author'] == author) & (df_annotations2['sent_id'] == sent_id) & (df_annotations2['text'] == text)]
        if (df_x.shape[0] < df_y.shape[0]) :
            df_x, df_y = swapedItems(df_x, df_y)
        for _, row_x in df_x.iterrows() :
            items_1 = [row_x[element_type]]
            for index_y, row_y in df_y.iterrows() :
                items_2 = [row_y[element_type]]
                if items_1 == items_2 : 
                    matched_container['Author'].append(author)
                    matched_container['SentenceId'].append(sent_id)
                    matched_container['Text'].append(text)
                    matched_container[element_type].append(row_x[element_type])
                    matched_container['MatchedInfo'].append("Exact Match")
                    df_y = df_y.drop(index_y)
                    break
                elif checkContainsRelations(items_1, items_2) :
                    matched_container['Author'].append(author)
                    matched_container['SentenceId'].append(sent_id)
                    matched_container['Text'].append(text)
                    if str(row_x[element_type]) != "nan" or str(row_y[element_type]) != "nan" : 
                        matched_container[element_type].append(concateItems(row_x[element_type],row_y[element_type]))
                    else : matched_container[element_type].append("")
                    matched_container['MatchedInfo'].append("Contains Relation Match")
                    df_y = df_y.drop(index_y)
                    break
        
    # Convert the dictionary into DataFrame
    df_contains_matched = pd.DataFrame(matched_container, columns = [element_type, 'MatchedInfo', 'SentenceId', 'Text'])
    df_contains_matched.replace(np.nan, '', regex=True)
    df_contains_matched.to_csv(output_file_name, index=False)
    return df_contains_matched

# Precision e Recall X Categoria, First Annotations will be considerate GOLD.
def precisionAndRecallXCategory(df_annotations1, df_annotations2, category) :

    a_1 = df_annotations1.dropna(subset=[category]).drop_duplicates(subset=['author','sent_id',category])
    a_2 = df_annotations2.dropna(subset=[category]).drop_duplicates(subset=['author','sent_id',category])

    merged = a_1[['author','sent_id',category]].merge(a_2[['author','sent_id',category]],on=['author','sent_id'])

    a_1 = merged[['author','sent_id','{}_x'.format(category)]].dropna(subset=['{}_x'.format(category)]).drop_duplicates(subset=['author','sent_id','{}_x'.format(category)]).reset_index().drop(columns='index')
    a_2 = merged[['author','sent_id','{}_y'.format(category)]].dropna(subset=['{}_y'.format(category)]).drop_duplicates(subset=['author','sent_id','{}_y'.format(category)]).reset_index().drop(columns='index')
    
    merged = a_1.merge(a_2)
    merged = merged.groupby(['author','sent_id','{}_x'.format(category)])['{}_y'.format(category)].apply(list)
    merged = merged.reset_index()

    true_pos = 0
    false_pos = 0
    
    for row in merged.iloc[:].values:
        agreement = False
        for el in row[3]:
            searched = re.search(row[2],el)
            if searched:
                agreement = True
            else: continue
        if agreement is True:
            true_pos +=1
        else: false_pos+=1
    return {'true_pos': true_pos, 'false_pos': false_pos, 'retrieved': len(a_1), 'relevant': len(a_2)}

## Trnasform Annotions file JSON to file CSV

In [4]:
#annotations_json_to_csv(annotations_enrico_100_json_path, "enrico", annotations_enrico_100_csv_path)

## Compute exact Agreement 
### Output file => "outputs\\annotations-matched-200.csv"

In [5]:
df_exact_matched = merged_two_csv(annotations_ousmane_200_csv_path, annotations_marco_1000_csv_path, "inner", exact_agreement_columns)
df_exact_matched.to_csv(annotations_matched_200_csv_path, index=False)

## Compute soft Agreement (columns values are contains relations)
### Output file => "outputs\\annotations_contains-matched-200.csv"

In [6]:
# Get Both df of annotatore and  make Pivot df
df_ousmane = pd.read_csv(annotations_ousmane_200_csv_path)
df_marco_200 = pd.read_csv(annotations_marco_200_csv_path)
df_contains_matched = computeContainsRelationMatchByEventOrState(df_ousmane, df_marco_200, annotations_contains_matched_200_csv_path)

In [7]:

t = PrettyTable(['GOLD ANNOTATIONS', 'Intersection Mode', 'Precision Evaluation', 'Recall Evaluation', 'Exact Match', 'Contains Relation Match', 'Type Element', 'F-score'])

exaxt_match = df_contains_matched[df_contains_matched.MatchedInfo == 'Exact Match'].shape[0]
contains_relation_match = df_contains_matched[df_contains_matched.MatchedInfo == 'Contains Relation Match'].shape[0]

precision = (df_exact_matched.shape[0])/(df_ousmane.shape[0])
recall = (df_exact_matched.shape[0])/(df_marco_200.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Fist Annotator', 'Exact Match', precision, recall, df_exact_matched.shape[0], 0, 'ALL ELEMENT', f_score])

precision = (df_contains_matched.shape[0])/(df_ousmane.shape[0])
recall = (df_contains_matched.shape[0])/(df_marco_200.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Fist Annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, 'EVENT or STATE', f_score])

precision = (df_exact_matched.shape[0])/(df_marco_200.shape[0])
recall = (df_exact_matched.shape[0])/(df_ousmane.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Second Annotator', 'Exact Match', precision, recall, df_exact_matched.shape[0], 0, 'ALL ELEMENT', f_score])

precision = (df_contains_matched.shape[0])/(df_marco_200.shape[0])
recall = (df_contains_matched.shape[0])/(df_ousmane.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Second Annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, 'EVENT or STATE', f_score])

print(t)

+--------------------------------+-------------------------+----------------------+---------------------+-------------+-------------------------+----------------+--------------------+
|        GOLD ANNOTATIONS        |    Intersection Mode    | Precision Evaluation |  Recall Evaluation  | Exact Match | Contains Relation Match |  Type Element  |      F-score       |
+--------------------------------+-------------------------+----------------------+---------------------+-------------+-------------------------+----------------+--------------------+
|  Annoations of Fist Annotator  |       Exact Match       |  0.3333333333333333  | 0.27823691460055094 |     101     |            0            |  ALL ELEMENT   | 0.3033033033033033 |
|  Annoations of Fist Annotator  | Contains Relation Match |  0.6996699669966997  |  0.5840220385674931 |     205     |            7            | EVENT or STATE | 0.6366366366366366 |
| Annoations of Second Annotator |       Exact Match       | 0.27823691460055094

In [8]:
t = PrettyTable(['GOLD ANNOTATIONS', 'Intersection Mode', 'Precision Evaluation', 'Recall Evaluation', 'Exact Match', 'Contains Relation Match', 'Type Element'])
for type_element in element_types_without_events_and_states :
    df_temp = computeContainsRelationMatchByElementType(df_ousmane, df_marco_200, type_element, temp_computing_path)

    exaxt_match = df_temp[df_temp.MatchedInfo == 'Exact Match'].shape[0]
    contains_relation_match = df_temp[df_temp.MatchedInfo == 'Contains Relation Match'].shape[0]

    precision = (df_temp.shape[0])/(df_marco_200.shape[0])
    recall = (df_temp.shape[0])/(df_ousmane.shape[0])
    t.add_row(['Annoations of Fist Annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, type_element])

    precision = (df_temp.shape[0])/(df_ousmane.shape[0])
    recall = (df_temp.shape[0])/(df_marco_200.shape[0])
    t.add_row(['Annoations of Second Annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, type_element])
    
    t.add_row(['', '', '', '', '', '', ''])

print(t)

+--------------------------------+-------------------------+----------------------+--------------------+-------------+-------------------------+--------------+
|        GOLD ANNOTATIONS        |    Intersection Mode    | Precision Evaluation | Recall Evaluation  | Exact Match | Contains Relation Match | Type Element |
+--------------------------------+-------------------------+----------------------+--------------------+-------------+-------------------------+--------------+
|  Annoations of Fist Annotator  | Contains Relation Match |  0.7162534435261708  | 0.858085808580858  |     257     |            3            |  WRITER-AG   |
| Annoations of Second Annotator | Contains Relation Match |  0.858085808580858   | 0.7162534435261708 |     257     |            3            |  WRITER-AG   |
|                                |                         |                      |                    |             |                         |              |
|  Annoations of Fist Annotator  | Conta

In [9]:
df_marco = pd.read_csv(annotations_marco_1000_csv_path).replace(np.nan, '', regex=True)
df_rossana = pd.read_csv(annotations_rossana_200_csv_path)
df_marco_200_second = pd.read_csv(annotations_marco_200_2_csv_path)

#df_marco_200_second = pd.DataFrame(columns=list(df_rossana.columns))
#df_set_pivot = df_rossana[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])
#for _, row in df_set_pivot.iterrows() :
#    author = row["author"]
#    sent_id = row["sent_id"]
#    text = row["text"]
#    df_ = df_marco.loc[(df_marco['author'] == author) & (df_marco['sent_id'] == sent_id) & (df_marco['text'] == text)]
#    for _, row_ in df_.iterrows() :
#        df_marco_200_second = df_marco_200_second.append(row_)
#df_marco_200_second.to_csv(annotations_marco_200_2_csv_path)

t = PrettyTable(['GOLD ANNOTATIONS', 'Intersection Mode', 'Precision Evaluation', 'Recall Evaluation', 'Exact Match', 'Contains Relation Match', 'Type Element', 'F-score'])

df_exact_matched = merged_two_csv(annotations_marco_200_2_csv_path, annotations_rossana_200_csv_path, "inner", exact_agreement_columns)
df_contains_matched = computeContainsRelationMatchByEventOrState(df_marco_200_second, df_rossana)

exaxt_match = df_contains_matched[df_contains_matched.MatchedInfo == 'Exact Match'].shape[0]
contains_relation_match = df_contains_matched[df_contains_matched.MatchedInfo == 'Contains Relation Match'].shape[0]

precision = (df_exact_matched.shape[0])/(df_rossana.shape[0])
recall = (df_exact_matched.shape[0])/(df_marco_200_second.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Fist Annotator', 'Exact Match', precision, recall, df_exact_matched.shape[0], 0, 'ALL ELEMENT', f_score])

precision = (df_contains_matched.shape[0])/(df_rossana.shape[0])
recall = (df_contains_matched.shape[0])/(df_marco_200_second.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Fist Annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, 'EVENT or STATE', f_score])

precision = (df_exact_matched.shape[0])/(df_marco_200_second.shape[0])
recall = (df_exact_matched.shape[0])/(df_rossana.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of third annotator', 'Exact Match', precision, recall, df_exact_matched.shape[0], 0, 'ALL ELEMENT', f_score])

precision = (df_contains_matched.shape[0])/(df_marco_200_second.shape[0])
recall = (df_contains_matched.shape[0])/(df_rossana.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of third annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, 'EVENT or STATE', f_score])

print(t)

+-------------------------------+-------------------------+----------------------+---------------------+-------------+-------------------------+----------------+--------------------+
|        GOLD ANNOTATIONS       |    Intersection Mode    | Precision Evaluation |  Recall Evaluation  | Exact Match | Contains Relation Match |  Type Element  |      F-score       |
+-------------------------------+-------------------------+----------------------+---------------------+-------------+-------------------------+----------------+--------------------+
|  Annoations of Fist Annotator |       Exact Match       |  0.2865853658536585  | 0.27485380116959063 |      94     |            0            |  ALL ELEMENT   | 0.2805970149253731 |
|  Annoations of Fist Annotator | Contains Relation Match |  0.6676829268292683  |  0.6403508771929824 |     213     |            6            | EVENT or STATE | 0.6537313432835822 |
| Annoations of third annotator |       Exact Match       | 0.27485380116959063  |  0

In [11]:
df_marco = pd.read_csv(annotations_marco_1000_csv_path).replace(np.nan, '', regex=True)
df_enrico = pd.read_csv(annotations_enrico_100_csv_path).replace(np.nan, '', regex=True)
df_marco_100 = pd.read_csv(annotations_marco_100_csv_path).replace(np.nan, '', regex=True)

#df_marco_100 = pd.DataFrame(columns=list(df_enrico.columns))
#df_set_pivot = df_enrico[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])
#for _, row in df_set_pivot.iterrows() :
#    author = row["author"]
#    sent_id = row["sent_id"]
#    text = row["text"]
#    df_ = df_marco.loc[(df_marco['author'] == author) & (df_marco['sent_id'] == sent_id)]
#    for _, row_ in df_.iterrows() :
#        df_marco_100 = df_marco_100.append(row_)

t = PrettyTable(['GOLD ANNOTATIONS', 'Intersection Mode', 'Precision Evaluation', 'Recall Evaluation', 'Exact Match', 'Contains Relation Match', 'Type Element', 'F-score'])

merge_on_columns = exact_agreement_columns.copy()
merge_on_columns.remove("ASP-EVENT")
df_exact_matched = df_marco_100.merge(df_enrico, how="inner", on=merge_on_columns)
df_contains_matched = computeContainsRelationMatchByEventOrState(df_marco_100, df_enrico)

exaxt_match = df_contains_matched[df_contains_matched.MatchedInfo == 'Exact Match'].shape[0]
contains_relation_match = df_contains_matched[df_contains_matched.MatchedInfo == 'Contains Relation Match'].shape[0]

#precision = (df_exact_matched.shape[0])/(df_enrico.shape[0])
#recall = (df_exact_matched.shape[0])/(df_marco_100.shape[0])
#f_score = (2*precision*recall)/(precision+recall)
#t.add_row(['Annoations of Fist Annotator', 'Exact Match', precision, recall, df_exact_matched.shape[0], 0, 'ALL ELEMENT', f_score])

precision = (df_contains_matched.shape[0])/(df_enrico.shape[0])
recall = (df_contains_matched.shape[0])/(df_marco_100.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of Fist Annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, 'EVENT or STATE', f_score])

#precision = (df_exact_matched.shape[0])/(df_marco_100.shape[0])
#recall = (df_exact_matched.shape[0])/(df_enrico.shape[0])
#f_score = (2*precision*recall)/(precision+recall)
#t.add_row(['Annoations of fourth annotator', 'Exact Match', precision, recall, df_exact_matched.shape[0], 0, 'ALL ELEMENT', f_score])

precision = (df_contains_matched.shape[0])/(df_marco_100.shape[0])
recall = (df_contains_matched.shape[0])/(df_enrico.shape[0])
f_score = (2*precision*recall)/(precision+recall)
t.add_row(['Annoations of fourth annotator', 'Contains Relation Match', precision, recall, exaxt_match, contains_relation_match, 'EVENT or STATE', f_score])

print(t)

+--------------------------------+-------------------------+----------------------+--------------------+-------------+-------------------------+----------------+--------------------+
|        GOLD ANNOTATIONS        |    Intersection Mode    | Precision Evaluation | Recall Evaluation  | Exact Match | Contains Relation Match |  Type Element  |      F-score       |
+--------------------------------+-------------------------+----------------------+--------------------+-------------+-------------------------+----------------+--------------------+
|  Annoations of Fist Annotator  | Contains Relation Match |  0.9440559440559441  | 0.7258064516129032 |      96     |            39           | EVENT or STATE | 0.8206686930091186 |
| Annoations of fourth annotator | Contains Relation Match |  0.7258064516129032  | 0.9440559440559441 |      96     |            39           | EVENT or STATE | 0.8206686930091186 |
+--------------------------------+-------------------------+----------------------+--

## Precision and Recall X Category Element, with Precision = TP/(TP + FP) e Recall = Precision = TP/(TP + FN).
## TP: True Positive, FP: False Positive e FN: False Negative.

In [12]:
# Between First annotator and Second annotator 
table = PrettyTable(['ANNOTATIONS GOLD', 'Precision Evaluation', 'Recall Evaluation','F-score', 'Type Element',])
for element_type in element_types :
    df_annot1 = pd.read_csv(annotations_marco_200_csv_path)
    df_annot2 = pd.read_csv(annotations_ousmane_200_csv_path)
    computed = precisionAndRecallXCategory(df_annot1, df_annot2, element_type)
    precision = computed['true_pos']/computed['retrieved'] if computed['retrieved'] > 0 else 'Division By Zero'
    recall = computed['true_pos']/computed['relevant'] if computed['relevant'] > 0 else  'Division By Zero'
    f_score = (2*precision*recall)/(precision+recall) if type(precision) != str and type(recall) != str else -1
    if f_score > 0 :
        table.add_row(['Annoations of First Annotator', precision, recall, f_score, element_type])
    computed = precisionAndRecallXCategory(df_annot2, df_annot1, element_type)
    precision = computed['true_pos']/computed['retrieved'] if computed['retrieved'] > 0 else 'Division By Zero'
    recall = computed['true_pos']/computed['relevant'] if computed['relevant'] > 0 else  'Division By Zero'
    f_score = (2*precision*recall)/(precision+recall) if type(precision) != str and type(recall) != str else -1
    if f_score > 0 :
        table.add_row(['Annoations of Second Annotator', precision, recall, f_score, element_type])
print(table)

+--------------------------------+----------------------+--------------------+--------------------+--------------+
|        ANNOTATIONS GOLD        | Precision Evaluation | Recall Evaluation  |      F-score       | Type Element |
+--------------------------------+----------------------+--------------------+--------------------+--------------+
| Annoations of First Annotator  |  0.896551724137931   | 0.9285714285714286 | 0.912280701754386  |     TIME     |
| Annoations of Second Annotator |  0.9642857142857143  | 0.9310344827586207 | 0.9473684210526316 |     TIME     |
| Annoations of First Annotator  |  0.8675496688741722  | 0.9357142857142857 | 0.9003436426116839 |  WRITER-AG   |
| Annoations of Second Annotator |         0.95         | 0.8807947019867549 | 0.9140893470790377 |  WRITER-AG   |
| Annoations of First Annotator  |  0.7786259541984732  | 0.9026548672566371 | 0.8360655737704918 |    EVENT     |
| Annoations of Second Annotator |  0.9026548672566371  | 0.7786259541984732 | 0

In [16]:
# Between First annotator and Third annotator 
table = PrettyTable(['ANNOTATIONS GOLD', 'Precision Evaluation', 'Recall Evaluation', 'F-score', 'Type Element',])
for element_type in element_types :
    df_annot1 = pd.read_csv(annotations_marco_200_2_csv_path)
    df_annot2 = pd.read_csv(annotations_rossana_200_csv_path)
    computed = precisionAndRecallXCategory(df_annot1, df_annot2, element_type)
    precision = computed['true_pos']/computed['retrieved'] if computed['retrieved'] > 0 else 'Division By Zero'
    recall = computed['true_pos']/computed['relevant'] if computed['relevant'] > 0 else  'Division By Zero'
    f_score = (2*precision*recall)/(precision+recall) if type(precision) != str and type(recall) != str else -1
    if f_score > 0 :
        table.add_row(['Annoations of First Annotator', precision, recall, f_score, element_type])
    computed = precisionAndRecallXCategory(df_annot2, df_annot1, element_type)
    precision = computed['true_pos']/computed['retrieved'] if computed['retrieved'] > 0 else 'Division By Zero'
    recall = computed['true_pos']/computed['relevant'] if computed['relevant'] > 0 else  'Division By Zero'
    f_score = (2*precision*recall)/(precision+recall) if type(precision) != str and type(recall) != str else -1
    if f_score > 0 :
        table.add_row(['Annoations of Third Annotator', precision, recall, f_score, element_type])
print(table)

+-------------------------------+----------------------+--------------------+---------------------+--------------+
|        ANNOTATIONS GOLD       | Precision Evaluation | Recall Evaluation  |       F-score       | Type Element |
+-------------------------------+----------------------+--------------------+---------------------+--------------+
| Annoations of First Annotator |  0.8602150537634409  | 0.851063829787234  |  0.8556149732620322 |     TIME     |
| Annoations of Third Annotator |  0.8723404255319149  | 0.8817204301075269 |  0.877005347593583  |     TIME     |
| Annoations of First Annotator |  0.8896551724137931  | 0.9347826086956522 |  0.911660777385159  |  WRITER-AG   |
| Annoations of Third Annotator |  0.9420289855072463  | 0.896551724137931  |  0.9187279151943464 |  WRITER-AG   |
| Annoations of First Annotator |  0.8165680473372781  | 0.8846153846153846 |  0.8492307692307692 |    EVENT     |
| Annoations of Third Annotator |  0.8782051282051282  | 0.8106508875739645 |  0

In [17]:
# Between First annotator and Fourth annotator 
table = PrettyTable(['ANNOTATIONS GOLD', 'Precision Evaluation', 'Recall Evaluation', 'F-score', 'Type Element'])
for element_type in element_types :
    df_annot1 = pd.read_csv(annotations_marco_100_csv_path)
    df_annot2 = pd.read_csv(annotations_enrico_100_csv_path)
    df_annot2['ASP-EVENT'] = np.nan
    computed = precisionAndRecallXCategory(df_annot1, df_annot2, element_type)
    precision = computed['true_pos']/computed['retrieved'] if computed['retrieved'] > 0 else 'Division By Zero'
    recall = computed['true_pos']/computed['relevant'] if computed['relevant'] > 0 else  'Division By Zero'
    f_score = (2*precision*recall)/(precision+recall) if type(precision) != str and type(recall) != str else -1
    if f_score > 0 :
        table.add_row(['Annoations of First Annotator', precision, recall, f_score, element_type])
    computed = precisionAndRecallXCategory(df_annot2, df_annot1, element_type)
    precision = computed['true_pos']/computed['retrieved'] if computed['retrieved'] > 0 else 'Division By Zero'
    recall = computed['true_pos']/computed['relevant'] if computed['relevant'] > 0 else  'Division By Zero'
    f_score = (2*precision*recall)/(precision+recall) if type(precision) != str and type(recall) != str else -1
    if f_score > 0 :
        table.add_row(['Annoations of Fourth Annotator', precision, recall, f_score, element_type])
print(table)

+--------------------------------+----------------------+---------------------+--------------------+--------------+
|        ANNOTATIONS GOLD        | Precision Evaluation |  Recall Evaluation  |      F-score       | Type Element |
+--------------------------------+----------------------+---------------------+--------------------+--------------+
| Annoations of First Annotator  |  0.9444444444444444  |  0.9444444444444444 | 0.9444444444444444 |     TIME     |
| Annoations of Fourth Annotator |  0.9444444444444444  |  0.9444444444444444 | 0.9444444444444444 |     TIME     |
| Annoations of First Annotator  |  0.8970588235294118  |  0.9384615384615385 | 0.9172932330827067 |  WRITER-AG   |
| Annoations of Fourth Annotator |  0.9538461538461539  |  0.9117647058823529 | 0.9323308270676691 |  WRITER-AG   |
| Annoations of First Annotator  |  0.7808219178082192  |  0.8507462686567164 | 0.8142857142857143 |    EVENT     |
| Annoations of Fourth Annotator |  0.835820895522388   |  0.76712328767

## Quantitative Analysis

In [None]:
nlp = spacy.load("en_core_web_sm")

verbs = {
    'states': {},
    'events': {}
}
df_marco = pd.read_csv(annotations_marco_1000_csv_path).replace(np.nan, '', regex=True)
print("Number of annotations over 1000 sentences: ", df_marco.shape[0])

for index, row in df_marco.iterrows() :
    state = str(row["STATE"]).strip().lower()
    event = str(row["EVENT"]).strip().lower()
    rep_event = str(row["REP-EVENT"]).strip().lower()
    asp_event = str(row["ASP-EVENT"]).strip().lower()
    location = str(row["LOC"]).strip()
    orgaization = str(row["ORG"]).strip()
    text = str(row['text'])
    is_verb = True
    key_verbs = verbs['states'] if len(event.strip()) == 0 else verbs['events']
    verb_state_ = state if len(event.strip()) == 0 else event
    doc1 = nlp(location)
    doc2 = nlp(orgaization)
    entities1 = [ent.label_ for ent in doc1.ents]
    entities2 = [ent.label_ for ent in doc2.ents]
    verb_states = [w for w in word_tokenize(verb_state_) if not w in stop_words]
    for verb_state in verb_states :
        best_sense = lesk(word_tokenize(text), verb_state, pos=wn.VERB)
        if best_sense is None :
            best_sense = lesk(word_tokenize(text), verb_state)
            is_verb = False
        definition = best_sense.definition() if best_sense is not None else verb_state
        best_sense = best_sense.name() if best_sense is not None else verb_state
        if best_sense in key_verbs :
            key_verbs[best_sense]['occorrences'] += 1
            key_verbs[best_sense]['rep_event'] += 1 if len(rep_event) > 0 else 0
            key_verbs[best_sense]['asp_event'] += 1 if len(asp_event) > 0 else 0 
            key_verbs[best_sense]['location'] += 1 if "GPE" in entities1 else 0
            key_verbs[best_sense]['organization'] += 1 if "ORG" in entities2 else 0
            if is_verb :
                key_verbs[best_sense]['targets'].add(wn_lemmatizer.lemmatize(verb_state, 'v'))
            else : 
                key_verbs[best_sense]['targets'].add(wn_lemmatizer.lemmatize(verb_state))
        else :
            key_verbs[best_sense] = {}
            key_verbs[best_sense]['occorrences'] = 1
            key_verbs[best_sense]['definition'] = definition
            key_verbs[best_sense]['rep_event'] = 1 if len(rep_event) > 0 else 0
            key_verbs[best_sense]['asp_event'] = 1 if len(asp_event) > 0 else 0
            key_verbs[best_sense]['location'] = 1 if "GPE" in entities1 else 0
            key_verbs[best_sense]['organization'] = 1 if "ORG" in entities2 else 0
            if is_verb :
                key_verbs[best_sense]['targets'] = set([wn_lemmatizer.lemmatize(verb_state, 'v')])
            else :
                key_verbs[best_sense]['targets'] = set([wn_lemmatizer.lemmatize(verb_state)])

Number of annotations over 1000 sentences:  1795


In [None]:
df_states_quant_analysis = pd.DataFrame(columns=[
    'state (WORDNET SENSE IF EXIST)', 
    'occorrences', 
    'definition', 
    'rep_event', 
    'asp_event', 
    'location', 
    'organization', 
    'targets'
])
for key, value in verbs['states'].items() :
    value_copy = value.copy()
    value_copy['state (WORDNET SENSE IF EXIST)'] = key
    value_copy['targets'] = ", ".join(value['targets'])
    df_states_quant_analysis = df_states_quant_analysis.append(value_copy, ignore_index=True)
df_states_quant_analysis = df_states_quant_analysis.sort_values(by='occorrences', ascending=False)
print(
    "Number of annotations (STATE) over 1000 sentences:", 
    df_states_quant_analysis['occorrences'].sum(),
    ", sia",
    (df_states_quant_analysis['occorrences'].sum() * 100) / df_marco.shape[0],
    "%."
)
df_states_quant_analysis.to_csv(states_quantitative_analysis_csv_path, index=False)

df_events_quant_analysis = pd.DataFrame(columns=[
    'event (WORDNET SENSE IF EXIST)', 
    'occorrences', 
    'definition', 
    'rep_event', 
    'asp_event', 
    'location', 
    'organization', 
    'targets'
])
for key, value in verbs['events'].items() :
    value_copy = value.copy()
    value_copy['event (WORDNET SENSE IF EXIST)'] = key
    value_copy['targets'] = ", ".join(value['targets'])
    df_events_quant_analysis = df_events_quant_analysis.append(value_copy, ignore_index=True)
df_events_quant_analysis = df_events_quant_analysis.sort_values(by='occorrences', ascending=False)
print(
    "Number of annotations (EVENT) over 1000 sentences:", 
    df_events_quant_analysis['occorrences'].sum(),
    ", sia",
    (df_events_quant_analysis['occorrences'].sum() * 100) / df_marco.shape[0],
    "%."
)
df_events_quant_analysis.to_csv(events_quantitative_analysis_csv_path, index=False)

Number of annotations (STATE) over 1000 sentences: 677 , sia 37.71587743732591 %.
Number of annotations (EVENT) over 1000 sentences: 853 , sia 47.5208913649025 %.


In [None]:
df_agg1 = df_states_quant_analysis.groupby('targets').agg({
    'state (WORDNET SENSE IF EXIST)': ' ## '.join, 
    'occorrences': sum, 
    'definition': ' ## '.join, 
    'rep_event': sum, 
    'asp_event': sum, 
    'location': sum, 
    'organization': sum,
    'targets': ' ## '.join
})
df_agg1 = df_agg1.sort_values(by='occorrences', ascending=False)
df_agg1.to_csv(states_quantitative_analysis_groupby_lemmas)

In [None]:
df_agg2 = df_events_quant_analysis.groupby('targets').agg({
    'event (WORDNET SENSE IF EXIST)': ' ## '.join, 
    'occorrences': sum, 
    'definition': ' ## '.join, 
    'rep_event': sum, 
    'asp_event': sum, 
    'location': sum, 
    'organization': sum,
    'targets': ' ## '.join
})
df_agg2 = df_agg2.sort_values(by='occorrences', ascending=False)
df_agg2.to_csv(events_quantitative_analysis_groupby_lemmas)

In [None]:
sentence = "Ramachandran, whose father wanted him to become a physician rather than a researcher, obtained an M.B.B.S. from Stanley Medical College in Chennai, India."
sense = lesk(word_tokenize(sentence), 'obtained', pos=wn.VERB)
print(sense, sense.definition())

sentence = "He received the Kerala Sahitya Academi award in the `poetry' section for his collection, Nellickal Muraleedharante Kavithakal in 2004."
sense = lesk(word_tokenize(sentence), 'obtained', pos=wn.VERB)
print(sense, sense.definition())

Synset('receive.v.02') receive a specified treatment (abstract)
Synset('receive.v.02') receive a specified treatment (abstract)
