## Imports

In [1]:
import json
import pandas as pd
import numpy as np
from prettytable import PrettyTable
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from stop_words import get_stop_words

## Global Variables

In [35]:
## Customer Variables
# Input and output file locations, relative to the notebook's working directory.
# NOTE: every literal uses a backslash separator ("inputs\\..."), i.e. Windows-style paths.
annotations_ousmane_200_json_path = "inputs\\annotations-ousmane-200.json" 
annotations_ousmane_200_csv_path = "inputs\\annotations-ousmane-200.csv"
annotations_marco_1000_csv_path = "inputs\\1000annotazioni.csv"
annotations_marco_200_csv_path = "inputs\\annotations-marco-200.csv"
annotations_matched_200_csv_path = "outputs\\annotations-matched-200.csv"
annotations_contains_matched_200_csv_path = "outputs\\annotations_contains-matched-200.csv"
states_quantitative_analysis_csv_path = "outputs\\states_quantitative_analysis.csv"
events_quantitative_analysis_csv_path = "outputs\\events_quantitative_analysis.csv"
states_quantitative_analysis_groupby_lemmas = "outputs\\states_quantitative_analysis_groupby_lemmas.csv"
events_quantitative_analysis_groupby_lemmas = "outputs\\events_quantitative_analysis_groupby_lemmas.csv"
test_csv_path = "outputs\\test.csv"
# Columns that must all coincide for two annotation rows to count as an exact match
# (used as the join keys of the exact-agreement merge).
exact_agreement_columns = ["author", "sent_id", "text", "TIME", "WRITER-AG", "EVENT", "ORG", "LOC", "ASP-EVENT", "STATE", "WRITER-PA", "REP-EVENT"]
# Sentence identity plus the two element types listed for soft agreement.
soft_agreement_columns = ["author", "sent_id", "text", "EVENT", "STATE"]
# Columns that identify one sentence.
merge_by_columns = ["author", "sent_id", "text"]
# Element types accepted by computeAgreementByElementType (everything except EVENT and STATE).
element_types_without_events_and_states = ["WRITER-AG", "ORG", "LOC", "ASP-EVENT", "WRITER-PA", "REP-EVENT"]

## Libraries Variables
# Combined English stop-word list: the `stop_words` package list followed by NLTK's list.
# The two lists are concatenated as-is, so any word present in both appears twice.
stop_words = list(get_stop_words('en'))         # original note: about 900 stopwords -- TODO confirm
nltk_words = list(stopwords.words('english')) # original note: about 150 stopwords -- TODO confirm
stop_words.extend(nltk_words)
# Shared WordNet lemmatizer instance used by the quantitative analysis.
wn_lemmatizer = WordNetLemmatizer()


## Common Functions

In [37]:
## This function converts a JSON annotation export into a CSV annotation file.
def annotations_json_to_csv(path_file_json, annotator_name, path_file_csv) :
    """Flatten a JSON annotation export into one CSV row per annotation.

    The JSON file must hold a list of items, each with a ``data`` mapping
    (``author``, ``sent_id``, ``text``) and an ``annotations`` list. Every
    annotation becomes one row; each entry of its ``result`` list contributes
    a column named after the entry's first label, holding the labelled text.
    If a label occurs several times inside one annotation, the last occurrence
    wins, because the row is a plain dict keyed by label.

    Parameters
    ----------
    path_file_json : str
        Path of the JSON file to read (decoded as UTF-8).
    annotator_name : str
        Value written to the ``annotator`` column of every row.
    path_file_csv : str
        Path of the CSV file to write (no index column).

    Returns
    -------
    None
    """
    with open(path_file_json, "r", encoding='utf-8') as f:
        file_contents = json.load(f)

    # Collect plain dicts and build the frame once: DataFrame.append was removed
    # in pandas 2.0, and appending row by row copies the whole frame every time.
    records = []
    for item in file_contents:
        data = item['data']
        for annotation in item['annotations']:
            container = {
                'annotator': annotator_name,
                'author': data['author'],
                'sent_id': data['sent_id'],
                'text': data['text'],
            }
            for v in annotation['result']:
                container[v['value']['labels'][0]] = v['value']['text']
            records.append(container)

    pd.DataFrame(records).to_csv(path_file_csv, index=False)

# Pairwise "contains relation" check between two equally long lists of values.
# Two values are in a contains relation when, after str() conversion and whitespace
# tokenisation, every token of the value with fewer tokens also occurs among the
# tokens of the other value (token membership, not character-level substring).
def checkContainsRelations(items_1, items_2) :
    """Return True when every positional pair of values is in a contains relation.

    Lists of different length never match. Values are converted with ``str``,
    so a missing value (NaN) is compared as the single token "nan".
    """
    if len(items_1) != len(items_2) :
        return False
    for position in range(len(items_1)) :
        fewer_tokens = str(items_1[position]).split()
        more_tokens = str(items_2[position]).split()
        # Make sure the side with fewer tokens is the one tested for inclusion.
        if len(fewer_tokens) > len(more_tokens) :
            fewer_tokens, more_tokens = more_tokens, fewer_tokens
        if not all(token in more_tokens for token in fewer_tokens) :
            return False
    return True

# Contains relation between two single values (token-level, see below).
def checkContainRelation(item_1, item_2) :
    """Return True when the tokens of the shorter value all occur in the longer one.

    Both values are converted with ``str`` and split on whitespace; the value
    with fewer tokens is the one whose tokens are looked up in the other.
    """
    fewer_tokens = str(item_1).split()
    more_tokens = str(item_2).split()
    if len(fewer_tokens) > len(more_tokens) :
        fewer_tokens, more_tokens = more_tokens, fewer_tokens
    return all(token in more_tokens for token in fewer_tokens)


## Merge two CSV files with pandas.
def merged_two_csv(path_csv1, path_csv2, how_mode, onColumns) :
    """Read both CSV files and return their merge.

    ``how_mode`` is forwarded as the merge ``how`` argument and ``onColumns``
    as the ``on`` argument; the first file is the left side of the merge.
    """
    left_frame, right_frame = (pd.read_csv(csv_path) for csv_path in (path_csv1, path_csv2))
    return left_frame.merge(right_frame, how=how_mode, on=onColumns)

def concateItems(item_1, item_2) :
    """Render two values as "Value1: <a> & Value2: <b>".

    A value whose string form is "nan" is shown as MISSING_VALUE.
    """
    rendered = []
    for raw_value in (item_1, item_2) :
        as_text = str(raw_value)
        rendered.append("MISSING_VALUE" if as_text == "nan" else as_text)
    return "Value1: " + rendered[0] + " & Value2: " + rendered[1]

def swapedItems(item_1, item_2) :
    """Return the two arguments in reversed order as a tuple."""
    return item_2, item_1

## This function takes two annotation DataFrames and computes the agreement for one element type.
def computeAgreementByElementType(df_annotations1, df_annotations2, element_type, output_file_name) : 
    """Match the values of one element type between two annotators and write a CSV.

    For every distinct (author, sent_id, text) in ``df_annotations1`` the rows of
    both frames for that sentence are compared. The frame with more rows for the
    sentence drives the outer loop; each of its rows is paired with the first row
    of the other frame that is either equal ("Exact Match") or in a token-level
    contains relation ("Contains Relation Match"). Rows without a partner are
    not reported.

    Parameters
    ----------
    df_annotations1, df_annotations2 : pandas.DataFrame
        Annotation tables with ``author``, ``sent_id``, ``text`` and
        ``element_type`` columns.
    element_type : str
        One of ``element_types_without_events_and_states``.
    output_file_name : str
        Path of the CSV to write, with columns ``element_type``, ``MatchedInfo``,
        ``SentenceId`` and ``Text``.

    Raises
    ------
    Exception
        If ``element_type`` is not an accepted element type.
    """
    # Validate first so an invalid request fails before any work is done.
    if element_type not in element_types_without_events_and_states :
        raise Exception("Element Type not exist or has type EVENT or STATE!")

    # Create a matched Dictionary container
    # NOTE(review): 'Author' is collected but not listed in the exported columns below.
    matched_container = {
        'Author': [],
        'SentenceId': [],
        element_type: [],
        'MatchedInfo': [],
        'Text': []
    }

    # Compute Pivot Set: one entry per distinct sentence of the first annotator
    df_set_pivot = df_annotations1[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])

    for _, row in df_set_pivot.iterrows() :
        author = row["author"]
        sent_id = row["sent_id"]
        text = row["text"]
        df_x = df_annotations1.loc[(df_annotations1['author'] == author) & (df_annotations1['sent_id'] == sent_id) & (df_annotations1['text'] == text)]
        df_y = df_annotations2.loc[(df_annotations2['author'] == author) & (df_annotations2['sent_id'] == sent_id) & (df_annotations2['text'] == text)]
        # Iterate the larger side in the outer loop.
        # NOTE(review): after this swap "Value1"/"Value2" in the concatenated output
        # no longer map to a fixed annotator -- confirm whether that is intended.
        if (df_x.shape[0] < df_y.shape[0]) :
            df_x, df_y = swapedItems(df_x, df_y)
        for _, row_x in df_x.iterrows() :
            items_1 = [row_x[element_type]]
            for _, row_y in df_y.iterrows() :
                items_2 = [row_y[element_type]]
                if items_1 == items_2 : 
                    matched_value = row_x[element_type]
                    matched_info = "Exact Match"
                elif checkContainsRelations(items_1, items_2) :
                    # Both sides missing -> empty cell; otherwise show both values.
                    if str(row_x[element_type]) != "nan" or str(row_y[element_type]) != "nan" : 
                        matched_value = concateItems(row_x[element_type], row_y[element_type])
                    else :
                        matched_value = ""
                    matched_info = "Contains Relation Match"
                else :
                    continue
                matched_container['Author'].append(author)
                matched_container['SentenceId'].append(sent_id)
                matched_container['Text'].append(text)
                matched_container[element_type].append(matched_value)
                matched_container['MatchedInfo'].append(matched_info)
                # Only the first matching partner is recorded for each outer row.
                break
        
    # Convert the dictionary into DataFrame
    df_contains_matched = pd.DataFrame(matched_container, columns = [element_type, 'MatchedInfo', 'SentenceId', 'Text'])
    # replace() returns a new frame; the result must be kept for the NaN -> '' cleanup to apply.
    df_contains_matched = df_contains_matched.replace(np.nan, '', regex=True)
    df_contains_matched.to_csv(output_file_name, index=False)

## Transform the annotations JSON file into a CSV file

In [4]:
# Convert the "ousmane" JSON export to CSV; the agreement cells below read that CSV.
annotations_json_to_csv(annotations_ousmane_200_json_path, "ousmane", annotations_ousmane_200_csv_path)

## Compute exact Agreement 
### Output file => "outputs\\annotations-matched-200.csv"

In [5]:
# Exact agreement: inner-join the two annotators' CSVs on every column listed in
# exact_agreement_columns, so a row survives only when all of those values coincide.
# The second input is the 1000-annotation file; the inner join restricts the result
# to sentences that are also present in the first annotator's file.
df_exact_matched = merged_two_csv(annotations_ousmane_200_csv_path, annotations_marco_1000_csv_path, "inner", exact_agreement_columns)
df_exact_matched.to_csv(annotations_matched_200_csv_path, index=False)

## Compute soft Agreement (column values are in a "contains" relation)
### Output file => "outputs\\annotations_contains-matched-200.csv"

In [6]:
# Load both annotators' tables and build the pivot: one row per distinct sentence
# (author, sent_id, text) annotated by the first annotator.
df_ousmane = pd.read_csv(annotations_ousmane_200_csv_path)
df_marco = pd.read_csv(annotations_marco_1000_csv_path)
df_set_pivot = df_ousmane[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])

# Create a matched Dictionary container
# NOTE(review): 'Author' is collected but not listed in the exported columns below.
matched_container = {
    'Author': [],
    'SentenceId': [],
    'EVENT': [],
    'STATE': [],
    'MatchedInfo': [],
    'Text': []
}

# Counters for the remaining element types (not updated in this cell).
other_matched_container = {
    'WRITER-AG': {'exact_match': 0, 'contain_match': 0},
    'WRITER-PA': {'exact_match': 0, 'contain_match': 0},  
    'ORG': {'exact_match': 0, 'contain_match': 0}, 
    'LOC': {'exact_match': 0, 'contain_match': 0}, 
    'ASP-EVENT': {'exact_match': 0, 'contain_match': 0},
    'REP-EVENT': {'exact_match': 0, 'contain_match': 0}
}

# Second-annotator rows for the pivot sentences, collected per sentence and
# concatenated once after the loop instead of re-copying a growing frame.
marco_frames = []

# Iterate the pivot and soft-match the (EVENT, STATE) pair of every row
for index, row in df_set_pivot.iterrows():
    author = row["author"]
    sent_id = row["sent_id"]
    text = row["text"]
    df_x = df_ousmane.loc[(df_ousmane['author'] == author) & (df_ousmane['sent_id'] == sent_id) & (df_ousmane['text'] == text)]
    df_y = df_marco.loc[(df_marco['author'] == author) & (df_marco['sent_id'] == sent_id) & (df_marco['text'] == text)]
    # Record the second annotator's rows before any swap below.
    if len(df_y) > 0 :
        marco_frames.append(df_y)
    # Iterate the larger side in the outer loop.
    # NOTE(review): after this swap "Value1"/"Value2" in the concatenated output
    # no longer map to a fixed annotator -- confirm whether that is intended.
    if (df_x.shape[0] < df_y.shape[0]) :
        df_x, df_y = swapedItems(df_x, df_y)
    for index_x, row_x in df_x.iterrows() :
        items_1 = [row_x["EVENT"], row_x["STATE"]]
        for index_y, row_y in df_y.iterrows() :
            items_2 = [row_y["EVENT"], row_y["STATE"]]
            if items_1 == items_2 : 
                matched_event = row_x["EVENT"]
                matched_state = row_x["STATE"]
                matched_info = "Exact Match"
            elif checkContainsRelations(items_1, items_2) :
                # Both sides missing -> empty cell; otherwise show both values.
                if str(row_x["EVENT"]) != "nan" or str(row_y["EVENT"]) != "nan" : 
                    matched_event = concateItems(row_x["EVENT"], row_y["EVENT"])
                else :
                    matched_event = ""
                if str(row_x["STATE"]) != "nan" or str(row_y["STATE"]) != "nan" : 
                    matched_state = concateItems(row_x["STATE"], row_y["STATE"])
                else :
                    matched_state = ""
                matched_info = "Contains Relation Match"
            else :
                continue
            matched_container['Author'].append(author)
            matched_container['SentenceId'].append(sent_id)
            matched_container['Text'].append(text)
            matched_container['EVENT'].append(matched_event)
            matched_container['STATE'].append(matched_state)
            matched_container['MatchedInfo'].append(matched_info)
            # Only the first matching partner is recorded for each outer row.
            break

if marco_frames :
    df_marco_200 = pd.concat(marco_frames, ignore_index=True)
else :
    df_marco_200 = pd.DataFrame(columns=list(df_marco.columns))

# Convert the dictionary into DataFrame
df_contains_matched = pd.DataFrame(matched_container, columns = ['EVENT', 'STATE', 'MatchedInfo', 'SentenceId', 'Text'])
# replace() returns a new frame; the result must be kept for the NaN -> '' cleanup to apply.
df_contains_matched = df_contains_matched.replace(np.nan, '', regex=True)
df_contains_matched.to_csv(annotations_contains_matched_200_csv_path, index=False)
# NOTE(review): unlike the other exports, this one also writes the index column.
df_marco_200.to_csv(annotations_marco_200_csv_path)

In [7]:

# Precision/recall table built from row counts only:
#   - df_exact_matched: rows that agree on every column of exact_agreement_columns
#   - df_contains_matched: rows whose (EVENT, STATE) pair matched exactly or by containment
#   - df_ousmane / df_marco_200: all annotation rows of each annotator on the shared sentences
# The two match counts therefore compare different sets of columns.
# NOTE(review): with annotator A as gold, precision is conventionally matches / (other
# annotator's rows) and recall is matches / (gold rows); the denominators below look
# inverted relative to that convention if "First Annotator" means df_ousmane -- confirm.
# NOTE(review): every ratio divides by a row count and raises ZeroDivisionError if that
# frame is empty.
t = PrettyTable(['GOLD ANNOTATIONS', 'Intersection Mode', 'Precision Evaluation', 'Recall Evaluation'])

precision = (df_exact_matched.shape[0])/(df_ousmane.shape[0])
recall = (df_exact_matched.shape[0])/(df_marco_200.shape[0])
t.add_row(['Annoations for Fist Annotator', 'Exact Match', precision, recall])

precision = (df_contains_matched.shape[0])/(df_ousmane.shape[0])
recall = (df_contains_matched.shape[0])/(df_marco_200.shape[0])
t.add_row(['Annoations for Fist Annotator', 'Contains Relation Match', precision, recall])

# Second annotator as gold: same counts with the two denominators exchanged.
precision = (df_exact_matched.shape[0])/(df_marco_200.shape[0])
recall = (df_exact_matched.shape[0])/(df_ousmane.shape[0])
t.add_row(['Annoations for Second Annotator', 'Exact Match', precision, recall])

precision = (df_contains_matched.shape[0])/(df_marco_200.shape[0])
recall = (df_contains_matched.shape[0])/(df_ousmane.shape[0])
t.add_row(['Annoations for Second Annotator', 'Contains Relation Match', precision, recall])

print(t)

+---------------------------------+-------------------------+----------------------+---------------------+
|         GOLD ANNOTATIONS        |    Intersection Mode    | Precision Evaluation |  Recall Evaluation  |
+---------------------------------+-------------------------+----------------------+---------------------+
|  Annoations for Fist Annotator  |       Exact Match       |  0.3333333333333333  | 0.27823691460055094 |
|  Annoations for Fist Annotator  | Contains Relation Match |  0.7095709570957096  |  0.5922865013774105 |
| Annoations for Second Annotator |       Exact Match       | 0.27823691460055094  |  0.3333333333333333 |
| Annoations for Second Annotator | Contains Relation Match |  0.5922865013774105  |  0.7095709570957096 |
+---------------------------------+-------------------------+----------------------+---------------------+


## Quantitative Analysis

In [8]:
nlp = spacy.load("en_core_web_sm")

# Per-category accumulators. Each maps a WordNet sense name (or the raw token when
# no sense is found) to its counters, definition and set of observed lemmas.
verbs = {
    'states': {},
    'events': {}
}

df_marco = df_marco.replace(np.nan, '', regex=True)
print("Number of annotations over 1000 sentences: ", df_marco.shape[0])

for index, row in df_marco.iterrows() :
    state = str(row["STATE"]).strip().lower()
    event = str(row["EVENT"]).strip().lower()
    rep_event = str(row["REP-EVENT"]).strip().lower()
    asp_event = str(row["ASP-EVENT"]).strip().lower()
    location = str(row["LOC"]).strip()
    organization = str(row["ORG"]).strip()
    text = str(row['text'])

    # A row counts as an event whenever EVENT is filled in, otherwise as a state.
    key_verbs = verbs['states'] if len(event) == 0 else verbs['events']
    verb_state_ = state if len(event) == 0 else event
    verb_states = [w for w in word_tokenize(verb_state_) if w not in stop_words]
    if not verb_states :
        # Nothing to count for this row; skip the spaCy and tokenizer work.
        continue

    # Row-level facts shared by every token of the annotation.
    has_location = "GPE" in [ent.label_ for ent in nlp(location).ents]
    has_organization = "ORG" in [ent.label_ for ent in nlp(organization).ents]
    has_rep_event = len(rep_event) > 0
    has_asp_event = len(asp_event) > 0
    context_tokens = word_tokenize(text)

    for verb_state in verb_states :
        # Decide verb-ness per token. It used to be set once per row, so one
        # non-verb token made all following tokens be lemmatized as nouns.
        is_verb = True
        best_sense = lesk(context_tokens, verb_state, pos=wn.VERB)
        if best_sense is None :
            # No verb sense: retry without a part-of-speech restriction.
            best_sense = lesk(context_tokens, verb_state)
            is_verb = False

        if best_sense is not None :
            sense_key = best_sense.name()
            definition = best_sense.definition()
        else :
            # Unknown to WordNet: fall back to the token for both key and definition.
            sense_key = verb_state
            definition = verb_state

        if is_verb :
            lemma = wn_lemmatizer.lemmatize(verb_state, 'v')
        else :
            lemma = wn_lemmatizer.lemmatize(verb_state)

        entry = key_verbs.get(sense_key)
        if entry is None :
            entry = {
                'occorrences': 0,
                'definition': definition,
                'rep_event': 0,
                'asp_event': 0,
                'location': 0,
                'organization': 0,
                'targets': set()
            }
            key_verbs[sense_key] = entry
        entry['occorrences'] += 1
        entry['rep_event'] += 1 if has_rep_event else 0
        entry['asp_event'] += 1 if has_asp_event else 0
        entry['location'] += 1 if has_location else 0
        entry['organization'] += 1 if has_organization else 0
        entry['targets'].add(lemma)

Number of annotations over 1000 sentences:  1795


In [9]:
def _quant_analysis_frame(senses, sense_column) :
    """Turn one accumulator of ``verbs`` into a table sorted by occurrences.

    ``senses`` maps a sense key to its counter dict; ``sense_column`` is the
    name of the column that receives the key. The set of lemmas in 'targets'
    is joined in sorted order so the resulting string is the same on every run
    (set iteration order is not stable between interpreter sessions).
    """
    columns = [
        sense_column,
        'occorrences',
        'definition',
        'rep_event',
        'asp_event',
        'location',
        'organization',
        'targets'
    ]
    # Build every record first and create the frame once; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on each call.
    records = []
    for key, value in senses.items() :
        record = value.copy()
        record[sense_column] = key
        record['targets'] = ", ".join(sorted(value['targets']))
        records.append(record)
    frame = pd.DataFrame(records, columns=columns)
    return frame.sort_values(by='occorrences', ascending=False)


df_states_quant_analysis = _quant_analysis_frame(verbs['states'], 'state (WORDNET SENSE IF EXIST)')
print(
    "Number of annotations (STATE) over 1000 sentences:", 
    df_states_quant_analysis['occorrences'].sum(),
    ", sia",
    (df_states_quant_analysis['occorrences'].sum() * 100) / df_marco.shape[0],
    "%."
)
df_states_quant_analysis.to_csv(states_quantitative_analysis_csv_path, index=False)

df_events_quant_analysis = _quant_analysis_frame(verbs['events'], 'event (WORDNET SENSE IF EXIST)')
print(
    "Number of annotations (EVENT) over 1000 sentences:", 
    df_events_quant_analysis['occorrences'].sum(),
    ", sia",
    (df_events_quant_analysis['occorrences'].sum() * 100) / df_marco.shape[0],
    "%."
)
df_events_quant_analysis.to_csv(events_quantitative_analysis_csv_path, index=False)

Number of annotations (STATE) over 1000 sentences: 677 , sia 37.71587743732591 %.
Number of annotations (EVENT) over 1000 sentences: 853 , sia 47.5208913649025 %.


In [26]:
# Collapse the per-sense state table to one row per distinct 'targets' string:
# text columns are joined with ' ## ', counters are summed.
# NOTE(review): 'targets' is both the group key and an aggregated column, so the
# exported CSV carries it twice (as index and as column) -- confirm this is wanted.
# NOTE(review): recent pandas versions warn when the builtin `sum` is passed to agg
# instead of the string 'sum' -- TODO confirm against the installed version.
df_agg1 = df_states_quant_analysis.groupby('targets').agg({
    'state (WORDNET SENSE IF EXIST)': ' ## '.join, 
    'occorrences': sum, 
    'definition': ' ## '.join, 
    'rep_event': sum, 
    'asp_event': sum, 
    'location': sum, 
    'organization': sum,
    'targets': ' ## '.join
})
df_agg1 = df_agg1.sort_values(by='occorrences', ascending=False)
df_agg1.to_csv(states_quantitative_analysis_groupby_lemmas)

In [27]:
# Collapse the per-sense event table to one row per distinct 'targets' string:
# text columns are joined with ' ## ', counters are summed.
# NOTE(review): 'targets' is both the group key and an aggregated column, so the
# exported CSV carries it twice (as index and as column) -- confirm this is wanted.
df_agg2 = df_events_quant_analysis.groupby('targets').agg({
    'event (WORDNET SENSE IF EXIST)': ' ## '.join, 
    'occorrences': sum, 
    'definition': ' ## '.join, 
    'rep_event': sum, 
    'asp_event': sum, 
    'location': sum, 
    'organization': sum,
    'targets': ' ## '.join
})
df_agg2 = df_agg2.sort_values(by='occorrences', ascending=False)
df_agg2.to_csv(events_quantitative_analysis_groupby_lemmas)

In [32]:
# Scratch checks with hard-coded values.
# 18 expressed as a percentage of 1795 (the annotation count printed earlier).
print((18*100)/1795)
# `not x in y` is evaluated as `not (x in y)`.
print(not "ous" in ["ous", "mane"])

1.0027855153203342
False


In [38]:
# Run the per-element-type agreement for "WRITER-AG" and write the result to test_csv_path.
# Accepted element types: "WRITER-AG", "ORG", "LOC", "ASP-EVENT", "WRITER-PA", "REP-EVENT".
computeAgreementByElementType(df_ousmane, df_marco_200, "WRITER-AG", test_csv_path)