## Imports

In [124]:
import json
import pandas as pd
import numpy as np

## Global Variables

In [125]:
## Customer Variables
# Key columns identifying one annotated sentence; both agreement column
# lists below start with these same three columns.
merge_by_columns = ["author", "sent_id", "text"]

# Input files.
annotations_ousmane_200_json_path = "inputs\\annotations-ousmane-200.json"
annotations_ousmane_200_csv_path = "inputs\\annotations-ousmane-200.csv"
annotations_marco_1000_csv_path = "inputs\\1000annotazioni.csv"

# Output files.
annotations_matched_200_csv_path = "outputs\\annotations-matched-200.csv"
annotations_contains_matched_200_csv_path = "outputs\\annotations_contains-matched-200.csv"

# Columns that must all be equal for an exact agreement.
exact_agreement_columns = merge_by_columns + [
    "TIME",
    "WRITER-AG",
    "EVENT",
    "ORG",
    "LOC",
    "ASP-EVENT",
    "STATE",
    "WRITER-PA",
    "REP-EVENT",
]

# Reduced column set for a softer agreement.
soft_agreement_columns = merge_by_columns + ["EVENT", "STATE"]

## Libraries Variables


## Transform the JSON annotations file into a CSV file

In [126]:
## This function converts a JSON annotations file into a CSV annotations file.
def annotations_json_to_csv(path_file_json, annotator_name, path_file_csv) :
    """Flatten a JSON annotations file into a CSV with one row per annotation.

    The JSON document is read as an iterable of items. Each item provides
    ``item['data']`` (with ``author``, ``sent_id`` and ``text``) and
    ``item['annotations']``, a list whose entries each carry a ``result``
    list. Every annotation becomes one CSV row holding the annotator name,
    the three data fields, and one column per label found in its results
    (column name = first label, cell = annotated text). If the same label
    occurs twice in one annotation, the later text overwrites the earlier.

    Args:
        path_file_json: path of the JSON file to read (decoded as UTF-8).
        annotator_name: value written to the ``annotator`` column of each row.
        path_file_csv: path of the CSV file to write (without the index).

    Raises:
        KeyError / IndexError: if an item, annotation or result lacks one of
            the keys described above, or has an empty ``labels`` list.
    """
    with open(path_file_json, "r", encoding='utf-8') as f:
        file_contents = json.load(f)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for item in file_contents:
        data = item['data']
        for annotation in item['annotations']:
            container = {
                'annotator': annotator_name,
                'author': data['author'],
                'sent_id': data['sent_id'],
                'text': data['text'],
            }
            for v in annotation['result']:
                container[v['value']['labels'][0]] = v['value']['text']
            rows.append(container)

    # Labels absent from a given annotation are left as missing values.
    pd.DataFrame(rows).to_csv(path_file_csv, index=False)

# This function checks whether the elements of two lists are pairwise in a
# "contains relation": for two strings str1 and str2, str1 is a substring of
# str2 or str2 is a substring of str1.
def checkContainsRelations(items_1, items_2) :
    """Return True when every positional pair of values is in a contains relation.

    Values are compared by their ``str()`` form. Missing values (``None`` or
    a float NaN) are handled explicitly instead of being stringified: a
    missing value only matches another missing value. Without this, NaN
    becomes the text "nan" and would match any string containing "nan".

    Args:
        items_1: first sequence of values.
        items_2: second sequence of values, compared position by position.

    Returns:
        False if the lengths differ or any pair fails the relation; True
        otherwise (including for two empty sequences).
    """
    def _is_missing(value) :
        # NaN is the only float that is not equal to itself.
        return value is None or (isinstance(value, float) and value != value)

    if len(items_1) != len(items_2) : return False
    for value_1, value_2 in zip(items_1, items_2) :
        missing_1, missing_2 = _is_missing(value_1), _is_missing(value_2)
        if missing_1 or missing_2 :
            # Both missing counts as agreement; only one missing does not.
            if missing_1 != missing_2 : return False
            continue
        text_1, text_2 = str(value_1), str(value_2)
        if text_1 not in text_2 and text_2 not in text_1 : return False
    return True

## This function loads two CSV files and joins them with pandas.
def merged_two_csv(path_csv1, path_csv2, how_mode, onColumns) :
    """Read two CSV files and return their merge as a DataFrame.

    Args:
        path_csv1: path of the left-hand CSV file.
        path_csv2: path of the right-hand CSV file.
        how_mode: join type passed to pandas as ``how``.
        onColumns: column name(s) to join on, passed to pandas as ``on``.

    Returns:
        The merged DataFrame.
    """
    left_frame = pd.read_csv(path_csv1)
    right_frame = pd.read_csv(path_csv2)
    return left_frame.merge(right_frame, how=how_mode, on=onColumns)

def concateItems(item_1, item_2) :
    """Describe two values side by side in a single string.

    A value whose ``str()`` form is "nan" is shown as "MISSING_VALUE".

    Returns:
        "Value1: <first> & Value2: <second>".
    """
    first, second = (
        "MISSING_VALUE" if str(value) == "nan" else str(value)
        for value in (item_1, item_2)
    )
    return "".join(["Value1: ", first, " & Value2: ", second])

def swapedItems(item_1, item_2) :
    """Return the two arguments in reversed order, as a tuple."""
    return item_2, item_1

## Transform the JSON annotations file into a CSV file

In [127]:
# Export the "ousmane" JSON annotations to the CSV path configured above.
annotations_json_to_csv(
    path_file_json=annotations_ousmane_200_json_path,
    annotator_name="ousmane",
    path_file_csv=annotations_ousmane_200_csv_path,
)

## Compute exact agreement; output file => "outputs\\annotations-matched-200.csv"

In [128]:
# Inner-join the two annotators' CSV files on every exact-agreement column,
# so only rows whose values agree on all of those columns are kept, then
# save the result.
df_exact_matched = merged_two_csv(
    path_csv1=annotations_ousmane_200_csv_path,
    path_csv2=annotations_marco_1000_csv_path,
    how_mode="inner",
    onColumns=exact_agreement_columns,
)
df_exact_matched.to_csv(annotations_matched_200_csv_path, index=False)

In [129]:
# Load both annotators' tables and build the pivot: one row per distinct
# (author, sent_id, text) combination found in the ousmane annotations.
df_ousmane = pd.read_csv(annotations_ousmane_200_csv_path)
df_marco = pd.read_csv(annotations_marco_1000_csv_path)
df_set_pivot = df_ousmane[["author", "sent_id", "text"]].drop_duplicates(subset=["author", "sent_id", "text"])

# Label columns compared between the two annotators, in comparison order.
soft_match_labels = ["EVENT", "STATE", "ASP-EVENT", "REP-EVENT"]


def _same_values(values_1, values_2) :
    """Return True when both lists agree position by position.

    Two missing values count as equal. Plain list equality is not used
    because NaN != NaN, which makes the outcome depend on whether both
    cells happen to hold the very same NaN object.
    """
    return all(
        (pd.isna(value_1) and pd.isna(value_2)) or value_1 == value_2
        for value_1, value_2 in zip(values_1, values_2)
    )


# Container for the matched rows: one list per output column.
matched_container = {
    'Author': [],
    'SentenceId': [],
    'EVENT': [],
    'STATE': [],
    'ASP-EVENT': [],
    'REP-EVENT': [],
    'MatchedInfo': [],
    'Text': []
}

# Iterate over the pivot and soft-match the label columns of both annotators.
for _, row in df_set_pivot.iterrows():
    author = row["author"]
    sent_id = row["sent_id"]
    text = row["text"]
    df_x = df_ousmane.loc[(df_ousmane['author'] == author) & (df_ousmane['sent_id'] == sent_id) & (df_ousmane['text'] == text)]
    df_y = df_marco.loc[(df_marco['author'] == author) & (df_marco['sent_id'] == sent_id) & (df_marco['text'] == text)]
    # Drive the outer loop with the larger of the two row sets.
    # NOTE(review): after a swap, "Value1" in the output refers to the marco
    # annotation and "Value2" to the ousmane one -- confirm this is intended.
    if df_x.shape[0] < df_y.shape[0] :
        df_x, df_y = swapedItems(df_x, df_y)
    for _, row_x in df_x.iterrows() :
        items_1 = [row_x[label] for label in soft_match_labels]
        for _, row_y in df_y.iterrows() :
            items_2 = [row_y[label] for label in soft_match_labels]
            if _same_values(items_1, items_2) :
                matched_values = items_1
                matched_info = "Exact Match"
            elif checkContainsRelations(items_1, items_2) :
                # Show both annotators' values unless both are missing.
                matched_values = [
                    concateItems(value_1, value_2)
                    if str(value_1) != "nan" or str(value_2) != "nan"
                    else ""
                    for value_1, value_2 in zip(items_1, items_2)
                ]
                matched_info = "Contains Relation Match"
            else :
                continue
            matched_container['Author'].append(author)
            matched_container['SentenceId'].append(sent_id)
            matched_container['Text'].append(text)
            for label, value in zip(soft_match_labels, matched_values) :
                matched_container[label].append(value)
            matched_container['MatchedInfo'].append(matched_info)
            # Only the first matching row_y is recorded for each row_x.
            break

# Convert the dictionary into a DataFrame.
# NOTE(review): 'Author' is collected above but is not in the column list, so
# it is dropped from the output -- confirm whether that is intended.
df_contains_matched = pd.DataFrame(matched_container, columns = ['SentenceId', 'EVENT', 'STATE', 'ASP-EVENT', 'REP-EVENT', 'MatchedInfo', 'Text'])
# Keep the result: the previous replace() call discarded its return value,
# so missing values were never actually blanked in the frame.
df_contains_matched = df_contains_matched.fillna('')
df_contains_matched.to_csv(annotations_contains_matched_200_csv_path, index=False)