In [78]:
import os
import sys

# Root of the sem-covid checkout. The relative data paths used in the cells
# below ("sem_covid/entrypoints/...") are resolved against this directory.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

# Make the project root importable. Duplicates are dropped with dict.fromkeys
# instead of set(): a set discards the order of sys.path, and that order decides
# which copy of a module wins when the same name exists in several entries.
sys.path.append(PROJECT_ROOT)
sys.path = list(dict.fromkeys(sys.path))

# Work from the project root so that the relative paths resolve.
os.chdir(PROJECT_ROOT)

In [79]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

### Load data

In [80]:
# Annotation sheets of the first (EG) and second (CP) annotator.
# Both paths are relative to the current working directory.
df1 = pd.read_csv("sem_covid/entrypoints/notebooks/semantic_similarity_evaluation/annotator1_EG.csv")
df2 = pd.read_csv("sem_covid/entrypoints/notebooks/semantic_similarity_evaluation/annotator2_CP.csv")

In [81]:
# Both annotation sheets must expose the same columns in the same order.
# Index.equals also copes with a differing number of columns, a case in which
# the element-wise `==` raises ValueError instead of failing the assertion.
assert df1.columns.equals(df2.columns), (
    f"Annotator files have different columns: {list(df1.columns)} vs {list(df2.columns)}"
)

In [82]:
# First annotator: strip embedded spaces from the Q1 labels so that they compare
# equal to 'ref1' / 'ref2' / 'none'. The vectorised .str accessor leaves missing
# values untouched instead of raising AttributeError on them.
df1["Q1"] = df1["Q1"].str.replace(" ", "", regex=False)

# Second annotator: unanswered Q3 cells are filled with 1, then the column is
# cast to integers.
df2["Q3"] = df2["Q3"].fillna(1).astype(int)

In [83]:
def shift_range(x):
    """Collapse a rating onto three levels around the midpoint 3.

    :param x: numeric rating (presumably on a 1-5 scale -- TODO confirm).
    :return: 1 if ``x`` is below 3, 3 if ``x`` is above 3, otherwise 2.
    """
    if x < 3:
        return 1
    if x > 3:
        return 3
    # Neither below nor above 3: the midpoint itself. NaN also lands here,
    # because both comparisons are False for it.
    return 2


# Bring the second annotator's Q2 and Q3 ratings onto the three-level scale.
# Run this cell only once per kernel session: shift_range is not idempotent
# (3 -> 2 -> 1), so re-running it collapses the ratings further.
df2["Q2"] = df2["Q2"].map(shift_range)
df2["Q3"] = df2["Q3"].map(shift_range)

### Define hypothesis

In [84]:
def test_hypothesis_1(data: pd.DataFrame):
    counter = 0
    for iter, row in data.iterrows():
        if row.Q1 == 'ref1':
            if (row.Q2 >= 2) and (row.Q3 <= 2):
                counter += 1
        elif row.Q1 == 'ref2':
            if (row.Q2 <= 2) and (row.Q3 >= 2):
                counter += 1
        elif row.Q1 == 'none':
            if (row.Q2 <= 2) and (row.Q3 <= 2):
                counter += 1
    return counter / len(data)


In [85]:
def test_hypothesis_2(data: pd.DataFrame):
    counter_ac = 0
    counter_ab = 0
    # A B C
    # B C D
    # C D A

    for i in range(0, len(data), 3):
        ab = data.iloc[i]['Q2']
        ac = data.iloc[i]['Q3']
        bc = data.iloc[i + 1]['Q2']
        bd = data.iloc[i + 1]['Q3']
        cd = data.iloc[i + 2]['Q2']
        ca = data.iloc[i + 2]['Q3']
        if ac == ca:
            counter_ac += 1
        if bc == ab:
            counter_ab += 1
    return counter_ac / (len(data) // 3), counter_ab / (len(data) // 3)

In [86]:
def test_hypothesis_3(data1: pd.DataFrame, data2: pd.DataFrame):
    """Compute Cohen's kappa between two annotation frames, one score per question.

    ``cohen_kappa_score`` is called with its default arguments on the Q1, Q2 and
    Q3 columns of both frames.

    :param data1: annotations of one annotator (columns Q1, Q2, Q3).
    :param data2: annotations of the other annotator (same columns).
    :return: frame with the single column "Agreement rate", indexed by "Q1", "Q2", "Q3".
    """
    kappa_per_question = {
        question: cohen_kappa_score(data1[question], data2[question])
        for question in ("Q1", "Q2", "Q3")
    }
    return pd.DataFrame({"Agreement rate": kappa_per_question})


### Automatic semantic similarity evaluation

In [87]:
# Similarity lookup table; it is indexed below as .loc[target_id, ref_id].
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
lookup_similarity_map = pd.read_pickle(
    "sem_covid/entrypoints/notebooks/semantic_similarity_evaluation/unified_dataset_similarity_matrix.pkl")
# Cast the row and column labels to int.
lookup_similarity_map.columns = [int(label) for label in lookup_similarity_map.columns.values]
lookup_similarity_map.index = [int(label) for label in lookup_similarity_map.index.values]
# Spot check: the integer label 333 must be present among the columns.
333 in lookup_similarity_map.columns.values

True

In [88]:
# The machine "annotator" starts as a copy of the first annotator's sheet;
# its Q1, Q2 and Q3 answers are overwritten in the next cell.
machine_df = df1.copy()

In [89]:
# Two similarity scores that differ by at most this much are treated as a tie.
SIMILARITY_TIE_TOLERANCE = 0.0001

# Derive the machine answers from the similarity matrix:
#   tie                      -> Q1 = 'none', Q2 = 2, Q3 = 2
#   ref1 more similar        -> Q1 = 'ref1', Q2 = 3, Q3 = 1
#   otherwise (ref2 / NaN)   -> Q1 = 'ref2', Q2 = 1, Q3 = 3
for index, row in machine_df.iterrows():
    target_id = row['target_id']
    ref1_id = row['ref1_id']
    ref2_id = row['ref2_id']
    sim_target_ref_1 = lookup_similarity_map.loc[target_id, ref1_id]
    sim_target_ref_2 = lookup_similarity_map.loc[target_id, ref2_id]

    if np.abs(sim_target_ref_1 - sim_target_ref_2) <= SIMILARITY_TIE_TOLERANCE:
        q1_answer, q2_answer, q3_answer = 'none', 2, 2
    elif sim_target_ref_1 > sim_target_ref_2:
        q1_answer, q2_answer, q3_answer = 'ref1', 3, 1
    else:
        q1_answer, q2_answer, q3_answer = 'ref2', 1, 3

    machine_df.at[index, 'Q1'] = q1_answer
    machine_df.at[index, 'Q2'] = q2_answer
    machine_df.at[index, 'Q3'] = q3_answer


# Inter-annotator evaluation
* First annotator: EG
* Second annotator: CP
* Third annotator: USE (automatic annotator derived from the similarity matrix)

### T1: Are Q1, Q2 and Q3 coherent with one another? Question cross-checking.

In [90]:
# Share of rows whose Q1 answer is coherent with the Q2/Q3 ratings, per annotator
# (df1 = EG, df2 = CP, machine_df = answers derived from the similarity matrix).
t1_df1 = test_hypothesis_1(df1)
t1_df2 = test_hypothesis_1(df2)
t1_df3 = test_hypothesis_1(machine_df)
print(f"Result for the first annotator: {t1_df1}", )
print(f"Result for the second annotator: {t1_df2}", )
print(f"Result for the third annotator: {t1_df3}", )

Result for the first annotator: 0.9833333333333333
Result for the second annotator: 0.6333333333333333
Result for the third annotator: 1.0


### T2: Are responses of the 4 groups consistent with one another? Row cross-checking.

In [91]:
# Row cross-check per annotator. Each result is a pair:
# (share of three-row groups with ac == ca, share of groups with bc == ab).
t2_df1 = test_hypothesis_2(df1)
t2_df2 = test_hypothesis_2(df2)
t2_df3 = test_hypothesis_2(machine_df)
print("Result for the first annotator: ", t2_df1)
print("Result for the second annotator: ", t2_df2)
print("Result for the third annotator: ", t2_df3)

Result for the first annotator:  (0.8, 0.5)
Result for the second annotator:  (0.65, 0.55)
Result for the third annotator:  (0.6, 0.5)


### T3: What is the inter-annotator agreement between each pair of evaluation sets (per question Q1, Q2, Q3)?
[Interpretation of Cohen's kappa score](https://i.stack.imgur.com/kYNd6.png)

#### First and second annotators

In [92]:
# Cohen's kappa per question: first annotator (EG) vs second annotator (CP).
test_hypothesis_3(df1, df2)

Unnamed: 0,Agreement rate
Q1,0.081747
Q2,0.039301
Q3,0.123851


#### First and third annotators

In [93]:
# Cohen's kappa per question: first annotator (EG) vs the machine annotator.
test_hypothesis_3(df1, machine_df)

Unnamed: 0,Agreement rate
Q1,0.123596
Q2,0.113924
Q3,0.0


#### Second and third annotators

In [94]:
# Cohen's kappa per question: second annotator (CP) vs the machine annotator.
test_hypothesis_3(df2, machine_df)

Unnamed: 0,Agreement rate
Q1,0.166667
Q2,0.098592
Q3,-0.038961
