In [86]:
import json
from pathlib import Path

In [87]:
annotations = [json.loads(l) for l in Path("human_eval_data.jsonl").read_text().split("\n") if l]
print(len(annotations))
annotations[0]

522


 'meta': {'model': 'hr', 'sentence_index': 0, 'doc_index': 2966},
 '_input_hash': -1302499015,
 '_task_hash': 2026043455,
 'options': [{'id': 'content', 'text': 'content'},
  {'id': 'no_content', 'text': 'no_content'},
  {'id': 'important', 'text': 'important'},
  {'id': 'not_important', 'text': 'not_important'}],
 '_session_id': 'hiporank3-a2',
 '_view_id': 'choice',
 'accept': ['content', 'important'],
 'answer': 'accept'}

In [89]:
samples = [json.loads(l) for l in Path("human_eval_samples.jsonl").read_text().split("\n") if l]
print(len(samples))
samples[0]

281


 'meta': {'model': 'hr', 'sentence_index': 0, 'doc_index': 2966}}

In [90]:
annotator_data = dict(
    hr=dict(
        a1=dict(important=[],content=[]),
        a2=dict(important=[],content=[]),
        a3=dict(important=[],content=[]),
        a4=dict(important=[],content=[]),
    ),
    ps=dict(
        a1=dict(important=[],content=[]),
        a2=dict(important=[],content=[]),
        a3=dict(important=[],content=[]),
        a4=dict(important=[],content=[]),
    )
)
for s in samples:
    text = s['text'].replace("\n", " ").replace("========================", "\n=======================\n")
    model = s['meta']['model']
    answers = [a for a in annotations if a['text'] == text]
    for answer in answers:
        annotator = answer['_session_id'].split("-")[1]
        for a in answer['accept']:
            if a == "important":
                annotator_data[model][annotator]["important"] += [1]
            elif a == "not_important":
                annotator_data[model][annotator]["important"] += [0]
            elif a == "content":
                annotator_data[model][annotator]["content"] += [1]
            elif a == "no_content":
                annotator_data[model][annotator]["content"] += [0]
            

In [91]:
# human eval results (% yes)
model_results = {}
for model, annotators in annotator_data.items():
    model_results[model] = dict(important=0,not_important=0,total_important=0,
                                content=0,not_content=0,total_content=0)
    for annotator, measures in annotators.items():
        for measure, counts in measures.items():
            pos = sum(counts)
            total = len(counts)
            neg = total - pos
            model_results[model][measure] += pos
            model_results[model][f"not_{measure}"] += neg
            model_results[model][f"total_{measure}"] += total
            
print("hr importance: ", model_results['hr']['important'] / model_results['hr']['total_important'])
print("hr content: ", model_results['hr']['content'] / model_results['hr']['total_content'])
print("ps importance: ", model_results['ps']['important'] / model_results['ps']['total_important'])
print("ps content: ", model_results['ps']['content'] / model_results['ps']['total_content'])

hr importance:  0.5905511811023622
hr content:  0.421259842519685
ps importance:  0.487012987012987
ps content:  0.3051948051948052


In [92]:
# Cohen's kappa
# using a for yy, b for yn, c for ny, and d for nn
k = dict(
    important_a1a2=dict(a=0,b=0,c=0,d=0),
    content_a1a2=dict(a=0,b=0,c=0,d=0),
    important_a3a4=dict(a=0,b=0,c=0,d=0),
    content_a3a4=dict(a=0,b=0,c=0,d=0),
)

# annotators a1/a2
# important
for model in ["hr", "ps"]:
    for annotator_pair in [("a1","a2"),("a3","a4")]:
        for measure in ["important", "content"]:
            l1 = annotator_data[model][annotator_pair[0]][measure]
            l2 = annotator_data[model][annotator_pair[1]][measure]
            assert len(l1) == len(l2)
            k_key = f"{measure}_{''.join(annotator_pair)}"
            for i1,i2 in zip(l1,l2):
                if i1 == i2 and i1 == 1:
                    k[k_key]["a"] += 1
                elif i1 == i2 and i1 == 0:
                    k[k_key]["d"] += 1
                elif i1 == 1:
                    k[k_key]["b"] += 1
                else:
                    k[k_key]["c"] += 1

for measure in ["important","content"]:
    for annotator_pair in [("a1a2"),("a3a4")]:
        k_key = f"{measure}_{annotator_pair}"
        total = k[k_key]["a"] + k[k_key]["b"] + k[k_key]["c"] + k[k_key]["d"]
        k[k_key]["p_o"] = (k[k_key]["a"] + k[k_key]["d"]) / total
        k[k_key]["p_yes"] = (k[k_key]["a"] + k[k_key]["b"]) / total
        k[k_key]["p_yes"] *= (k[k_key]["a"] + k[k_key]["c"]) / total
        k[k_key]["p_no"] = (k[k_key]["c"] + k[k_key]["d"]) / total
        k[k_key]["p_no"] *= (k[k_key]["b"] + k[k_key]["d"]) / total
        k[k_key]["p_e"] = k[k_key]["p_yes"] + k[k_key]["p_no"]
        k[k_key]["k"] = (k[k_key]["p_o"] - k[k_key]["p_e"]) / (1 - k[k_key]["p_e"])
        print(f"k ({measure},{annotator_pair}) = {k[k_key]['k']}")

k (important,a1a2) = 0.5004133370074402
k (important,a3a4) = 0.3351708930540242
k (content,a1a2) = 0.505050505050505
k (content,a3a4) = 0.4118254583010586


In [93]:
# Fleiss's kappa
# build table
import pandas as pd
fleiss = {
    "i": pd.DataFrame(),
    "c": pd.DataFrame(),
}

fleiss["i"]["y"] = annotator_data["hr"]["a1"]["important"] + annotator_data["ps"]["a1"]["important"] + \
                    annotator_data["hr"]["a3"]["important"] + annotator_data["ps"]["a3"]["important"]
fleiss["i"]["y"] += annotator_data["hr"]["a2"]["important"] + annotator_data["ps"]["a2"]["important"] + \
                    annotator_data["hr"]["a4"]["important"] + annotator_data["ps"]["a4"]["important"]
fleiss["i"]["n"] = 2 - fleiss["i"]["y"]


fleiss["c"]["y"] = annotator_data["hr"]["a1"]["content"] + annotator_data["ps"]["a1"]["content"] + \
                    annotator_data["hr"]["a3"]["content"] + annotator_data["ps"]["a3"]["content"]
fleiss["c"]["y"] += annotator_data["hr"]["a2"]["content"] + annotator_data["ps"]["a2"]["content"] + \
                    annotator_data["hr"]["a4"]["content"] + annotator_data["ps"]["a4"]["content"]
fleiss["c"]["n"] = 2 - fleiss["c"]["y"]

n = 2 # number of raters
k = 2 # number of classes
assert len(fleiss["c"]) == len(fleiss["i"])
N = len(fleiss["c"]) # number of subjects

fleiss["i_pi"] = [x/fleiss["i"].sum().sum() for x in fleiss["i"].sum()]
fleiss["c_pi"] = [x/fleiss["c"].sum().sum() for x in fleiss["c"].sum()]

fleiss["i"]["P_i"] = (1/(n*(n-1))) * (fleiss["i"]["y"] ** 2 + fleiss["i"]["n"] ** 2 - n)
fleiss["c"]["P_i"] = (1/(n*(n-1))) * (fleiss["c"]["y"] ** 2 + fleiss["c"]["n"] ** 2 - n)

fleiss["i_P"] = fleiss["i"]["P_i"].mean()
fleiss["c_P"] = fleiss["c"]["P_i"].mean()

fleiss["i_Pe"] = sum([x**2 for x in fleiss["i_pi"]])
fleiss["c_Pe"] = sum([x**2 for x in fleiss["c_pi"]])

fleiss["k"] = {}
fleiss["k"]["i"] = (fleiss["i_P"] - fleiss["i_Pe"]) / (1 - fleiss["i_Pe"])
fleiss["k"]["c"] = (fleiss["c_P"] - fleiss["c_Pe"]) / (1 - fleiss["c_Pe"])
print(fleiss["k"])

{'i': 0.41368956743002566, 'c': 0.46558068383842566}


In [97]:
# statistical significance

In [94]:
# Example of the Student's t-test
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = mannwhitneyu(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')


stat=40.000, p=0.236
Probably the same distribution


In [95]:
stat, p = mannwhitneyu(df_hr['content'].tolist(), df_ps['content'].tolist())
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=34576.000, p=0.002
Probably different distributions


In [96]:
stat, p = mannwhitneyu(df_hr['important'].tolist(), df_ps['important'].tolist())
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=35066.000, p=0.007
Probably different distributions
