In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [28]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

In [3]:
# Temporarily switch to the 'ggplot' style only to read its colour cycle,
# then go back to the default style for everything else.
plt.style.use('ggplot')
ggplot_cycle = plt.rcParams['axes.prop_cycle']
colors = ggplot_cycle.by_key()['color']
plt.style.use('default')


In [4]:
# Global figure styling for the notebook, applied in one pass.
plt.rcParams.update({
    # Font sizes for tick labels, axis labels and legends.
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'axes.labelsize': 18,
    'legend.fontsize': 14,
    # Reuse the colour list captured in `colors` as the default line cycle.
    'axes.prop_cycle': mpl.cycler(color=colors),
    'savefig.dpi': 300,
    # Hide the right and top axes spines.
    'axes.spines.right': False,
    'axes.spines.top': False,
    # Use a tight bounding box when saving figures.
    'savefig.bbox': 'tight',
})

## Inter-Annotator Agreement (Cohen's kappa)

In [46]:
# Load the two human annotators' label files. Each file is opened in a
# context manager so its handle is closed as soon as the JSON is parsed
# (the bare json.load(open(...)) form leaves the handle open).
naveen_file = "naveen_annotations.json"
jingwu_file = "jingwu_annotations.json"
with open("../../results/annotations/"+naveen_file) as annotation_fh:
    naveen_data = json.load(annotation_fh)
with open("../../results/annotations/"+jingwu_file) as annotation_fh:
    jingwu_data = json.load(annotation_fh)

In [47]:
# Order both annotators' records by id so that position i in each list
# refers to the same annotated item.
naveen_data['annotations'] = sorted(naveen_data['annotations'],key=lambda k: k['id'])
jingwu_data['annotations'] = sorted(jingwu_data['annotations'],key=lambda k: k['id'])

# Compare the id sequences themselves. len() of the loaded top-level dicts
# only counts their top-level keys, so it would pass even when the two
# annotation lists have different lengths or cover different items.
naveen_ids = [record['id'] for record in naveen_data['annotations']]
jingwu_ids = [record['id'] for record in jingwu_data['annotations']]
assert naveen_ids == jingwu_ids, "annotators did not label the same set of ids"

In [48]:
# Cohen's kappa between the two annotators, computed separately per label.
keys = ['inadequate_food', 'donor_problem', 'recipient_problem', 'info_update']
annotator_records = (naveen_data['annotations'], jingwu_data['annotations'])
for label in keys:
    # Pull this label's value out of every record, first annotator first.
    naveen_labels, jingwu_labels = (
        [record[label] for record in records] for records in annotator_records
    )
    print(label, cohen_kappa_score(naveen_labels, jingwu_labels))


inadequate_food 0.9320156755835747
donor_problem 0.6956956956956957
recipient_problem 0.7254335260115607
info_update 0.5730337078651686


## Model Accuracies (F1 against consensus annotations)

In [22]:
# Model annotation files to score, and the consensus file they are scored against.
annotation_files = [
    "gpt_35_annotations.json",
    "gpt_35_annotations_updated.json",
    # "gpt_4_annotations.json",  # currently left out of the comparison
]
consensus_file = "consensus_annotations.json"

In [23]:
# Load the consensus annotations; the context manager closes the file handle
# once the JSON is parsed instead of leaving it open.
with open("../../results/annotations/"+consensus_file) as consensus_fh:
    consensus_data = json.load(consensus_fh)
# Sort by id so the records can be compared positionally with other
# id-sorted annotation lists.
consensus_data['annotations'] = sorted(consensus_data['annotations'],key=lambda k: k['id'])

In [24]:
# Ids of the consensus records, in the list's current order.
all_id = [record['id'] for record in consensus_data['annotations']]

In [27]:
# Score every model annotation file against the consensus labels, printing
# one F1 value per label.
keys = ['inadequate_food', 'donor_problem', 'recipient_problem', 'info_update']
# The consensus labels do not depend on which file is being scored, so
# collect them once instead of once per file.
consensus_values_by_key = {k: [i[k] for i in consensus_data['annotations']] for k in keys}

for annotation_file in annotation_files:
    # Context manager closes the handle after parsing; the previous
    # json.load(open(...)) leaked one open file per iteration.
    with open("../../results/annotations/"+annotation_file) as annotation_fh:
        annotations = json.load(annotation_fh)
    annotations['annotations'] = sorted(annotations['annotations'],key=lambda k: k['id'])
    # Positional comparison below is only valid if both lists cover the
    # same ids in the same order.
    assert [i['id'] for i in annotations['annotations']] == all_id
    # NOTE(review): in the recorded output both files print the same model
    # name, so this header alone does not tell the two runs apart —
    # consider also printing annotation_file.
    print("Model {}".format(annotations['parameters']['model']))

    for k in keys:
        all_annotation_values = [i[k] for i in annotations['annotations']]
        all_consensus_values = consensus_values_by_key[k]

        # Consensus labels are passed first, i.e. as the reference labels.
        f1 = f1_score(all_consensus_values,all_annotation_values)
        print(f1,k)
    print()

Model gpt-3.5 turbo
0.9044585987261146 inadequate_food
0.29032258064516125 donor_problem
0.64 recipient_problem
0.391304347826087 info_update

Model gpt-3.5 turbo
0.8875 inadequate_food
0.2641509433962264 donor_problem
0.368421052631579 recipient_problem
0.41025641025641024 info_update

