In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell runs so edits to imported .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [19]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, cohen_kappa_score
from collections import Counter
import pandas as pd

In [3]:
# Temporarily activate the 'ggplot' style sheet only to read its colour cycle.
plt.style.use('ggplot')
# List of the colour values in the currently active property cycle.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Revert to the default style; `colors` still holds the ggplot palette captured above.
plt.style.use('default')


In [4]:
# Notebook-wide matplotlib defaults, applied in a single update call.
plt.rcParams.update({
    # Font sizes for tick labels, axis labels and legends.
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'axes.labelsize': 18,
    'legend.fontsize': 14,
    # Cycle line/marker colours through the palette stored in `colors`.
    'axes.prop_cycle': mpl.cycler(color=colors),
    # Saved figures: 300 dpi, bounding box trimmed tightly around the content.
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
    # Hide the right and top axes spines.
    'axes.spines.right': False,
    'axes.spines.top': False,
})

## Inter-Annotator Agreement

In [5]:
# File names of the two annotation sets compared in this section.
naveen_file = "naveen_annotations.json"
jingwu_file = "jingwu_annotations.json"

# Parse each file inside a `with` block so the file handle is closed as soon
# as the JSON has been read, instead of being left open until garbage
# collection.
with open("../../results/annotations/"+naveen_file) as naveen_handle:
    naveen_data = json.load(naveen_handle)
with open("../../results/annotations/"+jingwu_file) as jingwu_handle:
    jingwu_data = json.load(jingwu_handle)

In [6]:
# Order both annotation lists by record id so that position i in one list
# refers to the same item as position i in the other.
naveen_data['annotations'] = sorted(naveen_data['annotations'],key=lambda k: k['id'])
jingwu_data['annotations'] = sorted(jingwu_data['annotations'],key=lambda k: k['id'])

# Check the annotation lists themselves. len() on the top-level dicts only
# counts their keys, so it cannot detect a mismatch in the number of records.
assert len(jingwu_data['annotations']) == len(naveen_data['annotations']), \
    "The two annotation files contain a different number of records"
# The agreement computation pairs records by position, so the ids must line
# up one-to-one after sorting.
assert [a['id'] for a in jingwu_data['annotations']] == [a['id'] for a in naveen_data['annotations']], \
    "The two annotation files do not cover the same record ids"

In [7]:
# Label fields for which agreement between the two annotation sets is reported.
keys = ['inadequate_food', 'donor_problem', 'recipient_problem', 'info_update']

naveen_records = naveen_data['annotations']
jingwu_records = jingwu_data['annotations']

for k in keys:
    # Values of field `k`, in list order, for each annotation set.
    all_naveen_values = [record[k] for record in naveen_records]
    all_jingwu_values = [record[k] for record in jingwu_records]
    kappa = cohen_kappa_score(all_naveen_values, all_jingwu_values)
    print(k,kappa)


inadequate_food 0.9320156755835747
donor_problem 0.6956956956956957
recipient_problem 0.7254335260115607
info_update 0.5730337078651686


## Evaluating LLMs

In [14]:
# Column names of the individual labels evaluated in this section.
tasks = [
    'recipient_problem',
    'inadequate_food',
    'donor_problem',
    'direction_problem',
    'earlier_pickup',
    'system_problem',
    'update_contact',
]


In [38]:
# Date range that selects which annotation / report CSVs are read below.
start_date = "2024-05-20"
end_date = "2024-05-26"

train_labels = pd.read_csv(
    '../../results/annotations/naveen_hierarchical_annotations_{}_{}.csv'.format(start_date,end_date)
)
train_predictions = pd.read_csv(
    '../../results/reports/labeled_feedbacks_{}_{}.csv'.format(start_date,end_date)
)

# Cast the prediction columns to int before they are compared with the labels.
train_predictions[tasks] = train_predictions[tasks].astype(int)

# Aggregate columns: row-wise max over the donor-related subset and over all tasks.
donor_related_tasks = ['inadequate_food', 'donor_problem','earlier_pickup']
for frame in (train_predictions, train_labels):
    frame['any_donor_problem'] = frame[donor_related_tasks].max(axis=1)
    frame['any_problem'] = frame[tasks].max(axis=1)

In [39]:
# Print precision, recall and accuracy of the predictions against the labels
# for every individual task plus the two aggregate columns.
train_eval_columns = tasks + ['any_donor_problem','any_problem']
for task in train_eval_columns:
    labels = train_labels[task]
    preds = train_predictions[task]

    # Evaluate the three scorers in order on the same (labels, preds) pair.
    precision, recall, accuracy = (
        scorer(labels, preds)
        for scorer in (precision_score, recall_score, accuracy_score)
    )
    print("For task {}, precision {}, recall {}, accuracy {}".format(task,precision,recall,accuracy))

For task recipient_problem, precision 0.1111111111111111, recall 0.8, accuracy 0.7380952380952381
For task inadequate_food, precision 0.85, recall 0.9444444444444444, accuracy 0.9682539682539683
For task donor_problem, precision 0.25, recall 0.7777777777777778, accuracy 0.8174603174603174
For task direction_problem, precision 0.2857142857142857, recall 1.0, accuracy 0.9206349206349206
For task earlier_pickup, precision 0.75, recall 1.0, accuracy 0.9841269841269841
For task system_problem, precision 1.0, recall 1.0, accuracy 1.0
For task update_contact, precision 0.2222222222222222, recall 1.0, accuracy 0.9444444444444444
For task any_donor_problem, precision 0.6829268292682927, recall 0.9032258064516129, accuracy 0.873015873015873
For task any_problem, precision 0.7307692307692307, recall 0.926829268292683, accuracy 0.8650793650793651


In [32]:
# Date range that selects which annotation / report CSVs are read below.
start_date = "2024-05-13"
end_date = "2024-05-19"

validation_labels = pd.read_csv(
    '../../results/annotations/naveen_hierarchical_annotations_{}_{}.csv'.format(start_date,end_date)
)
validation_predictions = pd.read_csv(
    '../../results/reports/labeled_feedbacks_{}_{}.csv'.format(start_date,end_date)
)

# Cast the prediction columns to int before they are compared with the labels.
validation_predictions[tasks] = validation_predictions[tasks].astype(int)

# Aggregate column: row-wise max over the donor-related subset of tasks.
donor_related_tasks = ['inadequate_food', 'donor_problem','earlier_pickup']
for frame in (validation_predictions, validation_labels):
    frame['any_donor_problem'] = frame[donor_related_tasks].max(axis=1)

In [33]:
# Print precision, recall and accuracy of the predictions against the labels
# for every individual task plus the donor aggregate column.
validation_eval_columns = tasks + ['any_donor_problem']
for task in validation_eval_columns:
    labels = validation_labels[task]
    preds = validation_predictions[task]

    # Evaluate the three scorers in order on the same (labels, preds) pair.
    precision, recall, accuracy = (
        scorer(labels, preds)
        for scorer in (precision_score, recall_score, accuracy_score)
    )
    print("For task {}, precision {}, recall {}, accuracy {}".format(task,precision,recall,accuracy))

For task recipient_problem, precision 0.30434782608695654, recall 1.0, accuracy 0.873015873015873
For task inadequate_food, precision 0.5925925925925926, recall 1.0, accuracy 0.9126984126984127
For task donor_problem, precision 0.17647058823529413, recall 1.0, accuracy 0.8888888888888888
For task direction_problem, precision 0.4, recall 1.0, accuracy 0.9523809523809523
For task earlier_pickup, precision 0.2857142857142857, recall 1.0, accuracy 0.9603174603174603
For task system_problem, precision 0.6, recall 1.0, accuracy 0.9841269841269841
For task update_contact, precision 0.16666666666666666, recall 1.0, accuracy 0.9603174603174603
For task any_donor_problem, precision 0.5135135135135135, recall 1.0, accuracy 0.8571428571428571


## Model Accuracies (F1 Against Consensus Annotations)

In [22]:
# Annotation files to score against the consensus file.
annotation_files = [
    "gpt_35_annotations.json",
    "gpt_35_annotations_updated.json",
    # "gpt_4_annotations.json",  # currently disabled
]
# File holding the consensus annotations used as the reference.
consensus_file = "consensus_annotations.json"

In [23]:
# Read the consensus annotations inside a `with` block so the file handle is
# closed as soon as the JSON has been parsed.
with open("../../results/annotations/"+consensus_file) as consensus_handle:
    consensus_data = json.load(consensus_handle)
# Sort by record id so the list order is deterministic for positional comparison.
consensus_data['annotations'] = sorted(consensus_data['annotations'],key=lambda k: k['id'])

In [24]:
# Ids of the consensus records, in the list's current order.
all_id = [record['id'] for record in consensus_data['annotations']]

In [27]:
# Label fields scored against the consensus annotations. Defined once, since
# the list does not change between files.
keys = ['inadequate_food', 'donor_problem', 'recipient_problem', 'info_update']
# Consensus values per field, also computed once rather than once per file.
consensus_values_by_key = {
    k: [record[k] for record in consensus_data['annotations']] for k in keys
}

for annotation_file in annotation_files:
    # Close the file handle right after parsing instead of leaking it.
    with open("../../results/annotations/"+annotation_file) as annotation_handle:
        annotations = json.load(annotation_handle)
    annotations['annotations'] = sorted(annotations['annotations'],key=lambda k: k['id'])
    # F1 below pairs records by position, so the ids must match the consensus ids exactly.
    assert [i['id'] for i in annotations['annotations']] == all_id, \
        "{} does not cover the same record ids as the consensus file".format(annotation_file)
    print("Model {}".format(annotations['parameters']['model']))

    for k in keys:
        all_annotation_values = [i[k] for i in annotations['annotations']]
        all_consensus_values = consensus_values_by_key[k]

        # Consensus values are the ground truth (first argument); file values are the predictions.
        f1 = f1_score(all_consensus_values,all_annotation_values)
        print(f1,k)
    print()

Model gpt-3.5 turbo
0.9044585987261146 inadequate_food
0.29032258064516125 donor_problem
0.64 recipient_problem
0.391304347826087 info_update

Model gpt-3.5 turbo
0.8875 inadequate_food
0.2641509433962264 donor_problem
0.368421052631579 recipient_problem
0.41025641025641024 info_update



## Trip Instructions

In [6]:
# Read the scored instructions inside a `with` block so the file handle is
# closed as soon as the JSON has been parsed.
with open("../../results/annotations/scored_instructions.json") as scored_handle:
    annotated_json = json.load(scored_handle)

In [11]:
# Number of scored records per 'category' value.
category_count = Counter(record['category'] for record in annotated_json)
category_count

Counter({'logistics': 3, 'contact': 7, 'unhelpful': 1, 'directions': 4})

In [12]:
# Display the first scored record to show which fields each entry carries.
annotated_json[0]

{'side': 'donor',
 'location_id': 123,
 'comment': 'Giant eagle says only staurday Sunday and monday for 412 food rescue.   They said remove them from other days.  \r\n',
 'old_instruction': "Please enter the 412 Food Rescue PIN into scanner: 65428. Please do not share the PIN # with staff--it is our electronic signature! . Please call store when you're on your way to confirm donation. Ask for Bakery to confirm a donation. ",
 'new_instruction': "Please enter the 412 Food Rescue PIN into scanner: 65428. Please do not share the PIN # with staff--it is our electronic signature! . Please call store when you're on your way to confirm donation. Ask for Bakery to confirm a donation. Only available for pickup on Saturday, Sunday, and Monday.",
 'informativeness': 1,
 'clarity': 1,
 'helpfulness': 1,
 'truthfulness': 1,
 'category': 'logistics'}

In [14]:
# Score fields summarised below.
metrics = ['informativeness','clarity','helpfulness','truthfulness']

record_total = len(annotated_json)
for m in metrics:
    # Fraction of records whose value for field `m` equals 1.
    value_counts = Counter(record[m] for record in annotated_json)
    print(m,value_counts[1]/record_total)

informativeness 1.0
clarity 0.9333333333333333
helpfulness 0.9333333333333333
truthfulness 1.0
