In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, cohen_kappa_score
from collections import Counter
import pandas as pd

In [3]:
plt.style.use('ggplot')
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.style.use('default')


In [4]:
plt.rcParams['xtick.labelsize'] = 18
plt.rcParams['ytick.labelsize'] = 18
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['legend.fontsize'] = 14
plt.rcParams['axes.prop_cycle'] = mpl.cycler(color=colors)
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

plt.rcParams['savefig.bbox'] = 'tight'

## Inter-Annotator Agreement

In [5]:
tasks = ['recipient_problem', 'inadequate_food', 'donor_problem', 
            'direction_problem','earlier_pickup','system_problem',
            'update_contact']


In [6]:
naveen_file = "naveen_hierarchical_annotations_2024-05-13_2024-05-19.csv"
jingwu_file = "jingwu_hierarchical_annotations_2024-05-13_2024-05-19.csv"
naveen_data = pd.read_csv("../../results/annotations/"+naveen_file)
jingwu_data = pd.read_csv("../../results/annotations/"+jingwu_file)
jingwu_data = jingwu_data.fillna(0)

In [58]:
naveen_data['any_donor_problem'] = naveen_data[['inadequate_food', 'donor_problem','earlier_pickup']].max(axis=1)
jingwu_data['any_donor_problem'] = jingwu_data[['inadequate_food', 'donor_problem','earlier_pickup']].max(axis=1)

naveen_data['any_system_problem'] = naveen_data[['direction_problem', 'system_problem','update_contact']].max(axis=1)
jingwu_data['any_system_problem'] = jingwu_data[['direction_problem', 'system_problem','update_contact']].max(axis=1)


In [59]:
jingwu_data = jingwu_data.set_index('id').reindex(naveen_data['id']).reset_index()
assert len(jingwu_data) == len(naveen_data)

In [63]:
kappa_scores = {}
for task in tasks+['any_donor_problem','any_system_problem']:
    preds = jingwu_data[task]
    labels = naveen_data[task]

    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)

    kappa = cohen_kappa_score(labels,preds)
    kappa_scores[task] = kappa 

    print("For task {}, precision {}, recall {}, F1 {}, kappa {}".format(task,precision,recall,(precision+recall)/2,kappa))


For task recipient_problem, precision 1.0, recall 0.42857142857142855, F1 0.7142857142857143, kappa 0.5862068965517241
For task inadequate_food, precision 0.6956521739130435, recall 1.0, F1 0.8478260869565217, kappa 0.7888942077549066
For task donor_problem, precision 0.375, recall 1.0, F1 0.6875, kappa 0.5291479820627802
For task direction_problem, precision 0.75, recall 0.75, F1 0.75, kappa 0.7418032786885246
For task earlier_pickup, precision 0.6666666666666666, recall 1.0, F1 0.8333333333333333, kappa 0.7961165048543689
For task system_problem, precision 0.4, recall 0.6666666666666666, F1 0.5333333333333333, kappa 0.48466257668711665
For task update_contact, precision 1.0, recall 1.0, F1 1.0, kappa 1.0
For task any_donor_problem, precision 0.6129032258064516, recall 1.0, F1 0.8064516129032258, kappa 0.7048028114017961
For task any_system_problem, precision 0.7, recall 0.875, F1 0.7875, kappa 0.760910815939279


In [70]:
table = table = """\\begin{{table*}}[]
\\scalebox{{0.85}}{{
\\begin{{tabular}}{{@{{}}lllll@{{}}}}
\\toprule
 & \\multicolumn{{4}}{{c}}{{Donor Issues (F1)}}      \\\\ \\cmidrule(lr){{2-5}}
                       & Inadequate Food & Earlier Pickup & Other Donor Problem & Any Donor Issue \\\\ \\midrule
Kappa Score                & {}            & {}         & {}              & {}             \\\\ \\bottomrule
\\end{{tabular}}
}}
\\end{{table*}}

\\begin{{table*}}[]
\\scalebox{{0.85}}{{
\\begin{{tabular}}{{@{{}}lccccc@{{}}}}
\\toprule
 & \\multicolumn{{1}}{{c}}{{Recipient Issues (F1)}} & \\multicolumn{{4}}{{c}}{{Logistical Issues (F1)}}                \\\\ \\cmidrule(lr){{2-2}} \\cmidrule(lr){{3-6}}
                       & Any Recipient Issue   & Direction Problem & System Problem & Update Contact & Any Logistical Issue    \\\\ \\midrule
Kappa Score               & {}                & {}            & {}          & {}        & {}           \\\\ \\bottomrule
\\end{{tabular}}
}}
\\end{{table*}}
"""

In [71]:
all_formats = []
task_list = ['inadequate_food','earlier_pickup','donor_problem','any_donor_problem']#
for task in task_list:
    all_formats.append("%.2f"% kappa_scores[task])
task_list = ['recipient_problem','direction_problem','system_problem','update_contact','any_system_problem']
for task in task_list:
    all_formats.append("%.2f"% kappa_scores[task])
print(table.format(*all_formats))

\begin{table*}[]
\scalebox{0.85}{
\begin{tabular}{@{}lllll@{}}
\toprule
 & \multicolumn{4}{c}{Donor Issues (F1)}      \\ \cmidrule(lr){2-5}
                       & Inadequate Food & Earlier Pickup & Other Donor Problem & Any Donor Issue \\ \midrule
Kappa Score                & 0.79            & 0.80         & 0.53              & 0.70             \\ \bottomrule
\end{tabular}
}
\end{table*}

\begin{table*}[]
\scalebox{0.85}{
\begin{tabular}{@{}lccccc@{}}
\toprule
 & \multicolumn{1}{c}{Recipient Issues (F1)} & \multicolumn{4}{c}{Logistical Issues (F1)}                \\ \cmidrule(lr){2-2} \cmidrule(lr){3-6}
                       & Any Recipient Issue   & Direction Problem & System Problem & Update Contact & Any Logistical Issue    \\ \midrule
Kappa Score               & 0.59                & 0.74            & 0.48          & 1.00        & 0.76           \\ \bottomrule
\end{tabular}
}
\end{table*}



## Evaluating LLMs

In [10]:
start_date = "2024-05-20"
end_date = "2024-05-26"

data = {}

all_predictions = {}

for model in ['gpt-4o-mini']:
    print()
    print("Model {}".format(model))
    data[model] = {}
    validation_labels = pd.read_csv('../../results/annotations/naveen_hierarchical_annotations_{}_{}.csv'.format(start_date,end_date))
    validation_predictions = pd.read_csv('../../results/reports/labeled_feedbacks_{}_{}_{}.csv'.format(start_date,end_date,model))
    validation_predictions = validation_predictions.fillna(0)
    validation_predictions[tasks] = validation_predictions[tasks].astype(int)
    validation_predictions['any_donor_problem'] = validation_predictions[['inadequate_food', 'donor_problem','earlier_pickup']].max(axis=1)
    validation_labels['any_donor_problem'] = validation_labels[['inadequate_food', 'donor_problem','earlier_pickup']].max(axis=1)
    validation_labels['any_problem'] = validation_labels[tasks].max(axis=1)
    validation_predictions['any_problem'] = validation_predictions[tasks].max(axis=1)
    validation_predictions = validation_predictions.set_index('rescue_id').reindex(validation_labels['id']).reset_index()

    all_predictions[model] = {}
    if model == 'gpt-4o':
        data['ensemble'] = {}

    for task in tasks+['any_donor_problem']:
        preds = validation_predictions[task]
        labels = validation_labels[task]

        if model == 'gpt-4o':
            combined_preds = preds * all_predictions['gpt-4o-mini'][task] * all_predictions['gpt-3.5-turbo'][task]
            precision = precision_score(labels, combined_preds)
            recall = recall_score(labels, combined_preds)
            data['ensemble'][task] = (precision+recall)/2



        all_predictions[model][task] = preds
        
        precision = precision_score(labels, preds)
        recall = recall_score(labels, preds)
        data[model][task] = (precision+recall)/2
        accuracy = accuracy_score(labels, preds)
        print("For task {}, precision {}, recall {}, F1 {}, accuracy {}".format(task,precision,recall,(precision+recall)/2,accuracy))


Model gpt-4o-mini
For task recipient_problem, precision 0.3, recall 0.6, F1 0.44999999999999996, accuracy 0.9285714285714286
For task inadequate_food, precision 0.72, recall 1.0, F1 0.86, accuracy 0.9444444444444444
For task donor_problem, precision 0.5, recall 0.7777777777777778, F1 0.6388888888888888, accuracy 0.9285714285714286
For task direction_problem, precision 0.5714285714285714, recall 1.0, F1 0.7857142857142857, accuracy 0.9761904761904762
For task earlier_pickup, precision 0.6666666666666666, recall 1.0, F1 0.8333333333333333, accuracy 0.9761904761904762
For task system_problem, precision 0.5, recall 1.0, F1 0.75, accuracy 0.9920634920634921
For task update_contact, precision 1.0, recall 0.6666666666666666, F1 0.8333333333333333, accuracy 0.9920634920634921
For task any_donor_problem, precision 0.6818181818181818, recall 0.967741935483871, F1 0.8247800586510263, accuracy 0.8809523809523809


In [20]:
start_date = "2024-05-13"
end_date = "2024-05-19"

data = {}

all_predictions = {}

for model in ['gpt-3.5-turbo','gpt-4o-mini','gpt-4o-mini_self_reflection','gpt-4o','llama']:
    print()
    print("Model {}".format(model))
    data[model] = {}
    validation_labels = pd.read_csv('../../results/annotations/naveen_hierarchical_annotations_{}_{}.csv'.format(start_date,end_date))
    validation_predictions = pd.read_csv('../../results/reports/labeled_feedbacks_{}_{}_{}.csv'.format(start_date,end_date,model))
    validation_predictions = validation_predictions.fillna(0)

    validation_predictions[tasks] = validation_predictions[tasks].astype(int)

    validation_predictions['any_donor_problem'] = validation_predictions[['inadequate_food', 'donor_problem','earlier_pickup']].max(axis=1)
    validation_labels['any_donor_problem'] = validation_labels[['inadequate_food', 'donor_problem','earlier_pickup']].max(axis=1)

    validation_predictions['any_system_problem'] = validation_predictions[['direction_problem', 'system_problem','update_contact']].max(axis=1)
    validation_labels['any_system_problem'] = validation_labels[['direction_problem', 'system_problem','update_contact']].max(axis=1)

    validation_labels['any_problem'] = validation_labels[tasks].max(axis=1)
    validation_predictions['any_problem'] = validation_predictions[tasks].max(axis=1)

    validation_labels['donor_other'] = validation_labels['any_donor_problem'] & ~(validation_labels['inadequate_food'] | validation_labels['earlier_pickup'])
    validation_predictions['donor_other'] = validation_predictions['any_donor_problem'] & ~(validation_predictions['inadequate_food'] | validation_predictions['earlier_pickup'])

    validation_predictions = validation_predictions.set_index('rescue_id').reindex(validation_labels['id']).reset_index()

    all_predictions[model] = {}
    if model == 'gpt-4o':
        data['ensemble'] = {}

    for task in tasks+['any_donor_problem','donor_other','any_system_problem']:
        preds = validation_predictions[task]
        labels = validation_labels[task]

        if model == 'gpt-4o':
            combined_preds = preds * all_predictions['gpt-4o-mini'][task] * all_predictions['gpt-3.5-turbo'][task]
            precision = precision_score(labels, combined_preds)
            recall = recall_score(labels, combined_preds)
            data['ensemble'][task] = (precision+recall)/2

        all_predictions[model][task] = preds
        
        precision = precision_score(labels, preds)
        recall = recall_score(labels, preds)
        data[model][task] = (precision+recall)/2
        accuracy = accuracy_score(labels, preds)
        print("For task {}, precision {}, recall {}, F1 {}, accuracy {}".format(task,precision,recall,(precision+recall)/2,accuracy))


Model gpt-3.5-turbo
For task recipient_problem, precision 0.3, recall 0.8571428571428571, F1 0.5785714285714285, accuracy 0.8809523809523809
For task inadequate_food, precision 0.5769230769230769, recall 0.9375, F1 0.7572115384615384, accuracy 0.9047619047619048
For task donor_problem, precision 0.14285714285714285, recall 1.0, F1 0.5714285714285714, accuracy 0.8571428571428571
For task direction_problem, precision 0.4444444444444444, recall 1.0, F1 0.7222222222222222, accuracy 0.9603174603174603
For task earlier_pickup, precision 0.3333333333333333, recall 1.0, F1 0.6666666666666666, accuracy 0.9682539682539683
For task system_problem, precision 0.6, recall 1.0, F1 0.8, accuracy 0.9841269841269841
For task update_contact, precision 0.125, recall 1.0, F1 0.5625, accuracy 0.9444444444444444
For task any_donor_problem, precision 0.4523809523809524, recall 1.0, F1 0.7261904761904762, accuracy 0.8174603174603174
For task donor_other, precision 0.09090909090909091, recall 1.0, F1 0.5454545

In [52]:
table = """\\begin{{table*}}[]
\\scalebox{{0.85}}{{
\\begin{{tabular}}{{@{{}}lllll@{{}}}}
\\toprule
\\multirow{{2}}{{*}}{{Model}} & \\multicolumn{{4}}{{c}}{{Donor Issues (F1)}}      \\\\ \\cmidrule(lr){{2-5}}
                       & Inadequate Food & Earlier Pickup & Other Donor Problem & Any Donor Issue \\\\ \\midrule
Llama 3                & {}            & {}         & {}              & {}         \\\\
GPT 3.5 Turbo          & {}            & {}         & {}              & {}            \\\\
GPT 4o Mini            & {}            & {}         & {}              & {}           \\\\
GPT 4o                 & {}            & {}         & {}              & {}             \\\\ \\bottomrule
\\end{{tabular}}
}}
\\end{{table*}}

\\begin{{table*}}[]
\\scalebox{{0.85}}{{
\\begin{{tabular}}{{@{{}}lccccccc@{{}}}}
\\toprule
\\multirow{{2}}{{*}}{{Model}} & \\multicolumn{{1}}{{c}}{{Recipient Issues (F1)}} & \\multicolumn{{4}}{{c}}{{Logistical Issues (F1)}} & \\multicolumn{{2}}{{c}}{{Other Info}}                   \\\\ \\cmidrule(lr){{2-2}} \\cmidrule(lr){{3-6}} \\cmidrule(lr){{7-8}}
                       & Any Recipient Issue   & Direction Problem & System Problem & Update Contact & Any Logistical Issue & Cost & Time Taken (minutes)   \\\\ \\midrule
Llama 3                & {}                & {}            & {}          & {}        & {}                 & \\$0.00 & 360 \\\\
GPT 3.5 Turbo          & {}                & {}            & {}          & {}        & {}         & \\$0.34 & 10  \\\\
GPT 4o Mini            & {}                & {}            & {}          & {}        & {}                 & \\$0.11 & 10  \\\\
GPT 4o                 & {}                & {}            & {}          & {}        & {}                 & \\$3.74 & 15  \\\\ \\bottomrule
\\end{{tabular}}
}}
\\end{{table*}}
"""

In [53]:
all_formats = []
for model in ['llama','gpt-3.5-turbo','gpt-4o-mini','gpt-4o']:
    task_list = ['inadequate_food','earlier_pickup','donor_problem','any_donor_problem']#
    for task in task_list:
        all_formats.append("%.2f"% data[model][task])
for model in ['llama','gpt-3.5-turbo','gpt-4o-mini','gpt-4o']:
    task_list = ['recipient_problem','direction_problem','system_problem','update_contact','any_system_problem']
    for task in task_list:
        all_formats.append("%.2f"% data[model][task])
print(table.format(*all_formats))

\begin{table*}[]
\scalebox{0.85}{
\begin{tabular}{@{}lllll@{}}
\toprule
\multirow{2}{*}{Model} & \multicolumn{4}{c}{Donor Issues (F1)}      \\ \cmidrule(lr){2-5}
                       & Inadequate Food & Earlier Pickup & Other Donor Problem & Any Donor Issue \\ \midrule
Llama 3                & 0.51            & 0.52         & 0.52              & 0.60         \\
GPT 3.5 Turbo          & 0.76            & 0.67         & 0.57              & 0.73            \\
GPT 4o Mini            & 0.81            & 0.83         & 0.58              & 0.73           \\
GPT 4o                 & 0.82            & 0.83         & 0.40              & 0.71             \\ \bottomrule
\end{tabular}
}
\end{table*}

\begin{table*}[]
\scalebox{0.85}{
\begin{tabular}{@{}lccccccc@{}}
\toprule
\multirow{2}{*}{Model} & \multicolumn{1}{c}{Recipient Issues (F1)} & \multicolumn{4}{c}{Logistical Issues (F1)} & \multicolumn{2}{c}{Other Info}                   \\ \cmidrule(lr){2-2} \cmidrule(lr){3-6} \cmidrule(lr){7-8}
   

## Model Accuracies

In [22]:
annotation_files = ["gpt_35_annotations.json","gpt_35_annotations_updated.json"]#,"gpt_4_annotations.json"]
consensus_file = "consensus_annotations.json"

In [23]:
consensus_data = json.load(open("../../results/annotations/"+consensus_file))
consensus_data['annotations'] = sorted(consensus_data['annotations'],key=lambda k: k['id'])

In [24]:
all_id = [i['id'] for i in consensus_data['annotations']]

In [27]:
for annotation_file in annotation_files:
    annotations = json.load(open("../../results/annotations/"+annotation_file))
    annotations['annotations'] = sorted(annotations['annotations'],key=lambda k: k['id'])
    assert [i['id'] for i in annotations['annotations']] == all_id
    print("Model {}".format(annotations['parameters']['model']))

    keys = ['inadequate_food', 'donor_problem', 'recipient_problem', 'info_update']
    for k in keys:
        all_annotation_values = [i[k] for i in annotations['annotations']]
        all_consensus_values = [i[k] for i in consensus_data['annotations']]

        f1 = f1_score(all_consensus_values,all_annotation_values)
        print(f1,k)
    print()

Model gpt-3.5 turbo
0.9044585987261146 inadequate_food
0.29032258064516125 donor_problem
0.64 recipient_problem
0.391304347826087 info_update

Model gpt-3.5 turbo
0.8875 inadequate_food
0.2641509433962264 donor_problem
0.368421052631579 recipient_problem
0.41025641025641024 info_update



## Trip Instructions

In [6]:
annotated_json = json.load(open("../../results/annotations/scored_instructions.json"))

In [11]:
category_count = Counter([i['category'] for i in annotated_json])
category_count

Counter({'logistics': 3, 'contact': 7, 'unhelpful': 1, 'directions': 4})

In [12]:
annotated_json[0]

{'side': 'donor',
 'location_id': 123,
 'comment': 'Giant eagle says only staurday Sunday and monday for 412 food rescue.   They said remove them from other days.  \r\n',
 'old_instruction': "Please enter the 412 Food Rescue PIN into scanner: 65428. Please do not share the PIN # with staff--it is our electronic signature! . Please call store when you're on your way to confirm donation. Ask for Bakery to confirm a donation. ",
 'new_instruction': "Please enter the 412 Food Rescue PIN into scanner: 65428. Please do not share the PIN # with staff--it is our electronic signature! . Please call store when you're on your way to confirm donation. Ask for Bakery to confirm a donation. Only available for pickup on Saturday, Sunday, and Monday.",
 'informativeness': 1,
 'clarity': 1,
 'helpfulness': 1,
 'truthfulness': 1,
 'category': 'logistics'}

In [14]:
metrics = ['informativeness','clarity','helpfulness','truthfulness']
for m in metrics:
    print(m,Counter([i[m] for i in annotated_json])[1]/len(annotated_json))

informativeness 1.0
clarity 0.9333333333333333
helpfulness 0.9333333333333333
truthfulness 1.0
