In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import numpy as np
from collections import Counter
import pandas as pd
from feedback.plotting import *
import os
import re
import openai
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from transformers import pipeline
from together import Together
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch


In [4]:
# Label columns evaluated in this notebook.
eval_columns = [
    "update_contact",
    "inadequate_food",
    "earlier_pickup",
    "system_problem",
    "direction_problem",
    "recipient_problem",
    "donor_problem",
    "positive_comment",
]
# Display names, index-aligned with eval_columns.
nice_names = [
    "Update Contact",
    "Inadequate Food",
    "Earlier Pickup",
    "System Problem",
    "Direction Problem",
    "Recipient Problem",
    "Donor Problem",
    "Positive Comment",
]

In [5]:
# Evaluation comments: rows annotated by "naveen", reduced to the comment text
# and the id column.
eval_dataset = (
    pd.read_csv("../../data/annotations/pre_deploy_eval.csv")
    .pipe(lambda frame: frame[frame["annotator"] == "naveen"])
    [["volunteer_comment", "id"]]
)

In [6]:
# Ground-truth annotations: every column of the rows annotated by "naveen",
# read from the same file as eval_dataset.
groundtruth = (
    pd.read_csv("../../data/annotations/pre_deploy_eval.csv")
    .pipe(lambda frame: frame[frame["annotator"] == "naveen"])
)

In [7]:
# Build one training frame per label column, dropping rows whose value for that
# label is -1 (presumably "not annotated for this label" -- TODO confirm).
training_dataset = pd.read_csv("../../data/annotations/training.csv")
training_by_column = {}
for col in eval_columns:
    has_label = training_dataset[col] != -1
    kept_columns = ["volunteer_comment", "id", col]
    training_by_column[col] = training_dataset.loc[has_label, kept_columns].copy()

## LLM-Based Evaluation

In [7]:
# Attach the annotated donor / recipient names to every evaluation comment.
donor_recipient_names = pd.read_csv("../../data/annotations/donor_recipient_annotated_names.csv")
# NOTE(review): a left merge repeats a comment once per matching row, so this
# assumes delivery_id is unique in the names file -- confirm.
dataset_merged = eval_dataset.merge(
    donor_recipient_names,
    left_on='id',
    right_on='delivery_id',
    how='left'  # or 'inner' if you only want matches
)
# Classification tasks; each has a prompt template at data/prompts/<task>.txt.
tasks = ['recipient_problem', 'inadequate_food', 'donor_problem',
            'direction_problem','earlier_pickup','system_problem',
            'update_contact','positive_comment']
together_models = ["deepseek-ai/DeepSeek-R1-Distill-Qwen-14B","meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]

# Load the prompt template for every task. The context manager closes each
# file handle instead of leaving it to the garbage collector.
prompts = {}
for t in tasks:
    with open("../../data/prompts/{}.txt".format(t)) as prompt_file:
        prompts[t] = prompt_file.read()


In [24]:
# `together_api_key` is not defined in any visible cell. Keep using it when it
# is already in scope (e.g. provided by an earlier import); otherwise fall back
# to the TOGETHER_API_KEY environment variable so a fresh kernel does not fail
# with a NameError and no secret has to be hardcoded in the notebook.
try:
    together_api_key
except NameError:
    together_api_key = os.environ["TOGETHER_API_KEY"]
client = Together(api_key=together_api_key)

In [25]:
# Query every Together-hosted model once per (rescue, task) pair.
# predictions_by_model[model_idx][rescue_idx] maps 'volunteer_comment' and each
# successfully parsed task name to its value.
# NOTE: `client` must be the Together client when this cell runs.
predictions_by_model = [[{} for _ in range(len(dataset_merged))] for _ in range(len(together_models))]

# Greedy match from the first "{" to the last "}" so any text surrounding the
# JSON object is discarded. A raw string avoids the invalid "\{" escape sequence.
json_object_pattern = re.compile(r'\{.*\}', re.DOTALL)

for idx,model in enumerate(together_models):
    print("On Model {}".format(model))
    for i in range(len(dataset_merged)):
        volunteer_comment = dataset_merged.loc[i, "volunteer_comment"]
        comment = (
            f'For this rescue, the donor is {dataset_merged.loc[i, "donor_name"]};'
            f' the recipient is {dataset_merged.loc[i, "recipient_name"]}.'
            f' Comment: {volunteer_comment}'
        )

        print("On Rescue {} out of {}".format(i+1,len(dataset_merged)))
        predictions_by_model[idx][i]['volunteer_comment'] = volunteer_comment

        for task in tasks:
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompts[task] + comment}],
                    response_format={"type": "json_object"},
                )
                output = response.choices[0].message.content
                json_match = json_object_pattern.search(output)
                if json_match is None:
                    raise ValueError("no JSON object found in model response")
                feedback_info = json.loads(json_match.group(0))
                predictions_by_model[idx][i][task] = feedback_info[task]
            except Exception as e:
                # Best effort: log the failure and leave this task unset for
                # the rescue so one bad response does not abort the whole run.
                print(f"Error processing feedback {i} for task {task}: {e}")


On Model deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
On Rescue 1 out of 126
On Rescue 2 out of 126
On Rescue 3 out of 126
On Rescue 4 out of 126
On Rescue 5 out of 126
On Rescue 6 out of 126
On Rescue 7 out of 126
On Rescue 8 out of 126
On Rescue 9 out of 126
On Rescue 10 out of 126
On Rescue 11 out of 126
On Rescue 12 out of 126
On Rescue 13 out of 126
On Rescue 14 out of 126
On Rescue 15 out of 126
On Rescue 16 out of 126
On Rescue 17 out of 126
On Rescue 18 out of 126
On Rescue 19 out of 126
On Rescue 20 out of 126
On Rescue 21 out of 126
On Rescue 22 out of 126
On Rescue 23 out of 126
On Rescue 24 out of 126
On Rescue 25 out of 126
On Rescue 26 out of 126
On Rescue 27 out of 126
On Rescue 28 out of 126
On Rescue 29 out of 126
On Rescue 30 out of 126
On Rescue 31 out of 126
On Rescue 32 out of 126
On Rescue 33 out of 126
On Rescue 34 out of 126
On Rescue 35 out of 126
On Rescue 36 out of 126
On Rescue 37 out of 126
On Rescue 38 out of 126
On Rescue 39 out of 126
On Rescue 40 out of 126

In [26]:
# Field names of the first prediction record (not used further in this cell).
keys = predictions_by_model[0][0].keys()

# Write one CSV per Together model; "/" in the model id is replaced with "_"
# to form the file name.
for model_idx, model_name in enumerate(together_models):
    safe_name = model_name.replace("/","_")
    output_path = '../../results/evaluation/' + "{}.csv".format(safe_name)
    pd.DataFrame(predictions_by_model[model_idx]).to_csv(output_path, index=False)

In [10]:
# OpenAI chat models evaluated with the same prompts as the Together models.
gpt_models = ["gpt-4o-mini","gpt-4o"]
# `openai_api_key` is not defined in any visible cell. Keep using it when it is
# already in scope (e.g. provided by an earlier import); otherwise fall back to
# the OPENAI_API_KEY environment variable so a fresh kernel does not fail with
# a NameError and no secret has to be hardcoded in the notebook.
try:
    openai_api_key
except NameError:
    openai_api_key = os.environ["OPENAI_API_KEY"]
# NOTE: this rebinds `client`, replacing the Together client created earlier.
client = openai.OpenAI(api_key=openai_api_key)

In [16]:
# Query every OpenAI model once per (rescue, task) pair.
# predictions_by_model[model_idx][rescue_idx] maps 'volunteer_comment' and each
# successfully parsed task name to its value.
# NOTE: `client` must be the OpenAI client when this cell runs.
predictions_by_model = [[{} for _ in range(len(dataset_merged))] for _ in range(len(gpt_models))]

# Greedy match from the first "{" to the last "}" so any text surrounding the
# JSON object is discarded. A raw string avoids the invalid "\{" escape sequence.
json_object_pattern = re.compile(r'\{.*\}', re.DOTALL)

for idx,model in enumerate(gpt_models):
    print("On Model {}".format(model))
    for i in range(len(dataset_merged)):
        volunteer_comment = dataset_merged.loc[i, "volunteer_comment"]
        comment = (
            f'For this rescue, the donor is {dataset_merged.loc[i, "donor_name"]};'
            f' the recipient is {dataset_merged.loc[i, "recipient_name"]}.'
            f' Comment: {volunteer_comment}'
        )

        print("On Rescue {} out of {}".format(i+1,len(dataset_merged)))
        predictions_by_model[idx][i]['volunteer_comment'] = volunteer_comment

        for task in tasks:
            try:
                response = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompts[task] + comment}],
                    response_format={"type": "json_object"},
                )
                output = response.choices[0].message.content
                json_match = json_object_pattern.search(output)
                if json_match is None:
                    raise ValueError("no JSON object found in model response")
                feedback_info = json.loads(json_match.group(0))
                predictions_by_model[idx][i][task] = feedback_info[task]
            except Exception as e:
                # Best effort: log the failure and leave this task unset for
                # the rescue so one bad response does not abort the whole run.
                print(f"Error processing feedback {i} for task {task}: {e}")


On Model gpt-4o-mini
On Rescue 103 out of 126
Feedback info {'positive_comment': True, 'explanation': 'The thumbs up emoji indicates a positive sentiment towards the food rescue organization.'}
On Rescue 104 out of 126
Feedback info {'positive_comment': True, 'explanation': 'The thumbs up emoji indicates a positive response, suggesting satisfaction with the food rescue organization.'}
On Rescue 105 out of 126
Feedback info {'positive_comment': False, 'explanation': 'The comment is an emoji and does not provide any context or praise related to the food rescue organization.'}
On Rescue 106 out of 126
Feedback info {'positive_comment': True, 'explanation': 'The thumbs up emoji indicates a positive sentiment toward the food rescue organization.'}
On Model gpt-4o
On Rescue 103 out of 126
Feedback info {'positive_comment': True, 'explanation': 'The comment is positive as it uses the thumbs up emoji, which generally signifies approval or satisfaction, thus implying praise for the food rescue 

In [13]:
# Field names of the first prediction record (not used further in this cell).
keys = predictions_by_model[0][0].keys()

# Write one CSV per OpenAI model. The loop must iterate gpt_models: iterating
# together_models here would save the GPT predictions under the Together
# models' file names and overwrite those results.
for i,model in enumerate(gpt_models):
    file_name = "{}.csv".format(model.replace("/","_"))
    dataframe = pd.DataFrame(predictions_by_model[i])
    dataframe.to_csv('../../results/evaluation/{}'.format(file_name), index=False)

In [14]:
# Names of the prompt ablations run with gpt-4o-mini.
ablations = [
    "no_few_shot",
    "no_guidelines",
    "no_explanations",
]

In [15]:
def build_ablation_prompt(ablation, task):
    """Return the prompt for `task` with one component removed.

    ablation:
        "no_few_shot"     -- keep the text before "Example Comment Analysis:"
                             and append a short closing instruction.
        "no_guidelines"   -- keep the text before "Notes:", re-attach the
                             example section, and replace everything from
                             "Now, it’s your turn" onwards with a generic ending.
        "no_explanations" -- drop every prompt line containing '"explanation"'.

    Raises ValueError for any other ablation name (previously an unknown name
    silently reused whatever `full_prompt` was left over from a prior iteration).
    """
    base_prompt = prompts[task]
    if ablation == "no_few_shot":
        return base_prompt.split("Example Comment Analysis:")[0] + "Now, it’s your turn. \n Analyze the following rescue; note that the JSON should have two keys: {} and explanation: \n".format(task)
    if ablation == "no_guidelines":
        basic_ending = "Responses should be formatted in JSON to maintain uniformity and clarity across reports. The response should have two keys: {} and explanation. \n Now, it’s your turn. \n Analyze the following rescue: \n".format(task)
        without_notes = base_prompt.split("Notes:")[0] + "\n Example Comment Analysis\n"+base_prompt.split("Example Comment Analysis:")[1]
        return without_notes.split("Now, it’s your turn")[0]+basic_ending
    if ablation == "no_explanations":
        return "\n".join([line for line in base_prompt.split("\n") if '"explanation"' not in line])
    raise ValueError("Unknown ablation: {}".format(ablation))


# predictions_by_model[ablation_idx][rescue_idx] maps 'volunteer_comment' and
# each successfully parsed task name to its value.
# NOTE: `client` must be the OpenAI client when this cell runs (model is gpt-4o-mini).
predictions_by_model = [[{} for _ in range(len(dataset_merged))] for _ in range(len(ablations))]

# Greedy match from the first "{" to the last "}" so any text surrounding the
# JSON object is discarded. A raw string avoids the invalid "\{" escape sequence.
json_object_pattern = re.compile(r'\{.*\}', re.DOTALL)

for idx,a in enumerate(ablations):
    # The ablated prompt depends only on (ablation, task), so build it once
    # here rather than once per rescue.
    ablated_prompts = {task: build_ablation_prompt(a, task) for task in tasks}

    for i in range(len(dataset_merged)):
        volunteer_comment = dataset_merged.loc[i, "volunteer_comment"]
        comment = (
            f'For this rescue, the donor is {dataset_merged.loc[i, "donor_name"]};'
            f' the recipient is {dataset_merged.loc[i, "recipient_name"]}.'
            f' Comment: {volunteer_comment}'
        )

        print("On Rescue {} out of {}".format(i+1,len(dataset_merged)))
        predictions_by_model[idx][i]['volunteer_comment'] = volunteer_comment

        for task in tasks:
            full_prompt = ablated_prompts[task]

            try:
                response = client.chat.completions.create(
                    model='gpt-4o-mini',
                    messages=[{"role": "user", "content": full_prompt + "\n"+comment}],
                    response_format={"type": "json_object"},
                )
                output = response.choices[0].message.content
                json_match = json_object_pattern.search(output)
                if json_match is None:
                    raise ValueError("no JSON object found in model response")
                feedback_info = json.loads(json_match.group(0))
                predictions_by_model[idx][i][task] = feedback_info[task]
            except Exception as e:
                # Best effort: log the failure and leave this task unset for
                # the rescue so one bad response does not abort the whole run.
                print(f"Error processing feedback {i} for task {task}: {e}")

On Rescue 1 out of 126
On Rescue 2 out of 126
On Rescue 3 out of 126
On Rescue 4 out of 126
On Rescue 5 out of 126
On Rescue 6 out of 126
On Rescue 7 out of 126
On Rescue 8 out of 126
On Rescue 9 out of 126
On Rescue 10 out of 126
On Rescue 11 out of 126
On Rescue 12 out of 126
On Rescue 13 out of 126
On Rescue 14 out of 126
On Rescue 15 out of 126
On Rescue 16 out of 126
On Rescue 17 out of 126
On Rescue 18 out of 126
On Rescue 19 out of 126
On Rescue 20 out of 126
On Rescue 21 out of 126
On Rescue 22 out of 126
On Rescue 23 out of 126
On Rescue 24 out of 126
On Rescue 25 out of 126
On Rescue 26 out of 126
On Rescue 27 out of 126
On Rescue 28 out of 126
On Rescue 29 out of 126
On Rescue 30 out of 126
On Rescue 31 out of 126
On Rescue 32 out of 126
On Rescue 33 out of 126
On Rescue 34 out of 126
On Rescue 35 out of 126
On Rescue 36 out of 126
On Rescue 37 out of 126
On Rescue 38 out of 126
On Rescue 39 out of 126
On Rescue 40 out of 126
On Rescue 41 out of 126
On Rescue 42 out of 126
O

In [16]:
# Field names of the first prediction record (not used further in this cell).
keys = predictions_by_model[0][0].keys()

# Write one CSV per ablation, named ablation_<name>.csv.
for ablation_idx, ablation_name in enumerate(ablations):
    safe_name = ablation_name.replace("/","_")
    output_path = '../../results/evaluation/' + "ablation_{}.csv".format(safe_name)
    pd.DataFrame(predictions_by_model[ablation_idx]).to_csv(output_path, index=False)

## Non-LLM Evaluation

In [8]:
# Uniform random baseline: each (comment, label) cell is an independent draw
# from [0, 1) rounded to 0 or 1.
# A seeded generator keeps the saved baseline identical across kernel restarts;
# the unseeded global RNG produced a different random.csv on every run.
RANDOM_BASELINE_SEED = 0
uniform_rng = np.random.default_rng(RANDOM_BASELINE_SEED)
random_prediction = uniform_rng.random((len(eval_dataset),len(eval_columns)))
random_prediction = random_prediction.round()

result_df = eval_dataset.copy()

# Add the prediction columns
for i, col_name in enumerate(eval_columns):
    result_df[col_name] = random_prediction[:, i].astype(int)

# Save to CSV
result_df.to_csv('../../results/evaluation/random.csv', index=False)


In [9]:
# Base-rate random baseline: predict 1 for a label with probability equal to
# that label's mean value in the ground truth.
# A seeded generator keeps the saved baseline identical across kernel restarts;
# the unseeded global RNG produced a different random_marginal.csv on every run.
MARGINAL_BASELINE_SEED = 1
marginal_rng = np.random.default_rng(MARGINAL_BASELINE_SEED)
base_rates = list(groundtruth[eval_columns].mean())

# Broadcasting compares column j of the uniform draws against base_rates[j].
# The result is already 0/1, so no rounding step is needed.
uniform_draws = marginal_rng.random((len(eval_dataset),len(eval_columns)))
random_prediction = (uniform_draws < np.asarray(base_rates)).astype(int)

result_df = eval_dataset.copy()

# Add the prediction columns
for i, col_name in enumerate(eval_columns):
    result_df[col_name] = random_prediction[:, i]

# Save to CSV
result_df.to_csv('../../results/evaluation/random_marginal.csv', index=False)


In [19]:
# TF-IDF + logistic-regression baseline: one classifier per label column,
# trained on that label's training frame and applied to every merged
# evaluation comment.
eval_comments = dataset_merged['volunteer_comment'].tolist()
tfidf_predictions = {'volunteer_comment': eval_comments}
for d in eval_columns:
    # Named `tfidf_classifier` rather than `pipeline` so the `pipeline`
    # function imported from transformers is not overwritten in the kernel.
    tfidf_classifier = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
    tfidf_classifier.fit(training_by_column[d]['volunteer_comment'], training_by_column[d][d])

    # Predict all comments in one call instead of one predict() call per row.
    tfidf_predictions[d] = tfidf_classifier.predict(eval_comments)
# Columns: volunteer_comment followed by one prediction column per label.
predictions_by_model = pd.DataFrame(tfidf_predictions)

In [20]:
# Write the current predictions frame to tf_idf.csv without the index column.
predictions_by_model.to_csv(
    '../../results/evaluation/tf_idf.csv',
    index=False,
)

In [10]:
# Fine-tune one DistilBERT binary classifier per label column and predict every
# evaluation comment with it.
eval_dataset = pd.read_csv("../../data/annotations/pre_deploy_eval.csv")
eval_dataset = eval_dataset[eval_dataset["annotator"] == "naveen"][["volunteer_comment","id"]]

# Sized AFTER reloading eval_dataset so the list always matches the frame that
# is iterated below (previously it was sized from whatever eval_dataset held
# before this cell ran).
predictions_by_model = [{} for _ in range(len(eval_dataset))]

# Positional list of comments. The filtered frame keeps the row labels of the
# CSV, so label-based `.loc[i]` for i in range(len(...)) is only correct when
# the kept rows happen to be labelled 0..n-1.
eval_comments = eval_dataset["volunteer_comment"].tolist()

# The checkpoint and tokenizer are the same for every task, so load them once.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize a batch of training comments with max-length padding and truncation."""
    return tokenizer(example["volunteer_comment"], padding="max_length", truncation=True)


for task in eval_columns:
    task_name = task
    df = training_by_column[task_name][["volunteer_comment", task_name]].rename(columns={task_name: "label"})
    train_ds = Dataset.from_pandas(df.reset_index(drop=True))
    # Fresh model per task so no weights carry over between labels.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # Tokenize
    train_ds = train_ds.map(tokenize_function, batched=True)
    train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    training_args = TrainingArguments(
    output_dir="./results",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    save_strategy="no",
    load_best_model_at_end=False,
    disable_tqdm=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
    )

    trainer.train()

    # Switch to inference mode: without this, dropout stays active after
    # training and makes the predictions stochastic.
    model.eval()

    # Predict every evaluation comment with the freshly trained model.
    for i, input_text in enumerate(eval_comments):
        predictions_by_model[i]["volunteer_comment"] = input_text
        # Send inputs to whichever device the model is on rather than
        # hard-coding 'cuda:0', so the cell also runs on CPU-only machines.
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        with torch.no_grad():
            outputs = model(**inputs)
        # argmax over the logits equals argmax over their softmax, so the
        # softmax step is unnecessary.
        pred = torch.argmax(outputs.logits, dim=-1).item()
        predictions_by_model[i][task_name] = pred

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.3966
20,0.0658
30,0.0122
40,0.0053
50,0.0033
60,0.0026
70,0.0021
80,0.0021
90,0.0019
100,0.0019


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.3735
20,0.1025
30,0.0239
40,0.0066
50,0.0035
60,0.0025
70,0.0021
80,0.0019
90,0.0017
100,0.0017


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.3842
20,0.0544
30,0.0145
40,0.0066
50,0.0045
60,0.0037
70,0.0028
80,0.0026
90,0.0028
100,0.0025


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.3911
20,0.0572
30,0.0125
40,0.0061
50,0.0039
60,0.003
70,0.0025
80,0.0026
90,0.0022
100,0.0023


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.4062
20,0.0555
30,0.0121
40,0.006
50,0.0039
60,0.0033
70,0.0027
80,0.0026
90,0.0022
100,0.0023


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.4368
20,0.0649
30,0.0129
40,0.0061
50,0.0041
60,0.0034
70,0.0027
80,0.0027
90,0.0023
100,0.0024


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.3968
20,0.0607
30,0.012
40,0.0056
50,0.0034
60,0.0028
70,0.0024
80,0.002
90,0.0021
100,0.0021


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.4257
20,0.0703
30,0.0167
40,0.0069
50,0.0043
60,0.0035
70,0.0028
80,0.0025
90,0.0026
100,0.0025


In [12]:
# Display the accumulated predictions (one dict per evaluation comment).
# NOTE(review): this renders the entire list; consider slicing before sharing.
predictions_by_model

[{'volunteer_comment': 'Everyone at the pickup and drop off were amazing and so friendly!',
  'update_contact': 0,
  'inadequate_food': 1,
  'earlier_pickup': 1,
  'system_problem': 1,
  'direction_problem': 0,
  'recipient_problem': 0,
  'donor_problem': 0,
  'positive_comment': 1},
 {'volunteer_comment': 'We were informed that they are making cookings a different way and will have less donations available from now on.',
  'update_contact': 0,
  'inadequate_food': 1,
  'earlier_pickup': 1,
  'system_problem': 1,
  'direction_problem': 0,
  'recipient_problem': 0,
  'donor_problem': 0,
  'positive_comment': 1},
 {'volunteer_comment': 'only one box of cheese',
  'update_contact': 0,
  'inadequate_food': 1,
  'earlier_pickup': 1,
  'system_problem': 1,
  'direction_problem': 0,
  'recipient_problem': 0,
  'donor_problem': 0,
  'positive_comment': 0},
 {'volunteer_comment': 'gate at CSC was closed at 4:03. unable to pick up',
  'update_contact': 0,
  'inadequate_food': 1,
  'earlier_picku

In [11]:
# Convert the list of per-comment prediction dicts into a frame and write it
# to distilbert.csv without the index column.
distilbert_predictions = pd.DataFrame(predictions_by_model)
distilbert_predictions.to_csv(
    '../../results/evaluation/distilbert.csv',
    index=False,
)