# Template for Accuracy Calculation for ChatGPT zero-shot and finetuned models

This script provides functionality to calculate accuracy scores for the results of fine-tuned and zero-shot models. For zero-shot predictions, you need to re-map the output of the model so that it matches the labels of the original data before calculating metrics, because the output might contain full labels instead of abbreviations, or include filler words. If your task is not one of the tasks that we provide, you might need to add the mapping functions yourself by inspecting the output of your predictions (e.g. you can print all the unique combinations of output and original labels to see which categories have been created).

### Import Modules

In [None]:
import os
import re
import glob

import pandas as pd 
from IPython.core.display import Markdown

from dataload_utils import  load_dataset_task_prompt_mappings
from label_utils import plot_count_and_normalized_confusion_matrix, task_to_display_labels, map_label_to_completion

### Define Utility Functions

Unlike other models, ChatGPT returns the label text cleanly in all fine-tuned models, but in zero-shot mode it tends to return full labels, often accompanied by an explanation. The function below therefore converts zero-shot output into short labels: it returns the output unchanged when it is already short, and otherwise looks for mentions of the full labels. If you find such instances in your fine-tuned predictions, you might want to run this function on them as well.

In [31]:
def process_zero_shotoutput(completion: str, task: int) -> str:
    """Convert a raw zero-shot completion into a short label.

    The completion is reduced to the text following the first "Answer:"
    marker (when present) and stripped of surrounding whitespace. If the
    remaining text is at most three characters long it is returned as-is;
    otherwise it is searched, case-insensitively, for each full label of the
    task and the matching short label is returned.

    Args:
        completion: Raw text returned by the model. Non-string values (for
            example a float NaN standing in for a missing prediction) are
            treated as unparseable.
        task: Key into ``task_to_display_labels``; the entry must provide
            parallel ``'full_name'`` and ``'short_name'`` lists.

    Returns:
        The short label, the short completion itself, or ``"NAN"`` when no
        label could be recognised. If several full labels occur in the text,
        the one appearing last in the ``'full_name'`` list wins.
    """
    # Load mappings defined in Utils between short and full versions
    mapping_dict = task_to_display_labels[task]
    full_values = mapping_dict.get('full_name')
    short_values = mapping_dict.get('short_name')

    # Null value returned when the output cannot be mapped to a label
    val_out = "NAN"

    # Substring checks below would raise TypeError on non-string input
    if not isinstance(completion, str):
        return val_out

    # Keep only what follows the first "Answer:" marker, then normalise
    # whitespace so that " A" or "A\n" compare equal to the label "A"
    if "Answer:" in completion:
        completion = completion.split("Answer:")[1]
    completion = completion.strip()

    # Max length of short labels
    max_short_len = 3
    if len(completion) > max_short_len:
        lowered = completion.lower()
        for full_val in full_values:
            if full_val.lower() in lowered:
                # No early exit: a later match overrides an earlier one
                val_out = short_values[full_values.index(full_val)]
    else:
        val_out = completion
    return val_out

### Set Up Arguments and Data

Here, you define the values for the dataset that you are loading:

In [None]:
# Configuration variables

# Directory against which the mappings file path is resolved.
# NOTE(review): assumes the notebook is executed from its own directory, so
# that '..' below points at the folder holding dataset_task_mappings.csv;
# adjust if your layout differs.
module_dir = os.getcwd()

# Name of the file to compute accuracy for
prediction_file_name = "data/predictions/chatgpt/ds_1_t_1_file.csv"

# Path to the dataset-task mappings file
dataset_task_mappings_fp = os.path.normpath(os.path.join(module_dir, '..', 'dataset_task_mappings.csv'))

# Task the predictions were generated for
task = 1  # Choices: [1,2,3,4,5,6]

# Dataset the predictions were generated for
dataset = 1

# Size of the training sample used for the model (only reported alongside
# the metrics)
sample_size = '250'  # Enter 0 for zero-shot predictions

# Set to True when the predictions come from a zero-shot model
zero_shot = False

## Main Implementation

In [32]:
# Read the model predictions (one row per annotated example) into a DataFrame
df = pd.read_csv(filepath_or_buffer=prediction_file_name)

In [None]:
# Load the mapping file
# NOTE(review): this rebinds the path variable to a DataFrame, so the cell
# cannot be re-run without re-running the configuration cell first, and the
# helper below receives a DataFrame under a "_fp" name - confirm which type
# load_dataset_task_prompt_mappings expects.
dataset_task_mappings_fp = pd.read_csv(dataset_task_mappings_fp)

# Look up the row describing the configured dataset/task combination
dataset_idx, dataset_task_mappings = load_dataset_task_prompt_mappings(
    dataset_num=dataset, task_num=task, dataset_task_mappings_fp=dataset_task_mappings_fp)

# Column of the predictions file that holds the ground-truth label
label_column = dataset_task_mappings.loc[dataset_idx, "label_column"]

# Short labels are stored comma-separated; strip stray whitespace around each
labelset = dataset_task_mappings.loc[dataset_idx, "labelset"].split(",")
labelset = [label.strip() for label in labelset]

# Full-word labels are stored separated by "; "
labelset_full_description = dataset_task_mappings.loc[dataset_idx, "labelset_fullword"].split("; ")

In [None]:
# Bring predictions and ground truth into the same label format.
# Zero-shot output is parsed into short labels; otherwise predictions are
# mapped to their full-label form.
if zero_shot:
    y_pred = df['prediction'].apply(
        lambda raw_output: process_zero_shotoutput(raw_output, task))
    display_labels = labelset
else:
    y_pred = df['prediction'].map(
        lambda predicted: map_label_to_completion(
            label=predicted, task_num=task, full_label=True))
    display_labels = [full_name.upper() for full_name in labelset_full_description]

# Ground truth uses short labels for zero-shot runs and full labels otherwise.
# (A disabled sanity check previously asserted that every ground-truth value
# is contained in the expected label set.)
y_true = df[label_column].map(
    lambda gold: map_label_to_completion(
        label=gold, task_num=task, full_label=not zero_shot))

# Plot the confusion matrices and collect the metrics
cm_plot, classification_report, metrics = plot_count_and_normalized_confusion_matrix(
    y_true, y_pred, display_labels, display_labels, xticks_rotation='horizontal')

# Report the headline scores together with the sample size
summary = {
    'sample_size': sample_size,
    'accuracy': metrics['accuracy'],
    'f1-macro': metrics['f1'],
    'precision': metrics['precision'],
    'recall': metrics['recall'],
}
print(summary)
