In [1]:
import ast
import json
import pickle
import random
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef
from tqdm import tqdm


In this notebook, we estimate:

- Upper bound of performance on Inconsistency detection (referred to as Bootstrap)
- Individual human performance at predicting the majority label (referred to as Cross-validation)

Additionally:

- "Majority LLM" means we are evaluating LLMs based on their majority class out of the 5 runs.
- "5 runs" means we are treating each of the 5 LLM runs as a separate prediction.

We run the analysis for both the 5-class and the 3-class settings.

#### Load the final dataset

In [6]:
# Load the final dataset. Later cells read its 'All answers' column and the
# per-model "majority" / "5 runs" columns.
df_gold = pd.read_csv(
    filepath_or_buffer=r"../data/qualtrics_survey/results/df_gold_all.csv",
)

In [7]:
# Label set used in the fine-grained (5-class) setting.
classes_5 = [
    'Unrelated',
    'Consistent',
    'Indirect inconsistency',
    "Factual inconsistency",
    'Surface contradiction',
]
# Label set used in the coarse (3-class) setting.
classes_3 = ['Unrelated', 'Consistent', 'Inconsistent']
# 5-class -> 3-class mapping: labels shared by both sets map to themselves,
# every other 5-class label collapses into 'Inconsistent'.
classes_5_to_3 = {
    label: (label if label in classes_3 else 'Inconsistent')
    for label in classes_5
}

### Synthetic upper-bound

#### Bootstrap 5 classes

In [None]:
# Synthetic upper bound, 5 classes.
#
# For every row, each answer is dropped in turn (leave-one-out). The majority
# label of the remaining answers is "today's" ground truth. The same reduced
# pool is then resampled with replacement `n_bootstraps` times; the majority
# label of each resample is "tomorrow's" ground truth. Every
# (today, tomorrow) pair is recorded. Ties for the majority are broken
# uniformly at random.

# Dedicated, seeded generator so that re-running this cell reproduces the
# exact same tuples regardless of what other cells did to the global RNG.
rng = random.Random(42)
n_bootstraps = 10

bootstrap_predictions_tuples = []

for raw_answers in df_gold['All answers']:
    # The column holds a stringified Python literal (presumably a list of
    # label strings). literal_eval only accepts literals, so arbitrary code
    # in the CSV cannot be executed the way it could with eval().
    answers = ast.literal_eval(raw_answers)

    for held_out in range(len(answers)):
        # pool = all answers except the held-out one
        pool = answers[:held_out] + answers[held_out + 1:]

        # today's ground truth: majority vote, random choice among ties
        counts = Counter(pool)
        top_count = max(counts.values())
        majority = rng.choice([label for label, count in counts.items() if count == top_count])

        for _ in range(n_bootstraps):
            # resample the same pool with replacement, keeping its size
            resample = rng.choices(pool, k=len(pool))
            resample_counts = Counter(resample)
            resample_top = max(resample_counts.values())
            # tomorrow's ground truth
            tomorrow_majority = rng.choice(
                [label for label, count in resample_counts.items() if count == resample_top]
            )
            bootstrap_predictions_tuples.append((majority, tomorrow_majority))
            

In [None]:
# Write the 5-class bootstrap (today, tomorrow) pairs to disk as a pickle.
with open(r'../data/model_evaluations/bootstrap_predictions_tuples.pkl', mode='wb') as pkl_file:
    pickle.dump(bootstrap_predictions_tuples, pkl_file)

#### Bootstrap 3 classes

In [17]:
# Synthetic upper bound, 3 classes.
#
# Same procedure as the 5-class bootstrap, except that every answer is first
# mapped to its coarse label through `classes_5_to_3`: for each row, each
# answer is dropped in turn, the majority of the remaining answers is
# "today's" ground truth, and the majority of each of `n_bootstraps`
# with-replacement resamples of that pool is "tomorrow's" ground truth.
# Ties for the majority are broken uniformly at random.

# Dedicated, seeded generator so that re-running this cell reproduces the
# exact same tuples regardless of what other cells did to the global RNG.
rng = random.Random(42)
n_bootstraps = 10

bootstrap_predictions_tuples_3_classes = []

for raw_answers in df_gold['All answers']:
    # The column holds a stringified Python literal (presumably a list of
    # 5-class label strings). literal_eval only accepts literals, so arbitrary
    # code in the CSV cannot be executed the way it could with eval().
    # Mapping once per row is equivalent to mapping each leave-one-out pool.
    answers = [classes_5_to_3[label] for label in ast.literal_eval(raw_answers)]

    for held_out in range(len(answers)):
        # pool = all answers except the held-out one
        pool = answers[:held_out] + answers[held_out + 1:]

        # today's ground truth: majority vote, random choice among ties
        counts = Counter(pool)
        top_count = max(counts.values())
        majority = rng.choice([label for label, count in counts.items() if count == top_count])

        for _ in range(n_bootstraps):
            # resample the same pool with replacement, keeping its size
            resample = rng.choices(pool, k=len(pool))
            resample_counts = Counter(resample)
            resample_top = max(resample_counts.values())
            # tomorrow's ground truth
            tomorrow_majority = rng.choice(
                [label for label, count in resample_counts.items() if count == resample_top]
            )
            bootstrap_predictions_tuples_3_classes.append((majority, tomorrow_majority))
            
      

In [18]:
# Write the 3-class bootstrap (today, tomorrow) pairs to disk as a pickle.
with open(r'../data/model_evaluations/bootstrap_predictions_tuples_3_classes.pkl', mode='wb') as pkl_file:
    pickle.dump(bootstrap_predictions_tuples_3_classes, pkl_file)

### Cross-validation: majority LLM

#### 5 classes

In [10]:
# One independent, initially empty list of (ground truth, prediction) tuples
# per predictor: the held-out human and each LLM.
prediction_ground_truth_tuples_majority_LLM = {
    predictor: []
    for predictor in ('Humans', 'ChatGPT-4', 'ChatGPT-3.5', 'LLaMA 8B', 'LLaMA 70B')
}

In [11]:
# Leave-one-out cross-validation against the human majority, 5 classes,
# LLMs represented by their majority label.
#
# For every row, each human answer is held out in turn. The majority label of
# the remaining answers is the ground truth (ties broken uniformly at random);
# the held-out answer is the "Humans" prediction, and each model's majority
# label for that row is that model's prediction.

# Dedicated, seeded generator so that tie-breaking is reproducible on re-run.
rng = random.Random(42)

# Start from empty lists so that re-running this cell does not append a second
# copy of every tuple to the dict created in the previous cell.
for tuples in prediction_ground_truth_tuples_majority_LLM.values():
    tuples.clear()

# result key -> df_gold column holding that model's majority label
majority_columns = {
    'ChatGPT-4': 'ChatGPT-4 majority',
    'ChatGPT-3.5': 'ChatGPT-3.5 majority',
    'LLaMA 8B': 'Llama 8B majority',
    'LLaMA 70B': 'Llama 70B majority',
}

for row in range(len(df_gold)):
    # The column holds a stringified Python literal (presumably a list of
    # label strings). literal_eval only accepts literals, so arbitrary code
    # in the CSV cannot be executed the way it could with eval().
    answers = ast.literal_eval(df_gold['All answers'].iloc[row])

    # The model predictions do not depend on which answer is held out,
    # so look them up once per row.
    model_predictions = {
        predictor: df_gold[column].iloc[row]
        for predictor, column in majority_columns.items()
    }

    for held_out, human_prediction in enumerate(answers):
        # pool = all answers except the held-out one
        pool = answers[:held_out] + answers[held_out + 1:]

        # ground truth: majority vote, random choice among ties
        counts = Counter(pool)
        top_count = max(counts.values())
        majority = rng.choice([label for label, count in counts.items() if count == top_count])

        # tuples are (ground truth, prediction)
        prediction_ground_truth_tuples_majority_LLM['Humans'].append((majority, human_prediction))
        for predictor, model_prediction in model_predictions.items():
            prediction_ground_truth_tuples_majority_LLM[predictor].append((majority, model_prediction))
    
        
    

In [12]:
# Write the 5-class majority-LLM (ground truth, prediction) tuples to disk as JSON.
with open(r'../data/model_evaluations/prediction_ground_truth_tuples_majority_LLM.json', mode='w') as json_file:
    json.dump(prediction_ground_truth_tuples_majority_LLM, json_file)

#### 3 classes

In [7]:
# One independent, initially empty list of (ground truth, prediction) tuples
# per predictor: the held-out human and each LLM.
prediction_ground_truth_tuples_majority_LLM_3_classes = {
    predictor: []
    for predictor in ('Humans', 'ChatGPT-4', 'ChatGPT-3.5', 'LLaMA 8B', 'LLaMA 70B')
}

In [8]:
# Leave-one-out cross-validation against the human majority, 3 classes,
# LLMs represented by their majority label.
#
# Every label (human answers and model majorities) is first mapped to its
# coarse label through `classes_5_to_3`. For every row, each human answer is
# held out in turn. The majority label of the remaining answers is the ground
# truth (ties broken uniformly at random); the held-out answer is the "Humans"
# prediction, and each model's majority label is that model's prediction.

# Dedicated, seeded generator so that tie-breaking is reproducible on re-run.
rng = random.Random(42)

# Start from empty lists so that re-running this cell does not append a second
# copy of every tuple to the dict created in the previous cell.
for tuples in prediction_ground_truth_tuples_majority_LLM_3_classes.values():
    tuples.clear()

# result key -> df_gold column holding that model's majority label
majority_columns = {
    'ChatGPT-4': 'ChatGPT-4 majority',
    'ChatGPT-3.5': 'ChatGPT-3.5 majority',
    'LLaMA 8B': 'Llama 8B majority',
    'LLaMA 70B': 'Llama 70B majority',
}

for row in range(len(df_gold)):
    # The column holds a stringified Python literal (presumably a list of
    # 5-class label strings). literal_eval only accepts literals, so arbitrary
    # code in the CSV cannot be executed the way it could with eval().
    answers = [
        classes_5_to_3[label]
        for label in ast.literal_eval(df_gold['All answers'].iloc[row])
    ]

    # The model predictions do not depend on which answer is held out,
    # so look them up (and map them to 3 classes) once per row.
    model_predictions = {
        predictor: classes_5_to_3[df_gold[column].iloc[row]]
        for predictor, column in majority_columns.items()
    }

    for held_out, human_prediction in enumerate(answers):
        # pool = all answers except the held-out one
        pool = answers[:held_out] + answers[held_out + 1:]

        # ground truth: majority vote, random choice among ties
        counts = Counter(pool)
        top_count = max(counts.values())
        majority = rng.choice([label for label, count in counts.items() if count == top_count])

        # tuples are (ground truth, prediction)
        prediction_ground_truth_tuples_majority_LLM_3_classes['Humans'].append((majority, human_prediction))
        for predictor, model_prediction in model_predictions.items():
            prediction_ground_truth_tuples_majority_LLM_3_classes[predictor].append((majority, model_prediction))
    
        
    

In [9]:
# Write the 3-class majority-LLM (ground truth, prediction) tuples to disk as JSON.
with open(r'../data/model_evaluations/prediction_ground_truth_tuples_majority_LLM_3_classes.json', mode='w') as json_file:
    json.dump(prediction_ground_truth_tuples_majority_LLM_3_classes, json_file)

### Cross-validation for all 5 runs - 3 classes

In [5]:
# One independent, initially empty list of (ground truth, prediction) tuples
# per predictor: the held-out human and each LLM.
prediction_ground_truth_tuples_5_runs_3_classes = {
    predictor: []
    for predictor in ('Humans', 'ChatGPT-4', 'ChatGPT-3.5', 'LLaMA 8B', 'LLaMA 70B')
}

In [6]:
# Leave-one-out cross-validation against the human majority, 3 classes,
# with each of the 5 LLM runs treated as a separate prediction.
#
# Every label (human answers and model runs) is first mapped to its coarse
# label through `classes_5_to_3`. For every row, each human answer is held out
# in turn. The majority label of the remaining answers is the ground truth
# (ties broken uniformly at random); the held-out answer is the "Humans"
# prediction, and the ground truth is additionally paired with each of the
# first `n_runs` runs of every model.

# Dedicated, seeded generator so that tie-breaking is reproducible on re-run.
rng = random.Random(42)
n_runs = 5

# Start from empty lists so that re-running this cell does not append a second
# copy of every tuple to the dict created in the previous cell.
for tuples in prediction_ground_truth_tuples_5_runs_3_classes.values():
    tuples.clear()

# result key -> df_gold column holding that model's per-run labels
runs_columns = {
    'ChatGPT-4': 'ChatGPT-4 5 runs',
    'ChatGPT-3.5': 'ChatGPT-3.5 5 runs',
    'LLaMA 8B': 'LLaMA 8B 5 runs',
    'LLaMA 70B': 'LLaMA 70B 5 runs',
}

for row in range(len(df_gold)):
    # The columns hold stringified Python literals (presumably lists of
    # 5-class label strings). literal_eval only accepts literals, so arbitrary
    # code in the CSV cannot be executed the way it could with eval().
    answers = [
        classes_5_to_3[label]
        for label in ast.literal_eval(df_gold['All answers'].iloc[row])
    ]

    # Parse each model's runs once per row instead of once per split and run.
    # Indexing by range(n_runs) still raises IndexError if a row has fewer
    # than n_runs runs, so malformed rows are not silently accepted.
    model_runs = {}
    for predictor, column in runs_columns.items():
        parsed_runs = ast.literal_eval(df_gold[column].iloc[row])
        model_runs[predictor] = [classes_5_to_3[parsed_runs[run]] for run in range(n_runs)]

    for held_out, human_prediction in enumerate(answers):
        # pool = all answers except the held-out one
        pool = answers[:held_out] + answers[held_out + 1:]

        # ground truth: majority vote, random choice among ties
        counts = Counter(pool)
        top_count = max(counts.values())
        majority = rng.choice([label for label, count in counts.items() if count == top_count])

        # tuples are (ground truth, prediction)
        prediction_ground_truth_tuples_5_runs_3_classes['Humans'].append((majority, human_prediction))

        # compare this majority with each of the model's runs
        for predictor, run_predictions in model_runs.items():
            for run_prediction in run_predictions:
                prediction_ground_truth_tuples_5_runs_3_classes[predictor].append((majority, run_prediction))
        
        
    

In [10]:
# Write the 3-class per-run (ground truth, prediction) tuples to disk as JSON.
with open(r'../data/model_evaluations/prediction_ground_truth_tuples_5_runs_3_classes.json', mode='w') as json_file:
    json.dump(prediction_ground_truth_tuples_5_runs_3_classes, json_file)

### Cross-validation for all 5 runs - 5 classes

In [4]:
# One independent, initially empty list of (ground truth, prediction) tuples
# per predictor: the held-out human and each LLM.
prediction_ground_truth_tuples_5_runs = {
    predictor: []
    for predictor in ('Humans', 'ChatGPT-4', 'ChatGPT-3.5', 'LLaMA 8B', 'LLaMA 70B')
}

In [6]:
# Leave-one-out cross-validation against the human majority, 5 classes,
# with each of the 5 LLM runs treated as a separate prediction.
#
# For every row, each human answer is held out in turn. The majority label of
# the remaining answers is the ground truth (ties broken uniformly at random);
# the held-out answer is the "Humans" prediction, and the ground truth is
# additionally paired with each of the first `n_runs` runs of every model.

# Dedicated, seeded generator so that tie-breaking is reproducible on re-run.
rng = random.Random(42)
n_runs = 5

# Start from empty lists so that re-running this cell does not append a second
# copy of every tuple to the dict created in the previous cell.
for tuples in prediction_ground_truth_tuples_5_runs.values():
    tuples.clear()

# result key -> df_gold column holding that model's per-run labels
runs_columns = {
    'ChatGPT-4': 'ChatGPT-4 5 runs',
    'ChatGPT-3.5': 'ChatGPT-3.5 5 runs',
    'LLaMA 8B': 'LLaMA 8B 5 runs',
    'LLaMA 70B': 'LLaMA 70B 5 runs',
}

for row in range(len(df_gold)):
    # The columns hold stringified Python literals (presumably lists of label
    # strings). literal_eval only accepts literals, so arbitrary code in the
    # CSV cannot be executed the way it could with eval().
    answers = ast.literal_eval(df_gold['All answers'].iloc[row])

    # Parse each model's runs once per row instead of once per split and run.
    # Indexing by range(n_runs) still raises IndexError if a row has fewer
    # than n_runs runs, so malformed rows are not silently accepted.
    model_runs = {}
    for predictor, column in runs_columns.items():
        parsed_runs = ast.literal_eval(df_gold[column].iloc[row])
        model_runs[predictor] = [parsed_runs[run] for run in range(n_runs)]

    for held_out, human_prediction in enumerate(answers):
        # pool = all answers except the held-out one
        pool = answers[:held_out] + answers[held_out + 1:]

        # ground truth: majority vote, random choice among ties
        counts = Counter(pool)
        top_count = max(counts.values())
        majority = rng.choice([label for label, count in counts.items() if count == top_count])

        # tuples are (ground truth, prediction)
        prediction_ground_truth_tuples_5_runs['Humans'].append((majority, human_prediction))

        # compare this majority with each of the model's runs
        for predictor, run_predictions in model_runs.items():
            for run_prediction in run_predictions:
                prediction_ground_truth_tuples_5_runs[predictor].append((majority, run_prediction))
        
        
    

In [8]:
# Write the 5-class per-run (ground truth, prediction) tuples to disk as JSON.
with open(r'../data/model_evaluations/prediction_ground_truth_tuples_5_runs.json', mode='w') as json_file:
    json.dump(prediction_ground_truth_tuples_5_runs, json_file)