In [None]:
import numpy as np
import pandas as pd
import re
import pickle
import random
import json
from collections import Counter
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score, matthews_corrcoef

In this notebook, we evaluate models and humans on the following metrics:

- Precision
- Recall
- F1-Score (macro)
- Matthews Correlation Coefficient (MCC)

Additionally:

- "Majority LLM" means we are evaluating LLMs based on their majority class out of the 5 runs.
- "5 runs" means we are treating each of the 5 LLM runs as a separate prediction.

We run the analysis for both the 5-class and the 3-class settings.

#### Load our final dataset

In [None]:
# Read the final dataset; the path is relative to the notebook's working directory.
df_gold = pd.read_csv(r"../data/df_gold.csv")

In [7]:
# Fine-grained label set (5 classes); this order is used when zipping per-class scores.
classes_5 = [
    'Unrelated',
    'Consistent',
    'Indirect inconsistency',
    "Factual inconsistency",
    'Surface contradiction',
]

# Coarse label set (3 classes).
classes_3 = [
    'Unrelated',
    'Consistent',
    'Inconsistent',
]

# Mapping from each fine-grained label to its coarse label:
# the three inconsistency/contradiction labels collapse into 'Inconsistent'.
classes_5_to_3 = {
    'Unrelated': 'Unrelated',
    'Consistent': 'Consistent',
    'Indirect inconsistency': 'Inconsistent',
    "Factual inconsistency": 'Inconsistent',
    'Surface contradiction': 'Inconsistent',
}

In [6]:
# Bootstrap (ground_truth, prediction) pairs are stored as pickles.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(r"../data/model_evaluations/bootstrap_predictions_tuples.pkl", 'rb') as pkl_file:
    bootstrap_predictions_tuples = pickle.load(pkl_file)

with open(r"../data/model_evaluations/bootstrap_predictions_tuples_3_classes.pkl", 'rb') as pkl_file:
    bootstrap_predictions_tuples_3_classes = pickle.load(pkl_file)

# Per-model (ground_truth, prediction) pairs are stored as JSON, keyed by model type.
with open(r"../data/model_evaluations/prediction_ground_truth_tuples_majority_LLM.json", 'r') as json_file:
    prediction_ground_truth_tuples_majority_LLM = json.load(json_file)

with open(r"../data/model_evaluations/prediction_ground_truth_tuples_majority_LLM_3_classes.json", 'r') as json_file:
    prediction_ground_truth_tuples_majority_LLM_3_classes = json.load(json_file)

with open(r"../data/model_evaluations/prediction_ground_truth_tuples_5_runs_3_classes.json", 'r') as json_file:
    prediction_ground_truth_tuples_5_runs_3_classes = json.load(json_file)

with open(r"../data/model_evaluations/prediction_ground_truth_tuples_5_runs.json", 'r') as json_file:
    prediction_ground_truth_tuples_5_runs = json.load(json_file)

#### Calculate Matthews Correlation Coefficient (MCC)

### Majority LLM

#### 5 classes

In [8]:
# One-vs-rest MCC per class for every model type (majority-vote predictions, 5 classes).
matthews_corrcoef_majority_LLM_per_class = dict()

for model_type, pairs in prediction_ground_truth_tuples_majority_LLM.items():
    print(model_type)

    # Split the (ground_truth, prediction) pairs into two parallel lists.
    y_true = [ground_truth for ground_truth, _ in pairs]
    y_pred = [prediction for _, prediction in pairs]

    mcc_scores = []
    for class_name in classes_5:
        # Binarise: 1 where the label is the current class, 0 for every other label.
        y_true_binary = np.array([int(label == class_name) for label in y_true])
        y_pred_binary = np.array([int(label == class_name) for label in y_pred])
        mcc_scores.append(matthews_corrcoef(y_true_binary, y_pred_binary))

    # Report the scores in the same order as classes_5.
    for class_name, mcc in zip(classes_5, mcc_scores):
        print(f"MCC for class '{class_name}': {mcc}")

    matthews_corrcoef_majority_LLM_per_class[model_type] = dict(zip(classes_5, mcc_scores))



Humans
MCC for class 'Unrelated': 0.5118594119149651
MCC for class 'Consistent': 0.6268537446684619
MCC for class 'Indirect inconsistency': 0.24836592507889102
MCC for class 'Factual inconsistency': 0.25585335049644997
MCC for class 'Surface contradiction': 0.4175184199168447
ChatGPT-4
MCC for class 'Unrelated': 0.5478670649304451
MCC for class 'Consistent': 0.6694946961029499
MCC for class 'Indirect inconsistency': 0.1743969492825913
MCC for class 'Factual inconsistency': 0.1834291017531791
MCC for class 'Surface contradiction': 0.3280374205878848
ChatGPT-3.5
MCC for class 'Unrelated': 0.2826301125441576
MCC for class 'Consistent': 0.4164564190994086
MCC for class 'Indirect inconsistency': 0.08877300291709384
MCC for class 'Factual inconsistency': 0.16703238114784744
MCC for class 'Surface contradiction': 0.19384294648407363
LLaMA 8B
MCC for class 'Unrelated': 0.051258928298996816
MCC for class 'Consistent': 0.24976100547798227
MCC for class 'Indirect inconsistency': -0.02142191964215

In [20]:
# Reload previously saved F1 scores (majority LLM).
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open(r"../data/model_evaluations/f1_score_per_model_type_majority_LLM.pkl", 'rb') as pkl_file:
    f1_score_per_model_type_majority_LLM = pickle.load(pkl_file)

In [9]:
# One-vs-rest MCC per class for the bootstrap predictions (5 classes),
# stored under the 'bootstrap' key next to the per-model results.
model_type = 'bootstrap'

y_true = [ground_truth for ground_truth, _ in bootstrap_predictions_tuples]
y_pred = [prediction for _, prediction in bootstrap_predictions_tuples]

# For each class, binarise both label lists (1 = this class, 0 = any other) and score them.
mcc_scores = [
    matthews_corrcoef(
        np.array([int(label == class_name) for label in y_true]),
        np.array([int(label == class_name) for label in y_pred]),
    )
    for class_name in classes_5
]

for class_name, mcc in zip(classes_5, mcc_scores):
    print(f"MCC for class '{class_name}': {mcc}")

matthews_corrcoef_majority_LLM_per_class[model_type] = dict(zip(classes_5, mcc_scores))

MCC for class 'Unrelated': 0.7157325874971767
MCC for class 'Consistent': 0.7857342723881804
MCC for class 'Indirect inconsistency': 0.5906623958002127
MCC for class 'Factual inconsistency': 0.57283963807414
MCC for class 'Surface contradiction': 0.6749434614250672


In [10]:
# Persist the per-class MCC results (majority LLM, 5 classes) to disk.
with open(r'../data/model_evaluations/matthews_corrcoef_majority_LLM_per_class.pkl', 'wb') as out_file:
    pickle.dump(matthews_corrcoef_majority_LLM_per_class, out_file)

#### 3 classes

In [11]:
# One-vs-rest MCC per class (majority-vote predictions, 3 classes).
matthews_corrcoef_majority_LLM_per_class_3_classes = dict()

# Run the bootstrap predictions through the same loop as the models instead of a
# copy-pasted tail. This also prints a 'bootstrap' header, so its rows are no longer
# mistaken for a second set of values belonging to the last model.
pairs_by_model_type = {
    **prediction_ground_truth_tuples_majority_LLM_3_classes,
    'bootstrap': bootstrap_predictions_tuples_3_classes,
}

for model_type, pairs in pairs_by_model_type.items():
    print(model_type)

    # Split the (ground_truth, prediction) pairs into two parallel lists.
    y_true = [ground_truth for ground_truth, _ in pairs]
    y_pred = [prediction for _, prediction in pairs]

    mcc_scores = []
    for class_name in classes_3:
        # Binarise: 1 where the label is the current class, 0 for every other label.
        y_true_binary = np.array([int(label == class_name) for label in y_true])
        y_pred_binary = np.array([int(label == class_name) for label in y_pred])
        mcc_scores.append(matthews_corrcoef(y_true_binary, y_pred_binary))

    for class_name, mcc in zip(classes_3, mcc_scores):
        print(f"MCC for class '{class_name}': {mcc}")

    matthews_corrcoef_majority_LLM_per_class_3_classes[model_type] = dict(zip(classes_3, mcc_scores))



Humans
MCC for class 'Unrelated': 0.5026503128019292
MCC for class 'Consistent': 0.6367265146427177
MCC for class 'Inconsistent': 0.6168235295892426
ChatGPT-4
MCC for class 'Unrelated': 0.5478002081755924
MCC for class 'Consistent': 0.6624972248689089
MCC for class 'Inconsistent': 0.6186233202710536
ChatGPT-3.5
MCC for class 'Unrelated': 0.3080388858278656
MCC for class 'Consistent': 0.43242698976411825
MCC for class 'Inconsistent': 0.3783360802867473
LLaMA 8B
MCC for class 'Unrelated': 0.06190079741535469
MCC for class 'Consistent': 0.2451971829261946
MCC for class 'Inconsistent': 0.10673700529267396
LLaMA 70B
MCC for class 'Unrelated': 0.5245552760933625
MCC for class 'Consistent': 0.7073268914805231
MCC for class 'Inconsistent': 0.6326792968688756
MCC for class 'Unrelated': 0.7271250758242624
MCC for class 'Consistent': 0.7982638493044966
MCC for class 'Inconsistent': 0.786103408719816


In [12]:
# Persist the per-class MCC results (majority LLM, 3 classes) to disk.
with open(r'../data/model_evaluations/matthews_corrcoef_majority_LLM_per_class_3_classes.pkl', 'wb') as out_file:
    pickle.dump(matthews_corrcoef_majority_LLM_per_class_3_classes, out_file)

### MCC: 5 runs

#### 3 classes

In [15]:
# One-vs-rest MCC per class (each of the 5 runs counted separately, 3 classes).
matthews_corrcoef_5_runs_per_class_3_classes = dict()

# Run the bootstrap predictions through the same loop as the models instead of a
# copy-pasted tail. This also prints a 'bootstrap' header, so its rows are no longer
# mistaken for a second set of values belonging to the last model.
pairs_by_model_type = {
    **prediction_ground_truth_tuples_5_runs_3_classes,
    'bootstrap': bootstrap_predictions_tuples_3_classes,
}

for model_type, pairs in pairs_by_model_type.items():
    print(model_type)

    # Split the (ground_truth, prediction) pairs into two parallel lists.
    y_true = [ground_truth for ground_truth, _ in pairs]
    y_pred = [prediction for _, prediction in pairs]

    mcc_scores = []
    for class_name in classes_3:
        # Binarise: 1 where the label is the current class, 0 for every other label.
        y_true_binary = np.array([int(label == class_name) for label in y_true])
        y_pred_binary = np.array([int(label == class_name) for label in y_pred])
        mcc_scores.append(matthews_corrcoef(y_true_binary, y_pred_binary))

    for class_name, mcc in zip(classes_3, mcc_scores):
        print(f"MCC for class '{class_name}': {mcc}")

    matthews_corrcoef_5_runs_per_class_3_classes[model_type] = dict(zip(classes_3, mcc_scores))



Humans
MCC for class 'Unrelated': 0.5200451910514584
MCC for class 'Consistent': 0.6342229284856554
MCC for class 'Inconsistent': 0.6201655777477595
ChatGPT-4
MCC for class 'Unrelated': 0.5479387493515656
MCC for class 'Consistent': 0.6289176599252758
MCC for class 'Inconsistent': 0.6099645822612041
ChatGPT-3.5
MCC for class 'Unrelated': 0.292490180176298
MCC for class 'Consistent': 0.4173021985534696
MCC for class 'Inconsistent': 0.36723945318802514
LLaMA 8B
MCC for class 'Unrelated': 0.127842448832716
MCC for class 'Consistent': 0.3157350332063401
MCC for class 'Inconsistent': 0.19465537990682572
LLaMA 70B
MCC for class 'Unrelated': 0.5170274762429716
MCC for class 'Consistent': 0.705692274669214
MCC for class 'Inconsistent': 0.6241615705543863
MCC for class 'Unrelated': 0.7271250758242624
MCC for class 'Consistent': 0.7982638493044966
MCC for class 'Inconsistent': 0.786103408719816


In [16]:
# Persist the per-class MCC results (5 runs, 3 classes) to disk.
with open(r'../data/model_evaluations/matthews_corrcoef_5_runs_per_class_3_classes.pkl', 'wb') as out_file:
    pickle.dump(matthews_corrcoef_5_runs_per_class_3_classes, out_file)

#### 5 classes

In [17]:
# One-vs-rest MCC per class (each of the 5 runs counted separately, 5 classes).
matthews_corrcoef_5_runs_per_class = dict()

# Run the bootstrap predictions through the same loop as the models instead of a
# copy-pasted tail. This also prints a 'bootstrap' header, so its rows are no longer
# mistaken for a second set of values belonging to the last model.
pairs_by_model_type = {
    **prediction_ground_truth_tuples_5_runs,
    'bootstrap': bootstrap_predictions_tuples,
}

for model_type, pairs in pairs_by_model_type.items():
    print(model_type)

    # Split the (ground_truth, prediction) pairs into two parallel lists.
    y_true = [ground_truth for ground_truth, _ in pairs]
    y_pred = [prediction for _, prediction in pairs]

    mcc_scores = []
    for class_name in classes_5:
        # Binarise: 1 where the label is the current class, 0 for every other label.
        y_true_binary = np.array([int(label == class_name) for label in y_true])
        y_pred_binary = np.array([int(label == class_name) for label in y_pred])
        mcc_scores.append(matthews_corrcoef(y_true_binary, y_pred_binary))

    for class_name, mcc in zip(classes_5, mcc_scores):
        print(f"MCC for class '{class_name}': {mcc}")

    matthews_corrcoef_5_runs_per_class[model_type] = dict(zip(classes_5, mcc_scores))



Humans
MCC for class 'Unrelated': 0.4898683723899008
MCC for class 'Consistent': 0.6240924859438436
MCC for class 'Indirect inconsistency': 0.2470729047013832
MCC for class 'Factual inconsistency': 0.26115717607945876
MCC for class 'Surface contradiction': 0.4125043198569154
ChatGPT-4
MCC for class 'Unrelated': 0.5336216117558299
MCC for class 'Consistent': 0.6269120062815203
MCC for class 'Indirect inconsistency': 0.17552635065737107
MCC for class 'Factual inconsistency': 0.22042918435148218
MCC for class 'Surface contradiction': 0.3272980890203094
ChatGPT-3.5
MCC for class 'Unrelated': 0.2741280801376437
MCC for class 'Consistent': 0.403543239303218
MCC for class 'Indirect inconsistency': 0.0931470037114452
MCC for class 'Factual inconsistency': 0.16758393613193637
MCC for class 'Surface contradiction': 0.17050173166598745
LLaMA 8B
MCC for class 'Unrelated': 0.13503134353375384
MCC for class 'Consistent': 0.3104549563718792
MCC for class 'Indirect inconsistency': 0.002787613104032726

In [18]:
# Persist the per-class MCC results (5 runs, 5 classes) to disk.
with open(r'../data/model_evaluations/matthews_corrcoef_5_runs_per_class.pkl', 'wb') as out_file:
    pickle.dump(matthews_corrcoef_5_runs_per_class, out_file)

### Precision

In [40]:
# Per-class and macro precision (majority-vote predictions, 3 classes).
final_precision_majority_LLM_3_classses = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_majority_LLM_3_classes,
    'bootstrap': bootstrap_predictions_tuples_3_classes,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_3 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    precision = precision_score(true_labels, predictions, labels=classes_3, average=None)
    macro = precision_score(true_labels, predictions, labels=classes_3, average='macro')

    final_precision_majority_LLM_3_classses[model] = dict(zip(classes_3, precision))
    final_precision_majority_LLM_3_classses[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [43]:
# Persist the precision results (majority LLM, 3 classes) to disk.
with open(r'../data/model_evaluations/final_precision_majority_LLM_3_classses.pkl', 'wb') as out_file:
    pickle.dump(final_precision_majority_LLM_3_classses, out_file)

In [44]:
# Per-class and macro precision (majority-vote predictions, 5 classes).
final_precision_majority_LLM = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_majority_LLM,
    'bootstrap': bootstrap_predictions_tuples,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_5 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    precision = precision_score(true_labels, predictions, labels=classes_5, average=None)
    macro = precision_score(true_labels, predictions, labels=classes_5, average='macro')

    final_precision_majority_LLM[model] = dict(zip(classes_5, precision))
    final_precision_majority_LLM[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [46]:
# Persist the precision results (majority LLM, 5 classes) to disk.
with open(r'../data/model_evaluations/final_precision_majority_LLM.pkl', 'wb') as out_file:
    pickle.dump(final_precision_majority_LLM, out_file)

#### 5 runs

In [48]:
# Per-class and macro precision (each of the 5 runs counted separately, 3 classes).
final_precision_5_runs_3_classses = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_5_runs_3_classes,
    'bootstrap': bootstrap_predictions_tuples_3_classes,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_3 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    precision = precision_score(true_labels, predictions, labels=classes_3, average=None)
    macro = precision_score(true_labels, predictions, labels=classes_3, average='macro')

    final_precision_5_runs_3_classses[model] = dict(zip(classes_3, precision))
    final_precision_5_runs_3_classses[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [49]:
# Persist the precision results (5 runs, 3 classes) to disk.
with open(r'../data/model_evaluations/final_precision_5_runs_3_classses.pkl', 'wb') as out_file:
    pickle.dump(final_precision_5_runs_3_classses, out_file)

In [50]:
# Per-class and macro precision (each of the 5 runs counted separately, 5 classes).
final_precision_5_runs = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_5_runs,
    'bootstrap': bootstrap_predictions_tuples,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_5 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    precision = precision_score(true_labels, predictions, labels=classes_5, average=None)
    macro = precision_score(true_labels, predictions, labels=classes_5, average='macro')

    final_precision_5_runs[model] = dict(zip(classes_5, precision))
    final_precision_5_runs[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [51]:
# Persist the precision results (5 runs, 5 classes) to disk.
with open(r'../data/model_evaluations/final_precision_5_runs.pkl', 'wb') as out_file:
    pickle.dump(final_precision_5_runs, out_file)

### Recall

In [52]:
# Per-class and macro recall (majority-vote predictions, 3 classes).
final_recall_majority_LLM_3_classses = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_majority_LLM_3_classes,
    'bootstrap': bootstrap_predictions_tuples_3_classes,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_3 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    recall = recall_score(true_labels, predictions, labels=classes_3, average=None)
    macro = recall_score(true_labels, predictions, labels=classes_3, average='macro')

    final_recall_majority_LLM_3_classses[model] = dict(zip(classes_3, recall))
    final_recall_majority_LLM_3_classses[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [54]:
# Persist the recall results (majority LLM, 3 classes) to disk.
with open(r'../data/model_evaluations/final_recall_majority_LLM_3_classses.pkl', 'wb') as out_file:
    pickle.dump(final_recall_majority_LLM_3_classses, out_file)

In [55]:
# Per-class and macro recall (majority-vote predictions, 5 classes).
final_recall_majority_LLM = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_majority_LLM,
    'bootstrap': bootstrap_predictions_tuples,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_5 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    recall = recall_score(true_labels, predictions, labels=classes_5, average=None)
    macro = recall_score(true_labels, predictions, labels=classes_5, average='macro')

    final_recall_majority_LLM[model] = dict(zip(classes_5, recall))
    final_recall_majority_LLM[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [58]:
# Persist the recall results (majority LLM, 5 classes) to disk.
with open(r'../data/model_evaluations/final_recall_majority_LLM.pkl', 'wb') as out_file:
    pickle.dump(final_recall_majority_LLM, out_file)

#### 5 runs

In [59]:
# Per-class and macro recall (each of the 5 runs counted separately, 3 classes).
final_recall_5_runs_3_classses = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_5_runs_3_classes,
    'bootstrap': bootstrap_predictions_tuples_3_classes,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_3 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    recall = recall_score(true_labels, predictions, labels=classes_3, average=None)
    macro = recall_score(true_labels, predictions, labels=classes_3, average='macro')

    final_recall_5_runs_3_classses[model] = dict(zip(classes_3, recall))
    final_recall_5_runs_3_classses[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [60]:
# Persist the recall results (5 runs, 3 classes) to disk.
with open(r'../data/model_evaluations/final_recall_5_runs_3_classses.pkl', 'wb') as out_file:
    pickle.dump(final_recall_5_runs_3_classses, out_file)

In [61]:
# Per-class and macro recall (each of the 5 runs counted separately, 5 classes).
final_recall_5_runs = dict()

# Run the bootstrap predictions through the same loop as the models.
answers_by_model = {
    **prediction_ground_truth_tuples_5_runs,
    'bootstrap': bootstrap_predictions_tuples,
}

for model, answers in answers_by_model.items():
    print(model)
    # Each answer is a (ground_truth, prediction) pair.
    true_labels = [el[0] for el in answers]
    predictions = [el[1] for el in answers]

    # Pass the same explicit label set to both calls. Without it, a label outside
    # classes_5 would be averaged into 'macro', and 'macro' would no longer equal
    # the mean of the per-class values stored next to it.
    recall = recall_score(true_labels, predictions, labels=classes_5, average=None)
    macro = recall_score(true_labels, predictions, labels=classes_5, average='macro')

    final_recall_5_runs[model] = dict(zip(classes_5, recall))
    final_recall_5_runs[model]['macro'] = macro


Humans
ChatGPT-4
ChatGPT-3.5
LLaMA 8B
LLaMA 70B
bootstrap


In [62]:
# Persist the recall results (5 runs, 5 classes) to disk.
with open(r'../data/model_evaluations/final_recall_5_runs.pkl', 'wb') as out_file:
    pickle.dump(final_recall_5_runs, out_file)

### F1-score for majority LLM - 3 classes

In [23]:
# Per-class, macro and micro F1 for every model type (majority-vote predictions, 3 classes).
f1_score_per_model_type_majority_LLM_3_classes = dict()

for model_type, pairs in prediction_ground_truth_tuples_majority_LLM_3_classes.items():
    print(model_type)
    # Split the (ground_truth, prediction) pairs into two parallel lists.
    true_labels = [ground_truth for ground_truth, _ in pairs]
    predictions = [prediction for _, prediction in pairs]

    res_f1_score = f1_score(true_labels, predictions, labels=classes_3, average=None)
    print("Per class", res_f1_score)
    # Sanity check: the plain mean of the per-class scores should match the macro score below.
    print('Just mean', np.mean(res_f1_score))

    res_f1_score_macro = f1_score(true_labels, predictions, labels=classes_3, average='macro')
    print("Macro", res_f1_score_macro)

    res_f1_score_micro = f1_score(true_labels, predictions, labels=classes_3, average='micro')
    print("Micro", res_f1_score_micro)

    # Build the entry once; the earlier empty-dict placeholder was always overwritten.
    f1_score_per_model_type_majority_LLM_3_classes[model_type] = {
        **dict(zip(classes_3, res_f1_score)),
        'macro': res_f1_score_macro,
        'micro': res_f1_score_micro,
    }


Humans
Per class [0.58319039 0.68117519 0.88539778]
Just mean 0.716587789808988
Macro 0.716587789808988
Micro 0.8087943262411349
ChatGPT-4
Per class [0.62       0.7026455  0.87325702]
Just mean 0.731967508859635
Macro 0.731967508859635
Micro 0.8036879432624113
ChatGPT-3.5
Per class [0.1986755  0.41463415 0.86750681]
Just mean 0.49360548500643536
Macro 0.49360548500643536
Micro 0.7733333333333333
LLaMA 8B
Per class [0.21551724 0.34756554 0.6726674 ]
Just mean 0.41191672763789944
Macro 0.41191672763789944
Micro 0.535886524822695
LLaMA 70B
Per class [0.55253837 0.74193548 0.90612555]
Just mean 0.7335331332738586
Macro 0.7335331332738586
Micro 0.8419858156028368


In [25]:
# Per-class, macro and micro F1 for the bootstrap predictions (3 classes),
# stored under the 'bootstrap' key next to the per-model results.
model_type = 'bootstrap'

true_labels = [ground_truth for ground_truth, _ in bootstrap_predictions_tuples_3_classes]
predictions = [prediction for _, prediction in bootstrap_predictions_tuples_3_classes]

res_f1_score = f1_score(true_labels, predictions, labels=classes_3, average=None)
print("Per class", res_f1_score)
# Sanity check: the plain mean of the per-class scores should match the macro score below.
print('Just mean', np.mean(res_f1_score))

res_f1_score_macro = f1_score(true_labels, predictions, labels=classes_3, average='macro')
print("Macro", res_f1_score_macro)

res_f1_score_micro = f1_score(true_labels, predictions, labels=classes_3, average='micro')
print("Micro", res_f1_score_micro)

# Build the entry once; the earlier empty-dict placeholder was always overwritten.
f1_score_per_model_type_majority_LLM_3_classes[model_type] = {
    **dict(zip(classes_3, res_f1_score)),
    'macro': res_f1_score_macro,
    'micro': res_f1_score_micro,
}
   

Per class [0.76945039 0.82290398 0.93802598]
Just mean 0.8434601173271709
Macro 0.8434601173271709
Micro 0.8964822695035461


In [26]:
# Persist the F1 results (majority LLM, 3 classes) to disk.
with open(r'../data/model_evaluations/f1_score_per_model_type_majority_LLM_3_classes.pkl', 'wb') as out_file:
    pickle.dump(f1_score_per_model_type_majority_LLM_3_classes, out_file)

### F1-score for 5 runs - 3 classes

In [19]:
# Per-class, macro and micro F1 for every model type (each of the 5 runs counted separately, 3 classes).
f1_score_per_model_type_5_runs_3_classes = dict()

for model_type, pairs in prediction_ground_truth_tuples_5_runs_3_classes.items():
    print(model_type)
    # Split the (ground_truth, prediction) pairs into two parallel lists.
    true_labels = [ground_truth for ground_truth, _ in pairs]
    predictions = [prediction for _, prediction in pairs]

    res_f1_score = f1_score(true_labels, predictions, labels=classes_3, average=None)
    print("Per class", res_f1_score)
    # Sanity check: the plain mean of the per-class scores should match the macro score below.
    print('Just mean', np.mean(res_f1_score))

    res_f1_score_macro = f1_score(true_labels, predictions, labels=classes_3, average='macro')
    print("Macro", res_f1_score_macro)

    res_f1_score_micro = f1_score(true_labels, predictions, labels=classes_3, average='micro')
    print("Micro", res_f1_score_micro)

    # Build the entry once; the earlier empty-dict placeholder was always overwritten.
    f1_score_per_model_type_5_runs_3_classes[model_type] = {
        **dict(zip(classes_3, res_f1_score)),
        'macro': res_f1_score_macro,
        'micro': res_f1_score_micro,
    }



Humans
Per class [0.59846547 0.67965368 0.88592772]
Just mean 0.7213489577909499
Macro 0.7213489577909499
Micro 0.8110638297872339
ChatGPT-4
Per class [0.62141872 0.67434073 0.86991021]
Just mean 0.7218898871563099
Macro 0.7218898871563099
Micro 0.7973900709219858
ChatGPT-3.5
Per class [0.18151815 0.40425532 0.86419163]
Just mean 0.4833217013589303
Macro 0.4833217013589303
Micro 0.7674893617021277
LLaMA 8B
Per class [0.27590759 0.40278408 0.69757761]
Just mean 0.4587564275219275
Macro 0.4587564275219275
Micro 0.5698156028368795
LLaMA 70B
Per class [0.54583723 0.74070946 0.90266026]
Just mean 0.7297356514187122
Macro 0.7297356514187122
Micro 0.8376170212765958


In [20]:
# Per-class, macro and micro F1 for the bootstrap predictions (3 classes),
# stored under the 'bootstrap' key next to the per-model 5-run results.
model_type = 'bootstrap'

true_labels = [ground_truth for ground_truth, _ in bootstrap_predictions_tuples_3_classes]
predictions = [prediction for _, prediction in bootstrap_predictions_tuples_3_classes]

res_f1_score = f1_score(true_labels, predictions, labels=classes_3, average=None)
print("Per class", res_f1_score)
# Sanity check: the plain mean of the per-class scores should match the macro score below.
print('Just mean', np.mean(res_f1_score))

res_f1_score_macro = f1_score(true_labels, predictions, labels=classes_3, average='macro')
print("Macro", res_f1_score_macro)

res_f1_score_micro = f1_score(true_labels, predictions, labels=classes_3, average='micro')
print("Micro", res_f1_score_micro)

# Build the entry once; the earlier empty-dict placeholder was always overwritten.
f1_score_per_model_type_5_runs_3_classes[model_type] = {
    **dict(zip(classes_3, res_f1_score)),
    'macro': res_f1_score_macro,
    'micro': res_f1_score_micro,
}
   

Per class [0.76945039 0.82290398 0.93802598]
Just mean 0.8434601173271709
Macro 0.8434601173271709
Micro 0.8964822695035461


In [22]:
# Persist the F1 results (5 runs, 3 classes) to disk.
with open(r'../data/model_evaluations/f1_score_per_model_type_5_runs_3_classes.pkl', 'wb') as out_file:
    pickle.dump(f1_score_per_model_type_5_runs_3_classes, out_file)

### F1-score for 5 runs - 5 classes

In [42]:
# Per-class, macro and micro F1 for every model type (each of the 5 runs counted separately, 5 classes).
f1_score_per_model_type_5_runs = dict()

for model_type, pairs in prediction_ground_truth_tuples_5_runs.items():
    print(model_type)
    # Split the (ground_truth, prediction) pairs into two parallel lists.
    true_labels = [ground_truth for ground_truth, _ in pairs]
    predictions = [prediction for _, prediction in pairs]

    res_f1_score = f1_score(true_labels, predictions, labels=classes_5, average=None)
    print("Per class", res_f1_score)
    # Sanity check: the plain mean of the per-class scores should match the macro score below.
    print('Just mean', np.mean(res_f1_score))

    res_f1_score_macro = f1_score(true_labels, predictions, labels=classes_5, average='macro')
    print("Macro", res_f1_score_macro)

    res_f1_score_micro = f1_score(true_labels, predictions, labels=classes_5, average='micro')
    print("Micro", res_f1_score_micro)

    # Build the entry once; the earlier empty-dict placeholder was always overwritten.
    f1_score_per_model_type_5_runs[model_type] = {
        **dict(zip(classes_5, res_f1_score)),
        'macro': res_f1_score_macro,
        'micro': res_f1_score_micro,
    }



Humans
Per class [0.57928803 0.67559217 0.44311377 0.39754413 0.55431591]
Just mean 0.5299708026780671
Macro 0.5299708026780671
Micro 0.5174468085106383
ChatGPT-4
Per class [0.61970184 0.67863555 0.45127693 0.23447591 0.31134367]
Just mean 0.4590867798983247
Macro 0.4590867798983247
Micro 0.4702978723404255
ChatGPT-3.5
Per class [0.16442451 0.38424897 0.24084527 0.35903064 0.41092375]
Just mean 0.31189462912921684
Macro 0.31189462912921684
Micro 0.3357163120567376
LLaMA 8B
Per class [0.29113725 0.41291642 0.16266945 0.24903503 0.27922624]
Just mean 0.2789968781886877
Macro 0.2789968781886877
Micro 0.27914893617021275
LLaMA 70B
Per class [0.53016772 0.7294307  0.509321   0.24245305 0.48415156]
Just mean 0.4991048046691481
Macro 0.4991048046691481
Micro 0.5063829787234042


In [43]:
# F1 scores of the bootstrap predictions in the 5-class setting. The results
# are stored under the 'bootstrap' key of the 5-runs score dict.
model_type = 'bootstrap'
my_dict = f1_score_per_model_type_5_runs[model_type] = {}

# Split the (ground truth, prediction) pairs into two parallel lists.
true_labels = [truth for truth, _ in bootstrap_predictions_tuples]
predictions = [guess for _, guess in bootstrap_predictions_tuples]

# average=None yields one score per entry of `labels`, in the same order.
res_f1_score = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average=None)
print("Per class", res_f1_score)
my_dict.update(zip(classes_5, res_f1_score))
# Sanity check: printed next to the 'macro' score for comparison.
print('Just mean', np.mean(res_f1_score))

res_f1_score_macro = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average='macro')
print("Macro", res_f1_score_macro)
my_dict['macro'] = res_f1_score_macro

res_f1_score_micro = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average='micro')
print("Micro", res_f1_score_micro)
my_dict['micro'] = res_f1_score_micro
   

Per class [0.76555798 0.81462905 0.69751779 0.6507764  0.7545423 ]
Just mean 0.7366047033160935
Macro 0.7366047033160935
Micro 0.7307801418439717


In [44]:
# Persist the 5-runs / 5-classes F1 scores (per class, macro and micro) as a pickle.
with open(r'../data/model_evaluations/f1_score_per_model_type_5_runs.pkl', mode='wb') as f:
    pickle.dump(f1_score_per_model_type_5_runs, f)

### F1-score for majority LLM - 5 classes

In [39]:
# F1 scores per model type in the 5-class setting, computed on the
# majority-LLM (ground truth, prediction) pairs.
# Resulting layout: {model_type: {class name: F1, ..., 'macro': F1, 'micro': F1}}
f1_score_per_model_type_majority_LLM = {}

for model_type in prediction_ground_truth_tuples_majority_LLM:
    print(model_type)
    my_dict = f1_score_per_model_type_majority_LLM[model_type] = {}

    # Split the (ground truth, prediction) pairs into two parallel lists.
    true_labels = [truth for truth, _ in prediction_ground_truth_tuples_majority_LLM[model_type]]
    predictions = [guess for _, guess in prediction_ground_truth_tuples_majority_LLM[model_type]]

    # average=None yields one score per entry of `labels`, in the same order.
    res_f1_score = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average=None)
    print("Per class", res_f1_score)
    my_dict.update(zip(classes_5, res_f1_score))
    # Sanity check: printed next to the 'macro' score for comparison.
    print('Just mean', np.mean(res_f1_score))

    res_f1_score_macro = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average='macro')
    print("Macro", res_f1_score_macro)
    my_dict['macro'] = res_f1_score_macro

    res_f1_score_micro = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average='micro')
    print("Micro", res_f1_score_micro)
    my_dict['micro'] = res_f1_score_micro



Humans
Per class [0.5984     0.67708333 0.44347351 0.39227799 0.55892649]
Just mean 0.5340322650203002
Macro 0.5340322650203002
Micro 0.5214184397163121
ChatGPT-4
Per class [0.63294798 0.71399594 0.44950213 0.19768935 0.31588613]
Just mean 0.4620043066357198
Macro 0.4620043066357198
Micro 0.47404255319148936
ChatGPT-3.5
Per class [0.1744186  0.39349593 0.22979985 0.35601118 0.43161634]
Just mean 0.31706838319602587
Macro 0.31706838319602587
Micro 0.3415602836879433
LLaMA 8B
Per class [0.21864952 0.36337209 0.13168087 0.16833333 0.27127385]
Just mean 0.230661932817461
Macro 0.230661932817461
Micro 0.23687943262411348
LLaMA 70B
Per class [0.53061224 0.73738414 0.50658561 0.25298329 0.47294292]
Just mean 0.500101642433283
Macro 0.500101642433283
Micro 0.5049645390070922


In [40]:
# F1 scores of the bootstrap predictions in the 5-class setting. The results
# are stored under the 'bootstrap' key of the majority-LLM score dict.
# NOTE(review): this reads the same `bootstrap_predictions_tuples` as the
# 5-runs / 5-classes bootstrap cell, so both cells report identical scores.
# Presumably intended (one shared bootstrap baseline) - confirm.
model_type = 'bootstrap'
my_dict = f1_score_per_model_type_majority_LLM[model_type] = {}

# Split the (ground truth, prediction) pairs into two parallel lists.
true_labels = [truth for truth, _ in bootstrap_predictions_tuples]
predictions = [guess for _, guess in bootstrap_predictions_tuples]

# average=None yields one score per entry of `labels`, in the same order.
res_f1_score = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average=None)
print("Per class", res_f1_score)
my_dict.update(zip(classes_5, res_f1_score))
# Sanity check: printed next to the 'macro' score for comparison.
print('Just mean', np.mean(res_f1_score))

res_f1_score_macro = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average='macro')
print("Macro", res_f1_score_macro)
my_dict['macro'] = res_f1_score_macro

res_f1_score_micro = f1_score(y_true=true_labels, y_pred=predictions, labels=classes_5, average='micro')
print("Micro", res_f1_score_micro)
my_dict['micro'] = res_f1_score_micro
   

Per class [0.76555798 0.81462905 0.69751779 0.6507764  0.7545423 ]
Just mean 0.7366047033160935
Macro 0.7366047033160935
Micro 0.7307801418439717


In [41]:
# Persist the majority-LLM / 5-classes F1 scores (per class, macro and micro) as a pickle.
with open(r'../data/model_evaluations/f1_score_per_model_type_majority_LLM.pkl', mode='wb') as f:
    pickle.dump(f1_score_per_model_type_majority_LLM, f)