Calculations of the mean, standard deviation (SD), and Cohen's kappa for the sentence-level results files

## BERT - Sherlock


In [66]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import numpy as np 

# Hard-coded F1-scores summarised below
# (presumably one score per training run -- TODO confirm).
f1_scores = [0.4927, 0.5354, 0.4127]
scores = np.asarray(f1_scores)

mean_score = scores.mean()
# NumPy's default ddof=0 applies here, i.e. this is the population SD.
std_dev = scores.std()

print(f"Mean F1-score: {mean_score:.4f}")
print(f"Standard deviation of F1-scores: {std_dev:.4f}")

Mean F1-score: 0.4803
Standard deviation of F1-scores: 0.0509


In [59]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

import warnings

# Sentence-level predictions of three BERT runs on Sherlock, one TSV per run.
A = pd.read_csv('all_folds_sentence_predictions.tsv', sep='\t')
B = pd.read_csv('2all_folds_sentence_predictions.tsv', sep='\t')
C = pd.read_csv('3all_folds_sentence_predictions.tsv', sep='\t')

for df in (A, B, C):
    # A row is identified by its sentence text plus gold label. `occ` numbers
    # repeated (sentence, label) pairs in file order, so duplicated sentences
    # can still be paired one-to-one across runs.
    df['key'] = df['sentence'] + '||' + df['gold_label']
    df['occ'] = df.groupby('key').cumcount()

A = A.rename(columns={'prediction': 'pred_A'})
B = B.rename(columns={'prediction': 'pred_B'})
C = C.rename(columns={'prediction': 'pred_C'})

# validate='one_to_one' makes pandas raise a MergeError if (key, occ) is
# duplicated on either side, instead of silently multiplying rows and
# distorting the agreement statistics.
merged = (
    A[['key', 'occ', 'pred_A']]
    .merge(B[['key', 'occ', 'pred_B']], on=['key', 'occ'], validate='one_to_one')
    .merge(C[['key', 'occ', 'pred_C']], on=['key', 'occ'], validate='one_to_one')
)

# The merges are inner joins: a row missing from any run is dropped without
# notice. Surface that, because kappa would then cover only a subset.
run_sizes = {'A': len(A), 'B': len(B), 'C': len(C)}
if any(size != len(merged) for size in run_sizes.values()):
    warnings.warn(
        f"Kappa is computed on {len(merged)} aligned rows, but the runs "
        f"contain {run_sizes} rows; unmatched rows were dropped by the merge."
    )

# Pairwise Cohen's kappa between the predictions of the three runs.
kappa_AB = cohen_kappa_score(merged['pred_A'], merged['pred_B'])
kappa_AC = cohen_kappa_score(merged['pred_A'], merged['pred_C'])
kappa_BC = cohen_kappa_score(merged['pred_B'], merged['pred_C'])

print(f"Kappa A vs B: {kappa_AB:.4f}")
print(f"Kappa A vs C: {kappa_AC:.4f}")
print(f"Kappa B vs C: {kappa_BC:.4f}")


Kappa A vs B: 0.3819
Kappa A vs C: 0.3415
Kappa B vs C: 0.2631


## TuringBERT - Sherlock

In [61]:
# Hard-coded F1-scores for this model/dataset pair
# (presumably one score per training run -- TODO confirm).
f1_scores = [0.4258, 0.3849, 0.4428]
score_array = np.asarray(f1_scores)

# Compute both summary statistics first, then report them.
mean_score = score_array.mean()
std_dev = score_array.std()  # population SD: NumPy's default is ddof=0

print(f"Mean F1-score: {mean_score:.4f}")
print(f"Standard deviation of F1-scores: {std_dev:.4f}")

Mean F1-score: 0.4178
Standard deviation of F1-scores: 0.0243


In [56]:
import warnings

# Sentence-level predictions of three TuringBERT runs on Sherlock, one TSV per run.
A = pd.read_csv('Turing_all_folds_sentence_predictions.tsv', sep='\t')
B = pd.read_csv('2Turing_all_folds_sentence_predictions.tsv', sep='\t')
C = pd.read_csv('3Turing_all_folds_sentence_predictions.tsv', sep='\t')

for df in (A, B, C):
    # Build a join key from sentence text and gold label. `occ` is the running
    # index of each repeated key in file order, which keeps duplicated
    # sentences distinguishable when the runs are aligned.
    df['key'] = df['sentence'] + '||' + df['gold_label']
    df['occ'] = df.groupby('key').cumcount()

A = A.rename(columns={'prediction': 'pred_A'})
B = B.rename(columns={'prediction': 'pred_B'})
C = C.rename(columns={'prediction': 'pred_C'})

# validate='one_to_one' turns duplicated (key, occ) pairs into a MergeError
# rather than letting the join silently fan out into extra rows.
merged = (
    A[['key', 'occ', 'pred_A']]
    .merge(B[['key', 'occ', 'pred_B']], on=['key', 'occ'], validate='one_to_one')
    .merge(C[['key', 'occ', 'pred_C']], on=['key', 'occ'], validate='one_to_one')
)

# Inner joins discard rows that are not present in every run. Warn when that
# happens so the kappa values are not mistaken for full-dataset agreement.
run_sizes = {'A': len(A), 'B': len(B), 'C': len(C)}
if any(size != len(merged) for size in run_sizes.values()):
    warnings.warn(
        f"Kappa is computed on {len(merged)} aligned rows, but the runs "
        f"contain {run_sizes} rows; unmatched rows were dropped by the merge."
    )

# Pairwise Cohen's kappa between the predictions of the three runs.
kappa_AB = cohen_kappa_score(merged['pred_A'], merged['pred_B'])
kappa_AC = cohen_kappa_score(merged['pred_A'], merged['pred_C'])
kappa_BC = cohen_kappa_score(merged['pred_B'], merged['pred_C'])

print(f"Kappa A vs B: {kappa_AB:.4f}")
print(f"Kappa A vs C: {kappa_AC:.4f}")
print(f"Kappa B vs C: {kappa_BC:.4f}")

Kappa A vs B: 0.3203
Kappa A vs C: 0.3788
Kappa B vs C: 0.3084


## BERT - Friends

In [81]:
# Hard-coded F1-scores for this model/dataset pair
# (presumably one score per training run -- TODO confirm).
f1_scores = [0.6763, 0.6866, 0.6764]
run_scores = np.asarray(f1_scores)

mean_score = run_scores.mean()
std_dev = run_scores.std()  # ddof=0 by default, i.e. the population SD

print(f"Mean F1-score: {mean_score:.4f}")
print(f"Standard deviation of F1-scores: {std_dev:.4f}")

Mean F1-score: 0.6798
Standard deviation of F1-scores: 0.0048


In [94]:
# Load the benchmark predictions of the three BERT runs, one TSV per run.
modelA, modelB, modelC = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'BERT_benchmark_predictions.tsv',
        '2BERT_benchmark_predictions.tsv',
        '3BERT_benchmark_predictions.tsv',
    )
)

# NOTE(review): predictions are compared row by row, which assumes the three
# files list the same examples in the same order -- confirm.
preds_a, preds_b, preds_c = (run['prediction'] for run in (modelA, modelB, modelC))

# Pairwise Cohen's kappa between the runs.
kappa = cohen_kappa_score(preds_a, preds_b)
kappa_AC = cohen_kappa_score(preds_a, preds_c)
kappa_BC = cohen_kappa_score(preds_b, preds_c)

print(
    f"Cohen's kappa between Model A and Model B: {kappa:.4f}",
    f"Kappa A vs C: {kappa_AC:.4f}",
    f"Kappa B vs C: {kappa_BC:.4f}",
    sep="\n",
)


Cohen's kappa between Model A and Model B: 0.8740
Kappa A vs C: 0.8743
Kappa B vs C: 0.8711


## TuringBERT - Friends

In [89]:
# Hard-coded F1-scores for this model/dataset pair
# (presumably one score per training run -- TODO confirm).
f1_scores = [0.6710, 0.6665, 0.6618]
values = np.asarray(f1_scores)

# Mean and population standard deviation (NumPy's default ddof=0).
mean_score, std_dev = values.mean(), values.std()

print(f"Mean F1-score: {mean_score:.4f}")
print(f"Standard deviation of F1-scores: {std_dev:.4f}")

Mean F1-score: 0.6664
Standard deviation of F1-scores: 0.0038


In [96]:
# Load the benchmark predictions of the three TuringBERT runs, one TSV per run.
turing_runs = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'TuringBERT_benchmark_predictions.tsv',
        '2TuringBERT_benchmark_predictions.tsv',
        '3TuringBERT_benchmark_predictions.tsv',
    )
]
modelA, modelB, modelC = turing_runs

# NOTE(review): the comparison is positional, so it assumes all three files
# contain the same examples in the same row order -- confirm.
labels_a = modelA['prediction']
labels_b = modelB['prediction']
labels_c = modelC['prediction']

# Pairwise Cohen's kappa between the runs.
kappa = cohen_kappa_score(labels_a, labels_b)
kappa_AC = cohen_kappa_score(labels_a, labels_c)
kappa_BC = cohen_kappa_score(labels_b, labels_c)

print(
    f"Cohen's kappa between Model A and Model B: {kappa:.4f}",
    f"Kappa A vs C: {kappa_AC:.4f}",
    f"Kappa B vs C: {kappa_BC:.4f}",
    sep="\n",
)


Cohen's kappa between Model A and Model B: 0.8583
Kappa A vs C: 0.8443
Kappa B vs C: 0.8340
