# Evaluate model's predictions against gold labels

In [125]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score, accuracy_score

# Evaluate a model's predictions against several sets of gold labels.
#
# Builds two symmetric agreement tables (Cohen's kappa and raw accuracy) over
# every gold annotator column plus the model, appends a `mean_human` column
# holding each row's average agreement with the human annotators, and prints
# both tables together with summary statistics.

# read inference results
df_predictions = pd.read_csv('../results/pappa_dim1_flan-t5-small_0106_181636.pre.csv', sep=';')

# retrieve columns starting with "gold" and their "names"
gold_labels = df_predictions.filter(regex='^gold', axis=1)
gold_names = [col.split('gold_')[-1] for col in gold_labels.columns]
# every gold column whose name does not contain "agg" is treated as a human
human_names = [name for name in gold_names if 'agg' not in name]

# Labels are compared on their string form; convert once instead of on every
# comparison inside the loops below.
# NOTE(review): if pandas parses one column as float and another as int
# (e.g. because of missing values), '1.0' and '1' will not match - confirm
# that predictions and gold columns share the same textual format.
predictions = df_predictions['prediction'].astype(str)
gold_as_str = gold_labels.astype(str)

# define tables where to store results; created directly as float tables
# filled with 1.0 so that the diagonal (self-agreement) is already correct
table_labels = gold_names + ['model']
df_kappa = pd.DataFrame(1.0, index=table_labels, columns=table_labels)
df_accuracy = pd.DataFrame(1.0, index=table_labels, columns=table_labels)

for i, (col, name) in enumerate(zip(gold_labels.columns, gold_names)):
    gold = gold_as_str[col]

    # compare agreement of the model with gold labels
    kappa = cohen_kappa_score(predictions, gold)
    accuracy = accuracy_score(predictions, gold)
    # store results (both tables are symmetric)
    df_kappa.loc['model', name] = df_kappa.loc[name, 'model'] = kappa
    df_accuracy.loc['model', name] = df_accuracy.loc[name, 'model'] = accuracy

    # compare agreement of gold labels with each other; only pairs with
    # i < j are computed and the result is mirrored across the diagonal
    for col2, name2 in zip(gold_labels.columns[i + 1:], gold_names[i + 1:]):
        other = gold_as_str[col2]
        kappa = cohen_kappa_score(gold, other)
        accuracy = accuracy_score(gold, other)
        # store results
        df_kappa.loc[name, name2] = df_kappa.loc[name2, name] = kappa
        df_accuracy.loc[name, name2] = df_accuracy.loc[name2, name] = accuracy

# compute average agreement between humans
df_kappa['mean_human'] = df_kappa[human_names].mean(axis=1)
df_accuracy['mean_human'] = df_accuracy[human_names].mean(axis=1)

# correct for humans fully agreeing with themselves: drop the 1.0 on the
# diagonal and average over the remaining humans only. A single `.loc` write
# is used so the assignment reliably reaches the table itself (chained
# `df.col[name] = ...` assignment does not under pandas Copy-on-Write).
n_other_humans = len(human_names) - 1.0
for table in (df_kappa, df_accuracy):
    table.loc[human_names, 'mean_human'] = (
        table.loc[human_names, human_names].sum(axis=1) - 1.0
    ) / n_other_humans

# summary statistics: average over every row except the last one ('model')
humans_mean_accuracy = df_accuracy['mean_human'].iloc[:-1].mean()
model_mean_accuracy = df_accuracy['model'].iloc[:-1].mean()
humans_mean_kappa = df_kappa['mean_human'].iloc[:-1].mean()
model_mean_kappa = df_kappa['model'].iloc[:-1].mean()

print('ACCURACY:')
print(df_accuracy.round(4)*100)
print()
print(f"Humans' mean accuracy: {100*humans_mean_accuracy:.2f}")
print(f"Model's mean accuracy: {100*model_mean_accuracy:.2f}")
print(f'Diff in mean accuracy: {100*(humans_mean_accuracy - model_mean_accuracy):.2f}')
print()
print('KAPPA:')
print(df_kappa.round(4)*100)
print()
print(f"Humans' mean kappa: {100*humans_mean_kappa:.2f}")
print(f"Model's mean kappa: {100*model_mean_kappa:.2f}")
print(f'Diff in mean kappa: {100*(humans_mean_kappa - model_mean_kappa):.2f}')

ACCURACY:
         elin    lena   oscar     agg   model  mean_human
elin   100.00   60.86   61.71   74.86   13.43       61.29
lena    60.86  100.00   66.86   81.71   11.43       63.86
oscar   61.71   66.86  100.00   84.00    9.43       64.29
agg     74.86   81.71   84.00  100.00   12.57       80.19
model   13.43   11.43    9.43   12.57  100.00       11.43

Humans' mean accuracy: 67.40
Model's mean accuracy: 11.71
Diff in mean accuracy: 55.69

KAPPA:
         elin    lena   oscar     agg   model  mean_human
elin   100.00   48.68   49.93   67.28   -0.08       49.30
lena    48.68  100.00   53.37   75.09    0.10       51.02
oscar   49.93   53.37  100.00   78.19    0.08       51.65
agg     67.28   75.09   78.19  100.00    0.16       73.52
model   -0.08    0.10    0.08    0.16  100.00        0.03

Humans' mean kappa: 56.37
Model's mean kappa: 0.06
Diff in mean kappa: 56.31


Humans' mean accuracy: 67.40
Model's mean accuracy: 0.00
Diff in mean accuracy: 67.40
Model's mean accuracy: 0.00


Results from Dirk's evaluation of GPT:

````text
dim1
350
RAW:
               elin      lena     oscar  aggregate       GPT  mean_human
elin       1.000000  0.608571  0.617143   0.748571  0.494286    0.612857
lena       0.608571  1.000000  0.668571   0.817143  0.545714    0.638571
oscar      0.617143  0.668571  1.000000   0.840000  0.528571    0.642857
aggregate  0.748571  0.817143  0.840000   1.000000  0.548571    0.801905
GPT        0.494286  0.545714  0.528571   0.548571  1.000000    0.522857
KAPPA:
               elin      lena     oscar  aggregate       GPT  mean_human
elin       1.000000  0.486754  0.499290   0.672779  0.354546    0.493022
lena       0.486754  1.000000  0.533719   0.750879  0.388609    0.510237
oscar      0.499290  0.533719  1.000000   0.781927  0.364316    0.516505
aggregate  0.672779  0.750879  0.781927   1.000000  0.406302    0.735195
GPT        0.354546  0.388609  0.364316   0.406302  1.000000    0.369157

dim2
225
RAW:
               elin      lena     oscar  aggregate       GPT  mean_human
elin       1.000000  0.613333  0.631111   0.835556  0.591111    0.622222
lena       0.613333  1.000000  0.706667   0.511111  0.760000    0.660000
oscar      0.631111  0.706667  1.000000   0.795556  0.577778    0.668889
aggregate  0.835556  0.511111  0.795556   1.000000  0.462222    0.714074
GPT        0.591111  0.760000  0.577778   0.462222  1.000000    0.642963
KAPPA:
               elin      lena     oscar  aggregate       GPT  mean_human
elin       1.000000  0.014797  0.194175   0.677288  0.110251    0.104486
lena       0.014797  1.000000  0.181818   0.081155  0.145570    0.098308
oscar      0.194175  0.181818  1.000000   0.601156  0.030612    0.187996
aggregate  0.677288  0.081155  0.601156   1.000000  0.036624    0.453200
GPT        0.110251  0.145570  0.030612   0.036624  1.000000    0.095478

dim3
225
RAW:
               elin      lena     oscar  aggregate       GPT  mean_human
elin       1.000000  0.746667  0.857778   1.000000  0.746667    0.802222
lena       0.746667  1.000000  0.808889   0.746667  0.702222    0.777778
oscar      0.857778  0.808889  1.000000   0.857778  0.737778    0.833333
aggregate  1.000000  0.746667  0.857778   1.000000  0.746667    0.868148
GPT        0.746667  0.702222  0.737778   0.746667  1.000000    0.728889
KAPPA:
               elin      lena     oscar  aggregate       GPT  mean_human
elin       1.000000  0.312590  0.629630   1.000000  0.468812    0.471110
lena       0.312590  1.000000  0.354354   0.312590  0.303245    0.333472
oscar      0.629630  0.354354  1.000000   0.629630  0.405242    0.491992
aggregate  1.000000  0.312590  0.629630   1.000000  0.468812    0.647407
GPT        0.468812  0.303245  0.405242   0.468812  1.000000    0.392433


````