In [7]:
import krippendorff
import numpy as np
import pandas as pd
import pickle
import random
import json
from collections import Counter

A notebook for analyzing inter-annotator agreement. We assume all annotator answers are stored in a pickle file called `all_samples_to_responses.pkl`.

Format: a `dict()` with the following structure:

Sample -> [list of labels provided by annotators]

In [21]:
# Load the annotator responses (sample -> list of labels) from disk.
# NOTE: pickle.load can execute arbitrary code, so only open files you trust.
with open(r'../evaluating_models/all_samples_to_responses.pkl', 'rb') as responses_file:
    final_sample_to_responses = pickle.load(responses_file)

### 5 classes evaluation

In [11]:
# Numeric code (1..5) for each of the five annotation labels, in this order.
label_to_num_5_classes = dict(
    zip(
        [
            "Unrelated",
            "Consistent",
            "Indirect inconsistency",
            "Factual inconsistency",
            "Surface contradiction",
        ],
        range(1, 6),
    )
)

In [None]:
# Build the rating matrix for the 5-class setting: one row per sample, holding
# exactly ANNOTATIONS_PER_SAMPLE numeric labels (one per retained annotation).
ANNOTATIONS_PER_SAMPLE = 5

# Dedicated, seeded generator: the random down-sampling below picks the same
# annotations on every run, and the global `random` state is left untouched.
subsample_rng = random.Random(0)

matrix = []
for sample in final_sample_to_responses:
    res = [label_to_num_5_classes[answer] for answer in final_sample_to_responses[sample]]

    if len(res) == ANNOTATIONS_PER_SAMPLE:
        matrix.append(res)
    elif len(res) > ANNOTATIONS_PER_SAMPLE:
        print(sample)
        print(res)
        # More than 5 annotations: randomly keep 5 of them
        matrix.append(subsample_rng.sample(res, ANNOTATIONS_PER_SAMPLE))
    else:
        # Fewer than 5 annotations: sampling 5 items would raise ValueError,
        # so report the sample and leave it out of the matrix instead.
        print(f"Skipping sample with only {len(res)} annotation(s): {sample!r}")


In [13]:
# Turn the per-annotation rating matrix into per-sample label counts:
# one row per sample, and column k holds how many ratings equal label k + 1.
matrix = np.array(matrix)

label_ids = range(1, 6)
matrix_fleiss = np.array(
    [[tally[label] for label in label_ids] for tally in map(Counter, matrix)]
)

In [None]:
# Krippendorff's alpha from the per-sample label-count matrix, first treating
# the labels as nominal categories, then as an ordinal scale.
alpha_nominal = krippendorff.alpha(value_counts=matrix_fleiss, level_of_measurement='nominal')
print("Krippendorff's alpha for nominal metric: ", alpha_nominal)

alpha_ordinal = krippendorff.alpha(value_counts=matrix_fleiss, level_of_measurement='ordinal')
print("Krippendorff's alpha for ordinal metric: ", alpha_ordinal)

### 3 classes evaluation

In [17]:
# Coarser 3-class scheme: 'Unrelated' and 'Consistent' keep their own codes,
# while the three inconsistency/contradiction labels share a single code (3).
label_to_num_3_classes = {"Unrelated": 1, "Consistent": 2}
label_to_num_3_classes.update(
    dict.fromkeys(
        ["Indirect inconsistency", "Factual inconsistency", "Surface contradiction"],
        3,
    )
)

In [None]:
# Build the rating matrix for the 3-class setting: one row per sample, holding
# exactly ANNOTATIONS_PER_SAMPLE numeric labels (one per retained annotation).
ANNOTATIONS_PER_SAMPLE = 5

# Dedicated, seeded generator: the random down-sampling below picks the same
# annotations on every run, and the global `random` state is left untouched.
subsample_rng = random.Random(0)

matrix = []
for sample in final_sample_to_responses:
    res = [label_to_num_3_classes[answer] for answer in final_sample_to_responses[sample]]

    if len(res) == ANNOTATIONS_PER_SAMPLE:
        matrix.append(res)
    elif len(res) > ANNOTATIONS_PER_SAMPLE:
        print(sample)
        print(res)
        # More than 5 annotations: randomly keep 5 of them
        matrix.append(subsample_rng.sample(res, ANNOTATIONS_PER_SAMPLE))
    else:
        # Fewer than 5 annotations: sampling 5 items would raise ValueError,
        # so report the sample and leave it out of the matrix instead.
        print(f"Skipping sample with only {len(res)} annotation(s): {sample!r}")


In [19]:
# Turn the per-annotation rating matrix into per-sample label counts:
# one row per sample, and column k holds how many ratings equal label k + 1.
matrix = np.array(matrix)

label_ids = range(1, 4)
matrix_fleiss = np.array(
    [[tally[label] for label in label_ids] for tally in map(Counter, matrix)]
)

In [None]:
# Krippendorff's alpha from the per-sample label-count matrix, first treating
# the labels as nominal categories, then as an ordinal scale.
alpha_nominal = krippendorff.alpha(value_counts=matrix_fleiss, level_of_measurement='nominal')
print("Krippendorff's alpha for nominal metric: ", alpha_nominal)

alpha_ordinal = krippendorff.alpha(value_counts=matrix_fleiss, level_of_measurement='ordinal')
print("Krippendorff's alpha for ordinal metric: ", alpha_ordinal)