# Cohen's kappa score

Computes the kappa score for two sets of annotated data, each stored in its own .txt file. The program assumes that the annotations are categorical.

If the kappa score is greater than 0.8, it is considered excellent agreement; 0.6 to 0.8 is substantial agreement, and 0.4 to 0.6 is considered moderate agreement.

In [6]:
from collections import Counter

from sklearn.metrics import cohen_kappa_score

In [7]:
def read_annotations_from_file(file_path):
    """
    Load the label column of an annotation file.

    Every line is split on whitespace and its final token is taken as the
    label. Lines with fewer than two tokens (blank lines or a lone token)
    carry no label and are skipped.

    :param file_path: The path to the text file containing annotations.
    :return: List of labels, in the order they appear in the file.
    """
    annotations = []
    with open(file_path, 'r') as file:
        for raw_line in file:
            # split() without arguments already discards surrounding whitespace.
            tokens = raw_line.split()
            if len(tokens) > 1:
                annotations.append(tokens[-1])

    return annotations

In [8]:
def compute_observed_agreement(annotations1, annotations2):
    """
    Fraction of positions at which both annotators chose the same label.

    :param annotations1: List of annotations from the first annotator (list of categorical labels).
    :param annotations2: List of annotations from the second annotator (list of categorical labels).
    :return: Observed agreement (number of matching positions divided by the number of positions).
    :raises AssertionError: If the two lists differ in length.
    :raises ZeroDivisionError: If both lists are empty.
    """
    assert len(annotations1) == len(annotations2), "Annotations lists must have the same length."

    matches = 0
    for first_label, second_label in zip(annotations1, annotations2):
        if first_label == second_label:
            matches += 1

    return matches / len(annotations1)

In [9]:
def compute_agreement_by_chance(annotations1, annotations2):
    """
    Compute agreement by chance (expected agreement) for two sets of annotations.

    For every label, the share of positions the first annotator gave that label
    is multiplied by the share the second annotator gave it; the products are
    summed over all labels.

    :param annotations1: Annotations from the first annotator (any sized sequence of hashable labels).
    :param annotations2: Annotations from the second annotator (any sized sequence of hashable labels).
    :return: Agreement by chance. Two empty sequences yield 0.
    :raises AssertionError: If the two sequences differ in length.
    """
    assert len(annotations1) == len(annotations2), "Annotations lists must have the same length."

    # One counting pass per annotator instead of one list.count() scan per
    # label. Counter also removes the need to concatenate the inputs, so they
    # no longer have to be of the same sequence type.
    label_counts_annotator1 = Counter(annotations1)
    label_counts_annotator2 = Counter(annotations2)

    total1 = len(annotations1)
    total2 = len(annotations2)

    # Iterating the first Counter visits labels in first-appearance order, so
    # the floating-point sum does not depend on set/hash ordering. A label that
    # only the second annotator used would contribute 0 and is safely omitted;
    # a label only the first annotator used looks up as 0 in the second Counter.
    agreement_by_chance = sum(
        (count1 / total1) * (label_counts_annotator2[label] / total2)
        for label, count1 in label_counts_annotator1.items()
    )
    return agreement_by_chance

In [13]:
if __name__ == "__main__":
    # Locations of the two annotators' label files.
    file_path_annotator1 = "/Users/ellyzamaripapas/Code/NER_Project/data/annotated_data/final-1.txt"
    file_path_annotator2 = "/Users/ellyzamaripapas/Code/NER_Project/data/annotated_data/final-2.txt"

    # Load both label sequences, first annotator's file first.
    annotations_annotator1, annotations_annotator2 = (
        read_annotations_from_file(path)
        for path in (file_path_annotator1, file_path_annotator2)
    )

    # Observed and chance agreement come from the helpers in this notebook;
    # the kappa score itself is computed by scikit-learn's cohen_kappa_score.
    observed_agreement = compute_observed_agreement(annotations_annotator1, annotations_annotator2)
    agreement_by_chance = compute_agreement_by_chance(annotations_annotator1, annotations_annotator2)
    kappa_score = cohen_kappa_score(annotations_annotator1, annotations_annotator2)

    # Report the three figures, one per line, rounded to four decimals.
    print(
        f"Observed Agreement: {observed_agreement:.4f}",
        f"Agreement by Chance: {agreement_by_chance:.4f}",
        f"Cohen's Kappa Score: {kappa_score:.4f}",
        sep="\n",
    )

Observed Agreement: 0.9837
Agreement by Chance: 0.7617
Cohen's Kappa Score: 0.9315
