In [None]:
!pip install -q gwpy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ligo-segments (setup.py) ... [?25l[?25hdone


In [None]:
%%capture
!pip install numpy#==1.21.6
!pip install spacy scikit-learn krippendorff
!python -m spacy download en_core_web_sm
!python -m spacy download fi_core_news_sm
# Add other languages as needed

In [None]:
%%capture
!pip install nipype
!pip install statsmodels

In [None]:
from collections import defaultdict
from statsmodels.stats.inter_rater import fleiss_kappa

In [None]:
import spacy
import krippendorff
import json

# Tokenization pipelines: the small English and Finnish spaCy models
nlp_en, nlp_fi = spacy.load("en_core_web_sm"), spacy.load("fi_core_news_sm")

def load_annotations(file_path):
    """Load annotation records from a JSON Lines file.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    annotations = []
    # Pin the encoding: without it the platform default codec is used, which
    # can garble or reject non-ASCII text (e.g. Finnish "ä"/"ö").
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (typically a trailing newline at end of file)
            # instead of failing inside json.loads.
            if line.strip():
                annotations.append(json.loads(line))
    return annotations

# The two annotation exports being compared; the names are reused when the
# agreement score is printed.
file1, file2 = 'GPT3_Emotion.jsonl', 'Eero-Emotions.jsonl'
# Read both exports (first file -> "_en", second file -> "_fi")
annotations_en, annotations_fi = load_annotations(file1), load_annotations(file2)

def process_annotations(annotations, nlp_model):
    """Tokenize each annotation's text and pair the tokens with its labels.

    Args:
        annotations: Iterable of mappings with 'text' and 'label' entries.
        nlp_model: Callable that takes the text and returns an iterable of
            token objects exposing a ``text`` attribute.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per annotation, where
        ``tokens`` is the list of token strings and ``labels`` is the
        annotation's 'label' value passed through unchanged (same object).
    """
    return [
        ([tok.text for tok in nlp_model(record['text'])], record['label'])
        for record in annotations
    ]

# Tokenize each annotation set with its language's pipeline
token_label_pairs_en, token_label_pairs_fi = (
    process_annotations(annotations_en, nlp_en),
    process_annotations(annotations_fi, nlp_fi),
)

def _pad_span_labels(labels, target_len):
    """Return a copy of *labels* padded with [0, 0, 'O'] spans up to *target_len*."""
    padded = list(labels)
    # Build a fresh filler span per slot so the pads are independent objects;
    # a non-positive difference yields no padding at all.
    padded.extend([0, 0, 'O'] for _ in range(target_len - len(padded)))
    return padded


def align_annotations(token_label_pairs_en, token_label_pairs_fi):
    """Flatten two parallel annotation sets into equally long label lists.

    Documents are paired positionally with ``zip`` (extra documents on either
    side are ignored). Within each pair, both label lists are padded with
    ``[0, 0, 'O']`` filler spans to a common length before being appended, so
    index ``k`` of the first result always belongs to the same document as
    index ``k`` of the second.

    The common length is the longer of the two label lists; when the token
    counts of the two texts differ it is additionally raised to the larger
    token count, which is the padding target the function has always used in
    that case.

    Args:
        token_label_pairs_en: List of ``(tokens, labels)`` tuples.
        token_label_pairs_fi: List of ``(tokens, labels)`` tuples, parallel
            to ``token_label_pairs_en``.

    Returns:
        ``(aligned_labels_en, aligned_labels_fi)``: two flat lists of labels
        with identical length. The input label lists are not modified.
    """
    aligned_labels_en = []
    aligned_labels_fi = []

    for (tokens_en, labels_en), (tokens_fi, labels_fi) in zip(token_label_pairs_en, token_label_pairs_fi):
        # Both sides must end up equally long, otherwise every later document
        # would be compared against the wrong partner in the flat lists.
        target_len = max(len(labels_en), len(labels_fi))
        if len(tokens_en) != len(tokens_fi):
            target_len = max(target_len, len(tokens_en), len(tokens_fi))

        # Pad copies rather than the caller's lists.
        aligned_labels_en.extend(_pad_span_labels(labels_en, target_len))
        aligned_labels_fi.extend(_pad_span_labels(labels_fi, target_len))

    return aligned_labels_en, aligned_labels_fi

# Align the annotations
aligned_labels_en, aligned_labels_fi = align_annotations(token_label_pairs_en, token_label_pairs_fi)

# Pair the aligned entries: one (en_span, fi_span) tuple per rated item.
# zip stops at the shorter list if the two sides differ in length.
combined_labels = list(zip(aligned_labels_en, aligned_labels_fi))

# Numeric code for every known label name (both capitalizations).
label_mapping = {'Joy':1, 'Sadness':2, 'Anger':3, 'Fear':4, 'Surprise':5,
                 'Disgust':6, 'Trust':7, 'Anticipation':8,
                 'joy':1, 'sadness':2, 'anger':3, 'fear':4, 'surprise':5,
                 'disgust':6, 'trust':7, 'anticipation':8}

#label_mapping = {'Speaker 1': 1, 'S1': 1, 'S2': 2, 'Speaker 2': 2,
#                 'Instructor': 3, 'Instrutor': 3}

#label_mapping = {'Speaker 1': 1,'Interviewee': 1,'interviewee': 1,
#                 'interviewer': 2, 'Interviewer': 2 }  # Add more labels as needed


def _span_label(span):
    """Return the label of a [start, end, label] span; other values as str."""
    return span[2] if isinstance(span, (list, tuple)) else str(span)


# One row per item: [code from first annotator, code from second annotator].
# Only the label name is mapped -- the span's start/end offsets are not
# categories. Names missing from label_mapping (including the 'O' filler)
# become -1.
# NOTE(review): statsmodels' fleiss_kappa expects a subjects x categories
# count table rather than raw codes; convert `data` before passing it on.
data = [
    [label_mapping.get(_span_label(en_span), -1),
     label_mapping.get(_span_label(fi_span), -1)]
    for en_span, fi_span in combined_labels
]

# Prepare data in the format expected by NLTK's Agreement metrics:
# (coder, item, label) triples, one item per aligned span pair. Iterating
# over the components of a span here would feed the character offsets into
# the agreement score as if they were labels.
formatted_data = []
for i, (en_span, fi_span) in enumerate(combined_labels):
    formatted_data.append(('coder_en', f'item_{i}', _span_label(en_span)))
    formatted_data.append(('coder_fi', f'item_{i}', _span_label(fi_span)))

from nltk.metrics import agreement
task = agreement.AnnotationTask(formatted_data)
print(file1, ', ', file2, ' : ' "Kappa:", task.kappa())

# Calculate Fleiss' Kappa
#kappa = fleiss_kappa(data)

# print(f"Fleiss' Kappa: {kappa}")


GPT3_Emotion.jsonl ,  Eero-Emotions.jsonl  : Kappa: 0.8972155459286325
