# Inter-annotator agreement (IAA)

To access IAA, masakhaner2 authors calculated Fleiss' Kappa on entity level. We follow this approach and compare IAA for model-annotated vs human-annotated data. Instead of having multiple annotators, we ask the model to reannotate the same samples N=10 times. 

We follow the logic that a good model should assign the labels "without any doubts" and therefore it should have high IAA score.

In [95]:
models = ['gpt-4-turbo-0.1-temp', 
          'gemini-1.0_temp_0.1', 'gemini-1.0_batch_size_2',
          'mistral-0.1-temp', 
          'starling-lm-0.1-temp', 
          'llama_70b',
          'claude3-sonet',
          'gpt_4_turbo_batch_size_2', 'gpt-4-turbo', 'claude3-opus', 'claude3-sonet']
'''
['claude3-opus', 'gpt-4', 'gpt-4-turbo', 'gpt_4_turbo_batch_size_2',
 'gemini-1.0', 'gemini-1.0_temp_0.1', 'gemini-1.0_batch_size_2',
 'gemini-1.0_shorter_prompt', 'llama_70b', 'mistral', 'starling-lm',
 'starling-lm-0.1-temp', 'starling-lm-zero-temp']
'''
languages = ['zul', 'bam', 'tsn', 'fon', 'bbj', 'swa']


## Fleiss` Kappa

### Main steps & calculation example

In [96]:
import json
import numpy as np
from IPython.display import display
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa
import os
import yaml
from collections import Counter
from collections import defaultdict
from sklearn.metrics import f1_score
import sys
from contextlib import contextmanager

@contextmanager
def extend_sys_path(path):
    if path not in sys.path:
        # Append the path to sys.path
        sys.path.append(path)
    try:
        # Execute code inside the 'with' statement
        yield
    finally:
        # Remove the path from sys.path
        if path in sys.path:
            sys.path.remove(path)

In [97]:
SAMPLE_SIZE = 50
REPEAT_ANNOTATION = 10

In [98]:
# Specifying path to the necessary files and folders
PATH_TO_SRC = os.path.abspath('../../../')

CONFIG_PATH = os.path.join(PATH_TO_SRC, "settings/config.yml")
RESULTS_PATH = os.path.join(PATH_TO_SRC, 'data/foundation_model_selection')

In [99]:
with extend_sys_path(PATH_TO_SRC):
    from src.utils.utils import calculate_consistency_score

In [100]:
# Reading config file
config = yaml.safe_load(open(os.path.join(PATH_TO_SRC, "settings/config.yml")))

# Load unique label categories
unique_labels = list(config['label_mapping'].values())
unique_labels

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-DATE', 'I-DATE']

Creating matrix in the following format:



```
[
 [3, 0, 0],  # Sentence 1, Token 1
 [0, 2, 1],  # Sentence 1, Token 2
 [1, 1, 1],  # Sentence 1, Token 3
 [0, 0, 3],  # Sentence 1, Token 4
 [2, 1, 0],  # Sentence 2, Token 1
 [1, 2, 0],  # Sentence 2, Token 2
 [3, 0, 0],  # Sentence 2, Token 3
 [0, 1, 2],  # Sentence 2, Token 4
]
```
, where columns correspond to unique label categories, and values correspond to the number of annotators assigned the category.




In [101]:
example_bbj = json.load(open(os.path.join(RESULTS_PATH, 'gemini-1.0/bbj.json')))

Annotations for the first sentence:

In [102]:
sent_ann = []

# For each reannnotation for record_0
for prediction in example_bbj['record_0']['pred']:
    # Extract labels
    pred_label = [t[1] for t in prediction]

    sent_ann.append(pred_label)
    print(pred_label)

['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O']


Creating matrix:

In [103]:
def annotations_to_matrix(sentence_annotations, unique_labels):

    # Shape -> number of tokens in sentence * number of unique categories
    label_counts = np.zeros((len(sentence_annotations[0]), len(unique_labels)))

    # For each annotation attempt
    for annotation in sentence_annotations:
        # For each token in sentence
        for i, label in enumerate(annotation):
            # Label position (column)
            label_index = unique_labels.index(label)
            # Append annotator counts
            label_counts[i][label_index] += 1
    return label_counts


pd.DataFrame(annotations_to_matrix(sent_ann, unique_labels), columns=unique_labels)\
    .style.applymap(lambda x: 'background-color: lightblue' if x > 0 else 'background-color: white')

Unnamed: 0,O,B-PER,I-PER,B-ORG,I-ORG,B-LOC,I-LOC,B-DATE,I-DATE
0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0
7,0.0,7.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
8,7.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Creating matrix for all records:

In [104]:
def get_aggregate_matrix(data, sample_size, unique_labels):
    records = [f'record_{i}' for i in range(sample_size)]

    all_counts = []
    
    # For each record (record_0, record_1, record_2...)
    for record in records:
        # Get annotations for this record
        record_ann = []
        if record in data:
            # Extract predicted labels
            for prediction in data[record]['pred']:
                if len(prediction) > 0:
                    try:
                        pred_label = [t[1] for t in prediction]
                        record_ann.append(pred_label)
                    except Exception as e:
                        print(prediction)
        else:
            print(f'This record is omitted: {record}')
    
        if len(record_ann) > 0:
            try:
                sentence_matrix = annotations_to_matrix(record_ann, unique_labels)
                all_counts.append(sentence_matrix)
            except Exception as e:
                print(e)
                print(record)
                print(record_ann)
                continue
    
    # Concatenate all sentence matrices vertically
    aggregate_matrix = np.vstack(all_counts)
    
    return pd.DataFrame(aggregate_matrix, columns=unique_labels)    

In [105]:
df = get_aggregate_matrix(example_bbj, SAMPLE_SIZE, unique_labels)
df.head()

This record is omitted: record_36


Unnamed: 0,O,B-PER,I-PER,B-ORG,I-ORG,B-LOC,I-LOC,B-DATE,I-DATE
0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Per-row sum should be equal to the number of annotators (reannotation attempts):

In [106]:
df['skipped_annotations'] = REPEAT_ANNOTATION - df[unique_labels].sum(axis=1)

df[df['skipped_annotations'] != 0]

Unnamed: 0,O,B-PER,I-PER,B-ORG,I-ORG,B-LOC,I-LOC,B-DATE,I-DATE,skipped_annotations
15,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
16,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
17,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
18,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
19,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...
752,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
753,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
754,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
755,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [107]:
df_no_skipped_ann = df[df['skipped_annotations'] == 0].copy()
df_no_skipped_ann.shape

(546, 10)

In [108]:
df[df['skipped_annotations'] != 0].shape

(221, 10)

Replacing all skipped annotations with non-entity tokens to have consistent number of annotations per sentence:

In [109]:
df['O'] += df['skipped_annotations']

df['skipped_annotations'] = REPEAT_ANNOTATION - df[unique_labels].sum(axis=1)

df[df['skipped_annotations'] != 0]

Unnamed: 0,O,B-PER,I-PER,B-ORG,I-ORG,B-LOC,I-LOC,B-DATE,I-DATE,skipped_annotations


In [110]:
# Calculate Fleiss' Kappa for the aggregated annotation data
kappa = fleiss_kappa(df[unique_labels], method='fleiss')
print(f"Fleiss' Kappa: {round(kappa, 4)}")

kappa = fleiss_kappa(df_no_skipped_ann[unique_labels], method='fleiss')
print(f"Fleiss' Kappa for records without missing annotations: {round(kappa, 4)}")

Fleiss' Kappa: 0.8271
Fleiss' Kappa for records without missing annotations: 0.9003


### Kappa calculation for foundation models' annotations

In [111]:
class FleissKappaCalculator:
    def __init__(self, sample_size, repeat_annotation, unique_labels):
        self.sample_size = sample_size
        self.repeat_annotation = repeat_annotation
        self.unique_labels = unique_labels
        self.log = ''
        self.counter = {
            'num_skipped_records': 0,
            'skipped_records': [],
            'different_ann_length': 0,
            'contain_empty_predictions': 0, 
        }

    def annotations_to_matrix(self, sentence_annotations):
        # Shape -> number of tokens in sentence * number of unique categories
        label_counts = np.zeros((len(sentence_annotations[0]), len(self.unique_labels)))
    
        # For each annotation attempt
        for annotation in sentence_annotations:
            # For each token in sentence
            for i, label in enumerate(annotation):
                # Label position (column)
                label_index = unique_labels.index(label)
                # Append annotator counts
                label_counts[i][label_index] += 1
        return label_counts      
          
    def get_aggregate_matrix(self, data):
        records = [f'record_{i}' for i in range(self.sample_size)]
        all_counts = []
    
        # For each record (record_0, record_1, record_2...)
        for record in records:
            # Get annotations for this record
            record_ann = []
            skip_flag = False

            if record in data:
                
                # Counting number of skipped annotations per record
                if len(data[record]['pred']) < self.repeat_annotation:
                    self.log += (f'{record} --> less than 10 annotations for record '
                                 f'(num annotations: {len(data[record]["pred"])})\n')
                    
                # Counting length of each prediction (should be equal)
                lengths = [len(pred) for pred in data[record]['pred']]
                length_counts = dict(Counter(lengths))
                if 0 in length_counts:
                    self.log += f'{record} --> has {length_counts[0]} empty predictions ([]).\n'
                   
                # Combining two previous conditions
                if (len(data[record]['pred']) < self.repeat_annotation or
                        0 in length_counts):
                    self.counter['contain_empty_predictions'] += 1
                    skip_flag = True
                    
                # Check if there's more than one length
                if len(length_counts) > 1:
                    self.log += f'{record} --> different number of predicted labels ({dict(length_counts)}), true num labels = {len(data[record]["tokens"])}.\n'
                    self.counter['skipped_records'].append(record)
                    self.counter['different_ann_length'] += 1
                    continue
                
                elif skip_flag is True:
                    self.counter['skipped_records'].append(record)
                    continue
                    
                else:
                    # Extract predicted labels
                    for prediction in data[record]['pred']:
                        if len(prediction) > 0:
                            try:
                                pred_label = [t[1] if len(t) > 1 else "O" for t in prediction]
                                record_ann.append(pred_label)
                            except Exception as e:
                                self.log += f'{record} --> cannot extract tokens.\n'
                                self.counter['skipped_records'].append(record)
                            
                    
            else:
                self.log += f'{record} --> record is not in the data.\n'
                self.counter['skipped_records'].append(record)
    
            if len(record_ann) > 0:
                try:
                    sentence_matrix = self.annotations_to_matrix(record_ann)
                    all_counts.append(sentence_matrix) # Create sent matrix as shown above
                except ValueError as e:  # If additional labels are encountered (e.g., B-MISC)
                    self.log += f'{record} --> has additional tokens ({e}).\n'
                    self.counter['skipped_records'].append(record)
                    continue

        self.counter['num_skipped_records'] = len(self.counter['skipped_records'])
        self.log += f'Number of omitted records: {self.counter["num_skipped_records"]}\n'
        # Concatenate all sentence matrices vertically
        aggregate_matrix = np.vstack(all_counts)
        return pd.DataFrame(aggregate_matrix, columns=unique_labels)

    def get_kappa(self, data):
        """Calculate Fleiss' Kappa for foundation model annotations"""
        # 1. Get aggregated matrix
        df = self.get_aggregate_matrix(data)
        # 2. Calculate Fleiss' Kappa for the aggregated annotation data
        kappa = fleiss_kappa(df, method='fleiss')
        
        return kappa

In [None]:
metrics = {
    'kappa': defaultdict(dict),
    'num_skipped_records': defaultdict(dict),
    'different_ann_length': defaultdict(dict),
    'contain_empty_predictions': defaultdict(dict)
}

logs = {}

# Loop through each model and language to gather metrics
for model in models:
    for language in languages:
        try:
            data_path = os.path.join(RESULTS_PATH, model, language + '.json')
            with open(data_path) as f:
                data = json.load(f)

            # Calculate Fleiss' Kappa, extract logs
            calc = FleissKappaCalculator(SAMPLE_SIZE, REPEAT_ANNOTATION, unique_labels)
            kappa = calc.get_kappa(data)
            logs[f'{model} | {language}'] = calc.log

            # Assign values to each metric dictionary
            metrics['kappa'][model][language] = kappa
            metrics['num_skipped_records'][model][language] = calc.counter['num_skipped_records']
            metrics['different_ann_length'][model][language] = calc.counter['different_ann_length']
            metrics['contain_empty_predictions'][model][language] = calc.counter['contain_empty_predictions']

        except Exception as e:
            print(e)
            # Assign None for each metric in case of an exception
            for metric in metrics.keys():
                metrics[metric][model][language] = None

# Convert metrics dictionaries into pandas DataFrames
kappa_results = pd.DataFrame(metrics['kappa']).T
skipped_records = pd.DataFrame(metrics['num_skipped_records']).T
different_ann_length = pd.DataFrame(metrics['different_ann_length']).T
contain_empty_predictions = pd.DataFrame(metrics['contain_empty_predictions']).T

# Manually update entries for human annotations in the kappa_results DataFrame
kappa_results.loc['human annotation in masakhaner2', 'bbj'] = 1.000
kappa_results.loc['human annotation in masakhaner2', 'zul'] = 0.953 
kappa_results.loc['human annotation in masakhaner2', 'bam'] =  0.980
kappa_results.loc['human annotation in masakhaner2', 'fon'] = 0.941
kappa_results.loc['human annotation in masakhaner2', 'tsn'] =  0.962

print('\nFleiss` Kappa score')
kappa_results.round(3)

In [113]:
print(f'Number of records with different number of predicted tokens')
different_ann_length.round()

Number of records with different number of predicted tokens


Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,0.0,3.0,1.0,0.0,2.0,
gemini-1.0_temp_0.1,,,,4.0,1.0,0.0
gemini-1.0_batch_size_2,,,,,,1.0
mistral-0.1-temp,20.0,25.0,20.0,28.0,6.0,25.0
starling-lm-0.1-temp,10.0,24.0,5.0,15.0,7.0,11.0
llama_70b,12.0,15.0,18.0,38.0,13.0,29.0
claude3-sonet,,,,,,
gpt_4_turbo_batch_size_2,2.0,,,,,0.0
gpt-4-turbo,,,,0.0,2.0,1.0
claude3-opus,,,,,,


In [114]:
print(f'Number of records containing empty predictions or records where number of reannotations is less than 10')
contain_empty_predictions

Number of records containing empty predictions or records where number of reannotations is less than 10


Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,0.0,0.0,0.0,0.0,0.0,
gemini-1.0_temp_0.1,,,,17.0,15.0,0.0
gemini-1.0_batch_size_2,,,,,,2.0
mistral-0.1-temp,0.0,6.0,0.0,6.0,0.0,10.0
starling-lm-0.1-temp,0.0,2.0,0.0,1.0,0.0,0.0
llama_70b,0.0,2.0,2.0,1.0,0.0,4.0
claude3-sonet,,,,,,
gpt_4_turbo_batch_size_2,0.0,,,,,0.0
gpt-4-turbo,,,,0.0,0.0,0.0
claude3-opus,,,,,,


In [115]:
print(f'Total skipped records')
skipped_records

Total skipped records


Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,1.0,3.0,2.0,3.0,3.0,
gemini-1.0_temp_0.1,,,,18.0,15.0,0.0
gemini-1.0_batch_size_2,,,,,,3.0
mistral-0.1-temp,23.0,27.0,24.0,31.0,6.0,33.0
starling-lm-0.1-temp,12.0,27.0,8.0,16.0,7.0,11.0
llama_70b,13.0,16.0,19.0,38.0,13.0,32.0
claude3-sonet,,,,,,
gpt_4_turbo_batch_size_2,3.0,,,,,0.0
gpt-4-turbo,,,,3.0,2.0,1.0
claude3-opus,,,,,,


In [116]:
print(f'% of skipped records')
skipped_records / SAMPLE_SIZE

% of skipped records


Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,0.02,0.06,0.04,0.06,0.06,
gemini-1.0_temp_0.1,,,,0.36,0.3,0.0
gemini-1.0_batch_size_2,,,,,,0.06
mistral-0.1-temp,0.46,0.54,0.48,0.62,0.12,0.66
starling-lm-0.1-temp,0.24,0.54,0.16,0.32,0.14,0.22
llama_70b,0.26,0.32,0.38,0.76,0.26,0.64
claude3-sonet,,,,,,
gpt_4_turbo_batch_size_2,0.06,,,,,0.0
gpt-4-turbo,,,,0.06,0.04,0.02
claude3-opus,,,,,,


In [117]:
for k, v in logs.items():
    if len(v) > 0:
        print(k)
        print(v)

gpt-4-turbo-0.1-temp | zul
record_32 --> has additional tokens ('B-PROD' is not in list).
Number of omitted records: 1

gpt-4-turbo-0.1-temp | bam
record_5 --> different number of predicted labels ({47: 9, 46: 1}), true num labels = 47.
record_7 --> different number of predicted labels ({54: 9, 53: 1}), true num labels = 54.
record_13 --> different number of predicted labels ({48: 7, 47: 3}), true num labels = 48.
Number of omitted records: 3

gpt-4-turbo-0.1-temp | tsn
record_34 --> different number of predicted labels ({24: 9, 23: 1}), true num labels = 24.
record_42 --> has additional tokens ('B-EVENT' is not in list).
Number of omitted records: 2

gpt-4-turbo-0.1-temp | fon
record_27 --> has additional tokens ('B-EVENT' is not in list).
record_48 --> record is not in the data.
record_49 --> record is not in the data.
Number of omitted records: 3

gpt-4-turbo-0.1-temp | bbj
record_20 --> different number of predicted labels ({20: 7, 21: 3}), true num labels = 21.
record_40 --> diffe

# F1-Score

In [118]:
skipped_records_df = pd.DataFrame(index=models, columns=languages)
f1_df = pd.DataFrame(index=models, columns=languages)

records = [f'record_{i}' for i in range(SAMPLE_SIZE)]

for model in models:
    for language in languages:
        print(f'{model} | {language}')
        try:
            # Initialize variables for each language iteration
            pred, true = [], []
            skipped_records = 0

            # Load data
            data_path = os.path.join(RESULTS_PATH, model, language + '.json')
            with open(data_path, 'r') as file:
                data = json.load(file)

            # Process each record
            for record in records:
                if record in data:  # Check if the record exists in the data
                    non_empty_pred = [pred for pred in data[record]['pred'] if len(pred) > 0]

                    if non_empty_pred:
                        first_pred = non_empty_pred[0]  # Selecting first non-empty record
                        pred_labels = [t[1] if len(t) > 1 else "O" for t in first_pred]

                        if len(data[record]['true']) == len(pred_labels):
                            pred.extend(pred_labels)
                            true.extend(data[record]['true'])
                        else:
                            skipped_records += 1
                            print(f'{record} --> different number of labels (pred={len(pred_labels)}, true={len(data[record]["true"])}).')
                    else:
                        skipped_records += 1
                        print(f'{record} --> all predictions are empty.')
                else:
                    skipped_records += 1  # Increment skipped records if not found
                    print(f'{record} --> not in data.')

            # Calculate F1 score if applicable
            f1_score_value = f1_score(true, pred, average='micro') if true and pred else None

            # Assign calculated values to the respective DataFrame cells
            skipped_records_df.at[model, language] = skipped_records
            f1_df.at[model, language] = f1_score_value

        except Exception as e:
            print(f"Error processing {model} in {language}: {e}")
            skipped_records_df.at[model, language] = None
            f1_df.at[model, language] = None
        print()

gpt-4-turbo-0.1-temp | zul

gpt-4-turbo-0.1-temp | bam

gpt-4-turbo-0.1-temp | tsn

gpt-4-turbo-0.1-temp | fon
record_48 --> not in data.
record_49 --> not in data.

gpt-4-turbo-0.1-temp | bbj
record_20 --> different number of labels (pred=20, true=21).

gpt-4-turbo-0.1-temp | swa
Error processing gpt-4-turbo-0.1-temp in swa: [Errno 2] No such file or directory: '/root/ecml_paper/implementation/data/foundation_model_selection/gpt-4-turbo-0.1-temp/swa.json'

gemini-1.0_temp_0.1 | zul
record_0 --> all predictions are empty.
record_1 --> all predictions are empty.
record_2 --> all predictions are empty.
record_3 --> all predictions are empty.
record_4 --> all predictions are empty.
record_5 --> all predictions are empty.
record_6 --> all predictions are empty.
record_7 --> all predictions are empty.
record_8 --> all predictions are empty.
record_9 --> all predictions are empty.
record_10 --> all predictions are empty.
record_11 --> all predictions are empty.
record_12 --> all predictions 

In [119]:
f1_df

Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,0.938021,0.912509,0.900592,0.904137,0.878468,
gemini-1.0_temp_0.1,,,,0.885057,0.86964,0.941767
gemini-1.0_batch_size_2,,,,,,0.937751
mistral-0.1-temp,0.775974,0.789392,0.826207,0.807453,0.789041,0.787701
starling-lm-0.1-temp,0.867698,0.850174,0.901361,0.890769,0.855474,0.919606
llama_70b,0.703196,0.70229,0.766355,0.700306,0.668285,0.805556
claude3-sonet,0.91922,0.931225,0.918022,0.908509,0.891484,0.962412
gpt_4_turbo_batch_size_2,0.942789,,,,,0.956493
gpt-4-turbo,,,,0.902023,0.892031,0.96921
claude3-opus,0.972145,0.93899,0.923291,0.954488,0.896853,


In [120]:
skipped_records_df

Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,0.0,0.0,0.0,2.0,1.0,
gemini-1.0_temp_0.1,50.0,50.0,50.0,12.0,13.0,0.0
gemini-1.0_batch_size_2,,,,,,0.0
mistral-0.1-temp,13.0,24.0,24.0,17.0,2.0,24.0
starling-lm-0.1-temp,14.0,16.0,14.0,10.0,5.0,9.0
llama_70b,36.0,35.0,38.0,39.0,7.0,45.0
claude3-sonet,6.0,6.0,4.0,15.0,2.0,7.0
gpt_4_turbo_batch_size_2,0.0,,,,,0.0
gpt-4-turbo,,,,0.0,0.0,0.0
claude3-opus,35.0,13.0,6.0,0.0,14.0,


# Consistency 

In [121]:
@contextmanager
def extend_sys_path(path):
    if path not in sys.path:
        # Append the path to sys.path
        sys.path.append(path)
    try:
        # Execute code inside the 'with' statement
        yield
    finally:
        # Remove the path from sys.path
        if path in sys.path:
            sys.path.remove(path)
            


In [None]:
records = [f'record_{i}' for i in range(SAMPLE_SIZE)]
consistency_df = pd.DataFrame(index=models, columns=languages)

for model in models:
    for language in languages:
        consistency = []
        try:
            # Load data
            data_path = os.path.join(RESULTS_PATH, model, language + '.json')
            with open(data_path, 'r') as file:
                data = json.load(file)

            # Process each record
            for record in records:
                if record in data:  # Check if the record exists in the data
                    while len(data[record]['pred']) < REPEAT_ANNOTATION:
                        data[record]['pred'].append([])  # Append an empty array

                    consistency.append(calculate_consistency_score(data[record]['pred'], data[record]['true']))
                else:
                   consistency.append(0)

            consistency_df.at[model, language] = np.mean(consistency)
        except Exception as e:
            print(e)
            consistency_df.at[model, language] = None

In [123]:
consistency_df

Unnamed: 0,zul,bam,tsn,fon,bbj,swa
gpt-4-turbo-0.1-temp,93.5266,90.0244,89.6792,86.8992,86.6972,
gemini-1.0_temp_0.1,0.0,0.0,0.0,61.2396,62.6432,93.8558
gemini-1.0_batch_size_2,,,,,,91.7774
mistral-0.1-temp,54.3496,40.548,42.7286,44.82,73.8064,40.469
starling-lm-0.1-temp,66.6154,54.1092,63.6082,70.9122,76.5844,72.5888
llama_70b,17.8698,18.0696,14.7406,15.7684,55.4654,9.16
claude3-sonet,8.1166,8.2024,8.4478,6.3786,8.5108,8.2744
gpt_4_turbo_batch_size_2,93.097,,,,,95.7232
gpt-4-turbo,,,,90.5068,86.9434,96.3278
claude3-opus,2.9356,6.9106,8.1726,9.5102,6.4222,


Keep code below to easy paste the results from different dataframes to tables in latex

In [None]:
for model in consistency_df.T.columns:
    
    # Round each value in the column to 1 decimal place, convert to string, and then join with ' & '
    rounded_values = consistency_df.T[model].apply(lambda x: 'None' if pd.isnull(x) else str(round(x, 1))).values

    print("{:<30}".format(model), ' & '.join(filter(None, rounded_values)))


In [None]:
' & '.join(contain_empty_predictions.T.index)

### Claude VS GPT

In [138]:
for model in models:
    for language in languages:
        consistency = []
  
        data_path = os.path.join(RESULTS_PATH, model, language + '.json')
        try:
            with (open(data_path, 'r') as file):
                data = json.load(file)
                # For gpt use first record out of 10 reannotations
                if 'gpt' in model:
                    consistency_score = []
                    for record in records:
                        if record in data:
                            consistency_score.append(calculate_consistency_score(
                                [data[record]['pred'][0]], data[record]['true']))
                        else:
                            consistency_score.append(0)
                    print(model, language, round(np.mean(consistency_score), 2))
                elif 'claude' in model:
                    print(model, language, round(data['overall_consistency'], 2))
        except FileNotFoundError as e:
            continue
        except Exception as e:
            print(e)

gpt-4-turbo-0.1-temp zul 93.48
gpt-4-turbo-0.1-temp bam 91.35
gpt-4-turbo-0.1-temp tsn 89.7
gpt-4-turbo-0.1-temp fon 86.89
gpt-4-turbo-0.1-temp bbj 85.91
claude3-sonet zul 92.23
claude3-sonet bam 93.21
claude3-sonet tsn 86.2
claude3-sonet fon 66.44
claude3-sonet bbj 88.65
claude3-sonet swa 96.21
gpt_4_turbo_batch_size_2 zul 93.93
gpt_4_turbo_batch_size_2 swa 95.63
gpt-4-turbo fon 90.49
gpt-4-turbo bbj 88.58
gpt-4-turbo swa 96.81
claude3-opus zul 97.86
claude3-opus bam 94.64
claude3-opus tsn 92.87
claude3-opus fon 95.1
claude3-opus bbj 89.2
claude3-sonet zul 92.23
claude3-sonet bam 93.21
claude3-sonet tsn 86.2
claude3-sonet fon 66.44
claude3-sonet bbj 88.65
claude3-sonet swa 96.21
