In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import re
import random
from statistics import mode
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [3]:
def cut_labels(golden_cut: list):
    """Parameters:
        golden_cut (list): label list from golden annotation data
    Returns:
        short_labels (list): cut (short) labels from the original label list"""
    short_labels = []
    for i in range(len(golden_cut)):
        if 'Open' in golden_cut[i]:
            if 'Initiate' in golden_cut[i]:
                short_labels.append(re.sub('Initiate.','', golden_cut[i]))

            if len(golden_cut[i].split('.')) == 3:
                short_labels.append('.'.join(golden_cut[i].split('.')[:-1]))
            else:
                short_labels.append(golden_cut[i])
        elif "Prolong" in golden_cut[i] or "Develop" in golden_cut[i]:
            short_labels.append('.'.join(golden_cut[i].split('.')[:-1]))
        elif "Track" in golden_cut[i]:
            short_labels.append('.'.join(golden_cut[i].split('.')[:-1]))
        elif "Reply" in golden_cut[i]:
            if "Accept" not in golden_cut[i]:
                short_labels.append('.'.join(golden_cut[i].split('.')[:-1]))
            else:
                short_labels.append(golden_cut[i])
        elif "Challenge" in golden_cut[i]:
            short_labels.append('.'.join(golden_cut[i].split('.')[:-1]))
        elif "Confront.Response" in golden_cut[i]:
            short_labels.append('.'.join(golden_cut[i].split('.')[:-1]))
        else:
            short_labels.append(golden_cut[i])

    return short_labels

# Prepare clean crowdsource data

In [4]:
crowdsource = pd.read_csv('assignments.tsv', sep='\t')
# drop na values
crowdsource = crowdsource.dropna(subset=['OUTPUT:labels'])
# making dataset more useable
crowdsource = crowdsource[crowdsource.columns[[1, 4, 6]]]
crowdsource = crowdsource.rename(columns={'INPUT:dialog_id': 'dialog_id', 
                                          'OUTPUT:labels': 'label', 
                                          'INPUT:utterances': 'utterance'})

In [5]:
# split label string
crowdsource.label = crowdsource.label.apply(lambda x: x.replace('&nbsp;', '').split(','))
# split html utterances
crowdsource.utterance = crowdsource.utterance.apply(
    lambda x: re.split(r',?<span class="speaker_\d">speaker_\d</span>: ', x.replace('�', "'"))[1:]
)
crowdsource['short_label'] = crowdsource.label.apply(cut_labels)

# split utterances and labels into rows
crowdsource = crowdsource.explode(['utterance', 'label', 'short_label'])

In [6]:
crowdsource.head()

Unnamed: 0,dialog_id,utterance,label,short_label
0,1.0,Can you do push-ups?,Open.Demand.Fact,Open.Demand
0,1.0,Of course I can.,React.Respond.Support.Reply.Agree,React.Respond.Support.Reply
0,1.0,It's a piece of cake!,Sustain.Continue.Prolong.Extend,Sustain.Continue.Prolong
0,1.0,Believe it or not.,Sustain.Continue.Prolong.Enhance,Sustain.Continue.Prolong
0,1.0,I can do 30 push-ups a minute.,Sustain.Continue.Prolong.Enhance,Sustain.Continue.Prolong


In [7]:
crowdsource.to_csv('assignments_2col.csv', index=False)

# Load data

In [8]:
crowdsource = pd.read_csv('assignments_2col.csv')
gold_dialogs = pd.read_csv("gold_standard.csv")

# Change text labels to numeric

1. Prepare `gold_standard.csv`:
    - str labels -> list of labels
    - replace Initiate classes with the common ones
2. Get long and short labels from gold standard and create `label2id` dictionary

*There are 32 classes for lomg labels and 16 classes for short labels*

3. Replace class labels with numeric values

In [9]:
parse = lambda x: re.findall(r'\'(\S+)\'', x)
gold_dialogs.gold_answers = gold_dialogs.gold_answers.apply(parse)
gold_dialogs.possible_answers = gold_dialogs.possible_answers.apply(parse)

In [10]:
change = {
    'Open.Initiate.Demand.Opinion': 'Open.Demand.Opinion',
    'Open.Initiate.Give.Fact': 'Open.Give.Fact', 
    'Open.Initiate.Demand.Fact': 'Open.Demand.Fact'
}
make_change = lambda x: [change[el] if el in change else el for el in x]
gold_dialogs.gold_answers = gold_dialogs.gold_answers.apply(make_change)
gold_dialogs.possible_answers = gold_dialogs.possible_answers.apply(make_change)

In [11]:
# get labels from gold standard
labels = list(set([el for ell in gold_dialogs.gold_answers for el in ell]))
labels = sorted(labels)
short_labels = sorted(list(set(cut_labels(labels))))

# get label dictionaries for long and short labels
longlabel2id = {l: i for i, l in enumerate(labels)}
shortlabel2id = {l: i for i, l in enumerate(short_labels)}

In [12]:
# get numeric labels for good annotation
gold_dialogs['long_labels_id_gold'] = gold_dialogs.gold_answers.apply(
    lambda x: [longlabel2id[el.replace('\\n', '')] for el in x])
gold_dialogs['long_labels_id'] = gold_dialogs.possible_answers.apply(
    lambda x: [longlabel2id[el.replace('\\n', '')] for el in x])
gold_dialogs['short_labels_id_gold'] = gold_dialogs.gold_answers.apply(
    lambda x: [shortlabel2id[el] for el in set(cut_labels(x))])
gold_dialogs['short_labels_id'] = gold_dialogs.possible_answers.apply(
    lambda x: [shortlabel2id[el] for el in set(cut_labels(x))])

In [13]:
# get numeric labels for crowdsource annotation
crowdsource['long_labels_id'] = crowdsource.label.apply(longlabel2id.get)
crowdsource['short_labels_id'] = crowdsource.short_label.apply(shortlabel2id.get)

## Concatenate different annotators' labels

In [14]:
storage = {
    'dialog_id': [],
    'utterance': [],
    'long_label_1': [],
    'long_label_2': [],
    'long_label_3': [],
    'short_label_1': [],
    'short_label_2': [],
    'short_label_3': [],
}

In [15]:
for dia_id in sorted(crowdsource.dialog_id.unique()):
    dia_len = crowdsource[crowdsource.dialog_id == dia_id].shape[0] // 3
    storage['dialog_id'].extend(crowdsource[crowdsource.dialog_id == dia_id][:dia_len].dialog_id.tolist())
    storage['utterance'].extend(crowdsource[crowdsource.dialog_id == dia_id][:dia_len].utterance.tolist())
    
    storage['long_label_1'].extend(crowdsource[crowdsource.dialog_id == dia_id][:dia_len].long_labels_id.tolist())
    storage['short_label_1'].extend(crowdsource[crowdsource.dialog_id == dia_id][:dia_len].short_labels_id.tolist())
    
    storage['long_label_2'].extend(crowdsource[crowdsource.dialog_id == dia_id][dia_len:dia_len*2].long_labels_id.tolist())
    storage['short_label_2'].extend(crowdsource[crowdsource.dialog_id == dia_id][dia_len:dia_len*2].short_labels_id.tolist())
    
    storage['long_label_3'].extend(crowdsource[crowdsource.dialog_id == dia_id][dia_len*2:].long_labels_id.tolist())
    storage['short_label_3'].extend(crowdsource[crowdsource.dialog_id == dia_id][dia_len*2:].short_labels_id.tolist())

In [16]:
new_crowdsource = pd.DataFrame(storage)
new_crowdsource

Unnamed: 0,dialog_id,utterance,long_label_1,long_label_2,long_label_3,short_label_1,short_label_2,short_label_3
0,1.0,Can you do push-ups?,2,2,2,2,2,2
1,1.0,Of course I can.,27,26,26,12,12,12
2,1.0,It's a piece of cake!,31,30,30,15,15,15
3,1.0,Believe it or not.,30,31,30,15,15,15
4,1.0,I can do 30 push-ups a minute.,30,29,30,15,15,15
...,...,...,...,...,...,...,...,...
605,9554.0,Where should we go?,13,13,13,7,7,7
606,9554.0,Why don't we go down to the pool?,13,13,13,7,7,7
607,9554.0,It's not too far from here.,30,30,30,15,15,15
608,9554.0,Fine.,27,27,27,12,12,12


# Count metrics for crowdsource workers

## No voting

In [17]:
def get_gold_and_toloka_labels(gold_csv: pd.DataFrame, 
                               label_array: list,
                               if_long_label: bool=True,
                               full: bool=False):
    """Prepare arrays for crowdsource and golden annotation.
    Parameters:
        gold_csv (pd.DataFrame): DataFrame with golden annotation,
        label_array (list): list of unified annotations,
        if_long_label (bool): if long or short label observed,
        full (bool): if all dialogs are observed
    Returns:
        target (list): target annotations,
        annotated (list): actual annotations"""
    target, annotated = [], []
    column_to_iter = 'long_labels_id' if if_long_label else 'short_labels_id'
    gold_csv = gold_csv.sort_values(by='dialogue_id', kind='stable')
    
    if not full:
        gold_csv = gold_csv.query('`dialogue_id` not in [0, 5, 8]')
    
    for i, (_, (possible, gold)) in enumerate(
        gold_csv[[column_to_iter, f'{column_to_iter}_gold']].iterrows()):
        annotated_label = label_array[i]
        annotated.append(annotated_label)
        if annotated_label in possible or annotated_label in gold:
            target.append(annotated_label)
        else:
            target.append(gold[0])
            
    return target, annotated

In [18]:
def metrics(target: list, predicted: list):
    """Get accuracy, F1-scores (macro & micro), recall and precision metrics."""
    accuracy = round(accuracy_score(target, predicted), 2)
    f1_macro = round(f1_score(target, predicted, average='macro'), 2)
    f1_micro = round(f1_score(target, predicted, average='micro'), 2)
    recall = round(recall_score(target, predicted, average='weighted'), 2)
    precision = round(precision_score(target, predicted, average='weighted'), 2)
    return accuracy, f1_macro, f1_micro, recall, precision


def shuffle_dataframe(crowdsource_csv: pd.DataFrame,
                      if_long_label: bool):
    """Specific shuffle for crowdsource dataframes.
    Parameters:
        crowdsource_csv (pd.DataFrame): dataframe with crowdsource annotation,
        if_long_label (bool): if long or short label observed
    Returns:
        crowdsource_csv (pd.DataFrame): shuffled dataframe with crowdsource annotation"""
    if if_long_label:
        to_shuffle = crowdsource_csv[crowdsource_csv.columns[2:5]].to_numpy()
    else:
        to_shuffle = crowdsource_csv[crowdsource_csv.columns[5:]].to_numpy()
    for l in to_shuffle:
        np.random.shuffle(l)
    if if_long_label:
        crowdsource_csv[crowdsource_csv.columns[2:5]] = to_shuffle
    else:
        crowdsource_csv[crowdsource_csv.columns[5:]] = to_shuffle
    return crowdsource_csv


def annotator_assessment(gold_csv: pd.DataFrame, 
                          crowdsource_csv: pd.DataFrame):
    """Get dataframe with annotators' results. The assessment is caried out 10 times and the data is shuffled each time except for the first one.
    Parameters:
        gold_csv (pd.DataFrame): dataframe with golden annotation,
        crowdsource_csv (pd.DataFrame): dataframe with annotators'data which we assess
    Returns:
        results (list): collected metrics and parameters of metric count
    """
    results = []
    params = [
            {'annotator_label': ann_id, 
             'if_long_label': if_ll} for if_ll in [True, False] for ann_id in [1, 2, 3]
    ]
    for m in range(10):
        for param in params:
            label_type = 'long' if param['if_long_label'] else 'short'
            annotator_label = param['annotator_label']
            if m:
                crowdsource_csv = shuffle_dataframe(crowdsource_csv, param['if_long_label'])
                target, annotated = get_gold_and_toloka_labels(gold_csv, 
                                                               crowdsource_csv[f'{label_type}_label_{annotator_label}'], 
                                                               param['if_long_label'])
                accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
                results.append(
                    ('shuffle', param['annotator_label'], label_type, 
                     accuracy, f1_macro, f1_micro, recall, precision)
                )
            else:
                target, annotated = get_gold_and_toloka_labels(gold_csv, 
                                                               crowdsource_csv[f'{label_type}_label_{annotator_label}'], 
                                                               param['if_long_label'])
                accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
                results.append(
                    ('no_shuffle', param['annotator_label'], label_type, 
                     accuracy, f1_macro, f1_micro, recall, precision)
                )
    return results

In [19]:
results = annotator_assessment(gold_dialogs,
                                new_crowdsource)
results = pd.DataFrame(results, columns=[
    'shuffle', 'annotator', 'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('metrics_without_voting.csv', index=False)

In [20]:
pd.read_csv('metrics_without_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.63,0.02,0.47,0.03,0.63,0.02,0.63,0.02,0.73,0.01
2,short,0.82,0.01,0.58,0.03,0.82,0.01,0.82,0.01,0.84,0.01


## With voting

In [21]:
def vote(ls: list):
    """Voting fuction that returns mode of annotators' labels or random label if there is no mode"""
    for l in ls:
        el1, el2, el3 = l
        if el1 != el2 != el3:
            yield random.choice(l)
        else:
            yield mode(l)


def annotator_assessment_voting(gold_csv: pd.DataFrame,
                                 crowdsource_csv: pd.DataFrame):
    """Get dataframe with annotators' results. The assessment is caried out 10 times. Voting technique is used for label aggregation.
    Parameters:
        gold_csv (pd.DataFrame): dataframe with golden annotation,
        crowdsource_csv (pd.DataFrame): dataframe with annotators'data which we assess
    Returns:
        results (list): collected metrics and parameters of metric count
    """
    results = []
    
    for i in range(10):
        labels_long = [el for el in vote(crowdsource_csv[crowdsource_csv.columns[2:5]].values.tolist())]
        labels_short = [el for el in vote(crowdsource_csv[crowdsource_csv.columns[5:]].values.tolist())]
        target, annotated = get_gold_and_toloka_labels(
            gold_csv, labels_long, True
        )
        accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
        results.append(('long', accuracy, f1_macro, f1_micro, recall, precision))
        target, annotated = get_gold_and_toloka_labels(
            gold_csv, labels_short, False
        )
        accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
        results.append(('short', accuracy, f1_macro, f1_micro, recall, precision))
    
    return results

In [22]:
results = annotator_assessment_voting(gold_dialogs,
                                new_crowdsource)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('metrics_with_voting.csv', index=False)

In [23]:
pd.read_csv('metrics_with_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.65,0.01,0.49,0.01,0.65,0.01,0.65,0.01,0.75,0.01
2,short,0.86,0.01,0.61,0.02,0.86,0.01,0.86,0.01,0.86,0.01


## Fleiss' kappa

|measurement case        | fleiss cappa score |
|------------------------|--------------------|
|long label crowdsource  | 0.380              |
|short label crowdsource | 0.673              |

In [24]:
f_table = aggregate_raters(new_crowdsource[new_crowdsource.columns[2:5]])
round(fleiss_kappa(f_table[0], method='fleiss'), 3)

0.38

In [25]:
f_table = aggregate_raters(new_crowdsource[new_crowdsource.columns[5:]])
round(fleiss_kappa(f_table[0], method='fleiss'), 3)

0.673

# Load LLM annotations

### claude-3-haiku

In [26]:
claude = pd.read_csv('claude_t0_context1_masking.tsv', sep='\t')

# add missing numeric label 
longlabel2id['Sustain.Continue.Command'] = 32
shortlabel2id['Sustain.Continue.Command'] = 16

# add numeric label columns and short label column
claude['long_labels_id'] = claude.annotation.apply(longlabel2id.get)
claude['short_labels'] = claude.annotation.apply(lambda x: cut_labels([x])[0])
claude['short_labels_id'] = claude.short_labels.apply(shortlabel2id.get)

claude = claude.sort_values(by='dialog_id', kind='stable').query('`dialog_id` not in [0, 5, 8]')

### GPT-3.5 Turbo

In [27]:
chatgpt = pd.DataFrame() 

for filename in ['sf_annotation__chatgpt_t9_masking1.tsv', 'sf_annotation__chatgpt_t9_masking2.tsv',
                'sf_annotation__chatgpt_t9_masking3.tsv']:
    df = pd.read_csv(filename, sep='\t')
    chatgpt = pd.concat([chatgpt, df])

chatgpt = chatgpt.drop(columns=chatgpt.columns[0])

# add numeric label columns and short label column
chatgpt['long_labels_id'] = chatgpt.annotation.apply(longlabel2id.get)
chatgpt['short_labels'] = chatgpt.annotation.apply(lambda x: cut_labels([x])[0])
chatgpt['short_labels_id'] = chatgpt.short_labels.apply(shortlabel2id.get)

chatgpt = chatgpt.sort_values(by='dialog_id', kind='stable').query('`dialog_id` not in [0, 5, 8]')

### Mixtral 8x22B

In [28]:
mistral = pd.DataFrame() 

for filename in ['sf_annotation__mistral_t5_no_masking.tsv', 'full__mistral_t5_no_masking.tsv']:
    df = pd.read_csv(filename, sep='\t')
    mistral = pd.concat([mistral, df])

mistral = mistral.drop(columns=mistral.columns[0])
mistral = mistral.replace({'React.Rejoinder.Confront.Response.Re-challenge-React.Rejoinder.Confront.Response.Re-challenge':
                          'React.Rejoinder.Confront.Response.Re-challenge'})

# add missing numeric label 
longlabel2id['React.Respond.Command'] = 33
shortlabel2id['React.Respond.Command'] = 17

# add numeric label columns and short label column
mistral['long_labels_id'] = mistral.annotation.apply(longlabel2id.get)
mistral['short_labels'] = mistral.annotation.apply(lambda x: cut_labels([x])[0])
mistral['short_labels_id'] = mistral.short_labels.apply(shortlabel2id.get)

mistral = mistral.sort_values(by='dialog_id', kind='stable')

In [29]:
gold_dialogs = gold_dialogs.sort_values(by='dialogue_id', kind='stable')

# Hybrid annotation

In [30]:
def get_hybrid_labels(human: list, ai: list):
    """Get hybrid label arrays.
    Parameters:
        human (list): list of lists with human annotations,
        ai (list): list with LLM annotations
    Yields:
        list of 2 human annotations and 1 LLM annotation"""
    for i, l in enumerate(human):
        random.shuffle(l)
        yield l[:2] + [ai[i]]


def annotator_assessment_hybrid(gold_csv: pd.DataFrame, 
                                 crowdsource_csv: pd.DataFrame,
                                 ai_csv: pd.DataFrame):
    """Get dataframe with hybrid annotation results. 
    The assessment is caried out 30 times and the hybrid annotation data is obtained each 3 time.
    No voting is used.
    Parameters:
        gold_csv (pd.DataFrame): dataframe with golden annotation,
        crowdsource_csv (pd.DataFrame): dataframe with annotators'data which we assess,
        ai_csv (pd.DataFrame): dataframe with LLM annotation
    Returns:
        results (list): collected metrics and parameters of metric count"""
    results = []
    
    ai_csv = ai_csv.sort_values(by='dialog_id', kind='stable').query('`dialog_id` not in [0, 5, 8]')

    for m in range(10):
        
        hybrid_labels_long = list(get_hybrid_labels(
            crowdsource_csv[crowdsource_csv.columns[2:5]].values.tolist(),
            ai_csv['long_labels_id'].values.tolist()
        ))
        hybrid_labels_short = list(get_hybrid_labels(
            crowdsource_csv[crowdsource_csv.columns[5:]].values.tolist(),
            ai_csv['short_labels_id'].values.tolist()
        ))
        for i in range(3):
            target, annotated = get_gold_and_toloka_labels(gold_csv, 
                                                           hybrid_labels_long)
            annotated1, annotated2, annotated3 = [], [], []
            
            for el1, el2, el3 in annotated:
                annotated1.append(el1)
                annotated2.append(el2)
                annotated3.append(el3)
                
            for annotated in [annotated1, annotated2, annotated3]:
                accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
                results.append(('long', accuracy, f1_macro, f1_micro, recall, precision))
                
            target, annotated = get_gold_and_toloka_labels(gold_csv, 
                                                           hybrid_labels_short, 
                                                           False)
            annotated1, annotated2, annotated3 = [], [], []
            
            for el1, el2, el3 in annotated:
                annotated1.append(el1)
                annotated2.append(el2)
                annotated3.append(el3)
                
            for annotated in [annotated1, annotated2, annotated3]:    
                accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
                results.append(('short', accuracy, f1_macro, f1_micro, recall, precision))
        
    return results

In [31]:
def annotator_assessment_voting_hybrid(gold_csv: pd.DataFrame, 
                                 crowdsource_csv: pd.DataFrame,
                                 ai_csv: pd.DataFrame):
    """Get dataframe with hybrid annotation results. 
    The assessment is caried out 10 times. Voting is used.
    Parameters:
        gold_csv (pd.DataFrame): dataframe with golden annotation,
        crowdsource_csv (pd.DataFrame): dataframe with annotators'data which we assess,
        ai_csv (pd.DataFrame): dataframe with LLM annotation
    Returns:
        results (list): collected metrics and parameters of metric count"""
    results = []
    
    ai_csv = ai_csv.sort_values(by='dialog_id', kind='stable').query('`dialog_id` not in [0, 5, 8]')

    for m in range(10):
        
        hybrid_labels_long = list(get_hybrid_labels(
            crowdsource_csv[crowdsource_csv.columns[2:5]].values.tolist(),
            ai_csv['long_labels_id'].values.tolist()
        ))
        hybrid_labels_short = list(get_hybrid_labels(
            crowdsource_csv[crowdsource_csv.columns[5:]].values.tolist(),
            ai_csv['short_labels_id'].values.tolist()
        ))
    
        labels_long = [el for el in vote(hybrid_labels_long)]
        labels_short = [el for el in vote(hybrid_labels_short)]
        target, annotated = get_gold_and_toloka_labels(
            gold_csv, labels_long, True
        )
        accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
        results.append(('long', accuracy, f1_macro, f1_micro, recall, precision))
        target, annotated = get_gold_and_toloka_labels(
            gold_csv, labels_short, False
        )
        accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
        results.append(('short', accuracy, f1_macro, f1_micro, recall, precision))
    
    return results

In [32]:
def fleiss_kappa_results(crowdsource_csv: pd.DataFrame,
                         ai_csv: pd.DataFrame):
    """Get Fleiss' kappa results for hybrid annotation. The assessment is caried out 100 times.
    Parameters:
        crowdsource_csv (pd.DataFrame): dataframe with annotators'data which we assess,
        ai_csv (pd.DataFrame): dataframe with LLM annotation
    Returns:
        results (list): collected Fleiss' kappa"""
    results = []
    
    for _ in range(100):
        hybrid_labels_long = list(get_hybrid_labels(
            crowdsource_csv[crowdsource_csv.columns[2:5]].values.tolist(),
            ai_csv['long_labels_id'].values.tolist()
        ))
        hybrid_labels_short = list(get_hybrid_labels(
            crowdsource_csv[crowdsource_csv.columns[5:]].values.tolist(),
            ai_csv['short_labels_id'].values.tolist()
        ))
        f_table = aggregate_raters(hybrid_labels_long)
        fk = round(fleiss_kappa(f_table[0], method='fleiss'), 3)
        results.append(('long', fk))
        f_table = aggregate_raters(hybrid_labels_short)
        fk = round(fleiss_kappa(f_table[0], method='fleiss'), 3)
        results.append(('short', fk))

    return results

### claude-3-haiku metrics
**No voting**

In [33]:
results = annotator_assessment_hybrid(gold_dialogs,
                                       new_crowdsource,
                                       claude)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('hybrid_metrics_claude_without_voting.csv', index=False)

In [34]:
pd.read_csv('hybrid_metrics_claude_without_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.4,0.06,0.3,0.04,0.4,0.06,0.4,0.06,0.57,0.02
2,short,0.72,0.05,0.45,0.06,0.72,0.05,0.72,0.05,0.77,0.02


**Voting**

In [35]:
results = annotator_assessment_voting_hybrid(gold_dialogs,
                                       new_crowdsource,
                                       claude)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('hybrid_metrics_claude_with_voting.csv', index=False)

In [36]:
pd.read_csv('hybrid_metrics_claude_with_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.62,0.01,0.46,0.02,0.62,0.01,0.62,0.01,0.73,0.01
2,short,0.82,0.01,0.59,0.04,0.82,0.01,0.82,0.01,0.84,0.01


**Fleiss' kappa**

In [37]:
results = fleiss_kappa_results(new_crowdsource, claude)
results = pd.DataFrame(results, columns=[
    'label type', 'fleiss kappa'
])
results.groupby('label type')['fleiss kappa'].aggregate(
    ['mean', 'std']).round(3)

Unnamed: 0_level_0,mean,std
label type,Unnamed: 1_level_1,Unnamed: 2_level_1
long,0.332,0.007
short,0.58,0.006


### GPT-3.5 Turbo metrics
**No voting**

In [39]:
results = annotator_assessment_hybrid(gold_dialogs,
                                       new_crowdsource,
                                       chatgpt)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('hybrid_metrics_chatgpt_without_voting.csv', index=False)

In [40]:
pd.read_csv('hybrid_metrics_chatgpt_without_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.39,0.06,0.29,0.04,0.39,0.06,0.39,0.06,0.55,0.02
2,short,0.74,0.03,0.45,0.05,0.74,0.03,0.74,0.03,0.78,0.01


**Voting**

In [41]:
results = annotator_assessment_voting_hybrid(gold_dialogs,
                                       new_crowdsource,
                                       chatgpt)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('hybrid_metrics_chatgpt_with_voting.csv', index=False)

In [42]:
pd.read_csv('hybrid_metrics_chatgpt_with_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.61,0.01,0.46,0.02,0.61,0.01,0.61,0.01,0.73,0.01
2,short,0.83,0.01,0.63,0.03,0.83,0.01,0.83,0.01,0.86,0.01


**Fleiss' kappa**

In [43]:
results = fleiss_kappa_results(new_crowdsource, chatgpt)
results = pd.DataFrame(results, columns=[
    'label type', 'fleiss kappa'
])
results.groupby('label type')['fleiss kappa'].aggregate(
    ['mean', 'std']).round(3)

Unnamed: 0_level_0,mean,std
label type,Unnamed: 1_level_1,Unnamed: 2_level_1
long,0.314,0.007
short,0.609,0.008


### Mixtral 8x22B metrics
**No voting**

In [44]:
results = annotator_assessment_hybrid(gold_dialogs,
                                       new_crowdsource,
                                       mistral)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('hybrid_metrics_mistral_without_voting.csv', index=False)

In [45]:
pd.read_csv('hybrid_metrics_mistral_without_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.39,0.06,0.29,0.04,0.39,0.06,0.39,0.06,0.55,0.01
2,short,0.69,0.11,0.43,0.08,0.69,0.11,0.69,0.11,0.77,0.02


**Voting**

In [46]:
results = annotator_assessment_voting_hybrid(gold_dialogs,
                                       new_crowdsource,
                                       mistral)
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('hybrid_metrics_mistral_with_voting.csv', index=False)

In [47]:
pd.read_csv('hybrid_metrics_mistral_with_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.62,0.01,0.43,0.01,0.62,0.01,0.62,0.01,0.73,0.01
2,short,0.82,0.01,0.56,0.03,0.82,0.01,0.82,0.01,0.84,0.01


**Fleiss' kappa**

In [48]:
results = fleiss_kappa_results(new_crowdsource, mistral)
results = pd.DataFrame(results, columns=[
    'label type', 'fleiss kappa'
])
results.groupby('label type')['fleiss kappa'].aggregate(
    ['mean', 'std']).round(3)

Unnamed: 0_level_0,mean,std
label type,Unnamed: 1_level_1,Unnamed: 2_level_1
long,0.117,0.005
short,0.197,0.004


## Metrics for all LLMs

In [49]:
def get_target_annotated(gold:list, 
                         good: list, 
                         model_annotated: list):
    """Prepare arrays with golden and LLM annotations
    Parameters:
        gold (list): list of golden annotations,
        good (list): list of possible annotations,
        model_annotated (list): list of LLM annotations
    Returns:
        target (list): target annotations,
        annotated (list): actual annotations"""
    target, annotated = [], []
    
    for i, pred in enumerate(model_annotated):
        if pred in gold[i] or pred in good[i]:
            target.append(pred)
            annotated.append(pred)
        else:
            target.append(gold[i][0])
            annotated.append(pred)
            
    return target, annotated

**No voting**

In [50]:
results = []
models = {'claude': claude, 'chatgpt': chatgpt, 'mistral': mistral}

for model in models:
    for label_length in ['long', 'short']:
        target, annotated = get_target_annotated(gold_dialogs[f'{label_length}_labels_id_gold'].tolist(),
                                                 gold_dialogs[f'{label_length}_labels_id'].tolist(),
                                                 models[model][f'{label_length}_labels_id'].tolist())
        accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
        results.append((label_length, model, accuracy, f1_macro, f1_micro, recall, precision))

In [51]:
results = pd.DataFrame(results, columns=[
    'label type', 'model annotator', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('llm_metrics_without_voting.csv', index=False)

In [52]:
pd.read_csv('llm_metrics_without_voting.csv')

Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,mean,std,mean,std,mean,std,mean,std,mean,std
1,long,0.49,0.02,0.33,0.02,0.49,0.02,0.49,0.02,0.68,0.03
2,short,0.68,0.09,0.44,0.08,0.68,0.09,0.68,0.09,0.79,0.02


**Voting**

In [53]:
long = list(zip(claude['long_labels_id'].tolist(), 
         chatgpt['long_labels_id'].tolist(),
         mistral['long_labels_id'].tolist(),))
short = list(zip(claude['short_labels_id'].tolist(), 
         chatgpt['short_labels_id'].tolist(),
         mistral['short_labels_id'].tolist()))
long_voting = list(vote(long))
short_voting = list(vote(short))

In [63]:
results = []
for _ in range(10):
    target, annotated = get_target_annotated(gold_dialogs.long_labels_id_gold.tolist(),
                                             gold_dialogs.long_labels_id.tolist(),
                                             long_voting)
    accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
    results.append(('long', accuracy, f1_macro, f1_micro, recall, precision))
    target, annotated = get_target_annotated(gold_dialogs.short_labels_id_gold.tolist(),
                                             gold_dialogs.short_labels_id.tolist(),
                                             short_voting)
    accuracy, f1_macro, f1_micro, recall, precision = metrics(target, annotated)
    results.append(('short', accuracy, f1_macro, f1_micro, recall, precision))

In [54]:
results = pd.DataFrame(results, columns=[
    'label type', 'accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision'
])
results.groupby('label type')[['accuracy', 'f1_macro', 'f1_micro', 'recall', 'precision']].aggregate(
    ['mean', 'std']).round(2).reset_index().to_csv('llm_metrics_with_voting.csv')

In [55]:
pd.read_csv('llm_metrics_with_voting.csv')

Unnamed: 0.1,Unnamed: 0,label type,accuracy,accuracy.1,f1_macro,f1_macro.1,f1_micro,f1_micro.1,recall,recall.1,precision,precision.1
0,,,mean,std,mean,std,mean,std,mean,std,mean,std
1,0.0,long,0.54,0.0,0.38,0.0,0.54,0.0,0.54,0.0,0.67,0.0
2,1.0,short,0.74,0.0,0.5,0.0,0.74,0.0,0.74,0.0,0.79,0.0


**Fleiss' kappa**

In [59]:
f_table = aggregate_raters(long)
f_table_long = round(fleiss_kappa(f_table[0], method='fleiss'), 3)

In [58]:
f_table = aggregate_raters(short)
f_table_short = round(fleiss_kappa(f_table[0], method='fleiss'), 3)