In [2]:
import json
import os
import numpy as np
import re

import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from itertools import product
from os import listdir

# German

In [7]:
language = 'german'

def load_metrics(root_dir, language):
    metrics_data = {}

    for root, dirs, files in os.walk(root_dir + language + '/'):
        for file in files:
            
            # only process files of the form split_"<number>.json"
            if re.match('split_\d+\.json', file):
                file_path = os.path.join(root, file)
                with open(file_path, 'r') as f:
                    data = json.load(f)

                # Organize data by model and shot number
                parts = root.split(os.sep)
                model_name = parts[-2]
                num_shots = parts[-1]
                if model_name not in metrics_data:
                    metrics_data[model_name] = {}
                if num_shots not in metrics_data[model_name]:
                    metrics_data[model_name][num_shots] = []
                metrics_data[model_name][num_shots].append(data)
    return metrics_data

metrics = load_metrics(f'../data/metrics/', language)

In [52]:
print(metrics.keys())

dict_keys(['mistral-instruct-v02-logits', 'mistral-instruct-v02-generation', 'gpt35', 'Llama3_8b', 'gpt4', 'leo-hessianai-7b-chat-logits'])


In [53]:
metrics

{'mistral-instruct-v02-logits': {'5shot': [{'accuracy': 0.6754385964912281,
    'precision_binary': 0.6139240506329114,
    'recall_binary': 0.5271739130434783,
    'f1_binary': 0.5672514619883041,
    'micro_precision': 0.6754385964912281,
    'micro_recall': 0.6754385964912281,
    'micro_f1': 0.6754385964912281,
    'macro_precision': 0.6609888709540397,
    'macro_recall': 0.6514546035805626,
    'macro_f1': 0.6538011695906432,
    'true_positives': 97,
    'false_positives': 61,
    'true_negatives': 211,
    'false_negatives': 87,
    'distinct_true_labels': [0, 1],
    'distinct_pred_labels': [0, 1]},
   {'accuracy': 0.6732456140350878,
    'precision_binary': 0.6036585365853658,
    'recall_binary': 0.5409836065573771,
    'f1_binary': 0.5706051873198847,
    'micro_precision': 0.6732456140350878,
    'micro_recall': 0.6732456140350878,
    'micro_f1': 0.6732456140350878,
    'macro_precision': 0.6579936518543268,
    'macro_recall': 0.6514441842310694,
    'macro_f1': 0.653444

In [3]:
def calc_mean_std_for_setting(data, model_name, shots, task, metric):
    try:
        results = [l[task][metric] for l in data[model_name][shots]]
    except KeyError:
        if task == "toxicity":
            results = [l[metric] for l in data[model_name][shots]]
    return float(np.mean(results)), float(np.std(results))


Compute average F1 score for binary offensiveness/toxicity classification:

In [55]:
metric = 'f1_binary'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '0shot', 'toxicity', metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '5shot', 'toxicity', metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '0shot', 'toxicity', metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '5shot', 'toxicity', metric)
mistral_0shot_mean, mistral_0shot_std = calc_mean_std_for_setting(metrics, 'mistral-instruct-v02-logits', '0shot', 'toxicity', metric)
mistral_5shot_mean, mistral_5shot_std = calc_mean_std_for_setting(metrics, 'mistral-instruct-v02-logits', '5shot', 'toxicity', metric)
leolm7b_0shot_mean, leolm_0shot_std = calc_mean_std_for_setting(metrics, 'leo-hessianai-7b-chat-logits', '0shot', 'toxicity', metric)
leolm7b_5shot_mean, leolm_5shot_std = calc_mean_std_for_setting(metrics, 'leo-hessianai-7b-chat-logits', '5shot', 'toxicity', metric)
llama38b_0shot_mean, llama38b_0shot_std = calc_mean_std_for_setting(metrics, 'Llama3_8b', '0shot', 'toxicity', metric)
llama38b_5shot_mean, llama38b_5shot_std = calc_mean_std_for_setting(metrics, 'Llama3_8b', '5shot', 'toxicity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')
print(f'Mistral 0-shot: mean {mistral_0shot_mean}, std {mistral_0shot_std}')
print(f'Mistral 5-shot: mean {mistral_5shot_mean}, std {mistral_5shot_std}')
print(f'LeoLM 0-shot: mean {leolm7b_0shot_mean}, std {leolm_0shot_std}')
print(f'LeoLM 5-shot: mean {leolm7b_5shot_mean}, std {leolm_5shot_std}')
print(f'Llama3 8b 0-shot: mean {llama38b_0shot_mean}, std {llama38b_0shot_std}')
print(f'Llama3 8b 5-shot: mean {llama38b_5shot_mean}, std {llama38b_5shot_std}')

GPT 3.5 0-shot: mean 0.6834570038837534, std 0.008981314205363642
GPT 3.5 5-shot: mean 0.7221945827127847, std 0.013042186380112904
GPT 4 0-shot: mean 0.6958680107020687, std 0.022708418549084394
GPT 4 5-shot: mean 0.7601107961415291, std 0.027334703518908855
Mistral 0-shot: mean 0.3036743770399912, std 0.016495710957608163
Mistral 5-shot: mean 0.5513068761969737, std 0.028207125181928337
LeoLM 0-shot: mean 0.6140259725489766, std 0.018473863127967484
LeoLM 5-shot: mean 0.5199957163660538, std 0.020064732819364262
Llama3 8b 0-shot: mean 0.6741470438359323, std 0.01732577403437997
Llama3 8b 5-shot: mean 0.5892419645272124, std 0.004828106863343557


In [56]:
metric = 'macro_f1'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '0shot', 'toxicity', metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '5shot', 'toxicity', metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '0shot', 'toxicity', metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '5shot', 'toxicity', metric)
mistral_0shot_mean, mistral_0shot_std = calc_mean_std_for_setting(metrics, 'mistral-instruct-v02-logits', '0shot', 'toxicity', metric)
mistral_5shot_mean, mistral_5shot_std = calc_mean_std_for_setting(metrics, 'mistral-instruct-v02-logits', '5shot', 'toxicity', metric)
leolm7b_0shot_mean, leolm_0shot_std = calc_mean_std_for_setting(metrics, 'leo-hessianai-7b-chat-logits', '0shot', 'toxicity', metric)
leolm7b_5shot_mean, leolm_5shot_std = calc_mean_std_for_setting(metrics, 'leo-hessianai-7b-chat-logits', '5shot', 'toxicity', metric)
llama38b_0shot_mean, llama38b_0shot_std = calc_mean_std_for_setting(metrics, 'Llama3_8b', '0shot', 'toxicity', metric)
llama38b_5shot_mean, llama38b_5shot_std = calc_mean_std_for_setting(metrics, 'Llama3_8b', '5shot', 'toxicity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')
print(f'Mistral 0-shot: mean {mistral_0shot_mean}, std {mistral_0shot_std}')
print(f'Mistral 5-shot: mean {mistral_5shot_mean}, std {mistral_5shot_std}')
print(f'LeoLM 0-shot: mean {leolm7b_0shot_mean}, std {leolm_0shot_std}')
print(f'LeoLM 5-shot: mean {leolm7b_5shot_mean}, std {leolm_5shot_std}')
print(f'Llama3 8b 0-shot: mean {llama38b_0shot_mean}, std {llama38b_0shot_std}')
print(f'Llama3 8b 5-shot: mean {llama38b_5shot_mean}, std {llama38b_5shot_std}')

GPT 3.5 0-shot: mean 0.6399004133630843, std 0.01642357502595972
GPT 3.5 5-shot: mean 0.7186632017030311, std 0.01550105137192629
GPT 4 0-shot: mean 0.768468761091805, std 0.01566308265144348
GPT 4 5-shot: mean 0.8054771233012434, std 0.02179098762332204
Mistral 0-shot: mean 0.5328623662358666, std 0.01008026700238353
Mistral 5-shot: mean 0.6393276489674001, std 0.019750795224120467
LeoLM 0-shot: mean 0.537649440607449, std 0.02565586926677769
LeoLM 5-shot: mean 0.5650894093465297, std 0.018685378390896155
Llama3 8b 0-shot: mean 0.6668561489918189, std 0.016631324003507007
Llama3 8b 5-shot: mean 0.37371789703349545, std 0.01631875695217568


Compute mean vulgarity scores:

In [57]:
metric = 'f1_binary'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '0shot', 'vulgarity', metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '5shot', 'vulgarity', metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '0shot', 'vulgarity', metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '5shot', 'vulgarity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')

GPT 3.5 0-shot: mean 0.4029864641214167, std 0.017476852968389054
GPT 3.5 5-shot: mean 0.43049428306123794, std 0.024552895733935497
GPT 4 0-shot: mean 0.3550059287874655, std 0.03669812711480945
GPT 4 5-shot: mean 0.41182287964939973, std 0.020534540938615325


In [58]:
metric = 'macro_f1'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '0shot', 'vulgarity', metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '5shot', 'vulgarity', metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '0shot', 'vulgarity', metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '5shot', 'vulgarity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')

GPT 3.5 0-shot: mean 0.6898461358731625, std 0.009281303564825635
GPT 3.5 5-shot: mean 0.7034093463525843, std 0.012739507298397934
GPT 4 0-shot: mean 0.6708468707956279, std 0.01863964730026959
GPT 4 5-shot: mean 0.699255617334775, std 0.010378803830970094


Compute mean target scores (macro F1):

In [59]:
metric = 'micro_f1'
task = 'target'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '0shot', task, metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '5shot', task, metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '0shot', task, metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '5shot', task, metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')

GPT 3.5 0-shot: mean 0.981535981905831, std 0.0009304075376001975
GPT 3.5 5-shot: mean 0.9858585057510221, std 0.0006991321658125397
GPT 4 0-shot: mean 0.9890950507755767, std 0.0009051743432576767
GPT 4 5-shot: mean 0.988443508364419, std 0.0011057852334838334


In [60]:
metric = 'macro_f1'
task = 'target'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '0shot', task, metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics, 'gpt35', '5shot', task, metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '0shot', task, metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics, 'gpt4', '5shot', task, metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')

GPT 3.5 0-shot: mean 0.5184421906736161, std 0.00923224539796234
GPT 3.5 5-shot: mean 0.5541202501616989, std 0.010584615305949197
GPT 4 0-shot: mean 0.5510303724032719, std 0.028411724514751734
GPT 4 5-shot: mean 0.5766787979287746, std 0.034208164608525625


## Compute Target Scores only over Spans (Micro F1)

In [4]:
def load_metrics_only_over_spans(root_dir, language):
    metrics_data = {}

    for root, dirs, files in os.walk(root_dir + language + '/'):
        for file in files:
            
            # only process files of the form "split_<number>only_spans_true.json"
            if file.endswith('only_spans_both.json'):
                file_path = os.path.join(root, file)
                with open(file_path, 'r') as f:
                    data = json.load(f)
                
                # Organize data by model and shot number
                parts = root.split(os.sep)
                model_name = parts[-2]
                num_shots = parts[-1]
                if model_name not in metrics_data:
                    metrics_data[model_name] = {}
                if num_shots not in metrics_data[model_name]:
                    metrics_data[model_name][num_shots] = []

                metrics_data[model_name][num_shots].append(data)
    
    return metrics_data

metrics_only_spans = load_metrics_only_over_spans('../data/metrics/', 'german')
metrics_only_spans

{'gpt35': {'5shot': [{'toxicity': {'accuracy': 0.743421052631579,
     'precision_binary': 0.6173285198555957,
     'recall_binary': 0.9395604395604396,
     'f1_binary': 0.7450980392156862,
     'micro_precision': 0.743421052631579,
     'micro_recall': 0.743421052631579,
     'micro_f1': 0.7434210526315789,
     'macro_precision': 0.7779380029445576,
     'macro_recall': 0.7763495628459132,
     'macro_f1': 0.7434099467601609,
     'true_positives': 171,
     'false_positives': 106,
     'true_negatives': 168,
     'false_negatives': 11,
     'distinct_true_labels': [0, 1],
     'distinct_pred_labels': [0, 1]},
    'vulgarity': {'accuracy': 0.3069936421435059,
     'precision_binary': 0.367791077257889,
     'recall_binary': 0.65,
     'f1_binary': 0.4697706740792217,
     'micro_precision': 0.3069936421435059,
     'micro_recall': 0.3069936421435059,
     'micro_f1': 0.3069936421435059,
     'macro_precision': 0.1838955386289445,
     'macro_recall': 0.325,
     'macro_f1': 0.234885

In [62]:
metric = 'micro_f1'
task = 'target'
gpt35_0shot_mean, gpt35_0shot_std = calc_mean_std_for_setting(metrics_only_spans, 'gpt35', '0shot', task, metric)
gpt35_5shot_mean, gpt35_5shot_std = calc_mean_std_for_setting(metrics_only_spans, 'gpt35', '5shot', task, metric)
gpt4_0shot_mean, gpt4_0shot_std = calc_mean_std_for_setting(metrics_only_spans, 'gpt4', '0shot', task, metric)
gpt4_5shot_mean, gpt4_5shot_std = calc_mean_std_for_setting(metrics_only_spans, 'gpt4', '5shot', task, metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean}, std {gpt35_0shot_std}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean}, std {gpt35_5shot_std}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean}, std {gpt4_0shot_std}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean}, std {gpt4_5shot_std}')

GPT 3.5 0-shot: mean 0.16635700747140722, std 0.009925236939753022
GPT 3.5 5-shot: mean 0.2003963017259335, std 0.01211082821373046
GPT 4 0-shot: mean 0.1976978963516321, std 0.02426456453932022
GPT 4 5-shot: mean 0.22320826125626478, std 0.03136496700505511


## Aggregate New Results

In [63]:
def calc_mean_std_for_setting_toxicity_only(data, model_name, shots, metric):
    results = [l[metric] for l in data[model_name][shots]]
    return float(np.mean(results)), float(np.std(results))

Compute average F1 score for binary offensiveness/toxicity classification:

In [64]:
metric = 'f1_binary'
mistral_0shot_mean, mistral_0shot_std = calc_mean_std_for_setting_toxicity_only(metrics, 'mistral-instruct-v02-logits', '0shot', metric)
mistral_5shot_mean, mistral_5shot_std = calc_mean_std_for_setting_toxicity_only(metrics, 'mistral-instruct-v02-logits', '5shot', metric)
llama38b_0shot_mean, llama38b_0shot_std = calc_mean_std_for_setting_toxicity_only(metrics, 'Llama3_8b', '0shot', metric)
llama38b_5shot_mean, llama38b_5shot_std = calc_mean_std_for_setting_toxicity_only(metrics, 'Llama3_8b', '5shot', metric)
print(f'Mistral 0-shot: mean {mistral_0shot_mean}, std {mistral_0shot_std}')
print(f'Mistral 5-shot: mean {mistral_5shot_mean}, std {mistral_5shot_std}')
print(f'Llama3 8b 0-shot: mean {llama38b_0shot_mean}, std {llama38b_0shot_std}')
print(f'Llama3 8b 5-shot: mean {llama38b_5shot_mean}, std {llama38b_5shot_std}')

Mistral 0-shot: mean 0.3036743770399912, std 0.016495710957608163
Mistral 5-shot: mean 0.5513068761969737, std 0.028207125181928337
Llama3 8b 0-shot: mean 0.6741470438359323, std 0.01732577403437997
Llama3 8b 5-shot: mean 0.5892419645272124, std 0.004828106863343557


# English

In [8]:
language = 'english'

metrics_en = load_metrics(f'../data/metrics/', language)
print(metrics_en.keys())

dict_keys(['mistral-instruct-v02-logits', 'gpt35', 'Llama3_8b', 'gpt4'])


In [66]:
metric = 'f1_binary'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '0shot', 'toxicity', metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '5shot', 'toxicity', metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '0shot', 'toxicity', metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '5shot', 'toxicity', metric)
mistral_0shot_mean, mistral_0shot_std = calc_mean_std_for_setting(metrics, 'mistral-instruct-v02-logits', '0shot', 'toxicity', metric)
mistral_5shot_mean, mistral_5shot_std = calc_mean_std_for_setting(metrics, 'mistral-instruct-v02-logits', '5shot', 'toxicity', metric)
llama38b_0shot_mean_en, llama38b_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'Llama3_8b', '0shot', 'toxicity', metric)
llama38b_5shot_mean_en, llama38b_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'Llama3_8b', '5shot', 'toxicity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')
print(f'Mistral 0-shot: mean {mistral_0shot_mean}, std {mistral_0shot_std}')
print(f'Mistral 5-shot: mean {mistral_5shot_mean}, std {mistral_5shot_std}')
print(f'Llama3 8b 0-shot: mean {llama38b_0shot_mean_en}, std {llama38b_0shot_std_en}')
print(f'Llama3 8b 5-shot: mean {llama38b_5shot_mean_en}, std {llama38b_5shot_std_en}')

GPT 3.5 0-shot: mean 0.8892291888243993, std 0.017243466395349658
GPT 3.5 5-shot: mean 0.8933200312793076, std 0.02051588156802596
GPT 4 0-shot: mean 0.8691730739399046, std 0.02530396502236657
GPT 4 5-shot: mean 0.8943939614044222, std 0.019073378155010836
Mistral 0-shot: mean 0.3036743770399912, std 0.016495710957608163
Mistral 5-shot: mean 0.5513068761969737, std 0.028207125181928337
Llama3 8b 0-shot: mean 0.7822596166445457, std 0.03240321260656494
Llama3 8b 5-shot: mean 0.8196300049665985, std 0.016540026450266324


In [67]:
metric = 'macro_f1'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '0shot', 'toxicity', metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '5shot', 'toxicity', metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '0shot', 'toxicity', metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '5shot', 'toxicity', metric)
mistral_0shot_mean_en, mistral_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'mistral-instruct-v02-logits', '0shot', 'toxicity', metric)
mistral_5shot_mean_en, mistral_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'mistral-instruct-v02-logits', '5shot', 'toxicity', metric)
llama38b_0shot_mean_en, llama38b_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'Llama3_8b', '0shot', 'toxicity', metric)
llama38b_5shot_mean_en, llama38b_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'Llama3_8b', '5shot', 'toxicity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')
print(f'Mistral 0-shot: mean {mistral_0shot_mean}, std {mistral_0shot_std}')
print(f'Mistral 5-shot: mean {mistral_5shot_mean}, std {mistral_5shot_std}')
print(f'Llama3 8b 0-shot: mean {llama38b_0shot_mean_en}, std {llama38b_0shot_std_en}')
print(f'Llama3 8b 5-shot: mean {llama38b_5shot_mean_en}, std {llama38b_5shot_std_en}')

GPT 3.5 0-shot: mean 0.8450519041769496, std 0.02118849687653775
GPT 3.5 5-shot: mean 0.8466284908386429, std 0.02779516424850154
GPT 4 0-shot: mean 0.8380955260159635, std 0.027860919991959353
GPT 4 5-shot: mean 0.860357526486297, std 0.021242466325986902
Mistral 0-shot: mean 0.3036743770399912, std 0.016495710957608163
Mistral 5-shot: mean 0.5513068761969737, std 0.028207125181928337
Llama3 8b 0-shot: mean 0.7455211991978186, std 0.035353161331946686
Llama3 8b 5-shot: mean 0.7495169824568032, std 0.028258218624681954


In [14]:
metric = 'f1_binary'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '0shot', 'vulgarity', metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '5shot', 'vulgarity', metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '0shot', 'vulgarity', metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '5shot', 'vulgarity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')

GPT 3.5 0-shot: mean 0.4570047862247285, std 0.04139009695790149
GPT 3.5 5-shot: mean 0.4730624532167842, std 0.021542608644551146
GPT 4 0-shot: mean 0.4062829350033761, std 0.0596245627860067
GPT 4 5-shot: mean 0.4263781875295306, std 0.035401012791129595


In [69]:
metric = 'macro_f1'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '0shot', 'vulgarity', metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '5shot', 'vulgarity', metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '0shot', 'vulgarity', metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '5shot', 'vulgarity', metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')

GPT 3.5 0-shot: mean 0.721456094489895, std 0.021076645003824522
GPT 3.5 5-shot: mean 0.7291571121444743, std 0.01117258008443193
GPT 4 0-shot: mean 0.6976301414498656, std 0.03045098682113758
GPT 4 5-shot: mean 0.707029048396531, std 0.01850883544833917


In [70]:
metric = 'micro_f1'
task = 'target'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '0shot', task, metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '5shot', task, metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '0shot', task, metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '5shot', task, metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')

GPT 3.5 0-shot: mean 0.9915271676931795, std 0.001589827855857926
GPT 3.5 5-shot: mean 0.9928454305254638, std 0.0009902372537771316
GPT 4 0-shot: mean 0.9910723531835167, std 0.001794450117032075
GPT 4 5-shot: mean 0.9922141691821953, std 0.0014571242771846233


In [71]:
metric = 'macro_f1'
task = 'target'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '0shot', task, metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt35', '5shot', task, metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '0shot', task, metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_en, 'gpt4', '5shot', task, metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')

GPT 3.5 0-shot: mean 0.4971165412720767, std 0.020274434220713546
GPT 3.5 5-shot: mean 0.5228714610968284, std 0.02643069612834431
GPT 4 0-shot: mean 0.49412278992752573, std 0.017703348023476076
GPT 4 5-shot: mean 0.521050556925988, std 0.022428608270117445


In [10]:
metrics_only_spans_en = load_metrics_only_over_spans('../data/metrics/', "english")

In [73]:
metric = 'micro_f1'
task = 'target'
gpt35_0shot_mean_en, gpt35_0shot_std_en = calc_mean_std_for_setting(metrics_only_spans_en, 'gpt35', '0shot', task, metric)
gpt35_5shot_mean_en, gpt35_5shot_std_en = calc_mean_std_for_setting(metrics_only_spans_en, 'gpt35', '5shot', task, metric)
gpt4_0shot_mean_en, gpt4_0shot_std_en = calc_mean_std_for_setting(metrics_only_spans_en, 'gpt4', '0shot', task, metric)
gpt4_5shot_mean_en, gpt4_5shot_std_en = calc_mean_std_for_setting(metrics_only_spans_en, 'gpt4', '5shot', task, metric)
print(f'GPT 3.5 0-shot: mean {gpt35_0shot_mean_en}, std {gpt35_0shot_std_en}')
print(f'GPT 3.5 5-shot: mean {gpt35_5shot_mean_en}, std {gpt35_5shot_std_en}')
print(f'GPT 4 0-shot: mean {gpt4_0shot_mean_en}, std {gpt4_0shot_std_en}')
print(f'GPT 4 5-shot: mean {gpt4_5shot_mean_en}, std {gpt4_5shot_std_en}')

GPT 3.5 0-shot: mean 0.16259751560995672, std 0.01785468655214053
GPT 3.5 5-shot: mean 0.1833124007217653, std 0.02881012035320713
GPT 4 0-shot: mean 0.14839213095542986, std 0.020303136017932467
GPT 4 5-shot: mean 0.18499252602407337, std 0.023978851864500012


# Create Table for Latex

In [12]:
language = 'english'

if language == 'german':
    metrics_tmp = metrics
    metrics_only_spans_tmp = metrics_only_spans

else:
    metrics_tmp = metrics_en
    metrics_only_spans_tmp = metrics_only_spans_en

model_names = {
            'gpt35': 'GPT 3.5',
            'gpt4': 'GPT 4',
            'mistral-instruct-v02-logits': 'Mistral',
            'leo-hessianai-7b-chat-logits': 'LeoLM',
            'Llama3_8b': 'Llama3'
        }

n_params = {
            'gpt35': '-',
            'gpt4': '-',
            'mistral-instruct-v02-logits': '7.24B',
            #'leo-hessianai-7b-chat-logits': '7B',
            'Llama3_8b': '8B'
        }

prompt_str = {
    '0shot': '0-Shot',
    '5shot': '5-Shot'
}

for model in [
    #'leo-hessianai-7b-chat-logits', 
    'mistral-instruct-v02-logits', 
    'Llama3_8b', 
    'gpt35', 
    'gpt4'
    ]:

    for prompt_style in ['0shot', '5shot']:
        if '0' in prompt_style:
            str_final = '\multirow{2}{*}{' + model_names[model] + '} & ' + prompt_str[prompt_style] + ' & \multirow{2}{*}{' + n_params[model] + '}'
        else:
            str_final = '& ' + prompt_str[prompt_style] + ' & ' 
        
        labels = ['toxicity']
        if 'gpt' in model: #or 'Llama' in model:
            labels += ['vulgarity', 'target']

        for l in labels: 
            if 'target' in l:
                metrics_list = ['micro_f1', 'macro_f1']
            else: 
                metrics_list = ['f1_binary', 'macro_f1']

            for m in metrics_list:
                if 'target' in l and 'micro' in m:
                    mean, std = calc_mean_std_for_setting(metrics_only_spans_tmp, model, prompt_style, l, m)
                else:  
                    mean, std = calc_mean_std_for_setting(metrics_tmp, model, prompt_style, l, m)
                mean_str = str(round(mean, 2))[1:]
                if len(mean_str) == 2:
                    mean_str += '0'
                str_final += ' & $' + mean_str + ' \pm ' + str(round(std, 2))[2:] + '$'
        
        if 'gpt' not in model: 
            str_final += ' & - & - & - & - '

        str_final += ' \\\\'
        print(str_final)

\multirow{2}{*}{Mistral} & 0-Shot & \multirow{2}{*}{7.24B} & $.48 \pm 05$ & $.55 \pm 04$ & - & - & - & -  \\
& 5-Shot &  & $.77 \pm 03$ & $.73 \pm 03$ & - & - & - & -  \\
\multirow{2}{*}{Llama3} & 0-Shot & \multirow{2}{*}{8B} & $.78 \pm 03$ & $.75 \pm 04$ & - & - & - & -  \\
& 5-Shot &  & $.82 \pm 02$ & $.75 \pm 03$ & - & - & - & -  \\
\multirow{2}{*}{GPT 3.5} & 0-Shot & \multirow{2}{*}{-} & $.89 \pm 02$ & $.85 \pm 02$ & $.46 \pm 04$ & $.72 \pm 02$ & $.16 \pm 02$ & $.50 \pm 02$ \\
& 5-Shot &  & $.89 \pm 02$ & $.85 \pm 03$ & $.47 \pm 02$ & $.73 \pm 01$ & $.18 \pm 03$ & $.52 \pm 03$ \\
\multirow{2}{*}{GPT 4} & 0-Shot & \multirow{2}{*}{-} & $.87 \pm 03$ & $.84 \pm 03$ & $.41 \pm 06$ & $.70 \pm 03$ & $.15 \pm 02$ & $.49 \pm 02$ \\
& 5-Shot &  & $.89 \pm 02$ & $.86 \pm 02$ & $.43 \pm 04$ & $.71 \pm 02$ & $.18 \pm 02$ & $.52 \pm 02$ \\


## Check whether differences are statstically different

In [26]:
def get_values_for_setting(data, model_name, shots, task, metric):
    try:
        results = [l[task][metric] for l in data[model_name][shots]]
    except KeyError:
        if task == "toxicity":
            results = [l[metric] for l in data[model_name][shots]]
    return results

In [27]:
language = 'german'
metric = 'micro_f1'
task = 'target'
alpha = 0.35

metrics = load_metrics(f'../data/metrics/', language)

if task == 'target':
    metrics = load_metrics_only_over_spans(f'../data/metrics/', language)

task_name_fine = {'target': 'target_extraction', 'toxicity': 'offensiveness', 'vulgarity': 'vulgarity'}
measure_name_fine = {'toxicity_f1_binary': 'binary_f1', 
                     'toxicity_macro_f1': 'macro_f1', 
                     'vulgarity_f1_binary': 'f1_binary', 
                     'vulgarity_macro_f1': 'f1_macro', 
                     'target_macro_f1': 'f1_macro',
                     'target_micro_f1': 'f1_micro'
                    }

if language == 'german': 
    model_names_fine = [
        #"bert-base-uncased",
        "bert-base-german-cased",
        "bert-base-german-dbmdz-cased",
        "deepset-gelectra-base",
        "deepset-gbert-base",
        #"deepset-gelectra-large", 
        #"deepset-gbert-large",
    ]

else: 
    model_names_fine = [
    "electra-large",
    "roberta-large"
    ]

model_names_llms = [
    #'gpt35 0shot', 
    #'gpt35 5shot', 
    #'gpt4 0shot', 
    'gpt4 5shot',
    ]

if task == 'toxicity' and language == 'german': 
    model_names_llms = [
        'mistral-instruct-v02-logits', 
        'leo-hessianai-7b-chat-logits'
        ] + model_names_llms

all_f1s = []
for m_name in model_names_fine:
    if m_name in listdir('../results/{}/'.format(language) + task_name_fine[task]):
        print(m_name)
        with open('../results/{}/{}/{}/overall_results.json'.format(language, task_name_fine[task], m_name), 'r') as file:
            data = json.load(file)
        f1_name = 'test_' + measure_name_fine[task + '_' + metric] 
        if task == 'target_extraction':
            for i in range(10):
                all_classes = set()
                for comment in data[str(i)][1]['test_predicted_labels']:
                    for pred in comment:
                        all_classes.add(pred)
                if len(all_classes) > 3:
                    f1s.append(data[str(i)][1][f1_name])
                else:
                    if 'micro' in f1_name:
                        f1s.append(data[str(i)][1][f1_name])
                    else:
                        f1s.append(0.25)
        else:
            f1s = [data[str(i)][1][f1_name] for i in range(10)]
        print(np.mean(f1s))
    all_f1s.append(sorted(f1s))

#name_n_shot_llms = list(product(model_names, shots))

print(model_names_fine)
print(model_names_llms)

all_f1s += [sorted(get_values_for_setting(metrics, s[:-6], s[-5:], task, metric)) for s in model_names_llms]

# Create variables x1, x2, x3 and assign values from my_list
for i, value in enumerate(all_f1s):
    print(f'model_{i} = {np.mean(value)}')
    exec(f'model_{i} = {value}')

# Combine all F1 scores
all_f1_scores = np.concatenate(all_f1s)

# Check normality using Shapiro-Wilk test
shapiro_test_statistic, shapiro_p_value = stats.shapiro(all_f1_scores)
print("Shapiro-Wilk Test (p-value):", shapiro_p_value)

# Check homogeneity of variances using Levene's test
str_to_execute = '('
for i in range(len(all_f1s)):
    str_to_execute += f'model_{i}, '
str_to_execute = str_to_execute[:-2]
str_to_execute += ')'
exec('stats.levene' + str_to_execute)
levene_test_statistic, levene_p_value = eval('stats.levene' + str_to_execute)
print("Levene's Test (p-value):", levene_p_value)

# Perform ANOVA
exec('stats.f_oneway' + str_to_execute)
anova_results = eval('stats.f_oneway' +str_to_execute)
print("ANOVA F-statistic:", anova_results.statistic)
print("ANOVA p-value:", anova_results.pvalue)

# If ANOVA p-value is significant, perform post-hoc Tukey HSD test
#if anova_results.pvalue < 0.35:

# Combine all F1 scores with corresponding group labels
group_labels_nested = [[m] * 10 for m in model_names_fine] + [[s] * 10 for s in model_names_llms]
group_labels = [s for l in group_labels_nested for s in l]
tukey_results = pairwise_tukeyhsd(all_f1_scores, group_labels, alpha=alpha)
print(tukey_results)

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
metrics

{'mistral-instruct-v02-logits': {'5shot': [{'accuracy': 0.6754385964912281,
    'precision_binary': 0.6139240506329114,
    'recall_binary': 0.5271739130434783,
    'f1_binary': 0.5672514619883041,
    'micro_precision': 0.6754385964912281,
    'micro_recall': 0.6754385964912281,
    'micro_f1': 0.6754385964912281,
    'macro_precision': 0.6609888709540397,
    'macro_recall': 0.6514546035805626,
    'macro_f1': 0.6538011695906432,
    'true_positives': 97,
    'false_positives': 61,
    'true_negatives': 211,
    'false_negatives': 87,
    'distinct_true_labels': [0, 1],
    'distinct_pred_labels': [0, 1]},
   {'accuracy': 0.6732456140350878,
    'precision_binary': 0.6036585365853658,
    'recall_binary': 0.5409836065573771,
    'f1_binary': 0.5706051873198847,
    'micro_precision': 0.6732456140350878,
    'micro_recall': 0.6732456140350878,
    'micro_f1': 0.6732456140350878,
    'macro_precision': 0.6579936518543268,
    'macro_recall': 0.6514441842310694,
    'macro_f1': 0.653444