# Computing Evaluation Results

In [None]:
%pip install krippendorff

Load the evaluation results for each model (one TSV file per model) and drop the columns listed in `excl`.

In [103]:
import pandas as pd

# Columns removed from every results table right after loading.
excl = ['id', 'intent', 'text', 'prompts', 'output']

# One TSV of evaluation results per model; the files are read in the same
# order as the target names on the left-hand side.
crb_df, l2_df, l3_df, zef_df = (
    pd.read_csv(results_path, sep="\t").drop(excl, axis=1)
    for results_path in (
        "EnergyFeedbackGeneration-EvaluationResults-Cerbero-7B.tsv",
        "EnergyFeedbackGeneration-EvaluationResults-LLaMAntino2.tsv",
        "EnergyFeedbackGeneration-EvaluationResults-LLaMAntino3.tsv",
        "EnergyFeedbackGeneration-EvaluationResults-Zefiro.tsv",
    )
)
#l2_df.head(5)


## Inter-Annotator Agreement

In [144]:
import krippendorff

# Evaluation criteria; each one is used as a column-name prefix when
# selecting annotator columns from a results table.
criteria = [
    'Usefulness',
    'Necessity',
    'Understandability',
    'Fluency',
    'Accuracy',
]

# Display names and results tables of the evaluated models, kept in the same
# order so they can be zipped together.
model_names = [
    'Cerbero',
    'LLaMAntino2',
    'LLaMAntino3',
    'Zefiro',
]
model_dfs = [
    crb_df,
    l2_df,
    l3_df,
    zef_df,
]

def compute_alpha(df, criteria, level_of_measurement="interval"):
    """Compute and print Krippendorff's alpha for each criterion of one results table.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table. For every criterion, all columns whose name starts
        with the criterion are used as that criterion's annotator columns.
    criteria : iterable of str
        Criterion names (column-name prefixes).
    level_of_measurement : str or callable, optional
        Forwarded to ``krippendorff.alpha``. The default ``"interval"`` is
        intended to match the package's own default so existing results are
        unchanged (TODO: confirm against the installed krippendorff version).
        Pass ``"ordinal"`` to treat the ratings as ordered categories.

    Returns
    -------
    dict
        Maps each criterion to its alpha value.
    """
    iaa = {}

    for criterion in criteria:
        columns = [col for col in df.columns if col.startswith(criterion)]
        # Transpose so that each row holds one annotator column and each
        # column one item, which is the layout passed as reliability data.
        data = df[columns].values.T
        alpha = krippendorff.alpha(
            reliability_data=data,
            level_of_measurement=level_of_measurement,
        )
        iaa[criterion] = alpha
        print(f"Krippendorff's alpha for {criterion}: {alpha:.2f}")

    return iaa


def compute_average_alpha(criteria, model_iaas):
    """Average each criterion's alpha over all models, printing every mean.

    Parameters
    ----------
    criteria : iterable of str
        Criterion names to average.
    model_iaas : list of dict
        One ``{criterion: alpha}`` mapping per model.

    Returns
    -------
    dict
        Maps each criterion to the arithmetic mean of its per-model alphas.
    """
    average_iaa = {}
    for name in criteria:
        per_model = [scores[name] for scores in model_iaas]
        mean_alpha = sum(per_model) / len(per_model)
        print(f"Average alpha for {name}: {mean_alpha:.2f}")
        average_iaa[name] = mean_alpha
    return average_iaa

# Collect the per-criterion Krippendorff's alpha of every model.
# compute_alpha prints each alpha and returns a {criterion: alpha} dict.
# NOTE: `model_iaas`, `iaa`, `df` and `model_name` are bound at notebook
# (kernel-global) scope; the percentage-agreement cell rebinds `model_iaas`.
model_iaas = []
for model_name, df in zip(model_names, model_dfs):
    print(model_name)
    iaa = compute_alpha(df, criteria)
    model_iaas.append(iaa)

# Mean alpha per criterion across the models (also printed by the helper).
average_iaa = compute_average_alpha(criteria, model_iaas)

Cerbero
Krippendorff's alpha for Usefulness: 0.40
Krippendorff's alpha for Necessity: 0.34
Krippendorff's alpha for Understandability: 0.24
Krippendorff's alpha for Fluency: 0.31
Krippendorff's alpha for Accuracy: 0.47
LLaMAntino2
Krippendorff's alpha for Usefulness: 0.78
Krippendorff's alpha for Necessity: 0.78
Krippendorff's alpha for Understandability: 0.34
Krippendorff's alpha for Fluency: -0.01
Krippendorff's alpha for Accuracy: 0.80
LLaMAntino3
Krippendorff's alpha for Usefulness: 0.30
Krippendorff's alpha for Necessity: 0.35
Krippendorff's alpha for Understandability: 0.13
Krippendorff's alpha for Fluency: 0.26
Krippendorff's alpha for Accuracy: 0.68
Zefiro
Krippendorff's alpha for Usefulness: 0.35
Krippendorff's alpha for Necessity: 0.37
Krippendorff's alpha for Understandability: 0.13
Krippendorff's alpha for Fluency: 0.23
Krippendorff's alpha for Accuracy: 0.60
Average alpha for Usefulness: 0.46
Average alpha for Necessity: 0.46
Average alpha for Understandability: 0.21
Avera

Compute pairwise percentage agreement

In [143]:
import numpy as np


def compute_percent_agreement(df, criteria):
    """Compute the pairwise percentage agreement for each criterion.

    For every criterion, all columns whose name starts with the criterion
    are treated as annotator columns. Every pair of annotator columns is
    compared item by item, and the agreement is the share of compared
    ratings that are equal.

    Items for which either annotator of a pair has a missing rating are
    skipped for that pair, so a missing value is neither an agreement nor a
    disagreement. For tables without missing values the result equals
    ``agreements / (num_items * num_pairs)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table holding the annotator columns.
    criteria : iterable of str
        Criterion names (column-name prefixes).

    Returns
    -------
    dict
        Maps each criterion to its agreement in [0, 1], or to NaN when there
        is nothing to compare (fewer than two annotator columns, no items,
        or no item rated by both annotators of any pair).
    """
    iaa = {}
    for criterion in criteria:
        columns = [col for col in df.columns if col.startswith(criterion)]
        # One row per annotator column, one column per item.
        data = df[columns].values.T
        num_annotators = data.shape[0]
        agreements = 0
        comparisons = 0

        # Compare each unordered pair of annotators exactly once.
        for i in range(num_annotators):
            for j in range(i + 1, num_annotators):
                rated_by_both = ~(pd.isna(data[i]) | pd.isna(data[j]))
                agreements += int(np.sum(data[i][rated_by_both] == data[j][rated_by_both]))
                comparisons += int(np.sum(rated_by_both))

        # Avoid dividing by zero when no pair of ratings could be compared.
        iaa[criterion] = agreements / comparisons if comparisons else float("nan")

    return iaa


def compute_average_percent(criteria, model_iaas):
    """Average each criterion's percentage agreement over all models.

    Parameters
    ----------
    criteria : iterable of str
        Criterion names to average.
    model_iaas : list of dict
        One ``{criterion: agreement}`` mapping per model.

    Returns
    -------
    dict
        Maps each criterion to the arithmetic mean of its per-model values.
    """
    def _mean(values):
        return sum(values) / len(values)

    return {
        name: _mean([scores[name] for scores in model_iaas])
        for name in criteria
    }

# Pairwise percentage agreement of every model, printed per model as a
# {criterion: agreement} dict.
# NOTE: this rebinds the kernel-global `model_iaas` that the Krippendorff's
# alpha cell also assigns, so its content depends on which cell ran last.
model_iaas = []
for model_name, df in zip(model_names, model_dfs):
    print(model_name)
    iaa = compute_percent_agreement(df, criteria)
    print(iaa)
    model_iaas.append(iaa)

# Last expression of the cell: the per-criterion mean across the models is
# shown as the cell output.
compute_average_percent(criteria, model_iaas)



Cerbero
{'Usefulness': 0.4166666666666667, 'Necessity': 0.5233333333333333, 'Understandability': 0.5166666666666667, 'Fluency': 0.4533333333333333, 'Accuracy': 0.66}
LLaMAntino2
{'Usefulness': 0.39666666666666667, 'Necessity': 0.44666666666666666, 'Understandability': 0.37333333333333335, 'Fluency': 0.31666666666666665, 'Accuracy': 0.5566666666666666}
LLaMAntino3
{'Usefulness': 0.3933333333333333, 'Necessity': 0.31333333333333335, 'Understandability': 0.5033333333333333, 'Fluency': 0.4533333333333333, 'Accuracy': 0.6466666666666666}
Zefiro
{'Usefulness': 0.37, 'Necessity': 0.41, 'Understandability': 0.5533333333333333, 'Fluency': 0.43666666666666665, 'Accuracy': 0.58}


{'Usefulness': 0.39416666666666667,
 'Necessity': 0.4233333333333333,
 'Understandability': 0.4866666666666667,
 'Fluency': 0.41500000000000004,
 'Accuracy': 0.6108333333333333}

## Statistical tests

In [None]:
%pip install scikit-posthocs

In [102]:
import pandas as pd
import numpy as np
from scipy.stats import kruskal
import scikit_posthocs as sp

# conflate annotations keeping track of annotators
def conflate_annotations(df, criteria, num_annotators=4):
    """Stack the per-annotator columns of each criterion into one long column.

    For every criterion the columns ``{criterion}_A1`` ...
    ``{criterion}_A{num_annotators}`` are concatenated in annotator order,
    next to a column recording which annotator each score came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with one ``{criterion}_A{i}`` column per annotator.
    criteria : iterable of str
        Criterion names.
    num_annotators : int, optional
        Number of annotator columns per criterion (default 4).

    Returns
    -------
    pandas.DataFrame
        For each criterion, a ``{criterion}`` column with the stacked scores
        and a ``{criterion}_Annotator`` column with labels ``"A1"``, ``"A2"``, ...

    Raises
    ------
    KeyError
        If an expected ``{criterion}_A{i}`` column is missing from ``df``.
    """
    stacked = {}
    for criterion in criteria:
        criterion_scores = []
        annotators = []
        for number in range(1, num_annotators + 1):
            label = f"A{number}"
            scores = df[f"{criterion}_{label}"].values
            criterion_scores.extend(scores)
            annotators.extend([label] * len(scores))
        stacked[criterion] = criterion_scores
        stacked[f"{criterion}_Annotator"] = annotators
    # Build the frame once instead of inserting columns one at a time.
    return pd.DataFrame(stacked)

conflated_dfs = [conflate_annotations(df, criteria) for df in model_dfs]

# Combine all models into one long-format table:
# one row per (model, criterion, single rating).
data = []
for model_name, conflated_df in zip(model_names, conflated_dfs):
    for criterion in criteria:
        # Look the two columns up once per criterion rather than once per row.
        scores = conflated_df[criterion].values
        annotators = conflated_df[f"{criterion}_Annotator"].values
        for score, annotator in zip(scores, annotators):
            data.append({'Model': model_name, 'Criterion': criterion, 'Score': score, 'Annotator': annotator})

# df with ratings for each criterion and model
df = pd.DataFrame(data)

def eta_squared(chi2, N, k=None):
    """Effect size for a Kruskal-Wallis H statistic.

    Parameters
    ----------
    chi2 : float
        The Kruskal-Wallis H statistic.
    N : int
        Total number of observations.
    k : int, optional
        Number of groups. When omitted, the function returns
        ``chi2 / (N - 1)``, which is the epsilon-squared estimator for the
        Kruskal-Wallis test. When given, it returns the eta-squared
        estimator ``(chi2 - k + 1) / (N - k)``.

    Returns
    -------
    float
        The effect-size estimate.
    """
    if k is None:
        # Original behaviour, kept as the default for existing callers.
        return chi2 / (N - 1)
    return (chi2 - k + 1) / (N - k)

# Kruskal-Wallis test + effect size + Dunn's post-hoc test for each criterion
eta_squared_results = {}
for criterion in df['Criterion'].unique():

    print(criterion)
    df_criterion = df[df['Criterion'] == criterion]

    # Drop missing scores once so that the Kruskal-Wallis test, the sample
    # size N and Dunn's test are all computed on the same observations.
    df_valid = df_criterion.dropna(subset=['Score'])

    # One array of scores per model, in order of first appearance.
    groups = [
        df_valid.loc[df_valid['Model'] == model, 'Score'].values
        for model in df_criterion['Model'].unique()
    ]

    kruskal_test = kruskal(*groups)
    print(kruskal_test)

    chi_square = kruskal_test.statistic
    N = len(df_valid)
    eta_sq = eta_squared(chi_square, N)
    eta_squared_results[criterion] = eta_sq

    # Dunn's test with Bonferroni correction
    dunn_results = sp.posthoc_dunn(df_valid, val_col='Score', group_col='Model', p_adjust='bonferroni')
    print(dunn_results)


for criterion, eta_sq in eta_squared_results.items():
    print(f'Eta-squared for {criterion}: η² = {eta_sq:.4f}')


KruskalResult(statistic=84.26518476097468, pvalue=3.7314438905209685e-18)
                  Cerbero   LLaMAntino2   LLaMAntino3        Zefiro
Cerbero      1.000000e+00  5.266857e-10  8.601793e-01  1.000000e+00
LLaMAntino2  5.266857e-10  1.000000e+00  1.119024e-14  7.178825e-14
LLaMAntino3  8.601793e-01  1.119024e-14  1.000000e+00  1.000000e+00
Zefiro       1.000000e+00  7.178825e-14  1.000000e+00  1.000000e+00
KruskalResult(statistic=84.98586353457488, pvalue=2.6133076249360153e-18)
                  Cerbero   LLaMAntino2  LLaMAntino3        Zefiro
Cerbero      1.000000e+00  3.175125e-18     0.000613  9.978984e-02
LLaMAntino2  3.175125e-18  1.000000e+00     0.000003  4.472457e-10
LLaMAntino3  6.132525e-04  3.089213e-06     1.000000  8.164995e-01
Zefiro       9.978984e-02  4.472457e-10     0.816500  1.000000e+00
KruskalResult(statistic=68.62403126742905, pvalue=8.411140077181468e-15)
                  Cerbero   LLaMAntino2   LLaMAntino3        Zefiro
Cerbero      1.000000e+00  2.544223e