# Computing Evaluation Results

In [None]:
%pip install krippendorff

Load the evaluation results of every model

In [None]:
import pandas as pd

# Columns removed from every results table right after it is read.
excl = ['id', 'intent', 'text', 'prompts', 'output']

# One tab-separated results file per model.
crb_df = pd.read_csv(
    "EnergyFeedbackGeneration-EvaluationResults-Cerbero-7B.tsv", sep="\t"
).drop(columns=excl)
l2_df = pd.read_csv(
    "EnergyFeedbackGeneration-EvaluationResults-LLaMAntino2.tsv", sep="\t"
).drop(columns=excl)
l3_df = pd.read_csv(
    "EnergyFeedbackGeneration-EvaluationResults-LLaMAntino3.tsv", sep="\t"
).drop(columns=excl)
zef_df = pd.read_csv(
    "EnergyFeedbackGeneration-EvaluationResults-Zefiro.tsv", sep="\t"
).drop(columns=excl)
#l2_df.head(5)


## Inter-Annotator Agreement

In [None]:
import krippendorff

# Evaluation criteria; each name is the prefix of its rating columns.
criteria = [
    'Usefulness',
    'Necessity',
    'Understandability',
    'Fluency',
    'Accuracy',
]

# Display names and result frames, kept in matching order so they can be zipped.
model_names = [
    'Cerbero',
    'LLaMAntino2',
    'LLaMAntino3',
    'Zefiro',
]
model_dfs = [
    crb_df,
    l2_df,
    l3_df,
    zef_df,
]

def compute_alpha(df, criteria):
    """Compute and print Krippendorff's alpha for every criterion.

    For each criterion, all columns of ``df`` whose name starts with the
    criterion are selected and transposed, so that every selected column
    becomes one row of the reliability matrix passed to
    ``krippendorff.alpha``.

    Args:
        df: DataFrame holding the rating columns.
        criteria: Iterable of criterion names used as column-name prefixes.

    Returns:
        dict mapping each criterion to its alpha value.
    """
    alpha_by_criterion = {}

    for criterion in criteria:
        rating_columns = [name for name in df.columns if name.startswith(criterion)]
        # Transpose: one row per selected column, one column per DataFrame row.
        reliability_matrix = df[rating_columns].values.T
        alpha_by_criterion[criterion] = krippendorff.alpha(
            reliability_data=reliability_matrix
        )
        print(
            f"Krippendorff's alpha for {criterion}: "
            f"{alpha_by_criterion[criterion]:.2f}"
        )

    return alpha_by_criterion


def compute_average_alpha(criteria, model_iaas):
    """Average the per-model alpha values of each criterion and print them.

    Args:
        criteria: Iterable of criterion names to average.
        model_iaas: Collection of dicts, one per model, each mapping a
            criterion name to its alpha value.

    Returns:
        dict mapping each criterion to the arithmetic mean of its alphas
        across all entries of ``model_iaas``.
    """
    mean_by_criterion = {}
    for name in criteria:
        per_model = [scores[name] for scores in model_iaas]
        mean_alpha = sum(per_model) / len(per_model)
        mean_by_criterion[name] = mean_alpha
        print(f"Average alpha for {name}: {mean_alpha:.2f}")
    return mean_by_criterion

# Gather one alpha dictionary per model, printing the model name as a header
# before the per-criterion lines emitted by compute_alpha.
model_iaas = []
for model_name, df in zip(model_names, model_dfs):
    print(model_name)
    model_iaas.append(compute_alpha(df, criteria))

# Mean alpha per criterion across all models.
average_iaa = compute_average_alpha(criteria, model_iaas)

## Statistical tests

In [None]:
%pip install scikit-posthocs

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import kruskal
import scikit_posthocs as sp

# conflate annotations keeping track of annotators
def conflate_annotations(df, criteria, n_annotators=4):
    """Stack the per-annotator rating columns of each criterion into one column.

    For every criterion, the columns ``{criterion}_A1`` up to
    ``{criterion}_A{n_annotators}`` are concatenated in annotator order.
    A companion ``{criterion}_Annotator`` column records which annotator
    label (``"A1"``, ``"A2"``, ...) each stacked score came from.

    Args:
        df: DataFrame containing one ``{criterion}_A{k}`` column for every
            criterion and every annotator index ``k``.
        criteria: Iterable of criterion names.
        n_annotators: Number of annotator columns to stack per criterion.
            Defaults to 4, the value that was previously hard-coded.

    Returns:
        DataFrame with two columns per criterion (the stacked scores and the
        matching annotator labels) and ``len(df) * n_annotators`` rows.

    Raises:
        KeyError: If an expected ``{criterion}_A{k}`` column is missing.
    """
    annotator_labels = [f"A{k}" for k in range(1, n_annotators + 1)]

    stacked = {}
    for criterion in criteria:
        criterion_scores = []
        annotators = []
        for label in annotator_labels:
            scores = df[f"{criterion}_{label}"].values
            criterion_scores.extend(scores)
            annotators.extend([label] * len(scores))
        stacked[criterion] = criterion_scores
        stacked[f"{criterion}_Annotator"] = annotators

    # Build the frame in one step instead of inserting columns one at a time.
    return pd.DataFrame(stacked)

# One stacked frame per model, in the same order as model_dfs.
conflated_dfs = [conflate_annotations(model_df, criteria) for model_df in model_dfs]

# Flatten everything into long format: one record per (model, criterion, rating).
data = []
for model_name, conflated_df in zip(model_names, conflated_dfs):
    for criterion in criteria:
        ratings = conflated_df[criterion].values
        raters = conflated_df[f"{criterion}_Annotator"].values
        data.extend(
            {'Model': model_name, 'Criterion': criterion, 'Score': score, 'Annotator': annotator}
            for score, annotator in zip(ratings, raters)
        )

# Long-format frame with the ratings for each criterion and model.
df = pd.DataFrame(data)
#print(df.head(10))


# Kruskal-Wallis omnibus test + Dunn's post-hoc test for each criterion.
# eta_squared_results maps each criterion to the H-based effect size.
eta_squared_results = {}
for criterion in df['Criterion'].unique():

    print(criterion)

    # Drop missing scores once, up front, so that the omnibus test and the
    # post-hoc test are guaranteed to run on exactly the same observations
    # (regardless of how the post-hoc routine treats NaN internally).
    df_criterion = df[df['Criterion'] == criterion].dropna(subset=['Score'])

    # One array of scores per model.
    groups = [
        df_criterion.loc[df_criterion['Model'] == model, 'Score'].values
        for model in df_criterion['Model'].unique()
    ]

    kruskal_test = kruskal(*groups)
    print(kruskal_test)

    # Effect size: eta squared derived from the H statistic,
    # (H - k + 1) / (n - k) with k groups and n observations.
    n_groups = len(groups)
    n_obs = len(df_criterion)
    if n_obs > n_groups:  # avoid a zero or negative denominator
        eta_squared = (kruskal_test.statistic - n_groups + 1) / (n_obs - n_groups)
        eta_squared_results[criterion] = eta_squared
        print(f"Eta squared: {eta_squared:.3f}")

    # Dunn's test with Bonferroni correction
    dunn_results = sp.posthoc_dunn(df_criterion, val_col='Score', group_col='Model', p_adjust='bonferroni')
    print(dunn_results)


