# Inter-Annotator Agreement

In [21]:
from collections import defaultdict
import numpy as np
import pandas as pd
import os
import plotly.express as px
from tqdm import tqdm
import json
import glob
# Render floats with two decimals whenever pandas displays a frame or series.
pd.set_option('display.float_format', '{:.2f}'.format)

In [22]:
# Directory holding the annotation CSV files, and the metrics that were annotated.
cache_dir = "results"
metrics_names = ['Completeness', 'Factualness', 'Granularity', 'Topical', 'Uniqueness']

# glob returns matches in arbitrary (filesystem-dependent) order. The cells below
# pair annotations by row position and run order-dependent Elo updates, so the
# file list is sorted to make every run reproducible.
results_files = sorted(
    os.path.basename(f) for f in glob.glob(os.path.join(cache_dir, "*.csv"))
)

# Group the file names by metric: a file belongs to a metric when its name
# starts with the metric name.
results_files_grouped = defaultdict(list)
for f in results_files:
    for m in metrics_names:
        if f.startswith(m):
            results_files_grouped[m].append(f)


In [23]:
# Per-metric annotations of the two annotator groups that are compared below.
human_group_1_dict = {}
human_group_2_dict = {}

for metric_name, files in results_files_grouped.items():
    jc_data, zf_data, pt_data = [], [], []
    for file in files:
        df = pd.read_csv(os.path.join(cache_dir, file))
        if 'Jiacheng' in file:
            # Jiacheng's files are used as-is (no sample_id filtering).
            target, rows = jc_data, df
        else:
            # Keep only the rows whose sample_id column lies between 20 and 69
            # (inclusive). Files that are neither Jiacheng's nor Zifeng's all
            # land in pt_data -- presumably Patrick's; the name is not checked.
            target = zf_data if 'Zifeng' in file else pt_data
            rows = df[df['sample_id'].between(20, 69)]
        target.append(rows)

    # Group 1 is Jiacheng; group 2 is everyone else, with pt_data first and
    # zf_data second.
    human_group_1_dict[metric_name] = pd.concat(jc_data)
    human_group_2_dict[metric_name] = pd.concat(pt_data + zf_data)



In [24]:
def cal_agreement(human_1, human_2, metric_name):
    """Print and return the agreement score between two annotation frames.

    Rows are paired by position: row ``i`` of ``human_1`` is compared with
    row ``i`` of ``human_2`` using their ``win`` column. A pair scores 1 when
    both labels are equal, 0.5 when they differ but at least one of them is
    ``"tie"``, and 0 otherwise. The score is the mean over the rows of
    ``human_1``.

    Args:
        human_1: DataFrame with a ``win`` column; its length sets the number
            of compared pairs.
        human_2: DataFrame with a ``win`` column and at least as many rows as
            ``human_1``; extra rows are ignored.
        metric_name: Label used in the printed message.

    Returns:
        The agreement score as a float, or ``nan`` when ``human_1`` is empty.

    Raises:
        IndexError: If ``human_2`` has fewer rows than ``human_1``.
    """
    total = len(human_1)
    if len(human_2) < total:
        raise IndexError(
            f"human_2 has {len(human_2)} rows but human_1 has {total}; "
            "rows cannot be paired by position"
        )

    if total == 0:
        # Nothing to compare: avoid dividing by zero.
        score = float('nan')
    else:
        # Drop the original indices so the comparison is purely positional
        # (concatenated frames may carry duplicated index labels).
        wins_1 = human_1['win'].reset_index(drop=True)
        wins_2 = human_2['win'].iloc[:total].reset_index(drop=True)

        same = wins_1.eq(wins_2)
        half = ~same & (wins_1.eq("tie") | wins_2.eq("tie"))
        score = float((same.sum() + 0.5 * half.sum()) / total)

    print(f'{metric_name}: human aggreement score is: {score}')
    return score

# Report the agreement between the two annotator groups, one metric at a time.
for metric_name in metrics_names:
    group_1 = human_group_1_dict[metric_name]
    group_2 = human_group_2_dict[metric_name]
    cal_agreement(group_1, group_2, metric_name)

Completeness: human aggreement score is: 0.8816666666666667
Factualness: human aggreement score is: 0.8266666666666667
Granularity: human aggreement score is: 0.9266666666666666
Topical: human aggreement score is: 0.81
Uniqueness: human aggreement score is: 0.93


# Elo Rating

In [52]:
# Samples used from a single annotator only: (name fragment in the file name,
# lowest sample_id, highest sample_id), checked in this order.
single_annotator_ranges = (('Zifeng', 70, 99), ('Patrick', 0, 19))

data_merged_single = {}
for metric_name, files in results_files_grouped.items():
    data_curr = []
    for file in files:
        df = pd.read_csv(os.path.join(cache_dir, file))
        for annotator, low, high in single_annotator_ranges:
            if annotator in file:
                # Keep only this annotator's sample_id range (both ends inclusive).
                data_curr.append(df[df['sample_id'].between(low, high)])
                break
        # Files matching neither name contribute nothing here.

    data_merged_single[metric_name] = pd.concat(data_curr)

In [26]:
# Merge the two annotator groups' labels for the doubly-annotated samples (sample_id 20-69)

def merge_humans(human_1, human_2):
    """Merge two position-aligned annotation frames into one label per row.

    Row ``i`` of ``human_1`` is paired with row ``i`` of ``human_2`` and the
    pair is resolved through the ``win`` column:

    * equal labels        -> the row of ``human_1``;
    * ``human_1`` is tie  -> the row of ``human_2``;
    * ``human_2`` is tie  -> the row of ``human_1``;
    * any other conflict  -> the row of ``human_1`` with ``win`` set to ``"tie"``.

    Args:
        human_1: DataFrame with columns ``sample_id``, ``model_A_name``,
            ``model_B_name`` and ``win``; its length sets the output length.
        human_2: DataFrame with the same columns and at least as many rows as
            ``human_1``; extra rows are ignored.

    Returns:
        A new DataFrame with the four columns above and a fresh 0..n-1 index.

    Raises:
        IndexError: If ``human_2`` has fewer rows than ``human_1``.
    """
    columns = ['sample_id', 'model_A_name', 'model_B_name', 'win']

    if len(human_2) < len(human_1):
        raise IndexError(
            f"human_2 has {len(human_2)} rows but human_1 has {len(human_1)}; "
            "rows cannot be paired by position"
        )

    rows_1 = human_1[columns].itertuples(index=False, name=None)
    rows_2 = human_2[columns].itertuples(index=False, name=None)

    # Collect plain tuples and build the frame once, instead of enlarging a
    # DataFrame row by row (slow, and column dtypes drift during enlargement).
    merged_rows = []
    for row_1, row_2 in zip(rows_1, rows_2):  # zip stops at len(human_1)
        win_1, win_2 = row_1[-1], row_2[-1]
        if win_1 == win_2:
            merged_rows.append(row_1)
        elif win_1 == "tie":
            merged_rows.append(row_2)
        elif win_2 == "tie":
            merged_rows.append(row_1)
        else:
            # The annotators picked opposite winners: record a tie.
            merged_rows.append(row_1[:-1] + ("tie",))

    return pd.DataFrame(merged_rows, columns=columns)

# One merged frame per metric, built from the two annotator groups.
data_merged_twice = {}

for metrics_name in metrics_names:
    group_1 = human_group_1_dict[metrics_name]
    group_2 = human_group_2_dict[metrics_name]
    data_merged_twice[metrics_name] = merge_humans(group_1, group_2)

100%|██████████| 300/300 [00:00<00:00, 1173.74it/s]
100%|██████████| 300/300 [00:00<00:00, 1176.23it/s]
100%|██████████| 300/300 [00:00<00:00, 1185.24it/s]
100%|██████████| 300/300 [00:00<00:00, 1171.00it/s]
100%|██████████| 300/300 [00:00<00:00, 1189.06it/s]


In [28]:
# Per metric, stack the singly-annotated rows on top of the merged
# doubly-annotated rows.
data_merged = {}
for metrics_name in metrics_names:
    single_and_twice = [data_merged_single[metrics_name], data_merged_twice[metrics_name]]
    data_merged[metrics_name] = pd.concat(single_and_twice)

# 

In [30]:
def compute_elo(battles, K, SCALE, BASE, INIT_RATING):
    rating = defaultdict(lambda: INIT_RATING)

    for rd, model_a, model_b, win in battles[['model_A_name', 'model_B_name', 'win']].itertuples():
        ra = rating[model_a]
        rb = rating[model_b]
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))
        if win == "model_A_win":
            sa = 1
        elif win == "model_B_win":
            sa = 0
        elif win == "tie" or win == "tie (bothbad)":
            sa = 0.5
        else:
            raise Exception(f"unexpected vote {win}")
            
        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    return rating

In [85]:
def preety_print_elo_ratings(elo_ratings):
    """Turn a model -> rating mapping into a ranked leaderboard table.

    Args:
        elo_ratings: Mapping from model name to its (float) Elo rating.

    Returns:
        A DataFrame with columns ``Model`` and ``Elo rating``, sorted from the
        highest to the lowest rating, ratings converted to integers after
        adding 0.5, and the index starting at 1 so it reads as a rank.
    """
    rows = list(elo_ratings.items())
    table = pd.DataFrame(rows, columns=["Model", "Elo rating"])
    table = table.sort_values("Elo rating", ascending=False)
    table = table.reset_index(drop=True)

    # Adding 0.5 before the integer cast rounds positive ratings to nearest.
    table["Elo rating"] = (table["Elo rating"] + 0.5).astype(int)
    table.index = table.index + 1
    return table

# Elo hyper-parameters passed to compute_elo: update step size, rating-difference
# divisor, exponent base, and the rating every model starts from.
param_K=16
param_SCALE=600
param_BASE=10
param_INIT_RATING=1000

elo_ratings_dict = {}

# The rating tables are written below; create the target directory up front so
# a fresh checkout does not fail with FileNotFoundError.
os.makedirs('output', exist_ok=True)

for metric_name in metrics_names:
    elo_ratings = compute_elo(data_merged[metric_name], K=param_K, SCALE=param_SCALE, BASE=param_BASE, INIT_RATING=param_INIT_RATING)
    # Build the leaderboard once and reuse it for both printing and saving.
    elo_table = preety_print_elo_ratings(elo_ratings)
    print(metric_name)
    print(elo_table)
    elo_ratings_dict[metric_name] = elo_ratings
    # save to txt
    with open(f'output/{metric_name}_elo_ratings.txt', 'w') as f:
        f.write(elo_table.to_string())

Completeness
         Model  Elo rating
1  Groundtruth        1370
2  GPT-4-Turbo         914
3     Openchat         861
4   LLaMA2-70b         855
Factualness
         Model  Elo rating
1  Groundtruth        1094
2  GPT-4-Turbo        1042
3     Openchat         953
4   LLaMA2-70b         910
Granularity
         Model  Elo rating
1  GPT-4-Turbo        1021
2     Openchat        1006
3   LLaMA2-70b        1001
4  Groundtruth         971
Topical
         Model  Elo rating
1  GPT-4-Turbo        1165
2   LLaMA2-70b        1067
3     Openchat        1059
4  Groundtruth         708
Uniqueness
         Model  Elo rating
1  GPT-4-Turbo        1059
2     Openchat        1013
3   LLaMA2-70b         978
4  Groundtruth         949


In [111]:
# Hard-coded per-model scores, one dict per metric.
# NOTE(review): presumably the automatic-metric scores that the Elo rankings
# are compared against -- the source of these numbers is not shown here.
completeness = {
    "LLaMA2-70b": 0.47,
    "GPT-4-Turbo": 0.545,
    "Openchat": 0.495,
    "Groundtruth": 0.9947916666666666,
}

uniqueness = {
    "LLaMA2-70b": 0.6197593073593073,
    "GPT-4-Turbo": 0.628600592390066,
    "Openchat": 0.5866212121212121,
    "Groundtruth": 0.29,
}

topical = {
    "LLaMA2-70b": 0.49791686969414096,
    "GPT-4-Turbo": 0.5914792349910096,
    "Openchat": 0.5220243533241367,
    "Groundtruth": 0.04284611353612888,
}

granularity = {
    "LLaMA2-70b": 0.7947036840268796,
    "GPT-4-Turbo": 0.7596207862197314,
    "Openchat": 0.7788184104971542,
    "Groundtruth": 0.6870097376678056,
}

factualness = {
    "LLaMA2-70b": 0.9101528138528133,
    "GPT-4-Turbo": 0.9261104465709728,
    "Openchat": 0.9169444444444443,
    "Groundtruth": 0.926,
}

In [112]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Y-axis range of each subplot, listed in the same order as metrics_names.
scales = [[500, 1400], [800, 1200], [950, 1050], [600, 1350], [900, 1100]]

# One row of five subplots, each titled with its metric name.
fig = make_subplots(rows=1, cols=5, subplot_titles=metrics_names)

# Draw one bar chart of Elo ratings per metric.
for i, metric_name in enumerate(metrics_names, start=1):
    fig.update_yaxes(range=scales[i-1], row=1, col=i)

    data = elo_ratings_dict[metric_name]
    model_names, elo_ratings = list(data.keys()), list(data.values())
    plot_data = pd.DataFrame({"Model": model_names, "Elo Rating": elo_ratings})

    bar_trace = go.Bar(x=plot_data['Model'], y=plot_data['Elo Rating'], name=metric_name)
    fig.add_trace(bar_trace, row=1, col=i)

# Fixed-size figure (200 px per subplot); the legend is hidden.
fig.update_layout(height=350, width=200 * 5, showlegend=False)

# Export the figure as a PDF, then render it inline.
fig.write_image("output/elo_ranking_horizontal.pdf")
fig.show()
