# Inter-Annotator Agreement

In [74]:
from collections import defaultdict
import numpy as np
import pandas as pd
import os
import plotly.express as px
from tqdm import tqdm
import json
import glob
# Render floats with two decimal places whenever pandas displays a frame.
pd.options.display.float_format = '{:.2f}'.format

In [75]:
# Directory holding the annotation CSVs, and the metric names that prefix
# their file names.
cache_dir = "results"
metrics_names = ['Completeness', 'Factualness', 'Granularity', 'Topical', 'Uniqueness']

# Keep only the base names, and sort them: glob returns entries in a
# filesystem-dependent order, while later cells concatenate frames (and replay
# Elo battles) in file order, so an unsorted listing is not reproducible.
results_files = sorted(
    os.path.basename(path) for path in glob.glob(os.path.join(cache_dir, "*.csv"))
)

# group them by metrics_name according to the start of the file name
results_files_grouped = defaultdict(list)
for f in results_files:
    for m in metrics_names:
        if f.startswith(m):
            results_files_grouped[m].append(f)


In [76]:
# Per metric: group 1 holds the frames from files whose name contains
# 'Jiacheng'; group 2 holds the rows of every other file restricted to
# sample_id 20..69 (inclusive).
human_group_1_dict = {}
human_group_2_dict = {}

for metric_name, files in results_files_grouped.items():
    jc_data, zf_data, pt_data = [], [], []
    for file in files:
        df = pd.read_csv(os.path.join(cache_dir, file))
        if 'Jiacheng' in file:
            # These files are taken whole, without any sample_id filter.
            jc_data.append(df)
            continue
        # Keep only the rows whose sample_id column lies between 20 and 69.
        shared_rows = df[df['sample_id'].between(20, 69)]
        # Files naming 'Zifeng' go to zf_data; anything else lands in pt_data.
        target = zf_data if 'Zifeng' in file else pt_data
        target.append(shared_rows)

    human_group_1_dict[metric_name] = pd.concat(jc_data)
    # pt frames come first, then zf frames.
    human_group_2_dict[metric_name] = pd.concat(pt_data + zf_data)



In [77]:
# Preview the first annotator group's frame for the 'Topical' metric.
human_group_1_dict['Topical']

Unnamed: 0,sample_id,model_A_name,model_B_name,win
0,20,Groundtruth,LLaMA2-70b,model_A_win
1,20,Groundtruth,Openchat,model_B_win
2,20,Groundtruth,GPT-4-Turbo,model_B_win
3,20,LLaMA2-70b,Openchat,model_B_win
4,20,LLaMA2-70b,GPT-4-Turbo,model_B_win
...,...,...,...,...
295,69,Groundtruth,Openchat,model_A_win
296,69,Groundtruth,GPT-4-Turbo,model_A_win
297,69,LLaMA2-70b,Openchat,tie
298,69,LLaMA2-70b,GPT-4-Turbo,tie


In [78]:
# Preview the second annotator group's frame for the 'Topical' metric.
human_group_2_dict['Topical']

Unnamed: 0,sample_id,model_A_name,model_B_name,win
120,20,Groundtruth,LLaMA2-70b,model_B_win
121,20,Groundtruth,Openchat,model_B_win
122,20,Groundtruth,GPT-4-Turbo,model_B_win
123,20,LLaMA2-70b,Openchat,model_B_win
124,20,LLaMA2-70b,GPT-4-Turbo,model_B_win
...,...,...,...,...
115,69,Groundtruth,Openchat,tie
116,69,Groundtruth,GPT-4-Turbo,tie
117,69,LLaMA2-70b,Openchat,tie
118,69,LLaMA2-70b,GPT-4-Turbo,tie


In [79]:
def cal_agreement(human_1, human_2, metric_name):
    """Print and return the agreement score between two annotation frames.

    Rows are paired by position (row i of ``human_1`` with row i of
    ``human_2``), so both frames must list the same comparisons in the same
    order. A pair scores 1 when both ``win`` values are equal, 0.5 when they
    differ but at least one of them is ``"tie"``, and 0 otherwise. The score
    is the mean over all rows of ``human_1``.

    Args:
        human_1: DataFrame with a ``win`` column.
        human_2: DataFrame with a ``win`` column and at least as many rows as
            ``human_1``; rows beyond ``len(human_1)`` are ignored.
        metric_name: Label used in the printed message.

    Returns:
        The agreement score as a float, or ``nan`` when ``human_1`` is empty.

    Raises:
        IndexError: If ``human_2`` has fewer rows than ``human_1``.
    """
    votes_1 = human_1['win'].tolist()
    votes_2 = human_2['win'].tolist()
    if len(votes_2) < len(votes_1):
        raise IndexError(
            f'{metric_name}: human_2 has {len(votes_2)} rows but human_1 has {len(votes_1)}'
        )

    agree = 0.
    # zip stops at the end of votes_1, the shorter (or equal) list.
    for vote_1, vote_2 in zip(votes_1, votes_2):
        if vote_1 == vote_2:
            agree += 1
        elif vote_1 == "tie" or vote_2 == "tie":
            agree += 0.5

    total = len(votes_1)
    # Avoid a ZeroDivisionError when there is nothing to compare.
    score = agree / total if total else float('nan')

    print(f'{metric_name}: human aggreement score is: {score}')
    return score

# Report the agreement between the two annotator groups for every metric.
for metric_name in metrics_names:
    group_1 = human_group_1_dict[metric_name]
    group_2 = human_group_2_dict[metric_name]
    cal_agreement(group_1, group_2, metric_name)

Completeness: human aggreement score is: 0.8816666666666667
Factualness: human aggreement score is: 0.8266666666666667
Granularity: human aggreement score is: 0.9266666666666666
Topical: human aggreement score is: 0.81
Uniqueness: human aggreement score is: 0.93


# Elo Rating

In [80]:
# Rows used from a single annotator only: substring of the file name mapped to
# the inclusive sample_id range taken from that file.
single_annotator_ranges = {'Zifeng': (70, 99), 'Patrick': (0, 19)}

data_merged_single = {}
for metric_name, files in results_files_grouped.items():
    data_curr = []
    for file in files:
        df = pd.read_csv(os.path.join(cache_dir, file))
        # The first matching name wins (checked in the order Zifeng, Patrick);
        # files matching neither are read but contribute no rows.
        for annotator, (first_id, last_id) in single_annotator_ranges.items():
            if annotator in file:
                data_curr.append(df[df['sample_id'].between(first_id, last_id)])
                break

    data_merged_single[metric_name] = pd.concat(data_curr)

data_merged_single['Topical']

Unnamed: 0,sample_id,model_A_name,model_B_name,win
0,0,Groundtruth,LLaMA2-70b,model_B_win
1,0,Groundtruth,Openchat,model_B_win
2,0,Groundtruth,GPT-4-Turbo,model_B_win
3,0,LLaMA2-70b,Openchat,model_A_win
4,0,LLaMA2-70b,GPT-4-Turbo,tie
...,...,...,...,...
295,99,Groundtruth,Openchat,model_B_win
296,99,Groundtruth,GPT-4-Turbo,model_B_win
297,99,LLaMA2-70b,Openchat,model_A_win
298,99,LLaMA2-70b,GPT-4-Turbo,model_B_win


In [81]:
# Merge the two annotator groups into a single vote per comparison.

def merge_humans(human_1, human_2):
    """Combine two positionally aligned annotation frames into one.

    Row i of ``human_1`` is paired with row i of ``human_2``. For each pair:

    * equal votes          -> the row of ``human_1`` is kept;
    * ``human_1`` is "tie" -> the row of ``human_2`` is kept;
    * ``human_2`` is "tie" -> the row of ``human_1`` is kept;
    * any other mismatch   -> the row of ``human_1`` with ``win`` set to "tie".

    Args:
        human_1: DataFrame with columns ``sample_id``, ``model_A_name``,
            ``model_B_name`` and ``win``.
        human_2: DataFrame with the same columns and at least as many rows as
            ``human_1``; rows beyond ``len(human_1)`` are ignored.

    Returns:
        A new DataFrame with those four columns, one row per row of
        ``human_1``, indexed 0..n-1.

    Raises:
        IndexError: If ``human_2`` has fewer rows than ``human_1``.
    """
    columns = ['sample_id', 'model_A_name', 'model_B_name', 'win']
    if len(human_2) < len(human_1):
        raise IndexError(
            f'human_2 has {len(human_2)} rows but human_1 has {len(human_1)}'
        )

    # Extract plain records once instead of doing a .iloc lookup per field.
    rows_1 = human_1[columns].to_dict('records')
    rows_2 = human_2[columns].to_dict('records')

    # Collect rows in a list and build the frame once; appending to a
    # DataFrame row by row re-allocates it on every iteration.
    merged_rows = []
    for i in tqdm(range(len(rows_1))):
        row_1, row_2 = rows_1[i], rows_2[i]
        if row_1['win'] == row_2['win']:
            merged_rows.append(row_1)
        elif row_1['win'] == "tie":
            merged_rows.append(row_2)
        elif row_2['win'] == "tie":
            merged_rows.append(row_1)
        else:
            # Two different decisive votes cancel out.
            merged_rows.append({**row_1, 'win': "tie"})

    return pd.DataFrame(merged_rows, columns=columns)

# One merged frame per metric, built from the two annotator groups.
data_merged_twice = {
    metrics_name: merge_humans(human_group_1_dict[metrics_name], human_group_2_dict[metrics_name])
    for metrics_name in metrics_names
}

100%|██████████| 300/300 [00:00<00:00, 1203.95it/s]
100%|██████████| 300/300 [00:00<00:00, 1210.02it/s]
100%|██████████| 300/300 [00:00<00:00, 1221.28it/s]
100%|██████████| 300/300 [00:00<00:00, 1202.41it/s]
100%|██████████| 300/300 [00:00<00:00, 1194.86it/s]


In [82]:
# Preview the merged two-annotator frame for the 'Topical' metric.
data_merged_twice['Topical']

Unnamed: 0,sample_id,model_A_name,model_B_name,win
0,20,Groundtruth,LLaMA2-70b,tie
1,20,Groundtruth,Openchat,model_B_win
2,20,Groundtruth,GPT-4-Turbo,model_B_win
3,20,LLaMA2-70b,Openchat,model_B_win
4,20,LLaMA2-70b,GPT-4-Turbo,model_B_win
...,...,...,...,...
295,69,Groundtruth,Openchat,model_A_win
296,69,Groundtruth,GPT-4-Turbo,model_A_win
297,69,LLaMA2-70b,Openchat,tie
298,69,LLaMA2-70b,GPT-4-Turbo,tie


In [83]:
# Per metric, stack the single-annotator rows on top of the merged
# two-annotator rows (original index labels are kept).
data_merged = {
    metrics_name: pd.concat([data_merged_single[metrics_name], data_merged_twice[metrics_name]])
    for metrics_name in metrics_names
}

In [84]:
# Preview the combined frame for the 'Completeness' metric.
data_merged['Completeness']

Unnamed: 0,sample_id,model_A_name,model_B_name,win
0,0,Groundtruth,LLaMA2-70b,model_A_win
1,0,Groundtruth,Openchat,model_A_win
2,0,Groundtruth,GPT-4-Turbo,model_A_win
3,0,LLaMA2-70b,Openchat,model_A_win
4,0,LLaMA2-70b,GPT-4-Turbo,tie
...,...,...,...,...
295,69,Groundtruth,Openchat,model_A_win
296,69,Groundtruth,GPT-4-Turbo,model_A_win
297,69,LLaMA2-70b,Openchat,tie
298,69,LLaMA2-70b,GPT-4-Turbo,model_B_win


## Elo computation and ranking plots

In [85]:
def compute_elo(battles, K, SCALE, BASE, INIT_RATING):
    """Replay pairwise battles in row order and return the resulting ratings.

    Args:
        battles: DataFrame with columns ``model_A_name``, ``model_B_name``
            and ``win``.
        K: Step size applied to each rating update.
        SCALE: Divisor of the rating difference in the expected-score formula.
        BASE: Base of the exponent in the expected-score formula.
        INIT_RATING: Rating given to a model the first time it is looked up.

    Returns:
        A ``defaultdict`` mapping model name to rating; unknown names yield
        ``INIT_RATING``.

    Raises:
        Exception: If a ``win`` value is not one of "model_A_win",
            "model_B_win", "tie" or "tie (bothbad)".
    """
    # Score credited to model A for every accepted vote.
    score_of_a = {
        "model_A_win": 1,
        "model_B_win": 0,
        "tie": 0.5,
        "tie (bothbad)": 0.5,
    }
    rating = defaultdict(lambda: INIT_RATING)
    votes = battles[['model_A_name', 'model_B_name', 'win']]

    for row in votes.itertuples():
        # Position 0 of each tuple is the frame index, which is not needed.
        _, model_a, model_b, outcome = row
        ra = rating[model_a]
        rb = rating[model_b]

        # Expected scores of A and B given the current rating gap.
        ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))
        eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))

        if outcome not in score_of_a:
            raise Exception(f"unexpected vote {outcome}")
        sa = score_of_a[outcome]

        rating[model_a] += K * (sa - ea)
        rating[model_b] += K * (1 - sa - eb)

    return rating

In [86]:
def preety_print_elo_ratings(elo_ratings):
    """Return a ranking table for a mapping of model name to Elo rating.

    Args:
        elo_ratings: Mapping of model name -> numeric rating.

    Returns:
        DataFrame with columns ``Model`` and ``Elo rating``, sorted by rating
        in descending order, with ratings rounded half up to integers and a
        1-based index serving as the rank.
    """
    df = pd.DataFrame([
        [n, elo_ratings[n]] for n in elo_ratings.keys()
    ], columns=["Model", "Elo rating"]).sort_values("Elo rating", ascending=False).reset_index(drop=True)
    # floor(x + 0.5) rounds half up for every sign; a bare astype(int) on
    # x + 0.5 truncates toward zero and mis-rounds negative ratings.
    shifted = (df["Elo rating"] + 0.5).astype(float)
    df["Elo rating"] = np.floor(shifted).astype(int)
    # Ranks start at 1 rather than 0.
    df.index = df.index + 1
    return df

# Elo hyper-parameters forwarded to compute_elo.
param_K=16
param_SCALE=400
param_BASE=10
param_INIT_RATING=1000

elo_ratings_dict = {}

# The rating tables are written below; create the folder so the cell also
# runs on a fresh checkout.
os.makedirs('output', exist_ok=True)

for metric_name in metrics_names:
    elo_ratings = compute_elo(data_merged[metric_name], K=param_K, SCALE=param_SCALE, BASE=param_BASE, INIT_RATING=param_INIT_RATING)
    # Build the ranking table once and reuse it for printing and saving.
    ratings_table = preety_print_elo_ratings(elo_ratings)
    print(metric_name)
    print(ratings_table)
    elo_ratings_dict[metric_name] = elo_ratings
    # save to txt
    with open(f'output/{metric_name}_elo_ratings.txt', 'w') as f:
        f.write(ratings_table.to_string())

Completeness
         Model  Elo rating
1  Groundtruth        1260
2  GPT-4-Turbo         948
3     Openchat         900
4   LLaMA2-70b         891
Factualness
         Model  Elo rating
1  Groundtruth        1080
2  GPT-4-Turbo        1029
3     Openchat         970
4   LLaMA2-70b         921
Granularity
         Model  Elo rating
1  GPT-4-Turbo        1014
2     Openchat        1005
3   LLaMA2-70b        1001
4  Groundtruth         979
Topical
         Model  Elo rating
1  GPT-4-Turbo        1109
2   LLaMA2-70b        1041
3     Openchat        1037
4  Groundtruth         813
Uniqueness
         Model  Elo rating
1  GPT-4-Turbo        1035
2     Openchat        1007
3   LLaMA2-70b         986
4  Groundtruth         972


In [87]:
# NOTE(review): pandas, plotly.express and the float display option are
# already set up in the first cell; `pio` is not referenced anywhere in the
# visible code.
import pandas as pd
import plotly.io as pio
import plotly.express as px
pd.options.display.float_format = '{:.2f}'.format



# data = {
#     "Vicuna-7B": 1222,
#     "ChatGLM": 1172,
#     "Moss": 1162,
#     "StableLM-Tuned-Alpha": 1124,
#     "Alpaca-7B": 1094,
#     "Open-Assistant": 1087,
#     "Alpaca-lora": 998,
#     "Dolly-v2-7B": 972,
#     "MPT": 960,
#     "Galatica": 916,
#     "RWKV (Pile)-7B": 903,
#     "BELLE": 900,
#     "PandaLM": 898,
#     "RedPajama-7B_base": 889,
#     "h2oGPT-6.9B": 873,
#     "RedPajama-7B_instruct": 831
# }

def plot_elo_bar_plot(data, metric_name, baseline=1000):
    """Draw, save and show a bar chart of Elo ratings for one metric.

    Args:
        data: Mapping of model name -> Elo rating; one bar per entry, in the
            mapping's iteration order.
        metric_name: Used in the output file name
            ``output/elo_ranking_{metric_name}.pdf``.
        baseline: y position of the dark-red horizontal reference line.
            Defaults to 1000, the value that used to be hard-coded.

    Raises:
        ValueError: If ``data`` is empty.
    """
    model_names = list(data.keys())
    elo_ratings = list(data.values())
    if not elo_ratings:
        # min()/max() below would otherwise fail with a less helpful message.
        raise ValueError(f"no Elo ratings to plot for {metric_name}")

    plot_data = pd.DataFrame({"Model": model_names, "Elo Rating": elo_ratings})

    font_size = 12

    fig = px.bar(plot_data, x='Model', y='Elo Rating', color='Elo Rating', color_continuous_scale='Tealgrn')
    # Print each bar's value above it.
    fig.update_traces(texttemplate='%{y}', textposition='outside', textfont=dict(size=font_size))
    fig.update_layout(xaxis_title=None, height=500, width=400, showlegend=False,
                    yaxis=dict(
                        # NOTE(review): y is numeric here, so this category
                        # ordering looks like it has no effect - confirm.
                        categoryorder='total ascending',
                        # Restrict the visible y range to the data plus some
                        # headroom for the value labels.
                        range=[min(elo_ratings) - 20, max(elo_ratings) + 80]
                    ))

    # Default tick positions on the x axis: 0, 1, 2, ... one per model.
    unique_models = plot_data['Model'].unique()
    tickvals = list(range(len(unique_models)))

    # Shift every tick 0.2 units to the left.
    new_tickvals = [tick - 0.2 for tick in tickvals]

    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=new_tickvals,
            ticktext=unique_models,           # keep the original tick labels
            title_font=dict(size=16),         # x-axis title font size
            tickfont=dict(size=font_size)     # x-axis tick font size
        ),
        yaxis=dict(
            title_font=dict(size=16),         # y-axis title font size
            showticklabels=False,             # hide the y-axis tick labels
        ),
        coloraxis_showscale=False             # hide the colour bar on the side
    )
    # Reference line across the full plot width. Paper coordinates on x make
    # it independent of the number of models (previously fixed to x = -1..4).
    fig.add_shape(
            type="line",
            xref="paper",
            x0=0,
            y0=baseline,
            x1=1,
            y1=baseline,
            line=dict(
                color="darkred",
            )
    )

    # Make sure the target folder exists before exporting the figure.
    os.makedirs("output", exist_ok=True)
    fig.write_image(f"output/elo_ranking_{metric_name}.pdf")

    fig.show()

# Convert every rating to an integer for plotting. Round half up, the same
# rule the saved rating tables use, instead of truncating with int(v), so the
# bar labels cannot differ by one from the saved tables.
for metric_name in metrics_names:
    elo_ratings_dict[metric_name] = {k: int(np.floor(v + 0.5)) for k, v in elo_ratings_dict[metric_name].items()}

for metric_name in metrics_names:
    print(metric_name)
    plot_elo_bar_plot(elo_ratings_dict[metric_name], metric_name)

Completeness


Factualness


Granularity


Topical


Uniqueness
