In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
import os
import plotly.express as px
from tqdm import tqdm
import json
pd.options.display.float_format = '{:.2f}'.format

In [2]:
def _load_votes(paths):
    """Collect one canonical battle row per answered question from annotation files.

    Each file is read as JSON holding two parallel lists, ``que_idx`` and
    ``que_rec``. For every entry ``idx[1]`` and ``idx[2]`` are the two compared
    model ids and ``rec`` is the verdict code: ``1`` prefers the model at
    ``idx[1]``, ``-1`` prefers the model at ``idx[2]``, ``-2`` is a tie.
    Entries with any other code (including ``0``) produce no row.

    Rows are canonicalised so that ``model_a`` holds the smaller id; the ``win``
    label is flipped accordingly when the pair had to be swapped.

    Args:
        paths: iterable of JSON file paths, processed in the given order.

    Returns:
        DataFrame with columns ``model_a``, ``model_b``, ``win``.
    """
    # verdict code -> (label when idx[1] < idx[2], label when the pair is swapped)
    labels = {1: ("model_a", "model_b"), -1: ("model_b", "model_a"), -2: ("tie", "tie")}
    rows = []
    for path in paths:
        # `with` guarantees the handle is closed; the bare open() used before leaked it.
        with open(path) as fh:
            meta_data = json.load(fh)
        for idx, rec in zip(meta_data["que_idx"], meta_data["que_rec"]):
            # NOTE(review): rec == 0 presumably marks an unanswered question -- confirm.
            if rec not in labels:
                continue
            in_order, swapped = labels[rec]
            if idx[1] < idx[2]:
                rows.append([idx[1], idx[2], in_order])
            else:
                rows.append([idx[2], idx[1], swapped])
    # Build the frame once instead of enlarging it row by row (which is quadratic).
    return pd.DataFrame(rows, columns=["model_a", "model_b", "win"])


cache_dir = "record"

# Two annotator groups; downstream cells pair their rows by position.
# NOTE(review): skipped entries are dropped per file, so the two tables stay aligned
# only if both groups answered the same questions in the same order -- confirm.
human_1 = _load_votes(os.path.join(cache_dir, name) for name in ["v-gaokaiyuan.json", "v-shaojie.json"])
human_2 = _load_votes(os.path.join(cache_dir, name) for name in ['v-jiachlin.json', 'v-sunanhe.json'])



In [3]:
# Show the last rows of both annotator tables to eyeball that they line up.
for annotator_votes in (human_1, human_2):
    print(annotator_votes.tail())

      model_a  model_b      win
2395        5       14  model_a
2396        3        5  model_b
2397       12       14      tie
2398        6       11  model_a
2399        7        8  model_b
      model_a  model_b      win
2395        5       14  model_a
2396        3        5  model_b
2397       12       14      tie
2398        6       11  model_a
2399        7        8      tie


## Human consistency

In this measurement, a full point is awarded when both annotators concur, half a point is given if only one annotator denotes a tie, and no points are awarded in other situations.

In [4]:
# Rows of human_1 and human_2 are compared by position, so the tables must be
# equally long; fail loudly instead of raising IndexError or silently truncating.
if len(human_1) != len(human_2):
    raise ValueError(
        f"annotator tables differ in length: {len(human_1)} vs {len(human_2)}"
    )
if len(human_1) == 0:
    raise ValueError("no annotated pairs loaded; cannot compute an agreement score")

votes_1 = human_1["win"].to_numpy(dtype=object)
votes_2 = human_2["win"].to_numpy(dtype=object)

# Full point: identical verdicts. Half point: verdicts differ and one of them is a tie.
same_vote = votes_1 == votes_2
one_sided_tie = ~same_vote & ((votes_1 == "tie") | (votes_2 == "tie"))

agree = float(same_vote.sum() + 0.5 * one_sided_tie.sum())
total = float(len(human_1))

print(f'human aggreement score is: {agree/total}')

  0%|          | 0/2400 [00:00<?, ?it/s]

100%|██████████| 2400/2400 [00:00<00:00, 2888.93it/s]

human aggreement score is: 0.8002083333333333





## Elo ranking

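1. Merge the 2400 doubly-annotated pairs (each reconciled to a single vote) with the 3600 singly-annotated pairs, giving 6000 battles.
2. Compute Elo ratings over the merged battles.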

In [5]:
# Rows are collected in a list and turned into a DataFrame once at the end;
# enlarging a frame with .loc[len(df)] per row is quadratic.
merged_rows = []

# 2400: reconcile the two annotator tables row by row (paired by position).
if len(human_1) != len(human_2):
    raise ValueError(
        f"annotator tables differ in length: {len(human_1)} vs {len(human_2)}"
    )
paired_votes = zip(human_1.itertuples(index=False), human_2.itertuples(index=False))
for vote_1, vote_2 in tqdm(paired_votes, total=len(human_1)):
    if vote_1.win == vote_2.win:
        # Both agree: keep that verdict.
        merged_rows.append([vote_1.model_a, vote_1.model_b, vote_1.win])
    elif vote_1.win == "tie":
        # Only the first annotator is undecided: trust the second.
        merged_rows.append([vote_2.model_a, vote_2.model_b, vote_2.win])
    elif vote_2.win == "tie":
        # Only the second annotator is undecided: trust the first.
        merged_rows.append([vote_1.model_a, vote_1.model_b, vote_1.win])
    else:
        # Opposite winners: record the pair as a tie.
        merged_rows.append([vote_1.model_a, vote_1.model_b, "tie"])

# 3600: add every record file that holds exactly 400 questions.
# NOTE(review): the 400-question size presumably identifies singly-annotated files -- confirm.
cache_dir = "record"
# verdict code -> (label when idx[1] < idx[2], label when the pair is swapped)
vote_labels = {1: ("model_a", "model_b"), -1: ("model_b", "model_a"), -2: ("tie", "tie")}
# os.listdir returns names in arbitrary order and Elo depends on battle order,
# so sort the listing to make the merged table (and the ratings) reproducible.
for file in tqdm(sorted(os.listdir(cache_dir))):
    if not file.endswith(".json"):
        continue
    with open(os.path.join(cache_dir, file)) as fh:
        meta_data = json.load(fh)
    if len(meta_data["que_idx"]) != 400:
        continue
    for idx, rec in zip(meta_data["que_idx"], meta_data["que_rec"]):
        if rec not in vote_labels:
            continue
        in_order, swapped = vote_labels[rec]
        # Canonical form: the smaller model id goes into model_a.
        if idx[1] < idx[2]:
            merged_rows.append([idx[1], idx[2], in_order])
        else:
            merged_rows.append([idx[2], idx[1], swapped])

raw_data_new = pd.DataFrame(merged_rows, columns=["model_a", "model_b", "win"])

print("len of data: ", len(raw_data_new))
print(raw_data_new.head())

  1%|▏         | 33/2400 [00:00<00:07, 320.56it/s]

100%|██████████| 2400/2400 [00:05<00:00, 447.66it/s]
100%|██████████| 13/13 [00:04<00:00,  2.76it/s]

len of data:  6000
   model_a  model_b      win
0        6       10  model_b
1        7       12  model_a
2        3        7  model_a
3        1       10  model_b
4        1        8  model_a





In [6]:

def compute_elo(battles, K, SCALE, BASE, INIT_RATING):
    """Replay ``battles`` in row order and return the resulting Elo ratings.

    Args:
        battles: DataFrame with columns ``model_a``, ``model_b`` and ``win``,
            where ``win`` is one of ``"model_a"``, ``"model_b"``, ``"tie"`` or
            ``"tie (bothbad)"``.
        K: step size applied to each rating update.
        SCALE: divisor of the rating difference in the expected-score formula.
        BASE: base of the exponent in the expected-score formula.
        INIT_RATING: rating given to a model the first time it is seen.

    Returns:
        A ``defaultdict`` mapping model -> rating; unseen models read as
        ``INIT_RATING``.

    Raises:
        Exception: if a row carries an unrecognised ``win`` label.
    """
    # Score credited to model_a for every recognised outcome.
    score_for_a = {"model_a": 1, "model_b": 0, "tie": 0.5, "tie (bothbad)": 0.5}
    rating = defaultdict(lambda: INIT_RATING)

    for first, second, outcome in battles[['model_a', 'model_b', 'win']].itertuples(index=False):
        r_first = rating[first]
        r_second = rating[second]
        # Expected scores are both taken from the ratings before this battle.
        expected_first = 1 / (1 + BASE ** ((r_second - r_first) / SCALE))
        expected_second = 1 / (1 + BASE ** ((r_first - r_second) / SCALE))
        if outcome not in score_for_a:
            raise Exception(f"unexpected vote {outcome}")
        actual_first = score_for_a[outcome]
        rating[first] += K * (actual_first - expected_first)
        rating[second] += K * (1 - actual_first - expected_second)

    return rating

In [7]:
# Model display names are taken from the answer sheet header, skipping the first
# three columns. The position in this list is used as the model id.
# NOTE(review): the first three columns are presumably question metadata -- confirm.
model_answer_df = pd.read_csv('answer16models.csv')
id_to_model_name = list(model_answer_df.columns)[3:]

# Same mapping as a dict (id -> name), used to relabel DataFrame axes.
id_to_model_name_dict = dict(enumerate(id_to_model_name))

In [29]:
# Display the merged battle table (model_a, model_b, win) that feeds the Elo computation.
raw_data_new

Unnamed: 0,model_a,model_b,win
0,6,10,model_b
1,7,12,model_a
2,3,7,model_a
3,1,10,model_b
4,1,8,model_a
...,...,...,...
5995,6,13,tie
5996,1,2,tie
5997,5,15,model_a
5998,7,8,model_a


In [31]:
def preety_print_elo_ratings(elo_ratings):
    """Return a leaderboard DataFrame built from a model-id -> rating mapping.

    Model ids are translated to names through the module-level
    ``id_to_model_name`` list. Rows are sorted by rating (highest first),
    ratings are converted to integers after adding 0.5, and the index starts
    at 1 so it reads as a rank.
    """
    rows = [
        [id_to_model_name[model_id], score]
        for model_id, score in elo_ratings.items()
    ]
    table = pd.DataFrame(rows, columns=["Model", "Elo rating"])
    table = table.sort_values("Elo rating", ascending=False).reset_index(drop=True)
    # Adding 0.5 before the integer cast rounds positive ratings to the nearest integer.
    table["Elo rating"] = (table["Elo rating"] + 0.5).astype(int)
    table.index = table.index + 1
    return table

# Elo hyper-parameters; param_BASE and param_SCALE are reused later for the
# predicted pairwise win rates.
param_K = 16
param_SCALE = 400
param_BASE = 10
param_INIT_RATING = 1000

elo_ratings = compute_elo(
    raw_data_new,
    K=param_K,
    SCALE=param_SCALE,
    BASE=param_BASE,
    INIT_RATING=param_INIT_RATING,
)
preety_print_elo_ratings(elo_ratings)

Unnamed: 0,Model,Elo rating
1,vicuna-7b,1247
2,moss,1224
3,chatGLM,1184
4,stablelm-tuned-alpha,1165
5,openassistant,1103
6,alpaca-7b,1079
7,alpaca-lora,1025
8,mpt,1005
9,belle,928
10,dolly-v2-7b,924


## Pairwise win-rate

In [38]:
import plotly.io as pio

# Elo-predicted probability that model `a` beats model `b`, for every ordered pair.
names = sorted(list(elo_ratings.keys()))
wins = defaultdict(lambda: defaultdict(lambda: 0))
for a in names:
    for b in names:
        # Each ordered pair is computed directly and written exactly once.
        wins[a][b] = 1 / (1 + param_BASE ** ((elo_ratings[b] - elo_ratings[a]) / param_SCALE))

# Dict keys become COLUMNS, so df.loc[b, a] == P(a beats b); the diagonal is masked.
# np.nan is used because the np.NAN alias no longer exists in NumPy 2.x.
data = {
    a: [wins[a][b] if a != b else np.nan for b in names]
    for a in names
}

df = pd.DataFrame(data, index=names)
# NOTE(review): given the orientation above, rows are the opponent and columns the
# winner, so these axis names look swapped. Left as-is because the plotting cell
# transposes the table and sets the axis titles explicitly.
df.index.name = "model_a"
df.columns.name = "model_b"

# Row mean = average probability of being beaten; ascending order therefore
# lists the strongest model first.
prop_wins = df.mean(axis=1).sort_values(ascending=True)
model_names = list(prop_wins.keys())
win_rate_pivot_table = df.loc[model_names, model_names]

# Replace numeric model ids with display names on both axes.
win_rate_pivot_table = win_rate_pivot_table.rename(columns=id_to_model_name_dict, index=id_to_model_name_dict)

In [39]:
# Remember the model ordering derived from the Elo-predicted win rates so the
# empirical win-rate table can be laid out in the same order.
real_order_model_names = model_names

In [40]:
# Heatmap of the Elo-predicted win rates. The table is transposed before plotting.
pred_layout = dict(
    xaxis_title="Model B",
    yaxis_title="Model A",
    xaxis_side="top",
    height=600,
    width=600,
    title_y=0.07,
    title_x=0.5,
)

fig = px.imshow(
    win_rate_pivot_table.T,
    color_continuous_scale='spectral',
    text_auto=".2f",
    title=None,
)
fig.update_layout(**pred_layout)
fig.update_xaxes(tickangle=45)

# Save a PDF copy, then leave the figure as the cell's displayed value.
pio.write_image(fig, 'win-rate-pred.pdf')
fig

In [35]:
battles_no_ties = raw_data_new[~raw_data_new["win"].str.contains("tie")]

# Every model id that occurs in any battle. All count tables are reindexed onto
# this full id x id grid, so a model missing from one filtered subset becomes a
# row/column of zeros instead of NaN from index alignment. This replaces the
# hard-coded "insert column 0 / append row 15" padding.
all_model_ids = sorted(set(raw_data_new["model_a"]) | set(raw_data_new["model_b"]))


def _pair_counts(battles):
    """Count battles per (model_a, model_b) pair as a square table over ``all_model_ids``."""
    counts = battles.pivot_table(values='win', index='model_a', columns='model_b', aggfunc='count', fill_value=0)
    return counts.reindex(index=all_model_ids, columns=all_model_ids, fill_value=0)


total_count_pivot_table = _pair_counts(battles_no_ties)

win_battles = battles_no_ties[battles_no_ties['win'] == "model_a"]
win_count_pivot_table = _pair_counts(win_battles)

lose_battles = battles_no_ties[battles_no_ties['win'] == "model_b"]
lose_count_pivot_table = _pair_counts(lose_battles)

# calculate win-rate
# Rows are the winner: wins recorded as model_a plus (transposed) wins recorded
# as model_b, divided by all non-tied battles between the two models.
row_beats_col_freq = (
    (win_count_pivot_table + lose_count_pivot_table.T) /
    (total_count_pivot_table + total_count_pivot_table.T)
)
# Arrange ordering according to proportion of wins
prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
model_names = list(prop_wins.keys())
# Lay the table out in the model order taken from the predicted win rates.
win_rate_pivot_table = row_beats_col_freq.loc[real_order_model_names, real_order_model_names]

# Replace numeric model ids with display names on both axes.
win_rate_pivot_table = win_rate_pivot_table.rename(columns=id_to_model_name_dict, index=id_to_model_name_dict)

In [36]:


# Heatmap of the empirical win rates: fraction of Model A wins over all
# non-tied A vs. B battles.
real_layout = dict(
    xaxis_title="Model B",
    yaxis_title="Model A",
    xaxis_side="top",
    height=600,
    width=600,
    title_y=0.07,
    title_x=0.5,
)

fig = px.imshow(
    win_rate_pivot_table,
    color_continuous_scale='spectral',
    text_auto=".2f",
    title=None,
)
fig.update_layout(**real_layout)
fig.update_xaxes(tickangle=45)

# Save a PDF copy, then leave the figure as the cell's displayed value.
pio.write_image(fig, 'win-rate-real.pdf')
fig

## Elo ranking (bar chart)

In [None]:
import pandas as pd
import plotly.io as pio
import plotly.express as px
pd.options.display.float_format = '{:.2f}'.format

# Elo ratings to plot, hard-coded per model and listed from highest to lowest.
data = {
    "Vicuna-7B": 1222,
    "ChatGLM": 1172,
    "Moss": 1162,
    "StableLM-Tuned-Alpha": 1124,
    "Alpaca-7B": 1094,
    "Open-Assistant": 1087,
    "Alpaca-lora": 998,
    "Dolly-v2-7B": 972,
    "MPT": 960,
    "Galatica": 916,
    "RWKV (Pile)-7B": 903,
    "BELLE": 900,
    "PandaLM": 898,
    "RedPajama-7B_base": 889,
    "h2oGPT-6.9B": 873,
    "RedPajama-7B_instruct": 831
}

model_names = list(data.keys())
elo_ratings = list(data.values())
plot_data = pd.DataFrame({"Model": model_names, "Elo Rating": elo_ratings})

# One bar per model, coloured by rating, with the rating printed above the bar.
fig = px.bar(
    plot_data,
    x='Model',
    y='Elo Rating',
    color='Elo Rating',
    color_continuous_scale='Tealgrn',
)
fig.update_traces(texttemplate='%{y}', textposition='outside', textfont=dict(size=24))
fig.update_layout(
    xaxis_title=None,
    height=600,
    width=1800,
    showlegend=False,
    yaxis=dict(
        # Intended to order items by score, ascending.
        # NOTE(review): this is set on the numeric y axis, so it may have no visible effect -- confirm.
        categoryorder='total ascending',
        # Adjust these bounds to limit the visible y-axis range.
        range=[730, 1270],
    ),
)

# Default x tick positions: one integer slot per distinct model.
tickvals = list(range(len(plot_data['Model'].unique())))

# Shifted tick positions (each tick moved 0.2 units to the left).
new_tickvals = [tick - 0.2 for tick in tickvals]

fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=new_tickvals,
        ticktext=plot_data['Model'].unique(),  # keep the original tick labels
        title_font=dict(size=16),              # x-axis title font size
        tickfont=dict(size=24),                # x-axis tick label font size
    ),
    yaxis=dict(
        title_font=dict(size=32),              # y-axis title font size
        showticklabels=False,                  # hide the y-axis tick labels
    ),
    coloraxis_showscale=False,                 # hide the colour bar at the side
)

# Dark-red reference line at rating 1000, drawn as two segments with a gap
# between x = 6.7 and x = 7.3.
for x_start, x_end in ((-1, 6.7), (7.3, 16)):
    fig.add_shape(
        type="line",
        x0=x_start,
        y0=1000,
        x1=x_end,
        y1=1000,
        line=dict(
            color="darkred",
        )
    )

fig.write_image("elo_ranking.pdf")

fig.show()
