In [1]:
import json
import pandas as pd
from glicko2 import glicko2

In [2]:
with open("review_gpt-4.jsonl", "r", encoding="utf-8") as f:
    matches = [json.loads(line) for line in f.readlines()]
len(matches)

216

In [3]:
model_id_list = list(
    set(
        [match["model_id_A"] for match in matches]
        + [match["model_id_B"] for match in matches]
    )
)
player_dict = {model_id: glicko2.Player() for model_id in model_id_list}
len(player_dict), player_dict.keys()


(7,
 dict_keys(['AIBunCho/japanese-novel-gpt-j-6b', 'GPT-3.5/ChatGPT-August-3', 'rinna/bilingual-gpt-neox-4b-instruction-ppo', 'supertrin-beta', 'GPT-4/ChatGPT-August-3', 'line-corporation/japanese-large-lm-3.6b-instruction-sft', 'elyza/ELYZA-japanese-Llama-2-7b-fast-instruct']))

In [4]:
for match in matches:
    p1 = player_dict[match["model_id_A"]]
    p2 = player_dict[match["model_id_B"]]

    if match["model_id_A"] == match["winner"]:
        p1.update_player([p2.rating], [p2.rd], [1])
        p2.update_player([p1.rating], [p1.rd], [0])
    else:
        p1.update_player([p2.rating], [p2.rd], [0])
        p2.update_player([p1.rating], [p1.rd], [1])

In [5]:
results = []
for model_id, player in player_dict.items():
    results.append([model_id, player.rating, player.rd])
df = pd.DataFrame(results, columns=["model_id", "rating", "rd"]).sort_values(
    by="rating", ascending=False
)
df


Unnamed: 0,model_id,rating,rd
4,GPT-4/ChatGPT-August-3,1630.72533,93.051524
3,supertrin-beta,1481.931086,80.695233
1,GPT-3.5/ChatGPT-August-3,1418.475101,76.583399
6,elyza/ELYZA-japanese-Llama-2-7b-fast-instruct,1184.214459,72.123804
5,line-corporation/japanese-large-lm-3.6b-instru...,1131.257224,68.418139
0,AIBunCho/japanese-novel-gpt-j-6b,1039.26443,75.823452
2,rinna/bilingual-gpt-neox-4b-instruction-ppo,919.882502,79.702151


In [6]:
df["Rating ± RD"] = df.apply(
    lambda row: f'{row["rating"]:.0f} ± {row["rd"]:.0f}', axis=1
)
df["ERP Score"] = ""


In [7]:
print(df[["model_id", "Rating ± RD", "ERP Score"]].to_markdown(index=False))


| model_id                                                | Rating ± RD   | ERP Score   |
|:--------------------------------------------------------|:--------------|:------------|
| GPT-4/ChatGPT-August-3                                  | 1631 ± 93     |             |
| supertrin-beta                                          | 1482 ± 81     |             |
| GPT-3.5/ChatGPT-August-3                                | 1418 ± 77     |             |
| elyza/ELYZA-japanese-Llama-2-7b-fast-instruct           | 1184 ± 72     |             |
| line-corporation/japanese-large-lm-3.6b-instruction-sft | 1131 ± 68     |             |
| AIBunCho/japanese-novel-gpt-j-6b                        | 1039 ± 76     |             |
| rinna/bilingual-gpt-neox-4b-instruction-ppo             | 920 ± 80      |             |
