In [1]:
import json
import pandas as pd
from glicko2 import glicko2

In [2]:
with open("review_gpt-4.jsonl", "r", encoding="utf-8") as f:
    matches = [json.loads(line) for line in f.readlines()]

In [3]:
model_id_list = list(
    set(
        [match["model_id_A"] for match in matches]
        + [match["model_id_B"] for match in matches]
    )
)
player_dict = {model_id: glicko2.Player() for model_id in model_id_list}

In [4]:
for match in matches:
    p1 = player_dict[match["model_id_A"]]
    p2 = player_dict[match["model_id_B"]]

    if match["model_id_A"] == match["winner"]:
        p1.update_player([p2.rating], [p2.rd], [1])
        p2.update_player([p1.rating], [p1.rd], [0])
    else:
        p1.update_player([p2.rating], [p2.rd], [0])
        p2.update_player([p1.rating], [p1.rd], [1])

In [5]:
results = []
for model_id, player in player_dict.items():
    results.append([model_id, player.rating, player.rd])
df = pd.DataFrame(results, columns=["model_id", "rating", "rd"]).sort_values(
    by="rating", ascending=False
)

In [6]:
df["rating ± rd"] = df.apply(
    lambda row: f'{row["rating"]:.0f} ± {row["rd"]:.0f}', axis=1
)

In [7]:
print(df[["model_id", "rating ± rd"]].to_markdown(index=False))


| model_id                                                | rating ± rd   |
|:--------------------------------------------------------|:--------------|
| GPT-4/ChatGPT-August-3                                  | 1660 ± 92     |
| supertrin-beta                                          | 1448 ± 81     |
| GPT-3.5/ChatGPT-August-3                                | 1378 ± 78     |
| elyza/ELYZA-japanese-Llama-2-7b-fast-instruct           | 1210 ± 79     |
| line-corporation/japanese-large-lm-3.6b-instruction-sft | 1150 ± 72     |
| AIBunCho/japanese-novel-gpt-j-6b                        | 1015 ± 85     |
| rinna/bilingual-gpt-neox-4b-instruction-ppo             | 942 ± 84      |
