In [1]:
import json
import pandas as pd
from glicko2 import glicko2

In [2]:
with open("review_gpt-4.jsonl", "r", encoding="utf-8") as f:
    matches = [json.loads(line) for line in f.readlines()]
len(matches)

456

In [3]:
model_id_list = list(
    set(
        [match["model_id_A"] for match in matches]
        + [match["model_id_B"] for match in matches]
    )
)
player_dict = {model_id: glicko2.Player() for model_id in model_id_list}
len(player_dict), player_dict.keys()

(10,
 dict_keys(['AIBunCho/japanese-novel-gpt-j-6b', 'GPT-3.5/ChatGPT-August-3', 'llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0', 'rinna/bilingual-gpt-neox-4b-instruction-ppo', 'supertrin-beta', 'GPT-4/ChatGPT-August-3', 'stabilityai/japanese-stablelm-instruct-alpha-7b-v2', 'stabilityai/japanese-stablelm-instruct-gamma-7b', 'line-corporation/japanese-large-lm-3.6b-instruction-sft', 'elyza/ELYZA-japanese-Llama-2-7b-fast-instruct']))

In [4]:
for match in matches:
    p1 = player_dict[match["model_id_A"]]
    p2 = player_dict[match["model_id_B"]]

    if match["model_id_A"] == match["winner"]:
        p1.update_player([p2.rating], [p2.rd], [1])
        p2.update_player([p1.rating], [p1.rd], [0])
    else:
        p1.update_player([p2.rating], [p2.rd], [0])
        p2.update_player([p1.rating], [p1.rd], [1])

In [5]:
results = []
for model_id, player in player_dict.items():
    results.append([model_id, player.rating, player.rd])
df = pd.DataFrame(results, columns=["model_id", "rating", "rd"]).sort_values(
    by="rating", ascending=False
)
df

Unnamed: 0,model_id,rating,rd
5,GPT-4/ChatGPT-August-3,1586.128974,83.69199
4,supertrin-beta,1467.502015,71.563131
1,GPT-3.5/ChatGPT-August-3,1401.92535,68.433791
7,stabilityai/japanese-stablelm-instruct-gamma-7b,1305.160712,66.035737
6,stabilityai/japanese-stablelm-instruct-alpha-7...,1247.593297,66.495074
9,elyza/ELYZA-japanese-Llama-2-7b-fast-instruct,1184.527815,64.492837
8,line-corporation/japanese-large-lm-3.6b-instru...,1126.414659,66.718206
0,AIBunCho/japanese-novel-gpt-j-6b,1121.875465,72.127283
2,llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0,1012.830561,76.283325
3,rinna/bilingual-gpt-neox-4b-instruction-ppo,966.883365,71.294955


In [6]:
df["Rating ± RD"] = df.apply(
    lambda row: f'{row["rating"]:.0f} ± {row["rd"]:.0f}', axis=1
)
df["ERP Score"] = ""

In [7]:
print(df[["model_id", "Rating ± RD", "ERP Score"]].to_markdown(index=False))

| model_id                                                | Rating ± RD   | ERP Score   |
|:--------------------------------------------------------|:--------------|:------------|
| GPT-4/ChatGPT-August-3                                  | 1586 ± 84     |             |
| supertrin-beta                                          | 1468 ± 72     |             |
| GPT-3.5/ChatGPT-August-3                                | 1402 ± 68     |             |
| stabilityai/japanese-stablelm-instruct-gamma-7b         | 1305 ± 66     |             |
| stabilityai/japanese-stablelm-instruct-alpha-7b-v2      | 1248 ± 66     |             |
| elyza/ELYZA-japanese-Llama-2-7b-fast-instruct           | 1185 ± 64     |             |
| line-corporation/japanese-large-lm-3.6b-instruction-sft | 1126 ± 67     |             |
| AIBunCho/japanese-novel-gpt-j-6b                        | 1122 ± 72     |             |
| llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0        | 1013 ± 76     |             |
| rinna/bi