In [1]:
import json
import pandas as pd
from glicko2 import glicko2

In [2]:
with open("review_gpt-4.jsonl", "r", encoding="utf-8") as f:
    matches = [json.loads(line) for line in f.readlines()]
len(matches)

366

In [3]:
model_id_list = list(
    set(
        [match["model_id_A"] for match in matches]
        + [match["model_id_B"] for match in matches]
    )
)
player_dict = {model_id: glicko2.Player() for model_id in model_id_list}
len(player_dict), player_dict.keys()

(9,
 dict_keys(['AIBunCho/japanese-novel-gpt-j-6b', 'GPT-3.5/ChatGPT-August-3', 'llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0', 'rinna/bilingual-gpt-neox-4b-instruction-ppo', 'supertrin-beta', 'GPT-4/ChatGPT-August-3', 'stabilityai/japanese-stablelm-instruct-alpha-7b-v2', 'line-corporation/japanese-large-lm-3.6b-instruction-sft', 'elyza/ELYZA-japanese-Llama-2-7b-fast-instruct']))

In [4]:
for match in matches:
    p1 = player_dict[match["model_id_A"]]
    p2 = player_dict[match["model_id_B"]]

    if match["model_id_A"] == match["winner"]:
        p1.update_player([p2.rating], [p2.rd], [1])
        p2.update_player([p1.rating], [p1.rd], [0])
    else:
        p1.update_player([p2.rating], [p2.rd], [0])
        p2.update_player([p1.rating], [p1.rd], [1])

In [5]:
results = []
for model_id, player in player_dict.items():
    results.append([model_id, player.rating, player.rd])
df = pd.DataFrame(results, columns=["model_id", "rating", "rd"]).sort_values(
    by="rating", ascending=False
)
df

Unnamed: 0,model_id,rating,rd
5,GPT-4/ChatGPT-August-3,1643.337577,91.214946
1,GPT-3.5/ChatGPT-August-3,1423.711171,74.864654
4,supertrin-beta,1422.294404,79.701514
6,stabilityai/japanese-stablelm-instruct-alpha-7...,1242.251847,70.621353
8,elyza/ELYZA-japanese-Llama-2-7b-fast-instruct,1229.353359,65.907299
7,line-corporation/japanese-large-lm-3.6b-instru...,1143.086702,64.958286
0,AIBunCho/japanese-novel-gpt-j-6b,1080.532006,72.911866
2,llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0,976.537416,74.660696
3,rinna/bilingual-gpt-neox-4b-instruction-ppo,942.667478,69.014529


In [6]:
df["Rating ± RD"] = df.apply(
    lambda row: f'{row["rating"]:.0f} ± {row["rd"]:.0f}', axis=1
)
df["ERP Score"] = ""

In [7]:
print(df[["model_id", "Rating ± RD", "ERP Score"]].to_markdown(index=False))

| model_id                                                | Rating ± RD   | ERP Score   |
|:--------------------------------------------------------|:--------------|:------------|
| GPT-4/ChatGPT-August-3                                  | 1643 ± 91     |             |
| GPT-3.5/ChatGPT-August-3                                | 1424 ± 75     |             |
| supertrin-beta                                          | 1422 ± 80     |             |
| stabilityai/japanese-stablelm-instruct-alpha-7b-v2      | 1242 ± 71     |             |
| elyza/ELYZA-japanese-Llama-2-7b-fast-instruct           | 1229 ± 66     |             |
| line-corporation/japanese-large-lm-3.6b-instruction-sft | 1143 ± 65     |             |
| AIBunCho/japanese-novel-gpt-j-6b                        | 1081 ± 73     |             |
| llm-jp/llm-jp-13b-instruct-full-dolly-oasst-v1.0        | 977 ± 75      |             |
| rinna/bilingual-gpt-neox-4b-instruction-ppo             | 943 ± 69      |             |
