In [27]:
import pandas as pd
import plotly.express as px
import glob
import os
import re
from tqdm import tqdm

In [51]:
# Matches a value wrapped in double square brackets, e.g. "[[7]]".
pattern = re.compile(r"\[\[(.*?)\]\]")

def get_score(text):
    """Extract the numeric rating from a judgment string.

    Args:
        text: Judgment text expected to contain exactly one ``[[...]]`` span.

    Returns:
        The bracketed value converted to ``float``, or ``None`` when the text
        does not contain exactly one ``[[...]]`` span or when the span's
        content is not a valid number.
    """
    matches = pattern.findall(text)

    # Zero or several bracketed spans are ambiguous, so report "no score".
    if len(matches) != 1:
        return None

    try:
        return float(matches[0])
    except ValueError:
        # Non-numeric content (e.g. "[[N/A]]" or "[[]]") is unparseable, not fatal.
        return None

In [57]:
# Load the rated questions (JSON Lines: one record per line).
df = pd.read_json("rated_question.jsonl", lines=True)

# Keep only the rows whose mean score is below 7 and save that subset.
harder = df.loc[df["mean_score"].lt(7)]
harder.to_json("harder_question.jsonl", orient="records", lines=True)

# Number of rows retained.
len(harder)

242

In [53]:
def model_name_from_path(file_path):
    """Return the model name encoded in an answer-file path.

    The name is the file's base name without its final extension, so the
    result does not depend on the directory prefix or on the platform's
    path separator.

    Args:
        file_path: Path to a file such as ``model_answer/<model>.jsonl``.

    Returns:
        The base name of ``file_path`` with the last extension removed.
    """
    return os.path.splitext(os.path.basename(file_path))[0]

path = os.path.join("model_answer", "*.jsonl")
# glob returns matches in arbitrary order; sort so re-runs are deterministic.
files = sorted(glob.glob(path))

# One entry per answer file found under model_answer/.
models = [model_name_from_path(file) for file in files]

In [54]:
# Maps model name -> mean parsed score over the questions in `harder`.
leader_board = {}
# Models that had no parsable judgment for those questions; they are left
# out of `leader_board` because a mean over zero scores is undefined.
unscored_models = []

ids = harder["question_id"].to_list()

judgment = pd.read_json(os.path.join("model_judgment", "gpt-4_single.jsonl"), lines=True)

for model in tqdm(models):
    # Restrict the judgments to this model and to the selected questions.
    model_df = judgment[judgment["model"] == model]
    model_df = model_df[model_df["question_id"].isin(ids)]

    total_score = 0
    count = 0
    for judgment_text in model_df["judgment"]:
        score = get_score(judgment_text)

        # Test for None explicitly: a truthiness check would also discard a
        # legitimate score of 0.
        if score is None:
            continue

        total_score += score
        count += 1

    # Avoid ZeroDivisionError when nothing could be scored for this model.
    if count == 0:
        unscored_models.append(model)
        continue

    leader_board[model] = total_score / count

100%|██████████| 11/11 [00:00<00:00, 182.26it/s]


In [55]:
# Show the leaderboard dict (model name -> mean score) in insertion order.
leader_board

{'claude-v1': 5.697478991596639,
 'gpt-3.5-turbo': 5.975103734439834,
 'claude-instant-v1': 5.341666666666667,
 'vicuna-13b-v1.5': 2.957805907172996,
 'Llama-2-70b-chat': 3.0720338983050848,
 'wizardlm-70b-v1.0': 4.175,
 'vicuna-7b-v1.5': 2.3221757322175733,
 'vicuna-33b-v1.3': 3.3177966101694913,
 'claude-2': 5.929166666666666,
 'gpt-4': 8.215767634854771,
 'palm-2-chat-bison-001': 2.5625}

In [56]:
# Leaderboard ordered from highest to lowest mean score.
{name: leader_board[name] for name in sorted(leader_board, key=leader_board.get, reverse=True)}

{'gpt-4': 8.215767634854771,
 'gpt-3.5-turbo': 5.975103734439834,
 'claude-2': 5.929166666666666,
 'claude-v1': 5.697478991596639,
 'claude-instant-v1': 5.341666666666667,
 'wizardlm-70b-v1.0': 4.175,
 'vicuna-33b-v1.3': 3.3177966101694913,
 'Llama-2-70b-chat': 3.0720338983050848,
 'vicuna-13b-v1.5': 2.957805907172996,
 'palm-2-chat-bison-001': 2.5625,
 'vicuna-7b-v1.5': 2.3221757322175733}