In [2]:
import pandas as pd
import seaborn as sns
from transformers import AutoModel

from router_poc import settings as S

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw Chatbot Arena conversations export from the project's data dir.
arena_conversations_path = S.DATA_DIR / "raw" / "chatbot_arena_conversations.parquet"
arena_df = pd.read_parquet(arena_conversations_path)

In [4]:
# Preview the first five rows to see which columns are available.
arena_df.head(n=5)

Unnamed: 0,question_id,model_a,model_b,winner,judge,conversation_a,conversation_b,turn,anony,language,tstamp,openai_moderation,toxic_chat_tag
0,58210e39b3fd4441a2bd4a518bb44c2d,chatglm-6b,koala-13b,model_b,arena_user_973,[{'content': 'What is the difference between O...,[{'content': 'What is the difference between O...,1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
1,2564acd09e3942fd97657d05282d4389,oasst-pythia-12b,alpaca-13b,tie,arena_user_973,[{'content': 'Why did my parent not invite me ...,[{'content': 'Why did my parent not invite me ...,1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
2,90bfd142157948aba01931726c888e7f,koala-13b,oasst-pythia-12b,model_b,arena_user_973,"[{'content': 'Fuji vs. Nikon, which is better?...","[{'content': 'Fuji vs. Nikon, which is better?...",1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
3,a7c5accc53e649a3bc6b2e41d962ebc4,vicuna-13b,oasst-pythia-12b,model_b,arena_user_973,[{'content': 'How to build an arena for chatbo...,[{'content': 'How to build an arena for chatbo...,1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."
4,adf27e819a3c494cb6e993f0c660e097,vicuna-13b,koala-13b,model_a,arena_user_973,"[{'content': 'When is it today?', 'role': 'use...","[{'content': 'When is it today?', 'role': 'use...",1,True,English,1682352000.0,"{'categories': {'harassment': False, 'harassme...","{'roberta-large': {'flagged': False, 'probabil..."


In [5]:
# Every distinct model name that appears on either side of a comparison.
models = {
    *arena_df["model_a"].unique(),
    *arena_df["model_b"].unique(),
}
len(models)

20

In [6]:
# Show the full set of distinct model names collected from model_a / model_b.
models

{'RWKV-4-Raven-14B',
 'alpaca-13b',
 'chatglm-6b',
 'claude-instant-v1',
 'claude-v1',
 'dolly-v2-12b',
 'fastchat-t5-3b',
 'gpt-3.5-turbo',
 'gpt-4',
 'gpt4all-13b-snoozy',
 'guanaco-33b',
 'koala-13b',
 'llama-13b',
 'mpt-7b-chat',
 'oasst-pythia-12b',
 'palm-2',
 'stablelm-tuned-alpha-7b',
 'vicuna-13b',
 'vicuna-7b',
 'wizardlm-13b'}

Not ideal: the data is from 2023 and the models are ancient, but I guess it works for a PoC.

In [8]:
# Load the Arena human-preferences export and preview its first five rows.
arena_preferences_path = S.DATA_DIR / "raw" / "arena_human_preferences.parquet"
arena = pd.read_parquet(arena_preferences_path)
arena.head(n=5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [11]:
# Share of comparisons in which each model appears in the model_a slot.
arena["model_a"].value_counts(normalize=True)

model_a
gpt-4-1106-preview          0.063991
gpt-3.5-turbo-0613          0.061816
gpt-4-0613                  0.053917
claude-2.1                  0.049742
gpt-4-0314                  0.036310
                              ...   
falcon-180b-chat            0.002523
openchat-3.5-0106           0.001879
qwen1.5-7b-chat             0.001844
qwen1.5-4b-chat             0.001740
mistral-7b-instruct-v0.2    0.000940
Name: proportion, Length: 64, dtype: float64

I think I'll go ahead with the arena human preferences dataset. It's a bit larger and more modern than the chatbot arena conversations, and I was not able to find a comparable MMLU dataset in a reasonable amount of time.

In [14]:
# Total number of comparisons each model took part in, counting both sides.
# The two value_counts() results are aligned on the model name; a plain `+`
# would produce NaN for any model that appears on only one side, so add with
# fill_value=0 to treat a missing side as zero appearances.
num_comparisons = (
    arena["model_a"]
    .value_counts()
    .add(arena["model_b"].value_counts(), fill_value=0)
    .astype("int64")  # fill_value can upcast to float when the indexes differ
)
num_comparisons.sort_values(ascending=False)

gpt-4-1106-preview          7387
gpt-3.5-turbo-0613          7083
gpt-4-0613                  6165
claude-2.1                  5583
claude-instant-1            4136
                            ... 
falcon-180b-chat             286
openchat-3.5-0106            244
qwen1.5-7b-chat              208
qwen1.5-4b-chat              200
mistral-7b-instruct-v0.2     100
Name: count, Length: 64, dtype: int64

In [20]:
# Keep the models with more than 1000 comparisons, then restrict the battles
# to rows where both sides are such models. The copy makes `df` an independent
# frame rather than a slice of `arena`.
selected_models = num_comparisons.loc[num_comparisons.gt(1000)].index
both_selected = arena["model_a"].isin(selected_models) & arena["model_b"].isin(selected_models)
df = arena.loc[both_selected].copy()
df.shape

(44514, 9)

In [22]:
# Show the current contents of `selected_models`.
selected_models

Index(['RWKV-4-Raven-14B', 'alpaca-13b', 'chatglm-6b', 'claude-1',
       'claude-2.0', 'claude-2.1', 'claude-instant-1',
       'codellama-34b-instruct', 'fastchat-t5-3b', 'gemini-pro',
       'gemini-pro-dev-api', 'gpt-3.5-turbo-0314', 'gpt-3.5-turbo-0613',
       'gpt-3.5-turbo-1106', 'gpt-4-0125-preview', 'gpt-4-0314', 'gpt-4-0613',
       'gpt-4-1106-preview', 'koala-13b', 'llama-2-13b-chat',
       'llama-2-70b-chat', 'llama-2-7b-chat', 'mistral-7b-instruct',
       'mistral-medium', 'mixtral-8x7b-instruct-v0.1', 'oasst-pythia-12b',
       'openchat-3.5', 'palm-2', 'pplx-70b-online', 'pplx-7b-online',
       'qwen-14b-chat', 'starling-lm-7b-alpha', 'tulu-2-dpo-70b', 'vicuna-13b',
       'vicuna-33b', 'vicuna-7b', 'wizardlm-13b', 'wizardlm-70b',
       'yi-34b-chat', 'zephyr-7b-beta'],
      dtype='object')

In [23]:
# Hand-picked shortlist of models. Order matters: the list is paired
# positionally with provider names when providers.csv is built.
selected_models = [
    "claude-2.1",
    "gemini-pro",
    "gpt-4-0613",
    "mistral-medium",
]

In [24]:
# Pair each provider with its model and persist the mapping as providers.csv.
# The two lists are matched purely by position, and zip() silently drops any
# unmatched tail, so fail loudly if they do not line up (for example when
# `selected_models` still holds the longer count-based selection).
provider_names = ["anthropic", "google", "openai", "mistral"]
assert len(provider_names) == len(selected_models), (
    "provider_names and selected_models must have the same length"
)
providers = [
    {"provider_name": provider_name, "model_name": model_name}
    for provider_name, model_name in zip(provider_names, selected_models)
]
pd.DataFrame(providers).to_csv(S.DATA_DIR / "intermediate" / "providers.csv", index=False)

In [26]:
# Narrow the battles to rows where both sides are in `selected_models`.
# .copy() makes the result an independent frame (matching how `df` is first
# built from `arena`), so later column assignments on `df` do not trigger
# pandas' SettingWithCopyWarning.
df = df.query("model_a.isin(@selected_models) & model_b.isin(@selected_models)").copy()
df.shape

(1353, 6)

Nope: this amount of data would cost only a few cents to generate, and that way I'd get more modern models.

In [25]:
# Trim the first and last two characters of every prompt, keep only the columns
# needed downstream, and save the result as an intermediate parquet file.
# .assign builds a new frame instead of writing into `df` in place, which
# avoids SettingWithCopyWarning when `df` is a filtered slice.
# NOTE(review): prompts look like JSON-style lists (`["..."]`), so the slice
# presumably strips the outer `["` and `"]`; an entry with several turns would
# keep its inner `","` separators — TODO confirm against the raw data.
# NOTE: this cell rebinds `df`, so re-running it trims the prompts again.
df = df.assign(prompt=df["prompt"].str[2:-2])[
    ["model_a", "model_b", "prompt", "winner_model_a", "winner_model_b", "winner_tie"]
]
df.to_parquet(S.DATA_DIR / "intermediate" / "arena_human_preferences_selected.parquet")