In [7]:
from importlib import reload

import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from router_poc import settings as S
from router_poc import router

In [2]:
# Load the two intermediate parquet tables, then join them on the shared
# `prompt` column.
embeddings, labels = (
    pd.read_parquet(S.DATA_DIR / "intermediate" / parquet_name)
    for parquet_name in (
        "stanford_mmlu_results.embeddings.parquet",
        "stanford_mmlu_results.parquet",
    )
)

data = embeddings.merge(labels, on="prompt")

In [19]:
# The split should have been fixed during data generation; here we reuse the
# same seed and hope it reproduces the same partition of prompts.
prompts = data["prompt"].unique()
train_prompts, val_prompts = train_test_split(prompts, test_size=0.2, random_state=42)

# Not every prompt answered by mistral small was excluded upstream. That model
# seems to have been given a slightly different prompt, which makes its
# results hard to compare, so its rows are dropped from both splits.
keep_model = data["model"] != "mistralai/mistral-small-2402"

# Split by prompt so that all rows for one prompt land in the same set.
train = data[data["prompt"].isin(train_prompts) & keep_model].copy()
val = data[data["prompt"].isin(val_prompts) & keep_model].copy()

In [20]:
# Mean exact-match score of every model on the training split.
# NOTE: despite its name, `best_llm` holds the score of each model, not only
# the best one.
best_llm = train.groupby("model")["exact_match"].mean()

best_llm_name, worst_llm_name = best_llm.idxmax(), best_llm.idxmin()
best_llm_accuracy, worst_llm_accuracy = best_llm.max(), best_llm.min()

display(
    f"Best LLM: {best_llm_name} ({best_llm_accuracy:.2%})",
    f"Worst LLM: {worst_llm_name} ({worst_llm_accuracy:.2%})",
)

'Best LLM: anthropic/claude-3-5-sonnet-20241022 (87.61%)'

'Worst LLM: anthropic/claude-3-5-haiku-20241022 (73.80%)'

In [21]:
# Re-import the router module so that recent edits to it take effect.
reload(router)

# Build a router over the models present in the training split, using the
# "random" setting, and try it on a sample prompt.
# NOTE(review): the recorded output contains a model response, so this call
# appears to query a real LLM -- confirm before re-running.
candidate_llms = train["model"].unique()
random_router = router.Router(candidate_llms, "random")
random_router("What is the capital of France?")

RoutedPrompt(prompt='What is the capital of France?', llm_name='google/gemini-2.0-flash-exp', response='The capital of France is **Paris**.\n')

In [22]:
# Re-import the router module so that recent edits to it take effect.
reload(router)

# Build a router over the models present in the training split, using the
# "absolute" setting, and try it on a sample prompt.
candidate_llms = train["model"].unique()
strength_router = router.Router(candidate_llms, "absolute")
strength_router("What is the capital of France?")

RoutedPrompt(prompt='What is the capital of France?', llm_name='openai/gpt-4-turbo-2024-04-09', response='The capital of France is Paris.')

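We can cheat here: to evaluate routing models on the validation set we don't actually have to call the LLMs, because the validation data already records whether each model answered each prompt correctly.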

In [25]:
def evaluate_router(router: "router.Router", val_set: pd.DataFrame):
    """Score a router offline against already-recorded model results.

    For every unique prompt in ``val_set`` the router picks a model, and the
    recorded ``exact_match`` value of that model on that prompt is looked up
    instead of calling the model.

    Parameters
    ----------
    router
        Object exposing ``router(prompt)``, which returns the name of the
        chosen model. The annotation is quoted so that defining this function
        does not require the ``router`` module to be resolvable at that time.
    val_set
        Frame with at least the columns ``prompt``, ``model`` and
        ``exact_match``.

    Returns
    -------
    pd.DataFrame
        One row per unique prompt (in order of first appearance) with the
        columns ``prompt``, ``chosen_llm`` and ``exact_match``.

    Raises
    ------
    IndexError
        If ``val_set`` has no row for the chosen model on some prompt.
    """
    # Build the (prompt, model) -> exact_match lookup once, instead of running
    # a full-frame query for every prompt. Keeping only the first row of each
    # pair mirrors taking the first match of a filtered frame.
    first_rows = val_set.drop_duplicates(subset=["prompt", "model"])
    recorded = first_rows.set_index(["prompt", "model"])["exact_match"].to_dict()

    results = []
    for prompt in val_set["prompt"].unique():
        chosen_llm = router.router(prompt)
        try:
            exact_match = recorded[(prompt, chosen_llm)]
        except KeyError:
            # Same exception type an empty positional lookup would raise, but
            # with enough context to identify the missing pair.
            raise IndexError(
                f"no recorded result for model {chosen_llm!r} "
                f"on prompt {str(prompt)[:80]!r}"
            ) from None
        results.append({
            "prompt": prompt,
            "chosen_llm": chosen_llm,
            "exact_match": exact_match,
        })
    return pd.DataFrame(results)

In [27]:
# Score the strength router on the validation split using the recorded
# results; the bare name on the last line renders the resulting table.
strength_router_results = evaluate_router(router=strength_router, val_set=val)
strength_router_results

Unnamed: 0,prompt,chosen_llm,exact_match
0,Answer with only a single letter.\n\nThe follo...,anthropic/claude-3-5-sonnet-20241022,1.0
1,Answer with only a single letter.\n\nThe follo...,mistralai/mistral-large-2407,1.0
2,Answer with only a single letter.\n\nThe follo...,mistralai/mistral-large-2407,1.0
3,Answer with only a single letter.\n\nThe follo...,openai/gpt-4-turbo-2024-04-09,1.0
4,Answer with only a single letter.\n\nThe follo...,openai/gpt-4-turbo-2024-04-09,1.0
...,...,...,...
2848,Answer with only a single letter.\n\nThe follo...,anthropic/claude-3-5-sonnet-20241022,1.0
2849,Answer with only a single letter.\n\nThe follo...,google/gemini-2.0-flash-exp,1.0
2850,Answer with only a single letter.\n\nThe follo...,anthropic/claude-3-5-sonnet-20241022,1.0
2851,Answer with only a single letter.\n\nThe follo...,anthropic/claude-3-5-sonnet-20241022,1.0


In [28]:
# Mean `exact_match` over the validation prompts for the model the strength
# router picked on each prompt.
# NOTE(review): the best/worst single-model accuracies earlier in the notebook
# are computed on `train`; compare against a baseline computed on `val`.
strength_router_results["exact_match"].mean()

np.float64(0.9263932702418507)