## Set up the environment

In [1]:
from llmselector.data_utils import DataLoader_SimpleQA
from llmselector.compoundai.module.debate import MultiAgentDebate
from llmselector.compoundai.metric import Metric, compute_score
from llmselector.compoundai.optimizer import OptimizerFullSearch, OptimizerLLMDiagnoser
import llmselector, os
import os
# Download the SimpleQA cache database into ../cache only when it is not
# already present, so re-running this cell does not fetch it again.
if not os.path.exists('../cache/db_simpleqa.sqlite'): 
    !wget -P ../cache https://github.com/LLMSELECTOR/LLMSELECTOR/releases/download/0.0.1/db_simpleqa.sqlite

In [2]:
# Configure llmselector with the local cache database and the provider API keys.
# Each key is read from an environment variable when one is set, so real
# credentials never have to be typed into (or committed with) the notebook.
# When a variable is unset, the original placeholder string is passed instead,
# which keeps the previous behavior unchanged.
llmselector.config.config(
    db_path="../cache/db_simpleqa.sqlite",
    openai_api_key=os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_KEY"),
    anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY", "YOUR_ANTHROPIC_KEY"),
    together_ai_api_key=os.environ.get("TOGETHER_API_KEY", "YOUR_TOGETHERAI_KEY"),
    gemini_api_key=os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_KEY"),
)

## 1. Load dataset

In [3]:
from sklearn.model_selection import train_test_split

# Load the SimpleQA queries and hold out half of them for testing.
# The fixed random_state makes the split repeatable from run to run.
Mydataloader = DataLoader_SimpleQA()
q_data = Mydataloader.get_query_df()
train_df, test_df = train_test_split(
    q_data,
    test_size=0.5,
    random_state=2024,
)

## 2. Specify models and evaluation metric

In [4]:
# Candidate models, one identifier per line and grouped by model family.
model_list = [
    # GPT models
    'gpt-4o-2024-05-13',
    'gpt-4-turbo-2024-04-09',
    'gpt-4o-mini-2024-07-18',
    # Claude models
    'claude-3-5-sonnet-20240620',
    'claude-3-haiku-20240307',
    # Gemini models
    'gemini-1.5-pro',
    'gemini-1.5-flash',
    # Llama and Qwen models
    'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo',
    'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo',
    'Qwen/Qwen2.5-72B-Instruct-Turbo',
]

# Metric object shared by the optimization and evaluation cells.
M1 = Metric('em_direct')

## 3. Standard systems using one fixed model

In [5]:
# Baselines: build one MultiAgentDebate system per candidate model, keyed by
# the model's identifier. Each optimizer receives a single-element model_list,
# so presumably every module of that system ends up on the same model
# (the tuples printed in this cell's output are consistent with that).
Agents_SameModel ={}
for name in model_list:
    Agents_SameModel[name] = MultiAgentDebate()
    Opt0 = OptimizerFullSearch(model_list = [name])
    Opt0.optimize( train_df, M1, Agents_SameModel[name])

  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   3%|█                                 | 71/2163 [00:00<00:03, 668.24it/s][A
Processing:   7%|██▍                              | 160/2163 [00:00<00:02, 785.07it/s][A
Processing:  11%|███▋                             | 239/2163 [00:00<00:02, 743.69it/s][A
Processing:  15%|████▊                            | 314/2163 [00:00<00:02, 626.67it/s][A
Processing:  18%|█████▉                           | 391/2163 [00:00<00:02, 670.29it/s][A
Processing:  21%|███████                          | 461/2163 [00:00<00:02, 664.05it/s][A
Processing:  25%|████████▏                        | 539/2163 [00:00<00:02, 695.62it/s][A
Processing:  29%|█████████▍                       | 619/2163 [00:00<00:02, 720.43it/s][A
Processing:  33%|██████████▊                      | 710/2163 [00:00<00:01, 774.79it/s][A
Processing:  

('gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13', 'gpt-4o-2024-05-13')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   4%|█▍                                | 91/2163 [00:00<00:02, 889.81it/s][A
Processing:  10%|███▏                            | 214/2163 [00:00<00:01, 1087.85it/s][A
Processing:  15%|████▊                           | 326/2163 [00:00<00:01, 1099.83it/s][A
Processing:  20%|██████▍                         | 437/2163 [00:00<00:01, 1004.31it/s][A
Processing:  25%|████████▏                        | 539/2163 [00:00<00:01, 918.36it/s][A
Processing:  29%|█████████▋                       | 633/2163 [00:00<00:01, 844.72it/s][A
Processing:  33%|██████████▉                      | 720/2163 [00:00<00:01, 845.44it/s][A
Processing:  37%|████████████▎                    | 806/2163 [00:00<00:01, 849.40it/s][A
Processing:  41%|█████████████▌                   | 892/2163 [00:01<00:01, 831.97it/s][A
Processing:  

('gpt-4-turbo-2024-04-09', 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo-2024-04-09')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   4%|█▏                                | 76/2163 [00:00<00:02, 757.15it/s][A
Processing:   7%|██▎                              | 152/2163 [00:00<00:02, 758.00it/s][A
Processing:  11%|███▋                             | 240/2163 [00:00<00:02, 798.96it/s][A
Processing:  15%|████▉                            | 320/2163 [00:00<00:02, 721.64it/s][A
Processing:  18%|██████                           | 394/2163 [00:00<00:02, 603.95it/s][A
Processing:  21%|██████▉                          | 458/2163 [00:00<00:04, 341.17it/s][A
Processing:  23%|███████▋                         | 506/2163 [00:01<00:04, 335.96it/s][A
Processing:  25%|████████▍                        | 549/2163 [00:01<00:04, 339.93it/s][A
Processing:  28%|█████████▏                       | 600/2163 [00:01<00:04, 374.18it/s][A
Processing:  

('gpt-4o-mini-2024-07-18', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini-2024-07-18', 'gpt-4o-mini-2024-07-18')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   6%|█▉                              | 131/2163 [00:00<00:01, 1278.42it/s][A
Processing:  12%|███▉                            | 270/2163 [00:00<00:01, 1339.72it/s][A
Processing:  19%|█████▉                          | 405/2163 [00:00<00:01, 1340.42it/s][A
Processing:  25%|███████▉                        | 540/2163 [00:00<00:01, 1343.68it/s][A
Processing:  31%|█████████▉                      | 675/2163 [00:00<00:01, 1327.43it/s][A
Processing:  37%|███████████▉                    | 811/2163 [00:00<00:01, 1328.81it/s][A
Processing:  44%|██████████████                  | 950/2163 [00:00<00:00, 1346.11it/s][A
Processing:  50%|███████████████▌               | 1085/2163 [00:00<00:00, 1315.57it/s][A
Processing:  56%|█████████████████▍             | 1217/2163 [00:00<00:00, 1213.07it/s][A
Processing:  

('claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620', 'claude-3-5-sonnet-20240620')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   6%|█▊                              | 125/2163 [00:00<00:01, 1247.97it/s][A
Processing:  12%|███▊                            | 259/2163 [00:00<00:01, 1300.96it/s][A
Processing:  18%|█████▊                          | 391/2163 [00:00<00:01, 1292.42it/s][A
Processing:  24%|███████▋                        | 521/2163 [00:00<00:01, 1293.15it/s][A
Processing:  30%|█████████▋                      | 651/2163 [00:00<00:01, 1172.65it/s][A
Processing:  36%|███████████▍                    | 770/2163 [00:00<00:01, 1108.56it/s][A
Processing:  41%|█████████████                   | 883/2163 [00:00<00:01, 1110.04it/s][A
Processing:  46%|██████████████▊                 | 998/2163 [00:00<00:01, 1119.20it/s][A
Processing:  51%|███████████████▉               | 1111/2163 [00:00<00:00, 1100.85it/s][A
Processing:  

('claude-3-haiku-20240307', 'claude-3-haiku-20240307', 'claude-3-haiku-20240307', 'claude-3-haiku-20240307', 'claude-3-haiku-20240307', 'claude-3-haiku-20240307', 'claude-3-haiku-20240307')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   5%|█▌                               | 101/2163 [00:00<00:02, 994.88it/s][A
Processing:  10%|███                             | 210/2163 [00:00<00:01, 1049.26it/s][A
Processing:  15%|████▋                           | 321/2163 [00:00<00:01, 1058.54it/s][A
Processing:  20%|██████▍                         | 431/2163 [00:00<00:01, 1069.50it/s][A
Processing:  25%|████████▏                       | 551/2163 [00:00<00:01, 1107.94it/s][A
Processing:  31%|█████████▉                      | 671/2163 [00:00<00:01, 1132.94it/s][A
Processing:  37%|███████████▊                    | 796/2163 [00:00<00:01, 1169.64it/s][A
Processing:  43%|█████████████▊                  | 931/2163 [00:00<00:01, 1213.49it/s][A
Processing:  49%|███████████████▎               | 1069/2163 [00:00<00:00, 1264.39it/s][A
Processing:  

('gemini-1.5-pro', 'gemini-1.5-pro', 'gemini-1.5-pro', 'gemini-1.5-pro', 'gemini-1.5-pro', 'gemini-1.5-pro', 'gemini-1.5-pro')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   5%|█▋                              | 111/2163 [00:00<00:01, 1074.10it/s][A
Processing:  11%|███▍                            | 231/2163 [00:00<00:01, 1133.11it/s][A
Processing:  17%|█████▍                          | 366/2163 [00:00<00:01, 1229.57it/s][A
Processing:  23%|███████▎                        | 493/2163 [00:00<00:01, 1245.11it/s][A
Processing:  29%|█████████▍                       | 618/2163 [00:00<00:01, 923.15it/s][A
Processing:  34%|███████████▏                     | 732/2163 [00:00<00:01, 980.12it/s][A
Processing:  40%|████████████▊                   | 867/2163 [00:00<00:01, 1084.10it/s][A
Processing:  46%|██████████████▋                 | 996/2163 [00:00<00:01, 1142.73it/s][A
Processing:  52%|████████████████               | 1123/2163 [00:01<00:00, 1175.76it/s][A
Processing:  

('gemini-1.5-flash', 'gemini-1.5-flash', 'gemini-1.5-flash', 'gemini-1.5-flash', 'gemini-1.5-flash', 'gemini-1.5-flash', 'gemini-1.5-flash')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   5%|█▋                              | 111/2163 [00:00<00:01, 1078.69it/s][A
Processing:  10%|███▎                            | 221/2163 [00:00<00:01, 1083.23it/s][A
Processing:  15%|████▉                           | 331/2163 [00:00<00:01, 1078.20it/s][A
Processing:  20%|██████▌                         | 441/2163 [00:00<00:01, 1079.66it/s][A
Processing:  26%|████████▏                       | 552/2163 [00:00<00:01, 1089.14it/s][A
Processing:  31%|█████████▉                      | 672/2163 [00:00<00:01, 1122.75it/s][A
Processing:  37%|███████████▋                    | 792/2163 [00:00<00:01, 1142.46it/s][A
Processing:  42%|█████████████▌                  | 918/2163 [00:00<00:01, 1178.29it/s][A
Processing:  48%|██████████████▊                | 1036/2163 [00:00<00:00, 1166.35it/s][A
Processing:  

('meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   5%|█▋                              | 111/2163 [00:00<00:01, 1087.36it/s][A
Processing:  11%|███▍                            | 231/2163 [00:00<00:01, 1125.50it/s][A
Processing:  16%|█████▏                          | 351/2163 [00:00<00:01, 1151.47it/s][A
Processing:  22%|███████                         | 481/2163 [00:00<00:01, 1199.04it/s][A
Processing:  28%|█████████                       | 611/2163 [00:00<00:01, 1232.22it/s][A
Processing:  34%|███████████                     | 744/2163 [00:00<00:01, 1264.39it/s][A
Processing:  40%|████████████▉                   | 871/2163 [00:00<00:01, 1247.67it/s][A
Processing:  46%|██████████████▋                 | 996/2163 [00:00<00:00, 1210.95it/s][A
Processing:  52%|████████████████               | 1118/2163 [00:00<00:00, 1168.73it/s][A
Processing:  

('meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo', 'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo')


  0%|                                                           | 0/1 [00:00<?, ?it/s]
Processing:   0%|                                            | 0/2163 [00:00<?, ?it/s][A
Processing:   6%|█▉                              | 132/2163 [00:00<00:01, 1316.21it/s][A
Processing:  13%|████                            | 271/2163 [00:00<00:01, 1343.69it/s][A
Processing:  19%|██████                          | 409/2163 [00:00<00:01, 1358.30it/s][A
Processing:  25%|████████                        | 545/2163 [00:00<00:01, 1354.68it/s][A
Processing:  31%|██████████                      | 681/2163 [00:00<00:01, 1279.13it/s][A
Processing:  37%|███████████▉                    | 810/2163 [00:00<00:01, 1272.37it/s][A
Processing:  44%|██████████████                  | 948/2163 [00:00<00:00, 1303.70it/s][A
Processing:  50%|███████████████▌               | 1083/2163 [00:00<00:00, 1315.12it/s][A
Processing:  56%|█████████████████▍             | 1215/2163 [00:00<00:00, 1303.14it/s][A
Processing:  

('Qwen/Qwen2.5-72B-Instruct-Turbo', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'Qwen/Qwen2.5-72B-Instruct-Turbo', 'Qwen/Qwen2.5-72B-Instruct-Turbo')





## 4. LLMSELECTOR

In [None]:
# Optimize a fresh MultiAgentDebate system with OptimizerLLMDiagnoser, which is
# given the full candidate model_list (unlike the single-model baselines).
LLMSELECTOR = MultiAgentDebate()
Optimizer = OptimizerLLMDiagnoser(model_list = model_list)
Optimizer.optimize( train_df.head(500), M1, LLMSELECTOR) # use only 500 samples for fast processing

100%|███████████████████████████████████████████████| 500/500 [00:10<00:00, 48.15it/s]
100%|███████████████████████████████████████████████| 500/500 [00:12<00:00, 40.23it/s]
100%|███████████████████████████████████████████████| 500/500 [00:09<00:00, 51.65it/s]
100%|███████████████████████████████████████████████| 500/500 [00:09<00:00, 52.28it/s]
100%|███████████████████████████████████████████████| 500/500 [00:09<00:00, 51.99it/s]
100%|███████████████████████████████████████████████| 500/500 [00:11<00:00, 44.64it/s]
100%|███████████████████████████████████████████████| 500/500 [00:08<00:00, 60.09it/s]
100%|███████████████████████████████████████████████| 500/500 [00:09<00:00, 52.07it/s]
 48%|██████████████████████▍                        | 239/500 [00:05<00:07, 36.23it/s]

## 5. Performance evaluation

In [None]:
# Score LLMSELECTOR and every single-model baseline on the held-out test split
# using the same metric, then show the resulting table.
# NOTE(review): the deep-dive cells further down read `score_<system name>`
# columns from test_df; presumably compute_score adds them as a side effect — confirm.
All_systems = {"LLMSELECTOR": LLMSELECTOR, **Agents_SameModel}
results = compute_score(All_systems, test_df, M1)
display("test accuracy",results)

## 6. Visualization

In [None]:
import plotly.express as px

def visualize_scores(dataframe):
    """
    Build a Plotly Express bar chart of mean scores, one bar per name.

    Args:
    - dataframe (pd.DataFrame): Must provide a 'Name' column (x-axis
      categories) and a 'Mean_Score' column (bar heights and bar labels).

    Returns:
    - The Plotly figure; rendering is left to the caller.
    """
    name_label = "Model Name"
    score_label = "Mean Score"

    chart = px.bar(
        dataframe,
        x="Name",
        y="Mean_Score",
        text="Mean_Score",
        title="Mean Scores of Models",
        labels={"Name": name_label, "Mean_Score": score_label},
    )

    # Show each bar's value outside the bar, formatted to two decimals.
    chart.update_traces(textposition='outside', texttemplate='%{text:.2f}')

    # Axis titles, 45-degree x tick labels, explicit margins and height.
    chart.update_layout(
        height=800,
        margin={"l": 40, "r": 40, "t": 40, "b": 100},
        xaxis={"title": name_label, "tickangle": 45},
        yaxis={"title": score_label},
    )
    return chart

In [None]:
# Short display names for some of the model identifiers. Values in the
# 'Name' column that have no entry here are left unchanged by `replace`.
name_map = {
    'gpt-4o-2024-05-13': 'GPT-4o',
    'gpt-4-turbo-2024-04-09': 'GPT-4 Turbo',
    'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo': 'Llama 3.1 405B',
    'meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo': 'Llama 3.1 70B',
    'Qwen/Qwen2.5-72B-Instruct-Turbo': 'Qwen 2.5 72B',
}
results['Name'] = results['Name'].replace(name_map)

In [None]:
# The returned figure is the cell's last expression, so the notebook renders it.
visualize_scores(results)

## 7. Example deep dive

In [None]:
# Keep the test questions that LLMSELECTOR answered correctly (score 1) while
# both of the single-model baselines named below answered wrongly (score 0).
model1 = 'meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'
model2 = 'claude-3-5-sonnet-20240620'
example_df = test_df[
    test_df['score_LLMSELECTOR'].eq(1)
    & test_df[f'score_{model1}'].eq(0)
    & test_df[f'score_{model2}'].eq(0)
]

In [None]:
# Pick one filtered example by position (0-based, so index = 1 is the second
# row of example_df) and print its question next to the expected answer.
# Change `index` to inspect a different example.
index = 1
query = example_df.iloc[index]['query']
answer = example_df.iloc[index]['true_answer']
print(f"question: {query}\nanswer: {answer}")

In [None]:
# Run the model1-only baseline on the selected question, then display the
# 'trace' entry of its loaded history as the cell output.
Agents_SameModel[model1].generate(query)
Agents_SameModel[model1].load_history()['trace']

In [None]:
# Run the model2-only baseline on the selected question, then display the
# 'trace' entry of its loaded history as the cell output.
Agents_SameModel[model2].generate(query)
Agents_SameModel[model2].load_history()['trace']

In [None]:
# Run the LLMSELECTOR-optimized system on the same question, then display the
# 'trace' entry of its loaded history for comparison with the baselines.
LLMSELECTOR.generate(query)
LLMSELECTOR.load_history()['trace']