## Setup

In [76]:
import ast
import re
import json
import numpy as np
import pandas as pd
from typing import List, Dict
from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv('keys.env')

from openai import OpenAI
OpenAI_CLIENT = OpenAI()

from gradio_client import Client

import prompts
# Single seed shared by the random steps in this notebook (pandas .sample and np.random.choice).
SEED = 42

# Seed NumPy's legacy global RNG; the np.random.choice calls further down draw from it.
np.random.seed(SEED)

In [26]:
import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout():
    """
    Silence everything written to ``sys.stdout`` inside the ``with`` body.

    ``sys.stdout`` is pointed at ``os.devnull`` on entry and the previous
    stream is put back on exit, even when the body raises.
    """
    sink = open(os.devnull, 'w')
    try:
        previous_stream = sys.stdout
        sys.stdout = sink
        try:
            yield
        finally:
            # Always restore the caller's stream before the sink is closed.
            sys.stdout = previous_stream
    finally:
        sink.close()

## Test set curation

In [29]:
# Number of prompts taken from each arena prompt file: only the leading entries of the
# search-helpful file come from chatbot arena, and the not-helpful side is cut to match.
N_ARENA_PER_CLASS = 30


def _read_prompt_list(path):
    """Parse the contents of `path` with ast.literal_eval and return the result."""
    # The files carry a .json extension but are parsed as Python literals, as before
    # (presumably they are not strict JSON -- TODO confirm before switching to json.load).
    # NOTE(review): UTF-8 is assumed for these files so the read does not depend on the
    # platform's default encoding -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        return ast.literal_eval(f.read())


search_prompts = _read_prompt_list('samples/search_helpful_prompts.json')
no_search_prompts = _read_prompt_list('samples/search_not_helpful_prompts.json')

print('Number of search prompts: {}'.format(len(search_prompts)))
print('Number of no search prompts: {}'.format(len(no_search_prompts)))

# processing initial samples
# Arena prompts have no reference answer, so 'label' and 'metadata' are the placeholder 'none'.
samples = []
for search_helpful, prompt_pool in ((1, search_prompts), (0, no_search_prompts)):
    for prompt in prompt_pool[:N_ARENA_PER_CLASS]:
        samples.append({'prompt': prompt, 'search_helpful': search_helpful, 'source': 'arena', 'label': 'none', 'metadata': 'none'})

Number of search prompts: 50
Number of no search prompts: 51


In [30]:
simple_qa = pd.read_csv('samples/simple_qa_test_set.csv')
print('Number of simple_qa samples: {}'.format(len(simple_qa)))
display(simple_qa.head())

# Drop simple_qa rows left over from an earlier run of this cell, so that re-running it
# does not append the same questions to `samples` a second time.
samples = [s for s in samples if s['source'] != 'simple_qa']

simple_qa_sample = simple_qa.sample(30, random_state=SEED) # match the number of no search and search samples from arena
for _, row in simple_qa_sample.iterrows():
    # simple_qa rows carry a reference answer, stored under 'label' for the grader.
    samples.append({'prompt': row['problem'], 'search_helpful': 1, 'source': 'simple_qa', 'label': row['answer'], 'metadata': row['metadata']})

Number of simple_qa samples: 4326


Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.


In [33]:
# Tally the dataset composition before printing it.
n_no_search = sum(1 for s in samples if s['search_helpful'] == 0)
n_arena_search = sum(1 for s in samples if s['source'] == 'arena' and s['search_helpful'] == 1)
n_simple_qa_search = sum(1 for s in samples if s['source'] == 'simple_qa' and s['search_helpful'] == 1)

print('Number of samples: {}'.format(len(samples)))
print('Number of no search prompts: {}'.format(n_no_search))
print('Number of search prompts from chatbot arena: {}'.format(n_arena_search))
print('Number of search prompts from simple_qa: {}'.format(n_simple_qa_search))

# saving the samples
with open('samples/eval_dataset.json', 'w') as dataset_file:
    dataset_file.write(json.dumps(samples, indent=4))

pd.DataFrame(samples).tail()

Number of samples: 90
Number of no search prompts: 30
Number of search prompts from chatbot arena: 30
Number of search prompts from simple_qa: 30


Unnamed: 0,prompt,search_helpful,source,label,metadata
85,"On what date (month, day, year) was politician...",1,simple_qa,28 August 1901.,"{'topic': 'Politics', 'answer_type': 'Date', '..."
86,What were the month and year when Telegram ann...,1,simple_qa,September 2015.,"{'topic': 'Science and technology', 'answer_ty..."
87,Who opened the first gender clinic in Canada a...,1,simple_qa,Dr. Lorne Warneke,"{'topic': 'Other', 'answer_type': 'Person', 'u..."
88,"What day, month, and year did the Christchurch...",1,simple_qa,19 of July of 2023,"{'topic': 'Other', 'answer_type': 'Date', 'url..."
89,"On what day, month, and year did the Canadian ...",1,simple_qa,"May 26, 2011","{'topic': 'Other', 'answer_type': 'Date', 'url..."


## Evaluation

In [6]:
# Reload the curated dataset from disk; the evaluation cells below read `eval_dataset`.
with open('samples/eval_dataset.json', 'r') as dataset_file:
    eval_dataset = json.loads(dataset_file.read())

pd.DataFrame(eval_dataset).tail()

Unnamed: 0,prompt,search_helpful,source,label,metadata
85,"On what date (month, day, year) was politician...",1,simple_qa,28 August 1901.,"{'topic': 'Politics', 'answer_type': 'Date', '..."
86,What were the month and year when Telegram ann...,1,simple_qa,September 2015.,"{'topic': 'Science and technology', 'answer_ty..."
87,Who opened the first gender clinic in Canada a...,1,simple_qa,Dr. Lorne Warneke,"{'topic': 'Other', 'answer_type': 'Person', 'u..."
88,"What day, month, and year did the Christchurch...",1,simple_qa,19 of July of 2023,"{'topic': 'Other', 'answer_type': 'Date', 'url..."
89,"On what day, month, and year did the Canadian ...",1,simple_qa,"May 26, 2011","{'topic': 'Other', 'answer_type': 'Date', 'url..."


In [18]:
class SimpleQAGrader():
    """
    Grades model answers to questions that have a reference answer, using an LLM judge.

    Each (question, target, predicted answer) triple is formatted into ``GRADER_PROMPT``
    and sent to the grader model, whose reply is reduced to a single letter.
    ``evaluate`` counts "A" as correct, "B" as incorrect and "C" as not attempted.
    """
    GRADER_PROMPT = prompts.SIMPLE_QA_GRADER_PROMPT

    def __init__(self, grader_model_name: str = "gpt-4o"):
        """Remember which chat model is used as the grader."""
        self.grader_model_name = grader_model_name

    def grader_model(self, prompt_messages: List[Dict]) -> str:
        """Send `prompt_messages` to the grader model and return the first choice's message content."""
        completion = OpenAI_CLIENT.chat.completions.create(
            model=self.grader_model_name,
            messages=prompt_messages
        )
        return completion.choices[0].message.content

    @staticmethod
    def _parse_grade(grading_response) -> str:
        """
        Reduce the grader's reply to one of "A", "B" or "C".

        A reply that is exactly one of the letters (ignoring surrounding whitespace) is
        returned as is. Otherwise the first standalone A/B/C is used, so that capital
        letters inside words (the "A" of "Answer: B") are not mistaken for the grade.
        Anything else, including an empty or missing reply, falls back to "C".
        """
        text = (grading_response or "").strip()
        if text in ("A", "B", "C"):
            return text
        match = re.search(r"\b([ABC])\b", text)
        return match.group(1) if match else "C"

    def grade_sample(self, question: str, target: str, predicted_answer: str) -> str:
        """Grade one predicted answer against its target and return "A", "B" or "C"."""
        grader_prompt = self.GRADER_PROMPT.format(
            question=question,
            target=target,
            predicted_answer=predicted_answer,
        )
        prompt_messages = [
            {"role": "user", "content": grader_prompt}
        ]
        grading_response = self.grader_model(prompt_messages)
        return self._parse_grade(grading_response)

    def evaluate(self, questions: List[str], targets: List[str], sampler_model: callable) -> Dict[str, List]:
        """
        Sample an answer for every question, grade it, and print the summary rates.

        Args:
            questions: prompts passed one by one to `sampler_model`.
            targets: reference answers, aligned with `questions`.
            sampler_model: callable taking a question and returning the model's answer.

        Returns:
            Dict with 'predicted_answers' and 'grade_results', both in question order.

        Raises:
            ValueError: if `questions` and `targets` differ in length (zip would
                otherwise silently drop the unmatched tail and skew the rates).
        """
        if len(questions) != len(targets):
            raise ValueError(
                'questions and targets must have the same length, got {} and {}'.format(
                    len(questions), len(targets)
                )
            )

        print('Sampling...')
        predicted_answers = []
        for question in tqdm(questions):
            predicted_answers.append(sampler_model(question))

        print('Grading...')
        grade_results = [
            self.grade_sample(question, target, predicted_answer)
            for question, target, predicted_answer in zip(questions, targets, predicted_answers)
        ]

        total = len(questions)

        def percentage(grade: str) -> float:
            # With no questions there is nothing to divide by; report 0 instead of raising.
            return grade_results.count(grade) / total * 100 if total else 0.0

        print('Accuracy: {:.2f}%'.format(percentage("A")))
        print('Incorrect: {:.2f}%'.format(percentage("B")))
        print('Not attempted: {:.2f}%'.format(percentage("C")))
        return {'predicted_answers': predicted_answers, 'grade_results': grade_results}
        

In [27]:
class SamplerModel():
    """
    Samples answers from a model served behind a Gradio app.

    Every call to ``sample`` opens a fresh ``Client`` connection, submits the prompt
    for the configured model and returns the text read from the bot response.
    """
    def __init__(self, model_name: str = "gpt-4o-agent", server_url: str = "http://0.0.0.0:7860"):
        """
        Args:
            model_name: value sent as `model_selector` with each prompt.
            server_url: address of the Gradio app; defaults to the local instance
                this notebook was written against.
        """
        self.model_name = model_name
        self.server_url = server_url

    def sample(self, prompt : str) -> str:
        """Send `prompt` to the app in a new session and return the model's reply."""
        # Client construction prints to stdout; silence it so progress bars stay readable.
        with suppress_stdout():
            client = Client(self.server_url) # start a new session
        client.predict(
            model_selector=self.model_name,
            text=prompt,
            api_name="/add_text_1")
        out = client.predict(api_name="/bot_response_2")
        # NOTE(review): out[0][1] is presumably the bot's reply in the first chat turn -- confirm
        # against the app's /bot_response_2 return schema.
        return out[0][1]

In [63]:
def gpt_3_5_sampler_model(question: str) -> str:
    """Ask gpt-3.5-turbo the question as a single user message and return the reply text."""
    response = OpenAI_CLIENT.chat.completions.create(
        messages=[{'role': 'user', 'content': question}],
        model='gpt-3.5-turbo',
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Both samplers go through SamplerModel (the Gradio app) and differ only in the
# model name sent with each request.
gpt_4o_sampler_model = SamplerModel(model_name='gpt-4o')
gpt_4o_agent_sampler_model = SamplerModel(model_name='gpt-4o-agent')

### Simple QA eval

In [None]:
# Only the simple_qa samples are graded: they are the ones whose 'label' holds a
# reference answer (arena samples carry the placeholder 'none').
eval_simple_qa = [sample for sample in eval_dataset if sample['source'] == 'simple_qa']
grader = SimpleQAGrader(grader_model_name='gpt-4o')

In [None]:
# Baseline: gpt-3.5-turbo queried directly through the OpenAI client.
gpt_3_5_results = grader.evaluate(questions=[sample['prompt'] for sample in eval_simple_qa],
                                  targets=[sample['label'] for sample in eval_simple_qa],
                                  sampler_model=gpt_3_5_sampler_model)

Sampling...


100%|██████████| 30/30 [00:19<00:00,  1.58it/s]


Grading...
Accuracy: 3.33%
Incorrect: 93.33%
Not attempted: 3.33%


In [None]:
# gpt-4o sampled through the Gradio app (SamplerModel with model_name='gpt-4o').
# Note the variable is named gpt_4_results although the model is gpt-4o.
gpt_4_results = grader.evaluate(questions=[sample['prompt'] for sample in eval_simple_qa],
                                  targets=[sample['label'] for sample in eval_simple_qa],
                                  sampler_model=gpt_4o_sampler_model.sample)

Sampling...


100%|██████████| 30/30 [00:33<00:00,  1.13s/it]


Grading...
Accuracy: 43.33%
Incorrect: 56.67%
Not attempted: 0.00%


In [None]:
# gpt-4o-agent sampled through the same Gradio app (presumably the search-enabled
# variant -- confirm against the app's model list).
gpt_4_agent_results = grader.evaluate(questions=[sample['prompt'] for sample in eval_simple_qa],
                                      targets=[sample['label'] for sample in eval_simple_qa],
                                      sampler_model=gpt_4o_agent_sampler_model.sample)

Sampling...


100%|██████████| 30/30 [03:34<00:00,  7.16s/it]


Grading...
Accuracy: 53.33%
Incorrect: 33.33%
Not attempted: 13.33%


In [45]:
# One row per simple_qa question, with each model's answer and grade added as
# '<prefix>_answer' / '<prefix>_grade_result' columns.
eval_simple_qa_df = pd.DataFrame(eval_simple_qa)
for column_prefix, model_results in (
    ('gpt_3_5', gpt_3_5_results),
    ('gpt_4o', gpt_4_results),
    ('gpt_4o_agent', gpt_4_agent_results),
):
    eval_simple_qa_df[column_prefix + '_answer'] = model_results['predicted_answers']
    eval_simple_qa_df[column_prefix + '_grade_result'] = model_results['grade_results']
eval_simple_qa_df.head()

Unnamed: 0,prompt,search_helpful,source,label,metadata,gpt_3_5_answer,gpt_3_5_grade_result,gpt_4o_answer,gpt_4o_grade_result,gpt_4o_agent_answer,gpt_4o_agent_grade_result
0,At what age was Ken Noda invited by President ...,1,simple_qa,20,"{'topic': 'Art', 'answer_type': 'Number', 'url...",Ken Noda was invited to perform at the White H...,B,Ken Noda was invited to perform at the White H...,B,Ken Noda was invited by President Ronald Reaga...,B
1,Which art dealership did Peter Arrell Browne W...,1,simple_qa,Knoedler,"{'topic': 'Art', 'answer_type': 'Other', 'urls...","Peter Arrell Browne Widener bought ""Portrait o...",B,"Peter Arrell Browne Widener purchased ""Portrai...",B,"Peter Arrell Browne Widener purchased ""Portrai...",B
2,What was composer Sigrid Ingeborg Henriette Wi...,1,simple_qa,Anna Bruun Tordenskjold,"{'topic': 'Music', 'answer_type': 'Person', 'u...",Composer Sigrid Ingeborg Henriette Wienecke's ...,B,Sigrid Ingeborg Henriette Wienecke's mother's ...,B,Sigrid Ingeborg Henriette Wienecke's mother wa...,A
3,What is the forest cover area of Madhya Prades...,1,simple_qa,77482.49,"{'topic': 'Geography', 'answer_type': 'Number'...",According to the India State of Forest Report ...,B,According to the India State of Forest Report ...,A,According to the India State of Forest Report ...,A
4,Who kills Daryl Garrs in Happy Valley?,1,simple_qa,Alison Garrs,"{'topic': 'TV shows', 'answer_type': 'Person',...",Tommy Lee Royce kills Daryl Garrs in Happy Val...,B,"In the TV series ""Happy Valley,"" Daryl Garrs i...",A,"In the television series ""Happy Valley,"" Daryl...",A


In [60]:
# Questions where plain GPT-4o was graded correct ('A') but the agent was graded incorrect ('B').
agent_worse_rows = eval_simple_qa_df[
    (eval_simple_qa_df['gpt_4o_grade_result'] == 'A') &
    (eval_simple_qa_df['gpt_4o_agent_grade_result'] == 'B')
]

# Explicit UTF-8: prompts and answers can contain non-ASCII characters, which the
# platform's default encoding is not guaranteed to be able to write.
with open('results/gpt4_agent_worse.txt', 'w', encoding='utf-8') as file:
    for _, row in agent_worse_rows.iterrows():
        file.write("Prompt: {}\n".format(row['prompt']))
        file.write("Label: {}\n".format(row['label']))
        file.write('-' * 30 + '\n')
        file.write("GPT-4o Grade Result: {}\n".format(row['gpt_4o_grade_result']))
        file.write("GPT-4o Answer: {}\n".format(row['gpt_4o_answer']))
        file.write('-' * 30 + '\n')
        file.write("GPT-4o Agent Grade Result: {}\n".format(row['gpt_4o_agent_grade_result']))
        file.write("GPT-4o Agent Answer: {}\n".format(row['gpt_4o_agent_answer']))
        file.write("\n" + "=" * 50 + "\n")

In [57]:
# Questions where plain GPT-4o was graded incorrect ('B') but the agent was graded correct ('A').
agent_better_rows = eval_simple_qa_df[
    (eval_simple_qa_df['gpt_4o_grade_result'] == 'B') &
    (eval_simple_qa_df['gpt_4o_agent_grade_result'] == 'A')
]

# Explicit UTF-8: prompts and answers can contain non-ASCII characters, which the
# platform's default encoding is not guaranteed to be able to write.
with open('results/gpt4_agent_better.txt', 'w', encoding='utf-8') as file:
    for _, row in agent_better_rows.iterrows():
        file.write("Prompt: {}\n".format(row['prompt']))
        file.write("Label: {}\n".format(row['label']))
        file.write('-' * 30 + '\n')
        file.write("GPT-4o Grade Result: {}\n".format(row['gpt_4o_grade_result']))
        file.write("GPT-4o Answer: {}\n".format(row['gpt_4o_answer']))
        file.write('-' * 30 + '\n')
        file.write("GPT-4o Agent Grade Result: {}\n".format(row['gpt_4o_agent_grade_result']))
        file.write("GPT-4o Agent Answer: {}\n".format(row['gpt_4o_agent_answer']))
        file.write("\n" + "=" * 50 + "\n")

In [109]:
# Persist the per-question answers and grades; orient='records' writes a list with one object per row.
eval_simple_qa_df.to_json('results/simpleqa_eval.json', orient='records')

### Arena evals

#### Search helpful

In [92]:
# Re-seed so the draw below does not depend on what earlier cells consumed from the global RNG.
np.random.seed(SEED)

# Draw 10 distinct search-helpful arena samples.
arena_eval_search = np.random.choice(
    [
        sample for sample in eval_dataset
        if sample['source'] == 'arena' and sample['search_helpful'] == 1
    ],
    10,
    replace=False,
)

In [93]:
# One gpt-4o answer per sampled arena prompt, in the same order as arena_eval_search.
gpt_4o_samples = [
    gpt_4o_sampler_model.sample(sample['prompt'])
    for sample in tqdm(arena_eval_search)
]

100%|██████████| 10/10 [00:50<00:00,  5.09s/it]


In [94]:
# One gpt-4o-agent answer per sampled arena prompt, in the same order as arena_eval_search.
gpt_4o_agent_samples = [
    gpt_4o_agent_sampler_model.sample(sample['prompt'])
    for sample in tqdm(arena_eval_search)
]

100%|██████████| 10/10 [02:10<00:00, 13.07s/it]


In [98]:
# Put both models' answers next to the prompts they respond to.
arena_eval_search_df = pd.DataFrame(list(arena_eval_search)).assign(
    gpt_4o_answer=gpt_4o_samples,
    gpt_4o_agent_answer=gpt_4o_agent_samples,
)
arena_eval_search_df

Unnamed: 0,prompt,search_helpful,source,label,metadata,gpt_4o_answer,gpt_4o_agent_answer
0,tell me about the philosophy of Gustav Landauer?,1,arena,none,none,Gustav Landauer (1870–1919) was a German anarc...,Gustav Landauer was a notable German anarchist...
1,"tell me the temperature in celsius, hydrometry...",1,arena,none,none,- Temperature: 18°C\n- Hydrometry: 60%\n- Suns...,Here are some suggested conditions for growing...
2,Instruction: You're a bot in Meta ads support ...,1,arena,none,none,I want to invoke help center with the keywords...,I want to invoke help center with the keywords...
3,Valvula borboleta classe 1 ou A?,1,arena,none,none,A seleção entre válvula borboleta Classe 1 e C...,As válvulas borboleta são classificadas em dif...
4,what to know about the current political and s...,1,arena,none,none,"As of the latest information, political and so...",Political and social polarization in the Unite...
5,un buen bot para hacer trading gratuito,1,arena,none,none,Existen varios bots de trading gratuitos que p...,Aquí tienes algunas opciones de bots de tradin...
6,What the best performing method on SWE-Bench now?,1,arena,none,none,"As of my last update, I don't have real-time a...",**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE RE...
7,How many parameters does the largest neural ne...,1,arena,none,none,"As of my last update, the largest publicly kno...","As of the latest information, the largest neur..."
8,What hyper-automation tools are being used in ...,1,arena,none,none,"As of the latest available information, Sandvi...",Sandvik is utilizing several advanced hyper-au...
9,"tell me the temperature in celsius, hydrometry...",1,arena,none,none,"I'm sorry, but I can't provide real-time data ...",To provide specific weather and environmental ...


In [99]:
# Side-by-side text report of both models' answers for manual comparison.
# Explicit UTF-8: the arena prompts and answers include non-English text, which the
# platform's default encoding is not guaranteed to be able to write.
with open('results/arena_search_helpful.txt', 'w', encoding='utf-8') as file:
    for prompt, gpt_4o_answer, gpt_4o_agent_answer in zip(
        arena_eval_search_df['prompt'],
        arena_eval_search_df['gpt_4o_answer'],
        arena_eval_search_df['gpt_4o_agent_answer']
    ):
        file.write('Prompt: {}\n'.format(prompt))
        file.write('-' * 30 + '\n')
        file.write('GPT-4o Answer: {}\n'.format(gpt_4o_answer))
        file.write('-' * 30 + '\n')
        file.write('GPT-4o Agent Answer: {}\n'.format(gpt_4o_agent_answer))
        file.write('=' * 50 + '\n')

In [101]:
# Persist the search-helpful comparison; orient='records' writes a list with one object per row.
arena_eval_search_df.to_json('results/arena_eval_search_helpful.json', orient='records')

#### Search not helpful

In [102]:
# Re-seed so the draw below does not depend on what earlier cells consumed from the global RNG.
np.random.seed(SEED)

# Draw 10 distinct arena samples for which search is not helpful.
arena_eval_no_search = np.random.choice(
    [
        sample for sample in eval_dataset
        if sample['source'] == 'arena' and sample['search_helpful'] == 0
    ],
    10,
    replace=False,
)

In [104]:
# One gpt-4o answer per sampled arena prompt, in the same order as arena_eval_no_search.
gpt_4o_samples = [
    gpt_4o_sampler_model.sample(sample['prompt'])
    for sample in tqdm(arena_eval_no_search)
]

100%|██████████| 10/10 [00:40<00:00,  4.00s/it]


In [105]:
# One gpt-4o-agent answer per sampled arena prompt, in the same order as arena_eval_no_search.
gpt_4o_agent_samples = [
    gpt_4o_agent_sampler_model.sample(sample['prompt'])
    for sample in tqdm(arena_eval_no_search)
]

100%|██████████| 10/10 [00:47<00:00,  4.76s/it]


In [106]:
# Put both models' answers next to the prompts they respond to.
arena_eval_no_search_df = pd.DataFrame(list(arena_eval_no_search)).assign(
    gpt_4o_answer=gpt_4o_samples,
    gpt_4o_agent_answer=gpt_4o_agent_samples,
)
arena_eval_no_search_df

Unnamed: 0,prompt,search_helpful,source,label,metadata,gpt_4o_answer,gpt_4o_agent_answer
0,### Instruction: You are an expert economist. ...,0,arena,none,none,Corruption - causes - blackout\n\nLack of main...,"Corruption, a lack of maintenance, the exodus ..."
1,Give me an introduction over 200 words for Sha...,0,arena,none,none,"Shandong Jiulong Fine Chemical Co., Ltd. is a ...","Shandong Jiulong Fine Chemical Co., Ltd., loca..."
2,"You act as an e-commerce expert, replace the s...",0,arena,none,none,"Certainly! If someone is searching for ""Nike,""...","To replace the search term ""nike"" with similar..."
3,This is the question:\n\nDo you know the diffe...,0,arena,none,none,"[2, 3]","[2, 3]\n"
4,You are a SQLite expert. Given an input questi...,0,arena,none,none,"Question: ""What are the names of the tracks in...","Question: ""What are the names of tracks compos..."
5,prepare a 3 day tour of Basel switzerland,0,arena,none,none,"Sure, here's a suggested itinerary for a 3-day...","Planning a 3-day tour of Basel, Switzerland, c..."
6,What items in this sentence are commonly found...,0,arena,none,none,"```json\n{\n ""kitchen_items"": [\n ""s...","```json\n{\n ""kitchen_items"": [\n ""shot gl..."
7,"Como un psicologo eclectico experto, responde ...",0,arena,none,none,"Desde un enfoque ecléctico, que integra divers...","Desde una perspectiva ecléctica en psicología,..."
8,"Hello, how are you?",0,arena,none,none,"Hello! I'm just a program, so I don't have fee...","Hello! I'm just a computer program, so I don't..."
9,THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. P...,0,arena,none,none,It looks like you might be experiencing an iss...,It seems like there was an issue with connecti...


In [107]:
# Side-by-side text report of both models' answers for manual comparison.
# Explicit UTF-8: the arena prompts and answers include non-English text, which the
# platform's default encoding is not guaranteed to be able to write.
with open('results/arena_search_not_helpful.txt', 'w', encoding='utf-8') as file:
    for prompt, gpt_4o_answer, gpt_4o_agent_answer in zip(
        arena_eval_no_search_df['prompt'],
        arena_eval_no_search_df['gpt_4o_answer'],
        arena_eval_no_search_df['gpt_4o_agent_answer']
    ):
        file.write('Prompt: {}\n'.format(prompt))
        file.write('-' * 30 + '\n')
        file.write('GPT-4o Answer: {}\n'.format(gpt_4o_answer))
        file.write('-' * 30 + '\n')
        file.write('GPT-4o Agent Answer: {}\n'.format(gpt_4o_agent_answer))
        file.write('=' * 50 + '\n')

In [108]:
# Persist the search-not-helpful comparison; orient='records' writes a list with one object per row.
arena_eval_no_search_df.to_json('results/arena_eval_search_not_helpful.json', orient='records')