In [28]:
import ast
import re
import json
import pandas as pd
from typing import List, Dict
from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv('keys.env')

from openai import OpenAI
client = OpenAI()

import prompts
SEED = 42

In [29]:
# Number of prompts taken from the head of each prompt file. The original
# comments say the first 30 search prompts come from chatbot arena, and that
# the no-search slice is sized to match them.
N_ARENA_SAMPLES = 30


def _load_prompt_list(path: str) -> list:
    """Read `path` and parse its whole content as a single Python literal.

    Returns whatever the literal evaluates to; the rest of this cell treats
    the result as a list of prompt strings (it is sliced and len()-ed).
    """
    with open(path, 'r') as f:
        # NOTE(review): the files have a .json extension but are parsed with
        # ast.literal_eval, which only accepts Python literal syntax (it
        # rejects JSON true/false/null). Presumably the files were written as
        # Python list reprs - confirm before switching to json.load.
        return ast.literal_eval(f.read())


search_prompts = _load_prompt_list('samples/search_helpful_prompts.json')
no_search_prompts = _load_prompt_list('samples/search_not_helpful_prompts.json')

print('Number of search prompts: {}'.format(len(search_prompts)))
print('Number of no search prompts: {}'.format(len(no_search_prompts)))

# processing initial samples
# `samples` is rebuilt from scratch here, so re-running this cell resets the dataset.
samples = []
# take only the samples from chatbot arena
samples.extend(
    {'prompt': prompt, 'search_helpful': 1, 'source': 'arena', 'label': 'none', 'metadata': 'none'}
    for prompt in search_prompts[:N_ARENA_SAMPLES]
)
# match the number of search samples from arena
samples.extend(
    {'prompt': prompt, 'search_helpful': 0, 'source': 'arena', 'label': 'none', 'metadata': 'none'}
    for prompt in no_search_prompts[:N_ARENA_SAMPLES]
)

Number of search prompts: 50
Number of no search prompts: 51


In [30]:
# match the number of no search and search samples from arena
N_SIMPLE_QA_SAMPLES = 30

simple_qa = pd.read_csv('samples/simple_qa_test_set.csv')
print('Number of simple_qa samples: {}'.format(len(simple_qa)))
display(simple_qa.head())

# Seeded draw so the same rows are selected on every run.
simple_qa_sample = simple_qa.sample(N_SIMPLE_QA_SAMPLES, random_state=SEED)

# Remove simple_qa entries left over from a previous execution of this cell;
# without this, re-running the cell appends the same rows again and the
# dataset silently grows past the intended size.
samples = [s for s in samples if s['source'] != 'simple_qa']

for _, row in simple_qa_sample.iterrows():
    # 'label' carries the reference answer; 'metadata' is stored exactly as
    # read from the CSV column (no parsing is done here).
    samples.append({'prompt': row['problem'], 'search_helpful': 1, 'source': 'simple_qa', 'label': row['answer'], 'metadata': row['metadata']})

Number of simple_qa samples: 4326


Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.


In [33]:
# Tally the composition of the dataset before it is written out.
no_search_count = sum(1 for s in samples if s['search_helpful'] == 0)
arena_search_count = sum(
    1 for s in samples if s['source'] == 'arena' and s['search_helpful'] == 1
)
simple_qa_search_count = sum(
    1 for s in samples if s['source'] == 'simple_qa' and s['search_helpful'] == 1
)

print('Number of samples: {}'.format(len(samples)))
print('Number of no search prompts: {}'.format(no_search_count))
print('Number of search prompts from chatbot arena: {}'.format(arena_search_count))
print('Number of search prompts from simple_qa: {}'.format(simple_qa_search_count))

# saving the samples
with open('samples/eval_dataset.json', 'w') as out_file:
    json.dump(samples, out_file, indent=4)

# Last expression of the cell: shows the final rows as a rendered table.
pd.DataFrame(samples).tail()

Number of samples: 90
Number of no search prompts: 30
Number of search prompts from chatbot arena: 30
Number of search prompts from simple_qa: 30


Unnamed: 0,prompt,search_helpful,source,label,metadata
85,"On what date (month, day, year) was politician...",1,simple_qa,28 August 1901.,"{'topic': 'Politics', 'answer_type': 'Date', '..."
86,What were the month and year when Telegram ann...,1,simple_qa,September 2015.,"{'topic': 'Science and technology', 'answer_ty..."
87,Who opened the first gender clinic in Canada a...,1,simple_qa,Dr. Lorne Warneke,"{'topic': 'Other', 'answer_type': 'Person', 'u..."
88,"What day, month, and year did the Christchurch...",1,simple_qa,19 of July of 2023,"{'topic': 'Other', 'answer_type': 'Date', 'url..."
89,"On what day, month, and year did the Canadian ...",1,simple_qa,"May 26, 2011","{'topic': 'Other', 'answer_type': 'Date', 'url..."


In [20]:
class SimpleQAGrader():
    """Grade short-form answers with a chat model acting as judge.

    For each question the judge is shown the question, the reference answer
    and the predicted answer, and its reply is reduced to a single letter.
    `evaluate` reports the letters as: "A" = correct, "B" = incorrect,
    "C" = not attempted.
    """

    # Template filled with `question`, `target` and `predicted_answer` via str.format.
    GRADER_PROMPT = prompts.SIMPLE_QA_GRADER_PROMPT
    # Accept a grade letter only when it stands alone as a token. An unanchored
    # (A|B|C) picks up the first capital A/B/C inside any word, so a reply such
    # as "Answer: B" would be graded "A" and "INCORRECT" would be graded "C".
    GRADE_PATTERN = re.compile(r"\b([ABC])\b")
    # Grade used when no letter can be extracted from the judge's reply.
    DEFAULT_GRADE = "C"

    def __init__(self, grader_model_name: str = "gpt-4o"):
        """Store the name of the chat model used as judge."""
        self.grader_model_name = grader_model_name

    def grader_model(self, prompt_messages: List[Dict]) -> str:
        """Send `prompt_messages` to the judge model and return its reply text.

        Uses the module-level `client`. Returns an empty string when the API
        returns no text content, so callers always receive a str.
        """
        completion = client.chat.completions.create(
            model=self.grader_model_name,
            messages=prompt_messages
        )
        # `content` is nullable in the API response; normalise to "".
        return completion.choices[0].message.content or ""

    def grade_sample(self, question: str, target: str, predicted_answer: str) -> str:
        """Return the judge's grade letter for one question.

        Args:
            question: The question that was asked.
            target: The reference answer.
            predicted_answer: The answer produced by the model under test.

        Returns:
            "A", "B" or "C". Falls back to "C" when the reply contains no
            standalone grade letter (including an empty reply).
        """
        grader_prompt = self.GRADER_PROMPT.format(
            question=question,
            target=target,
            predicted_answer=predicted_answer,
        )
        prompt_messages = [
            {"role": "user", "content": grader_prompt}
        ]
        # Guard against None in case `grader_model` is overridden or stubbed.
        grading_response = self.grader_model(prompt_messages) or ""
        match = self.GRADE_PATTERN.search(grading_response)
        return match.group(1) if match else self.DEFAULT_GRADE

    def evaluate(self, questions: List[str], targets: List[str], sampler_model: callable) -> Dict[str, List]:
        """Sample an answer for every question, grade it, and print a summary.

        Args:
            questions: Questions to ask the model under test.
            targets: Reference answers, aligned one-to-one with `questions`.
            sampler_model: Callable taking a question string and returning
                the predicted answer.

        Returns:
            Dict with 'predicted_answers' and 'grade_results', both lists
            aligned with `questions`.

        Raises:
            ValueError: If `questions` and `targets` differ in length. zip()
                would otherwise drop the unmatched tail silently while the
                percentages were still computed over len(questions).
        """
        total = len(questions)
        if total != len(targets):
            raise ValueError(
                'questions and targets must have the same length '
                '(got {} and {})'.format(total, len(targets))
            )
        if total == 0:
            # Nothing to sample or grade; also avoids dividing by zero below.
            print('No questions to evaluate.')
            return {'predicted_answers': [], 'grade_results': []}

        print('Sampling...')
        predicted_answers = [sampler_model(question) for question in tqdm(questions)]

        print('Grading...')
        grade_results = [
            self.grade_sample(question, target, predicted_answer)
            for question, target, predicted_answer in zip(questions, targets, predicted_answers)
        ]

        n_correct = grade_results.count("A")
        n_incorrect = grade_results.count("B")
        n_not_attempted = grade_results.count("C")
        print('Accuracy: {:.2f}%'.format(n_correct / total * 100))
        print('Incorrect: {:.2f}%'.format(n_incorrect / total * 100))
        print('Not attempted: {:.2f}%'.format(n_not_attempted / total * 100))
        return {'predicted_answers': predicted_answers, 'grade_results': grade_results}
        

In [21]:
def sampler_model(question: str, model: str = 'gpt-3.5-turbo') -> str:
    """Ask a chat model a single question and return its reply text.

    Args:
        question: Sent as the only user message, with no system prompt.
        model: Chat model name passed to the API; defaults to the model this
            notebook evaluates.

    Returns:
        The text of the first choice, or an empty string when the API returns
        no text content (the field is nullable), so the result is always a str.
    """
    prompt_messages = [{'role': 'user', 'content': question}]
    completion = client.chat.completions.create(
        model=model,
        messages=prompt_messages
    )
    return completion.choices[0].message.content or ""

# Only the simple_qa entries are evaluated: they carry a reference answer in
# 'label', whereas the arena entries were stored with label 'none'.
grader = SimpleQAGrader(grader_model_name='gpt-4o')
sample = [entry for entry in samples if entry['source'] == 'simple_qa']
results = grader.evaluate(
    questions=[entry['prompt'] for entry in sample],
    targets=[entry['label'] for entry in sample],
    sampler_model=sampler_model,
)


Sampling...


100%|██████████| 30/30 [00:18<00:00,  1.60it/s]


Grading...
Accuracy: 6.67%
Incorrect: 93.33%
Not attempted: 0.00%
