In [1]:
import json
import os

import pandas as pd

# Seed passed as `random_state` when subsampling SimpleQA, so the draw is repeatable.
SEED = 42
# Number of SimpleQA rows drawn into the search-helpful evaluation set.
SIMPLE_QA_SAMPLE_SIZE = 500

In [2]:
# Load the full SimpleQA test set and preview its first rows.
simple_qa = pd.read_csv('data/simple_qa_test_set.csv')
print('Full SimpleQA test set size: {}'.format(len(simple_qa)))
display(simple_qa.head())

# Draw a fixed-size, seeded subsample of the questions.
simple_qa_sample = simple_qa.sample(n=SIMPLE_QA_SAMPLE_SIZE, random_state=SEED)
print('Number of SimpleQA samples: {}\n\n'.format(len(simple_qa_sample)))

# One record per sampled question; every SimpleQA item is tagged
# search_helpful=True and keeps its reference answer as the label.
# NOTE(review): `metadata` is taken as-is from the CSV, so it is presumably the
# string form of a dict rather than a parsed dict -- confirm consumers expect that.
search_dataset = [
    {
        'prompt': row['problem'],
        'search_helpful': True,
        'source': 'simple_qa',
        'label': row['answer'],
        'metadata': row['metadata'],
    }
    for _, row in simple_qa_sample.iterrows()
]

# Show the first record field by field as a sanity check.
print('Example:')
for field, value in search_dataset[0].items():
    print('{}: {}'.format(field, value))

Full SimpleQA test set size: 4326


Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.


Number of SimpleQA samples: 500


Example:
prompt: At what age was Ken Noda invited by President Ronald Reagan and First Lady Nancy Reagan to perform at the White House?
search_helpful: True
source: simple_qa
label: 20
metadata: {'topic': 'Art', 'answer_type': 'Number', 'urls': ['https://en.wikipedia.org/wiki/Ken_Noda', 'https://en.wikipedia.org/wiki/Ken_Noda', 'https://www.reaganlibrary.gov/reagans/reagan-administration/entertainers-white-house', 'https://www.nytimes.com/1982/10/28/arts/ken-noda-20-to-play-at-white-house.html']}


In [3]:
# Load the Arena-Hard prompts (JSON Lines input) and preview the first rows.
arena_hard = pd.read_json('data/arena_hard_prompts.jsonl', lines=True)
print('Full Arena-Hard test set size: {}'.format(len(arena_hard)))
display(arena_hard.head())

# Use the 'content' of the first entry in each `turns` list as the prompt text.
arena_hard['prompt'] = arena_hard['turns'].map(lambda turns: turns[0]['content'])

# One record per Arena-Hard prompt; these are tagged search_helpful=False and
# carry no label or metadata.
no_search_dataset = [
    {
        'prompt': prompt_text,
        'search_helpful': False,
        'source': 'arena_hard',
        'label': None,
        'metadata': None,
    }
    for prompt_text in arena_hard['prompt']
]

# Show the first record field by field as a sanity check.
print('Example:')
for field, value in no_search_dataset[0].items():
    print('{}: {}'.format(field, value))

Full Arena-Hard test set size: 500


Unnamed: 0,question_id,category,cluster,turns
0,328c149ed45a41c0b9d6f14659e63599,arena-hard-v0.1,ABC Sequence Puzzles & Groups,[{'content': 'Use ABC notation to write a melo...
1,b43c07656ead4150b360294ee932b410,arena-hard-v0.1,ABC Sequence Puzzles & Groups,[{'content': 'SOLVE THIS IN C++ : There are th...
2,1f07cf6d146d4038b2b93aaba3935ce0,arena-hard-v0.1,AI & Sequence Alignment Challenges,[{'content': 'Explain the book the Alignment p...
3,9f25ff7c0d6a4d74846bfe76af8d925c,arena-hard-v0.1,AI & Sequence Alignment Challenges,[{'content': 'Design a semikinematic mounting ...
4,04ba0aeb79524f6c8520d47cada34f25,arena-hard-v0.1,AI Image Upscaling,[{'content': 'I have a dataset which contains ...


Example:
prompt: Use ABC notation to write a melody in the style of a folk tune.
search_helpful: False
source: arena_hard
label: None
metadata: None


In [4]:
def write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines (one JSON document per line).

    The parent directory of ``path`` is created when missing, so the export
    does not fail with ``FileNotFoundError`` on a fresh checkout, and the file
    encoding is pinned to UTF-8 instead of the platform default.

    Args:
        path: Destination file path. An existing file is overwritten.
        records: Iterable of JSON-serialisable objects, written in order.

    Raises:
        TypeError: If a record cannot be serialised by ``json.dumps``.
        OSError: If the directory or file cannot be created or written.
    """
    out_dir = os.path.dirname(path)
    if out_dir:  # a bare filename has no directory component to create
        os.makedirs(out_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        for item in records:
            f.write('{}\n'.format(json.dumps(item)))


# Persist both evaluation sets.
write_jsonl('eval_sets/search_dataset.jsonl', search_dataset)
write_jsonl('eval_sets/no_search_dataset.jsonl', no_search_dataset)