## Setup

In [56]:
import os

import pandas as pd

# Shared configuration: SEED is passed as random_state to every sampling call,
# SAMPLE_SIZE is the number of SimpleQA rows drawn and is embedded in the
# SimpleQA output filename.
SEED, SAMPLE_SIZE = 42, 250

import warnings

# Install a blanket "ignore" filter so no warning is shown by later cells.
warnings.filterwarnings(action='ignore')

## Simple QA

In [57]:
# Read the full SimpleQA test set from the local CSV, report its row count,
# and preview the first rows.
simple_qa = pd.read_csv('data/simple_qa_test_set.csv')
print(f'Full SimpleQA test set size: {len(simple_qa)}')
display(simple_qa.head())

Full SimpleQA test set size: 4326


Unnamed: 0,metadata,problem,answer
0,"{'topic': 'Science and technology', 'answer_ty...",Who received the IEEE Frank Rosenblatt Award i...,Michio Sugeno
1,"{'topic': 'Science and technology', 'answer_ty...",Who was awarded the Oceanography Society's Jer...,Annick Bricaud
2,"{'topic': 'Geography', 'answer_type': 'Place',...",What's the name of the women's liberal arts co...,Radcliffe College
3,"{'topic': 'Sports', 'answer_type': 'Person', '...",In whose honor was the Leipzig 1877 tournament...,Adolf Anderssen
4,"{'topic': 'Art', 'answer_type': 'Person', 'url...","According to Karl Küchler, what did Empress El...",Poet Henrich Heine.


In [58]:
# Draw a seeded random subset of SimpleQA and reshape it into the eval-input
# schema: question_id, prompt, label, metadata.
# NOTE(review): at least one label carries trailing whitespace (a newline after
# "Week 2" in the rendered output) — consider stripping labels before export.
simple_qa_sample = (
    simple_qa
    .sample(n=SAMPLE_SIZE, random_state=SEED)
    .reset_index(drop=True)  # renumber rows 0..n-1 so the ids below are positional
    .rename(columns={"problem": "prompt", "answer": "label"})
    .assign(question_id=lambda df: "simple_qa_" + df.index.astype(str))
    .loc[:, ['question_id', 'prompt', 'label', 'metadata']]
)
print('Number of SimpleQA samples: {}\n\n'.format(len(simple_qa_sample)))

simple_qa_sample

Number of SimpleQA samples: 250




Unnamed: 0,question_id,prompt,label,metadata
0,simple_qa_0,At what age was Ken Noda invited by President ...,20,"{'topic': 'Art', 'answer_type': 'Number', 'url..."
1,simple_qa_1,Which art dealership did Peter Arrell Browne W...,Knoedler,"{'topic': 'Art', 'answer_type': 'Other', 'urls..."
2,simple_qa_2,What was composer Sigrid Ingeborg Henriette Wi...,Anna Bruun Tordenskjold,"{'topic': 'Music', 'answer_type': 'Person', 'u..."
3,simple_qa_3,What is the forest cover area of Madhya Prades...,77482.49,"{'topic': 'Geography', 'answer_type': 'Number'..."
4,simple_qa_4,Who kills Daryl Garrs in Happy Valley?,Alison Garrs,"{'topic': 'TV shows', 'answer_type': 'Person',..."
...,...,...,...,...
245,simple_qa_245,What cabinet position did Sir Hector-Louis Lan...,Minister of Public Works,"{'topic': 'Politics', 'answer_type': 'Other', ..."
246,simple_qa_246,In what year was Natalia Dmitriyevna Shpiller ...,1951,"{'topic': 'Art', 'answer_type': 'Date', 'urls'..."
247,simple_qa_247,"In Season 3, Episode 7 of ""Love Is Blind"" (the...",Week 2\n,"{'topic': 'TV shows', 'answer_type': 'Number',..."
248,simple_qa_248,In which month and year was Sayyid Ghulam Muhi...,November 1888,"{'topic': 'Politics', 'answer_type': 'Date', '..."


In [51]:
# Write the SimpleQA sample as JSON Lines (one record per line).
# to_json does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
os.makedirs("eval_inputs", exist_ok=True)
simple_qa_sample.to_json(
    "eval_inputs/simple_qa_{}.jsonl".format(SAMPLE_SIZE),
    orient="records",
    lines=True,
)

## Arena Hard

In [59]:
# Read the Arena Hard prompts from the local JSON Lines file, report the row
# count, and preview the first rows.
arena_hard = pd.read_json('data/arena_hard_prompts.jsonl', lines=True)
print(f'Full Arena Hard test set size: {len(arena_hard)}')
display(arena_hard.head())

Full Arena Hard test set size: 500


Unnamed: 0,question_id,category,cluster,turns
0,328c149ed45a41c0b9d6f14659e63599,arena-hard-v0.1,ABC Sequence Puzzles & Groups,[{'content': 'Use ABC notation to write a melo...
1,b43c07656ead4150b360294ee932b410,arena-hard-v0.1,ABC Sequence Puzzles & Groups,[{'content': 'SOLVE THIS IN C++ : There are th...
2,1f07cf6d146d4038b2b93aaba3935ce0,arena-hard-v0.1,AI & Sequence Alignment Challenges,[{'content': 'Explain the book the Alignment p...
3,9f25ff7c0d6a4d74846bfe76af8d925c,arena-hard-v0.1,AI & Sequence Alignment Challenges,[{'content': 'Design a semikinematic mounting ...
4,04ba0aeb79524f6c8520d47cada34f25,arena-hard-v0.1,AI Image Upscaling,[{'content': 'I have a dataset which contains ...


In [60]:
# Number of Arena Hard prompts to draw; named so the sample size is set in one
# obvious place instead of being a bare literal inside the call.
ARENA_HARD_SAMPLE_SIZE = 50

# Alternative considered: one prompt per cluster via
# arena_hard.drop_duplicates(subset=['cluster']). A seeded random draw is used instead.
arena_hard_sample = (
    arena_hard
    .sample(n=ARENA_HARD_SAMPLE_SIZE, random_state=SEED)
    .reset_index(drop=True)  # renumber rows 0..n-1 so the ids below are positional
    .assign(
        # `turns` holds a list of message dicts; keep only the first turn's text.
        prompt=lambda df: df["turns"].apply(lambda turns: turns[0]["content"]),
        # Overwrites the dataset's own question_id with a positional id.
        question_id=lambda df: "arena_hard_" + df.index.astype(str),
    )
    .loc[:, ['question_id', 'prompt']]
)
print('Number of Arena Hard samples: {}\n\n'.format(len(arena_hard_sample)))

arena_hard_sample

Number of Arena Hard samples: 50




Unnamed: 0,question_id,prompt
0,arena_hard_0,remove dead code from the following: #include ...
1,arena_hard_1,Devise a way to parse the dataframe in python ...
2,arena_hard_2,give me code to generate random permutation fo...
3,arena_hard_3,"Just quickly, do you agree with this sentence:..."
4,arena_hard_4,There is a game where a player is assigned a l...
5,arena_hard_5,Translate this code into proper Rust:\nenum Co...
6,arena_hard_6,Please describe the most common optimizations ...
7,arena_hard_7,Make code in a synapse notebook that deletes a...
8,arena_hard_8,Act as Chief Information Officer and write 3 S...
9,arena_hard_9,I live in Germany and I am a german tax reside...


In [61]:
# Write the Arena Hard sample as JSON Lines (one record per line).
# to_json does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
os.makedirs("eval_inputs", exist_ok=True)
arena_hard_sample.to_json(
    # Take the count from the frame itself so the filename always matches the
    # number of rows actually written.
    "eval_inputs/arena_hard_{}.jsonl".format(len(arena_hard_sample)),
    orient="records",
    lines=True,
)