# Create shuffled MMLU dataset  

We use the source `tasksource/mmlu` dataset to create a new dataset called `shuffled_mmlu` : 
[link to the dataset on HuggingFace](https://huggingface.co/datasets/the-french-artist/shuffled_mmlu)  

The idea is to shuffle the possible answers to a question.  

In MMLU, each question has 4 possible answer choices (A, B, C, D).  
Since 4! = 24, each question can be expanded into 24 variants, each presenting the same choices in a unique permutation of ABCD.  
We record the shuffle order of each variant as the corresponding permutation of the letters ABCD.  
E.g., the fully reversed order of choices is labelled "DCBA" in the feature `letter_order`.  

For practicality, we create a single dataset containing all 57 MMLU categories.  
This allows for a faster loading time when downloading from the hub. 
The dataset is still split into test, validation, and dev files.  

Our hypothesis: the order of the choices matters to the internal logic of an LLM. Because the model operates autoregressively and processes its input in one direction, it may be less likely to discard a wrong answer when that answer is presented first.  
By presenting the answer choices in all possible orders and voting for the most popular answer, we erase the order bias and benefit from collective intelligence.  
We hope to achieve a better overall score this way.  

## define helper functions

In [1]:
from itertools import permutations
import random

# The four answer labels used by MMLU questions
letters = list('ABCD')

# Every ordering of the four labels as a 4-letter string ('ABCD', 'ABDC', ..., 'DCBA')
all_letter_orders = list(map(''.join, permutations(letters, 4)))

# Zero-based position of each answer label
letter_to_num = dict(zip('ABCD', range(4)))

def convert_answer(reference_answer, letter_order):
    """Return the index of the correct choice after the choices are reordered.

    reference_answer: zero-based index of the correct choice in the original
        'ABCD' order.
    letter_order: the new order, written with the original labels
        (e.g. 'BACD' puts the original choice B first).

    Example: with letter_order 'BACD', an original answer of 1 (B) becomes 0.
    Raises KeyError if the original label does not occur in letter_order.
    """
    slot_labels = list('ABCD')

    # original label -> label of the slot that choice occupies after the shuffle
    slot_of_original = {}
    for original_label, slot_label in zip(letter_order, slot_labels):
        slot_of_original[original_label] = slot_label

    landing_label = slot_of_original[slot_labels[reference_answer]]
    return letter_to_num[landing_label]

# Takes a dataset row, shuffles its choices into a new letter order, and updates the answer accordingly
def translate_question(row, letter_order):
    """Reorder a question's choices and remap its answer index to match.

    row: mapping-like record (e.g. a dict or a pandas row) holding a
        'choices' sequence and an 'answer' index into that sequence.
    letter_order: the new order, written with the original labels
        (e.g. 'DCBA' reverses the choices).

    Returns the same `row` object; note that it is modified in place.
    """
    # Both new values are derived from the untouched row before anything is overwritten.
    new_choices = reorder_choices_based_on_letter_order(row['choices'], letter_order)
    new_answer = convert_answer(row['answer'], letter_order)

    row['choices'] = new_choices
    row['answer'] = new_answer
    return row


def convert_letter_to_index(letter):
    """Map an answer label ('A', 'B', 'C' or 'D') to its zero-based position.

    Raises KeyError for any other value.
    """
    position_by_label = dict(zip('ABCD', range(4)))
    return position_by_label[letter]

def reorder_choices_based_on_letter_order(choices, letter_order):
    """Return the choices rearranged into the given letter order.

    Position i of the result holds the original choice whose label is
    letter_order[i]; e.g. 'BACD' swaps the first two choices.
    Always returns a new list; `choices` itself is not modified.
    """
    return [choices[convert_letter_to_index(label)] for label in letter_order]

In [2]:
import random

# MMLU contains duplicate questions: the question, choices, answer, and category
# can all be identical, so a hash of the content alone cannot tell them apart.
# As a last resort, the hash in create_question_id is salted with a random number.
# Seeding the generator is presumably meant to make those salts repeatable across runs.
random.seed(10)
# NOTE: this print consumes the first draw after seeding, so any later
# random.random() call continues from the second draw.
print(random.random())

0.5714025946899135


In [3]:
from hashlib import sha256

def create_question_id(row):
    """Attach a salted SHA-256 hex digest to the row as 'question_id'.

    The digest covers a fresh random.random() value, the question text, and
    the comma-joined choices, so two rows with identical content still get
    different IDs. The row is modified in place and returned.
    """
    # Draw the salt first: each call consumes exactly one random number.
    salt = str(random.random())
    payload = salt + row['question'] + ",".join(map(str, row['choices']))
    row['question_id'] = sha256(payload.encode('utf-8')).hexdigest()
    return row

# Quick manual check: the returned row should carry a new 'question_id' entry
row = dict(question='this is the question', choices=[1, 2, 3, 4])
create_question_id(row)

{'question': 'this is the question',
 'choices': [1, 2, 3, 4],
 'question_id': '9adf86b754c24f21ee4014208d51a0da8398882ba3546db8eea344ac77412699'}

## load dataset(s)

In [4]:
from datasets import load_dataset, get_dataset_config_names
from tqdm.auto import tqdm

# Config names of the source dataset (one per MMLU category)
configs = get_dataset_config_names("tasksource/mmlu")

# For each split, build a list with one dataset per category, in the same order as `configs`
validation_dataset_list = [
    load_dataset("tasksource/mmlu", curr_cat, split='validation')
    for curr_cat in tqdm(configs, desc='loading categories...')
]

test_dataset_list = [
    load_dataset("tasksource/mmlu", curr_cat, split='test')
    for curr_cat in tqdm(configs, desc='loading categories...')
]

dev_dataset_list = [
    load_dataset("tasksource/mmlu", curr_cat, split='dev')
    for curr_cat in tqdm(configs, desc='loading categories...')
]

loading categories...:   0%|          | 0/57 [00:00<?, ?it/s]

loading categories...:   0%|          | 0/57 [00:00<?, ?it/s]

loading categories...:   0%|          | 0/57 [00:00<?, ?it/s]

## Shuffle and save to parquet

In [5]:
# Add a question_id to every dataset, expand each question into all answer orders, and save the split
import pandas as pd

def shuffle_and_save_dataset_list(input_dataset_list, curr_split):
    """Expand one split into every answer order and write it to a parquet file.

    input_dataset_list: one dataset per category, in the same order as the
        module-level `configs` list (the two are zipped together to label
        each category).
    curr_split: split name used in the output file name,
        'shuffled_mmlu/shuffled_mmlu_{curr_split}.parquet'.

    A question_id is assigned to each original question before shuffling, so
    all shuffled variants of a question share the same ID. Each output row
    also gets its `category` and `letter_order`.

    Returns None; the result is only written to disk.
    """
    import os  # local import: only needed to prepare the output folder

    folder_name = 'shuffled_mmlu/'
    # Create the folder up front: otherwise a missing directory is only
    # discovered by to_parquet after the whole conversion has already run.
    os.makedirs(folder_name, exist_ok=True)

    new_dataset_list = []
    for curr_dataset in input_dataset_list:
        display(curr_dataset)
        new_dataset_list.append(curr_dataset.map(create_question_id))

    final_dataset_list = []
    for curr_dataset, category in tqdm(zip(new_dataset_list, configs), desc='converting categories', total=len(configs)):
        for letter_order in all_letter_orders:
            # NOTE(review): to_pandas() is loop-invariant, but translate_question
            # modifies the rows it receives, so every permutation starts from a
            # freshly built frame. Confirm apply() never hands out views of the
            # frame before hoisting this call.
            curr_df = curr_dataset.to_pandas()
            new_df = curr_df.apply(lambda row: translate_question(row, letter_order), axis=1)
            new_df['category'] = category
            new_df['letter_order'] = letter_order
            final_dataset_list.append(new_df)

    # The frames keep their own row index, so index values repeat in the result.
    complete_shuffled_dataset = pd.concat(final_dataset_list)
    complete_shuffled_dataset.to_parquet(folder_name+f"shuffled_mmlu_{curr_split}.parquet")

In [6]:
shuffle_and_save_dataset_list(validation_dataset_list, 'validation')

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 14
})

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 16
})

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 29
})

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 16
})

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 8
})

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 22
})

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 26
})

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 12
})

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 16
})

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 41
})

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 14
})

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 10
})

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 32
})

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 22
})

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 9
})

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 18
})

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 22
})

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 21
})

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 43
})

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 29
})

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 26
})

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 17
})

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 60
})

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 23
})

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 22
})

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 26
})

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 23
})

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 12
})

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 13
})

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 18
})

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 25
})

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 86
})

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 38
})

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 33
})

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 34
})

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 35
})

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 31
})

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 170
})

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 31
})

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 69
})

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 12
})

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 27
})

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 22
})

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 11
})

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 18
})

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 19
})

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

converting categories:   0%|          | 0/57 [00:00<?, ?it/s]

In [7]:
shuffle_and_save_dataset_list(test_dataset_list, 'test')

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 135
})

Map:   0%|          | 0/135 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 152
})

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 265
})

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 144
})

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 173
})

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 102
})

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 235
})

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 114
})

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 145
})

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 378
})

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 126
})

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 310
})

Map:   0%|          | 0/310 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 203
})

Map:   0%|          | 0/203 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 165
})

Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 198
})

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 193
})

Map:   0%|          | 0/193 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 390
})

Map:   0%|          | 0/390 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 270
})

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 238
})

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 151
})

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 545
})

Map:   0%|          | 0/545 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 216
})

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 204
})

Map:   0%|          | 0/204 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 237
})

Map:   0%|          | 0/237 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 223
})

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 131
})

Map:   0%|          | 0/131 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 121
})

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 108
})

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 163
})

Map:   0%|          | 0/163 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 112
})

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 103
})

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 234
})

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 783
})

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 346
})

Map:   0%|          | 0/346 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 895
})

Map:   0%|          | 0/895 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 306
})

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 311
})

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 324
})

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 282
})

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 1534
})

Map:   0%|          | 0/1534 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 272
})

Map:   0%|          | 0/272 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 612
})

Map:   0%|          | 0/612 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 110
})

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 245
})

Map:   0%|          | 0/245 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 201
})

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 100
})

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 166
})

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 171
})

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

converting categories:   0%|          | 0/57 [00:00<?, ?it/s]

In [8]:
shuffle_and_save_dataset_list(dev_dataset_list, 'dev')

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'choices', 'answer'],
    num_rows: 5
})

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

converting categories:   0%|          | 0/57 [00:00<?, ?it/s]

## Check dataset

In [9]:
# Reload the saved test split from disk so the checks below run on the written file
complete_shuffled_dataset = pd.read_parquet('shuffled_mmlu/shuffled_mmlu_test.parquet')

### Check uniqueness of question_id

This expression is true if every original question received its own `question_id` (each ID is then shared by exactly 24 shuffled rows).

In [17]:
# Each question is stored once per answer order (24 rows), so rows / 24 must equal the number of distinct IDs
len(complete_shuffled_dataset)/24 == len(complete_shuffled_dataset.question_id.unique())

True

In [11]:
# Number of rows per question_id; every ID is expected to appear 24 times (once per letter order)
complete_shuffled_dataset.question_id.value_counts()

question_id
bc3778ec85a3abdf375449e14780a1318d32e859c2a2c16ca422eaf4349ae6bc    24
e1b9ce3aa1729e38a0ad50d9d61066187eadddb9ff7967cc0c94b97643201334    24
a7dd0fadbe7889b8a339a0a7221ec94ce15476eef1292d263ee6593940a93f16    24
59f25f85093da913ec051c4a9b57552da83bcd9afb01ddf225ed1def13c08cbd    24
7f2a08c5d15370f23d535cf6e02c6801f23a6404447c3cdc227bd74aa44e48b1    24
                                                                    ..
ce44c809afcb08fdd18d87c4067dbfbeb3edf5e20dac5a113a2837d5eaacd01c    24
7102862c0f284391e57e622ac4ca2273e58f7de296d0964cf0c6c86eeee53b17    24
db9c1a12cdbeae4ce5711642afeb045d94d46dff0bdd6177bcb9a0e17b37ce99    24
2fbe12e9363217e0c5350a4b69a531563d740b12d83e0ef3983e9c3aa334b17b    24
bd9f02cc864917b5a2b6ecad1f149247a362bd459eb62967febe1c9e5d3b19a1    24
Name: count, Length: 14042, dtype: int64

### Check translation  
Check that, for every letter order, the remapped answer index still points to the original correct choice.  
`translated_answer` should always have the same value as in the row with letter order 'ABCD'.  

In [12]:
# Preview the first few rows of the reloaded split
complete_shuffled_dataset.head()

Unnamed: 0,question,choices,answer,question_id,category,letter_order
0,Find the degree for the given field extension ...,"[0, 4, 2, 6]",1,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ABCD
1,"Let p = (1, 2, 5, 4)(2, 3) in S_5 . Find the i...","[8, 2, 24, 120]",2,9dbee06135bb2cd4f1d6fc47c5b9698485a7758ce3ee76...,abstract_algebra,ABCD
2,Find all zeros in the indicated finite field o...,"[0, 1, 0,1, 0,4]",3,4cfb894cedaec3e7dee2ba71a6a781fbf1d0ded44bac22...,abstract_algebra,ABCD
3,Statement 1 | A factor group of a non-Abelian ...,"[True, True, False, False, True, False, False,...",1,7bdc038b56be4a1a507b6d156e061fc66c43098d756822...,abstract_algebra,ABCD
4,Find the product of the given polynomials in t...,"[2x^2 + 5, 6x^2 + 4x + 6, 0, x^2 + 1]",1,ff99adc312cd773b4959d6f9398f00342297a0d0379c65...,abstract_algebra,ABCD


In [13]:
def apply_result(row):
    """Return the choice that the row's 'answer' index selects from its 'choices'."""
    choices = row['choices']
    return choices[row['answer']]

In [14]:
# Select every row whose question text equals that of the very first row,
# as an independent deep copy, so the extra column does not touch the full frame
small_sample = complete_shuffled_dataset.loc[
    complete_shuffled_dataset['question'] == complete_shuffled_dataset['question'].to_list()[0]
].copy(deep=True)
# Resolve each row's answer index back to the choice text it points to
small_sample['translated_answer'] = small_sample.apply(apply_result, axis=1)
small_sample

Unnamed: 0,question,choices,answer,question_id,category,letter_order,translated_answer
0,Find the degree for the given field extension ...,"[0, 4, 2, 6]",1,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ABCD,4
0,Find the degree for the given field extension ...,"[0, 4, 6, 2]",1,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ABDC,4
0,Find the degree for the given field extension ...,"[0, 2, 4, 6]",2,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ACBD,4
0,Find the degree for the given field extension ...,"[0, 2, 6, 4]",3,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ACDB,4
0,Find the degree for the given field extension ...,"[0, 6, 4, 2]",2,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ADBC,4
0,Find the degree for the given field extension ...,"[0, 6, 2, 4]",3,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,ADCB,4
0,Find the degree for the given field extension ...,"[4, 0, 2, 6]",0,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,BACD,4
0,Find the degree for the given field extension ...,"[4, 0, 6, 2]",0,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,BADC,4
0,Find the degree for the given field extension ...,"[4, 2, 0, 6]",0,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,BCAD,4
0,Find the degree for the given field extension ...,"[4, 2, 6, 0]",0,bc3778ec85a3abdf375449e14780a1318d32e859c2a2c1...,abstract_algebra,BCDA,4


## Make HF HUB config

In [15]:
import yaml
# Prevent anchors/aliases (pointers to repeated objects) from being written to the YAML file
yaml.Dumper.ignore_aliases = lambda *args : True

# Single 'default' config listing one parquet file per split
complete_config = {
    'configs': [{
        'config_name': 'default',
        'data_files': [
            {'split': split_name, 'path': parquet_name}
            for split_name, parquet_name in (
                ('test', 'shuffled_mmlu_test.parquet'),
                ('validation', 'shuffled_mmlu_validation.parquet'),
                ('dev', 'shuffled_mmlu_dev.parquet'),
            )
        ],
    }],
}

print(yaml.dump(complete_config, default_flow_style=False))

configs:
- config_name: default
  data_files:
  - path: shuffled_mmlu_test.parquet
    split: test
  - path: shuffled_mmlu_validation.parquet
    split: validation
  - path: shuffled_mmlu_dev.parquet
    split: dev

