# AIMO data cleaning and tokenization

In [1]:
import warnings
warnings.filterwarnings('ignore')

import re
import string

import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('olympiadbench-external-df.csv')
df.head()

Unnamed: 0,id,subfield,context,question,solution,final_answer,is_multiple_answer,unit,answer_type,error
0,800,Mechanics,,A quarantined physics student decides to perfo...,"['By conservation of energy, we have that\n$$\...",['2.47'],False,m,Numerical,0.1
1,800,Mechanics,"- Proton mass, $m_{p}=1.67 \cdot 10^{-27} \mat...",A quarantined physics student decides to perfo...,"['By conservation of energy, we have that\n$$\...",['2.47'],False,m,Numerical,0.1
2,801,Mechanics,"- Proton mass, $m_{p}=1.67 \cdot 10^{-27} \mat...","A wooden bus of mass $M=20,000 \mathrm{~kg}$ (...","[""The moment of inertia of each wheel can be t...",['3.32'],False,m,Numerical,0.1
3,801,Mechanics,,"A wooden bus of mass $M=20,000 \mathrm{~kg}$ (...","[""The moment of inertia of each wheel can be t...",['3.32'],False,m,Numerical,0.1
4,802,Mechanics,"- Proton mass, $m_{p}=1.67 \cdot 10^{-27} \mat...","In an old coal factory, a conveyor belt will m...",['The maximal force the convey belt can provid...,['$2022.2$'],False,$\frac{\mathrm{kg}}{\mathrm{m}^{3}}$,Numerical,0.1


In [3]:
df = df[df['answer_type'] == 'Numerical']
df.head()

Unnamed: 0,id,subfield,context,question,solution,final_answer,is_multiple_answer,unit,answer_type,error
0,800,Mechanics,,A quarantined physics student decides to perfo...,"['By conservation of energy, we have that\n$$\...",['2.47'],False,m,Numerical,0.1
1,800,Mechanics,"- Proton mass, $m_{p}=1.67 \cdot 10^{-27} \mat...",A quarantined physics student decides to perfo...,"['By conservation of energy, we have that\n$$\...",['2.47'],False,m,Numerical,0.1
2,801,Mechanics,"- Proton mass, $m_{p}=1.67 \cdot 10^{-27} \mat...","A wooden bus of mass $M=20,000 \mathrm{~kg}$ (...","[""The moment of inertia of each wheel can be t...",['3.32'],False,m,Numerical,0.1
3,801,Mechanics,,"A wooden bus of mass $M=20,000 \mathrm{~kg}$ (...","[""The moment of inertia of each wheel can be t...",['3.32'],False,m,Numerical,0.1
4,802,Mechanics,"- Proton mass, $m_{p}=1.67 \cdot 10^{-27} \mat...","In an old coal factory, a conveyor belt will m...",['The maximal force the convey belt can provid...,['$2022.2$'],False,$\frac{\mathrm{kg}}{\mathrm{m}^{3}}$,Numerical,0.1


In [4]:
df_clean = pd.DataFrame()

In [5]:
remove = ["[","]","'","$","\\",'{','}']

def clean_answer(answer):
    # Remove scientific notation times 10^d+ pattern
    answer = re.sub(r'times 10\^\d+', '', answer)
    # Remove specified characters
    for char in remove:
        answer = answer.replace(char, '')
    try:
        # Attempt to convert to float
        answer_float = float(answer)
        # Check if the float is a non-negative integer
        if answer_float == int(answer_float) and int(answer_float) >= 0:
            return int(answer_float)
    except ValueError:
        pass
    return None

# Apply the clean_answer function to the 'final_answer' column
df_clean['answer'] = df['final_answer'].apply(clean_answer)

In [7]:
pattern_1 = r'\n\n!\[\]\(.*?\)'

pattern_2 = r'<.*?>'

def clean_question(row):
    # Check if 'context' is not NaN and not an empty string
    question = row['question']

    if pd.notna(row['context']) and row['context']:
        context = row['context']
        combined = f"{question} Use the follwing information when solving: {context}"

    else:
        combined = question
    
    combined = re.sub(pattern_1, '', question)
    combined = re.sub(pattern_2, '', question)

    return combined

df_clean['question'] = df.apply(clean_question, axis=1)


In [8]:
def clean_solution(row):

    if pd.notna(row['solution']) and row['solution']:
        solution = row['solution'].strip("[").strip("]").strip("'").strip('"')
        solution = re.sub(pattern_1, '',solution)
        solution = re.sub(pattern_2, '',solution)
    else:
        solution = 'NONE'
    return solution

df_clean['solution'] = df.apply(clean_solution, axis=1)

In [9]:
df_clean.drop_duplicates(subset=["solution","answer"],inplace=True)
df_clean = df_clean[df_clean['answer'].notna()][['question','answer']].astype({'answer':'int_'})
df_clean.reset_index(drop=True,inplace=True)
df_clean.head()

Unnamed: 0,question,answer
0,A ball is situated at the midpoint of the bott...,2
1,A large metal conducting sphere with radius $1...,25
2,Max finds himself trapped in the center of a m...,120
3,"For his art project, Weishaupt cut out $N=20$ ...",21
4,Two identical neutron stars with mass $m=4 \ti...,590


In [34]:
def get_prompt(row):
    prompt = f"""Role: You are an advanced AI system with exceptional mathematical reasoning and problem-solving capabilities, specifically designed to solve tricky math problems (whose answer is a non-negative integer) written in LaTeX format from the AI Mathematical Olympiad (AIMO) competition. Your task is to accurately analyze and solve intricate mathematical problems, demonstrating a deep understanding of mathematical concepts and a strong ability to apply logical reasoning strategies.
    
    Instructions:
    1. Carefully read and comprehend the problem statement provided in the "Problem" section.
    2. Solve the problem.
    2. After solving, create an "Answer" section where you will state only the final integer answer, without any additional text or narrative.
    
    Problem: {row['question']}
    
    """
    return prompt

def get_prompt_answer(row):
    prompt = f"Answer: {row['answer']}"

    return prompt

df_clean['prompt'] = df_clean.apply(get_prompt, axis=1)
df_clean['prompt_answer'] = df_clean.apply(get_prompt_answer, axis=1)
df_clean.head()

Unnamed: 0,question,answer,prompt,prompt_answer
0,A ball is situated at the midpoint of the bott...,2,Role: You are an advanced AI system with excep...,Answer: 2
1,A large metal conducting sphere with radius $1...,25,Role: You are an advanced AI system with excep...,Answer: 25
2,Max finds himself trapped in the center of a m...,120,Role: You are an advanced AI system with excep...,Answer: 120
3,"For his art project, Weishaupt cut out $N=20$ ...",21,Role: You are an advanced AI system with excep...,Answer: 21
4,Two identical neutron stars with mass $m=4 \ti...,590,Role: You are an advanced AI system with excep...,Answer: 590


In [35]:
df_train, df_val= np.split(df_clean[['prompt','prompt_answer']].sample(frac=1), [int(.8*len(df_clean))])

df_train.reset_index(drop=True,inplace=True)
df_val.reset_index(drop=True,inplace=True)

df_train.to_json("prompts-train.json",orient="records")
df_val.to_json("prompts-val.json",orient="records")

In [22]:
#df_test.reset_index(inplace=True)
df_test['prompt'][1]

'Problem: Compute the least integer $n>1$ such that the product of all positive divisors of $n$ equals $n^{4}$.\n\nSolution: Note that every factor pair $d$ and $\\\\frac{n}{d}$ have product $n$. For the product of all such divisor pairs to equal $n^{4}$, there must be exactly 4 divisor pairs, or 8 positive integer divisors. A number has 8 positive integer divisors if it is of the form $a^{3} b^{1}$ or $a^{7}$ where $a$ and $b$ are distinct primes. The prime factorization $a^{3} b^{1}(a \\\\neq b)$ provides a set of divisors each of which has 4 options for using $a\\\\left(a^{0}, a^{1}, a^{2}, a^{3}\\\\right)$ and an independent 2 options for using $b\\\\left(b^{0}, b^{1}\\\\right)$. Using the least values $(a, b)=(2,3), a^{3} b^{1}=24$. If instead the prime factorization is $a^{7}$ (having divisors $a^{0}, a^{1}, a^{2}, \\\\ldots, a^{7}$ ), the least answer would be $2^{7}=128$. Thus the answer is 24 .\n\n'

In [None]:
"Problem: 2500 chess kings have to be placed on a $100 \\times 100$ chessboard so that\n\n(i) no king can capture any other one (i.e. no two kings are placed in two squares sharing a common vertex);\n\n(ii) each row and each column contains exactly 25 kings.\n\nFind the number of such arrangements. (Two arrangements differing by rotation or symmetry are supposed to be different.)\n\nSolution: Suppose that we have an arrangement satisfying the problem conditions. Divide the board into $2 \\\\times 2$ pieces; we call these pieces blocks. Each block can contain not more than one king (otherwise these two kings would attack each other); hence, by the pigeonhole principle each block must contain exactly one king.\\n\\nNow assign to each block a letter $\\\\mathrm{T}$ or $\\\\mathrm{B}$ if a king is placed in its top or bottom half, respectively. Similarly, assign to each block a letter $\\\\mathrm{L}$ or $\\\\mathrm{R}$ if a king stands in its left or right half. So we define T-blocks, B-blocks, L-blocks, and $R$-blocks. We also combine the letters; we call a block $a T L$-block if it is simultaneously T-block and L-block. Similarly we define TR-blocks, $B L$-blocks, and BR-blocks. The arrangement of blocks determines uniquely the arrangement of kings; so in the rest of the solution we consider the $50 \\\\times 50$ system of blocks (see Fig. 1). We identify the blocks by their coordinate pairs; the pair $(i, j)$, where $1 \\\\leq i, j \\\\leq 50$, refers to the $j$ th block in the $i$ th row (or the $i$ th block in the $j$ th column). The upper-left block is $(1,1)$.\\n\\nThe system of blocks has the following properties..\\n\\n$\\\\left(\\\\mathrm{i}^{\\\\prime}\\\\right)$ If $(i, j)$ is a B-block then $(i+1, j)$ is a B-block: otherwise the kings in these two blocks can take each other. Similarly: if $(i, j)$ is a T-block then $(i-1, j)$ is a T-block; if $(i, j)$ is an L-block then $(i, j-1)$ is an L-block; if $(i, j)$ is an R-block then $(i, j+1)$ is an R-block.\\n\\n(ii') Each column contains exactly 25 L-blocks and 25 R-blocks, and each row contains exactly 25 T-blocks and 25 B-blocks. In particular, the total number of L-blocks (or R-blocks, or T-blocks, or B-blocks) is equal to $25 \\\\cdot 50=1250$.\\n\\nConsider any B-block of the form $(1, j)$. By $\\\\left(\\\\mathrm{i}^{\\\\prime}\\\\right)$, all blocks in the $j$ th column are B-blocks; so we call such a column $B$-column. By (ii'), we have 25 B-blocks in the first row, so we obtain 25 B-columns. These $25 \\\\mathrm{~B}$-columns contain $1250 \\\\mathrm{~B}$-blocks, hence all blocks in the remaining columns are T-blocks, and we obtain 25 T-columns. Similarly, there are exactly 25 L-rows and exactly 25 -rows.\\n\\nNow consider an arbitrary pair of a T-column and a neighboring B-column (columns with numbers $j$ and $j+1$ ).\\n\\n\\n\\nFig. 1\\n\\n\\n\\nFig. 2\\n\\nCase 1. Suppose that the $j$ th column is a T-column, and the $(j+1)$ th column is a Bcolumn. Consider some index $i$ such that the $i$ th row is an L-row; then $(i, j+1)$ is a BL-block. Therefore, $(i+1, j)$ cannot be a TR-block (see Fig. 2), hence $(i+1, j)$ is a TL-block, thus the $(i+1)$ th row is an L-row. Now, choosing the $i$ th row to be the topmost L-row, we successively obtain that all rows from the $i$ th to the 50th are L-rows. Since we have exactly 25 L-rows, it follows that the rows from the 1st to the 25th are R-rows, and the rows from the 26th to the 50th are L-rows.\\n\\nNow consider the neighboring R-row and L-row (that are the rows with numbers 25 and 26). Replacing in the previous reasoning rows by columns and vice versa, the columns from the 1 st to the 25th are T-columns, and the columns from the 26th to the 50th are B-columns. So we have a unique arrangement of blocks that leads to the arrangement of kings satisfying the condition of the problem (see Fig. 3).\\n\\n\\n\\nFig. 3\\n\\n\\n\\nFig. 4\\n\\nCase 2. Suppose that the $j$ th column is a B-column, and the $(j+1)$ th column is a T-column. Repeating the arguments from Case 1, we obtain that the rows from the 1st to the 25th are L-rows (and all other rows are R-rows), the columns from the 1st to the 25th are B-columns (and all other columns are T-columns), so we find exactly one more arrangement of kings (see Fig. 4).\n\n"

In [19]:
from transformers import AutoTokenizer
from datasets import load_dataset

In [36]:
dataset = load_dataset('json', data_files={'train': 'prompts-train.json', 'val' : 'prompts-val.json'})

Generating train split: 414 examples [00:00, 8777.98 examples/s]
Generating val split: 104 examples [00:00, 5938.27 examples/s]


In [21]:
model_name = "EleutherAI/llemma_7b"

tokenizer = AutoTokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [37]:
max_length = 512
stride = 128
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokenized_inputs =  tokenizer(
        examples["prompt"],
        examples["prompt_answer"],
        truncation="only_first",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True,remove_columns=['prompt', 'prompt_answer'])

Map: 100%|██████████| 414/414 [00:00<00:00, 1056.25 examples/s]
Map: 100%|██████████| 104/104 [00:00<00:00, 1058.93 examples/s]


In [31]:
tokenized_datasets.keys()

dict_keys(['train', 'val'])

In [38]:
def add_labels(examples):
    # Create labels shifted by one token to the right, with the last token discarded
    examples['labels'] = [x[1:] + [-100] for x in examples['input_ids']]
    return examples

# Map the add_labels function to the dataset
labelled_dataset = tokenized_datasets.map(add_labels, batched=True)

Map: 100%|██████████| 435/435 [00:00<00:00, 2630.64 examples/s]
Map: 100%|██████████| 115/115 [00:00<00:00, 2002.04 examples/s]


In [40]:
labelled_dataset.save_to_disk('tokenized-datasets')

Saving the dataset (1/1 shards): 100%|██████████| 435/435 [00:00<00:00, 14235.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 115/115 [00:00<00:00, 5755.08 examples/s]
