# Numina Olympiad Problems

In [None]:
# Extract olympiad data from NuminaMath-CoT
from concurrent.futures import as_completed
import json
from tqdm import tqdm
from datasets import load_dataset

ds = load_dataset("AI-MO/NuminaMath-CoT")
# Keep only the training rows whose 'source' field is 'olympiads'.
olympiad = ds["train"].filter(lambda row: row["source"] == "olympiads")

## Filter for non-proof problems.

In [None]:
# Filter for non-proof problems.
import concurrent.futures
from deepscaler.utils import call_gemini_llm
from deepscaler.system_prompts import FILTER_PROOF_PROMPT

def process_entry(entry):
    """Classify one problem with Gemini and keep it if the reply contains '[[2]]'.

    The problem and solution text are sent to ``call_gemini_llm`` together
    with ``FILTER_PROOF_PROMPT``; the reply is then searched for bracketed
    labels.  (The meaning of each label is defined by the prompt, which is
    not visible here -- presumably '[[2]]' marks a non-proof problem; confirm
    against ``FILTER_PROOF_PROMPT``.)

    Args:
        entry: Mapping with at least the keys ``'problem'`` and ``'solution'``.

    Returns:
        * ``None`` if the LLM call returned an empty/falsy response.
        * ``{'problem': ..., 'solution': ...}`` if the reply contains '[[2]]'.
        * An empty dict otherwise (falsy, so callers that test truthiness
          drop the entry).
    """
    # 1) Get the problem text
    problem_text = entry['problem']
    solution_text = entry['solution']
    # 2) Call Gemini LLM
    output_str = call_gemini_llm(
        f'Problem: {problem_text} \n\n Solution: {solution_text}',
        system_prompt=FILTER_PROOF_PROMPT,
    )
    if not output_str:
        print("Gemini not happy.")
        return None

    if '[[2]]' in output_str:
        return {
            'problem': problem_text,
            'solution': solution_text,
        }

    # Replies labelled '[[3]]' are printed with their problem for manual
    # inspection; they are still rejected.
    if '[[3]]' in output_str:
        print(output_str, problem_text)
    return {}

# `olympiad` is the collection of problem entries to classify; each element
# is read by process_entry via its 'problem' and 'solution' keys.
subset = olympiad
results = []

with concurrent.futures.ProcessPoolExecutor(max_workers=48) as executor:
    # 1) Submit all jobs to the executor
    futures = [executor.submit(process_entry, entry) for entry in subset]

    # 2) Process them as they complete, using tqdm for a progress bar.
    #    This loop must stay inside the `with` block: leaving the block waits
    #    for every job to finish, so a loop placed after it would only start
    #    once all work is done and the progress bar would show nothing useful.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing entries"):
        # Get the result for each completed future
        result = future.result()
        # Falsy results (None or an empty dict) mark entries to drop.
        if result:
            results.append(result)


# Save final list as json
with open("olympiad_no_proof.json", "w") as f:
    json.dump(results, f, indent=2)

## Filter for Unique Olympiad Problems (outside of Omni_Math)

In [None]:
from deepscaler.utils import RAG
import json

# Problems that passed the non-proof filter.
with open("olympiad_no_proof.json", "r", encoding="utf-8") as olympiad_file:
    olympiad_data = json.loads(olympiad_file.read())

# Reference dataset used for duplicate detection.
with open("../raw/train/omni_math.json", "r", encoding="utf-8") as omni_file:
    omni_data = json.loads(omni_file.read())

# Only the problem statements of the reference set are handed to the searcher.
omni_problems = [record["problem"] for record in omni_data]
rag_searcher = RAG(docs=omni_problems)

In [None]:
# Filter for olympiad problems that are not in the omni dataset
from tqdm import tqdm

filter_olympiad = []  # entries whose best match scores at or below the threshold
num_problems = 0      # entries whose best match scores above the threshold

counter = 0
# tqdm wraps the data for a progress bar; enumerate keeps a 1-based running count.
progress = tqdm(olympiad_data, desc="Filtering olympiad data", total=len(olympiad_data))
for counter, item in enumerate(progress, start=1):
    # Take the single best match returned by the searcher for this problem.
    best_match = rag_searcher.top_k(item["problem"], k=1)[0]
    if best_match["score"] > 0.93:
        num_problems += 1
    else:
        filter_olympiad.append(item)
    if counter % 1000 == 0:
        print(counter)
# Save final list as json
with open("olympiad_unique.json", "w") as f:
    json.dump(filter_olympiad, f, indent=2)
num_problems

## Extract Answers from each Problem

In [1]:
import concurrent.futures
from concurrent.futures import as_completed
import json
# Load the non-proof problems so answers can be produced for them.
with open("olympiad_no_proof.json", "r", encoding="utf-8") as src:
    olympiad = json.loads(src.read())

In [None]:
# Have Gemini extract answers from the Numina solutions and attach them to each entry.
from deepscaler.utils import call_gemini_llm
from deepscaler.system_prompts import EXTRACT_SOLUTION_PROMPT

def get_answer(entry):
    """Ask Gemini for answer candidates and record them under ``entry['answer']``.

    The problem and solution text are sent to ``call_gemini_llm`` with
    ``EXTRACT_SOLUTION_PROMPT`` and ``n=3``.  Candidates containing 'error',
    'Error' or 'Solution not found' are discarded.  The remaining candidates
    are appended to ``entry['answer']`` (created if missing), skipping any
    value already present in that list.

    Args:
        entry: Dict with at least ``'problem'`` and ``'solution'``; it is
            modified in place.  An existing ``'answer'`` value is assumed to
            be a list.

    Returns:
        The same ``entry`` object with ``'answer'`` updated, or ``None`` when
        no usable candidate was obtained (the problem and solution are
        printed in that case).
    """
    # 1) Get the problem text
    problem_text = entry['problem']
    solution_text = entry['solution']
    # 2) Call Gemini LLM
    raw_outputs = call_gemini_llm(
        f'Problem: {problem_text} \n----\n Solution: {solution_text}',
        system_prompt=EXTRACT_SOLUTION_PROMPT,
        n=3,
    )
    # A failed call may yield None/empty; treat that as "no candidates"
    # instead of raising TypeError when iterating.
    candidates = [
        o for o in (raw_outputs or [])
        if 'error' not in o and 'Error' not in o and 'Solution not found' not in o
    ]
    if not candidates:
        print(problem_text)
        print(solution_text)
        return None

    # Only touch entry['answer'] once we know there is something to add.
    answers = entry.setdefault('answer', [])
    for candidate in candidates:
        if candidate not in answers:
            answers.append(candidate)
    return entry

results = []
idx = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    # 1) Submit all jobs to the executor
    futures = [executor.submit(get_answer, entry) for entry in olympiad]
    # 2) Collect results as the futures complete; idx counts completed jobs.
    for idx, future in enumerate(as_completed(futures), start=1):
        # Get the result for each completed future
        result = future.result()
        # get_answer returns None when no usable answer was found.
        if result:
            results.append(result)
        # Checkpoint after every 1000 completed jobs.  Counting from 1 means
        # the first completion no longer overwrites an existing checkpoint
        # file with a nearly empty list.
        if idx % 1000 == 0:
            print(idx)
            with open("olympiad_checkpoint.json", "w") as f:
                json.dump(results, f, indent=2)

# Save final list as json
# NOTE(review): this writes to the same path the `olympiad` list was loaded
# from earlier in this notebook, and entries for which get_answer returned
# None are not written back -- confirm that overwriting the input is intended.
with open("olympiad_no_proof.json", "w") as f:
    json.dump(results, f, indent=2)