# Omni-Math Preprocessing

## Filter for non-proof problems.

In [None]:
import json
# Load the raw Omni-Math dataset. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8.
with open('../train/omni_math.json', 'r', encoding='utf-8') as f:
    omni = json.load(f)
# Bare expression: as the last statement of a notebook cell it displays the
# number of loaded entries.
len(omni)

In [None]:
# Filter for non-proof problems.
from copy import deepcopy
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from pprint import pprint

from deepscaler.utils import call_gemini_llm
from deepscaler.system_prompts import FILTER_PROOF_PROMPT

def filter_proofs(idx, entry):
    """Ask the LLM about one entry and decide whether it is kept.

    The entry's ``problem`` and ``answer`` fields are formatted into a single
    prompt and passed to ``call_gemini_llm`` together with
    ``FILTER_PROOF_PROMPT`` (``temperature=0.8``, ``n=4``).

    Returns:
        ``(idx, entry)`` when the LLM result is falsy or when any returned
        output contains the marker ``[[1]]``; otherwise ``(idx, {})``, after
        printing the problem, the answer and the first output.

    NOTE(review): ``[[1]]`` is presumably the prompt's verdict for a
    non-proof problem -- confirm against FILTER_PROOF_PROMPT.
    """
    question = entry['problem']
    answer = entry['answer']

    prompt = f'Problem: {question} \n\n Answer: {answer}'
    responses = call_gemini_llm(
        prompt,
        system_prompt=FILTER_PROOF_PROMPT,
        temperature=0.8,
        n=4,
    )

    # A falsy LLM result keeps the entry, as does any output with the marker.
    keep = not responses or any('[[1]]' in reply for reply in responses)
    if keep:
        return idx, entry

    # Show what is being discarded before returning the empty placeholder.
    for shown in (question, answer, responses[0]):
        pprint(shown)
    return idx, {}

# Work on a copy so the loaded `omni` list is left untouched.
data = deepcopy(omni)

with ProcessPoolExecutor(max_workers=32) as executor:
    # 1) Submit all jobs to the executor
    futures = [executor.submit(filter_proofs, f_idx, entry) for f_idx, entry in enumerate(data)]

    # 2) Process them as they complete, using tqdm for a progress bar.
    # This loop must stay inside the `with` block: leaving the block calls
    # shutdown(wait=True), which blocks until every job has finished, so a
    # loop placed after it would only start once all the work is already done
    # and the progress bar would not reflect real progress.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing entries"):
        # Each job returns its own index, so results can arrive in any order.
        idx, result = future.result()
        data[idx] = result

# Drop the entries that filter_proofs replaced with an empty dict.
data = [d for d in data if d]
# Save final list as json
with open("omni_math.json", "w") as f:
    json.dump(data, f, indent=2)