# Research Example: Generating Procedures for the GSM8K Dataset Using a Genetic Algorithm and Ollama Queries

## Step 1: Import packages and necessary application functions & variables

In [1]:
import re
import math
from llm_procedure_generation_ga.ga_scaffold_structured import ProcedureGA, GAConfig
from llm_procedure_generation_ga.validators import validate_procedure_structured
from procedures.models import Procedure
from procedures.schemas import get_schema
from procedures.prompts import create_procedure_prompt
from procedures.ollama import query, repair_fn_ollama
from procedures.runners import run_steps_stateful_minimal


## Step 2: Import the GSM8K Dataset

In [4]:
from datasets import load_dataset

In [5]:
# Load the "train" and "test" splits of the "main" configuration of openai/gsm8k.
train_dataset = load_dataset("openai/gsm8k", "main", split="train")
test_dataset = load_dataset("openai/gsm8k", "main", split="test")

## Step 3: Set variable constants and instantiate necessary functions

In [6]:
# Answer schema registered under the "gsm" key; passed below as `final_answer_schema`
# to both the GA run and the step runner.
FINAL_SCHEMA = get_schema("gsm")

In [7]:
def run_steps_fn(proc_json, question, final_answer_schema, model, print_bool=False):
    """Execute a procedure on a single question and return the runner's state.

    Thin adapter around ``run_steps_stateful_minimal`` that plugs in the
    ``query`` function imported from ``procedures.ollama`` as the backend.

    Args:
        proc_json: The procedure to execute (forwarded positionally).
        question: Problem text, forwarded as ``problem_text``.
        final_answer_schema: Forwarded as ``answer_schema``.
        model: Model identifier forwarded to the runner.
        print_bool: Forwarded unchanged; defaults to ``False``.

    Returns:
        Whatever ``run_steps_stateful_minimal`` returns (used as a dict-like
        state by the callers in this notebook).
    """
    runner_options = {
        "problem_text": question,
        "answer_schema": final_answer_schema,
        "model": model,
        "query_fn": query,  # backend-agnostic runner; Ollama query fn supplied here
        "print_bool": print_bool,
    }
    return run_steps_stateful_minimal(proc_json, **runner_options)

def _extract_gold_number(gold_answer: str) -> float | None:
    # GSM8K gold answers are strings; often last number is the target
    nums = re.findall(r"-?\d+(?:\.\d+)?", gold_answer)
    return float(nums[-1]) if nums else None

def eval_fn(state, proc_json) -> float:
    """Return a fitness score in [0,1]."""
    # prefer model-extracted numeric if present, else try to parse its text
    pred_num = state.get("answer_numerical")
    if pred_num is None:
        try:
            pred_num = float(re.findall(r"-?\d+(?:\.\d+)?", state.get("answer",""))[-1])
        except Exception:
            return 0.0
    gold_num = state.get("_gold_num")  # weâ€™ll inject this per item
    if gold_num is None:
        return 0.0
    # exact match or close within small tolerance
    return 1.0 if math.isclose(pred_num, gold_num, rel_tol=0, abs_tol=1e-6) else 0.0

## Step 3 (continued): Instantiate the Procedure Genetic Algorithm Object

In [8]:
# Build the GA driver: the model name plus the callables it uses for prompt
# creation, backend queries, schema lookup, validation and repair.
ga = ProcedureGA(
    model="gemma3:latest",
    create_proc_fn=lambda task: create_procedure_prompt(task),
    query_fn=query,  # backend call
    schema_json_fn=lambda: Procedure.model_json_schema(),
    validate_fn=validate_procedure_structured,  # pure function
    repair_fn=repair_fn_ollama,  # GA expects (proc, model) -> proc
    # Deliberately tiny search budget; seed fixed at 42.
    cfg=GAConfig(
        population_size=3,
        max_generations=3,
        crossover_rate=0.7,
        mutation_rate=0.3,
        seed=42,
    ),
)

## Step 4: For each Question-Answer pair, run the GA with the question as task_description

In [9]:
def _first_present(state, *keys):
    """Return the first non-``None`` value found in ``state`` under ``keys``."""
    for key in keys:
        value = state.get(key)
        if value is not None:
            return value
    return None


def run_gsm8k_batch(examples, use_task_eval=False):
    """Run the GA once per GSM8K example and collect per-item results.

    Args:
        examples: Iterable of dicts like
            ``{"id": ..., "question": "...", "answer": "..."}`` (GSM8K format).
            ``"id"`` is optional and is reported as ``None`` when absent.
        use_task_eval: Selects how ``ga.run`` is called.
            ``False`` (default) passes ``eval_fn=None``, the "no task-eval /
            hygiene scoring" path. ``True`` additionally supplies ``eval_fn``
            (with the item's gold number injected) and ``run_steps_fn``, the
            "task-eval" path.

    Returns:
        A list with one dict per example containing the keys ``id``,
        ``question``, ``gold_answer``, ``gold_num``, ``fitness``,
        ``procedure``, ``state``, ``pred_answer``, ``pred_num``, ``correct``
        and ``steps``.
    """
    results = []
    for ex in examples:
        qid = ex.get("id")
        question = ex["question"]
        gold_text = ex["answer"]
        gold_num = _extract_gold_number(gold_text)

        run_kwargs = {
            "task_description": question,
            "final_answer_schema": FINAL_SCHEMA,
            "eval_fn": None,
            "print_progress": False,
        }
        if use_task_eval:
            # Task-eval path: score every candidate against this item's gold number.
            run_kwargs["eval_fn"] = (
                lambda state, proc: eval_fn({**state, "_gold_num": gold_num}, proc)
            )
            run_kwargs["run_steps_fn"] = run_steps_fn

        best, _history = ga.run(**run_kwargs)

        # After the GA finishes, run the winner once more to collect the final state/answer.
        final_state = run_steps_fn(best.proc, question, FINAL_SCHEMA, ga.model, print_bool=False)

        results.append({
            "id": qid,
            "question": question,
            "gold_answer": gold_text,
            "gold_num": gold_num,
            "fitness": best.fitness,
            "procedure": best.proc,            # JSON dict
            "state": final_state,
            # eval_fn reads "answer"/"answer_numerical"; fall back to those keys
            # so these fields are not left empty when the "final_*" keys are absent.
            "pred_answer": _first_present(final_state, "final_answer", "answer"),
            "pred_num": _first_present(final_state, "final_answer_numerical", "answer_numerical"),
            "correct": bool(eval_fn({**final_state, "_gold_num": gold_num}, best.proc) >= 1.0),
            "steps": len(best.proc.get("steps", [])),
        })
    return results

In [10]:
# For testing purposes, just grab the first 2 examples, as a full run takes a long time
first_two = train_dataset.select(range(2))

In [11]:
# Run the GA on the selected examples; the returned list of result dicts is the cell output.
run_gsm8k_batch(first_two)

[{'id': None,
  'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'gold_answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
  'gold_num': 72.0,
  'fitness': 1.25,
  'procedure': {'NameDescription': 'Calculate the total clips sold in April and May.',
   'steps': [{'id': 1,
     'stepDescription': 'Extract the number of clips sold in April from the problem text.',
     'inputs': [{'name': 'problem_text',
       'description': 'The original problem text.'}],
     'output': [{'name': 'clips_sold_in_april',
       'description': 'The number of clips sold in April.'}]},
    {'id': 2,
     'stepDescription': 'Calculate the number of clips sold in May. This is half the number sold in April.',
     'inputs': [{'name': 'clips_sold_in_april',
       'description': 'The number of clips sol