# Prompt Optimization Example

In [22]:
!pip install nest_asyncio



In [23]:
import sys
import os
import asyncio
import time
import json
from pathlib import Path

from typing import Optional, Any
from pydantic import BaseModel, Field
import loguru

from dotenv import load_dotenv

from agentics.core.agentics import Agentics as AG
from agentics.core.utils import chunk_list

In [24]:
import nest_asyncio
nest_asyncio.apply()

## Define Data Model for GSM8K Dataset

* Each problem in GSM8K dataset has question and answer fields. 
* In data folder, we provide the post-processed dataset that separates the thought in the think filed and the integer answer  in the numeric field.
* The response_think and response_answer are the output field, and correct is a slot to store if the response answer was correct.

### async function modify_dataset
* We can use `modify_dataset` as a mapping function to asynchronous map to post process the dataset.

In [25]:
class GSM8K(BaseModel):
    question: Optional[str] = Field(None, description="a grade school math question.")
    answer: Optional[str] = Field(None, description="the ground-truth answer to the question including the reasoning, and #### formating.")
    think: Optional[str] = Field(None, description="the step by step reasoning process to derive answer.")
    numeric: Optional[str] = Field(None, description="the number extracted from the final answer to compare with the response answer.")
    response_think: Optional[str] = Field(None, description="the step by step reasoning of response, usually between <think> and </think> tags in CoT prompting.")
    response_answer: Optional[str] = Field(None, description="the number extracted from the final answer to the question that ignores units, etc.")
    correct: Optional[bool] = Field(None, description="place holder for storing True if the answer in the response was correct.")
    
    @staticmethod
    async def grade(state: "GSM8K")->"GSM8K":
        extracted_answer = GSM8K.regex_extract_answer(state.response_answer)
        state.correct = (state.numeric == extracted_answer)
        return state

    @staticmethod
    async def modify_dataset(state: "GSM8K")->"GSM8K":
        think_temp, num_temp = state.answer.split("####")
        state.think = think_temp.strip()
        state.numeric = GSM8K.regex_extract_answer(num_temp.strip())
        return state

    @staticmethod
    def regex_extract_answer(expr:str)->str:
        import re
        ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
        ANS_RE2 = re.compile(r"(\-?[0-9\.\,]+)")
        INVALID_ANS = "[invalid]"    
        match = ANS_RE.search(expr)
        if match:
            match_str = match.group(1).strip()
            match_str = match_str.replace(",", "")
            return match_str
        else:
            match2 = ANS_RE2.search(expr.strip())
            if match2:
                match_str = match2.group(1).strip()
                match_str = match_str.replace(",", "")
                return match_str
        return INVALID_ANS

## Meta Prompt for Prompt Optimization

* We demonstrate the prompt optimization method that uses meta-prompt (Large Language Model As Optimizers, Yang et al 2024).
* In this example, we optimzie the system prompt by searching the "role", "goal", "expected output", and "imperative" sentence.

In [26]:
OPT_META_INSTRUCTION = """Your proposed prompt template will be used in the following way.
* You are "role" -- this role must be suitable for solving the demo task.
* Your personal goal is: "goal" -- the goal achieves the outputs given inputs.
* This is the expected criteria for your final answer "expected_output" -- this constrains the output format.
* You can add a short imperative instruction "imperative" -- this comes after the input of the task.

[[Several demo tasks of input and outputs will be provided when you solve problem.]]

[[The previous optimized prompt templates with scores appear from the worst to the best.]]
{optimization_history}

* Given the previous optimization results, don't generate duplicate or similar prompt templates.
* Generate prompt template that achieves the best score, and succint and concise instructions.
"""


USER_PROMPT_TEMPLATE = """
You are {role}.
Your personal goal is: {goal}.
This is the expected criteria for your final answer: {expected_output}.

solve the following task.
{question}

{imperative}
"""

## Define Data Model for Prompt Optimization Task

* The following cell defines the data model for optimization task.
* The optimization task has demo tasks from the training set.
* The role, goal, expected output, imperative are the slots for storing the response from LLMs.
* The score stores the evaluated score for the proposed prompt.

### Type manipulations 

* In `create_optimization_demos`, the method takes the training data set.
* The demo data set is created by cloning it and making modification to the underlying aType.
* It create a subtype with two fields using `subset_atype` and rebind the type to the demo dataset using `rebind_atype`.
* When we need to modify the underlying data type for AG class, we can modify the data type and rebind it to the AG.


In [27]:
class OptimizationTask(BaseModel):
    demos: Optional[list[Any]] = Field(None, description="optimization demo tasks to undertand the problem domain")
    role: Optional[str] = Field(None, description="New role instruction suggested by LLM")
    goal: Optional[str] = Field(None, description="New goal instruction suggested by LLM")
    expected_output: Optional[str] = Field(None, description="New expected_output instruction suggested by LLM")
    imperative: Optional[str] = Field(None, description="New imperative suggested by LLM")
    score: Optional[int] = Field(None, description="evaluation score of optimization output")

    @staticmethod
    def create_optimization_demos(dataset: AG, num_demos:int)->list[list[BaseModel]]:
        demoset = dataset.clone()
        demoset = demoset.rebind_atype(new_atype=demoset.subset_atype(include_fields={"question", "numeric"}))
        return chunk_list(demoset.states, chunk_size=num_demos)
    
    @classmethod
    def create_optimization_tasks(cls, demo_list: list[list[BaseModel]])->list["OptimizationTask"]:
        # take the list of Demos from chunk list, make 1 task 
        optimization_tasks = []
        for demos in demo_list:
            optimization_tasks.append(OptimizationTask(demos=demos))
        return optimization_tasks

    @staticmethod
    def remove_duplicates(optimization_history: list["OptimizationTask"]):
        sorted_history = sorted(optimization_history, key=lambda x: x.score, reverse=False)      # ascending
        kept_history = []
        for current_best in reversed(sorted_history):       # keep better score
            for kept_task in kept_history:
                if current_best == kept_task:
                    break
            else:
                kept_history.append(current_best)
        return list(reversed(kept_history))         # return ascending order
        
    def __eq__(self, other):
        return self.role == other.role and \
               self.goal == other.goal and \
               self.expected_output == other.expected_output and \
               self.imperative == other.imperative and \
               self.score == other.score
    
    @staticmethod
    def get_history_string(optimization_history: list["OptimizationTask"]):
        history_str = ""
        for optimized_task in optimization_history:
            history_str += (
                optimized_task.model_dump_json(exclude={"demos"}, indent=2)
                + "\n"
            )
        return history_str

In [28]:
def report(dataset: AG, report_name:str="test", first_n:int=0, dump_report:bool=False):
    dataset = dataset.truncate_states(first_n, len(dataset))
    total = len(dataset)
    dataset.filter(func=lambda state: state.correct)
    correct = len(dataset)
    summary = {
        "report_name": report_name,
        "total": total,
        "fewshots": first_n,
        "correct": correct,
        "ratio": "{:.4f}".format(correct/total),
        "score": int(100*correct/total),
    }
    if dump_report:
        with open(Path(__file__).parent/"output"/"report.jsonl", 'a') as fp:
            fp.write(json.dumps(summary) + "\n")
        print(json.dumps(summary, indent=4))
    return summary

### Setting AG parameters

* For AG instances, we can directly modify its parameters by accessing its field.
* The prompt related parameters are `instructions`, `prompt_template`, and `crew_prompt_params`.
* In the following cell, we make all prompt strings empty as initialization step.

In [29]:
def set_default_params(args, agentic:AG)->AG:
    agentic.batch_size = args.batch_size
    agentic.verbose_agent = args.verbose
    agentic.verbose_transduction = args.verbose
    agentic.skip_intensional_definiton = True
    return agentic


def set_prompt_null(agentic:AG)->AG:
    agentic.instructions = ""
    agentic.prompt_template = ""
    agentic.crew_prompt_params = {
        "role": "",
        "goal": "",
        "backstory": "",
        "expected_output": ""
    }
    return agentic

## Initialize Arguments

The following cell introduces additional arguments for the prompt optimization.

In [30]:
class Args(BaseModel):
    num_opts: int = Field(2, description="Total number of optimizers")
    num_demos: int = Field(3, description="Number of demo tasks or problems to show")
    num_trains: int = Field(6, description="Number of train examples = num_opts * num_demos")
    num_devs: int = Field(20, description="Size of devsets to evaluate proposed prompts")
    test_size: Optional[int] = Field(None, description="Size of test set; None uses all")
    train_size: int = Field(500, description="Size of train set")
    llm_model: str = Field("watsonx/meta-llama/llama-3-3-70b-instruct", description="WatsonX LLM model name")
    verbose: bool = Field(False, description="Enable verbose output")
    batch_size: int = Field(5, description="Batch size for transduction")
    best_k: int = Field(8, description="Maintain best-k prompts during optimization")
    max_iter: int = Field(2, description="Maximum number of optimization iterations")
    prompt_file: str = Field("gsm8k_optimized_prompts.jsonl", description="Path to prompt file")
    best_m: int = Field(5, description="Store best-m prompts")
    max_tokens: int = Field(4000, description="Max output token length; input + output < total allowed tokens")
    exp_name: Optional[str] = Field(None, description="Experiment name")
    early_stop_iter: int = Field(2, description="esacpe optimization loop if score doesn't improve")

In [31]:
args = Args()
args.num_demos = 5
args.num_devs = 10
args.test_size = 10
args.verbose = True
args.batch_size = 2
args.best_k = 4
args.best_m = 2
args.early_stop_iter = 2

In [32]:
loguru.logger.remove()
loguru.logger.add(sys.stdout, format="{time} {level} {message}")
logger_file = "gsm8k_opt.logs" if args.exp_name is None else f"gsm8k_opt_{args.exp_name}.logs"
loguru.logger.add(f"logs/{logger_file}", format="{time} {level} {message}")

4

## Load Train Set to AG[GSM8K]

In [33]:
trainset = AG.from_jsonl("data/train_extended.jsonl", GSM8K, jsonl=True, max_rows=args.train_size)

In [34]:
set_default_params(args, trainset)
set_prompt_null(trainset)
trainset.prompt_template = USER_PROMPT_TEMPLATE

## Create LLM Clients for Generation and Evaluation

* Here, we assume that we use watsonx served models.
* The `.env` file stores necessary environment variables.
* We use temperature 1.0 for prompt generation and 0.0 for prompt evaluation.

In [35]:
eval_llm = AG.create_crewai_llm(model=args.llm_model, 
                                base_url=os.getenv("WATSONX_URL"),
                                project_id=os.getenv("WATSONX_PROJECTID"),
                                max_tokens=args.max_tokens,
                                temperature=0.0)
gen_llm = AG.create_crewai_llm(model=args.llm_model, 
                                base_url=os.getenv("WATSONX_URL"),
                                project_id=os.getenv("WATSONX_PROJECTID"),
                                max_tokens=args.max_tokens,
                                temperature=1.0)

In [36]:
loguru.logger.info("## start optimization.")
evaluation_time = 0
optimized_tasks = []
best_test_score = 0
current_best_score = 0
no_improvment_count = 0

2025-09-04T19:25:20.382646-0400 INFO ## start optimization.


## Prompt Optimization Loop

The prompt optimization loop has four parts
1. create optimizer AGs by shuffle the training set and create demo problems
2. transduce the prompt candidates
3. execute the generated prompts on dev set
4. grade the candidate prompts on the dev set

In this Notebook, we parallelize the optimization loop with logical transduction algebra
### Product of two AGs
* `opt_eval = optimizer.product(eval)` correspond to AG[OptimizationTask] x AG[GSM8K]
* This product AG maintains two state lists (internally it maintains a flattend list)
* This allows parallelizing the transduction in the next step

### Quotient of AGs
* `evalsets = eval.quotient(opt_eval)` correspond to AG[(OptimizationTask, GSM8K)/OptimizationTask]
* This allows grading the result of each prompt candidate in the next step using asynchronous MAP


In [37]:
t0 = time.time()
for iter_ind in range(args.max_iter):
    iter_t0 = time.time()
    loguru.logger.info(f"################################")
    loguru.logger.info(f"#### iter {iter_ind}")
    loguru.logger.info(f"#### 1. create optimizer AGs")
    # shuffle the train set and truncate num_trains for demonstration
    shuffled_trainset = trainset.get_random_sample(percent=1.0)
    demosets = shuffled_trainset.clone().truncate_states(0, args.num_trains)
    # create a list of OptimizationTask objects with a list of demos
    chunked_demos = OptimizationTask.create_optimization_demos(demosets, num_demos=args.num_demos)
    optimization_tasks = OptimizationTask.create_optimization_tasks(chunked_demos)
    optimizer = AG.from_states(optimization_tasks, atype=OptimizationTask)
    set_default_params(args, optimizer)
    optimizer.llm = gen_llm
    # set the prompts for the optimizer AG
    optimizer.instructions = OPT_META_INSTRUCTION.format(
        optimization_history = OptimizationTask.get_history_string(optimized_tasks)
    )
    optimizer.prompt_template = """{{"demo tasks":{demos}}}"""
    optimizer.crew_prompt_params = {
        "role": "Prompt optimizer.",
        "goal": "Propose diverse prompt templates that achieves high performance for the demo task given as input.",
        "backstory": "Understand the problem domain given the demo task example and propose what answer should be generated.",
        "expected_output": "the outputs are role, goal, and the expected output description, and imperative sentence for solving provided tasks."
    }

    loguru.logger.info(f"#### 2. generate {args.num_opts} prompts at iter {iter_ind}")
    optimizer = asyncio.run(optimizer.self_transduction(["demos"], ["role", "goal", "expected_output", "imperative"]))
        
    loguru.logger.info(f"#### 3. evalaute transduced {args.num_opts} prompts")
    eval = shuffled_trainset.clone().truncate_states(args.num_trains, args.num_trains + args.num_devs)
    # opt_eval AG is a product of the optimizer AG and the eval AG
    # we internally maintain the pair (optimizer states, eval states) as a flattened list
    opt_eval = optimizer.product(eval)
    set_default_params(args, opt_eval)
    opt_eval.llm = eval_llm
    opt_eval.prompt_template = USER_PROMPT_TEMPLATE
    # self-transduction applies to the combinations of the optimizer and eval states
    opt_eval = asyncio.run(opt_eval.self_transduction(["role", "goal", "expected_output", "imperative", "question"], ["response_think", "response_answer"]))

    loguru.logger.info(f"#### 4. grade responses from {args.num_opts} prompts")
    # quotient divide the evaluated opt_eval and returns the evaluated results as AG
    evalsets = eval.quotient(opt_eval)
    optimizer_scores = []
    for ind, evalset in enumerate(evalsets):
        # apply asychronous map to grade the responses
        evalset = asyncio.run(evalset.amap(GSM8K.grade))    
        summary = report(evalset, report_name=f"optimizer {ind+1}")
        optimizer_scores.append(summary["score"])
        setattr(optimizer[ind], "score", summary["score"])

    loguru.logger.info("#### store best_k prompts found so far")
    # keep the result of prompt optimization in a separte list
    # remove duplicated prompts if exists and sort them by the dev set evaluation score.
    optimized_tasks.extend(optimizer.states)
    optimized_tasks = OptimizationTask.remove_duplicates(optimized_tasks)
    optimized_tasks = optimized_tasks[-args.best_k:]
    current_best_score = optimized_tasks[-1].score

    loguru.logger.info(f"[[TIME]]::ITERATION::{iter_ind+1}={time.time()-iter_t0}")
    loguru.logger.info(f"[[DEV SCORE]]::ITERATION::{iter_ind+1}={current_best_score}")

2025-09-04T19:25:20.403595-0400 INFO ################################
2025-09-04T19:25:20.405325-0400 INFO #### iter 0
2025-09-04T19:25:20.406767-0400 INFO #### 1. create optimizer AGs
2025-09-04T19:25:20.426998-0400 INFO #### 2. generate 2 prompts at iter 0
2025-09-04T19:25:20.430153-0400 DEBUG Executing task: Your proposed prompt template will be used in the following way.
* You are "role" -- this role must be suitable for solving the demo task.
* Your personal goal is: "goal" -- the goal achieves the outputs given inputs.
* This is the expected criteria for your final answer "expected_output" -- this constrains the output format.
* You can add a short imperative instruction "imperative" -- this comes after the input of the task.

[[Several demo tasks of input and outputs will be provided when you solve problem.]]

[[The previous optimized prompt templates with scores appear from the worst to the best.]]


* Given the previous optimization results, don't generate duplicate or similar

2025-09-04T19:25:23.042577-0400 DEBUG Processed 2 states in 2.6083247661590576 seconds
2025-09-04T19:25:23.043802-0400 DEBUG 2 states processed in 1.3041623830795288 seconds average per state ...
2025-09-04T19:25:23.045209-0400 INFO #### 3. evalaute transduced 2 prompts
2025-09-04T19:25:23.065317-0400 DEBUG Executing task: Generate an object of the specified type from the following input.
20 states will be transduced
2025-09-04T19:25:23.073682-0400 DEBUG transducer class: <class 'agentics.abstractions.pydantic_transducer.PydanticTransducerCrewAI'>


2025-09-04T19:25:27.984207-0400 DEBUG Processed 2 states in 4.909012317657471 seconds
2025-09-04T19:25:27.985505-0400 DEBUG 2 states processed in 2.4545061588287354 seconds average per state ...


2025-09-04T19:25:34.255643-0400 DEBUG Processed 2 states in 6.268959045410156 seconds
2025-09-04T19:25:34.256797-0400 DEBUG 4 states processed in 3.134479522705078 seconds average per state ...


2025-09-04T19:25:38.997393-0400 DEBUG Processed 2 states in 4.739032030105591 seconds
2025-09-04T19:25:38.998716-0400 DEBUG 6 states processed in 2.3695160150527954 seconds average per state ...


2025-09-04T19:25:44.858928-0400 DEBUG Processed 2 states in 5.858622312545776 seconds
2025-09-04T19:25:44.860371-0400 DEBUG 8 states processed in 2.929311156272888 seconds average per state ...


2025-09-04T19:25:49.299212-0400 DEBUG Processed 2 states in 4.437593936920166 seconds
2025-09-04T19:25:49.300558-0400 DEBUG 10 states processed in 2.218796968460083 seconds average per state ...


2025-09-04T19:25:56.232809-0400 DEBUG Processed 2 states in 6.930762052536011 seconds
2025-09-04T19:25:56.234438-0400 DEBUG 12 states processed in 3.4653810262680054 seconds average per state ...


2025-09-04T19:26:02.290848-0400 DEBUG Processed 2 states in 6.055255889892578 seconds
2025-09-04T19:26:02.292178-0400 DEBUG 14 states processed in 3.027627944946289 seconds average per state ...


2025-09-04T19:26:09.998010-0400 DEBUG Processed 2 states in 7.704350709915161 seconds
2025-09-04T19:26:09.999381-0400 DEBUG 16 states processed in 3.8521753549575806 seconds average per state ...


2025-09-04T19:26:16.511478-0400 DEBUG Processed 2 states in 6.511184930801392 seconds
2025-09-04T19:26:16.513002-0400 DEBUG 18 states processed in 3.255592465400696 seconds average per state ...


2025-09-04T19:26:23.601313-0400 DEBUG Processed 2 states in 7.086977005004883 seconds
2025-09-04T19:26:23.602800-0400 DEBUG 20 states processed in 3.5434885025024414 seconds average per state ...
2025-09-04T19:26:23.605452-0400 INFO #### 4. grade responses from 2 prompts
2025-09-04T19:26:23.607730-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f91f194c220>
2025-09-04T19:26:23.609500-0400 DEBUG 2 states processed. 0.0003477334976196289 seconds average per state in the last chunk ...
2025-09-04T19:26:23.610655-0400 DEBUG 4 states processed. 0.00012814998626708984 seconds average per state in the last chunk ...
2025-09-04T19:26:23.611890-0400 DEBUG 6 states processed. 0.00011157989501953125 seconds average per state in the last chunk ...
2025-09-04T19:26:23.613048-0400 DEBUG 8 states processed. 0.00010859966278076172 seconds average per state in the last chunk ...
2025-09-04T19:26:23.614242-0400 DEBUG 10 states processed. 9.322166442871094e-05 seconds average per state 

2025-09-04T19:26:26.241095-0400 DEBUG Processed 2 states in 2.590846061706543 seconds
2025-09-04T19:26:26.242606-0400 DEBUG 2 states processed in 1.2954230308532715 seconds average per state ...
2025-09-04T19:26:26.244208-0400 INFO #### 3. evalaute transduced 2 prompts
2025-09-04T19:26:26.267807-0400 DEBUG Executing task: Generate an object of the specified type from the following input.
20 states will be transduced
2025-09-04T19:26:26.275318-0400 DEBUG transducer class: <class 'agentics.abstractions.pydantic_transducer.PydanticTransducerCrewAI'>


2025-09-04T19:26:30.158887-0400 DEBUG Processed 2 states in 3.882028341293335 seconds
2025-09-04T19:26:30.160132-0400 DEBUG 2 states processed in 1.9410141706466675 seconds average per state ...


2025-09-04T19:26:41.712958-0400 DEBUG Processed 2 states in 11.551889657974243 seconds
2025-09-04T19:26:41.714411-0400 DEBUG 4 states processed in 5.775944828987122 seconds average per state ...


2025-09-04T19:26:47.295641-0400 DEBUG Processed 2 states in 5.580082416534424 seconds
2025-09-04T19:26:47.296777-0400 DEBUG 6 states processed in 2.790041208267212 seconds average per state ...


2025-09-04T19:26:53.618862-0400 DEBUG Processed 2 states in 6.3212268352508545 seconds
2025-09-04T19:26:53.620252-0400 DEBUG 8 states processed in 3.1606134176254272 seconds average per state ...


2025-09-04T19:26:57.797197-0400 DEBUG Processed 2 states in 4.1757190227508545 seconds
2025-09-04T19:26:57.798486-0400 DEBUG 10 states processed in 2.0878595113754272 seconds average per state ...


2025-09-04T19:27:06.344589-0400 DEBUG Processed 2 states in 8.544989109039307 seconds
2025-09-04T19:27:06.346041-0400 DEBUG 12 states processed in 4.272494554519653 seconds average per state ...


2025-09-04T19:27:16.512237-0400 DEBUG Processed 2 states in 10.164736032485962 seconds
2025-09-04T19:27:16.513570-0400 DEBUG 14 states processed in 5.082368016242981 seconds average per state ...


2025-09-04T19:27:24.304807-0400 DEBUG Processed 2 states in 7.790239334106445 seconds
2025-09-04T19:27:24.305940-0400 DEBUG 16 states processed in 3.8951196670532227 seconds average per state ...


2025-09-04T19:27:33.571766-0400 DEBUG Processed 2 states in 9.264618158340454 seconds
2025-09-04T19:27:33.572992-0400 DEBUG 18 states processed in 4.632309079170227 seconds average per state ...


2025-09-04T19:27:40.821307-0400 DEBUG Processed 2 states in 7.2472851276397705 seconds
2025-09-04T19:27:40.822455-0400 DEBUG 20 states processed in 3.6236425638198853 seconds average per state ...
2025-09-04T19:27:40.824475-0400 INFO #### 4. grade responses from 2 prompts
2025-09-04T19:27:40.826148-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f91f194c220>
2025-09-04T19:27:40.827656-0400 DEBUG 2 states processed. 0.00022029876708984375 seconds average per state in the last chunk ...
2025-09-04T19:27:40.828672-0400 DEBUG 4 states processed. 0.0001271963119506836 seconds average per state in the last chunk ...
2025-09-04T19:27:40.830305-0400 DEBUG 6 states processed. 0.00013911724090576172 seconds average per state in the last chunk ...
2025-09-04T19:27:40.831231-0400 DEBUG 8 states processed. 0.00010883808135986328 seconds average per state in the last chunk ...
2025-09-04T19:27:40.832532-0400 DEBUG 10 states processed. 0.0001049041748046875 seconds average per state

## Display Optimized Prompts

In [38]:
optimized_tasks = OptimizationTask.remove_duplicates(optimized_tasks)
loguru.logger.info(f"## optimization results final {args.best_m} selections")
for ind, optimized_task in enumerate(optimized_tasks[-args.best_m:]):
    res = optimized_task.model_dump_json(exclude={"demos"}, indent=4)
    loguru.logger.info(f"## {ind+1}-th:\n{res}")
best_optimizers = AG.from_states(optimized_tasks[-args.best_m:], atype=OptimizationTask)

2025-09-04T19:27:40.867259-0400 INFO ## optimization results final 2 selections
2025-09-04T19:27:40.869527-0400 INFO ## 1-th:
{
    "role": "Mathematical Problem Solver",
    "goal": "Calculate the exact numerical answer to the given mathematical problems",
    "expected_output": "A numerical value that solves the problem",
    "imperative": "Read the problem carefully, identify the relevant mathematical operations, and compute the solution",
    "score": 80
}
2025-09-04T19:27:40.871176-0400 INFO ## 2-th:
{
    "role": "Mathematician",
    "goal": "To calculate the remaining amount of butter after making three kinds of cookies",
    "expected_output": "A numeric value representing the amount of butter left in kilograms",
    "imperative": "Calculate the amount of butter used for each type of cookie and subtract the total amount used from the initial amount of butter",
    "score": 80
}


## Evaluate Optimized Propmt In The Test Set

In [39]:
testset = AG.from_jsonl("data/test_extended.jsonl", GSM8K, jsonl=True, max_rows=args.test_size)
final_eval = best_optimizers.product(testset)
set_default_params(args, final_eval)
set_prompt_null(final_eval)

final_eval.llm = eval_llm
final_eval.llm.temperature = 0.0                
final_eval = asyncio.run(final_eval.self_transduction(["role", "goal", "expected_output", "imperative", "question"], ["response_think", "response_answer"]))

evalsets = testset.quotient(final_eval)
optimizer_scores = []
for ind, evalset in enumerate(evalsets):
    evalset = asyncio.run(evalset.amap(GSM8K.grade))    
    summary = report(evalset, report_name=f"optimizer-test {ind+1}")
    optimizer_scores.append(summary["score"])
    setattr(best_optimizers[ind], "score", summary["score"]) 

2025-09-04T19:27:40.900668-0400 DEBUG Executing task: 
20 states will be transduced
2025-09-04T19:27:40.902794-0400 DEBUG transducer class: <class 'agentics.abstractions.pydantic_transducer.PydanticTransducerCrewAI'>


2025-09-04T19:27:44.918257-0400 DEBUG Processed 2 states in 4.014400243759155 seconds
2025-09-04T19:27:44.919385-0400 DEBUG 2 states processed in 2.0072001218795776 seconds average per state ...


2025-09-04T19:27:48.982003-0400 DEBUG Processed 2 states in 4.061661958694458 seconds
2025-09-04T19:27:48.983395-0400 DEBUG 4 states processed in 2.030830979347229 seconds average per state ...


2025-09-04T19:27:52.896458-0400 DEBUG Processed 2 states in 3.9118380546569824 seconds
2025-09-04T19:27:52.897554-0400 DEBUG 6 states processed in 1.9559190273284912 seconds average per state ...


2025-09-04T19:27:57.502674-0400 DEBUG Processed 2 states in 4.603774309158325 seconds
2025-09-04T19:27:57.504089-0400 DEBUG 8 states processed in 2.3018871545791626 seconds average per state ...


2025-09-04T19:28:03.708635-0400 DEBUG Processed 2 states in 6.203189373016357 seconds
2025-09-04T19:28:03.710470-0400 DEBUG 10 states processed in 3.1015946865081787 seconds average per state ...


2025-09-04T19:28:08.216267-0400 DEBUG Processed 2 states in 4.504601955413818 seconds
2025-09-04T19:28:08.217740-0400 DEBUG 12 states processed in 2.252300977706909 seconds average per state ...


2025-09-04T19:28:12.819970-0400 DEBUG Processed 2 states in 4.601144313812256 seconds
2025-09-04T19:28:12.821608-0400 DEBUG 14 states processed in 2.300572156906128 seconds average per state ...


2025-09-04T19:28:18.451590-0400 DEBUG Processed 2 states in 5.628919839859009 seconds
2025-09-04T19:28:18.453366-0400 DEBUG 16 states processed in 2.8144599199295044 seconds average per state ...


2025-09-04T19:28:23.522173-0400 DEBUG Processed 2 states in 5.067671298980713 seconds
2025-09-04T19:28:23.523903-0400 DEBUG 18 states processed in 2.5338356494903564 seconds average per state ...


2025-09-04T19:28:29.540578-0400 DEBUG Processed 2 states in 6.015012502670288 seconds
2025-09-04T19:28:29.541760-0400 DEBUG 20 states processed in 3.007506251335144 seconds average per state ...
2025-09-04T19:28:29.546494-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f91f194c220>
2025-09-04T19:28:29.548394-0400 DEBUG 10 states processed. 3.830194473266602e-05 seconds average per state in the last chunk ...
2025-09-04T19:28:29.549606-0400 DEBUG Executing amap on function <function GSM8K.grade at 0x7f91f194c220>
2025-09-04T19:28:29.551381-0400 DEBUG 10 states processed. 3.775358200073242e-05 seconds average per state in the last chunk ...


In [40]:
loguru.logger.info(f"################################")
loguru.logger.info(f"# best test score:{optimizer_scores}")
loguru.logger.info(f"################################")   

2025-09-04T19:28:29.561306-0400 INFO ################################
2025-09-04T19:28:29.563181-0400 INFO # best test score:[90, 80]
2025-09-04T19:28:29.564278-0400 INFO ################################
