### Imports & Env Setup

In [14]:
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
load_dotenv()

import dspy
sys.path.append(os.path.abspath('../'))
from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

### Configuration

In [35]:
NUM_THREADS = 16

FEW_SHOTS = 5

# See https://docs.litellm.ai/docs/providers/vllm for details
TASK_MODEL = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base = 'http://localhost:8000/v1' , # or api_base ?
    # api_version: Optional[str] = None,
    # api_key: Optional[str] = None,
    # seed: Optional[int] = None,
    # max_tokens: Optional[int] = None,
    # timeout: Optional[Union[float, int]] = None,
)
PROMPT_MODEL = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base = 'http://localhost:8000/v1', # or api_base ?
    # api_version: Optional[str] = None,
    # api_key: Optional[str] = None,
    # seed: Optional[int] = None,
    # max_tokens: Optional[int] = None,
    # timeout: Optional[Union[float, int]] = None,
)

dspy.configure(lm=TASK_MODEL)

# replace this with llama_mmlu_pro or whatever
benchmark = llama_mmlu_pro

# Without chain of thought:
# program = dspy.Predict(
#     benchmark.signature("")
# )

# With chain of thought:
program = dspy.ChainOfThought(
    benchmark.signature("You are a helpful assistant designed to help with multiple choice question.") # put your initial system prompt here, or leave blank
)

evaluate = dspy.Evaluate(
    devset=[],
    metric=benchmark.metric,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,
)

### Load dataset

In [36]:
trainset, valset, testset = benchmark.datasets(
    train_size=0.1,
    validation_size=0.2,
)

len(trainset), len(valset), len(testset)

(1197, 2156, 8626)

### Baseline Benchmark

In [37]:
print("BASE PROMPT:\n", program.predict.signature.instructions)

BASE PROMPT:
 You are a helpful assistant designed to help with multiple choice question.


In [39]:
eval_subset_size = 100
evaluate(
    program,
    devset=testset[:subset_size],
)

Average Metric: 71.00 / 99 (71.7%):  99%|████████████████████████████████████████▌| 99/100 [01:16<00:01,  1.58s/it]

2025/01/16 11:41:56 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A (1/4) in. thick double leather belt is used on a cast steel pulley 50 in. in diameter which rotates at 1000 rpm and transmits 100 hp. Calculate the belt width using the following data: Coefficient of friction between cast-steel and leather = 0.40. Safe stress for belting = 300 psi Joint efficiency = 70 percent.', 'options': {'A': '7(1/2) in.', 'B': '7 in.', 'C': '9 in.', 'D': '6 in.', 'E': '5(1/2) in.', 'F': '9(1/2) in.', 'G': '10 in.', 'H': '8(1/2) in.', 'I': '8 in.', 'J': '11 in.'}, 'answer': 'I'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 71.00 / 99 (71.7%): 100%|████████████████████████████████████████| 100/100 [01:30<00:00,  1.11it/s]

2025/01/16 11:41:56 INFO dspy.evaluate.evaluate: Average Metric: 71.0 / 100 (71.0%)





Unnamed: 0,question,options,example_answer,reasoning,pred_answer,metric,answer
0,Describe the evolution of the reptilian excretory system to accoun...,{'A': 'The excretory system includes a secondary bladder for water...,J,The evolution of the reptilian excretory system from an aquatic to...,J,✔️ [True],
1,A scientist used his car to transport a large quantity of highly f...,"{'A': 'No, because the doctor should have been more careful around...",D,To determine if the doctor will prevail in a claim against the sci...,D,✔️ [True],
2,Which of the following could be used as a test for autocorrelation...,"{'A': 'The Dickey-Fuller test', 'B': 'The Jarque-Bera test', 'C': ...",G,To determine which of the following could be used as a test for au...,G,✔️ [True],
3,Write the balanced cell reaction and calculate theemfat 298 K of t...,"{'A': '.25 V', 'B': '.114 V', 'C': '0.0157963 V', 'D': '.1298 V', ...",,"To solve this problem, we first need to write the balanced cell re...",B,,
4,Assume a temperature of 300 K and find the wavelength of the photo...,"{'A': '2100.0', 'B': '2200.0', 'C': '1600.0', 'D': '1400.0', 'E': ...",G,To find the wavelength of the photon necessary to cause an electro...,J,,
...,...,...,...,...,...,...,...
95,A pure lead bar 10 cm long is maintained with one end at T &=300 K...,"{'A': '2.56e-07', 'B': '6.40e-06', 'C': '6.40e-07', 'D': '5.12e-06...",H,"To find the thermoelectric power for lead, we first need to unders...",H,✔️ [True],
96,Which of the following is another name for evading the issue?,"{'A': 'hasty generalization', 'B': 'slippery slope', 'C': '""you to...",G,"To answer this question, we need to understand what ""evading the i...",G,✔️ [True],
97,A spherical charge distribution varies with the radius r by the eq...,"{'A': 'It increases as r approaches infinity.', 'B': 'It increases...",G,To determine how the electric field strength varies with distance ...,F,,
98,Where in the balance sheet does each of the following belong? (A) ...,"{'A': ""(A) Liability section, (B) Asset side, (C) Owner's Equity s...",J,To determine where each of the given items belongs on the balance ...,J,✔️ [True],


(71.0,
 [(Example({'question': 'Describe the evolution of the reptilian excretory system to account for the transition from an aquatic to a terrestrial habitat.', 'options': {'A': 'The excretory system includes a secondary bladder for water storage.', 'B': 'The reptilian excretory system has evolved to excrete more water', 'C': 'Reptiles have evolved to have a smooth, wet skin', 'D': 'Reptiles have evolved to excrete nitrogenous wastes primarily as ammonia to conserve water.', 'E': 'Reptiles have developed larger glomeruli to increase water retention.', 'F': 'The excretory system has adapted to increase salt excretion to facilitate life in marine environments.', 'G': 'Reptilian kidneys have evolved to produce ammonia directly for more efficient water use.', 'H': 'Reptiles excrete nitrogenous wastes as urea', 'I': 'The excretory system has evolved a complex series of ducts to recycle water back into the body.', 'J': 'The reptilian excretory system has evolved to conserve most of its wat

### Optimize Subset + Evaluation

In [17]:
subset_size = 20
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="light",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

2025/01/16 11:20:46 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 20

2025/01/16 11:20:46 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/16 11:20:46 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/16 11:20:46 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


 20%|███████████████▊                                                               | 4/20 [00:14<00:56,  3.50s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/5


 20%|███████████████▊                                                               | 4/20 [00:19<01:16,  4.75s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/5


 20%|███████████████▊                                                               | 4/20 [00:20<01:20,  5.03s/it]
2025/01/16 11:21:40 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/16 11:21:40 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/01/16 11:22:02 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/16 11:22:47 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/16 11:22:47 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, `options`, produce the fields `reasoning`, `answer`.

2025/01/16 11:22:47 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To address the multiple-choice question effectively, analyze the given `question` and evaluate each option in the `options` dictionary. Generate a step-by-step `reasoning` process that considers the key concepts, definitions, and relationships relevant to the question. This process should logically lead to the identification of the correct `answer` choice. Ensure the `reasoning` is clear, concise, and directly related to the question asked, and that the `answer` is accurately selected based on this reasoning.

2025/01/16 11:22:47 INFO dspy.teleprompt.mipro_optimizer_v2: 2: To answer a multipl

Average Metric: 14.00 / 20 (70.0%): 100%|██████████████████████████████████████████| 20/20 [00:22<00:00,  1.11s/it]

2025/01/16 11:23:09 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)
2025/01/16 11:23:09 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 70.0

2025/01/16 11:23:09 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2025/01/16 11:23:09 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2025/01/16 11:23:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====



Average Metric: 17.00 / 20 (85.0%): 100%|██████████████████████████████████████████| 20/20 [00:26<00:00,  1.30s/it]

2025/01/16 11:23:36 INFO dspy.evaluate.evaluate: Average Metric: 17 / 20 (85.0%)
2025/01/16 11:23:36 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 85.0
2025/01/16 11:23:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 11:23:36 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0]
2025/01/16 11:23:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0


2025/01/16 11:23:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 16.00 / 20 (80.0%): 100%|██████████████████████████████████████████| 20/20 [00:24<00:00,  1.20s/it]

2025/01/16 11:24:00 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)
2025/01/16 11:24:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 11:24:00 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0, 80.0]
2025/01/16 11:24:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0


2025/01/16 11:24:00 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████| 20/20 [00:19<00:00,  1.02it/s]

2025/01/16 11:24:20 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0, 80.0, 75.0]
2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0


2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====



Average Metric: 16.00 / 20 (80.0%): 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 1343.08it/s]

2025/01/16 11:24:20 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)
2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0, 80.0, 75.0, 80.0]
2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0


2025/01/16 11:24:20 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====



Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████| 20/20 [00:30<00:00,  1.54s/it]

2025/01/16 11:24:51 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/16 11:24:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/01/16 11:24:51 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0, 80.0, 75.0, 80.0, 75.0]
2025/01/16 11:24:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0


2025/01/16 11:24:51 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====



Average Metric: 17.00 / 20 (85.0%): 100%|██████████████████████████████████████████| 20/20 [00:17<00:00,  1.14it/s]

2025/01/16 11:25:08 INFO dspy.evaluate.evaluate: Average Metric: 17 / 20 (85.0%)
2025/01/16 11:25:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 11:25:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0, 80.0, 75.0, 80.0, 75.0, 85.0]
2025/01/16 11:25:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0


2025/01/16 11:25:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====



Average Metric: 18.00 / 20 (90.0%): 100%|██████████████████████████████████████████| 20/20 [00:23<00:00,  1.17s/it]

2025/01/16 11:25:32 INFO dspy.evaluate.evaluate: Average Metric: 18 / 20 (90.0%)
2025/01/16 11:25:32 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 90.0
2025/01/16 11:25:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 90.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2025/01/16 11:25:32 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [70.0, 85.0, 80.0, 75.0, 80.0, 75.0, 85.0, 90.0]
2025/01/16 11:25:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 90.0


2025/01/16 11:25:32 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 90.0!





In [18]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

BEST PROMPT:
 You are a high-stakes test grader, responsible for evaluating the critical thinking and problem-solving skills of students in a high-pressure, timed environment. Given a complex, multiple-choice question that requires the application of technical vocabulary, critical thinking, analysis, and the ability to apply concepts to novel situations, along with a set of potential answer options, your task is to generate a step-by-step reasoning process for arriving at the correct answer, as well as identifying the correct answer itself. The question and options will be provided in the fields `question` and `options`, and you must produce detailed, logical reasoning in the field `reasoning` and the correct answer in the field `answer`. Your response will be evaluated not only on the correctness of the answer but also on the clarity, coherence, and logical soundness of the reasoning provided.


In [43]:
eval_subset_size = 200
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:eval_subset_size],
)

Average Metric: 2.00 / 2 (100.0%):   0%|▏                                          | 1/200 [00:00<00:19, 10.44it/s]

2025/01/16 12:03:59 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A store wishes to make $12,000 profit on sales of $200,000. Find the markup percent on the selling price needed if expenses will be $56,000, markdowns $15,000, shortages $5,000, alteration costs $2,500, and cash discounts earned from vendors $ 4,500.', 'options': {'A': '33.33%', 'B': '39.09%', 'C': '25%', 'D': '37.5%', 'E': '42.5%', 'F': '40%', 'G': '35%', 'H': '28%', 'I': '45%', 'J': '30%'}, 'answer': 'B'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 5.00 / 6 (83.3%):   2%|█                                           | 5/200 [00:00<01:03,  3.07it/s]

2025/01/16 12:03:59 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider the initial value problem\n$$\n5 u^{\\prime \\prime}+2 u^{\\prime}+7 u=0, \\quad u(0)=2, \\quad u^{\\prime}(0)=1\n$$\nFind the smallest $T$ such that $|u(t)| \\leq 0.1$ for all $t>T$.', 'options': {'A': '18.6543', 'B': '8.9765', 'C': '11.1111', 'D': '10.1234', 'E': '14.5115', 'F': '22.2222', 'G': '9.8765', 'H': '16.7890', 'I': '12.3456', 'J': '20.2020'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


  0%|                                                                                      | 0/100 [21:19<?, ?it/s]
Average Metric: 158.00 / 198 (79.8%): 100%|█████████████████████████████████████| 200/200 [00:01<00:00, 187.22it/s]

2025/01/16 12:04:00 INFO dspy.evaluate.evaluate: Average Metric: 158.0 / 200 (79.0%)





Unnamed: 0,question,options,example_answer,reasoning,pred_answer,metric,answer
0,Describe the evolution of the reptilian excretory system to accoun...,{'A': 'The excretory system includes a secondary bladder for water...,J,The transition from an aquatic to a terrestrial habitat imposed si...,J,✔️ [True],
1,A scientist used his car to transport a large quantity of highly f...,"{'A': 'No, because the doctor should have been more careful around...",D,"To prevail in a claim based on strict liability, the doctor must s...",D,✔️ [True],
2,Which of the following could be used as a test for autocorrelation...,"{'A': 'The Dickey-Fuller test', 'B': 'The Jarque-Bera test', 'C': ...",G,The question asks for a test that can be used to detect autocorrel...,G,✔️ [True],
3,Write the balanced cell reaction and calculate theemfat 298 K of t...,"{'A': '.25 V', 'B': '.114 V', 'C': '0.0157963 V', 'D': '.1298 V', ...",,"To solve this problem, we first need to write the balanced cell re...",D,,
4,Assume a temperature of 300 K and find the wavelength of the photo...,"{'A': '2100.0', 'B': '2200.0', 'C': '1600.0', 'D': '1400.0', 'E': ...",G,To find the wavelength of the photon necessary to cause an electro...,J,,
...,...,...,...,...,...,...,...
195,Which statement is true?,{'A': 'All trapezoids are rectangles because they have at least on...,D,"To determine which statement is true, we need to evaluate each opt...",J,,
196,"Select the best English interpretation of the given proposition, u...","{'A': 'All large apartments are bigger than some houses.', 'B': 'S...",E,The given proposition is (∃x)[(Ax • Lx) • (∃y)(Hy • Bxy)]. Breakin...,E,✔️ [True],
197,"f(X) = [\pi(1 + X^2)]^-1- \infty < x < \infty. If Y = X^2, what is...",{'A': 'h(y) = [2 / {\\pi(1 + \\sqrt{y})}] for y > 0 and = 0 otherw...,G,"To find the density function of Y, given that Y = X^2, we first ne...",G,✔️ [True],
198,Two thin convex lenses of focal lengths f_1 and f_2 are separated ...,"{'A': '[(3f_2) / 2]', 'B': '(f_1 + f_2) / 2', 'C': '(2f_2) / 3', '...",A,The focal length of the combination of two thin convex lenses can ...,A,✔️ [True],


## Medium Optimization

In [44]:
subset_size = 500
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="medium",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

2025/01/16 12:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 19
valset size: 300

2025/01/16 12:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/16 12:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/16 12:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...


Bootstrapping set 1/19
Bootstrapping set 2/19
Bootstrapping set 3/19


  1%|▊                                                                             | 5/500 [00:24<40:21,  4.89s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/19


  1%|▍                                                                             | 3/500 [00:11<31:41,  3.83s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 5/19


  0%|▏                                                                             | 1/500 [00:03<31:28,  3.78s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/19


  1%|▌                                                                             | 4/500 [00:27<57:51,  7.00s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 7/19


  1%|▍                                                                             | 3/500 [00:12<35:51,  4.33s/it]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 8/19


  0%|▎                                                                           | 2/500 [00:21<1:27:53, 10.59s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 9/19


  0%|▏                                                                             | 1/500 [00:04<35:52,  4.31s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 10/19


  1%|▍                                                                             | 3/500 [00:14<38:52,  4.69s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 11/19


  1%|▌                                                                             | 4/500 [00:18<38:35,  4.67s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 12/19


  1%|▌                                                                             | 4/500 [00:18<38:28,  4.65s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 13/19


  1%|▌                                                                             | 4/500 [00:17<36:07,  4.37s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 14/19


  0%|▎                                                                             | 2/500 [00:06<27:05,  3.26s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 15/19


  1%|▌                                                                             | 4/500 [00:20<41:28,  5.02s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 16/19


  1%|▉                                                                           | 6/500 [00:44<1:00:49,  7.39s/it]


Bootstrapped 4 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 17/19


  1%|▍                                                                             | 3/500 [00:12<33:44,  4.07s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 18/19


  0%|▏                                                                             | 1/500 [00:03<25:13,  3.03s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 19/19


  1%|▍                                                                             | 3/500 [00:17<48:40,  5.88s/it]
2025/01/16 12:09:17 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/16 12:09:17 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.


2025/01/16 12:11:11 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/16 12:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/16 12:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a helpful assistant designed to help with multiple choice question.

2025/01/16 12:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To answer a multiple-choice question, provide a step-by-step reasoning process based on the given question and options, and then select the correct answer from the provided choices. Ensure the reasoning is clear, concise, and directly addresses the question being asked.

2025/01/16 12:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a knowledgeable tutor specializing in a wide range of subjects, including mathematics, physics, computer science, and social sciences. Your task is to assist students in understanding and solving multiple-choice questions by providing clear, step-by-step reason

Average Metric: 136.00 / 180 (75.6%):  60%|██████████████████████▊               | 180/300 [02:07<01:11,  1.68it/s]

2025/01/16 12:16:48 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'An ordinary deck of cards containing 26 red cards and 26 black cards is shuffled and dealt out one card at a time without replacement. Let $X_i$ be the color of the $i$th card. Compute $H(X_1,X_2,\\ldots,X_{52})$ in bits.', 'options': {'A': '53.2', 'B': '50.2', 'C': '47.3', 'D': '46.5', 'E': '51.5', 'F': '50.0', 'G': '49.9', 'H': '45.6', 'I': '48.8', 'J': '52'}, 'answer': 'C'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████| 25/25 [05:43<00:00, 13.74s/it]

2025/01/16 12:24:39 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/16 12:24:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].
2025/01/16 12:24:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0]
2025/01/16 12:24:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:24:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:24:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████| 25/25 [26:07<00:00, 62.68s/it]

2025/01/16 12:50:46 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/16 12:50:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].
2025/01/16 12:50:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0]
2025/01/16 12:50:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:50:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:50:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==



Average Metric: 17.00 / 24 (70.8%):  96%|████████████████████████████████████████▎ | 24/25 [00:31<00:02,  2.04s/it]

2025/01/16 12:51:50 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'An aluminum calorimeter of mass 50 g contains 95 g of a mixture of water and ice at 0°C. When 100 g of aluminum which has been heated in a steam jacket is dropped into the mixture, the temperature rises to 5°C. Find the mass of ice originally present if the specific heat capacity of aluminum is 0.22 cal/g\\bulletCdeg.', 'options': {'A': '13.0 g', 'B': '19.50 g', 'C': '22.0 g', 'D': '17.5 g', 'E': '25.0 g', 'F': '16.0 g', 'G': '20.0 g', 'H': '18.0 g', 'I': '21.0 g', 'J': '15.0 g'}, 'answer': 'G'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 17.00 / 24 (70.8%): 100%|██████████████████████████████████████████| 25/25 [01:04<00:00,  2.56s/it]

2025/01/16 12:51:50 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 25 (68.0%)
2025/01/16 12:51:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].
2025/01/16 12:51:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0]
2025/01/16 12:51:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:51:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:51:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████| 25/25 [00:25<00:00,  1.04s/it]

2025/01/16 12:52:16 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/16 12:52:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].
2025/01/16 12:52:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0]
2025/01/16 12:52:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:52:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:52:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==



Average Metric: 20.00 / 25 (80.0%): 100%|██████████████████████████████████████████| 25/25 [00:26<00:00,  1.07s/it]

2025/01/16 12:52:43 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)
2025/01/16 12:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].
2025/01/16 12:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0]
2025/01/16 12:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:52:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████| 25/25 [00:51<00:00,  2.04s/it]

2025/01/16 12:53:34 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/16 12:53:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 12:53:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0]
2025/01/16 12:53:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:53:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:53:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████| 25/25 [00:52<00:00,  2.12s/it]

2025/01/16 12:54:27 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/16 12:54:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].
2025/01/16 12:54:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0]
2025/01/16 12:54:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:54:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:54:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==



Average Metric: 19.00 / 24 (79.2%):  96%|████████████████████████████████████████▎ | 24/25 [01:29<00:05,  5.78s/it]

2025/01/16 12:57:16 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'An aluminum calorimeter of mass 50 g contains 95 g of a mixture of water and ice at 0°C. When 100 g of aluminum which has been heated in a steam jacket is dropped into the mixture, the temperature rises to 5°C. Find the mass of ice originally present if the specific heat capacity of aluminum is 0.22 cal/g\\bulletCdeg.', 'options': {'A': '13.0 g', 'B': '19.50 g', 'C': '22.0 g', 'D': '17.5 g', 'E': '25.0 g', 'F': '16.0 g', 'G': '20.0 g', 'H': '18.0 g', 'I': '21.0 g', 'J': '15.0 g'}, 'answer': 'G'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 19.00 / 24 (79.2%): 100%|██████████████████████████████████████████| 25/25 [01:48<00:00,  4.33s/it]

2025/01/16 12:57:16 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 25 (76.0%)
2025/01/16 12:57:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].
2025/01/16 12:57:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0]
2025/01/16 12:57:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:57:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:57:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████| 25/25 [00:29<00:00,  1.18s/it]

2025/01/16 12:57:45 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/16 12:57:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].
2025/01/16 12:57:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0]
2025/01/16 12:57:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67]
2025/01/16 12:57:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.67


2025/01/16 12:57:45 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====
2025/01/16 12:57:45 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...



Average Metric: 223.00 / 300 (74.3%): 100%|██████████████████████████████████████| 300/300 [04:36<00:00,  1.09it/s]

2025/01/16 13:02:22 INFO dspy.evaluate.evaluate: Average Metric: 223 / 300 (74.3%)
2025/01/16 13:02:22 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 74.33
2025/01/16 13:02:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:02:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33
2025/01/16 13:02:22 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/16 13:02:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==



Average Metric: 17.00 / 25 (68.0%): 100%|██████████████████████████████████████████| 25/25 [00:43<00:00,  1.76s/it]

2025/01/16 13:03:06 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)
2025/01/16 13:03:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 10'].
2025/01/16 13:03:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0]
2025/01/16 13:03:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:03:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:03:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==



Average Metric: 19.00 / 25 (76.0%): 100%|██████████████████████████████████████████| 25/25 [00:29<00:00,  1.16s/it]

2025/01/16 13:03:35 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)
2025/01/16 13:03:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 17'].
2025/01/16 13:03:35 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0]
2025/01/16 13:03:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:03:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:03:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==



Average Metric: 19.00 / 25 (76.0%): 100%|██████████████████████████████████████████| 25/25 [00:48<00:00,  1.94s/it]

2025/01/16 13:04:24 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)
2025/01/16 13:04:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 13'].
2025/01/16 13:04:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0]
2025/01/16 13:04:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:04:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:04:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==



Average Metric: 19.00 / 24 (79.2%):  96%|████████████████████████████████████████▎ | 24/25 [00:28<00:02,  2.65s/it]

2025/01/16 13:05:23 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider the initial value problem\n$$\ny^{\\prime \\prime}+\\gamma y^{\\prime}+y=k \\delta(t-1), \\quad y(0)=0, \\quad y^{\\prime}(0)=0\n$$\nwhere $k$ is the magnitude of an impulse at $t=1$ and $\\gamma$ is the damping coefficient (or resistance).\nLet $\\gamma=\\frac{1}{2}$. Find the value of $k$ for which the response has a peak value of 2 ; call this value $k_1$.', 'options': {'A': '3.1415', 'B': '3.9022', 'C': ' 2.8108', 'D': '2.0000', 'E': '3.5672', 'F': '2.3456', 'G': '4.0000', 'H': '2.7182', 'I': '1.7890', 'J': '1.6180'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 19.00 / 24 (79.2%): 100%|██████████████████████████████████████████| 25/25 [00:58<00:00,  2.36s/it]

2025/01/16 13:05:23 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 25 (76.0%)
2025/01/16 13:05:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].
2025/01/16 13:05:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0]
2025/01/16 13:05:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:05:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:05:23 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==



Average Metric: 19.00 / 24 (79.2%):  96%|████████████████████████████████████████▎ | 24/25 [00:48<00:06,  6.02s/it]

2025/01/16 13:06:27 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'An aluminum calorimeter of mass 50 g contains 95 g of a mixture of water and ice at 0°C. When 100 g of aluminum which has been heated in a steam jacket is dropped into the mixture, the temperature rises to 5°C. Find the mass of ice originally present if the specific heat capacity of aluminum is 0.22 cal/g\\bulletCdeg.', 'options': {'A': '13.0 g', 'B': '19.50 g', 'C': '22.0 g', 'D': '17.5 g', 'E': '25.0 g', 'F': '16.0 g', 'G': '20.0 g', 'H': '18.0 g', 'I': '21.0 g', 'J': '15.0 g'}, 'answer': 'G'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 19.00 / 24 (79.2%): 100%|██████████████████████████████████████████| 25/25 [01:03<00:00,  2.55s/it]

2025/01/16 13:06:27 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 25 (76.0%)
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 14'].
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0]
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|████████████████████████████████████████| 25/25 [00:00<00:00, 1556.24it/s]

2025/01/16 13:06:27 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0]
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==



Average Metric: 16.00 / 25 (64.0%): 100%|████████████████████████████████████████| 25/25 [00:00<00:00, 1375.00it/s]

2025/01/16 13:06:27 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0]
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:06:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==



Average Metric: 16.00 / 25 (64.0%): 100%|██████████████████████████████████████████| 25/25 [00:31<00:00,  1.26s/it]

2025/01/16 13:06:59 INFO dspy.evaluate.evaluate: Average Metric: 16 / 25 (64.0%)
2025/01/16 13:06:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/01/16 13:06:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0]
2025/01/16 13:06:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:06:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:06:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==



Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████| 25/25 [00:23<00:00,  1.05it/s]

2025/01/16 13:07:22 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)
2025/01/16 13:07:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 3'].
2025/01/16 13:07:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0]
2025/01/16 13:07:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:07:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:07:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████| 25/25 [00:59<00:00,  2.37s/it]

2025/01/16 13:08:22 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/16 13:08:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 12'].
2025/01/16 13:08:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0, 72.0]
2025/01/16 13:08:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33]
2025/01/16 13:08:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 74.33


2025/01/16 13:08:22 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====
2025/01/16 13:08:22 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 92.0) from minibatch trials...



Average Metric: 229.00 / 299 (76.6%): 100%|█████████████████████████████████████▊| 299/300 [04:10<00:05,  5.53s/it]

2025/01/16 13:12:47 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider the initial value problem\n$$\ny^{\\prime \\prime}+\\gamma y^{\\prime}+y=k \\delta(t-1), \\quad y(0)=0, \\quad y^{\\prime}(0)=0\n$$\nwhere $k$ is the magnitude of an impulse at $t=1$ and $\\gamma$ is the damping coefficient (or resistance).\nLet $\\gamma=\\frac{1}{2}$. Find the value of $k$ for which the response has a peak value of 2 ; call this value $k_1$.', 'options': {'A': '3.1415', 'B': '3.9022', 'C': ' 2.8108', 'D': '2.0000', 'E': '3.5672', 'F': '2.3456', 'G': '4.0000', 'H': '2.7182', 'I': '1.7890', 'J': '1.6180'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 229.00 / 299 (76.6%): 100%|██████████████████████████████████████| 300/300 [04:25<00:00,  1.13it/s]

2025/01/16 13:12:47 INFO dspy.evaluate.evaluate: Average Metric: 229.0 / 300 (76.3%)
2025/01/16 13:12:47 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 76.33
2025/01/16 13:12:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33]
2025/01/16 13:12:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33
2025/01/16 13:12:47 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/16 13:12:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==



Average Metric: 14.00 / 25 (56.0%): 100%|██████████████████████████████████████████| 25/25 [00:53<00:00,  2.14s/it]

2025/01/16 13:13:41 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0, 72.0, 56.0]
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33]
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33


2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==



Average Metric: 21.00 / 25 (84.0%): 100%|████████████████████████████████████████| 25/25 [00:00<00:00, 1533.12it/s]

2025/01/16 13:13:41 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 3'].
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0, 72.0, 56.0, 84.0]
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33]
2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33


2025/01/16 13:13:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==



Average Metric: 19.00 / 25 (76.0%): 100%|██████████████████████████████████████████| 25/25 [00:26<00:00,  1.07s/it]

2025/01/16 13:14:08 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)
2025/01/16 13:14:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 3'].
2025/01/16 13:14:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0, 72.0, 56.0, 84.0, 76.0]
2025/01/16 13:14:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33]
2025/01/16 13:14:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33


2025/01/16 13:14:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==



Average Metric: 18.00 / 25 (72.0%): 100%|██████████████████████████████████████████| 25/25 [00:31<00:00,  1.26s/it]

2025/01/16 13:14:40 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 9'].
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0, 72.0, 56.0, 84.0, 76.0, 72.0]
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33]
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33


2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==



Average Metric: 14.00 / 25 (56.0%): 100%|████████████████████████████████████████| 25/25 [00:00<00:00, 1454.34it/s]

2025/01/16 13:14:40 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 3'].
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [72.0, 68.0, 68.0, 72.0, 80.0, 72.0, 68.0, 80.0, 76.0, 72.0, 68.0, 76.0, 76.0, 76.0, 76.0, 92.0, 64.0, 64.0, 92.0, 72.0, 56.0, 84.0, 76.0, 72.0, 56.0]
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33]
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33


2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====
2025/01/16 13:14:40 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 78.0) from minibatch trials...



Average Metric: 208.00 / 271 (76.8%):  90%|██████████████████████████████████▏   | 270/300 [03:11<00:20,  1.50it/s]

2025/01/16 13:17:51 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider the initial value problem\n$$\ny^{\\prime \\prime}+\\gamma y^{\\prime}+y=k \\delta(t-1), \\quad y(0)=0, \\quad y^{\\prime}(0)=0\n$$\nwhere $k$ is the magnitude of an impulse at $t=1$ and $\\gamma$ is the damping coefficient (or resistance).\nLet $\\gamma=\\frac{1}{2}$. Find the value of $k$ for which the response has a peak value of 2 ; call this value $k_1$.', 'options': {'A': '3.1415', 'B': '3.9022', 'C': ' 2.8108', 'D': '2.0000', 'E': '3.5672', 'F': '2.3456', 'G': '4.0000', 'H': '2.7182', 'I': '1.7890', 'J': '1.6180'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 223.00 / 298 (74.8%): 100%|█████████████████████████████████████▊| 299/300 [03:56<00:07,  7.34s/it]

2025/01/16 13:18:50 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'An aluminum calorimeter of mass 50 g contains 95 g of a mixture of water and ice at 0°C. When 100 g of aluminum which has been heated in a steam jacket is dropped into the mixture, the temperature rises to 5°C. Find the mass of ice originally present if the specific heat capacity of aluminum is 0.22 cal/g\\bulletCdeg.', 'options': {'A': '13.0 g', 'B': '19.50 g', 'C': '22.0 g', 'D': '17.5 g', 'E': '25.0 g', 'F': '16.0 g', 'G': '20.0 g', 'H': '18.0 g', 'I': '21.0 g', 'J': '15.0 g'}, 'answer': 'G'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 223.00 / 298 (74.8%): 100%|██████████████████████████████████████| 300/300 [04:10<00:00,  1.20it/s]

2025/01/16 13:18:50 INFO dspy.evaluate.evaluate: Average Metric: 223.0 / 300 (74.3%)
2025/01/16 13:18:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [72.67, 74.33, 76.33, 74.33]
2025/01/16 13:18:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.33
2025/01/16 13:18:50 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/01/16 13:18:50 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 76.33!





In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [45]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

BEST PROMPT:
 You are a helpful assistant designed to help with multiple-choice questions. Given a question and a set of options, analyze the question carefully, considering the context and the principles involved. Then, evaluate each option in relation to the question, selecting the most appropriate answer based on your analysis. Provide a step-by-step reasoning for your choice, explaining why you selected a particular option and why the others are incorrect. Ensure your reasoning is clear, concise, and directly addresses the question being asked.


In [55]:
print("BEST EXAMPLES:\n", optimized_program.predict.demos)

BEST EXAMPLES:
 [Example({'augmented': True, 'question': 'Howarethe type of muscle tissues differentiated?', 'options': {'A': 'Muscle tissues are differentiated by the presence or absence of striations under a microscope', 'B': 'Muscle tissues are differentiated by the types of hormones they respond to', 'C': 'Muscle tissues are differentiated by their color under standard lighting conditions', 'D': 'Muscle tissues are differentiated by their size and shape', 'E': 'Muscle tissues are differentiated by their ability to produce electrical impulses', 'F': 'Muscle tissues are differentiated by their ability to heal and regenerate', 'G': 'Muscle tissues are differentiated by the number of nuclei per cell', 'H': 'Muscle tissues are differentiated by the speed of contraction and relaxation', 'I': 'Muscle tissues are differentiated by their location in the body', 'J': 'Skeletal muscle is responsible for most voluntary movements, smooth muscle is involved in most involuntary movements of intern

In [46]:
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:subset_size],
    display_table=False,
)

Average Metric: 386.00 / 500 (77.2%): 100%|██████████████████████████████████████| 500/500 [31:43<00:00,  3.81s/it]

2025/01/16 14:00:46 INFO dspy.evaluate.evaluate: Average Metric: 386 / 500 (77.2%)





In [51]:
eval_medium_subset_size = 300
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:eval_medium_subset_size],
)

Average Metric: 236.00 /


2025/01/16 15:20:02 INFO dspy.evaluate.evaluate: Average Metric: 236 / 300 (78.7%)


Unnamed: 0,question,options,example_answer,reasoning,pred_answer,metric
0,Describe the evolution of the reptilian excretory system to accoun...,{'A': 'The excretory system includes a secondary bladder for water...,J,The evolution of the reptilian excretory system from an aquatic to...,J,✔️ [True]
1,A scientist used his car to transport a large quantity of highly f...,"{'A': 'No, because the doctor should have been more careful around...",D,To determine if the doctor will prevail in a claim against the sci...,D,✔️ [True]
2,Which of the following could be used as a test for autocorrelation...,"{'A': 'The Dickey-Fuller test', 'B': 'The Jarque-Bera test', 'C': ...",G,The Breusch-Godfrey test is a statistical test used to detect auto...,G,✔️ [True]
3,Write the balanced cell reaction and calculate theemfat 298 K of t...,"{'A': '.25 V', 'B': '.114 V', 'C': '0.0157963 V', 'D': '.1298 V', ...",,"To solve this problem, we need to write the balanced cell reaction...",D,
4,Assume a temperature of 300 K and find the wavelength of the photo...,"{'A': '2100.0', 'B': '2200.0', 'C': '1600.0', 'D': '1400.0', 'E': ...",G,To find the wavelength of the photon necessary to cause an electro...,J,
...,...,...,...,...,...,...
295,We were first able to accurately measure the diameter of Pluto from:,"{'A': ""Lunar-based observations made during NASA's Apollo missions...",H,The diameter of Pluto was first accurately measured through observ...,D,
296,Which of the following is a clustering algorithm in machine learning?,"{'A': 'Linear Regression', 'B': 'CART', 'C': 'Logistic Regression'...",D,Clustering algorithms in machine learning are used to group simila...,D,✔️ [True]
297,"In a population in Denmark, the relative fitness of the allele for...","{'A': '7.82 × 10^-5', 'B': '3.14 × 10^-5', 'C': '1.19 × 10^-4', 'D...",H,"To find the mutation rate, we first need to understand the relatio...",D,
298,Miss Jones has been concerned about her health lately. She has not...,"{'A': 'herpes', 'B': 'trichomoniasis', 'C': 'pubic lice', 'D': 'sy...",C,Given Miss Jones' symptoms of itching and skin irritation around h...,C,✔️ [True]


## Heavy Optimization

In [None]:
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="heavy",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset,
    display_table=False,
)