### Imports & Env Setup

In [6]:
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
load_dotenv()

import dspy
sys.path.append(os.path.abspath('../'))
from prompt_migrator.benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

### Configuration

In [32]:
# Thread count handed to both the evaluator and the MIPROv2 optimizers.
NUM_THREADS = 16

# Forwarded to MIPROv2 as `max_labeled_demos`.
FEW_SHOTS = 5

# Both LMs use the same vLLM-served model through LiteLLM's `hosted_vllm/` provider.
# See https://docs.litellm.ai/docs/providers/vllm for details
VLLM_MODEL_ID = "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct"
VLLM_API_BASE = "http://localhost:8000/v1"

# Optional settings noted by the original author but intentionally left unset:
# api_version, api_key, seed, max_tokens, timeout.
TASK_MODEL = dspy.LM(VLLM_MODEL_ID, api_base=VLLM_API_BASE)
PROMPT_MODEL = dspy.LM(VLLM_MODEL_ID, api_base=VLLM_API_BASE)

# Register the task model as dspy's configured LM.
dspy.configure(lm=TASK_MODEL)

# Benchmark under test; replace with llama_mmlu_pro (or another benchmark) to switch.
benchmark = leaderboard_mmlu_pro

# The argument is the initial system prompt; the empty string leaves it blank.
initial_signature = benchmark.signature("")

# With chain of thought. For the variant without chain of thought, use
# dspy.Predict(initial_signature) instead.
program = dspy.ChainOfThought(initial_signature)

# Shared evaluator. The empty devset is only a placeholder: every call further
# down passes its own `devset=` and unpacks (score, results, all_scores).
evaluate_settings = dict(
    devset=[],
    metric=benchmark.metric,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,
)
evaluate = dspy.Evaluate(**evaluate_settings)

### Load dataset

In [33]:
# Split the benchmark into train / validation / test sets.
# NOTE(review): how train_size / validation_size are interpreted (fraction of the
# whole set vs. of the remainder) is defined in prompt_migrator.benchmarks — confirm there.
trainset, valset, testset = benchmark.datasets(validation_size=0.2, train_size=0.1)

# Cell output: the number of examples in each split.
tuple(len(split) for split in (trainset, valset, testset))

(1203, 2165, 8664)

### Optimize Subset + Evaluation

In [None]:
# Quick run: only the first `subset_size` examples of each split are used.
# `subset_size` is also read by the evaluation cell of this section.
subset_size = 20

optimizer = dspy.MIPROv2(
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    metric=benchmark.metric,
    max_labeled_demos=FEW_SHOTS,
    num_threads=NUM_THREADS,
    auto="light",
)

train_subset = trainset[:subset_size]
val_subset = valset[:subset_size]

# requires_permission_to_run=False: presumably skips MIPROv2's confirmation
# step so the cell runs unattended — confirm against the installed dspy version.
optimized_program = optimizer.compile(
    program,
    trainset=train_subset,
    valset=val_subset,
    requires_permission_to_run=False,
)

2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 20

2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


 20%|███████████████████████████████████████████████                                                                                                                                                                                            | 4/20 [00:20<01:23,  5.19s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 4/5


 40%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                             | 8/20 [00:52<01:18,  6.56s/it]


Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 5/5


 20%|███████████████████████████████████████████████                                                                                                                                                                                            | 4/20 [00:21<01:24,  5.29s/it]
2025/01/15 17:46:23 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/15 17:46:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/01/15 17:46:47 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, `options`, produce the fields `reasoning`, `answer`.

2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To address the task effectively, provide a detailed, step-by-step explanation for your reasoning when answering multiple-choice questions across various subjects, including biology, chemistry, physics, and social sciences. Ensure your response includes the following elements: 
1. A clear understanding of the question being asked.
2. An evaluation of each option based on relevant knowledge and critical thinking.
3. A logical deduction of the most appropriate answer.
4. A concise summary of your reasoning process.
5. The final answer choice selected from the provided options.

When constructing

Average Metric: 5.00 / 20 (25.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:24<00:00,  1.24s/it]

2025/01/15 17:48:05 INFO dspy.evaluate.evaluate: Average Metric: 5 / 20 (25.0%)
2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 25.0

2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====



Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:23<00:00,  1.19s/it]

2025/01/15 17:48:29 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 75.0
2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0]
2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 13.00 / 16 (81.2%):  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 16/20 [00:12<00:01,  2.94it/s]

In [13]:
# Instruction text carried by the optimized program's predictor signature.
best_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", best_instructions)

BEST PROMPT:
 Given the fields `question`, `options`, produce the fields `reasoning`, `answer`.


In [5]:
# Score the optimized program on the first `subset_size` held-out test examples.
test_subset = testset[:subset_size]
score, results, all_scores = evaluate(optimized_program, devset=test_subset)

Average Metric: 11.00 / 20 (55.0%): 100%|██████████| 20/20 [00:12<00:00,  1.65it/s]

2025/01/15 13:55:52 INFO dspy.evaluate.evaluate: Average Metric: 11 / 20 (55.0%)





Unnamed: 0,question,options,example_answer,reasoning,pred_answer,metric
0,In 1935 roughly how many Americans were in favor of Social Securit...,"[A. 20%, B. 30%, C. 60%, D. 100%, E. 40%, F. 10%, G. 80%, H. 90%, ...",H,Not supplied for this particular example.,C,
1,A circular opening of radius 0.8mm in an opaque screen is illumina...,"['A. Wavelength of light for bright spot: 1.25 × 10^-6 m, Waveleng...",B,"To find the wavelength of light for both bright and dark spots, we...","B. Wavelength of light for bright spot: 1.17 × 10^-6 m, Wavelength...",
2,"For the dissociation reaction cl_2(g) > 2Cl(g) at 1200 °K, calcula...","['A. 1.10 × 10^-4', 'B. 6.89 × 10^-6', 'C. 8.97 × 10^-5', 'D. 1.23...",G,To calculate the equilibrium constant \( K_p \) for the dissociati...,B,
3,With what force does the Earth attract the moon?,"['A. 1.2 × 10^25 dynes', 'B. 5.0 × 10^25 dynes', 'C. 3.0 × 10^25 d...",D,The force with which the Earth attracts the Moon can be calculated...,D,✔️ [True]
4,A beam of electrons has speed 10^7 m/s. It is desired to use the m...,"['A. 0.1 m', 'B. 1 mm', 'C. 1 μm', 'D. 0.01 m', 'E. 1 m', 'F. 10 m...",E,To find the radius of the circle in which the electron beam will t...,E,✔️ [True]
5,Calculate the roost probable distribution and the thermodynamicpro...,['A. most probable distribution is number 3; \\Omega = 250; entrop...,I,Not supplied for this particular example.,B,
6,A cylinder with a movable piston contains a gas at pressure P = 1 ...,"['A. 6 × 10^5 Pa', 'B. 9 × 10^5 Pa', 'C. 3 × 10^5 Pa', 'D. 5 × 10^...",H,"According to Boyle's Law, for a given amount of gas at constant te...",H,✔️ [True]
7,If current real GDP is $5000 and full employment real GDP is at $4...,['A. A decrease in taxes and buying bonds in an open market operat...,F,The current real GDP of $5000 is above the full employment real GD...,G,
8,Describe the function of the lateral-line system in fishes.,['A. The lateral-line system in fishes helps with the digestion of...,G,The lateral-line system in fishes is a specialized sensory system ...,G,✔️ [True]
9,"In class, John's teacher tells him that she will give him the coin...","['A. sensory memory decay', 'B. retroactive interference', 'C. fai...",C,"John's inability to identify the pictures on the coins and bills, ...",C,✔️ [True]


## Medium Optimization

In [None]:
# Medium run: same recipe as the light run, with auto="medium" and a
# 500-example cap on the train and validation slices.
# `subset_size` is also read by the evaluation cell of this section.
subset_size = 500

medium_settings = {
    "metric": benchmark.metric,
    "auto": "medium",
    "num_threads": NUM_THREADS,
    "task_model": TASK_MODEL,
    "prompt_model": PROMPT_MODEL,
    "max_labeled_demos": FEW_SHOTS,
}
optimizer = dspy.MIPROv2(**medium_settings)

# requires_permission_to_run=False: presumably skips MIPROv2's confirmation
# step so the cell runs unattended — confirm against the installed dspy version.
optimized_program = optimizer.compile(
    program,
    valset=valset[:subset_size],
    trainset=trainset[:subset_size],
    requires_permission_to_run=False,
)

In [None]:
# Instruction text carried by the optimized program's predictor signature.
best_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", best_instructions)

In [None]:
# Score on the first `subset_size` test examples; the per-example table is
# switched off for this call (the shared evaluator enables it by default).
test_subset = testset[:subset_size]
score, results, all_scores = evaluate(
    optimized_program,
    display_table=False,
    devset=test_subset,
)

## Heavy Optimization

In [None]:
# Heavy run: auto="heavy" over the full train and validation splits (no slicing).
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="heavy",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

# requires_permission_to_run=False matches the light and medium runs. Without
# it the library default applies (an interactive confirmation before starting),
# which stalls "Restart & Run All" and fails under headless execution.
# Remove the argument to restore that confirmation gate for this expensive run.
optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
    requires_permission_to_run=False,
)

In [None]:
# Instruction text carried by the optimized program's predictor signature.
best_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", best_instructions)

In [None]:
# Score on the entire test split; the per-example table is switched off for
# this call (the shared evaluator enables it by default).
score, results, all_scores = evaluate(
    optimized_program,
    display_table=False,
    devset=testset,
)