### Imports & Env Setup

In [1]:
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
load_dotenv()
from datasets import load_dataset


import dspy
sys.path.append(os.path.abspath('../'))
from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

### Configuration

In [2]:
NUM_THREADS = 48

FEW_SHOTS = 5

# See https://docs.litellm.ai/docs/providers/vllm for details
TASK_MODEL = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base = 'http://localhost:8000/v1' , # or api_base ?
    api_key = "dummy",
    # api_version: Optional[str] = None,
    # api_key: Optional[str] = None,
    # seed: Optional[int] = None,
    # max_tokens: Optional[int] = None,
    # timeout: Optional[Union[float, int]] = None,
)
PROMPT_MODEL = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base = 'http://localhost:8000/v1', # or api_base ?
    api_key = "dummy",

    # api_version: Optional[str] = None,
    # api_key: Optional[str] = None,
    # seed: Optional[int] = None,
    # max_tokens: Optional[int] = None,
    # timeout: Optional[Union[float, int]] = None,
)

dspy.configure(lm=TASK_MODEL)

# replace this with llama_mmlu_pro or whatever
benchmark = llama_mmlu_pro

# Without chain of thought:
# program = dspy.Predict(
#     benchmark.signature("")
# )

# With chain of thought:
program = dspy.ChainOfThought(
    benchmark.signature("You are a helpful assistant designed to help with multiple choice question.") # put your initial system prompt here, or leave blank
)

evaluate = dspy.Evaluate(
    devset=[],
    max_errors = 500,
    metric=benchmark.metric,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,
)

### Load dataset

In [3]:
trainset, valset, testset = benchmark.datasets(
    train_size=0.1,
    validation_size=0.2,
)

len(trainset), len(valset), len(testset)

(1000, 200, 10779)

In [4]:
dataset = load_dataset(
    "meta-llama/Llama-3.3-70B-Instruct-evals",
    "Llama-3.3-70B-Instruct-evals__mmlu_pro__details",
)
full_dataset = list(map(benchmark._task_doc_example, dataset["latest"]))

len(full_dataset)

11979

# using promopt from Meta

In [None]:
program = dspy.ChainOfThought(
    benchmark.signature("Given the following question and candidate answers, choose the best answer.") # put your initial system prompt here, or leave blank
)

In [None]:
%%time

print("Starting execution...")
# eval_subset_size = len(testset)
score, results, all_scores = evaluate(
    program,
    devset=full_dataset,
)

### Baseline Benchmark

In [5]:
%%time
print("BASE PROMPT:\n", program.signature.instructions)

BASE PROMPT:
 You are a helpful assistant designed to help with multiple choice question.
CPU times: user 274 μs, sys: 7 μs, total: 281 μs
Wall time: 238 μs


In [9]:
%%time
print("BEST EXAMPLES:\n", program.demos)

BEST EXAMPLES:
 []
CPU times: user 272 μs, sys: 9 μs, total: 281 μs
Wall time: 245 μs


In [None]:
%%time

print("Starting execution...")
# eval_subset_size = len(testset)
score, results, all_scores = evaluate(
    program,
    devset=full_dataset,
)

In [None]:
results.head()

In [None]:
import csv

with open("my_results.csv", mode="w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # If `results` is a list of lists, just write rows:
    for row in results:
        writer.writerow(row)


In [None]:
import pandas as pd

df = pd.DataFrame([(i, str(pred), score) for i, (example, pred, score) in enumerate(results)],
                     columns=['Example_Index', 'Prediction', 'Score'])
print("\nResults DataFrame:")
print(df)

In [None]:
comparison_df = pd.DataFrame([{
        'Question': example.inputs()['question'][:100] + '...' if len(example.inputs()['question']) > 100 else example.inputs()['question'],
        'Predicted Answer': prediction.answer if hasattr(prediction, 'answer') else str(prediction),
        'Correct Answer': example.answer,
        'Is Correct': '✔️' if score else '❌'
} for example, prediction, score in results])


csv_filename = 'prediction_results.csv'
comparison_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')  # utf-8-sig to handle special characters
print(f"\nResults saved to {csv_filename}")
    
pd.set_option('display.max_colwidth', None)
print("\nPredictions vs Actual Answers:")
print(comparison_df)

In [None]:
%%time

print("Starting execution...")
evaluate(
    program,
    devset=testset,
)

### Optimize Subset + Evaluation

In [10]:
%%time
subset_size = 20
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="light",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 20

2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


 25%|███████████▌                                  | 5/20 [00:00<00:00, 19.77it/s]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/5


 25%|███████████▎                                 | 5/20 [00:00<00:00, 249.55it/s]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 5/5


 10%|████▌                                        | 2/20 [00:00<00:00, 255.07it/s]
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a helpful assistant designed to help with multiple choice question.

2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To address the task effectively, I propose the following instruction: "You are a critical thinking assistant tasked with solving a multiple-choice question. Given the ques

Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 15.00 / 20 (75.0%): 100%|████████| 20/20 [00:00<00:00, 404.41it/s]

2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 75.0

2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====



Average Metric: 14.00 / 20 (70.0%): 100%|████████| 20/20 [00:00<00:00, 300.22it/s]


2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0]
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====


Average Metric: 13.00 / 20 (65.0%): 100%|████████| 20/20 [00:00<00:00, 148.40it/s]

2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 13 / 20 (65.0%)
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0]
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 16.00 / 20 (80.0%): 100%|████████| 20/20 [00:00<00:00, 357.67it/s]

2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)





2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 80.0
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0]
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====


Average Metric: 13.00 / 20 (65.0%): 100%|███████| 20/20 [00:00<00:00, 3125.76it/s]

2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 13 / 20 (65.0%)
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0]
2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====



Average Metric: 16.00 / 20 (80.0%): 100%|████████| 20/20 [00:00<00:00, 424.70it/s]

2025/01/29 14:31:09 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0, 80.0]
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====



Average Metric: 12.00 / 20 (60.0%): 100%|████████| 20/20 [00:00<00:00, 306.69it/s]


2025/01/29 14:31:09 INFO dspy.evaluate.evaluate: Average Metric: 12 / 20 (60.0%)
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0, 80.0, 60.0]
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====


Average Metric: 14.00 / 20 (70.0%): 100%|████████| 20/20 [00:00<00:00, 388.98it/s]


2025/01/29 14:31:09 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0, 80.0, 60.0, 70.0]
2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 80.0!


CPU times: user 1.43 s, sys: 328 ms, total: 1.76 s
Wall time: 1.55 s


In [11]:
%%time
print("BEST PROMPT:\n", optimized_program.signature.instructions)

BEST PROMPT:
 You are a helpful assistant designed to help with multiple choice question.
CPU times: user 252 μs, sys: 13 μs, total: 265 μs
Wall time: 236 μs


In [12]:
print("BEST EXAMPLES:\n", optimized_program.demos)

BEST EXAMPLES:
 [Example({'question': 'Explain what difficulties would arise if messenger RNA molecules were not destroyed after they had produced some polypeptide chains.', 'options': {'A': 'mRNA would replicate rapidly', 'B': 'The cell would use mRNA as a source of energy', 'C': 'The cell would lack proteins', 'D': 'Cell would enter a state of permanent division', 'E': 'mRNA would be transformed into DNA', 'F': 'Excess protein production, energy depletion, and potential harm to the cell', 'G': 'mRNA would exit the cell and infect neighboring cells', 'H': 'Proteins would be broken down into mRNA', 'I': 'mRNA would become part of the cell membrane', 'J': 'mRNA would bind to lipids and carbohydrates, disrupting cellular metabolism'}, 'answer': 'F'}) (input_keys={'options', 'question'}), Example({'question': 'Based on the characteristic population curves that result from plotting population growth of a species, the most effective means of controlling the mosquito population is to', 'opti

In [15]:
examples_json = [example.to_dict() for example in optimized_program.demos]
print("BEST EXAMPLES:\n", json.dumps(examples_json, indent=2))

AttributeError: 'Example' object has no attribute 'to_dict'

In [None]:
%%time
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset,
)

## Medium Optimization

In [13]:
%%time
subset_size = 500
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="medium",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 19
valset size: 200

2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...


Bootstrapping set 1/19
Bootstrapping set 2/19
Bootstrapping set 3/19


  1%|▍                                            | 5/500 [00:16<26:58,  3.27s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/19


  1%|▍                                          | 5/500 [00:47<1:17:43,  9.42s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 5/19


  0%|▏                                            | 2/500 [00:10<44:44,  5.39s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 6/19


  1%|▍                                            | 5/500 [00:25<42:01,  5.09s/it]


Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 7/19


  1%|▎                                          | 4/500 [00:40<1:24:42, 10.25s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 8/19


  0%|▏                                            | 2/500 [00:10<45:15,  5.45s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 9/19


  1%|▌                                            | 6/500 [00:26<36:40,  4.45s/it]


Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 10/19


  1%|▎                                          | 4/500 [00:35<1:12:42,  8.80s/it]


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 11/19


  1%|▎                                            | 3/500 [00:18<49:42,  6.00s/it]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 12/19


  0%|▏                                          | 2/500 [00:20<1:23:41, 10.08s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 13/19


  1%|▍                                            | 5/500 [00:24<41:05,  4.98s/it]


Bootstrapped 2 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 14/19


  0%|                                                     | 0/500 [00:03<?, ?it/s]

KeyboardInterrupt



In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
print("BEST EXAMPLES:\n", optimized_program.demos)

In [None]:
examples_json = [example.to_dict() for example in optimized_program.predict.demos]
print("BEST EXAMPLES:\n", json.dumps(examples_json, indent=2))

In [None]:
%%time
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset,
)

## Heavy Optimization

In [None]:
%%time
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="heavy",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
print("BEST EXAMPLES:\n", optimized_program.predict.demos)

In [None]:
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset,
    display_table=False,
)