### Imports & Env Setup

In [1]:
# Reload edited modules automatically before each cell runs (autoreload mode 2).
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
# Runs before the project imports below; presumably loads settings from a local .env file — confirm which variables are required.
load_dotenv()

import dspy
# Add the parent directory to the import path; the `benchmarks` import on the next line presumably relies on it.
sys.path.append(os.path.abspath('../'))
from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

* 'fields' has been removed


### Configuration

In [2]:
# Thread count shared by the evaluator below and the MIPROv2 optimizers.
NUM_THREADS = 32

# Passed to the optimizers as `max_labeled_demos`.
FEW_SHOTS = 5

# Both LMs point at the same locally served vLLM endpoint through LiteLLM's
# `hosted_vllm/` provider prefix.
# See https://docs.litellm.ai/docs/providers/vllm for details.
MODEL_NAME = "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct"
API_BASE = "http://localhost:8000/v1"

# Further optional keyword arguments that can be supplied to dspy.LM if needed:
# api_version, api_key, seed, max_tokens, timeout.
TASK_MODEL = dspy.LM(MODEL_NAME, api_base=API_BASE)
PROMPT_MODEL = dspy.LM(MODEL_NAME, api_base=API_BASE)

dspy.configure(lm=TASK_MODEL)

# Benchmark module under test; swap in leaderboard_mmlu_pro (also imported
# above) to run the other variant.
benchmark = llama_mmlu_pro

# Initial system prompt handed to benchmark.signature; may be left blank ("").
SYSTEM_PROMPT = "You are a helpful assistant designed to help with multiple choice question."

# Without chain of thought:
# program = dspy.Predict(
#     benchmark.signature("")
# )

# With chain of thought:
program = dspy.ChainOfThought(benchmark.signature(SYSTEM_PROMPT))

# Shared evaluator. The empty devset is a placeholder: every call in this
# notebook passes its own `devset=`.
evaluate = dspy.Evaluate(
    devset=[],
    metric=benchmark.metric,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,
)

### Load dataset

In [3]:
# Ask the benchmark for its three splits, using 10% of the data for training
# and 20% for validation.
trainset, valset, testset = benchmark.datasets(train_size=0.1, validation_size=0.2)

# Display the number of examples in each split as (train, val, test).
tuple(len(split) for split in (trainset, valset, testset))

(1197, 2156, 8626)

### Baseline Benchmark

In [4]:
%%time
# Show the instructions carried by the un-optimized program's predictor signature.
print("BASE PROMPT:\n", program.predict.signature.instructions)

BASE PROMPT:
 You are a helpful assistant designed to help with multiple choice question. Always return a JSON object with the following format:
{
  "reasoning": "Step-by-step reasoning here.",
  "answer": "Final answer (A, B, C, etc.)"
}
Do NOT return plain text. Only return a valid JSON object with these keys.
CPU times: user 178 μs, sys: 18 μs, total: 196 μs
Wall time: 176 μs


In [5]:
%%time

print("Starting execution...")
eval_subset_size = len(testset)
evaluate(
    program,
    devset=testset[:eval_subset_size],
)

Starting execution...
Average Metric: 0.00 / 8626 (0.0%): 100%|██████████████████████████████████████████████████████████████████████████████████| 8626/8626 [7:07:06<00:00,  2.97s/it]

2025/01/22 08:15:35 INFO dspy.evaluate.evaluate: Average Metric: 0 / 8626 (0.0%)





Unnamed: 0,question,options,example_reasoning,example_answer,pred_reasoning,pred_answer,metric
0,How does Freudian theory account for homosexuality?,"{'A': 'According to Freudian theory, homosexuality is a result of ...",,D,"Freudian theory, as proposed by Sigmund Freud, attempts to explain...",D,
1,Find the remainder when 25^1059 is divided by 23.,"{'A': '22', 'B': '6', 'C': '11', 'D': '5', 'E': '13', 'F': '3', 'G...",,,"To find the remainder when 25^1059 is divided by 23, we can use mo...",I,
2,A company sells its product at two different prices in two differe...,"{'A': 'Experimental', 'B': 'Predictive', 'C': 'Causal', 'D': 'Obse...",,A,"To identify the optimal price for the product, the research needs ...",A,
3,In 1989 scientists from Norway discovered that there are far more ...,"{'A': 'up to 5000000', 'B': 'up to 2500000', 'C': 'up to 500000000...",,B,"To answer this question, we need to consider the findings of the N...",B,
4,"Two processors, M-5 and M-7, implement the same instruction set. P...","{'A': 'All three statements are true', 'B': 'None of the statement...",,C,"To determine which of the statements are true, let's analyze each ...",C,
...,...,...,...,...,...,...,...
8621,What is meant by translocation? What theories have been advanced t...,{'A': 'Translocation is the movement of water from the roots to ot...,,I,"To answer this question, we first need to understand what transloc...",I,
8622,(1) Where is the near point of an eye for which a spectacle lens o...,"{'A': '30 cm, -150 cm', 'B': '40 cm, -170 cm', 'C': '65 cm, -210 c...",,F,To find the near point of an eye for which a spectacle lens of pow...,F,
8623,"In the popular nursery rhyme ""Mary Had a Little Lamb"", her lambwou...","{'A': ""The lamb had a habitual path that coincidentally aligned wi...",,B,The nursery rhyme 'Mary Had a Little Lamb' describes a scenario wh...,B,
8624,Studies into the etiology of Schizophrenia indicated a genetic pre...,{'A': 'Excess dopamine or sensitivity to dopamine could be a contr...,,B,The question requires identifying the incorrect statement regardin...,B,


CPU times: user 2min 9s, sys: 6.55 s, total: 2min 15s
Wall time: 7h 7min 7s


(0.0,
 [(Example({'question': 'How does Freudian theory account for homosexuality?', 'options': {'A': 'According to Freudian theory, homosexuality is a result of an overbearing mother', 'B': 'According to Freudian theory, homosexuality is a manifestation of latent desires that have not been expressed in childhood', 'C': 'Freudian theory claims that homosexuality is a consequence of a trauma experienced during adolescence', 'D': 'Freudian theory explains homosexuality as a result of unresolved Oedipal conflict and fear of sexual contact with the opposite sex stemming from early life events.', 'E': 'According to Freudian theory, homosexuality develops from a lack of proper role models of the same sex', 'F': 'Freudian theory attributes homosexuality to an imbalance of hormones in the developmental stages', 'G': 'Freudian theory suggests homosexuality is a choice', 'H': 'Freudian theory posits that homosexuality is an innate sexual orientation present from birth', 'I': 'Homosexuality is a 

In [6]:
%%time
subset_size = 20
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="light",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

2025/01/22 09:16:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 20

2025/01/22 09:16:29 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/22 09:16:29 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/22 09:16:29 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:55<00:00,  2.78s/it]


Bootstrapped 0 full traces after 19 examples for up to 1 rounds, amounting to 20 attempts.
Bootstrapping set 4/5


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [01:03<00:00,  3.19s/it]


Bootstrapped 0 full traces after 19 examples for up to 1 rounds, amounting to 20 attempts.
Bootstrapping set 5/5


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [01:10<00:00,  3.50s/it]
2025/01/22 09:19:38 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/22 09:19:38 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 0 full traces after 19 examples for up to 1 rounds, amounting to 20 attempts.


2025/01/22 09:19:41 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...



Error getting data summary: Expected dict_keys(['observations']) but got dict_keys(['observations', 'reasoning', 'answer']).

Running without data aware proposer.



AssertionError: Expected dict_keys(['proposed_instruction']) but got dict_keys(['proposed_instruction', 'reasoning', 'answer'])

In [7]:
%%time
# Show the instructions selected by the optimizer; needs `optimized_program` from the optimization cell above.
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

NameError: name 'optimized_program' is not defined

In [5]:
%%time
eval_subset_size = len(testset)
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:eval_subset_size],
)

NameError: name 'optimized_program' is not defined

## Medium Optimization

In [None]:
%%time
subset_size = 500
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="medium",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

2025/01/22 08:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 19
valset size: 300

2025/01/22 08:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/22 08:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/22 08:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...


Bootstrapping set 1/19
Bootstrapping set 2/19
Bootstrapping set 3/19


 72%|████████████████████████████████████████████████████████████████████████████████████████▎                                 | 362/500 [55:43<04:50,  2.11s/it]

In [None]:
# Show the instructions on the optimized program's predictor signature.
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
# NOTE(review): identical to the preceding cell; one of the two can likely be removed.
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
# Show the demos attached to the optimized program's predictor.
print("BEST EXAMPLES:\n", optimized_program.predict.demos)

In [None]:
%%time
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:subset_size],
    display_table=False,
)

In [None]:
%%time
eval_medium_subset_size = 300
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:eval_medium_subset_size],
)

## Heavy Optimization

In [None]:
# Heavy MIPROv2 run over the complete train and validation splits.
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="heavy",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
    # Match the light/medium cells: skip the interactive confirmation so a
    # Restart & Run All does not stall waiting for user input.
    requires_permission_to_run=False,
)

In [None]:
# Show the instructions on the optimized program's predictor signature after the heavy run.
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
# Final evaluation of the optimized program on the whole test split; the
# per-example table is suppressed and all three return values are kept.
heavy_eval_options = dict(devset=testset, display_table=False)
score, results, all_scores = evaluate(optimized_program, **heavy_eval_options)