### Imports & Env Setup

In [1]:
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
load_dotenv()

import dspy
sys.path.append(os.path.abspath('../'))
from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

### Configuration

In [2]:
NUM_THREADS = 16

FEW_SHOTS = 5

# See https://docs.litellm.ai/docs/providers/vllm for details
TASK_MODEL = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base = 'http://localhost:8000/v1' , # or api_base ?
    # api_version: Optional[str] = None,
    api_key = "",
    # seed: Optional[int] = None,
    # max_tokens: Optional[int] = None,
    # timeout: Optional[Union[float, int]] = None,
)
PROMPT_MODEL = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base = 'http://localhost:8000/v1', # or api_base ?
    # api_version: Optional[str] = None,
    api_key = "",
    # seed: Optional[int] = None,
    # max_tokens: Optional[int] = None,
    # timeout: Optional[Union[float, int]] = None,
)

dspy.configure(lm=TASK_MODEL)

# replace this with llama_mmlu_pro or whatever
benchmark = llama_mmlu_pro

# Without chain of thought:
# program = dspy.Predict(
#     benchmark.signature("")
# )

# With chain of thought:
program = dspy.ChainOfThought(
    benchmark.signature("You are a helpful assistant designed to help with multiple choice question.") # put your initial system prompt here, or leave blank
)

evaluate = dspy.Evaluate(
    devset=[],
    metric=benchmark.metric,
    num_threads=NUM_THREADS,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,
)

### Load dataset

In [3]:
trainset, valset, testset = benchmark.datasets(
    train_size=0.1,
    validation_size=0.2,
)

len(trainset), len(valset), len(testset)

(1197, 2156, 8626)

### Baseline Benchmark

In [4]:
print("BASE PROMPT:\n", program.predict.signature.instructions)

BASE PROMPT:
 You are a helpful assistant designed to help with multiple choice question.


In [5]:
!export HOSTED_VLLM_API_KEY=""

In [6]:
import os 

os.environ["HOSTED_VLLM_API_KEY"]=""

In [7]:
eval_subset_size = 100
evaluate(
    program,
    devset=testset[:eval_subset_size],
)

  0%|                                                                                                                                    | 0/100 [00:00<?, ?it/s]

2025/01/21 15:18:14 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'What is the economic effect of the increase in the number of financial intermediaries?', 'options': {'A': 'Decreases the diversity of financial products available', 'B': 'Decreases the number of financial intermediaries', 'C': 'Reduces the overall liquidity in the financial system', 'D': 'Decreases the velocity of money', 'E': 'Leads to a decrease in consumer lending options', 'F': 'Increases the velocity of money', 'G': 'Has no effect on the velocity of money', 'H': 'Enhances the direct investment into the economy', 'I': 'Increases the cost of capital for businesses', 'J': 'Stabilizes the stock market volatility'}, 'answer': 'F'}) (input_keys={'options', 'question'}): litellm.AuthenticationError: AuthenticationError: Hosted_vllmException - The api_key client option must be set either by passing api_key to the client or by setting the HOSTED_VLLM_API_KEY environment variable. Set `provide_tra

AuthenticationError: litellm.AuthenticationError: AuthenticationError: Hosted_vllmException - The api_key client option must be set either by passing api_key to the client or by setting the HOSTED_VLLM_API_KEY environment variable

### Optimize Subset + Evaluation

In [None]:
subset_size = 20
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="light",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
eval_subset_size = 200
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:eval_subset_size],
)

## Medium Optimization

In [None]:
subset_size = 500
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="medium",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
print("BEST EXAMPLES:\n", optimized_program.predict.demos)

In [None]:
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:subset_size],
    display_table=False,
)

In [None]:
eval_medium_subset_size = 300
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset[:eval_medium_subset_size],
)

## Heavy Optimization

In [None]:
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="heavy",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
score, results, all_scores = evaluate(
    optimized_program,
    devset=testset,
    display_table=False,
)