### Imports & Env Setup

In [None]:
%reload_ext autoreload
%autoreload 2
# Notebook bootstrap: load environment variables, extend the import path,
# import the project modules and start Weave tracing.
import sys
import os
from dotenv import load_dotenv
# Called before the remaining imports run.
# NOTE(review): presumably so that values from a local .env file are already
# in the environment when `config` / `benchmarks` are imported — confirm.
load_dotenv()
from datasets import load_dataset

import dspy
# Put the parent directory on sys.path; the project imports below come after
# this line. NOTE(review): presumably `benchmarks` and `config` live there.
sys.path.append(os.path.abspath('../'))

from benchmarks import llama_mmlu_pro
from benchmarks.store_llama_mmlu_pro import store_results, store_optimization_results, store_evaluation_results
from benchmarks.statistical_eval import StatisticalEvaluate

from config import MODEL_CONFIGS


import weave
weave.init(project_name="mmlu-pro-optimization")  # You can change the project name as needed

### Configure Constants

In [None]:
# Benchmark module used by every later cell (signature, metric, datasets).
benchmark = llama_mmlu_pro

# Thread count passed as `num_threads` to the evaluators and optimizers.
NUM_THREADS = 36

# Feature switches read by later cells:
#   FEW_SHOT_BASELINE - selects the few-shot instruction template for the baseline.
#   STATISTICAL_EVAL  - selects StatisticalEvaluate instead of a single evaluate() call.
FEW_SHOT_BASELINE = False
STATISTICAL_EVAL = False

### Configure Models

In [None]:
from functools import partial

# Each model is identified by its key in MODEL_CONFIGS. The settings are read
# with plain indexing so that a missing or misspelled key fails immediately
# with a KeyError naming that key.
TASK_MODEL_NAME = "vllm_llama_8b"
TASK_MODEL_SETTINGS = MODEL_CONFIGS[TASK_MODEL_NAME]

JUDGE_MODEL_NAME = "openrouter_gpt4o"
JUDGE_MODEL_SETTINGS = MODEL_CONFIGS[JUDGE_MODEL_NAME]

# Task model: set as the default LM via dspy.configure() below.
TASK_MODEL = dspy.LM(
    TASK_MODEL_SETTINGS["model"],
    api_base=TASK_MODEL_SETTINGS["api_base"],
    api_key=TASK_MODEL_SETTINGS["api_key"],
    cache=False,
)

# Judge model: handed to the benchmark metric through `judge_lm`.
JUDGE_MODEL = dspy.LM(
    JUDGE_MODEL_SETTINGS["model"],
    api_base=JUDGE_MODEL_SETTINGS["api_base"],
    api_key=JUDGE_MODEL_SETTINGS["api_key"],
    cache=False,
)

dspy.configure(lm=TASK_MODEL)

# Create a partial metric function that includes the judge_lm, so callers can
# invoke it without supplying the judge themselves.
metric_with_judge = partial(benchmark.metric, judge_lm=JUDGE_MODEL)


### Configure Prompts and Evals

In [1]:
# Choose the instruction text for the baseline program. The few-shot variant
# only contains numbered placeholders; the default is a one-line instruction.
if FEW_SHOT_BASELINE:
    _instructions = """You are a helpful assistant. Few shots examples:
        1. 
        2.
        3.
        4.
        5.
        """
else:
    _instructions = """You are a helpful assistant."""

# Baseline (unoptimized) program: chain-of-thought over the benchmark signature.
program = dspy.ChainOfThought(benchmark.signature(_instructions))

# Evaluator shared by the non-statistical branches of later cells. It is built
# with an empty devset; the dataset is supplied at call time.
evaluate = dspy.Evaluate(
    devset=[],
    metric=metric_with_judge,  # partial of benchmark.metric bound to JUDGE_MODEL
    num_threads=NUM_THREADS,
    max_errors=500,
    display_progress=True,
    display_table=True,
    return_all_scores=True,
    return_outputs=True,
    provide_traceback=True,
)

NameError: name 'FEW_SHOT_BASELINE' is not defined

### Load entire dataset

In [None]:
# Ask the benchmark for its three splits; train and validation are each
# requested with size 0.25.
trainset, valset, testset = benchmark.datasets(train_size=0.25, validation_size=0.25)

# Cell output: the size of each split, in (train, val, test) order.
tuple(len(split) for split in (trainset, valset, testset))

In [None]:
# Evaluation set used by every benchmark cell: all three splits concatenated.
# NOTE(review): this includes trainset and valset, which are also passed to
# optimizer.compile() in the optimization cells — confirm that evaluating on
# data seen during optimization is intended.
TESTSET = trainset + valset + testset
combined_dataset = TESTSET  # kept as an alias of the same combined dataset

# Last expression of the cell, so the notebook actually displays the size.
len(TESTSET)

### Baseline Benchmark

In [None]:
%%time
print("Starting execution...")

# Score the baseline `program` on TESTSET and persist the outcome.
# store_results() returns the run number, which is kept in `run_number`.
if not STATISTICAL_EVAL:
    # Single pass with the evaluator configured earlier in the notebook.
    score, results, all_scores = evaluate(program, devset=TESTSET)

    run_number = store_results(
        program=program,
        task_model=TASK_MODEL.model,
        judge_model=JUDGE_MODEL.model,
        results=results,
        score=score,
    )
else:
    # Replace the shared evaluator with the statistical variant.
    evaluate = StatisticalEvaluate(
        n_runs=5,  # Number of evaluation runs for statistical significance
        confidence_level=0.95,  # 95% confidence interval
        devset=[],
        metric=metric_with_judge,  # partial of benchmark.metric bound to JUDGE_MODEL
        num_threads=NUM_THREADS,
        max_errors=500,
        display_progress=True,
        display_table=True,
        return_all_scores=True,
        return_outputs=True,
    )

    stats_results = evaluate(program, devset=TESTSET, statistical=True)

    print("\nStatistical Results:")
    print(f"Mean Score: {stats_results.mean_score:.2f}")
    print(f"Standard Deviation: {stats_results.std_dev:.2f}")
    print(f"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})")
    print(f"Number of Runs: {stats_results.n_runs}")
    print(f"Sample Size: {stats_results.sample_size}")

    run_number = store_results(
        program=program,
        task_model=TASK_MODEL.model,
        judge_model=JUDGE_MODEL.model,
        stats_results=stats_results,
    )

## Prompt Optimization + Evaluation

### Configure Lite Optimization

In [None]:
# Passed to MIPROv2 as `max_labeled_demos` in the optimization cells.
FEW_SHOTS = 5

# Model handed to MIPROv2 as `prompt_model`. The settings are read with plain
# indexing so that a missing or misspelled key fails immediately with a
# KeyError naming that key.
PROMPT_MODEL_NAME = "vllm_llama_70b"
PROMPT_MODEL_SETTINGS = MODEL_CONFIGS[PROMPT_MODEL_NAME]

PROMPT_MODEL = dspy.LM(
    PROMPT_MODEL_SETTINGS["model"],
    api_base=PROMPT_MODEL_SETTINGS["api_base"],
    api_key=PROMPT_MODEL_SETTINGS["api_key"],
    cache=False,
)

## Run Lite Optimization

In [None]:
%%time
# MIPROv2 `auto` preset for this run; also recorded later when the optimization
# results are stored.
OPTIMIZER = "light"

# NOTE(review): the optimizer is given `benchmark.metric` directly, while the
# evaluation cells use `metric_with_judge` (the same metric with
# judge_lm=JUDGE_MODEL bound). Confirm that optimizing without the judge model
# is intended.
optimizer = dspy.MIPROv2(
    auto=OPTIMIZER,
    metric=benchmark.metric,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
    num_threads=NUM_THREADS,
    verbose=True,
)

# Optimize the baseline program on the full train/validation splits without
# asking for interactive confirmation.
optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
    requires_permission_to_run=False,
)

In [None]:
# Persist the optimized program together with the models and optimizer preset
# that produced it. The returned run number is reused by the evaluation cell.
run_number = store_optimization_results(
    program=optimized_program,
    optimization=OPTIMIZER,
    task_model=TASK_MODEL.model,
    judge_model=JUDGE_MODEL.model,
)

In [None]:
%%time
# Other cells in this notebook read the optimized instructions from
# `optimized_program.predict`, this one read them from the program itself.
# Use the inner predictor when the program exposes one, otherwise the program.
_predictor = getattr(optimized_program, "predict", optimized_program)
print("BEST PROMPT:\n", _predictor.signature.instructions)

In [None]:
# Other cells in this notebook read the demos from `optimized_program.predict`,
# this one read them from the program itself. Use the inner predictor when the
# program exposes one, otherwise the program.
_predictor = getattr(optimized_program, "predict", optimized_program)
print("BEST EXAMPLES:\n", _predictor.demos)

## Run Evaluation

In [None]:
%%time
print("Starting execution...")

# The flag is forced on here, so the statistical branch below always runs in
# this cell; later cells that read STATISTICAL_EVAL see True as well.
STATISTICAL_EVAL = True

# Score `optimized_program` on TESTSET and attach the outcome to the run
# created when the optimization results were stored (`run_number`).
if STATISTICAL_EVAL:
    evaluate = StatisticalEvaluate(
        n_runs=5,  # Number of evaluation runs for statistical significance
        confidence_level=0.95,  # 95% confidence interval
        devset=[],
        metric=metric_with_judge,  # partial of benchmark.metric bound to JUDGE_MODEL
        num_threads=NUM_THREADS,
        max_errors=500,
        display_progress=True,
        display_table=True,
        return_all_scores=True,
        return_outputs=True,
    )

    stats_results = evaluate(optimized_program, devset=TESTSET, statistical=True)

    # Summary of the repeated runs.
    print("\nStatistical Results:")
    print(f"Mean Score: {stats_results.mean_score:.2f}")
    print(f"Standard Deviation: {stats_results.std_dev:.2f}")
    print(f"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})")
    print(f"Number of Runs: {stats_results.n_runs}")
    print(f"Sample Size: {stats_results.sample_size}")

    store_evaluation_results(
        run_number=run_number,
        task_model=TASK_MODEL.model,
        stats_results=stats_results,
    )
else:
    # Single evaluation pass with whatever `evaluate` is currently bound to.
    score, results, all_scores = evaluate(optimized_program, devset=TESTSET)

    store_evaluation_results(
        run_number=run_number,
        task_model=TASK_MODEL.model,
        results=results,
        score=score,
    )

print(f"Completed run_{run_number}")

## Medium Optimization

In [None]:
%%time
# Upper bound on how many train / validation examples are handed to the
# optimizer in this run.
subset_size = 500

# NOTE(review): like the other optimization cells this passes
# `benchmark.metric`, not `metric_with_judge` — confirm this is intended.
optimizer = dspy.MIPROv2(
    auto="medium",
    metric=benchmark.metric,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
    num_threads=NUM_THREADS,
)

# Optimize the baseline program on the leading `subset_size` examples of each
# split, without asking for interactive confirmation.
optimized_program = optimizer.compile(
    program,
    trainset=trainset[:subset_size],
    valset=valset[:subset_size],
    requires_permission_to_run=False,
)

In [None]:
# Other cells in this notebook read the optimized instructions directly from
# `optimized_program`, this one read them from `optimized_program.predict`.
# Use the inner predictor when the program exposes one, otherwise the program.
_predictor = getattr(optimized_program, "predict", optimized_program)
print("BEST PROMPT:\n", _predictor.signature.instructions)

In [None]:
# Other cells in this notebook read the demos directly from
# `optimized_program`, this one read them from `optimized_program.predict`.
# Use the inner predictor when the program exposes one, otherwise the program.
_predictor = getattr(optimized_program, "predict", optimized_program)
print("BEST EXAMPLES:\n", _predictor.demos)

## Running Benchmark

In [None]:
%%time
print("Starting execution...")

# Score the medium-optimized program on TESTSET and persist the outcome.
# The program that is evaluated must be the same one that is stored
# (`optimized_program`); evaluating the baseline `program` here would record
# baseline scores under the optimized program.
if STATISTICAL_EVAL:
    evaluate = StatisticalEvaluate(
        n_runs=5,  # Number of evaluation runs for statistical significance
        confidence_level=0.95,  # 95% confidence interval
        devset=[],
        max_errors=500,
        metric=metric_with_judge,  # Use the partial function that includes JUDGE_MODEL
        num_threads=NUM_THREADS,
        display_progress=True,
        display_table=True,
        return_all_scores=True,
        return_outputs=True,
    )

    stats_results = evaluate(
        optimized_program,
        devset=TESTSET,
        statistical=True,  # Enable statistical evaluation
    )

    # Print statistical results
    print("\nStatistical Results:")
    print(f"Mean Score: {stats_results.mean_score:.2f}")
    print(f"Standard Deviation: {stats_results.std_dev:.2f}")
    print(f"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})")
    print(f"Number of Runs: {stats_results.n_runs}")
    print(f"Sample Size: {stats_results.sample_size}")

    # If you want to compare two programs:
    # stats_results_a, stats_results_b, p_value = evaluate.compare_programs(
    #     program_a=program,
    #     program_b=optimized_program,
    #     devset=testset
    # )

    # print("\nProgram Comparison:")
    # print(f"Program A Mean Score: {stats_results_a.mean_score:.2f}")
    # print(f"Program B Mean Score: {stats_results_b.mean_score:.2f}")
    # print(f"P-value: {p_value:.4f}")
    # print(f"Statistically Significant: {p_value < 0.05}")

    run_number = store_results(
        task_model=TASK_MODEL.model,
        judge_model=JUDGE_MODEL.model,
        program=optimized_program,
        stats_results=stats_results,
    )
else:
    # Single evaluation pass with whatever `evaluate` is currently bound to.
    score, results, all_scores = evaluate(
        optimized_program,
        devset=TESTSET,
    )

    run_number = store_results(
        task_model=TASK_MODEL.model,
        judge_model=JUDGE_MODEL.model,
        program=optimized_program,
        results=results,
        score=score,
    )

## Heavy Optimization

In [None]:
%%time
# MIPROv2 `auto` preset for this run; also recorded when the results are stored.
OPTIMIZER = 'heavy'

# Rebinds the notebook-wide thread count, so cells executed after this one
# also see 48.
NUM_THREADS = 48

# NOTE(review): like the other optimization cells this passes
# `benchmark.metric`, not `metric_with_judge` — confirm this is intended.
optimizer = dspy.MIPROv2(
    auto=OPTIMIZER,
    metric=benchmark.metric,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
    num_threads=NUM_THREADS,
)

# Optimize the baseline program on the full train/validation splits without
# asking for interactive confirmation.
optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
    requires_permission_to_run=False,
)

In [None]:
# Persist the optimized program together with the models and optimizer preset
# that produced it. The returned run number is reused by the evaluation cell.
run_number = store_optimization_results(
    program=optimized_program,
    optimization=OPTIMIZER,
    task_model=TASK_MODEL.model,
    judge_model=JUDGE_MODEL.model,
)

In [None]:
# Other cells in this notebook read the optimized instructions from
# `optimized_program.predict`, this one read them from the program itself.
# Use the inner predictor when the program exposes one, otherwise the program.
_predictor = getattr(optimized_program, "predict", optimized_program)
print("BEST PROMPT:\n", _predictor.signature.instructions)

In [None]:
# Other cells in this notebook read the demos from `optimized_program.predict`,
# this one read them from the program itself. Use the inner predictor when the
# program exposes one, otherwise the program.
_predictor = getattr(optimized_program, "predict", optimized_program)
print("BEST EXAMPLES:\n", _predictor.demos)

In [None]:
%%time
print("Starting execution...")

# Score `optimized_program` on TESTSET and attach the outcome to the run
# created when the optimization results were stored (`run_number`).
if not STATISTICAL_EVAL:
    # Single evaluation pass with whatever `evaluate` is currently bound to.
    score, results, all_scores = evaluate(optimized_program, devset=TESTSET)

    store_evaluation_results(
        run_number=run_number,
        task_model=TASK_MODEL.model,
        results=results,
        score=score,
    )
else:
    # Replace the shared evaluator with the statistical variant.
    evaluate = StatisticalEvaluate(
        n_runs=5,  # Number of evaluation runs for statistical significance
        confidence_level=0.95,  # 95% confidence interval
        devset=[],
        metric=metric_with_judge,  # partial of benchmark.metric bound to JUDGE_MODEL
        num_threads=NUM_THREADS,
        max_errors=500,
        display_progress=True,
        display_table=True,
        return_all_scores=True,
        return_outputs=True,
    )

    stats_results = evaluate(optimized_program, devset=TESTSET, statistical=True)

    # Summary of the repeated runs.
    print("\nStatistical Results:")
    print(f"Mean Score: {stats_results.mean_score:.2f}")
    print(f"Standard Deviation: {stats_results.std_dev:.2f}")
    print(f"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})")
    print(f"Number of Runs: {stats_results.n_runs}")
    print(f"Sample Size: {stats_results.sample_size}")

    store_evaluation_results(
        run_number=run_number,
        task_model=TASK_MODEL.model,
        stats_results=stats_results,
    )

print(f"Completed run_{run_number}")