# DSPy Optimizer Comparison: Bootstrap FewShot vs MIPRO v2

## Overview

This notebook compares two DSPy optimizers for symptom diagnosis classification:

1. **Bootstrap FewShot**: Generates few-shot examples through bootstrapping
2. **MIPRO v2**: Optimizes the prompt instructions themselves, here with the program-, data-, tip-, and few-shot-aware instruction proposers all enabled

## Experiment Configuration

- **Project**: 1-dspy
- **Experiment**: `initial-pipeline` (full MLflow experiment name: `/symptom-diagnosis-explorer/1-dspy/initial-pipeline`)
- **LLM Model**: ollama/qwen3:0.6b
- **Dataset**: 50 train examples, 20 validation examples; the better model is then evaluated on the first 10 test examples
- **MIPRO Mode**: light (fast iteration)

## MLflow Tracking

All experiments are automatically logged to MLflow with:
- Metrics (train/validation accuracy)
- Parameters (optimizer config, model config)
- Artifacts (prompt details, prediction samples, disagreements)
- Model registration in MLflow registry

In [None]:
"""Import required modules for DSPy tuning experiments."""
# pandas is used later to tabulate the two optimizers' results side by side.
import pandas as pd
# TuneRequest carries the settings for one optimizer run; TuneCommand executes it.
from symptom_diagnosis_explorer.commands.classify.tune import (
    TuneCommand,
    TuneRequest,
)
# OptimizerType selects which optimizer a TuneRequest asks for.
from symptom_diagnosis_explorer.models.model_development import OptimizerType

# Reached only when every import above resolved.
print("Imports successful")

In [None]:
"""Common configuration for both optimizer experiments."""

# --- MLflow experiment identity ---------------------------------------------
PROJECT = "1-dspy"
EXPERIMENT_NAME = "initial-pipeline"
# Resolves to "/symptom-diagnosis-explorer/1-dspy/initial-pipeline".
FULL_EXPERIMENT_NAME = f"/symptom-diagnosis-explorer/{PROJECT}/{EXPERIMENT_NAME}"

# --- MLflow tracking server -------------------------------------------------
MLFLOW_TRACKING_URI = "http://localhost:5001"

# --- Language model and registry names (one registry entry per optimizer) ---
LM_MODEL = "ollama/qwen3:0.6b"
MODEL_NAME_BOOTSTRAP = "symptom-classifier-bootstrap"
MODEL_NAME_MIPRO = "symptom-classifier-mipro"

# --- Dataset sizes passed to both runs --------------------------------------
TRAIN_SIZE = 50
VAL_SIZE = 20

# --- Settings shared by both optimizers -------------------------------------
NUM_THREADS = 4

# --- Bootstrap FewShot only --------------------------------------------------
BOOTSTRAP_MAX_BOOTSTRAPPED_DEMOS = 3
BOOTSTRAP_MAX_LABELED_DEMOS = 4

# --- MIPRO v2 only -----------------------------------------------------------
MIPRO_AUTO = "light"  # Fast iteration mode
# NOTE(review): the minibatch size (35) is larger than VAL_SIZE (20) -- confirm
# the tuner tolerates a minibatch bigger than the validation set.
MIPRO_MINIBATCH_SIZE = 35
MIPRO_MINIBATCH_FULL_EVAL_STEPS = 5

# Echo the key settings so the notebook output records what was run.
print(
    "Configuration set",
    f"Experiment: {FULL_EXPERIMENT_NAME}",
    f"MLflow URI: {MLFLOW_TRACKING_URI}",
    f"LLM: {LM_MODEL}",
    f"Dataset: {TRAIN_SIZE} train, {VAL_SIZE} validation",
    sep="\n",
)

In [None]:
"""Run Bootstrap FewShot optimizer experiment."""

# Section banner.
print("=" * 80, "EXPERIMENT 1: Bootstrap FewShot Optimizer", "=" * 80, sep="\n")

# Describe the run: optimizer choice, model, data sizes, MLflow destination,
# and finally the Bootstrap-specific demo limits.
bootstrap_request = TuneRequest(
    optimizer=OptimizerType.BOOTSTRAP_FEW_SHOT,
    lm_model=LM_MODEL,
    num_threads=NUM_THREADS,
    train_size=TRAIN_SIZE,
    val_size=VAL_SIZE,
    model_name=MODEL_NAME_BOOTSTRAP,
    experiment_project=PROJECT,
    experiment_name=FULL_EXPERIMENT_NAME,
    mlflow_tracking_uri=MLFLOW_TRACKING_URI,
    bootstrap_max_bootstrapped_demos=BOOTSTRAP_MAX_BOOTSTRAPPED_DEMOS,
    bootstrap_max_labeled_demos=BOOTSTRAP_MAX_LABELED_DEMOS,
)

# Run the optimizer; the response is read by the results cell that follows.
print(
    "\nStarting Bootstrap FewShot optimization...",
    "This may take a few minutes...",
    sep="\n",
)
bootstrap_command = TuneCommand(bootstrap_request)
bootstrap_response = bootstrap_command.execute()

print("\nBootstrap FewShot optimization complete!")

In [None]:
"""Display Bootstrap FewShot results."""

# Headline metrics reported by the Bootstrap FewShot run.
print(
    "\nBOOTSTRAP FEWSHOT RESULTS",
    "-" * 80,
    "\nMetrics:",
    f"  Train Accuracy:      {bootstrap_response.metrics.train_accuracy:.4f}",
    f"  Validation Accuracy: {bootstrap_response.metrics.validation_accuracy:.4f}",
    f"  Train Examples:      {bootstrap_response.metrics.num_train_examples}",
    f"  Validation Examples: {bootstrap_response.metrics.num_val_examples}",
    sep="\n",
)

# Where the tuned model was registered and which run produced it.
print(
    "\nModel Registry:",
    f"  Name:    {bootstrap_response.model_info.name}",
    f"  Version: {bootstrap_response.model_info.version}",
    f"  Run ID:  {bootstrap_response.run_id}",
    sep="\n",
)

# Compact record consumed by the comparison and test-evaluation cells.
bootstrap_results = dict(
    optimizer="Bootstrap FewShot",
    train_accuracy=bootstrap_response.metrics.train_accuracy,
    validation_accuracy=bootstrap_response.metrics.validation_accuracy,
    run_id=bootstrap_response.run_id,
)

In [None]:
"""Run MIPRO v2 optimizer experiment."""

# Section banner (leading blank line separates it from the previous output).
print("\n" + "=" * 80, "EXPERIMENT 2: MIPRO v2 Optimizer", "=" * 80, sep="\n")

# Describe the run: optimizer choice, model, data sizes, MLflow destination,
# then the MIPRO-specific search settings with every proposer option enabled.
mipro_request = TuneRequest(
    optimizer=OptimizerType.MIPRO_V2,
    lm_model=LM_MODEL,
    num_threads=NUM_THREADS,
    train_size=TRAIN_SIZE,
    val_size=VAL_SIZE,
    model_name=MODEL_NAME_MIPRO,
    experiment_project=PROJECT,
    experiment_name=FULL_EXPERIMENT_NAME,
    mlflow_tracking_uri=MLFLOW_TRACKING_URI,
    mipro_auto=MIPRO_AUTO,
    mipro_minibatch_size=MIPRO_MINIBATCH_SIZE,
    mipro_minibatch_full_eval_steps=MIPRO_MINIBATCH_FULL_EVAL_STEPS,
    mipro_program_aware_proposer=True,
    mipro_data_aware_proposer=True,
    mipro_tip_aware_proposer=True,
    mipro_fewshot_aware_proposer=True,
)

# Run the optimizer; the response is read by the results cell that follows.
print(
    "\nStarting MIPRO v2 optimization...",
    "This may take several minutes...",
    sep="\n",
)
mipro_command = TuneCommand(mipro_request)
mipro_response = mipro_command.execute()

print("\nMIPRO v2 optimization complete!")

In [None]:
"""Display MIPRO v2 results."""

# Headline metrics reported by the MIPRO v2 run.
print(
    "\nMIPRO V2 RESULTS",
    "-" * 80,
    "\nMetrics:",
    f"  Train Accuracy:      {mipro_response.metrics.train_accuracy:.4f}",
    f"  Validation Accuracy: {mipro_response.metrics.validation_accuracy:.4f}",
    f"  Train Examples:      {mipro_response.metrics.num_train_examples}",
    f"  Validation Examples: {mipro_response.metrics.num_val_examples}",
    sep="\n",
)

# Where the tuned model was registered and which run produced it.
print(
    "\nModel Registry:",
    f"  Name:    {mipro_response.model_info.name}",
    f"  Version: {mipro_response.model_info.version}",
    f"  Run ID:  {mipro_response.run_id}",
    sep="\n",
)

# Compact record consumed by the comparison and test-evaluation cells.
mipro_results = dict(
    optimizer="MIPRO v2",
    train_accuracy=mipro_response.metrics.train_accuracy,
    validation_accuracy=mipro_response.metrics.validation_accuracy,
    run_id=mipro_response.run_id,
)

In [None]:
"""Compare results from both optimizers."""

# Section banner.
print("\n" + "=" * 80)
print("OPTIMIZER COMPARISON")
print("=" * 80)

# One row per optimizer, built from the summary dicts stored by the two
# results cells (optimizer, train_accuracy, validation_accuracy, run_id).
comparison_df = pd.DataFrame([bootstrap_results, mipro_results])

print("\n")
print(comparison_df.to_string(index=False))

# Signed differences: a positive value means MIPRO v2 scored higher than
# Bootstrap FewShot.
train_acc_diff = mipro_results["train_accuracy"] - bootstrap_results["train_accuracy"]
val_acc_diff = (
    mipro_results["validation_accuracy"] - bootstrap_results["validation_accuracy"]
)

print("\n" + "-" * 80)
print("ANALYSIS")
print("-" * 80)
print("\nAccuracy Differences (MIPRO - Bootstrap):")
print(f"  Train Accuracy:      {train_acc_diff:+.4f}")
print(f"  Validation Accuracy: {val_acc_diff:+.4f}")

# A validation gap must exceed 0.01 (one percentage point) in either
# direction to count as a win; anything smaller is reported as a tie.
if val_acc_diff > 0.01:
    winner = "MIPRO v2"
elif val_acc_diff < -0.01:
    winner = "Bootstrap FewShot"
else:
    winner = "TIE (within 1%)"

print(f"\nBest Validation Performance: {winner}")

print("\n" + "=" * 80)
print("MLflow Tracking:")
# Point readers at the tracking URI that was passed to both tuning requests,
# instead of a hard-coded ".mlflow" path that does not match the configuration.
print(f"  View detailed results in MLflow at: {MLFLOW_TRACKING_URI}")
print(f"  Bootstrap Run ID: {bootstrap_results['run_id']}")
print(f"  MIPRO Run ID:     {mipro_results['run_id']}")
print("=" * 80)

In [None]:
"""Evaluate the best model on the test set."""

from symptom_diagnosis_explorer.commands.classify.evaluate import (
    EvaluateCommand,
    EvaluateRequest,
)

# Section banner.
print("\n" + "=" * 80, "TEST SET EVALUATION", "=" * 80, sep="\n")

# Pick the registry name of whichever optimizer scored higher on validation.
# The comparison is ">=", so an exact tie selects Bootstrap FewShot.
best_model_name, best_optimizer, best_val_acc = (
    (
        MODEL_NAME_BOOTSTRAP,
        "Bootstrap FewShot",
        bootstrap_results["validation_accuracy"],
    )
    if bootstrap_results["validation_accuracy"] >= mipro_results["validation_accuracy"]
    else (
        MODEL_NAME_MIPRO,
        "MIPRO v2",
        mipro_results["validation_accuracy"],
    )
)

print(
    f"\nEvaluating best model: {best_optimizer}",
    f"  Model Name: {best_model_name}",
    f"  Validation Accuracy: {best_val_acc:.4f}",
    sep="\n",
)

# Only a small slice of the test split is scored.
EVAL_SIZE = 10
print(f"\nRunning evaluation on first {EVAL_SIZE} test examples...")

# Log the evaluation under the same MLflow experiment as the tuning runs.
eval_request = EvaluateRequest(
    model_name=best_model_name,
    model_version=None,  # Use latest version
    split="test",
    eval_size=EVAL_SIZE,
    experiment_project=PROJECT,
    experiment_name=FULL_EXPERIMENT_NAME,
    mlflow_tracking_uri=MLFLOW_TRACKING_URI,
)

# NOTE(review): unlike TuneCommand, the request is handed to execute() rather
# than to the constructor -- confirm against EvaluateCommand's signature.
eval_command = EvaluateCommand()
eval_response = eval_command.execute(eval_request)

# Summarize the test-set outcome.
print(
    "\n" + "-" * 80,
    "TEST SET RESULTS",
    "-" * 80,
    "\nMetrics:",
    f"  Test Accuracy:  {eval_response.accuracy:.4f}",
    f"  Test Examples:  {eval_response.num_examples}",
    f"  Split:          {eval_response.split}",
    f"  Run ID:         {eval_response.run_id}",
    sep="\n",
)

print(
    "\n" + "=" * 80,
    "Evaluation complete! Check MLflow for detailed prediction artifacts.",
    "=" * 80,
    sep="\n",
)