# DSPy Tutorials

From dspy.ai - these tutorials demonstrate how to build DSPy programs to perform various tasks.

In [1]:
import dspy
# lm = dspy.LM('xai/grok-3-mini')
lm = dspy.LM('ollama_chat/devstral', api_base='http://localhost:11434', api_key='')
dspy.configure(lm=lm)

In [16]:
# Example 1: Simple Question Answering
math = dspy.ChainOfThought("question -> answer: float")
math(question="Two dice are tossed. What is the probability that the sum equals two?")


Prediction(
    reasoning='To find the probability that the sum of two dice equals two, we need to consider all possible outcomes when two six-sided dice are rolled. Each die has 6 faces, so there are a total of 6 * 6 = 36 possible outcomes.\n\nThe only way for the sum to be exactly two is if both dice show a value of one (i.e., 1 + 1). There is only one such outcome out of the 36 possible outcomes. Therefore, the probability is calculated as follows:\n\nProbability = Number of favorable outcomes / Total number of possible outcomes\n             = 1 / 36',
    answer=0.0278
)

In [17]:
# Example 2: RAG with Retrieval
def search_wikipedia(query: str) -> list[str]:
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
    return [x['text'] for x in results]

rag = dspy.ChainOfThought('context, question -> response')

question = "What's the name of the castle that David Gregory inherited?"
rag(context=search_wikipedia(question), question=question)

Prediction(
    reasoning='The context provides information about three different individuals named Gregory. The relevant section is [1], which discusses David Gregory, a Scottish physician and inventor. Within this section, it mentions that he inherited Kinnairdy Castle in 1664.',
    response='Kinnairdy Castle'
)

In [18]:
# Example 3: Classification
from typing import Literal

class Classify(dspy.Signature):
    """Classify sentiment of a given sentence."""

    sentence: str = dspy.InputField()
    sentiment: Literal['positive', 'negative', 'neutral'] = dspy.OutputField()
    confidence: float = dspy.OutputField()

classify = dspy.Predict(Classify)
classify(sentence="This book was super fun to read, though not the last chapter.")

Prediction(
    sentiment='positive',
    confidence=0.95
)

In [19]:
# Example 4: Information Extraction
class ExtractInfo(dspy.Signature):
    """Extract structured information from text."""

    text: str = dspy.InputField()
    title: str = dspy.OutputField()
    headings: list[str] = dspy.OutputField()
    entities: list[dict[str, str]] = dspy.OutputField(desc="a list of entities and their metadata")

module = dspy.Predict(ExtractInfo)

text = "Apple Inc. announced its latest iPhone 14 today." \
    "The CEO, Tim Cook, highlighted its new features in a press release."
response = module(text=text)

print(response.title)
print(response.headings)
print(response.entities)

Apple Inc. Announces Latest iPhone 14
['Latest iPhone 14', 'CEO Tim Cook']
[{'name': 'Apple Inc.', 'type': 'Organization'}, {'name': 'iPhone 14', 'type': 'Product'}, {'name': 'Tim Cook', 'type': 'Person'}]


In [20]:
# Example 5: Agents
def evaluate_math(expression: str):
    return dspy.PythonInterpreter({}).execute(expression)

def search_wikipedia(query: str):
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
    return [x['text'] for x in results]

react = dspy.ReAct("question -> answer: float", tools=[evaluate_math, search_wikipedia])

pred = react(question="What is 9362158 divided by the year of birth of David Gregory of Kinnairdy castle?")
print(pred.answer)

5762.4


In [21]:
# Example 6: Multi-stage pipeline
class Outline(dspy.Signature):
    """Outline a thorough overview of a topic."""

    topic: str = dspy.InputField()
    title: str = dspy.OutputField()
    sections: list[str] = dspy.OutputField()
    section_subheadings: dict[str, list[str]] = dspy.OutputField(desc="mapping from section headings to subheadings")

class DraftSection(dspy.Signature):
    """Draft a top-level section of an article."""

    topic: str = dspy.InputField()
    section_heading: str = dspy.InputField()
    section_subheadings: list[str] = dspy.InputField()
    content: str = dspy.OutputField(desc="markdown-formatted section")

class DraftArticle(dspy.Module):
    def __init__(self):
        self.build_outline = dspy.ChainOfThought(Outline)
        self.draft_section = dspy.ChainOfThought(DraftSection)

    def forward(self, topic):
        outline = self.build_outline(topic=topic)
        sections = []
        for heading, subheadings in outline.section_subheadings.items():
            section, subheadings = f"## {heading}", [f"### {subheading}" for subheading in subheadings]
            section = self.draft_section(topic=outline.title, section_heading=section, section_subheadings=subheadings)
            sections.append(section.content)
        return dspy.Prediction(title=outline.title, sections=sections)

draft_article = DraftArticle()
article = draft_article(topic="World Cup 2002")

In [22]:
from IPython.display import display, Markdown
for s in article.sections: display(Markdown(s))

## Introduction

### Background
The FIFA World Cup 2002 was an international football tournament held from May 31 to June 30, 2002. Co-hosted by South Korea and Japan, it marked the first time that the event was staged in Asia and also the first time two countries jointly hosted the tournament. The competition featured 32 national teams, with France entering as the defending champions.

### Significance
The World Cup 2002 is remembered for several reasons. It was a landmark event in Asian football history, showcasing the continent's growing influence in the sport. The co-hosting arrangement set a precedent for future tournaments and demonstrated the potential of collaborative efforts in organizing major sporting events. Additionally, the tournament produced memorable matches and performances that continue to be celebrated by fans worldwide.

## Host Countries

### Selection Process
The decision to co-host the 19th FIFA World Cup between South Korea and Japan was a groundbreaking move. The joint bid was chosen over Morocco in May 1996, marking the first time that two countries would host the tournament together. This selection process highlighted FIFA's desire to expand the global reach of the World Cup.

### Preparations
The preparations for the World Cup 2002 were extensive and involved significant infrastructure development. Both South Korea and Japan invested heavily in improving their stadiums, transportation networks, and accommodation facilities. The co-hosting nations worked closely with FIFA to ensure that all logistical aspects were meticulously planned and executed.

### Stadiums
The tournament featured a total of 16 stadiums across the two host countries. Each venue was carefully selected to provide optimal conditions for both players and spectators. Notable stadiums included the Seoul World Cup Stadium in South Korea, which hosted the opening match, and the International Stadium Yokohama in Japan, where the final took place.

## Qualification Process

### Regional Qualifiers
The qualification process for the World Cup 2002 was a lengthy and competitive journey that spanned over two years. A total of 198 teams from FIFA's six confederations participated in regional qualifiers to secure one of the 32 spots available for the tournament.

- **UEFA (Europe)**: 51 teams competed, with 13 qualifying for the World Cup.
- **CAF (Africa)**: 51 teams competed, with 5 qualifying for the World Cup.
- **CONMEBOL (South America)**: All 10 teams qualified automatically or through a playoff.
- **CONCACAF (North and Central America and Caribbean)**: 35 teams competed, with 3.5 spots available (4 teams in total).
- **AFC (Asia)**: 46 teams competed, with 4.5 spots available (5 teams in total).
- **OFC (Oceania)**: 10 teams competed, with the winner advancing to an inter-confederation playoff.

### Notable Absentees
Several notable teams failed to qualify for the World Cup 2002, including:

- France: The defending champions were eliminated in the playoffs by Denmark.
- Portugal: Despite having a strong squad, they finished third in their qualifying group behind Ukraine and Poland.
- Scotland: They narrowly missed out on qualification after losing a playoff to Norway.

## Group Stage

### Format
The World Cup 2002 was structured with a group stage followed by knockout rounds. The 32 teams were divided into eight groups of four. Each team played against every other team in their group once, earning three points for a win, one point for a draw, and no points for a loss. The top two teams from each group advanced to the Round of 16.

### Key Matches
Several matches stood out during the group stage:
- **South Korea vs. Poland**: This match was crucial as it determined which team would advance alongside the United States.
- **Brazil vs. Turkey**: Brazil's dominant performance showcased their strength, while Turkey's resilience set the stage for future upsets.
- **Germany vs. Saudi Arabia**: Germany's convincing win highlighted their early dominance in the tournament.

### Surprises
The group stage was not without its surprises:
- **France's Early Exit**: The defending champions failed to advance beyond the group stage, a shocking outcome given their previous success.
- **Senegal's Performance**: As one of the African teams, Senegal's strong showing against France and eventual qualification for the knockout stages was a significant surprise.

## Knockout Phase

### Round of 16
The Round of 16 marked the beginning of the knockout stage, where the top teams from each group faced off in single-elimination matches. Notable games included Germany's dramatic win over Paraguay and South Korea's upset victory over Italy. These matches set the stage for an exciting journey towards the final.

### Quarterfinals
In the Quarterfinals, the competition intensified with thrilling encounters. Brazil secured their spot in the Semifinals with a convincing win over England, while Turkey stunned the world with a penalty shootout victory against Japan. The other two matches saw South Korea continue their Cinderella run by defeating Spain, and Germany advancing after a hard-fought battle against the United States.

### Semifinals
The Semifinals featured some of the most memorable moments of the tournament. Brazil faced Turkey in a match that ended in a 1-0 victory for the Brazilians, securing their place in the final. Meanwhile, South Korea's remarkable journey came to an end as they lost to Germany in a closely contested match.

## Final Match

### Build-up
The road to the 2002 World Cup final was filled with unexpected twists and turns. The tournament saw traditional powerhouses like Argentina and France eliminated in the group stages, making way for underdogs and surprise contenders. Brazil, led by the formidable Ronaldo, had a dominant run through the knockout stages, defeating teams like Belgium, England, and Turkey en route to the final. On the other side of the bracket, Germany, despite not being at their best, managed to secure victories against Paraguay, the United States, and South Korea to reach the final.

### Match Details
The final match between Brazil and Germany took place on June 30, 2002, at the International Stadium Yokohama in Japan. The atmosphere was electric, with fans from both nations filling the stands. The match started with both teams playing cautiously, but it was Brazil who struck first. Ronaldo scored the opening goal in the 67th minute, sending the Brazilian fans into raptures. Just three minutes later, Ronaldo added another goal, effectively sealing the victory for Brazil. Germany managed to pull one back through a penalty kick by Klose in the 90th minute, but it was too little, too late.

### Aftermath
Brazil's 2-0 victory over Germany made them the first team to win four World Cup titles. The triumph was particularly sweet for Ronaldo, who had famously suffered an epileptic seizure before the 1998 final and had been written off by many. His performance in the 2002 final not only silenced his critics but also cemented his legacy as one of the greatest footballers of all time. The 2002 World Cup final will be remembered for its dramatic build-up, thrilling match details, and the historic aftermath that followed.

## Key Players and Moments

### Golden Boot Winner
The Golden Boot award is given to the player who scores the most goals in a World Cup tournament. In 2002, this prestigious award was won by Ronaldo of Brazil. The legendary striker scored eight goals throughout the tournament, including two in the final against Germany. His performance was instrumental in Brazil's victory and solidified his status as one of the greatest footballers of all time.

### Golden Ball Winner
The Golden Ball is awarded to the best player of the World Cup. In 2002, this honor went to Oliver Kahn of Germany. Despite his team finishing as runners-up, Kahn's exceptional performances in goal were crucial for Germany's run to the final. His saves and leadership on the field made him a standout player of the tournament.

### Memorable Goals
The World Cup 2002 was filled with memorable goals that will be remembered by football fans for years to come. One such goal was scored by Michael Ballack in Germany's quarter-final match against the United States. His powerful strike from outside the box secured a 1-0 victory for Germany and showcased his incredible skill.

Another unforgettable moment came in the final when Ronaldo scored two goals to lead Brazil to victory over Germany. His first goal was a brilliant individual effort, while his second was a clinical finish that sealed the win for the Brazilian team.

## Legacy and Impact

### Technical Innovations
The World Cup 2002 introduced several technical innovations that have since become standard in international football tournaments. One of the most notable advancements was the use of advanced goal-line technology, which helped referees make more accurate decisions during matches. Additionally, the tournament saw the implementation of high-definition broadcasting, enhancing the viewing experience for fans worldwide.

### Cultural Influence
The World Cup 2002 had a profound cultural impact on both participating nations and global audiences. The event fostered a sense of unity and pride among the host countries, South Korea and Japan, as they successfully co-hosted the tournament. This collaboration set a precedent for future multi-nation hosting arrangements. Moreover, the tournament showcased diverse cultures through music, dance, and culinary experiences, enriching the global football community.

### Future Tournaments
The success of the World Cup 2002 influenced the planning and execution of subsequent tournaments. The lessons learned from this event, particularly in terms of organization, technology, and cultural integration, have been applied to improve future World Cups. For instance, the use of advanced technologies and the emphasis on cultural exchange have become integral parts of modern football tournaments.

# Getting Started III: Optimizing the LM prompts or weights in DSPy programs

In [23]:
# Optimizing prompts for a ReAct agent
import dspy
from dspy.datasets import HotPotQA

# dspy.configure(lm=dspy.LM('openai/gpt-4o-mini'))

def search_wikipedia(query: str) -> list[str]:
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=3)
    return [x['text'] for x in results]

trainset = [x.with_inputs('question') for x in HotPotQA(train_seed=2024, train_size=500).train]
react = dspy.ReAct("question -> answer", tools=[search_wikipedia])

tp = dspy.MIPROv2(metric=dspy.evaluate.answer_exact_match, auto="light", num_threads=24)
optimized_react = tp.compile(react, trainset=trainset)

README.md:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

hotpot_qa.py:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

2025/05/25 18:31:35 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 20
minibatch: True
num_fewshot_candidates: 6
num_instruct_candidates: 3
valset size: 100



[93m[1mProjected Language Model (LM) Calls[0m

Based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Prompt Generation: [94m[1m10[0m[93m data summarizer calls + [94m[1m3[0m[93m * [94m[1m2[0m[93m lm calls in program + ([94m[1m3[0m[93m) lm calls in program-aware proposer = [94m[1m19[0m[93m prompt model calls[0m
[93m- Program Evaluation: [94m[1m35[0m[93m examples in minibatch * [94m[1m20[0m[93m batches + [94m[1m100[0m[93m examples in val set * [94m[1m5[0m[93m full evals = [94m[1m1200[0m[93m LM Program calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token)
            + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model

2025/05/25 18:31:55 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/05/25 18:31:55 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/05/25 18:31:55 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=6 sets of demonstrations...



No input received within 20 seconds. Proceeding with execution...
Bootstrapping set 1/6
Bootstrapping set 2/6
Bootstrapping set 3/6


 11%|█         | 11/100 [08:06<1:05:33, 44.20s/it]


Bootstrapped 4 full traces after 11 examples for up to 1 rounds, amounting to 11 attempts.
Bootstrapping set 4/6


  7%|▋         | 7/100 [04:36<1:01:16, 39.53s/it]


Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 5/6


  5%|▌         | 5/100 [02:47<52:53, 33.40s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 6/6


  1%|          | 1/100 [00:12<20:20, 12.33s/it]
2025/05/25 18:47:37 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/05/25 18:47:37 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/05/25 18:52:48 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=3 instructions...

2025/05/25 19:07:32 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/05/25 19:07:32 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, produce the fields `answer`.

You are an Agent. In each episode, you will be given the fields `question` as input. And you can see your past trajectory so far.
Your goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.

To do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.
After each tool call, you receive a resulting observation, which gets appended to your trajectory.

When writing next_thought, you may reason about the current situation and plan for future steps.
When selecting the next_tool_name and its next_tool_args, the tool must be one of:

(1) search_wikipedia. It ta

Average Metric: 38.00 / 100 (38.0%): : 102it [1:01:59, 36.46s/it]                      

2025/05/25 20:09:31 INFO dspy.evaluate.evaluate: Average Metric: 38 / 100 (38.0%)
2025/05/25 20:09:31 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 38.0

2025/05/25 20:09:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 25 - Minibatch ==



Average Metric: 13.00 / 35 (37.1%): 100%|██████████| 35/35 [21:59<00:00, 37.69s/it]

2025/05/25 20:31:30 INFO dspy.evaluate.evaluate: Average Metric: 13 / 35 (37.1%)
2025/05/25 20:31:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 37.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/05/25 20:31:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14]
2025/05/25 20:31:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0]
2025/05/25 20:31:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 38.0


2025/05/25 20:31:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 25 - Minibatch ==



Average Metric: 16.00 / 35 (45.7%): : 37it [30:51, 50.03s/it]                       

2025/05/25 21:02:21 INFO dspy.evaluate.evaluate: Average Metric: 16 / 35 (45.7%)
2025/05/25 21:02:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/05/25 21:02:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71]
2025/05/25 21:02:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0]
2025/05/25 21:02:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 38.0


2025/05/25 21:02:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 25 - Minibatch ==



Average Metric: 14.00 / 35 (40.0%): : 37it [25:46, 41.79s/it]                       

2025/05/25 21:28:07 INFO dspy.evaluate.evaluate: Average Metric: 14 / 35 (40.0%)
2025/05/25 21:28:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 40.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 0'].
2025/05/25 21:28:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0]
2025/05/25 21:28:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0]
2025/05/25 21:28:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 38.0


2025/05/25 21:28:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 25 - Minibatch ==



Average Metric: 19.00 / 35 (54.3%): 100%|██████████| 35/35 [27:48<00:00, 47.68s/it]

2025/05/25 21:55:56 INFO dspy.evaluate.evaluate: Average Metric: 19 / 35 (54.3%)
2025/05/25 21:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 4'].
2025/05/25 21:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29]
2025/05/25 21:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0]
2025/05/25 21:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 38.0


2025/05/25 21:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 25 - Minibatch ==



Average Metric: 21.00 / 35 (60.0%): : 37it [27:42, 44.93s/it]                       

2025/05/25 22:23:38 INFO dspy.evaluate.evaluate: Average Metric: 21 / 35 (60.0%)
2025/05/25 22:23:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/05/25 22:23:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0]
2025/05/25 22:23:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0]
2025/05/25 22:23:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 38.0


2025/05/25 22:23:38 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 25 - Full Evaluation =====
2025/05/25 22:23:38 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 60.0) from minibatch trials...



Average Metric: 55.00 / 100 (55.0%): : 102it [44:50, 26.38s/it]                      

2025/05/25 23:08:29 INFO dspy.evaluate.evaluate: Average Metric: 55 / 100 (55.0%)
2025/05/25 23:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 55.0
2025/05/25 23:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0]
2025/05/25 23:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0
2025/05/25 23:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/05/25 23:08:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 25 - Minibatch ==



Average Metric: 12.00 / 35 (34.3%): : 37it [17:55, 29.07s/it]                      

2025/05/25 23:26:25 INFO dspy.evaluate.evaluate: Average Metric: 12 / 35 (34.3%)
2025/05/25 23:26:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 0'].
2025/05/25 23:26:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29]
2025/05/25 23:26:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0]
2025/05/25 23:26:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/25 23:26:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 25 - Minibatch ==



Average Metric: 15.00 / 35 (42.9%): : 37it [36:36, 59.38s/it]                       

2025/05/26 00:03:02 INFO dspy.evaluate.evaluate: Average Metric: 15 / 35 (42.9%)
2025/05/26 00:03:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 1'].
2025/05/26 00:03:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86]
2025/05/26 00:03:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0]
2025/05/26 00:03:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 00:03:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 25 - Minibatch ==



Average Metric: 12.00 / 35 (34.3%): 100%|██████████| 35/35 [17:56<00:00, 30.75s/it]

2025/05/26 00:20:58 INFO dspy.evaluate.evaluate: Average Metric: 12 / 35 (34.3%)
2025/05/26 00:20:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 0'].
2025/05/26 00:20:58 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29]
2025/05/26 00:20:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0]
2025/05/26 00:20:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 00:20:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 25 - Minibatch ==



Average Metric: 16.00 / 35 (45.7%): 100%|██████████| 35/35 [22:46<00:00, 39.05s/it] 

2025/05/26 00:43:45 INFO dspy.evaluate.evaluate: Average Metric: 16 / 35 (45.7%)
2025/05/26 00:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 45.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 3'].
2025/05/26 00:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71]
2025/05/26 00:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0]
2025/05/26 00:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 00:43:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 25 - Minibatch ==



Average Metric: 18.00 / 35 (51.4%): 100%|██████████| 35/35 [00:00<00:00, 3587.15it/s]

2025/05/26 00:43:46 INFO dspy.evaluate.evaluate: Average Metric: 18 / 35 (51.4%)
2025/05/26 00:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 51.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/05/26 00:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43]
2025/05/26 00:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0]
2025/05/26 00:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 00:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 25 - Full Evaluation =====
2025/05/26 00:43:46 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 54.29) from minibatch trials...



Average Metric: 54.00 / 100 (54.0%): 100%|██████████| 100/100 [12:59<00:00,  7.80s/it]

2025/05/26 00:56:46 INFO dspy.evaluate.evaluate: Average Metric: 54 / 100 (54.0%)
2025/05/26 00:56:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0]
2025/05/26 00:56:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0
2025/05/26 00:56:46 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/05/26 00:56:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 25 - Minibatch ==



Average Metric: 19.00 / 35 (54.3%): : 38it [45:35, 71.99s/it]                       

2025/05/26 01:42:21 INFO dspy.evaluate.evaluate: Average Metric: 19 / 35 (54.3%)
2025/05/26 01:42:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 2'].
2025/05/26 01:42:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29]
2025/05/26 01:42:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0]
2025/05/26 01:42:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 01:42:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 25 - Minibatch ==



Average Metric: 24.00 / 35 (68.6%): 100%|██████████| 35/35 [29:07<00:00, 49.92s/it] 

2025/05/26 02:11:29 INFO dspy.evaluate.evaluate: Average Metric: 24 / 35 (68.6%)
2025/05/26 02:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 5'].
2025/05/26 02:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57]
2025/05/26 02:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0]
2025/05/26 02:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 02:11:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 25 - Minibatch ==



Average Metric: 18.00 / 35 (51.4%): : 36it [21:11, 35.32s/it]                      

2025/05/26 02:32:40 INFO dspy.evaluate.evaluate: Average Metric: 18 / 35 (51.4%)
2025/05/26 02:32:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 51.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 5'].
2025/05/26 02:32:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43]
2025/05/26 02:32:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0]
2025/05/26 02:32:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 02:32:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 25 - Minibatch ==



Average Metric: 15.00 / 35 (42.9%): : 37it [18:58, 30.78s/it]                      

2025/05/26 02:51:39 INFO dspy.evaluate.evaluate: Average Metric: 15 / 35 (42.9%)
2025/05/26 02:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 5'].
2025/05/26 02:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86]
2025/05/26 02:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0]
2025/05/26 02:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 02:51:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 25 - Minibatch ==



Average Metric: 21.00 / 35 (60.0%): : 38it [24:15, 38.30s/it]                       

2025/05/26 03:15:54 INFO dspy.evaluate.evaluate: Average Metric: 21 / 35 (60.0%)
2025/05/26 03:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 4'].
2025/05/26 03:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86, 60.0]
2025/05/26 03:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0]
2025/05/26 03:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 03:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 25 - Full Evaluation =====
2025/05/26 03:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.57) from minibatch trials...



Average Metric: 53.00 / 100 (53.0%): : 103it [35:36, 20.75s/it]                      

2025/05/26 03:51:31 INFO dspy.evaluate.evaluate: Average Metric: 53 / 100 (53.0%)
2025/05/26 03:51:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0]
2025/05/26 03:51:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0
2025/05/26 03:51:31 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/05/26 03:51:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 25 - Minibatch ==



Average Metric: 19.00 / 35 (54.3%): 100%|██████████| 35/35 [00:00<00:00, 4063.12it/s]

2025/05/26 03:51:32 INFO dspy.evaluate.evaluate: Average Metric: 19 / 35 (54.3%)
2025/05/26 03:51:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 5'].
2025/05/26 03:51:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86, 60.0, 54.29]
2025/05/26 03:51:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0]
2025/05/26 03:51:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 03:51:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 25 - Minibatch ==



Average Metric: 21.00 / 35 (60.0%): : 37it [37:32, 60.88s/it]                      

2025/05/26 04:29:04 INFO dspy.evaluate.evaluate: Average Metric: 21 / 35 (60.0%)
2025/05/26 04:29:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 2'].
2025/05/26 04:29:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86, 60.0, 54.29, 60.0]
2025/05/26 04:29:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0]
2025/05/26 04:29:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 04:29:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 25 - Minibatch ==



Average Metric: 18.00 / 35 (51.4%): 100%|██████████| 35/35 [06:25<00:00, 11.02s/it]

2025/05/26 04:35:30 INFO dspy.evaluate.evaluate: Average Metric: 18 / 35 (51.4%)
2025/05/26 04:35:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 51.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 4'].
2025/05/26 04:35:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86, 60.0, 54.29, 60.0, 51.43]
2025/05/26 04:35:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0]
2025/05/26 04:35:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 04:35:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 23 / 25 - Minibatch ==



Average Metric: 12.00 / 35 (34.3%): : 37it [24:47, 40.21s/it]                       

2025/05/26 05:00:18 INFO dspy.evaluate.evaluate: Average Metric: 12 / 35 (34.3%)
2025/05/26 05:00:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 34.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 1'].
2025/05/26 05:00:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86, 60.0, 54.29, 60.0, 51.43, 34.29]
2025/05/26 05:00:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0]
2025/05/26 05:00:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 05:00:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 24 / 25 - Minibatch ==



Average Metric: 20.00 / 35 (57.1%): : 37it [26:46, 43.42s/it]                       

2025/05/26 05:27:05 INFO dspy.evaluate.evaluate: Average Metric: 20 / 35 (57.1%)
2025/05/26 05:27:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 5'].
2025/05/26 05:27:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [37.14, 45.71, 40.0, 54.29, 60.0, 34.29, 42.86, 34.29, 45.71, 51.43, 54.29, 68.57, 51.43, 42.86, 60.0, 54.29, 60.0, 51.43, 34.29, 57.14]
2025/05/26 05:27:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0]
2025/05/26 05:27:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0


2025/05/26 05:27:05 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 25 - Full Evaluation =====
2025/05/26 05:27:05 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 60.0) from minibatch trials...



Average Metric: 53.00 / 100 (53.0%): : 102it [1:00:52, 35.81s/it]                      

2025/05/26 06:27:57 INFO dspy.evaluate.evaluate: Average Metric: 53 / 100 (53.0%)
2025/05/26 06:27:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [38.0, 55.0, 54.0, 53.0, 53.0]
2025/05/26 06:27:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.0
2025/05/26 06:27:57 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/05/26 06:27:57 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 55.0!





In [24]:
optimized_react

react = Predict(StringSignature(question, trajectory -> next_thought, next_tool_name, next_tool_args
    instructions='{\'question\': "Given the fields `question`, produce the fields `answer`.\\n\\nYou are an Agent. In each episode, you will be given the fields `question` as input. And you can see your past trajectory so far.\\nYour goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.\\n\\nTo do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.\\nAfter each tool call, you receive a resulting observation, which gets appended to your trajectory.\\n\\nWhen writing next_thought, you may reason about the current situation and plan for future steps.\\nWhen selecting the next_tool_name and its next_tool_args, the tool must be one of:\\n(1) search_wikipedia. It takes arguments {\'query\': {\'type\': \'string\'}} in JSON format.\\n(2) finish, whose description is <de