In [1]:
from pydantic import BaseModel
import random
import string

In [2]:
# Silence UserWarning messages whose originating module name starts with
# "pydantic", so they do not clutter the notebook output.
import warnings

warnings.filterwarnings(action='ignore', category=UserWarning, module='pydantic')

In [3]:
import os
from pathlib import Path

from dotenv import load_dotenv

# Load variables from the .env file that sits one directory above this notebook.
dotenv_path = Path("../.env")
load_dotenv(dotenv_path=dotenv_path)

# Get API key from environment variables
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY_PERSONAL')

# Nothing below passes OPENAI_API_KEY to dspy.LM explicitly, so the value read
# above would otherwise go unused. Expose it under the standard variable name,
# but only when that name is not already set, so an existing key is never replaced.
if OPENAI_API_KEY:
    os.environ.setdefault("OPENAI_API_KEY", OPENAI_API_KEY)

In [4]:

class Date(BaseModel):
    # Somehow LLM is bad at specifying `datetime.datetime`, so
    # we define a custom class to represent the date.
    # All fields are plain integers; no range constraints are declared here,
    # so out-of-range values (e.g. month=13) are not rejected by this model.
    year: int
    month: int
    day: int
    hour: int  # fetch_flight_info compares year/month/day only and ignores this field

class UserProfile(BaseModel):
    # A customer record; instances are stored in `user_database`, keyed by `name`.
    user_id: str
    name: str  # the key get_user_info looks up
    email: str

class Flight(BaseModel):
    # One flight offering; instances are stored in `flight_database`, keyed by `flight_id`.
    flight_id: str
    date_time: Date  # presumably the departure date/hour — the code does not say; confirm
    origin: str  # e.g. "SFO" in the sample data
    destination: str
    duration: float  # primary sort key in pick_flight; unit not stated (presumably hours)
    price: float  # tie-breaker in pick_flight; currency not stated

class Itinerary(BaseModel):
    # A booking created by book_flight and stored in `itinery_database`.
    confirmation_number: str  # also the key under which the itinerary is stored
    user_profile: UserProfile
    flight: Flight

class Ticket(BaseModel):
    # A customer-support ticket created by file_ticket and stored in `ticket_database`.
    user_request: str  # the request text handed to file_ticket
    user_profile: UserProfile

In [5]:
# In-memory sample users, keyed by first name (the key get_user_info looks up).
user_database = {
    "Adam": UserProfile(user_id="1", name="Adam", email="adam@gmail.com"),
    "Bob": UserProfile(user_id="2", name="Bob", email="bob@gmail.com"),
    "Chelsie": UserProfile(user_id="3", name="Chelsie", email="chelsie@gmail.com"),
    "David": UserProfile(user_id="4", name="David", email="david@gmail.com"),
}

# In-memory sample flights, keyed by flight_id: two SFO -> JFK flights on
# 2025-09-01 and two SFO -> SNA flights on 2025-10-01. Within each pair the
# flights differ in hour and price (and, for the JFK pair, in duration).
flight_database = {
    "DA123": Flight(
        flight_id="DA123",  # DSPy Airline 123
        origin="SFO",
        destination="JFK",
        date_time=Date(year=2025, month=9, day=1, hour=1),
        duration=3,
        price=200,
    ),
    "DA125": Flight(
        flight_id="DA125",
        origin="SFO",
        destination="JFK",
        date_time=Date(year=2025, month=9, day=1, hour=7),
        duration=9,
        price=500,
    ),
    "DA456": Flight(
        flight_id="DA456",
        origin="SFO",
        destination="SNA",
        date_time=Date(year=2025, month=10, day=1, hour=1),
        duration=2,
        price=100,
    ),
    "DA460": Flight(
        flight_id="DA460",
        origin="SFO",
        destination="SNA",
        date_time=Date(year=2025, month=10, day=1, hour=9),
        duration=2,
        price=120,
    ),
}

In [6]:
# Mutable in-memory stores filled at runtime by book_flight and file_ticket.
# Both start empty, so re-running this cell discards every booking and ticket.
# NOTE: "itinery" is a misspelling of "itinerary"; the name is kept because the
# tool functions reference it.
itinery_database = {}  # confirmation_number -> Itinerary
ticket_database = {}  # ticket_id -> Ticket

In [7]:
def fetch_flight_info(date: Date, origin: str, destination: str):
    """Fetch flight information from origin to destination on the given date"""
    # Only the calendar day is compared; `date.hour` plays no part in the match.
    wanted_day = (date.year, date.month, date.day)
    matches = [
        candidate
        for candidate in flight_database.values()
        if (
            candidate.date_time.year,
            candidate.date_time.month,
            candidate.date_time.day,
        ) == wanted_day
        and candidate.origin == origin
        and candidate.destination == destination
    ]
    # An empty result is reported as an error rather than as an empty list.
    if not matches:
        raise ValueError("No matching flight found!")
    return matches


def fetch_itinerary(confirmation_number: str):
    """Fetch a booked itinerary information from database"""
    # dict.get yields None (it does not raise) for an unknown confirmation number.
    itinerary = itinery_database.get(confirmation_number)
    return itinerary


def pick_flight(flights: list[Flight]):
    """Pick up the best flight that matches users' request. we pick the shortest, and cheaper one on ties."""
    # Returns the entry with the smallest (duration, price) pair. When several
    # entries share that pair, the one appearing first in `flights` is returned.
    # Raises ValueError if `flights` is empty.

    def _field(flight, name):
        # Entries may be Flight objects or plain dicts carrying the same fields
        # (the agent's logged tool call passes a list of dicts).
        return flight.get(name) if isinstance(flight, dict) else getattr(flight, name)

    if not flights:
        # Fail with an explicit message instead of an IndexError from indexing
        # an empty list.
        raise ValueError("No flights to pick from!")
    return min(flights, key=lambda f: (_field(f, "duration"), _field(f, "price")))


def _generate_id(length=8):
    chars = string.ascii_lowercase + string.digits
    return "".join(random.choices(chars, k=length))


def book_flight(flight: Flight, user_profile: UserProfile):
    """Book a flight on behalf of the user."""
    # Draw ids until one is not already a key, so an existing booking is never
    # overwritten.
    while True:
        confirmation_number = _generate_id()
        if confirmation_number not in itinery_database:
            break
    itinerary = Itinerary(
        confirmation_number=confirmation_number,
        user_profile=user_profile,
        flight=flight,
    )
    itinery_database[confirmation_number] = itinerary
    # Returns the new key together with the stored Itinerary object.
    return confirmation_number, itinerary


def cancel_itinerary(confirmation_number: str, user_profile: UserProfile):
    """Cancel an itinerary on behalf of the user."""
    # NOTE(review): `user_profile` is accepted but never checked, so any caller
    # who knows a confirmation number can cancel that booking.
    if confirmation_number not in itinery_database:
        raise ValueError("Cannot find the itinerary, please check your confirmation number.")
    # Successful cancellation removes the entry and returns None.
    del itinery_database[confirmation_number]


def get_user_info(name: str):
    """Fetch the user profile from database with given name."""
    # Exact dictionary-key lookup; an unknown name yields None instead of raising.
    profile = user_database.get(name)
    return profile


def file_ticket(user_request: str, user_profile: UserProfile):
    """File a customer support ticket if this is something the agent cannot handle."""
    # Draw 6-character ids until one is unused, mirroring book_flight, so a
    # repeated id can never silently replace an earlier ticket.
    ticket_id = _generate_id(length=6)
    while ticket_id in ticket_database:
        ticket_id = _generate_id(length=6)
    ticket_database[ticket_id] = Ticket(
        user_request=user_request,
        user_profile=user_profile,
    )
    # Returns the id under which the ticket was stored.
    return ticket_id


In [8]:
import dspy

class DSPyAirlineCustomerSerice(dspy.Signature):
    """You are an airline customer service agent that helps user book and manage flights.

    You are given a list of tools to handle user request, and you should decide the right tool to use in order to
    fullfil users' request."""

    # The class name misspells "Service"; it is kept because the agent cell
    # refers to the class by this name.
    # NOTE(review): the docstring above presumably becomes the instructions
    # given to the LM, so rewording it (typos included) would change the
    # prompt — confirm before editing.
    user_request: str = dspy.InputField()
    process_result: str = dspy.OutputField(
        desc=(
                "Message that summarizes the process result, and the information users need, e.g., the "
                "confirmation_number if a new flight is booked."
            )
        )

In [9]:
# ReAct agent that serves the DSPyAirlineCustomerSerice signature by choosing
# among the tool functions defined above. The extra "finish" option seen in the
# logged prompt is not listed here, so dspy adds it on its own.
# NOTE(review): dspy presumably derives each tool's name and description from
# the function's name and docstring — confirm before rewording those docstrings.
agent = dspy.ReAct(
    DSPyAirlineCustomerSerice,
    tools = [
        fetch_flight_info,
        fetch_itinerary,
        pick_flight,
        book_flight,
        cancel_itinerary,
        get_user_info,
        file_ticket,
    ]
)

In [10]:
# Configure the LM before the first agent call. The same call is repeated in
# two later cells with the same model name.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

In [11]:
# Run the agent on a sample booking request. A successful booking adds an entry
# to `itinery_database` (via book_flight), so each run of this cell may add one.
result = agent(user_request="please help me book a flight from SFO to JFK on 09/01/2025, my name is Adam")


# Print the returned Prediction, including the tool-call trajectory, using its
# default text representation (no extra formatting is applied).
print(result)

Prediction(
    trajectory={'thought_0': 'I need to fetch flight information for Adam from SFO to JFK on 09/01/2025 to find available flights for booking.', 'tool_name_0': 'fetch_flight_info', 'tool_args_0': {'date': {'year': 2025, 'month': 9, 'day': 1, 'hour': 0}, 'origin': 'SFO', 'destination': 'JFK'}, 'observation_0': [Flight(flight_id='DA123', date_time=Date(year=2025, month=9, day=1, hour=1), origin='SFO', destination='JFK', duration=3.0, price=200.0), Flight(flight_id='DA125', date_time=Date(year=2025, month=9, day=1, hour=7), origin='SFO', destination='JFK', duration=9.0, price=500.0)], 'thought_1': 'I have retrieved the available flights from SFO to JFK on 09/01/2025. The options are flight DA123, which departs at 1 AM and costs $200, and flight DA125, which departs at 7 AM and costs $500. I will now pick the best flight based on the shortest duration and price.', 'tool_name_1': 'pick_flight', 'tool_args_1': {'flights': [{'flight_id': 'DA123', 'date_time': {'year': 2025, 'month

In [12]:
dspy.inspect_history(n=10)





[34m[2025-06-19T18:24:13.509383][0m

[31mSystem message:[0m

Your input fields are:
1. `user_request` (str): 
2. `trajectory` (str):
Your output fields are:
1. `next_thought` (str): 
2. `next_tool_name` (Literal['fetch_flight_info', 'fetch_itinerary', 'pick_flight', 'book_flight', 'cancel_itinerary', 'get_user_info', 'file_ticket', 'finish']): 
3. `next_tool_args` (dict[str, Any]):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## user_request ## ]]
{user_request}

[[ ## trajectory ## ]]
{trajectory}

[[ ## next_thought ## ]]
{next_thought}

[[ ## next_tool_name ## ]]
{next_tool_name}        # note: the value you produce must exactly match (no extra characters) one of: fetch_flight_info; fetch_itinerary; pick_flight; book_flight; cancel_itinerary; get_user_info; file_ticket; finish

[[ ## next_tool_args ## ]]
{next_tool_args}        # note: the value you produce must adhere to the JSON schema: {"type": "object", "additionalP

# RAG with DSPy

In [13]:
import dspy

class QueryGenerator(dspy.Signature):
    """Generate a query based on question to fetch relevant context"""
    # Signature wrapped by dspy.Predict in RAG.__init__: one text input, one text output.
    question: str = dspy.InputField()
    query: str = dspy.OutputField()  # RAG.forward passes this string to search_wikipedia

def search_wikipedia(
    query: str,
    k: int = 1,
    url: str = 'http://20.102.90.50:2017/wiki17_abstracts',
) -> list[str]:
    """Query ColBERT endpoint, which is a knowledge source based on wikipedia data

    Args:
        query: Search string sent to the retriever.
        k: Number of passages to request. Defaults to 1, the value that was
            previously hard-coded.
        url: ColBERTv2 endpoint to query. Defaults to the server that was
            previously hard-coded (plain HTTP to a fixed IP address).

    Returns:
        The ``"text"`` field of each retrieved result, in the order returned.
    """
    retriever = dspy.ColBERTv2(url=url)
    return [passage["text"] for passage in retriever(query, k=k)]

class RAG(dspy.Module):
    """Two-step retrieval-augmented QA: generate a search query, then answer.

    A later cell defines another class with this same name; once that cell has
    run, `RAG` refers to the later definition.
    """

    def __init__(self):
        # Initialise the dspy.Module base state before registering sub-modules.
        super().__init__()
        self.query_generator = dspy.Predict(QueryGenerator)
        self.answer_generator = dspy.ChainOfThought("question,context->answer")

    def forward(self, question, **kwargs):
        """Answer `question` using the first passage retrieved for it.

        Args:
            question: The question to answer.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The answer text (a plain string, not a Prediction object).
        """
        query = self.query_generator(question=question).query
        full_context = search_wikipedia(query)
        print(f"Full context is:\n {full_context}")
        # Retrieval may return nothing; fall back to an empty context instead of
        # raising IndexError on `full_context[0]`.
        context = full_context[0] if full_context else ""
        return self.answer_generator(question=question, context=context).answer

In [14]:
# Configures the same model as the earlier agent section; repeated so this
# section can be run without that one.
dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))
rag = RAG()
# forward() prints the retrieved context itself; this print shows the answer string.
print(rag(question="Is Lebron James the basketball GOAT?"))

Full context is:
 ["LeBron James | LeBron Raymone James ( ; born December 30, 1984) is an American professional basketball player for the Cleveland Cavaliers of the National Basketball Association (NBA). James has won three NBA championships, four NBA Most Valuable Player Awards, three NBA Finals MVP Awards, two Olympic gold medals, an NBA scoring title, and the NBA Rookie of the Year Award. He has also been selected to 13 NBA All-Star teams, 13 All-NBA teams, and six All-Defensive teams, is the Cavaliers' all-time leading scorer, and is the NBA career playoff scoring leader."]
LeBron James is often considered one of the greatest basketball players of all time, and many argue he is the GOAT, but this is subjective and varies by personal opinion.


#### Next tutorial: RAG

In [15]:
import dspy

# Same model as configured in the earlier sections; kept in a variable here so
# the LM object can be referred to by name.
lm = dspy.LM('openai/gpt-4o-mini')
dspy.configure(lm=lm)

In [16]:
# Minimal predictor built from an inline signature string.
# Note the output field is named `responses` (plural) here, whereas the later
# cells name theirs `response`.
qa = dspy.Predict('question: str -> responses: str')
response = qa(question="what are high memory and low memory on linux?")

print(response.responses)

In Linux, "high memory" and "low memory" refer to different regions of the system's memory address space, particularly in the context of how the kernel manages memory for processes.

1. **Low Memory**: This typically refers to the memory that is directly accessible by the kernel and can be used by processes without any special handling. In a 32-bit system, this is usually the first 896 MB of RAM (though it can vary based on the architecture and configuration). Low memory is used for kernel data structures and for user-space processes that require direct access to memory.

2. **High Memory**: This refers to memory that is above the low memory limit and is not directly accessible by the kernel in a 32-bit system. Processes can use this memory, but the kernel must use special mechanisms (like page tables) to access it. High memory is typically used in systems with large amounts of RAM, allowing more memory to be allocated to user-space processes while still keeping the kernel's memory usa

In [17]:
dspy.inspect_history(n=1)





[34m[2025-06-19T18:24:19.636686][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str):
Your output fields are:
1. `responses` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## responses ## ]]
{responses}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `responses`.


[31mUser message:[0m

[[ ## question ## ]]
what are high memory and low memory on linux?

Respond with the corresponding output fields, starting with the field `[[ ## responses ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## responses ## ]]
In Linux, "high memory" and "low memory" refer to different regions of the system's memory address space, particularly in the context of how the kernel manages memory for processes.

1. **Low Memory**: This typically refers to the mem

In [18]:
# Chain-of-thought predictor: as the output below shows, it returns a
# `reasoning` field in addition to `response`. `cot` is reused later as the
# no-retrieval baseline for the metric and evaluation cells.
cot = dspy.ChainOfThought('question -> response')
cot(question="should curly braces appear on their own line?")

Prediction(
    reasoning='The placement of curly braces on their own line is largely a matter of coding style and conventions. In many programming languages, such as Java, C#, and JavaScript, it is common to place opening curly braces on the same line as the statement that precedes them, while closing curly braces are often placed on their own line. This style is known as "K&R style." However, some coding standards, like those used in Python or certain configurations of Java, prefer placing both opening and closing braces on their own lines for better readability, especially in nested structures. Ultimately, the decision should align with the team\'s coding standards or the style guide being followed.',
    response="Curly braces can either appear on their own line or on the same line as the preceding statement, depending on the coding style you choose to follow. It's important to adhere to the conventions of the language and the team's coding standards for consistency and readability

#### Optimization

In [19]:
import ujson
from dspy.utils import download

# Download question--answer pairs from the RAG-QA Arena "Tech" dataset.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl")

# Read one JSON object per line. The encoding is given explicitly so the
# platform's default encoding is never used, and blank lines (e.g. a trailing
# newline) are skipped rather than handed to the JSON parser.
with open("ragqa_arena_tech_examples.jsonl", encoding="utf-8") as f:
    data = [ujson.loads(line) for line in f if line.strip()]

In [20]:
print(f"Length of data:{len(data)}")
data[0]

Length of data:2064


{'question': 'why igp is used in mpls?',
 'response': "An IGP exchanges routing prefixes between gateways/routers.  \nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.",
 'gold_doc_ids': [2822, 2823]}

In [21]:
# Wrap every raw dict in a dspy.Example. `with_inputs('question')` marks
# `question` as the input field; fields that are not listed (`response`,
# `gold_doc_ids`) are treated as labels or metadata.
# NOTE: this rebinds `data` from a list of dicts to a list of Examples, so
# re-running the cell would wrap Examples rather than the raw dicts.
data = [dspy.Example(**d).with_inputs('question') for d in data]

# Let's pick an `example` here from the data.
example = data[2]
example

Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})

In [22]:
import random

# Shuffle a copy instead of `data` itself. Shuffling in place made this cell
# non-idempotent: every re-run reshuffled the already-shuffled list and silently
# changed the train/dev/test split. With a copy and a fixed seed, the split is
# the same on every run (and identical to what the first run of the in-place
# version produced). `data` itself keeps its load order.
shuffled_data = list(data)
random.Random(0).shuffle(shuffled_data)
trainset, devset, testset = shuffled_data[:200], shuffled_data[200:500], shuffled_data[500:1000]

len(trainset), len(devset), len(testset)

(200, 300, 500)

In [23]:
from dspy.evaluate import SemanticF1

# Instantiate the metric.
# The metric is itself LM-backed: the prompt printed by the next cell shows it
# asking for `recall` and `precision` between the gold and predicted responses.
metric = SemanticF1(decompositional=True)

# Produce a prediction from our `cot` module, using the `example` above as input.
# cot is a module that uses chain of thought to generate a response
pred = cot(**example.inputs())

# Compute the metric score for the prediction.
score = metric(example, pred)

print(f"Question: \t {example.question}\n")
print(f"Gold Response: \t {example.response}\n")
print(f"Predicted Response: \t {pred.response}\n")
print(f"Semantic F1 Score: {score:.2f}")

Question: 	 why are my text messages coming up as maybe?

Gold Response: 	 This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". 

However, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.

Predicted Response: 	 Your text messages are showing up as "maybe" because the recipient's messaging app is likely unsure about the sender's identity. This can occur if you're not saved in their contacts or if the app is trying to filter messages based on previous interactions. You might want to check if the recipient has your number saved or if there are any settings in their messaging app that could be adjusted.

Semantic F1 Score: 0.00


In [24]:
dspy.inspect_history(n=1)





[34m[2025-06-19T18:24:20.102665][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `ground_truth` (str): 
3. `system_response` (str):
Your output fields are:
1. `reasoning` (str): 
2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth
3. `system_response_key_ideas` (str): enumeration of key ideas in the system response
4. `discussion` (str): discussion of the overlap between ground truth and system response
5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response
6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## ground_truth ## ]]
{ground_truth}

[[ ## system_response ## ]]
{system_response}

[[ ## reasoning ## ]]
{reasoning}

[[ ## ground_truth_key_ideas ## ]]
{ground_truth_key_ideas}

[[ ## system_response_key_ideas

In [25]:
# Define an evaluator that we can re-use.
# It scores a program on the 300-example devset with `metric`, using up to 24
# threads; display_table=2 limits the rendered results table to two rows.
evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=24,
                         display_progress=True, display_table=2)

# Evaluate the Chain-of-Thought program.
evaluate(cot)

Average Metric: 124.20 / 300 (41.4%): 100%|██████████| 300/300 [00:00<00:00, 374.56it/s]

2025/06/19 18:24:21 INFO dspy.evaluate.evaluate: Average Metric: 124.19804803863978 / 300 (41.4%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,"when to use c over c++, and c++ over c?","If you are equally familiar with both C++ and C, it's advisable to...",[733],"C and C++ are both powerful programming languages, but they serve ...","Use C when you need low-level system programming, performance, and...",✔️ [0.500]
1,should images be stored in a git repository?,"One viewpoint expresses that there is no significant downside, esp...","[6253, 6254, 6275, 6278, 8215]",Storing images in a Git repository can be problematic for several ...,It is generally not advisable to store images in a Git repository ...,✔️ [0.222]


41.4

In [26]:
max_characters = 6000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

# Download the document corpus of the RAG-QA Arena "Tech" dataset.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")


# Keep only each document's text, truncated to `max_characters`. The encoding is
# given explicitly so the platform's default encoding is never used.
with open("ragqa_arena_tech_corpus.jsonl", encoding="utf-8") as f:
    corpus = [ujson.loads(line)['text'][:max_characters] for line in f]
    print(f"Loaded {len(corpus)} documents. Will encode them below.")

# Embed the whole corpus and build the retriever used by the RAG module below.
embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)

Loaded 28436 documents. Will encode them below.
Training a 32-byte FAISS index with 337 partitions, based on 28436 x 512-dim embeddings


In [27]:
class RAG(dspy.Module):
    """Retrieve passages for a question, then answer with chain of thought.

    This definition replaces the earlier class of the same name in this notebook.
    """

    def __init__(self):
        # Single chain-of-thought step mapping (context, question) to a response.
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        # `search` is the module-level Embeddings retriever built in the
        # preceding cell; its `.passages` are passed on as the context.
        retrieved = search(question)
        return self.respond(context=retrieved.passages, question=question)

In [28]:
rag = RAG()
rag(question="what are high memory and low memory on linux?")

Prediction(
    reasoning="High memory and low memory in Linux refer to two distinct segments of the kernel's memory space. Low memory is the portion of memory that the kernel can access directly and is statically mapped at boot time, allowing for efficient access. High memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel must map it temporarily when it needs to access it. This distinction is crucial for managing memory in a 32-bit architecture, where the kernel needs to handle more memory than it can directly address. High memory is typically used for temporary data buffers, while low memory is used for kernel operations.",
    response="In Linux, high memory refers to the segment of memory that is not permanently mapped in the kernel's address space, requiring the kernel to temporarily map it when needed. Low memory, conversely, is the portion that the kernel can access directly and is statically mapped at boot time. This separat

In [29]:
evaluate(rag)

Average Metric: 165.30 / 300 (55.1%): 100%|██████████| 300/300 [02:46<00:00,  1.80it/s]

2025/06/19 19:39:07 INFO dspy.evaluate.evaluate: Average Metric: 165.29804665171264 / 300 (55.1%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,"when to use c over c++, and c++ over c?","If you are equally familiar with both C++ and C, it's advisable to...",[733],"C should be used over C++ primarily in scenarios where simplicity,...","Use C over C++ when working on embedded systems, requiring low-lev...",✔️ [0.500]
1,should images be stored in a git repository?,"One viewpoint expresses that there is no significant downside, esp...","[6253, 6254, 6275, 6278, 8215]",Storing images in a Git repository is generally not recommended du...,While it is technically possible to store images in a Git reposito...,✔️ [0.400]


55.1

In [30]:
# Optimize the RAG program's prompt with MIPROv2, scored by the same `metric`.
# auto="medium" selects the preset search budget reported at the top of the log below.
tp = dspy.MIPROv2(metric=metric, auto="medium", num_threads=24)  # use fewer threads if your rate limit is small

# Compile a fresh RAG() instance on `trainset`, allowing at most 2 bootstrapped
# and 2 labeled demos per prompt.
# NOTE(review): requires_permission_to_run=False presumably skips an interactive
# confirmation before the run starts — confirm.
optimized_rag = tp.compile(RAG(), trainset=trainset,
                           max_bootstrapped_demos=2, max_labeled_demos=2,
                           requires_permission_to_run=False)

2025/06/19 19:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 160

2025/06/19 19:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/06/19 19:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/06/19 19:39:53 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


  from .autonotebook import tqdm as notebook_tqdm
  5%|▌         | 2/40 [00:35<11:14, 17.74s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 4/12


 12%|█▎        | 5/40 [01:09<08:08, 13.95s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 5/12


  2%|▎         | 1/40 [00:14<09:32, 14.67s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/12


  8%|▊         | 3/40 [00:45<09:24, 15.26s/it]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 7/12


  5%|▌         | 2/40 [00:23<07:25, 11.72s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/12


 12%|█▎        | 5/40 [00:59<06:55, 11.86s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 9/12


  5%|▌         | 2/40 [00:24<07:53, 12.46s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 10/12


  5%|▌         | 2/40 [00:25<07:56, 12.55s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 11/12


 12%|█▎        | 5/40 [00:53<06:16, 10.76s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 12/12


  2%|▎         | 1/40 [00:10<07:02, 10.84s/it]
2025/06/19 19:45:56 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/06/19 19:45:56 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/06/19 19:46:09 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...

2025/06/19 19:46:54 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/06/19 19:46:54 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.

2025/06/19 19:46:54 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Imagine you're a technical support specialist assisting a user who is facing urgent issues with their macOS system. The user is desperate to resolve DNS resolution problems that are affecting their work, and they need clear, actionable steps to troubleshoot the issue as quickly as possible. Your task is to provide a structured response based on the context provided, addressing the user's question about "dns not resolving on mac os x." Utilize the context to generate a detailed reasoning process and a coherent response that guides the user through the troubleshooting steps effectively.

2025/06/19 19

Average Metric: 89.26 / 160 (55.8%): 100%|██████████| 160/160 [01:37<00:00,  1.65it/s]

2025/06/19 19:48:31 INFO dspy.evaluate.evaluate: Average Metric: 89.26388237281645 / 160 (55.8%)
2025/06/19 19:48:31 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 55.79

2025/06/19 19:48:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 23 - Minibatch ==



Average Metric: 21.30 / 35 (60.8%): 100%|██████████| 35/35 [00:28<00:00,  1.23it/s]

2025/06/19 19:49:00 INFO dspy.evaluate.evaluate: Average Metric: 21.29626294696506 / 35 (60.8%)
2025/06/19 19:49:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.85 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/06/19 19:49:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85]
2025/06/19 19:49:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79]
2025/06/19 19:49:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.79


2025/06/19 19:49:00 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 23 - Minibatch ==



Average Metric: 19.75 / 35 (56.4%): 100%|██████████| 35/35 [00:27<00:00,  1.27it/s]

2025/06/19 19:49:27 INFO dspy.evaluate.evaluate: Average Metric: 19.75134564639418 / 35 (56.4%)
2025/06/19 19:49:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/06/19 19:49:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43]
2025/06/19 19:49:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79]
2025/06/19 19:49:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.79


2025/06/19 19:49:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 23 - Minibatch ==



Average Metric: 21.20 / 35 (60.6%): 100%|██████████| 35/35 [00:27<00:00,  1.29it/s]

2025/06/19 19:49:54 INFO dspy.evaluate.evaluate: Average Metric: 21.198747831773698 / 35 (60.6%)
2025/06/19 19:49:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/06/19 19:49:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57]
2025/06/19 19:49:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79]
2025/06/19 19:49:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.79


2025/06/19 19:49:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 23 - Minibatch ==



Average Metric: 22.90 / 35 (65.4%): 100%|██████████| 35/35 [00:26<00:00,  1.31it/s]

2025/06/19 19:50:21 INFO dspy.evaluate.evaluate: Average Metric: 22.897540091352568 / 35 (65.4%)
2025/06/19 19:50:21 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.42 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/06/19 19:50:21 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42]
2025/06/19 19:50:21 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79]
2025/06/19 19:50:21 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.79


2025/06/19 19:50:21 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 23 - Minibatch ==



Average Metric: 20.17 / 35 (57.6%): 100%|██████████| 35/35 [00:35<00:00,  1.02s/it]

2025/06/19 19:50:57 INFO dspy.evaluate.evaluate: Average Metric: 20.17094777916823 / 35 (57.6%)
2025/06/19 19:50:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.63 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/06/19 19:50:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63]
2025/06/19 19:50:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79]
2025/06/19 19:50:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.79


2025/06/19 19:50:57 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 23 - Full Evaluation =====
2025/06/19 19:50:57 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 65.42) from minibatch trials...



Average Metric: 94.17 / 160 (58.9%): 100%|██████████| 160/160 [01:18<00:00,  2.04it/s]

2025/06/19 19:52:15 INFO dspy.evaluate.evaluate: Average Metric: 94.16883740608652 / 160 (58.9%)
2025/06/19 19:52:15 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 58.86
2025/06/19 19:52:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86]
2025/06/19 19:52:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 58.86
2025/06/19 19:52:15 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/19 19:52:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 23 - Minibatch ==



Average Metric: 22.12 / 35 (63.2%): 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

2025/06/19 19:52:48 INFO dspy.evaluate.evaluate: Average Metric: 22.124802089202777 / 35 (63.2%)
2025/06/19 19:52:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.21 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/06/19 19:52:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21]
2025/06/19 19:52:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86]
2025/06/19 19:52:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 58.86


2025/06/19 19:52:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 23 - Minibatch ==



Average Metric: 21.07 / 35 (60.2%): 100%|██████████| 35/35 [00:29<00:00,  1.18it/s]

2025/06/19 19:53:18 INFO dspy.evaluate.evaluate: Average Metric: 21.066716678789707 / 35 (60.2%)
2025/06/19 19:53:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.19 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1'].
2025/06/19 19:53:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19]
2025/06/19 19:53:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86]
2025/06/19 19:53:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 58.86


2025/06/19 19:53:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 23 - Minibatch ==



Average Metric: 20.53 / 35 (58.7%): 100%|██████████| 35/35 [00:31<00:00,  1.11it/s]

2025/06/19 19:53:49 INFO dspy.evaluate.evaluate: Average Metric: 20.528647404307613 / 35 (58.7%)
2025/06/19 19:53:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.65 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/06/19 19:53:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65]
2025/06/19 19:53:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86]
2025/06/19 19:53:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 58.86


2025/06/19 19:53:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 23 - Minibatch ==



Average Metric: 20.88 / 35 (59.7%): 100%|██████████| 35/35 [00:32<00:00,  1.07it/s]

2025/06/19 19:54:22 INFO dspy.evaluate.evaluate: Average Metric: 20.883743839493903 / 35 (59.7%)
2025/06/19 19:54:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.67 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 10'].
2025/06/19 19:54:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67]
2025/06/19 19:54:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86]
2025/06/19 19:54:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 58.86


2025/06/19 19:54:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 23 - Minibatch ==



Average Metric: 20.20 / 35 (57.7%): 100%|██████████| 35/35 [00:26<00:00,  1.32it/s]

2025/06/19 19:54:49 INFO dspy.evaluate.evaluate: Average Metric: 20.19744593108016 / 35 (57.7%)
2025/06/19 19:54:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/06/19 19:54:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71]
2025/06/19 19:54:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86]
2025/06/19 19:54:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 58.86


2025/06/19 19:54:49 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 23 - Full Evaluation =====
2025/06/19 19:54:49 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 60.85) from minibatch trials...



Average Metric: 95.64 / 160 (59.8%): 100%|██████████| 160/160 [01:19<00:00,  2.02it/s]

2025/06/19 19:56:08 INFO dspy.evaluate.evaluate: Average Metric: 95.63916235776854 / 160 (59.8%)
2025/06/19 19:56:08 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 59.77
2025/06/19 19:56:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77]
2025/06/19 19:56:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77
2025/06/19 19:56:08 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/19 19:56:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 23 - Minibatch ==



Average Metric: 21.10 / 35 (60.3%): 100%|██████████| 35/35 [00:02<00:00, 17.12it/s]

2025/06/19 19:56:10 INFO dspy.evaluate.evaluate: Average Metric: 21.095833521702296 / 35 (60.3%)





2025/06/19 19:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.27 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/06/19 19:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27]
2025/06/19 19:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77]
2025/06/19 19:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 19:56:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 23 - Minibatch ==


Average Metric: 22.03 / 35 (62.9%): 100%|██████████| 35/35 [00:28<00:00,  1.24it/s]

2025/06/19 19:56:39 INFO dspy.evaluate.evaluate: Average Metric: 22.025330771541885 / 35 (62.9%)
2025/06/19 19:56:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.93 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2025/06/19 19:56:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93]
2025/06/19 19:56:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77]
2025/06/19 19:56:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 19:56:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 23 - Minibatch ==



Average Metric: 19.65 / 35 (56.2%): 100%|██████████| 35/35 [00:26<00:00,  1.31it/s]

2025/06/19 19:57:06 INFO dspy.evaluate.evaluate: Average Metric: 19.654531384790417 / 35 (56.2%)
2025/06/19 19:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.16 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 9'].
2025/06/19 19:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93, 56.16]
2025/06/19 19:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77]
2025/06/19 19:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 19:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 23 - Minibatch ==



Average Metric: 20.09 / 35 (57.4%): 100%|██████████| 35/35 [00:28<00:00,  1.21it/s]

2025/06/19 19:57:35 INFO dspy.evaluate.evaluate: Average Metric: 20.090181301390707 / 35 (57.4%)
2025/06/19 19:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.4 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 11'].
2025/06/19 19:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93, 56.16, 57.4]
2025/06/19 19:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77]
2025/06/19 19:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 19:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 23 - Minibatch ==



Average Metric: 21.86 / 35 (62.5%): 100%|██████████| 35/35 [00:30<00:00,  1.16it/s]

2025/06/19 19:58:05 INFO dspy.evaluate.evaluate: Average Metric: 21.862996815866257 / 35 (62.5%)
2025/06/19 19:58:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.47 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 8'].
2025/06/19 19:58:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93, 56.16, 57.4, 62.47]
2025/06/19 19:58:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77]
2025/06/19 19:58:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 19:58:05 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 23 - Full Evaluation =====
2025/06/19 19:58:05 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 62.93) from minibatch trials...



Average Metric: 93.32 / 160 (58.3%): 100%|██████████| 160/160 [01:18<00:00,  2.04it/s]

2025/06/19 19:59:24 INFO dspy.evaluate.evaluate: Average Metric: 93.31674960840108 / 160 (58.3%)
2025/06/19 19:59:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77, 58.32]
2025/06/19 19:59:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77
2025/06/19 19:59:24 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/19 19:59:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 23 - Minibatch ==



Average Metric: 19.75 / 35 (56.4%): 100%|██████████| 35/35 [00:30<00:00,  1.13it/s]

2025/06/19 19:59:55 INFO dspy.evaluate.evaluate: Average Metric: 19.750599004909674 / 35 (56.4%)
2025/06/19 19:59:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 7'].
2025/06/19 19:59:55 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93, 56.16, 57.4, 62.47, 56.43]
2025/06/19 19:59:55 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77, 58.32]
2025/06/19 19:59:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 19:59:55 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 23 - Minibatch ==



Average Metric: 20.03 / 35 (57.2%): 100%|██████████| 35/35 [00:36<00:00,  1.04s/it]

2025/06/19 20:00:31 INFO dspy.evaluate.evaluate: Average Metric: 20.02740421111082 / 35 (57.2%)
2025/06/19 20:00:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.22 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 11'].
2025/06/19 20:00:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93, 56.16, 57.4, 62.47, 56.43, 57.22]
2025/06/19 20:00:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77, 58.32]
2025/06/19 20:00:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 20:00:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 23 - Minibatch ==



Average Metric: 21.41 / 35 (61.2%): 100%|██████████| 35/35 [00:29<00:00,  1.20it/s]

2025/06/19 20:01:00 INFO dspy.evaluate.evaluate: Average Metric: 21.410599149993924 / 35 (61.2%)
2025/06/19 20:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 61.17 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 4'].
2025/06/19 20:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.85, 56.43, 60.57, 65.42, 57.63, 63.21, 60.19, 58.65, 59.67, 57.71, 60.27, 62.93, 56.16, 57.4, 62.47, 56.43, 57.22, 61.17]
2025/06/19 20:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77, 58.32]
2025/06/19 20:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77


2025/06/19 20:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 23 - Full Evaluation =====
2025/06/19 20:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 62.47) from minibatch trials...



Average Metric: 93.21 / 160 (58.3%): 100%|██████████| 160/160 [01:14<00:00,  2.14it/s]

2025/06/19 20:02:15 INFO dspy.evaluate.evaluate: Average Metric: 93.20871221007275 / 160 (58.3%)
2025/06/19 20:02:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.79, 58.86, 59.77, 58.32, 58.26]
2025/06/19 20:02:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.77
2025/06/19 20:02:15 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/06/19 20:02:15 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 59.77!





In [31]:
# Ask the original `rag` program (defined earlier in the notebook) a sample
# question and keep the prediction as a baseline to compare against
# `optimized_rag`.
baseline = rag(question="cmd+tab does not work on hidden or minimized windows")
# Print only the generated answer text held in the prediction's `response` field.
print(baseline.response)

You are correct; cmd+tab does not work on hidden or minimized windows. To switch back to a minimized app, you need to first switch to another app and let it take focus. The Command + Tab shortcut is designed to cycle through currently active applications, and minimized windows do not count as active.


In [32]:
# Ask `optimized_rag` the same question that was used for the baseline so the
# two answers can be compared side by side.
# NOTE(review): `optimized_rag` is presumably the program returned by the
# MIPROv2 run logged above -- confirm against the compile cell.
pred = optimized_rag(question="cmd+tab does not work on hidden or minimized windows")
# Print only the generated answer text held in the prediction's `response` field.
print(pred.response)

The Command + Tab functionality on macOS does not allow you to switch directly to hidden or minimized windows. When you use Command + Tab, it cycles through your recently used applications, but minimized windows require a different approach. To switch back to a minimized app, you need to first switch to another application and let it take focus. If you want to manage minimized windows more effectively, consider using Command + Option + H to hide other applications or Command + Option + M to minimize the currently active window. This way, you can better control your workspace without relying solely on Command + Tab.


In [33]:
# Score the optimized program with the `evaluate` helper defined earlier in the
# notebook. The call is the cell's last expression, so its return value is
# displayed as the cell output alongside the logged average metric.
evaluate(optimized_rag)

Average Metric: 182.05 / 300 (60.7%): 100%|██████████| 300/300 [03:09<00:00,  1.59it/s]

2025/06/19 21:39:02 INFO dspy.evaluate.evaluate: Average Metric: 182.04766646955244 / 300 (60.7%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,"when to use c over c++, and c++ over c?","If you are equally familiar with both C++ and C, it's advisable to...",[733],The context provides insights into the strengths and weaknesses of...,Use C over C++ when working on embedded systems or projects where ...,✔️ [0.179]
1,should images be stored in a git repository?,"One viewpoint expresses that there is no significant downside, esp...","[6253, 6254, 6275, 6278, 8215]",Storing images in a Git repository can be problematic due to Git's...,Storing images in a Git repository is generally not recommended du...,✔️ [0.500]


60.68

In [35]:
# Total spend in USD over every call recorded in `lm.history`. The cost is
# calculated by LiteLLM for certain providers only, so entries whose cost is
# None are left out of the sum.
cost = sum(
    call['cost']
    for call in lm.history
    if call['cost'] is not None
)
cost

1.8376081499999999