In [1]:
!pip install dspy-ai ragas langchain langchain-openai chromadb datasets pandas seaborn matplotlib

Collecting dspy-ai
  Downloading dspy_ai-3.0.4-py3-none-any.whl.metadata (285 bytes)
Collecting ragas
  Downloading ragas-0.4.1-py3-none-any.whl.metadata (22 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.3-py3-none-any.whl.metadata (2.6 kB)
Collecting chromadb
  Downloading chromadb-1.3.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting dspy>=3.0.4 (from dspy-ai)
  Downloading dspy-3.0.4-py3-none-any.whl.metadata (8.4 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.13.0-py3-none-any.whl.metadata (11 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_network-0.33.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting langchain-community (from ragas)
  Downloading l

In [8]:
import os
import dspy
import chromadb
import ujson
import requests
from typing import List, Dict, Any
from dspy.teleprompt import BootstrapFewShot

# RAGAS Imports
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from datasets import Dataset

In [7]:
#from google.colab import userdata
#import os

# Retrieve the API key from Colab secrets
#OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Set it as an environment variable so the libraries can pick it up
#os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

#print("OPENAI_API_KEY loaded from Colab secrets and set as environment variable.")

In [9]:
class RAGSignature(dspy.Signature):
    """
    Answer the question based strictly on the provided technical documentation context.
    """
    context = dspy.InputField(desc="technical documentation facts and retrieved snippets")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="a detailed, accurate, and faithful answer")

In [10]:
class DSPyRAGModule(dspy.Module):
    """
    The RAG Module that connects retrieval with generation.
    """
    def __init__(self, retrieve_fn):
        super().__init__()
        self.retrieve_fn = retrieve_fn
        self.generate = dspy.ChainOfThought(RAGSignature)

    def forward(self, question):
        # 1. Retrieve
        contexts = self.retrieve_fn(question)

        # 2. Format Context
        # DSPy works best with string inputs for context
        context_str = "\n".join(contexts)

        # 3. Generate
        pred = self.generate(context=context_str, question=question)

        # 4. Attach contexts to prediction under a *new* attribute for RAGAS evaluation
        # This avoids conflict with the 'context' InputField during DSPy's internal Example construction.
        pred.retrieved_contexts = contexts
        return pred


In [11]:
# ==========================================
# 2. RAGAS Metric Adapter
# ==========================================

class RAGASMetricAdapter:
    """
    Adapts RAGAS metrics to be used as optimization signals within DSPy.
    """

    @staticmethod
    def answer_correctness_metric(gold, pred, trace=None):
        """
        Calculates 'answer_correctness' for a single DSPy prediction.
        """
        # Extract contexts (ensure they were attached in the module forward pass
        # under 'retrieved_contexts')
        contexts = getattr(pred, 'retrieved_contexts', [])

        # Create a mini-dataset for RAGAS
        data = {
            "question": [gold.question],
            "answer": [pred.answer],
            "contexts": [contexts],
            "ground_truth": [gold.gold_answer]
        }
        dataset = Dataset.from_dict(data)

        # Run RAGAS Evaluation
        evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
        evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

        results = evaluate(
            dataset=dataset,
            metrics=[answer_correctness],
            llm=evaluator_llm,
            embeddings=evaluator_embeddings
        )

        results = evaluate(
            dataset=dataset,
            metrics=[faithfulness, answer_correctness],
            llm=evaluator_llm,
            embeddings=evaluator_embeddings
        )

        # Extract scores (default to 0 if calculation fails)
        f_score = results.get('faithfulness', 0)
        ac_score = results.get('answer_correctness', 0)

        # Calculate Harmonic Mean
        # Formula: 2 * (a * b) / (a + b)
        if f_score + ac_score == 0:
            return 0.0

        harmonic_mean = 2 * (f_score * ac_score) / (f_score + ac_score)

        return harmonic_mean

        # Return the float score for DSPy to optimize against
        #return results['answer_correctness']

In [31]:
# ==========================================
# 3. DSPy Optimizer Class
# ==========================================

class DSPyPipelineOptimizer:
    """
    Handles the compilation and optimization of the DSPy RAG Module with configurable optimizers.
    """
    def __init__(self, optimizer_class=BootstrapFewShot, **optimizer_kwargs):
        """
        Initialize the optimizer wrapper.

        Args:
            optimizer_class: The DSPy teleprompter class to use (default: BootstrapFewShot).
            **optimizer_kwargs: Arguments specific to the optimizer (e.g., max_bootstrapped_demos, num_threads).
        """
        self.metric_fn = RAGASMetricAdapter.answer_correctness_metric
        self.optimizer_class = optimizer_class
        self.optimizer_kwargs = optimizer_kwargs

    def compile(self, module: dspy.Module, trainset: List[dspy.Example]):
        """
        Compiles the module using the configured strategy and RAGAS metrics.
        """
        optimizer_name = self.optimizer_class.__name__
        print(f"\n⚙️  Initializing DSPy Optimizer ({optimizer_name})...")
        print("   Optimization Metric: RAGAS Harmonic Mean")

        # Instantiate the passed optimizer class with the metric and provided kwargs
        teleprompter = self.optimizer_class(
            metric=self.metric_fn,
            **self.optimizer_kwargs
        )

        print(f"   Compiling module with {len(trainset)} training examples...")

        # Some optimizers (like MIPRO) might require additional args in compile,
        # but standard RAG optimizers usually just take the module and trainset.
        optimized_module = teleprompter.compile(module, trainset=trainset)

        print(f"✅ Compilation with {optimizer_name} complete.")
        return optimized_module


In [13]:
# ==========================================
# 4. Main Data & Evaluation Class (Coordinator)
# ==========================================

class RAGExperimentRunner:
    def __init__(self, collection_name="rag_qa_arena_tech_v3"):
        # Setup Chroma
        self.chroma_client = chromadb.Client()
        try:
            self.chroma_client.delete_collection(collection_name)
        except:
            pass
        self.collection = self.chroma_client.create_collection(name=collection_name)

        # Setup DSPy LM
        self.lm = dspy.LM('openai/gpt-4o-mini')
        dspy.configure(lm=self.lm)

        # RAGAS standard evaluators for the final report
        self.evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
        self.evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

    def _download_file(self, url, filename):
        if not os.path.exists(filename):
            print(f"⬇️  Downloading {filename}...")
            r = requests.get(url)
            with open(filename, 'wb') as f:
                f.write(r.content)

    def load_and_vectorize(self, limit_docs=200):
        # 1. Corpus (Knowledge Base)
        corpus_file = "ragqa_arena_tech_corpus.jsonl"
        self._download_file("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl", corpus_file)

        print(f"📖 Vectorizing corpus from {corpus_file}...")
        ids, documents, metadatas = [], [], []

        with open(corpus_file) as f:
            # Parse JSONL line by line
            raw_data = [ujson.loads(line) for line in f][:limit_docs]
            for idx, entry in enumerate(raw_data):
                ids.append(str(idx))
                # Truncate text as per original requirement
                documents.append(entry['text'][:6000])
                metadatas.append({"source": "ragqa"})

        # Batch upsert to Chroma
        if ids:
            batch_size = 100
            for i in range(0, len(ids), batch_size):
                self.collection.upsert(
                    ids=ids[i:i+batch_size],
                    documents=documents[i:i+batch_size],
                    metadatas=metadatas[i:i+batch_size]
                )

        # 2. Test Data (Updated Source & Field)
        qa_url = "https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl"
        qa_file = "ragqa_arena_tech_examples.jsonl"
        self._download_file(qa_url, qa_file)

        print(f"❓ Loading QA pairs from {qa_file}...")
        data = []
        with open(qa_file) as f:
            # Parse JSONL line by line
            qa_entries = [ujson.loads(line) for line in f]

            # Take small subset for demo speed
            qa_subset = qa_entries[:10]

            for entry in qa_subset:
                # Map 'response' to 'gold_answer' as requested
                data.append(dspy.Example(
                    question=entry['question'],
                    gold_answer=entry['response']
                ).with_inputs('question'))

        return data

    def retrieve(self, query: str, k=3) -> List[str]:
        results = self.collection.query(query_texts=[query], n_results=k)
        return results['documents'][0] if results['documents'] else []

    def evaluate_system(self, module, dataset, name="System"):
        print(f"\n🧪 Evaluating {name}...")
        ragas_data = {"question": [], "answer": [], "contexts": [], "ground_truth": []}

        for example in dataset:
            # Execute Module
            pred = module(question=example.question)

            ragas_data['question'].append(example.question)
            ragas_data['ground_truth'].append(example.gold_answer)
            ragas_data['answer'].append(pred.answer)
            # Ensure context is captured (DSPyRAGModule attaches it)
            ragas_data['contexts'].append(getattr(pred, 'retrieved_contexts', []))

        # Run Comprehensive RAGAS Evaluation
        # Using a subset of metrics for speed in demonstration
        results = evaluate(
            dataset=Dataset.from_dict(ragas_data),
            metrics=[faithfulness, answer_correctness, context_precision, answer_relevancy],
            llm=self.evaluator_llm,
            embeddings=self.evaluator_embeddings
        )
        print(f"📊 {name} Results: {results}")
        return results, Dataset.from_dict(ragas_data)

    def prepare_finetuning(self, dataset, filename="finetune_data.jsonl"):
        print(f"\n💾 Saving fine-tuning data to {filename}...")
        with open(filename, "w") as f:
            for row in dataset:
                msg = {
                    "messages": [
                        {"role": "system", "content": "You are a helpful RAG assistant."},
                        {"role": "user", "content": f"Context: {row['contexts']}\n\nQuestion: {row['question']}"},
                        {"role": "assistant", "content": row['answer']}
                    ]
                }
                f.write(ujson.dumps(msg) + "\n")


In [17]:
# 1. Setup Runner
runner = RAGExperimentRunner()

# Load data (Updated to use new URL and 'response' field)
all_data = runner.load_and_vectorize(limit_docs=65000)

  self.evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
  self.evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


📖 Vectorizing corpus from ragqa_arena_tech_corpus.jsonl...
❓ Loading QA pairs from ragqa_arena_tech_examples.jsonl...


In [18]:
# Get the total number of documents in the collection
total_docs = runner.collection.count()
print(total_docs)

if total_docs == 0:
    print("No documents in the collection.")
else:
    # Determine how many documents to show (max 5 or total_docs if less than 5)
    num_to_show = min(5, total_docs)

    # Generate random indices from the range of existing document IDs
    random_indices = np.random.choice(total_docs, num_to_show, replace=False)


    # Construct the document IDs based on how they were inserted (e.g., 'doc_0', 'doc_1', etc.)
    random_ids = [f"{i}" for i in random_indices]
    print(random_ids)

    # Retrieve the selected documents from ChromaDB
    random_documents = runner.collection.get(ids=random_ids)
    print(random_documents)
    print(f"Displaying {num_to_show} random documents from the collection:")
    for i, doc_content in enumerate(random_documents['documents']):
        print(f"\n--- Document {i+1} (ID: {random_documents['ids'][i]}) ---")
        print(doc_content)
        print("-" * 30)

28436
['11424', '23496', '20327', '247', '11142']
{'ids': ['247', '11142', '11424', '20327', '23496'], 'embeddings': None, 'documents': ['Also available are iosnoop and iotop depending on your specific needs. These terminal commands can be piped through grep to watch for filesystem events from a specific process or against a specific file.', 'If you have no other Apple devices, then when you chose to delete your passwords, you likely deleted them from your phone (at first). But when you re-enabled iCloud, you synced your current phone status to iCloud, which is / was password-less and removed the content from iCloud as well.', 'A static IP for at least one side is advised; however, DDNS will work for this,(if both sides are assigned dynamic addresses and NAT Overloaded), while both routers have fqdns assigned for dynamic tracking of peer: http://www.cisco.com/c/en/us/support/docs/security-vpn/ipsec-architecture-implementation/118048-technote-ipsec-00.html . If DDNS is not leveraged, on

In [30]:
# 1. Your existing in-memory client (with data)
in_memory_col = runner.collection

# 2. Initialize a NEW persistent client
persistent_client = chromadb.PersistentClient(path="./demodata")
persistent_col = persistent_client.get_or_create_collection("rag_qa_arena_tech_v3")

# 3. Retrieve all data from memory
# Get all data (ids, embeddings, documents, metadatas)
existing_data = in_memory_col.get(include=['embeddings', 'documents', 'metadatas'])

In [22]:
def add_in_batches(
    col,
    ids,
    embeddings=None,
    documents=None,
    metadatas=None,
    batch_size=5000,  # choose <= your observed max (5461). 5000 gives headroom.
):
    if not ids:
        return

    n = len(ids)

    # Basic alignment checks (fail fast, prevents subtle corruption)
    if embeddings is not None and len(embeddings) != n:
        raise ValueError(f"len(embeddings)={len(embeddings)} must match len(ids)={n}")
    if documents is not None and len(documents) != n:
        raise ValueError(f"len(documents)={len(documents)} must match len(ids)={n}")
    if metadatas is not None and len(metadatas) != n:
        raise ValueError(f"len(metadatas)={len(metadatas)} must match len(ids)={n}")

    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)

        kwargs = {"ids": ids[start:end]}
        if embeddings is not None:
            kwargs["embeddings"] = embeddings[start:end]
        if documents is not None:
            kwargs["documents"] = documents[start:end]
        if metadatas is not None:
            kwargs["metadatas"] = metadatas[start:end]

        col.add(**kwargs)

In [23]:
add_in_batches(
    persistent_col,
    ids=existing_data["ids"],
    embeddings=existing_data.get("embeddings"),
    documents=existing_data.get("documents"),
    metadatas=existing_data.get("metadatas"),
    batch_size=5000,  # <= 5461
)
print("Data successfully migrated to disk!")

Data successfully migrated to disk!


In [24]:
# Split for Train (Optimization) and Test (Evaluation)
train_data = all_data[:5]
test_data = all_data[5:]

# 2. Instantiate the uncompiled DSPy Module
rag_module = DSPyRAGModule(retrieve_fn=runner.retrieve)

In [25]:
len(all_data)

10

In [82]:
from ragas import evaluate
runner.evaluate_system(rag_module, test_data, name="Baseline")


🧪 Evaluating Baseline...


Evaluating: 100%|██████████| 20/20 [03:00<00:00,  9.03s/it]


📊 Baseline Results: {'faithfulness': 0.9603, 'answer_correctness': 0.6147, 'context_precision': 0.8333, 'answer_relevancy': 0.9558}


({'faithfulness': 0.9603, 'answer_correctness': 0.6147, 'context_precision': 0.8333, 'answer_relevancy': 0.9558},
 Dataset({
     features: ['question', 'answer', 'contexts', 'ground_truth'],
     num_rows: 5
 }))

###DSPY Optimization

In [44]:
import ujson
from dspy.utils import download

# Download question--answer pairs from the RAG-QA Arena "Tech" dataset.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl")

with open("ragqa_arena_tech_examples.jsonl") as f:
    data = [ujson.loads(line) for line in f]

Downloading 'ragqa_arena_tech_examples.jsonl'...


In [45]:
# Inspect one datapoint.
data[0]

{'question': 'why igp is used in mpls?',
 'response': "An IGP exchanges routing prefixes between gateways/routers.  \nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.",
 'gold_doc_ids': [2822, 2823]}

In [54]:
data = [dspy.Example(**d).with_inputs('question') for d in data]

# Let's pick an `example` here from the data.
example = data[2]
example

Example({'question': 'what is b in grep?', 'response': 'In regular expressions, the term "\\b" signifies a "word boundary," and the command searches for all words i in the file linux.txt.', 'gold_doc_ids': [4783]}) (input_keys={'question'})

In [47]:
import random

random.Random(0).shuffle(data)
trainset, devset, testset = data[:200], data[200:500], data[500:1000]

len(trainset), len(devset), len(testset)

(200, 300, 500)

In [49]:
lm = dspy.LM('openai/gpt-4o-mini')
dspy.configure(lm=lm)


In [50]:
qa = dspy.Predict('question: str -> response: str')
response = qa(question="what are high memory and low memory on linux?")

print(response.response)

In Linux, "high memory" and "low memory" refer to different ranges of memory addresses that the kernel can utilize, particularly in 32-bit systems:

1. **Low Memory**: This represents the lower portion of the memory range, typically below 4 GB (or 1 GB in some configurations). This memory is directly accessible by the kernel and can be used for various tasks, such as kernel data structures and process management. Low memory is preferable for operations that require quicker access because it doesn't involve any additional handling.

2. **High Memory**: This refers to memory addresses that are above the low memory address range (above 4 GB on a 32-bit system). The kernel cannot directly access this memory. Instead, it requires special mechanisms like paging to access high memory. This is relevant for systems with large amounts of RAM when they utilize a 64-bit architecture, allowing for more efficient memory use. High memory is often used for user-space processes or data that doesn't req

In [51]:
dspy.inspect_history(n=1)





[34m[2025-12-16T17:02:54.873275][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str):
Your output fields are:
1. `response` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## question ## ]]
what are high memory and low memory on linux?

Respond with the corresponding output fields, starting with the field `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## response ## ]]
In Linux, "high memory" and "low memory" refer to different ranges of memory addresses that the kernel can utilize, particularly in 32-bit systems:

1. **Low Memory**: This represents the lower portion of the memory range, typically below 4

In [52]:
cot = dspy.ChainOfThought('question -> response')
cot(question="should curly braces appear on their own line?")

Prediction(
    reasoning="The placement of curly braces largely depends on the style guide or coding standard you are following. In many programming languages, particularly those influenced by C-style syntax (like Java, C++, and JavaScript), there's a common practice of placing opening curly braces on the same line as the statement (like for loops or function definitions) and placing closing braces on their own line. This keeps related code visually grouped together. However, some coding styles, like the one advocated by Python (which uses indentation instead of braces), or other specific style guides, might suggest different practices. Ultimately, it's about consistency and adhering to the conventions established for the project or language you are working with.",
    response='Curly braces do not necessarily have to appear on their own line; it depends on the coding style guide you are following. Many prefer placing the opening brace on the same line as the preceding statement while

In [61]:
data[2]

Example({'question': 'what is b in grep?', 'response': 'In regular expressions, the term "\\b" signifies a "word boundary," and the command searches for all words i in the file linux.txt.', 'gold_doc_ids': [4783]}) (input_keys={'question'})

In [62]:
from dspy.evaluate import SemanticF1

# Instantiate the metric.
metric = SemanticF1(decompositional=True)
example = data[3]


# Produce a prediction from our `cot` module, using the `example` above as input.
pred = cot(**example.inputs())
#print(pred)
# Compute the metric score for the prediction.
score = metric(example, pred)

print(f"Question: \t {example.question}\n")
print(f"Gold Response: \t {example.response}\n")
print(f"Predicted Response: \t {pred.response}\n")
print(f"Semantic F1 Score: {score:.2f}")

Question: 	 how can i recursively delete empty directories in my home directory?

Gold Response: 	 To remove directories using command-line operations, it is possible to call rmdir on each directory, as it will only delete directories that are empty.

Predicted Response: 	 You can use the following command in your terminal to recursively delete empty directories in your home directory:

```bash
find ~/ -type d -empty -delete
```

This command will search for all empty directories within your home directory and delete them.

Semantic F1 Score: 0.50


In [64]:
dspy.inspect_history(n=2)





[34m[2025-12-16T17:11:49.636668][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str):
Your output fields are:
1. `reasoning` (str): 
2. `response` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Given the fields `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## question ## ]]
how can i recursively delete empty directories in my home directory?

Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## reasoning ## ]]
To recursively delete empty directories in your home directory, you can use the command line. The `find` command is very useful 

In [65]:
# Define an evaluator that we can re-use.
evaluate = dspy.Evaluate(devset=devset, metric=metric, num_threads=24,
                         display_progress=True, display_table=2)

# Evaluate the Chain-of-Thought program.
evaluate(cot)

Average Metric: 127.19 / 300 (42.4%): 100%|██████████| 300/300 [03:32<00:00,  1.41it/s]

2025/12/16 17:17:04 INFO dspy.evaluate.evaluate: Average Metric: 127.19304820331601 / 300 (42.4%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,"when to use c over c++, and c++ over c?","If you are equally familiar with both C++ and C, it's advisable to...",[733],"C and C++ are both powerful programming languages, but they serve ...",Use C when you need low-level programming with high performance an...,✔️ [0.489]
1,should images be stored in a git repository?,"One viewpoint expresses that there is no significant downside, esp...","[6253, 6254, 6275, 6278, 8215]",Storing images in a Git repository can lead to several challenges....,"No, images should generally not be stored in a Git repository due ...",✔️ [0.240]


EvaluationResult(score=42.4, results=<list of 300 results>)

In [66]:
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")

In [67]:
max_characters = 6000  # for truncating >99th percentile of documents
topk_docs_to_retrieve = 5  # number of documents to retrieve per search query

with open("ragqa_arena_tech_corpus.jsonl") as f:
    corpus = [ujson.loads(line)['text'][:max_characters] for line in f]
    print(f"Loaded {len(corpus)} documents. Will encode them below.")

embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)
search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)

Loaded 28436 documents. Will encode them below.
Training a 32-byte FAISS index with 337 partitions, based on 28436 x 512-dim embeddings


In [68]:
class RAGModule(dspy.Module):
    def __init__(self):
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        context = search(question).passages
        return self.respond(context=context, question=question)

In [69]:
rag = RAGModule()
rag(question="what are high memory and low memory on linux?")

Prediction(
    reasoning='High memory and low memory in Linux refer to different segments of the memory addressing space utilized by the kernel and user-space applications. Low memory is the region of memory that is always mapped in the kernel’s address space, allowing the kernel direct access for its operations. This memory is used for critical kernel data structures. Conversely, high memory refers to the portion of system memory that is not permanently mapped to the kernel’s address space. When the kernel needs to access high memory, it must temporarily map it into its address space. This distinction is crucial in managing memory, especially in a 32-bit architecture where the kernel needs to efficiently handle memory allocation and access for both kernel and user processes.',
    response='In Linux, high memory is the segment of memory that is not permanently mapped into the kernel’s address space and requires special procedures for access. Low memory, on the other hand, is always m

In [70]:
dspy.inspect_history()





[34m[2025-12-16T17:26:00.653378][0m

[31mSystem message:[0m

Your input fields are:
1. `context` (str): 
2. `question` (str):
Your output fields are:
1. `reasoning` (str): 
2. `response` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## context ## ]]
{context}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Given the fields `context`, `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## context ## ]]
[1] «As far as I remember, High Memory is used for application space and Low Memory for the kernel. Advantage is that (user-space) applications cant access kernel-space memory.»
[2] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x0000

In [71]:
evaluate(RAGModule())

Average Metric: 164.33 / 300 (54.8%): 100%|██████████| 300/300 [03:48<00:00,  1.31it/s]

2025/12/16 17:31:21 INFO dspy.evaluate.evaluate: Average Metric: 164.32849042819623 / 300 (54.8%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,"when to use c over c++, and c++ over c?","If you are equally familiar with both C++ and C, it's advisable to...",[733],C should be preferred over C++ when developing for embedded system...,Use C over C++ when developing embedded systems or applications wh...,✔️ [0.500]
1,should images be stored in a git repository?,"One viewpoint expresses that there is no significant downside, esp...","[6253, 6254, 6275, 6278, 8215]",Storing images in a Git repository is generally not recommended du...,"In summary, while it is possible to store images in a Git reposito...",✔️ [0.429]


EvaluationResult(score=54.78, results=<list of 300 results>)

In [72]:
tp = dspy.MIPROv2(metric=metric, auto="medium", num_threads=24)  # use fewer threads if your rate limit is small

optimized_rag = tp.compile(RAGModule(), trainset=trainset,
                           max_bootstrapped_demos=2, max_labeled_demos=2)

2025/12/16 17:37:17 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 18
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 160

2025/12/16 17:37:17 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/12/16 17:37:17 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/12/16 17:37:17 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...


Bootstrapping set 1/12
Bootstrapping set 2/12
Bootstrapping set 3/12


 12%|█▎        | 5/40 [01:25<10:00, 17.15s/it]


Bootstrapped 2 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/12


 15%|█▌        | 6/40 [01:45<09:59, 17.62s/it]


Bootstrapped 1 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 5/12


  2%|▎         | 1/40 [00:19<12:22, 19.03s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/12


  8%|▊         | 3/40 [00:54<11:08, 18.08s/it]


Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 7/12


  5%|▌         | 2/40 [00:34<11:04, 17.48s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/12


 10%|█         | 4/40 [01:02<09:24, 15.67s/it]


Bootstrapped 1 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 9/12


  5%|▌         | 2/40 [00:34<10:54, 17.22s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 10/12


 12%|█▎        | 5/40 [01:21<09:33, 16.38s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 11/12


 12%|█▎        | 5/40 [01:10<08:15, 14.16s/it]


Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 12/12


  2%|▎         | 1/40 [00:11<07:39, 11.79s/it]
2025/12/16 17:46:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/12/16 17:46:39 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


2025/12/16 17:46:59 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...

2025/12/16 17:48:03 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/12/16 17:48:03 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.

2025/12/16 17:48:03 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Imagine you are a tech support specialist assisting a user with a critical technical issue on a macOS or Unix/Linux system. Your task is to provide a comprehensive explanation to their query based on the provided context. Using the fields `context` and `question`, think critically about the information, and produce a detailed `response` that not only answers the user's question but also includes practical advice or troubleshooting tips they can apply immediately. Ensure your response empowers the user to understand their situation better and take the necessary steps to resolve their issue.

2025/12/

Average Metric: 89.97 / 160 (56.2%): 100%|██████████| 160/160 [01:53<00:00,  1.41it/s]

2025/12/16 17:49:57 INFO dspy.evaluate.evaluate: Average Metric: 89.96909671730273 / 160 (56.2%)
2025/12/16 17:49:57 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 56.23

2025/12/16 17:49:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 23 - Minibatch ==



Average Metric: 22.56 / 35 (64.4%): 100%|██████████| 35/35 [00:42<00:00,  1.21s/it]

2025/12/16 17:50:39 INFO dspy.evaluate.evaluate: Average Metric: 22.556492536651767 / 35 (64.4%)
2025/12/16 17:50:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.45 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 17:50:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45]
2025/12/16 17:50:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23]
2025/12/16 17:50:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 56.23


2025/12/16 17:50:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 23 - Minibatch ==



Average Metric: 19.87 / 35 (56.8%): 100%|██████████| 35/35 [00:40<00:00,  1.16s/it]

2025/12/16 17:51:19 INFO dspy.evaluate.evaluate: Average Metric: 19.869728051108016 / 35 (56.8%)
2025/12/16 17:51:19 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.77 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 2'].
2025/12/16 17:51:19 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77]
2025/12/16 17:51:19 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23]
2025/12/16 17:51:19 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 56.23


2025/12/16 17:51:19 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 23 - Minibatch ==



Average Metric: 23.84 / 35 (68.1%): 100%|██████████| 35/35 [00:37<00:00,  1.06s/it]

2025/12/16 17:51:57 INFO dspy.evaluate.evaluate: Average Metric: 23.839503800001268 / 35 (68.1%)
2025/12/16 17:51:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.11 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 17:51:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11]
2025/12/16 17:51:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23]
2025/12/16 17:51:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 56.23


2025/12/16 17:51:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 23 - Minibatch ==



Average Metric: 22.12 / 35 (63.2%): 100%|██████████| 35/35 [00:43<00:00,  1.23s/it]

2025/12/16 17:52:40 INFO dspy.evaluate.evaluate: Average Metric: 22.116801438694285 / 35 (63.2%)
2025/12/16 17:52:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.19 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].
2025/12/16 17:52:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19]
2025/12/16 17:52:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23]
2025/12/16 17:52:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 56.23


2025/12/16 17:52:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 23 - Minibatch ==



Average Metric: 19.88 / 35 (56.8%): 100%|██████████| 35/35 [00:47<00:00,  1.36s/it]

2025/12/16 17:53:28 INFO dspy.evaluate.evaluate: Average Metric: 19.877225795185158 / 35 (56.8%)
2025/12/16 17:53:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.79 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2025/12/16 17:53:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79]
2025/12/16 17:53:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23]
2025/12/16 17:53:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 56.23


2025/12/16 17:53:28 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 23 - Full Evaluation =====
2025/12/16 17:53:28 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.11) from minibatch trials...



Average Metric: 95.69 / 160 (59.8%): 100%|██████████| 160/160 [01:40<00:00,  1.59it/s]

2025/12/16 17:55:08 INFO dspy.evaluate.evaluate: Average Metric: 95.689302150198 / 160 (59.8%)
2025/12/16 17:55:08 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 59.81
2025/12/16 17:55:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81]
2025/12/16 17:55:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.81
2025/12/16 17:55:08 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/16 17:55:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 23 - Minibatch ==



Average Metric: 20.19 / 35 (57.7%): 100%|██████████| 35/35 [00:42<00:00,  1.21s/it]

2025/12/16 17:55:51 INFO dspy.evaluate.evaluate: Average Metric: 20.193079991579914 / 35 (57.7%)
2025/12/16 17:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.69 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 17:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69]
2025/12/16 17:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81]
2025/12/16 17:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.81


2025/12/16 17:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 23 - Minibatch ==



Average Metric: 21.10 / 35 (60.3%): 100%|██████████| 35/35 [00:42<00:00,  1.21s/it]

2025/12/16 17:56:33 INFO dspy.evaluate.evaluate: Average Metric: 21.104435920472095 / 35 (60.3%)
2025/12/16 17:56:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.3 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1'].
2025/12/16 17:56:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3]
2025/12/16 17:56:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81]
2025/12/16 17:56:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.81


2025/12/16 17:56:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 23 - Minibatch ==



Average Metric: 19.75 / 35 (56.4%): 100%|██████████| 35/35 [00:41<00:00,  1.20s/it]

2025/12/16 17:57:15 INFO dspy.evaluate.evaluate: Average Metric: 19.74756486658851 / 35 (56.4%)
2025/12/16 17:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 56.42 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 3'].
2025/12/16 17:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42]
2025/12/16 17:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81]
2025/12/16 17:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.81


2025/12/16 17:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 23 - Minibatch ==



Average Metric: 19.07 / 35 (54.5%): 100%|██████████| 35/35 [00:34<00:00,  1.01it/s]

2025/12/16 17:57:50 INFO dspy.evaluate.evaluate: Average Metric: 19.06859683486066 / 35 (54.5%)
2025/12/16 17:57:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 54.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2025/12/16 17:57:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48]
2025/12/16 17:57:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81]
2025/12/16 17:57:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.81


2025/12/16 17:57:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 23 - Minibatch ==



Average Metric: 21.07 / 35 (60.2%): 100%|██████████| 35/35 [00:01<00:00, 21.46it/s]

2025/12/16 17:57:51 INFO dspy.evaluate.evaluate: Average Metric: 21.066593729740685 / 35 (60.2%)
2025/12/16 17:57:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.19 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 17:57:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19]
2025/12/16 17:57:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81]
2025/12/16 17:57:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 59.81


2025/12/16 17:57:51 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 23 - Full Evaluation =====
2025/12/16 17:57:51 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 64.45) from minibatch trials...



Average Metric: 97.56 / 160 (61.0%): 100%|██████████| 160/160 [01:47<00:00,  1.48it/s]

2025/12/16 17:59:39 INFO dspy.evaluate.evaluate: Average Metric: 97.56216202908269 / 160 (61.0%)
2025/12/16 17:59:39 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 60.98
2025/12/16 17:59:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98]
2025/12/16 17:59:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98
2025/12/16 17:59:39 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/16 17:59:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 23 - Minibatch ==



Average Metric: 20.63 / 35 (58.9%): 100%|██████████| 35/35 [00:02<00:00, 11.98it/s]

2025/12/16 17:59:42 INFO dspy.evaluate.evaluate: Average Metric: 20.62872886582464 / 35 (58.9%)
2025/12/16 17:59:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.94 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 17:59:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94]
2025/12/16 17:59:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98]
2025/12/16 17:59:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 17:59:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 23 - Minibatch ==



Average Metric: 20.18 / 35 (57.7%): 100%|██████████| 35/35 [00:47<00:00,  1.35s/it]

2025/12/16 18:00:29 INFO dspy.evaluate.evaluate: Average Metric: 20.181681624595367 / 35 (57.7%)
2025/12/16 18:00:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.66 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 10'].
2025/12/16 18:00:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66]
2025/12/16 18:00:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98]
2025/12/16 18:00:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:00:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 23 - Minibatch ==



Average Metric: 21.81 / 35 (62.3%): 100%|██████████| 35/35 [00:36<00:00,  1.05s/it]

2025/12/16 18:01:06 INFO dspy.evaluate.evaluate: Average Metric: 21.805854852378438 / 35 (62.3%)
2025/12/16 18:01:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.3 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 18:01:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66, 62.3]
2025/12/16 18:01:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98]
2025/12/16 18:01:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:01:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 23 - Minibatch ==



Average Metric: 19.98 / 35 (57.1%): 100%|██████████| 35/35 [00:40<00:00,  1.16s/it]

2025/12/16 18:01:47 INFO dspy.evaluate.evaluate: Average Metric: 19.97927249111863 / 35 (57.1%)
2025/12/16 18:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.08 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 8'].
2025/12/16 18:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66, 62.3, 57.08]
2025/12/16 18:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98]
2025/12/16 18:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:01:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 23 - Minibatch ==



Average Metric: 22.87 / 35 (65.4%): 100%|██████████| 35/35 [00:41<00:00,  1.17s/it]

2025/12/16 18:02:28 INFO dspy.evaluate.evaluate: Average Metric: 22.874626288047256 / 35 (65.4%)
2025/12/16 18:02:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.36 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 18:02:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66, 62.3, 57.08, 65.36]
2025/12/16 18:02:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98]
2025/12/16 18:02:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:02:28 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 23 - Full Evaluation =====
2025/12/16 18:02:28 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 65.36) from minibatch trials...



Average Metric: 96.83 / 160 (60.5%): 100%|██████████| 160/160 [01:44<00:00,  1.54it/s]

2025/12/16 18:04:12 INFO dspy.evaluate.evaluate: Average Metric: 96.83368005552488 / 160 (60.5%)
2025/12/16 18:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98, 60.52]
2025/12/16 18:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98
2025/12/16 18:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/16 18:04:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 23 - Minibatch ==



Average Metric: 19.47 / 35 (55.6%): 100%|██████████| 35/35 [00:38<00:00,  1.11s/it]

2025/12/16 18:04:51 INFO dspy.evaluate.evaluate: Average Metric: 19.47227627767913 / 35 (55.6%)
2025/12/16 18:04:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.64 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 7'].
2025/12/16 18:04:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66, 62.3, 57.08, 65.36, 55.64]
2025/12/16 18:04:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98, 60.52]
2025/12/16 18:04:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:04:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 23 - Minibatch ==



Average Metric: 19.99 / 35 (57.1%): 100%|██████████| 35/35 [00:19<00:00,  1.80it/s]

2025/12/16 18:05:11 INFO dspy.evaluate.evaluate: Average Metric: 19.987004875081936 / 35 (57.1%)
2025/12/16 18:05:11 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.11 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 6'].
2025/12/16 18:05:11 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66, 62.3, 57.08, 65.36, 55.64, 57.11]
2025/12/16 18:05:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98, 60.52]
2025/12/16 18:05:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:05:11 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 23 - Minibatch ==



Average Metric: 20.23 / 35 (57.8%): 100%|██████████| 35/35 [00:39<00:00,  1.12s/it]

2025/12/16 18:05:50 INFO dspy.evaluate.evaluate: Average Metric: 20.229472275943348 / 35 (57.8%)
2025/12/16 18:05:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.8 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 11'].
2025/12/16 18:05:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [64.45, 56.77, 68.11, 63.19, 56.79, 57.69, 60.3, 56.42, 54.48, 60.19, 58.94, 57.66, 62.3, 57.08, 65.36, 55.64, 57.11, 57.8]
2025/12/16 18:05:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98, 60.52]
2025/12/16 18:05:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98


2025/12/16 18:05:50 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 23 - Full Evaluation =====
2025/12/16 18:05:50 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 63.19) from minibatch trials...



Average Metric: 96.59 / 160 (60.4%): 100%|██████████| 160/160 [01:43<00:00,  1.54it/s]

2025/12/16 18:07:34 INFO dspy.evaluate.evaluate: Average Metric: 96.58909676931971 / 160 (60.4%)
2025/12/16 18:07:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [56.23, 59.81, 60.98, 60.52, 60.37]
2025/12/16 18:07:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 60.98
2025/12/16 18:07:34 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/12/16 18:07:34 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 60.98!





In [73]:
baseline = rag(question="cmd+tab does not work on hidden or minimized windows")
print(baseline.response)

Command + Tab will not switch to minimized windows directly. To bring back a minimized app, first ensure the app is not minimized. If it is minimized, you'll need to select it from the application list without the Command key being released to allow focus transition. Additionally, you could alter your System Preferences in Mission Control to include options for managing minimized applications.


In [74]:
pred = optimized_rag(question="cmd+tab does not work on hidden or minimized windows")
print(pred.response)

When using Command + Tab on macOS, it's important to note that this shortcut does not reactivate hidden or minimized windows by default. Instead, it allows you to cycle through open applications that are active or visible. If you want to ensure that you can switch to a minimized app, you should first follow these steps: 

1. While holding the Command key, tap the Tab key until you highlight the app you want to switch to.
2. Before releasing the Command key, press and hold the Option key. This allows the app to take focus even if it was minimized before.
3. You will need to switch focus to another app entirely before returning to your minimized app by switching back while continuing to hold the keys.

If you have the "When switching to an application, switch to a Space with open windows for the application" option unchecked in Mission Control under System Preferences, this can help this switching process as well. 

Also, consider using the shortcut Command + Option + H + M to hide all o

In [76]:
dspy.inspect_history()





[34m[2025-12-16T18:25:32.142559][0m

[31mSystem message:[0m

Your input fields are:
1. `context` (str): 
2. `question` (str):
Your output fields are:
1. `reasoning` (str): 
2. `response` (str):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## context ## ]]
{context}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        Imagine you are a tech support specialist assisting a user with a critical technical issue on a macOS or Unix/Linux system. Your task is to provide a comprehensive explanation to their query based on the provided context. Using the fields `context` and `question`, think critically about the information, and produce a detailed `response` that not only answers the user's question but also includes practical advice or troubleshooting tips they can apply immediately. Ensure your respons

###Improved Instruction

Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.

In [78]:
optimized_rag

respond.predict = Predict(StringSignature(context, question -> reasoning, response
    instructions="Imagine you are a tech support specialist assisting a user with a critical technical issue on a macOS or Unix/Linux system. Your task is to provide a comprehensive explanation to their query based on the provided context. Using the fields `context` and `question`, think critically about the information, and produce a detailed `response` that not only answers the user's question but also includes practical advice or troubleshooting tips they can apply immediately. Ensure your response empowers the user to understand their situation better and take the necessary steps to resolve their issue."
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    reasonin

In [80]:
rag

respond.predict = Predict(StringSignature(context, question -> reasoning, response
    instructions='Given the fields `context`, `question`, produce the fields `response`.'
    context = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Context:', 'desc': '${context}'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    response = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Response:', 'desc': '${response}'})
))

#Synthetic Dataset Generation