In [1]:
import dspy
import chromadb
import ujson
import random

from chromadb.utils import embedding_functions
from dspy.evaluate import SemanticF1
from dspy.retrieve.chromadb_rm import ChromadbRM
from dspy.utils import download

# Download the Data

In [2]:
# Fetch the corpus and the QA examples, then parse each JSON-Lines file into a
# list of dicts. The open() calls below read the files from the working
# directory, so download() is expected to have saved them there.
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")
# Explicit UTF-8: the data contains non-ASCII text, and the platform default
# encoding is not UTF-8 everywhere. Blank lines are skipped so a stray empty
# line cannot crash the JSON parser.
with open("./ragqa_arena_tech_corpus.jsonl", "r", encoding="utf-8") as fp:
    corpus = [ujson.loads(line) for line in fp if line.strip()]

download(
    "https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl"
)
with open("ragqa_arena_tech_examples.jsonl", "r", encoding="utf-8") as fp:
    qa = [ujson.loads(line) for line in fp if line.strip()]

# Show one sample record from each dataset.
print(corpus[2])
print(qa[2])

# Only a cell's final expression is displayed, so report both sizes together
# instead of leaving a bare `len(corpus)` mid-cell where its value is discarded.
len(corpus), len(qa)

{'doc_id': 131078, 'author': None, 'text': 'http://abtevrythng.blogspot.com/2010/06/adding-cer-certificates-on-your-android.html Shows how to actually achieve this. Worked fine for me. Try it out. In this article .cer to .pfx (which is what you need on Android) conversion is given. Simple method is given using which you can convert .cer to .pfx and use it to connect to the Wi-Fi network. Plus you dont need any Key to convert .cer to .pfx!!!'}
{'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}


2064

# Connect to ChromaDB

In [3]:
# Open (or create) the on-disk Chroma database and the collection that the
# retriever queries.
client = chromadb.PersistentClient(path="./db")
collection = client.get_or_create_collection(name="test")

# Index the corpus the first time the notebook runs against a fresh database.
# Without this step a clean "Restart & Run All" would query an empty
# collection. An already-populated collection is left untouched.
# NOTE: embedding the whole corpus can take a while on first run.
if collection.count() == 0:
    # Chroma ids must be strings; keying by id also collapses any duplicate
    # doc_id so a batch never contains the same id twice.
    docs_by_id = {str(doc["doc_id"]): doc["text"] for doc in corpus}
    doc_ids = list(docs_by_id)

    # Add in modest batches to stay below Chroma's per-call batch limit.
    INDEX_BATCH_SIZE = 1000
    for start in range(0, len(doc_ids), INDEX_BATCH_SIZE):
        batch_ids = doc_ids[start : start + INDEX_BATCH_SIZE]
        collection.add(
            ids=batch_ids,
            documents=[docs_by_id[doc_id] for doc_id in batch_ids],
        )

# Define Retrievers

In [4]:
# Embedding function handed to the retriever below.
embed_fn = embedding_functions.DefaultEmbeddingFunction()

# DSPy retriever over the "test" collection stored under ./db; k=5 is the
# number of passages requested per query.
retriever_model = ChromadbRM(
    k=5,
    embedding_function=embed_fn,
    persist_directory="./db",
    collection_name="test",
)

# Load LM

In [5]:
# Build the language-model client and register it as DSPy's default LM.
lm = dspy.LM(
    temperature=0.0,  # presumably chosen to keep generations as repeatable as possible
    model="ollama_chat/llama3.1",
)
dspy.configure(lm=lm)

# Build RAG Module

In [6]:
class RAG(dspy.Module):
    """Retrieval-augmented question answering program.

    Retrieves passages for a question with the notebook-level
    ``retriever_model`` and passes them, together with the question, to a
    chain-of-thought predictor that produces a ``response`` field.
    """

    def __init__(self):
        # Run dspy.Module's own initialisation before attaching sub-modules so
        # the instance does not depend on the base class tolerating a skipped
        # __init__.
        super().__init__()
        self.respond = dspy.ChainOfThought("context, question -> response")

    def forward(self, question):
        """Answer ``question`` using retrieved passages as context.

        Args:
            question: The question to answer; also used as the retrieval query.

        Returns:
            The result of calling the ``context, question -> response``
            chain-of-thought predictor.
        """
        ret_docs = retriever_model(question)
        # Keep only the passage text of each retrieved document.
        context = [doc["long_text"] for doc in ret_docs]
        return self.respond(context=context, question=question)

# DSPy RAG Optimization

In [7]:
# Turn every raw QA record into a dspy.Example whose only input is "question".
qa_examples = [
    dspy.Example(**record).with_inputs("question")
    for record in qa
]

# Shuffle in place with a fixed seed so the splits are the same on every run.
random.Random(23).shuffle(qa_examples)

# Consecutive, non-overlapping slices of the shuffled list.
train = qa_examples[:200]
dev = qa_examples[200:500]
test = qa_examples[500:1000]

(len(train), len(dev), len(test))

(200, 300, 500)

In [9]:
# Display one training example to sanity-check its fields and input keys.
train[2]

Example({'question': 'how can i show typing keyboard in record screen', 'response': 'One method involves using Screenflow, a software that includes this feature and is priced at $99.  \nAdditionally, standalone apps like Keycastr and Mouseposé can be used for the same purpose.  \nAnother alternative is showing the on-screen keyboard, although this results in a different aesthetic.', 'gold_doc_ids': [6129, 6130, 6131]}) (input_keys={'question'})

In [11]:
# Metric used to score candidate programs during optimisation.
metric = SemanticF1(decompositional=True)

# MIPROv2 optimiser on the "medium" auto budget, evaluating with 4 threads.
tp = dspy.MIPROv2(
    metric=metric,
    auto="medium",
    num_threads=4,
)

# Optimise a fresh RAG program on the training split, with at most two
# labeled and two bootstrapped demos. The confirmation prompt is disabled so
# the cell runs unattended.
optimized_rag = tp.compile(
    RAG(),
    trainset=train,
    max_labeled_demos=2,
    max_bootstrapped_demos=2,
    requires_permission_to_run=False,
)

2024/12/31 00:27:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 25
minibatch: True
num_candidates: 19
valset size: 160

2024/12/31 00:27:39 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2024/12/31 00:27:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2024/12/31 00:27:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...


Bootstrapping set 1/19
Bootstrapping set 2/19
Bootstrapping set 3/19


 12%|█▎        | 5/40 [01:03<07:23, 12.67s/it]


Bootstrapped 2 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/19


  2%|▎         | 1/40 [00:09<06:19,  9.72s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 5/19


  2%|▎         | 1/40 [00:09<06:22,  9.81s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 6/19


  5%|▌         | 2/40 [00:25<08:12, 12.95s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 7/19


  5%|▌         | 2/40 [00:20<06:21, 10.04s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 8/19


  2%|▎         | 1/40 [00:08<05:24,  8.32s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 9/19


  8%|▊         | 3/40 [00:30<06:15, 10.15s/it]


Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 10/19


  2%|▎         | 1/40 [00:10<06:31, 10.04s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 11/19


  2%|▎         | 1/40 [00:11<07:15, 11.18s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 12/19


  2%|▎         | 1/40 [00:06<04:06,  6.33s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 13/19


  2%|▎         | 1/40 [00:10<07:02, 10.83s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 14/19


  2%|▎         | 1/40 [00:09<06:00,  9.24s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 15/19


  2%|▎         | 1/40 [00:04<02:47,  4.31s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 16/19


  5%|▌         | 2/40 [00:20<06:37, 10.45s/it]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 17/19


  5%|▌         | 2/40 [00:15<04:46,  7.53s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 18/19


  5%|▌         | 2/40 [00:11<03:41,  5.82s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 19/19


 10%|█         | 4/40 [00:39<05:54,  9.84s/it]
2024/12/31 00:32:46 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2024/12/31 00:32:46 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2024/12/31 00:33:30 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2024/12/31 00:38:06 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2024/12/31 00:38:06 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.

2024/12/31 00:38:06 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given a context and a question, use the retrieved information and reasoning to generate a response that is coherent and relevant.

2024/12/31 00:38:06 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Is it good practice to always have an autoincrement integer primary key?

Reasoning: Let's think step by step in order to The question of whether it is good practice to always have an autoincrement integer primary key is a topic of debate among database designers. Some argue that it is a good idea, as it provides a unique identifier for each row and can be used as a foreign key in other tables. However, others p

Average Metric: 98.72 / 158 (62.5%):  99%|█████████▉| 158/160 [13:07<00:08,  4.06s/it]

2024/12/31 00:51:20 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'how should i test randomness?', 'response': 'Run tests multiple times and visualize your data. \nAnalyzing the results of these tests will allow you to determine whether the algorithm performs satisfactorily by ensuring all potential outcomes are represented and that their frequency is properly distributed.', 'gold_doc_ids': [4525]}) (input_keys={'question'}): Expected dict_keys(['reasoning', 'ground_truth_key_ideas', 'system_response_key_ideas', 'discussion', 'recall', 'precision']) but got dict_keys(['reasoning', 'ground_truth_key_ideas', 'system_response_key_ideas', 'discussion', 'recall']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 99.20 / 159 (62.4%): 100%|██████████| 160/160 [13:19<00:00,  5.00s/it]

2024/12/31 00:51:26 INFO dspy.evaluate.evaluate: Average Metric: 99.1984138482478 / 160 (62.0%)
2024/12/31 00:51:26 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 62.0

2024/12/31 00:51:26 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2024/12/31 00:51:26 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2024/12/31 00:51:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==



Average Metric: 16.71 / 25 (66.8%): 100%|██████████| 25/25 [02:59<00:00,  7.18s/it]

2024/12/31 00:54:25 INFO dspy.evaluate.evaluate: Average Metric: 16.706298003072195 / 25 (66.8%)
2024/12/31 00:54:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.83 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].
2024/12/31 00:54:25 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83]
2024/12/31 00:54:25 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 00:54:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 00:54:25 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==



Average Metric: 17.41 / 25 (69.6%): 100%|██████████| 25/25 [02:50<00:00,  6.81s/it]

2024/12/31 00:57:15 INFO dspy.evaluate.evaluate: Average Metric: 17.40610715449425 / 25 (69.6%)
2024/12/31 00:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.62 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].
2024/12/31 00:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62]
2024/12/31 00:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 00:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 00:57:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==



Average Metric: 16.19 / 25 (64.8%): 100%|██████████| 25/25 [02:32<00:00,  6.10s/it]

2024/12/31 00:59:48 INFO dspy.evaluate.evaluate: Average Metric: 16.191291019267613 / 25 (64.8%)
2024/12/31 00:59:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 64.77 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].
2024/12/31 00:59:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77]
2024/12/31 00:59:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 00:59:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 00:59:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==



Average Metric: 16.60 / 25 (66.4%): 100%|██████████| 25/25 [02:45<00:00,  6.63s/it]

2024/12/31 01:02:34 INFO dspy.evaluate.evaluate: Average Metric: 16.596550198082998 / 25 (66.4%)
2024/12/31 01:02:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.39 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].
2024/12/31 01:02:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39]
2024/12/31 01:02:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:02:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:02:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==



Average Metric: 15.88 / 25 (63.5%): 100%|██████████| 25/25 [02:39<00:00,  6.39s/it]

2024/12/31 01:05:13 INFO dspy.evaluate.evaluate: Average Metric: 15.884051941541601 / 25 (63.5%)
2024/12/31 01:05:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.54 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].
2024/12/31 01:05:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54]
2024/12/31 01:05:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:05:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:05:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==



Average Metric: 15.83 / 25 (63.3%): 100%|██████████| 25/25 [02:15<00:00,  5.41s/it]

2024/12/31 01:07:29 INFO dspy.evaluate.evaluate: Average Metric: 15.82948051948052 / 25 (63.3%)
2024/12/31 01:07:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.32 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].
2024/12/31 01:07:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32]
2024/12/31 01:07:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:07:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:07:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==



Average Metric: 15.62 / 25 (62.5%): 100%|██████████| 25/25 [02:48<00:00,  6.76s/it]

2024/12/31 01:10:18 INFO dspy.evaluate.evaluate: Average Metric: 15.623030303030303 / 25 (62.5%)
2024/12/31 01:10:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.49 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].
2024/12/31 01:10:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49]
2024/12/31 01:10:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:10:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:10:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==



Average Metric: 17.33 / 25 (69.3%): 100%|██████████| 25/25 [02:57<00:00,  7.11s/it]

2024/12/31 01:13:15 INFO dspy.evaluate.evaluate: Average Metric: 17.334413145539905 / 25 (69.3%)
2024/12/31 01:13:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.34 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].
2024/12/31 01:13:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34]
2024/12/31 01:13:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:13:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:13:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==



Average Metric: 14.80 / 25 (59.2%): 100%|██████████| 25/25 [03:34<00:00,  8.58s/it]

2024/12/31 01:16:50 INFO dspy.evaluate.evaluate: Average Metric: 14.79684695356553 / 25 (59.2%)
2024/12/31 01:16:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.19 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].
2024/12/31 01:16:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19]
2024/12/31 01:16:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:16:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:16:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==



Average Metric: 15.11 / 25 (60.4%): 100%|██████████| 25/25 [02:42<00:00,  6.52s/it]

2024/12/31 01:19:33 INFO dspy.evaluate.evaluate: Average Metric: 15.111731441279254 / 25 (60.4%)
2024/12/31 01:19:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.45 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].
2024/12/31 01:19:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45]
2024/12/31 01:19:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0]
2024/12/31 01:19:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.0


2024/12/31 01:19:33 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====
2024/12/31 01:19:33 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 69.62) from minibatch trials...



Average Metric: 24.57 / 39 (63.0%):  24%|██▍       | 38/160 [03:29<10:28,  5.15s/it]

2024/12/31 01:23:08 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'what are some undocumented iphone features?', 'response': "When you're using the iPod app and a new song starts playing, your iPhone's screen will light up briefly to display the track details, assuming the iPhone isn't in your pocket.  \nIf you double-click the home button while running iOS4, a customized task bar will appear, showing recent and currently running multi-tasked apps, as well as iPod controls.  \nBy pressing both the home button and the power button, your screen's contents will be captured and saved to your Photo Albums.  \nShould you wish to quickly scroll back to the top of any app, simply tapping the status bar will do the trick.  \nFor efficient typing on the iPhone's keyboard, you can hold the '123' button and slide to any number or punctuation, then lift your finger to return to the alphabetic layout.  The iPhone hides a Field Test mode, accessed by dialing *3001#12345#* 

Average Metric: 100.70 / 159 (63.3%): 100%|██████████| 160/160 [14:53<00:00,  5.58s/it]

2024/12/31 01:34:26 INFO dspy.evaluate.evaluate: Average Metric: 100.69839476237539 / 160 (62.9%)
2024/12/31 01:34:26 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 62.94
2024/12/31 01:34:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:34:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94
2024/12/31 01:34:26 INFO dspy.teleprompt.mipro_optimizer_v2: 

2024/12/31 01:34:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==



Average Metric: 12.60 / 25 (50.4%): 100%|██████████| 25/25 [02:48<00:00,  6.73s/it]

2024/12/31 01:37:14 INFO dspy.evaluate.evaluate: Average Metric: 12.603809523809524 / 25 (50.4%)
2024/12/31 01:37:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.42 on minibatch of size 25 with parameters ['Predictor 0: Instruction 17', 'Predictor 0: Few-Shot Set 17'].
2024/12/31 01:37:14 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42]
2024/12/31 01:37:14 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:37:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:37:14 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==



Average Metric: 16.41 / 25 (65.6%): 100%|██████████| 25/25 [02:33<00:00,  6.13s/it]

2024/12/31 01:39:48 INFO dspy.evaluate.evaluate: Average Metric: 16.411973311403 / 25 (65.6%)
2024/12/31 01:39:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.65 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 16'].
2024/12/31 01:39:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65]
2024/12/31 01:39:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:39:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:39:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==



Average Metric: 15.08 / 25 (60.3%): 100%|██████████| 25/25 [02:35<00:00,  6.20s/it]

2024/12/31 01:42:23 INFO dspy.evaluate.evaluate: Average Metric: 15.080806338589792 / 25 (60.3%)
2024/12/31 01:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.32 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 13'].
2024/12/31 01:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32]
2024/12/31 01:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:42:23 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==



Average Metric: 16.31 / 25 (65.2%): 100%|██████████| 25/25 [02:09<00:00,  5.18s/it]

2024/12/31 01:44:32 INFO dspy.evaluate.evaluate: Average Metric: 16.30507730594091 / 25 (65.2%)
2024/12/31 01:44:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.22 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].
2024/12/31 01:44:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22]
2024/12/31 01:44:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:44:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:44:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==



Average Metric: 14.79 / 25 (59.2%): 100%|██████████| 25/25 [02:57<00:00,  7.08s/it]

2024/12/31 01:47:29 INFO dspy.evaluate.evaluate: Average Metric: 14.79294413919414 / 25 (59.2%)
2024/12/31 01:47:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.17 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 12'].
2024/12/31 01:47:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17]
2024/12/31 01:47:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:47:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:47:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==



Average Metric: 15.18 / 25 (60.7%): 100%|██████████| 25/25 [00:00<00:00, 75.24it/s]

2024/12/31 01:47:30 INFO dspy.evaluate.evaluate: Average Metric: 15.177870763191072 / 25 (60.7%)
2024/12/31 01:47:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.71 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].
2024/12/31 01:47:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71]
2024/12/31 01:47:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:47:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:47:30 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==



Average Metric: 13.12 / 21 (62.5%):  84%|████████▍ | 21/25 [02:25<00:32,  8.04s/it]

2024/12/31 01:50:01 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'what tiny thing in lion makes you smile or has caught you off guard?', 'response': 'The features people like are: merging folders in Finder; QuickTime M4A recordings; the offer to add a Gmail account if logging in on Safari for the first time; the redesigned lock screen; Autocorrect; Multi-touch swiping of page history in Safari; tmutil command line into Time Machine; Quicklook natively supports animated GIFs and previews URL content directly from Mail/iChat; Mail.app is full of smiles and animations, and you can preview webpage links (a button opens a window); unsupported hardware notices; new services for opening a New Terminal Window/Tab at Folder; you can move items in the finder by using ⌘-C to copy and ⌥-⌘-V to move; more international localizations for system Text to Speech; native support for Microsoft DFS; disconnecting an iPhone doesn\'t wake display; wireless Internet Sharing suppo

Average Metric: 15.53 / 24 (64.7%): 100%|██████████| 25/25 [02:45<00:00,  6.64s/it]

2024/12/31 01:50:16 INFO dspy.evaluate.evaluate: Average Metric: 15.532663046426318 / 25 (62.1%)
2024/12/31 01:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.13 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 10'].
2024/12/31 01:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13]
2024/12/31 01:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:50:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==



Average Metric: 17.20 / 25 (68.8%): 100%|██████████| 25/25 [02:34<00:00,  6.18s/it]

2024/12/31 01:52:50 INFO dspy.evaluate.evaluate: Average Metric: 17.203314137021543 / 25 (68.8%)
2024/12/31 01:52:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.81 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].
2024/12/31 01:52:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81]
2024/12/31 01:52:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:52:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:52:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==



Average Metric: 15.84 / 25 (63.3%): 100%|██████████| 25/25 [02:31<00:00,  6.07s/it]

2024/12/31 01:55:22 INFO dspy.evaluate.evaluate: Average Metric: 15.83514146226184 / 25 (63.3%)
2024/12/31 01:55:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.34 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 3'].
2024/12/31 01:55:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34]
2024/12/31 01:55:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:55:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:55:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==



Average Metric: 17.17 / 25 (68.7%): 100%|██████████| 25/25 [02:46<00:00,  6.65s/it]

2024/12/31 01:58:08 INFO dspy.evaluate.evaluate: Average Metric: 17.167095610482708 / 25 (68.7%)
2024/12/31 01:58:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.67 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 7'].
2024/12/31 01:58:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34, 68.67]
2024/12/31 01:58:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94]
2024/12/31 01:58:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 62.94


2024/12/31 01:58:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====
2024/12/31 01:58:08 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.81) from minibatch trials...



Average Metric: 103.93 / 159 (65.4%):  99%|█████████▉| 159/160 [15:32<00:05,  5.95s/it]



Average Metric: 103.93 / 159 (65.4%): 100%|██████████| 160/160 [15:38<00:00,  5.87s/it]

2024/12/31 02:13:47 INFO dspy.evaluate.evaluate: Average Metric: 103.92736338065342 / 160 (65.0%)
2024/12/31 02:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 64.95
2024/12/31 02:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95]
2024/12/31 02:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.95
2024/12/31 02:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: 

2024/12/31 02:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==



Average Metric: 16.77 / 25 (67.1%): 100%|██████████| 25/25 [02:53<00:00,  6.93s/it]

2024/12/31 02:16:40 INFO dspy.evaluate.evaluate: Average Metric: 16.769554197128027 / 25 (67.1%)
2024/12/31 02:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.08 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5'].
2024/12/31 02:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34, 68.67, 67.08]
2024/12/31 02:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95]
2024/12/31 02:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.95


2024/12/31 02:16:40 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==



Average Metric: 15.94 / 25 (63.8%): 100%|██████████| 25/25 [03:16<00:00,  7.86s/it]

2024/12/31 02:19:56 INFO dspy.evaluate.evaluate: Average Metric: 15.938728411876287 / 25 (63.8%)
2024/12/31 02:19:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.75 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].
2024/12/31 02:19:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34, 68.67, 67.08, 63.75]
2024/12/31 02:19:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95]
2024/12/31 02:19:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.95


2024/12/31 02:19:56 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==



Average Metric: 16.37 / 25 (65.5%): 100%|██████████| 25/25 [02:53<00:00,  6.96s/it]

2024/12/31 02:22:50 INFO dspy.evaluate.evaluate: Average Metric: 16.374052701518433 / 25 (65.5%)
2024/12/31 02:22:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.5 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 16'].
2024/12/31 02:22:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34, 68.67, 67.08, 63.75, 65.5]
2024/12/31 02:22:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95]
2024/12/31 02:22:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.95


2024/12/31 02:22:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==



Average Metric: 15.70 / 25 (62.8%): 100%|██████████| 25/25 [02:21<00:00,  5.68s/it]

2024/12/31 02:25:12 INFO dspy.evaluate.evaluate: Average Metric: 15.697818740399384 / 25 (62.8%)
2024/12/31 02:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.79 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0'].
2024/12/31 02:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34, 68.67, 67.08, 63.75, 65.5, 62.79]
2024/12/31 02:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95]
2024/12/31 02:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.95


2024/12/31 02:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==



Average Metric: 16.32 / 25 (65.3%): 100%|██████████| 25/25 [02:43<00:00,  6.54s/it]

2024/12/31 02:27:56 INFO dspy.evaluate.evaluate: Average Metric: 16.317471753598515 / 25 (65.3%)
2024/12/31 02:27:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.27 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 6'].
2024/12/31 02:27:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [66.83, 69.62, 64.77, 66.39, 63.54, 63.32, 62.49, 69.34, 59.19, 60.45, 50.42, 65.65, 60.32, 65.22, 59.17, 60.71, 62.13, 68.81, 63.34, 68.67, 67.08, 63.75, 65.5, 62.79, 65.27]
2024/12/31 02:27:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95]
2024/12/31 02:27:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 64.95


2024/12/31 02:27:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====
2024/12/31 02:27:56 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.67) from minibatch trials...



Average Metric: 104.90 / 160 (65.6%): 100%|██████████| 160/160 [16:04<00:00,  6.03s/it]

2024/12/31 02:44:00 INFO dspy.evaluate.evaluate: Average Metric: 104.8971910398255 / 160 (65.6%)
2024/12/31 02:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: [92mNew best full eval score![0m Score: 65.56
2024/12/31 02:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [62.0, 62.94, 64.95, 65.56]
2024/12/31 02:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 65.56
2024/12/31 02:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: 

2024/12/31 02:44:00 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 65.56!





# Evaluate

In [12]:
# Reusable evaluator bound to the dev split and the scoring metric.
# Runs on 4 worker threads, with the progress bar and the results-table
# display both switched on.
evaluate = dspy.Evaluate(
    devset=dev,
    metric=metric,
    num_threads=4,
    display_progress=True,
    display_table=True,
)

In [13]:
# Run the evaluator configured above on `optimized_rag`. The call is left as
# the cell's final bare expression so that its return value is shown as the
# cell output.
evaluate(optimized_rag)

Average Metric: 197.57 / 300 (65.9%): 100%|██████████| 300/300 [36:40<00:00,  7.34s/it]

2024/12/31 03:22:43 INFO dspy.evaluate.evaluate: Average Metric: 197.57031592560836 / 300 (65.9%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,vtdecoderxpcservice taking cpu,Quit your applications one by one and monitor the process. Common ...,"[2557, 4732, 5014, 5156, 5195, 5631, 5856, 4026]",The VTDecoderXPCService is a process that might be consuming CPU r...,Try quitting your applications one by one to identify which one is...,✔️ [0.667]
1,git auto-complete for *branches* at the command line?,"You should try to update the git version to the latest, and then i...","[6715, 1766, 2902, 6154]",Based on the provided context from various users who have encounte...,"To fix the issue of not being able to autocomplete branch names, y...",✔️ [0.667]
2,what functionality do marks offer in the el capitan terminal?,"Marks in the Terminal, which began with OS X 10.11 - El Capitan, h...","[1234, 1253]",The question is asking about the functionality offered by marks in...,"Marks offer several functionalities in the El Capitan Terminal, in...",✔️ [0.857]
3,how to convert a heif/heic image to jpeg in el capitan?,Users advise that tifig is an excellent command line tool that wor...,"[7785, 680, 2705, 7201]","Based on the provided context from various sources, I will look fo...",One possible solution is to use the `sips` command-line tool that ...,✔️ [0.857]
4,why should i use a factory class instead of direct object construc...,Using factory classes along with interfaces promotes greater long-...,[4782],The question of whether to use a factory class instead of direct o...,You should use a factory class instead of direct object constructi...,✔️ [0.857]
...,...,...,...,...,...,...
295,make the menu bar never show while in full screen,Some users report that the menu bar on Mac OS X cannot be hidden o...,"[5370, 4551, 44, 6307]","Based on the provided context from various sources, I will look fo...","According to source [1], you can achieve this by unchecking the bo...",✔️ [0.400]
296,whats a good ssh tunneling client for os x?,"SSHTunnel, a free application for managing SSH tunnels, is still g...","[2186, 2809, 2814, 2817, 526, 5277, 6415, 1635, 1671, 6739, 2868, ...",Based on the provided context about SSH tunneling clients for OS X...,Core Tunnel baked by Codinn appears to be a popular and user-frien...,✔️ [0.267]
297,delete line in vi,"Use the ""dd"" command, which will remove the line, allowing you to ...","[3122, 3123, 3129, 5910, 4792, 2164]","To answer this question, I will look through the provided context ...",There are several ways to delete lines in vi: * In any POSIX-compl...,✔️ [0.667]
298,return only the portion of a line after a matching pattern,The primary tool for text manipulation and pattern replacement is ...,"[5023, 958, 6330]","Based on the provided context from various sources, I will look fo...","To return only the portion of a line after a matching pattern, you...",


65.86