In [1]:
import dspy
import ujson
import chromadb

from chromadb.utils import embedding_functions
from dspy.evaluate import SemanticF1
from dspy.retrieve.chromadb_rm import ChromadbRM
from dspy.utils import download

# Download the Data

In [21]:
download("https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl")
with open("./ragqa_arena_tech_corpus.jsonl", "r") as fp:
    corpus = [ujson.loads(line) for line in fp]

download(
    "https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl"
)
with open("ragqa_arena_tech_examples.jsonl", "r") as fp:
    qa = [ujson.loads(line) for line in fp]

print(corpus[2])
len(corpus)
print(qa[2])
len(qa)

{'doc_id': 131078, 'author': None, 'text': 'http://abtevrythng.blogspot.com/2010/06/adding-cer-certificates-on-your-android.html Shows how to actually achieve this. Worked fine for me. Try it out. In this article .cer to .pfx (which is what you need on Android) conversion is given. Simple method is given using which you can convert .cer to .pfx and use it to connect to the Wi-Fi network. Plus you dont need any Key to convert .cer to .pfx!!!'}
{'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you "Maybe". \n\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in "Maybe" being displayed even when "Find Contacts in Other Apps" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}


2064

# Upload to Chroma

In [3]:
client = chromadb.PersistentClient(path="./db")
collection = client.get_or_create_collection(name="test")
max_chars = 6000

In [4]:
documents = [data["text"][:max_chars] for data in corpus]
ids = [str(data["doc_id"]) for data in corpus]
collection.add(documents=documents, ids=ids)

# Define the Retriever

In [5]:
embed_fn = embedding_functions.DefaultEmbeddingFunction()
retriever_model = ChromadbRM(
    collection_name="test", persist_directory="./db", embedding_function=embed_fn, k=5
)

In [6]:
retriever_model("what are high memory and low memory on linux?")

[{'id': '172068',
  'score': 0.6003709435462952,
  'long_text': 'This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory',
  'metadatas': None},
 {'id': '172066',
  'score': 0.6697648763656616,
  'long_text': 'As far as I remember, High Memory is used for application space and Low Memory for the kernel. Advantage is that (user-space) applications cant

# Load LM

In [9]:
lm = dspy.LM(model="ollama_chat/llama3.1", temperature=0.0)
dspy.configure(lm=lm)

# Build the RAG Module

In [10]:
class RAG(dspy.Module):
    def __init__(self):
        self.respond = dspy.ChainOfThought("context, question -> response")

    def forward(self, question):
        ret_docs = retriever_model(question)
        context = [doc["long_text"] for doc in ret_docs]
        return self.respond(context=context, question=question)

# Test RAG

In [11]:
rag = RAG()
rag(question="what are high memory and low memory on linux?")

Prediction(
    reasoning='Based on the provided context, High Memory (HIGHMEM) refers to a range of kernel memory space where pages are temporarily mapped and unmapped. It is used for single-time data buffers and has no static mapping. Low Memory (LOWMEM), on the other hand, is a statically mapped part of the 1GB kernel space that is used by the kernel directly.',
    response='High Memory and Low Memory are two segments of memory in Linux:\n\n* High Memory (HIGHMEM): A range of kernel memory space where pages are temporarily mapped and unmapped. It is used for single-time data buffers.\n* Low Memory (LOWMEM): A statically mapped part of the 1GB kernel space that is used by the kernel directly.'
)

In [13]:
dspy.inspect_history()





[34m[2024-12-30T19:37:33.293722][0m

[31mSystem message:[0m

Your input fields are:
1. `context` (str)
2. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `response` (str)

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## context ## ]]
{context}

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## response ## ]]
{response}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Given the fields `context`, `question`, produce the fields `response`.


[31mUser message:[0m

[[ ## context ## ]]
[1] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There w

# Evaluate RAG

In [34]:
import random

qa_examples = [dspy.Example(**data).with_inputs("question") for data in qa]
random.Random(23).shuffle(qa_examples)

train, dev, test = qa_examples[:200], qa_examples[200:500], qa_examples[500:1000]
len(train), len(dev), len(test)

(200, 300, 500)

In [35]:
dev[2]

Example({'question': 'i got an email threatening to ddos me if i dont pay a ransom. what should i do?', 'response': "When faced with threats of DDoS extortion, the advised action is to ignore them, as these are often scams without real consequences.  \nIf you receive such an email, it's recommended to verify if the threat is genuine by looking up the associated Bitcoin address to ascertain its authenticity. \nNevertheless, to be safe, you should report the incident to relevant parties such as your service provider's abuse team, your hosting company, and possibly law enforcement agencies, sending them an original copy of the threatening e-mail.  \nSharing the details with your VPS vendor is also recommended since any potential attack could impact their network.  \nAlways ensure the legitimacy of any threat before considering payment, and maintain robust security measures.", 'gold_doc_ids': [5876, 5877, 5879, 5880, 5885]}) (input_keys={'question'})

In [36]:
metric = SemanticF1(decompositional=True)
evaluate = dspy.Evaluate(
    devset=dev, metric=metric, num_threads=4, display_progress=True, display_table=True
)

In [37]:
evaluate(RAG())

Average Metric: 0.00 / 0 (0%):   1%|          | 3/300 [22:53<37:46:30, 457.88s/it]s/it]
Average Metric: 0.00 / 0 (0%):   0%|          | 1/300 [20:00<99:43:12, 1200.64s/it]
Average Metric: 0.00 / 0 (0%):   1%|          | 2/300 [19:11<47:40:31, 575.95s/it]
Average Metric: 0.00 / 0 (0%):   1%|          | 3/300 [18:25<30:24:15, 368.54s/it]
Average Metric: 184.55 / 300 (61.5%): 100%|██████████| 300/300 [25:41<00:00,  5.14s/it]

2024/12/30 20:14:38 INFO dspy.evaluate.evaluate: Average Metric: 184.55205669811133 / 300 (61.5%)





Unnamed: 0,question,example_response,gold_doc_ids,reasoning,pred_response,SemanticF1
0,how do i transfer edited photos from my iphone to my computer?,"AirDrop or email the photo to yourself, or, if you've enabled iClo...",[6048],The question is about transferring edited photos from an iPhone to...,"To transfer edited photos from your iPhone to your computer, you c...",✔️ [1.000]
1,what is the difference between a lan and wan port?,The differences between LANs and WANs tend to be at the physical a...,"[1263, 1276]",The question asks about the difference between a LAN and WAN port....,The main difference between a LAN and WAN port lies in their purpo...,✔️ [0.889]
2,i got an email threatening to ddos me if i dont pay a ransom. what...,"When faced with threats of DDoS extortion, the advised action is t...","[5876, 5877, 5879, 5880, 5885]","Based on the provided context, it appears that the email threateni...","Ignore the email threatening to DDoS you. It's likely a scam, and ...",✔️ [0.774]
3,is constantly looking for code examples a sign of a bad developer?,Being a developer who searches for code examples does not imply in...,"[5081, 5084, 5087]",The question is asking whether constantly looking for code example...,It's not necessarily a sign of a bad developer to constantly look ...,✔️ [0.789]
4,what is the difference between 16gb and 32gb?,Other than cost (for the initial purchase and potentially to repai...,"[7242, 7243, 7244]",The context discusses the differences in storage capacity between ...,The main difference between 16GB and 32GB is the amount of storage...,✔️ [0.667]
...,...,...,...,...,...,...
295,how to stop os x from writing spotlight and trash files to memory ...,"To do so, I utilize two tools: Clean Eject, which is a free applic...",[3234],The question is asking how to prevent OS X from writing Spotlight ...,To stop OS X from writing Spotlight and Trash files to memory card...,✔️ [0.857]
296,"im a subversion geek, why should i consider or not consider mercur...",Merging large commits in subversion can be uncomfortable. In contr...,"[2498, 2500, 2502, 2504, 2785]",The question is asking why a Subversion geek should consider or no...,"You might consider Mercurial or Git if your team is distributed, w...",✔️ [0.480]
297,how do you explain the necessity of nuke it from orbit to manageme...,Analogies comparing computer viruses to biological viruses can be ...,"[5060, 5065, 5067, 5068, 5069, 5070, 5071]","The concept of ""nuke it from orbit"" refers to a drastic measure of...","When explaining the necessity of ""nuke it from orbit"" to managemen...",✔️ [0.333]
298,how can i instruct yum to install a specific version of package x?,Download the rpm file and then command yum to execute a local inst...,[1187],"To instruct yum to install a specific version of package X, we can...",You can instruct yum to install a specific version of package X by...,✔️ [0.857]


61.52