## Installing dependencies

In [1]:
!pip install -q -U pip
!pip install -q dspy-ai

[0m

In [2]:
import sys
import os
import dspy
import pandas as pd

## Setting up the LLM with Azure OpenAI
Using a GPT-3.5-Turbo deployment and sourcing Wikipedia abstracts from a public endpoint.

In [3]:
# Azure OpenAI credentials are read from the environment so that no secret is
# ever saved inside the notebook. Unset variables fall back to "" (the previous
# placeholder values), so the call signature seen by DSPy is unchanged.
turbo = dspy.OpenAI(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY", ""),
    api_provider="azure",
    deployment_id="gpt35",
    api_version="2023-09-15-preview",
    api_base=os.environ.get("AZURE_OPENAI_ENDPOINT", ""),
    model_type='chat',
)

# Retrieval model served from a public ColBERTv2 endpoint (wiki17 abstracts index).
colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')

# Register the language model and the retrieval model in a single call; a second
# `configure(lm=turbo)` would only repeat a setting that is already in place.
dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts)

## Using data from a Kaggle competition to evaluate questions and responses
https://www.kaggle.com/datasets/rtatman/questionanswer-dataset/code

We load all the files, drop rows with a null question or answer, and rename the columns to lower case because DSPy expects lower-case field names.

In [4]:
# Load the three tab-separated question/answer files. S08 and S09 are combined
# into the training data further down; S10 is kept apart as the test set.
# Only the S10 file is read with an explicit ISO-8859-1 encoding.
df1 = pd.read_csv("data/S08_question_answer_pairs.txt", sep="\t")
df2 = pd.read_csv("data/S09_question_answer_pairs.txt", sep="\t")
test = pd.read_csv(
    "data/S10_question_answer_pairs.txt",
    sep="\t",
    encoding="ISO-8859-1",
)

In [5]:
# Stack the S08 and S09 frames into a single training frame; `ignore_index=True`
# discards the per-file row labels and renumbers the rows from 0.
train = pd.concat([df1, df2], ignore_index=True)

In [6]:
# Keep only the rows that have both a question and an answer, in each split.
train = train.dropna(subset=['Question', 'Answer'])
test = test.dropna(subset=['Question', 'Answer'])

In [7]:
# Rename the two columns used downstream to lower case in both splits.
train, test = (
    frame.rename(columns={"Question": "question", "Answer": "answer"})
    for frame in (train, test)
)

In [8]:
# Preview the first ten rows of the filtered, renamed test frame.
test[:10]

Unnamed: 0,ArticleTitle,question,answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy,S10_set4_a10
1,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,No,easy,hard,S10_set4_a10
2,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy,S10_set4_a10
3,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Yes,easy,easy,S10_set4_a10
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,S10_set4_a10
5,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Yes,easy,easy,S10_set4_a10
6,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium,S10_set4_a10
7,Alessandro_Volta,Who did Alessandro Volta marry?,Teresa Peregrini,medium,medium,S10_set4_a10
8,Alessandro_Volta,What did Alessandro Volta invent in 1800?,"In 1800, Alessandro Volta invented the voltaic...",medium,easy,S10_set4_a10
9,Alessandro_Volta,What did Alessandro Volta invent in 1800?,voltaic pile,medium,medium,S10_set4_a10


### Converting to a Dataset
Including training and test sets

In [9]:
from datasets import load_dataset
from dspy.datasets.dataset import Dataset

In [10]:
class CustomQA(Dataset):
    """Expose two question/answer DataFrames as the train and test splits of a DSPy ``Dataset``.

    Args:
        *args: Forwarded unchanged to ``Dataset.__init__``.
        df_train: DataFrame with ``question`` and ``answer`` columns; stored as the train split.
        df_test: DataFrame with ``question`` and ``answer`` columns; stored as the test split.
        **kwargs: Forwarded unchanged to ``Dataset.__init__``.

    Raises:
        KeyError: if either frame lacks a ``question`` or ``answer`` column.
    """

    def __init__(self, *args, df_train,df_test, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        # Both splits go through the same conversion, so the test split can no
        # longer be built in a variable that is named after the train split.
        self._train = self._qa_records(df_train)
        self._test = self._qa_records(df_test)

    @staticmethod
    def _qa_records(df):
        """Return one ``{'question': ..., 'answer': ...}`` dict per row of ``df``, in row order."""
        # Select just the two needed columns and convert in one vectorised call
        # instead of materialising a Series for every row with ``iterrows``.
        return df[['question', 'answer']].to_dict('records')

In [11]:
# Despite its name, `train_ds` carries both splits: it is built from the `train`
# frame and the `test` frame, and is read later as `train_ds.train` / `train_ds.test`.
train_ds = CustomQA(df_train=train,df_test=test)

## Defining a Signature

In [12]:
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""
    # NOTE: the docstring above is not only documentation -- it appears verbatim as
    # the first line of the prompt in the `inspect_history` output, so edit with care.

    # Input field: the question text.
    question = dspy.InputField()
    # Output field: the answer; `desc` is shown in the "Follow the following format" section.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [13]:
# Define the predictor: a plain `dspy.Predict` module driven by the BasicQA signature.
generate_answer = dspy.Predict(BasicQA)

# Use the first example of the training split as the demo input.
example = train_ds.train[0]

# Call the predictor on a particular input.
pred = generate_answer(question=example.question)

# Print the input and the prediction.
print(f"Question: {example.question}")
print(f"Predicted Answer: {pred.answer}")

Question: Who was the general in charge at the Battle of Antietam?
Predicted Answer: George B. McClellan


In [14]:
# Basic prompt and reply: show the most recent prompt/completion pair sent through `turbo`.
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Follow the following format.

Question: ${question}
Answer: often between 1 and 5 words

---

Question: Who was the general in charge at the Battle of Antietam?
Answer:[32m George B. McClellan[0m





In [15]:
# Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged.
generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)

# Call the predictor on the same input.
pred = generate_answer_with_chain_of_thought(question=example.question)

# Print the input, the chain of thought, and the prediction.
print(f"Question: {example.question}")
# The rationale begins with the echoed lead-in sentence ("produce the answer."),
# so everything up to the first period is dropped. Indexing with [-1] instead of
# [1] keeps the whole rationale when it contains no period at all, where [1]
# would raise IndexError.
print(f"Thought: {pred.rationale.split('.', 1)[-1].strip()}")
print(f"Predicted Answer: {pred.answer}")

Question: Who was the general in charge at the Battle of Antietam?
Thought: We know that the Battle of Antietam was a major battle during the American Civil War, so we can assume that a high-ranking military officer was in charge.
Predicted Answer: General George McClellan.


In [16]:
# Conversation now that we added chain of thought: the most recent prompt gains a
# "Reasoning:" field ahead of the answer.
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: often between 1 and 5 words

---

Question: Who was the general in charge at the Battle of Antietam?
Reasoning: Let's think step by step in order to[32m produce the answer. We know that the Battle of Antietam was a major battle during the American Civil War, so we can assume that a high-ranking military officer was in charge.
Answer: General George McClellan.[0m





# Retrieval Augmentation

In [17]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""
    # NOTE: the docstring above is emitted as the instruction line of the prompt
    # (see the `inspect_history` output), so it is part of the program's behaviour.

    # Input field: retrieved passages handed to the model alongside the question.
    context = dspy.InputField(desc="may contain relevant facts")
    # Input field: the question text.
    question = dspy.InputField()
    # Output field: the answer; `desc` is shown in the prompt's format section.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [18]:
class RAG(dspy.Module):
    """Retrieve-then-answer program: fetch passages for a question, then answer from them.

    Args:
        num_passages: value passed as ``k`` to ``dspy.Retrieve`` (default 3).
    """

    def __init__(self, num_passages=3):
        super().__init__()

        # Sub-modules: a retriever and a chain-of-thought answer generator.
        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Answer ``question`` and return a ``dspy.Prediction`` with ``context`` and ``answer``."""
        passages = self.retrieve(question).passages
        result = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=result.answer)

In [19]:
# Mark `question` as the input field on every example of both splits.
trainset = [qa_example.with_inputs('question') for qa_example in train_ds.train]
testset = [qa_example.with_inputs('question') for qa_example in train_ds.test]

#we'll use a small subset of data to bootstrap the LLM
len(trainset), len(testset)

(2200, 1222)

In [20]:
# Peek at ten training examples (positions 100 through 109).
trainset[100:110]

[Example({'question': 'Is it possible that there are more than 350,000 species of beetles?', 'answer': 'yes'}) (input_keys={'question'}),
 Example({'question': 'Have cymbals been used historically to suggest bacchanal?', 'answer': 'Yes'}) (input_keys={'question'}),
 Example({'question': "Does Indonesia have the world's hightest level of biodiversity?", 'answer': 'No'}) (input_keys={'question'}),
 Example({'question': 'Are studies insufficient evidence for global protection?', 'answer': 'It is arguable.'}) (input_keys={'question'}),
 Example({'question': 'Is the cello a stringed instrument?', 'answer': 'yes'}) (input_keys={'question'}),
 Example({'question': 'From what did Pascal suffer throughout his life?', 'answer': 'poor health'}) (input_keys={'question'}),
 Example({'question': 'When is the first record of S08_settlement in Singapore?', 'answer': 'The first records of S08_settlement in Singapore are from the second century AD.'}) (input_keys={'question'}),
 Example({'question': 'Wh

## Few Shot Prompts
So far the prompts have not included examples of successful question and answer pairs.  Teleprompters optimize the prompt by selecting pairs for inclusion to generate high quality results.

In [21]:
from dspy.teleprompt import BootstrapFewShot

# Metric used while bootstrapping the RAG program.
def validate_context_and_answer(example, pred, trace=None):
    """Combine the exact-match and passage-match checks for one prediction.

    Both DSPy checks are always evaluated; the result is their ``and``.
    ``trace`` is accepted for the metric calling convention but is not used.
    """
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    return exact_match and passage_match

# Set up a basic teleprompter, which will compile our RAG program.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile! Only the 50 examples in trainset[100:150] are handed to the teleprompter.
compiled_rag = teleprompter.compile(RAG(), trainset=trainset[100:150])

100%|██████████| 50/50 [00:00<00:00, 396.04it/s]

Bootstrapped 3 full traces after 50 examples in round 0.





In [22]:
# Ask any question you like to this simple RAG program; here, test example 5.
print(testset[5])

# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(testset[5].question)

# Print the question and the answer (the retrieved passages stay in `pred.context`
# and are not printed here).
print(f"Question: {testset[5].question}")
print(f"Predicted Answer: {pred.answer}")

Example({'question': 'What material is a chi flute fashioned from?', 'answer': 'Lacquered bamboo'}) (input_keys={'question'})
Question: What material is a chi flute fashioned from?
Predicted Answer: It is usually made of wood.


In [23]:
# Show the most recent prompt sent through `turbo`, now containing few-shot demos.
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Question: Why are otters vulnerable to prey depletion?
Answer: prey-dependency

Question: When was the pan flute spread to other parts of Europe?
Answer: After the 7th century BC

Question: From what type of Cymbals can a expert player obtain an enormous dynamic range?
Answer: Crash cymbals

Question: Did John Adams support the Stamp Act of 1765?
Answer: No

Question: What did Cleveland die from?
Answer: A heart attack

Question: When is the first record of S08_settlement in Singapore?
Answer: The first records of S08_settlement in Singapore are from the second century AD.

Question: Does the giant otter inhabit South Africa?
Answer: No

Question: What do we refer musicians who play flute?
Answer: A flute player, a flautist or a flutist.

Question: What resembles that of the similarly-sized cougar in the Americas?
Answer: The leopard's ecological role

Question: Was Millard Fillmore the thirteenth President of the United States?
An

In [24]:
from dspy.evaluate.evaluate import Evaluate

# Set up the `evaluate_on_hotpotqa` function. We'll use this many times below.
# NOTE: despite the name, the dev set is the first 50 examples of this notebook's
# own test split (`testset[:50]`), not HotPotQA.
evaluate_on_hotpotqa = Evaluate(devset=testset[:50], num_threads=1, display_progress=True, display_table=5)

# Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
metric = dspy.evaluate.answer_exact_match
evaluate_on_hotpotqa(compiled_rag, metric=metric)

Average Metric: 2.0 / 36  (5.6):  72%|███████▏  | 36/50 [00:25<00:08,  1.58it/s]

Error for example in dev set: 		 The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766


Average Metric: 3.0 / 50  (6.0): 100%|██████████| 50/50 [00:34<00:00,  1.46it/s]


Average Metric: 3.0 / 50  (6.0%)


Unnamed: 0,question,example_answer,context,pred_answer,answer_exact_match,answer
0,Where is the James Watt Memorial College?,In Greenock.,"['James Watt College | The James Watt College was a further education college in Greenock, Scotland. It is now part of West College Scotland. There...",There is no James Watt Memorial College.,❌ [False],
1,Are termites actually more closely related to cockroaches as well as mantids?,"Yes, termites are actually more closely related to cockroaches as well as mantids.","['Blattodea | Blattodea is an order of insects that contains the cockroaches and the termites. Formerly, the termites were considered a separate order, Isoptera, but...","Yes, termites are more closely related to cockroaches as well as mantids.",❌ [False],
2,Was Volta made a count in 1810?,No.,"[""Alessandro Volta | Alessandro Giuseppe Antonio Anastasio Volta (] ; 18 February 1745 – 5 March 1827) was an Italian physicist, chemist, and a pioneer...",Unknown/No information provided.,❌ [False],
3,Is Berlin the headquarters of Springer?,"Yes, Berlin is the headquarters of Springer.","['Idealo | The company idealo internet GmbH is a price comparison service launched in Germany in 2000, and since bought by the Axel Springer AG...","Yes, Berlin is one of the major offices of Springer Science+Business Media, which is owned by Axel Springer AG.",❌ [False],
4,Why does Jakarta suffer frequent flooding?,Because it is located approximately eight meters above the sea level,"['2007 Jakarta flood | The 2007 Jakarta flood was a major flood in Jakarta, the capital of Indonesia and affected several other areas around the...","Heavy rain, deforestation, and clogged waterways.",❌ [False],


6.0

Results are poor, but much of this is because the metric requires an exact match with the reference answer. Many of the predictions are broadly accurate, and an evaluation LLM might help sift through them and give a more accurate picture.

# More Complex Multi-Hop
Some questions require more than one retrieval for context: information must first be looked up in order to shape follow-on queries. This multi-hop Q&A requires a more complex Signature.

In [25]:
class GenerateSearchQuery(dspy.Signature):
    """Write a simple search query that will help answer a complex question."""
    # NOTE: as with the other signatures in this notebook, the docstring above is
    # prompt text rather than plain documentation -- change it deliberately.

    # Input field: passages gathered so far (an empty list on the first hop).
    context = dspy.InputField(desc="may contain relevant facts")
    # Input field: the original question.
    question = dspy.InputField()
    # Output field: the search query to send to the retriever.
    query = dspy.OutputField()

In [26]:
from dsp.utils import deduplicate

class SimplifiedBaleen(dspy.Module):
    """Multi-hop QA program: repeatedly generate a query and retrieve, then answer.

    Args:
        passages_per_hop: value passed as ``k`` to ``dspy.Retrieve`` (default 3).
        max_hops: number of query-then-retrieve rounds (default 2).
    """

    def __init__(self, passages_per_hop=3, max_hops=2):
        super().__init__()

        # One separate chain-of-thought query generator for each hop.
        self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
        self.retrieve = dspy.Retrieve(k=passages_per_hop)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
        self.max_hops = max_hops

    def forward(self, question):
        """Answer ``question`` and return a ``dspy.Prediction`` with ``context`` and ``answer``."""
        gathered = []

        for hop_index in range(self.max_hops):
            query_step = self.generate_query[hop_index]
            # Each hop sees everything gathered by the previous hops.
            search_query = query_step(context=gathered, question=question).query
            hits = self.retrieve(search_query).passages
            gathered = deduplicate(gathered + hits)

        final = self.generate_answer(context=gathered, question=question)
        return dspy.Prediction(context=gathered, answer=final.answer)

In [27]:
# Ask any question you like to the multi-hop program (SimplifiedBaleen).
my_question = "How many storeys are in the castle that David Gregory inherited?"

# Get the prediction. This contains `pred.context` and `pred.answer`.
uncompiled_baleen = SimplifiedBaleen()  # uncompiled (i.e., zero-shot) program
pred = uncompiled_baleen(my_question)

# Print the question, the answer, and the contexts (each cut to its first 200 characters).
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")

Question: How many storeys are in the castle that David Gregory inherited?
Predicted Answer: Five storeys.
Retrieved Contexts (truncated): ['David Gregory (physician) | David Gregory (20 December 1625 – 1720) was a Scottish physician and inventor. His surname is sometimes spelt as Gregorie, the original Scottish spelling. He inherited Kinn...', 'Gregory House | Gregory House, M.D., commonly referred to by his surname House, is the title character of the American medical drama series "House". Created by David Shore and portrayed by English act...', 'David S. Castle | David S. Castle (13 February 1884 – 28 October 1956) was an architect in Texas....', 'Kinnairdy Castle | Kinnairdy Castle is a tower house, having five storeys and a garret, two miles south of Aberchirder, Aberdeenshire, Scotland. The alternative name is Old Kinnairdy....', 'Kinnaird Castle, Brechin | Kinnaird Castle is a 15th-century castle in Angus, Scotland. The castle has been home to the Carnegie family, the Earl of So

In [28]:
# Show the most recent prompt sent through `turbo` (the final answer-generation step).
turbo.inspect_history(n=1)





Answer questions with short factoid answers.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: often between 1 and 5 words

---

Context:
[1] «David Gregory (physician) | David Gregory (20 December 1625 – 1720) was a Scottish physician and inventor. His surname is sometimes spelt as Gregorie, the original Scottish spelling. He inherited Kinnairdy Castle in 1664. Three of his twenty-nine children became mathematics professors. He is credited with inventing a military cannon that Isaac Newton described as "being destructive to the human species". Copies and details of the model no longer exist. Gregory's use of a barometer to predict farming-related weather conditions led him to be accused of witchcraft by Presbyterian ministers from Aberdeen, although he was never convicted.»
[2] «Gregory House | Gregory House, M.D., commonly referred to by his surname

In [29]:
def validate_context_and_answer_and_hops(example, pred, trace=None):
    """Metric for the multi-hop program: correct answer plus well-behaved search queries.

    Args:
        example: the gold example; its ``question`` attribute is read here.
        pred: the program's prediction, passed on to the DSPy answer checks.
        trace: optional iterable of recorded steps whose last element is the
            step's outputs; outputs that contain ``'query'`` count as hops.
            ``None`` (the default) is treated as "no steps recorded".

    Returns:
        ``False`` if the exact-match or passage-match check fails, if the
        question or any query is longer than 100 characters, or if a query
        (from the second one onward) matches the hops before it according to
        ``answer_exact_match_str`` with ``frac=0.8``; ``True`` otherwise.
    """
    if not dspy.evaluate.answer_exact_match(example, pred): return False
    if not dspy.evaluate.answer_passage_match(example, pred): return False

    # The default `trace=None` used to be iterated directly and raised TypeError;
    # fall back to an empty list so the metric also works without a trace.
    steps = trace or []
    hops = [example.question] + [outputs.query for *_, outputs in steps if 'query' in outputs]

    # Reject overly long questions/queries.
    if max(len(h) for h in hops) > 100: return False
    # Reject a query that largely repeats what came before it.
    if any(dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))): return False

    return True

In [30]:
# Bootstrap few-shot demos for the multi-hop program with the hop-aware metric.
# The teacher is a SimplifiedBaleen that retrieves 2 passages per hop, and only the
# first 60 training examples are handed to the teleprompter.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops)
compiled_baleen = teleprompter.compile(SimplifiedBaleen(), teacher=SimplifiedBaleen(passages_per_hop=2), trainset=trainset[:60])

100%|██████████| 60/60 [01:50<00:00,  1.84s/it]

Bootstrapped 3 full traces after 60 examples in round 0.





In [31]:
# Score the compiled multi-hop program with the same evaluator (`testset[:50]`)
# and the same `metric` (exact match) that were used for `compiled_rag`.
compiled_baleen_retrieval_score = evaluate_on_hotpotqa(compiled_baleen, metric=metric)

Average Metric: 5.0 / 36  (13.9):  72%|███████▏  | 36/50 [01:18<00:28,  2.02s/it]

Error for example in dev set: 		 The response was filtered due to the prompt triggering Azure OpenAI’s content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766


Average Metric: 7.0 / 50  (14.0): 100%|██████████| 50/50 [01:45<00:00,  2.12s/it]

Average Metric: 7.0 / 50  (14.0%)





Unnamed: 0,question,example_answer,context,pred_answer,answer_exact_match,answer
0,Where is the James Watt Memorial College?,In Greenock.,"['James Watt College | The James Watt College was a further education college in Greenock, Scotland. It is now part of West College Scotland. There...",N/A (there is no James Watt Memorial College mentioned in the context),❌ [False],
1,Are termites actually more closely related to cockroaches as well as mantids?,"Yes, termites are actually more closely related to cockroaches as well as mantids.","['Blattodea | Blattodea is an order of insects that contains the cockroaches and the termites. Formerly, the termites were considered a separate order, Isoptera, but...",Yes.,❌ [False],
2,Was Volta made a count in 1810?,No.,['2010 Volta ao Algarve | The 2010 Volta ao Algarve was the 36th edition of the Volta ao Algarve cycling stage race. It was held...,"Unknown, not mentioned in the context.",❌ [False],
3,Is Berlin the headquarters of Springer?,"Yes, Berlin is the headquarters of Springer.","['Springer Building | The Springer Building at 121 Tijeras Ave., NE, in Albuquerque, New Mexico, was built during 1929-30 It was listed on the National...","Yes, Springer Science+Business Media has major offices in Berlin.",❌ [False],
4,Why does Jakarta suffer frequent flooding?,Because it is located approximately eight meters above the sea level,"['2007 Jakarta flood | The 2007 Jakarta flood was a major flood in Jakarta, the capital of Indonesia and affected several other areas around the...","Heavy rain, deforestation, and clogged waterways.",❌ [False],
