In [1]:
import dspy
from dspy.predict import Retry
from dspy.primitives.assertions import assert_transform_module, backtrack_handler
import importlib
import json

import os

# Re-execute the dspy module in the running kernel (presumably so local changes to
# the installed package are picked up without a kernel restart — confirm still needed).
importlib.reload(dspy)

# Configure the AzureOpenAI language model.
# The API key is read from the environment instead of being embedded in the notebook:
# notebooks are shared and committed together with their source, so a literal key leaks.
# Any key that was previously committed in this cell should be rotated.
# A missing AZURE_OPENAI_API_KEY fails immediately with a KeyError rather than at the
# first model call.
azure_turbo = dspy.AzureOpenAI(
    api_base=os.environ.get('AZURE_OPENAI_API_BASE', 'https://bionlp-gpt4-wang.openai.azure.com/'),
    api_key=os.environ['AZURE_OPENAI_API_KEY'],
    api_version='2024-03-01-preview',
    model='gpt-4',
    max_tokens=400,
)
dspy.settings.configure(lm=azure_turbo)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
class PubMedQA(dspy.Signature):
    # Signature for one PubMedQA item: inputs `question`, `context`, `options`; output `answer`.
    # NOTE(review): documented with `#` comments on purpose — DSPy appears to treat a
    # Signature's class docstring as the prompt instructions, so adding a docstring here
    # would presumably change the prompt sent to the model. Confirm before adding one.

    # The question text.
    question = dspy.InputField(desc="Question")
    # The supporting facts; the description also asks the model to cite the portions it used.
    context = dspy.InputField(desc="A list of facts which provide the necessary context to answer the question. You should directly mention the portions of the facts that you used to help answer the question.")
    # The candidate answers presented to the model.
    options = dspy.InputField(desc="Options, you should select one of them based on whichever is correct.")
    # The model's answer; the description restricts it to upper-case YES, MAYBE, or NO.
    answer = dspy.OutputField(desc="Your final answer should contain only one of YES, MAYBE, or NO (all in upper case) based on whichever option is correct")


class PubMedQA_Response(dspy.Module):
    """Use the provided context to answer a question by providing a step-by-step explanation and also respond with one of YES, MAYBE, or NO (all in upper case) based on whichever is correct.

    The module samples ``num_chains`` chain-of-thought completions for the question
    and then passes them to a multi-chain comparison step that produces the final answer.

    Args:
        num_chains: Number of chain-of-thought completions to request and to compare.
            Defaults to 5, the value previously hard-coded throughout this class.

    NOTE(review): this docstring sits on the Module, not on the ``PubMedQA`` signature;
    if it was meant to act as prompt instructions it presumably has no effect here — confirm.
    """

    def __init__(self, num_chains=5):
        super().__init__()
        if num_chains < 1:
            raise ValueError("num_chains must be at least 1")
        self.num_chains = num_chains
        self.generate_answer = dspy.ChainOfThought(PubMedQA)
        # The comparison module is built for exactly `num_chains` attempts.
        self.compare_answers = dspy.MultiChainComparison(PubMedQA, M=num_chains)

    def forward(self, question, context, options):
        """Answer one question.

        Args:
            question: The question text.
            context: The supporting facts for the question.
            options: The answer options shown to the model.

        Returns:
            A ``dspy.Prediction`` with ``answer`` (from the comparison step) and
            ``rationale`` (from the generation step).

        Raises:
            ValueError: If the generation step returns no completions at all.
        """
        # Local import: in this notebook `re` is only imported by a later cell.
        import re

        response = self.generate_answer(context=context, question=question, options=options, config=dict(n=self.num_chains, temperature=1.0))

        answers = response.completions.answer
        rationales = response.completions.rationale

        # Without this guard the padding loop below could never make progress
        # (there would be nothing to copy) and would spin forever.
        available = len(answers)
        if available == 0:
            raise ValueError("generate_answer returned no completions; nothing to compare")

        # Fewer completions than requested may come back; pad by cycling through the
        # ones we have so the comparison step receives exactly `num_chains` attempts.
        next_source = 0
        while len(answers) < self.num_chains:
            answers.append(answers[next_source % available])
            rationales.append(rationales[next_source % available])
            next_source += 1

        best_response = self.compare_answers(context=context, question=question, options=options, completions=response.completions)

        # Whole-word match, consistent with the evaluation metric's regex: a plain
        # substring test would accept e.g. "NOT" or "UNKNOWN" as containing "NO".
        valid_response = re.search(r"\b(YES|NO|MAYBE)\b", best_response.answer) is not None
        dspy.Suggest(valid_response, "Your final answer must be one of YES, MAYBE, or NO (all upper case) as part of your final answer")
        # NOTE(review): the answer comes from the comparison step but the rationale comes
        # from the generation step (`response`), so the two may disagree — confirm intended.
        return dspy.Prediction(answer=best_response.answer, rationale=response.rationale)

In [12]:
import re

def eval_metric_pubmedqa(true, prediction, trace=None):
    """Return True when the predicted label matches the gold label.

    The predicted label is the LAST whole-word occurrence of YES, NO or MAYBE
    (upper case) in ``prediction.answer``; if none occurs the parsed label is the
    empty string, which is compared against the gold label like any other value.

    Args:
        true: Gold example; its ``answer`` attribute is the expected label.
        prediction: Model output; its ``answer`` attribute is the text to parse.
        trace: Accepted for compatibility with the metric call signature; unused.

    Returns:
        bool: True on an exact label match, False otherwise — including when the
        prediction is malformed (e.g. missing or non-string ``answer``).
    """
    try:
        matches = re.findall(r"\b(YES|NO|MAYBE)\b", prediction.answer)
        parsed_answer = matches[-1] if matches else ""
        return parsed_answer == true.answer
    except Exception:
        # Best-effort scoring: a malformed prediction counts as wrong. `Exception`
        # (not a bare `except`) so KeyboardInterrupt/SystemExit still propagate and a
        # long evaluation run can actually be interrupted.
        return False

In [13]:
import random

# Add the training set here:
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
with open("/Users/khandekarns/Documents/pubmedqa/data/pqaa_train_set.json") as file:
    train_set = json.load(file)

options = "1. YES\n2. MAYBE\n3. NO"

dspy_train_set_pubmedqa = []

for key, val in train_set.items():
    # Number the context passages from 1, one per line (each line newline-terminated).
    combined_context = "".join(
        f"{position}. {passage}\n" for position, passage in enumerate(val['CONTEXTS'], start=1)
    )

    example = dspy.Example({
        "question": val['QUESTION'],
        "options": options,
        "context": combined_context,
        "answer": val['final_decision'].upper(),
    }).with_inputs("question", "context", "options")

    dspy_train_set_pubmedqa.append(example)

# Keep at most 5000 training examples. random.sample already returns a random subset
# in random order, so no separate shuffle is needed; capping the size avoids the
# ValueError random.sample raises when asked for more items than exist.
# NOTE(review): no random seed is set, so the sampled subset differs between runs.
train_sample_size = min(5000, len(dspy_train_set_pubmedqa))
dspy_train_set_pubmedqa = random.sample(dspy_train_set_pubmedqa, train_sample_size)



In [14]:
import json 


# Test questions and their ground-truth labels live in two separate JSON files
# that are joined on the item key below.
with open("/Users/khandekarns/Documents/pubmedqa/data/test_set.json") as file:
    test_set = json.load(file)

with open("/Users/khandekarns/Documents/pubmedqa/data/test_ground_truth.json") as file:
    gt = json.load(file)

options = "1. YES\n2. MAYBE\n3. NO"

dspy_test_set_pubmedqa = []

for key, val in test_set.items():
    # Number the context passages from 1, one per newline-terminated line.
    numbered_passages = [
        f"{position}. {passage}\n"
        for position, passage in enumerate(val['CONTEXTS'], start=1)
    ]
    combined_context = "".join(numbered_passages)

    fields = {
        "question": val['QUESTION'],
        "options": options,
        "context": combined_context,
        "answer": gt[key].upper(),  # gold label looked up by item key
    }
    dspy_test_set_pubmedqa.append(
        dspy.Example(fields).with_inputs("question", "context", "options")
    )

In [15]:
import json
import random


with open("/Users/khandekarns/Documents/pubmedqa/data/ori_pqal.json") as file:
    labeled_set = json.load(file)

# Validation items are the labeled items whose key does not appear in the test
# ground truth (`gt`, loaded by the test-set cell).
val_set = {key: value for key, value in labeled_set.items() if key not in gt}

options = "1. YES\n2. MAYBE\n3. NO"

dspy_val_set_pubmedqa = []

for key, val in val_set.items():
    # Number the context passages from 1, one per newline-terminated line.
    numbered_passages = [
        f"{position}. {passage}\n"
        for position, passage in enumerate(val['CONTEXTS'], start=1)
    ]
    combined_context = "".join(numbered_passages)

    fields = {
        "question": val['QUESTION'],
        "options": options,
        "context": combined_context,
        "answer": val['final_decision'].upper(),
    }
    dspy_val_set_pubmedqa.append(
        dspy.Example(fields).with_inputs("question", "context", "options")
    )

# Randomize the order of the validation examples in place.
random.shuffle(dspy_val_set_pubmedqa)

In [16]:
from dspy.evaluate import Evaluate

# Reusable evaluator: scores a program on the test set with the PubMedQA metric.
evaluate_test = Evaluate(
    devset=dspy_test_set_pubmedqa,
    metric=eval_metric_pubmedqa,
    num_threads=6,
    display_progress=True,
    display_table=True,
)

In [17]:
# Baseline (uncompiled) program: map every named predictor through Retry, then
# transform the module with the backtrack handler so dspy.Suggest checks take effect.
retry_wrapped_program = PubMedQA_Response().map_named_predictors(Retry)
answerqa_with_assertions = assert_transform_module(retry_wrapped_program, backtrack_handler)

# Score the baseline on the test set; the evaluator's return value is the cell output.
evaluate_test(answerqa_with_assertions)

  0%|          | 0/500 [00:00<?, ?it/s]

Average Metric: 369 / 500  (73.8): 100%|██████████| 500/500 [00:05<00:00, 85.38it/s] 


Unnamed: 0,question,options,context,example_answer,pred_answer,rationale,eval_metric_pubmedqa
0,Is anorectal endosonography valuable in dyschesia?,1. YES 2. MAYBE 3. NO,1. Dyschesia can be provoked by inappropriate defecation movements. The aim of this prospective study was to demonstrate dysfunction of the anal sphincter and/or the...,YES,YES,produce the answer. We learned from the context that anorectal endosonography was used in a study on patients with dyschesia and healthy control subjects. The...,✔️ [True]
1,Is there a connection between sublingual varices and hypertension?,1. YES 2. MAYBE 3. NO,"1. Sublingual varices have earlier been related to ageing, smoking and cardiovascular disease. The aim of this study was to investigate whether sublingual varices are...",YES,YES,produce the answer. The study mentioned in the context observed sublingual varices and blood pressure among dental patients. They found an association between sublingual varices...,✔️ [True]
2,Is the affinity column-mediated immunoassay method suitable as an alternative to the microparticle enzyme immunoassay method as a blood tacrolimus assay?,1. YES 2. MAYBE 3. NO,"1. Tacrolimus is a potent immunosuppressive drug used in organ transplantation. Because of its substantial toxic effects, narrow therapeutic index, and interindividual pharmacokinetic variability, therapeutic...",YES,YES,produce the answer. We have been asked to determine if the affinity column-mediated immunoassay (ACMIA) method is suitable as an alternative to the microparticle enzyme...,✔️ [True]
3,Does a physician's specialty influence the recording of medication history in patients' case notes?,1. YES 2. MAYBE 3. NO,1. To determine the impact of a physician's specialty on the frequency and depth of medication history documented in patient medical records. 2. A cross-sectional...,YES,YES,produce the answer. We learned from the context that the study cross-sectionally assessed the frequency and depth of medication history documentation by physicians across various...,✔️ [True]
4,Locoregional opening of the rodent blood-brain barrier for paclitaxel using Nd:YAG laser-induced thermo therapy: a new concept of adjuvant glioma therapy?,1. YES 2. MAYBE 3. NO,1. Nd:YAG laser-induced thermo therapy (LITT) of rat brains is associated with blood-brain barrier (BBB) permeability changes. We address the question of whether LITT-induced locoregional...,YES,YES,"produce the answer. The context provided a detailed experimental procedure where Nd:YAG laser-induced thermo therapy (LITT) was used on rat brains, causing changes in the...",✔️ [True]
5,Spinal subdural hematoma: a sequela of a ruptured intracranial aneurysm?,1. YES 2. MAYBE 3. NO,1. A case of spinal subdural hematoma (SSDH) following subarachnoid hemorrhage (SAH) because of a ruptured internal carotid aneurysm is described. Such a case has...,YES,YES,produce the answer. We first note the case report in our context mentioning that a patient had a spinal subdural hematoma (SSDH) following a subarachnoid...,✔️ [True]
6,Is there a correlation between androgens and sexual desire in women?,1. YES 2. MAYBE 3. NO,"1. For women, the correlation between circulating androgens and sexual desire is inconclusive. Substitution with androgens at physiological levels improves sexual function in women who...",YES,YES,produce the answer. We can see from the context that a study was conducted to observe the correlation between androgens and sexual desire in women....,✔️ [True]
7,Is the zeolite hemostatic agent beneficial in reducing blood loss during arterial injury?,1. YES 2. MAYBE 3. NO,1. Uncontrolled hemorrhage is the leading cause of fatality. The aim of this study was to evaluate the effect of zeolite mineral (QuikClot - Advanced...,YES,YES,produce the answer. We can conclude from the context that application of the zeolite mineral (QC-ACS) on the arterial injury model resulted in a slower...,✔️ [True]
8,Are endothelial cell patterns of astrocytomas indicative of grade?,1. YES 2. MAYBE 3. NO,1. The most common primary brain tumors in children and adults are of astrocytic origin. Classic histologic grading schemes for astrocytomas have included evaluating the...,YES,YES,produce the answer. We know that astrocytomas are the most common primary brain tumors in children and adults. The grading of these tumors traditionally includes...,✔️ [True]
9,Should cavitation in proximal surfaces be reported in cone beam computed tomography examination?,1. YES 2. MAYBE 3. NO,1. 79 adjacent proximal surfaces without restorations in permanent teeth were examined. Patients suspected to have carious lesions after a visual clinical and a bitewing...,YES,YES,produce the answer. We learn from the context that a study was conducted regarding cavitation in proximal surfaces using CBCT examination. It is noted that...,✔️ [True]


73.8

In [18]:
from tqdm import tqdm
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Clear tqdm's set of tracked bar instances (presumably to reset progress bars left
# over from earlier cells — confirm this is still needed).
tqdm._instances.clear()

# Search settings for the teleprompter.
config = {
    "max_bootstrapped_demos": 3,
    "max_labeled_demos": 0,
    "num_candidate_programs": 20,
    "num_threads": 6,
}
teleprompter = BootstrapFewShotWithRandomSearch(metric=eval_metric_pubmedqa, **config)

# Student and teacher are two separately constructed instances of the same
# assertion-enabled program.
student_program = assert_transform_module(
    PubMedQA_Response().map_named_predictors(Retry), backtrack_handler
)
teacher_program = assert_transform_module(
    PubMedQA_Response().map_named_predictors(Retry), backtrack_handler
)

answer_question_teacher = teleprompter.compile(
    student=student_program,
    teacher=teacher_program,
    trainset=dspy_train_set_pubmedqa,
    valset=dspy_val_set_pubmedqa,
)

Going to sample between 1 and 3 traces per predictor.
Will attempt to bootstrap 20 candidate sets.


Average Metric: 376 / 500  (75.2): 100%|██████████| 500/500 [00:02<00:00, 172.04it/s]


Score: 75.2 for set: [0, 0]
New best sscore: 75.2 for seed -3
Scores so far: [75.2]
Best score: 75.2


Average Metric: 376 / 500  (75.2): 100%|██████████| 500/500 [00:02<00:00, 224.36it/s]


Score: 75.2 for set: [0, 0]
Scores so far: [75.2, 75.2]
Best score: 75.2


  0%|          | 3/5000 [00:17<8:06:05,  5.84s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 382 / 500  (76.4): 100%|██████████| 500/500 [1:00:34<00:00,  7.27s/it]


Score: 76.4 for set: [3, 3]
New best sscore: 76.4 for seed -1
Scores so far: [75.2, 75.2, 76.4]
Best score: 76.4
Average of max per entry across top 1 scores: 0.764
Average of max per entry across top 2 scores: 0.824
Average of max per entry across top 3 scores: 0.824
Average of max per entry across top 5 scores: 0.824
Average of max per entry across top 8 scores: 0.824
Average of max per entry across top 9999 scores: 0.824


  0%|          | 2/5000 [00:16<11:32:40,  8.32s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 370 / 500  (74.0): 100%|██████████| 500/500 [1:04:29<00:00,  7.74s/it]


Score: 74.0 for set: [2, 2]
Scores so far: [75.2, 75.2, 76.4, 74.0]
Best score: 76.4
Average of max per entry across top 1 scores: 0.764
Average of max per entry across top 2 scores: 0.824
Average of max per entry across top 3 scores: 0.824
Average of max per entry across top 5 scores: 0.844
Average of max per entry across top 8 scores: 0.844
Average of max per entry across top 9999 scores: 0.844


  0%|          | 1/5000 [00:08<11:47:34,  8.49s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 370 / 500  (74.0): 100%|██████████| 500/500 [59:31<00:00,  7.14s/it]


Score: 74.0 for set: [1, 1]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0]
Best score: 76.4
Average of max per entry across top 1 scores: 0.764
Average of max per entry across top 2 scores: 0.824
Average of max per entry across top 3 scores: 0.824
Average of max per entry across top 5 scores: 0.844
Average of max per entry across top 8 scores: 0.844
Average of max per entry across top 9999 scores: 0.844


  0%|          | 2/5000 [00:14<10:13:28,  7.36s/it]


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 373 / 500  (74.6): 100%|██████████| 500/500 [1:06:09<00:00,  7.94s/it]


Score: 74.6 for set: [1, 1]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6]
Best score: 76.4
Average of max per entry across top 1 scores: 0.764
Average of max per entry across top 2 scores: 0.824
Average of max per entry across top 3 scores: 0.824
Average of max per entry across top 5 scores: 0.854
Average of max per entry across top 8 scores: 0.854
Average of max per entry across top 9999 scores: 0.854


  0%|          | 2/5000 [00:13<9:31:51,  6.86s/it] 


Bootstrapped 1 full traces after 3 examples in round 0.


Average Metric: 383 / 500  (76.6): 100%|██████████| 500/500 [59:01<00:00,  7.08s/it]


Score: 76.6 for set: [1, 1]
New best sscore: 76.6 for seed 3
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6, 76.6]
Best score: 76.6
Average of max per entry across top 1 scores: 0.766
Average of max per entry across top 2 scores: 0.808
Average of max per entry across top 3 scores: 0.842
Average of max per entry across top 5 scores: 0.852
Average of max per entry across top 8 scores: 0.862
Average of max per entry across top 9999 scores: 0.862


  0%|          | 1/5000 [00:08<11:41:41,  8.42s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 368 / 500  (73.6): 100%|██████████| 500/500 [1:05:46<00:00,  7.89s/it]


Score: 73.6 for set: [1, 1]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6, 76.6, 73.6]
Best score: 76.6
Average of max per entry across top 1 scores: 0.766
Average of max per entry across top 2 scores: 0.808
Average of max per entry across top 3 scores: 0.842
Average of max per entry across top 5 scores: 0.852
Average of max per entry across top 8 scores: 0.866
Average of max per entry across top 9999 scores: 0.866


  0%|          | 4/5000 [00:29<10:05:36,  7.27s/it]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 374 / 500  (74.8): 100%|██████████| 500/500 [1:24:39<00:00, 10.16s/it]


Score: 74.8 for set: [3, 3]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6, 76.6, 73.6, 74.8]
Best score: 76.6
Average of max per entry across top 1 scores: 0.766
Average of max per entry across top 2 scores: 0.808
Average of max per entry across top 3 scores: 0.842
Average of max per entry across top 5 scores: 0.848
Average of max per entry across top 8 scores: 0.866
Average of max per entry across top 9999 scores: 0.87


  0%|          | 3/5000 [00:35<16:18:20, 11.75s/it]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 375 / 500  (75.0): 100%|██████████| 500/500 [1:23:46<00:00, 10.05s/it]


Score: 75.0 for set: [3, 3]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6, 76.6, 73.6, 74.8, 75.0]
Best score: 76.6
Average of max per entry across top 1 scores: 0.766
Average of max per entry across top 2 scores: 0.808
Average of max per entry across top 3 scores: 0.842
Average of max per entry across top 5 scores: 0.848
Average of max per entry across top 8 scores: 0.868
Average of max per entry across top 9999 scores: 0.872


  0%|          | 2/5000 [00:14<10:14:58,  7.38s/it]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 364 / 500  (72.8): 100%|██████████| 500/500 [1:22:53<00:00,  9.95s/it]


Score: 72.8 for set: [2, 2]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6, 76.6, 73.6, 74.8, 75.0, 72.8]
Best score: 76.6
Average of max per entry across top 1 scores: 0.766
Average of max per entry across top 2 scores: 0.808
Average of max per entry across top 3 scores: 0.842
Average of max per entry across top 5 scores: 0.848
Average of max per entry across top 8 scores: 0.868
Average of max per entry across top 9999 scores: 0.872


  0%|          | 1/5000 [00:08<12:29:20,  8.99s/it]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 363 / 500  (72.6): 100%|██████████| 500/500 [1:23:13<00:00,  9.99s/it]


Score: 72.6 for set: [1, 1]
Scores so far: [75.2, 75.2, 76.4, 74.0, 74.0, 74.6, 76.6, 73.6, 74.8, 75.0, 72.8, 72.6]
Best score: 76.6
Average of max per entry across top 1 scores: 0.766
Average of max per entry across top 2 scores: 0.808
Average of max per entry across top 3 scores: 0.842
Average of max per entry across top 5 scores: 0.848
Average of max per entry across top 8 scores: 0.868
Average of max per entry across top 9999 scores: 0.876


  0%|          | 2/5000 [00:23<16:18:40, 11.75s/it]


Bootstrapped 2 full traces after 3 examples in round 0.




In [None]:
# Score the compiled program on the test set with the same evaluator used for the
# baseline. This cell has no recorded execution count or output in the notebook.
evaluate_test(answer_question_teacher)