# MHQA with DSPy Optimization

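We use DSPy's teleprompters (optimizers) — here `BootstrapFewShot` — to automatically build few-shot prompts for Persian question answering on the MHQA and PQUAD datasets, and compare the resulting accuracy with the baseline GPT implementation.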

In [None]:
import os
import json
import dotenv
from tqdm import tqdm
import pandas as pd
import dspy
import numpy as np
import re
from typing import List, Dict, Any

# Load variables from a local .env file, if one exists (the LM setup below reads
# METIS_API_KEY from the environment).
dotenv.load_dotenv()

# The datasets contain Persian text, so decode them explicitly as UTF-8 instead of
# relying on the platform's default text encoding (which is not UTF-8 everywhere).
with open("../../../data/test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

with open("../../../data/train_data.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

pquad_df = pd.read_csv('../../../data/pquad/pquad_questions.csv', encoding='utf-8')
pquad_data = pquad_df.to_dict(orient='records')[:150]  # Use first 150 samples

print(f"Loaded {len(test_data)} test examples")
print(f"Loaded {len(train_data)} train examples") 
print(f"Loaded {len(pquad_data)} PQUAD examples")

Loaded 152 test examples
Loaded 400 train examples
Loaded 150 PQUAD examples


In [None]:
# Name of the answering model; it is prefixed with "openai/" when handed to DSPy.
MODEL_NAME = "gpt-4o-mini"

# Answering LM: requests go to the api.metisai.ir endpoint and authenticate with the
# METIS_API_KEY environment variable.
# NOTE(review): the 200-token cap also applies to the ChainOfThought runs further down;
# confirm that reasoning + answer fit within it.
lm = dspy.LM(
    model=f"openai/{MODEL_NAME}",
    api_base="https://api.metisai.ir/openai/v1",
    api_key=os.environ.get("METIS_API_KEY"),
    temperature=0.2,
    max_tokens=200,
)

# Make this LM the default for every DSPy module created afterwards.
dspy.configure(lm=lm)

In [None]:
# DSPy signatures for Persian Question Answering
class PersianQASignature(dspy.Signature):
    """Answer Persian questions concisely and accurately."""
    # The docstring above is not just documentation: it appears verbatim as the
    # signature's `instructions` when the compiled predictor is printed, so
    # rewording it changes the prompt sent to the model.

    # Input field: the question text supplied by the caller.
    question = dspy.InputField(desc="Persian question to be answered")
    # Output field: the answer text the LM is asked to produce.
    answer = dspy.OutputField(desc="Concise Persian answer")

class PersianQAWithReasoningSignature(dspy.Signature):
    """Answer Persian questions with step-by-step reasoning."""
    # As with the other signature, this docstring is used as the prompt
    # `instructions`, so treat it as runtime text.
    # This signature is wrapped in dspy.ChainOfThought by PersianQAWithReasoningModule;
    # the printed compiled signature shows an extra `reasoning` output field ahead of
    # `answer`, which is why no reasoning field is declared here.

    # Input field: the question text supplied by the caller.
    question = dspy.InputField(desc="Persian question to be answered")
    # Output field: the final answer text.
    answer = dspy.OutputField(desc="Concise Persian answer after reasoning")

# DSPy modules
class PersianQAModule(dspy.Module):
    """Direct question answering: a single `dspy.Predict` call over PersianQASignature."""

    def __init__(self):
        super().__init__()
        # The attribute name is kept as-is; it identifies the predictor inside the module.
        self.generate_answer = dspy.Predict(PersianQASignature)

    def forward(self, question):
        """Answer `question` and return a Prediction that carries only `answer`."""
        outputs = self.generate_answer(question=question)
        return dspy.Prediction(answer=outputs.answer)

class PersianQAWithReasoningModule(dspy.Module):
    """Question answering through `dspy.ChainOfThought` over PersianQAWithReasoningSignature."""

    def __init__(self):
        super().__init__()
        # The attribute name is kept as-is; it identifies the predictor inside the module.
        self.generate_answer = dspy.ChainOfThought(PersianQAWithReasoningSignature)

    def forward(self, question):
        """Answer `question`; only the `answer` field of the predictor output is returned."""
        outputs = self.generate_answer(question=question)
        return dspy.Prediction(answer=outputs.answer)

In [5]:
## test
# Quick check: run the un-optimized Predict-based module on a single question
# and print the answer it returns.
sample_question = "تهران پایتخت کدام کشور است؟"
qa_module = PersianQAModule()
result = qa_module(question=sample_question)
print(f"Question: {sample_question}")
print(f"Answer: {result.answer}")

Question: تهران پایتخت کدام کشور است؟
Answer: تهران پایتخت ایران است.


In [None]:
# Evaluation function using LLM-as-a-judge 
def create_llm_judge():
    """Build and return the LM client used as the grading judge.

    The judge is a separate `dspy.LM` instance from the answering model, with its own
    sampling settings (temperature 0.1, up to 300 tokens). It uses the same endpoint
    and the same METIS_API_KEY environment variable.
    """
    return dspy.LM(
        model="openai/gpt-4o-mini",
        api_base="https://api.metisai.ir/openai/v1",
        api_key=os.getenv("METIS_API_KEY"),
        temperature=0.1,
        max_tokens=300,
    )

def clean_model_answer(model_answer: str) -> str:
    """Strip <ANSWER> wrapper tags and surrounding whitespace from a model answer.

    Args:
        model_answer: Raw answer text. ``None`` is accepted and treated as an empty
            answer; other non-string values are converted with ``str()``.

    Returns:
        The answer with every opening/closing ``<ANSWER>`` tag removed
        (case-insensitive) and leading/trailing whitespace trimmed.
    """
    if model_answer is None:
        return ""
    text = model_answer if isinstance(model_answer, str) else str(model_answer)
    # Remove tags individually rather than as a matched pair, so an unclosed
    # "<ANSWER>..." (no closing tag) is cleaned as well.
    return re.sub(r'</?ANSWER>', '', text, flags=re.IGNORECASE).strip()


def _parse_judge_verdict(response_text: str) -> bool:
    """Return True only if the first standalone TRUE/FALSE token in the reply is TRUE."""
    # A plain substring test would accept "FALSE ... not TRUE" or "UNTRUE" as correct,
    # so look for whole-word verdict tokens and trust the first one.
    verdict = re.search(r'\b(TRUE|FALSE)\b', response_text, flags=re.IGNORECASE)
    return verdict is not None and verdict.group(1).upper() == "TRUE"


def evaluate_answer_with_judge(question: str, correct_answer: str, model_answer: str, judge_lm) -> bool:
    """Ask an LLM judge whether ``model_answer`` matches ``correct_answer``.

    Args:
        question: The question that was asked.
        correct_answer: The reference answer.
        model_answer: The answer to grade; it is passed through ``clean_model_answer``
            before being placed in the prompt.
        judge_lm: Callable that takes the prompt string and returns the judge's reply,
            either as a string or as a non-empty list whose first item is the reply.

    Returns:
        True when the judge's verdict is TRUE. False when the verdict is FALSE, when no
        verdict token is found, or when the judge call raises (the error is printed).
    """
    prompt = f"""
        You are an expert judge evaluating Persian/Farsi question-answer pairs. Determine if the model's answer is semantically equivalent to the correct answer. Be strict but fair about minor spelling variations and equivalent expressions.

        Question: {question}

        Correct Answer: {correct_answer}
        Model Answer: {clean_model_answer(model_answer)}

        Answer only "TRUE" if the model answer is semantically equivalent to the correct answer, or "FALSE" otherwise.
    """
    
    try:
        response = judge_lm(prompt)
        # The judge may hand back a list of completions; grade the first one.
        if isinstance(response, list) and len(response) > 0:
            response_text = str(response[0])
        else:
            response_text = str(response)
        
        # print(f"Judge response: {response_text}")  # Debug output
        return _parse_judge_verdict(response_text)
    except Exception as e:
        # Best-effort grading: a failed judge call counts as "not correct" so that a
        # long evaluation run is not aborted by one bad request.
        print(f"Error in judge evaluation: {e}")  # Debug output
        return False

# DSPy evaluation metric
def accuracy_metric(gold, pred, trace=None):
    """DSPy metric: grade ``pred.answer`` against ``gold.answer`` with the LLM judge.

    Args:
        gold: Example providing ``question`` and the reference ``answer``.
        pred: Prediction providing the model's ``answer``.
        trace: Accepted so the function matches the metric call signature; unused.

    Returns:
        The boolean verdict from ``evaluate_answer_with_judge``.
    """
    # The optimizer calls the metric once per attempt. Build the judge client on
    # the first call and reuse it afterwards instead of constructing one each time.
    judge_lm = getattr(accuracy_metric, "_judge_lm", None)
    if judge_lm is None:
        judge_lm = accuracy_metric._judge_lm = create_llm_judge()
    return evaluate_answer_with_judge(gold.question, gold.answer, pred.answer, judge_lm)


In [11]:
## test the evaluation function
# End-to-end check of the answer -> clean -> judge path on one question whose
# reference answer is known.
sample_question = "تهران پایتخت کدام کشور است؟"
sample_answer = "ایران"

# Test the QA module
qa_module = PersianQAModule()
result = qa_module(question=sample_question)

print(f"Question: {sample_question}")
print(f"Expected Answer: {sample_answer}")
print(f"Model Answer: {result.answer}")
print(f"Cleaned Model Answer: {clean_model_answer(result.answer)}")

# Test the judge
print("\n" + "="*50)
print("Testing LLM Judge:")
print("="*50)
try:
    judge_lm = create_llm_judge()
    # The raw (uncleaned) answer is passed; evaluate_answer_with_judge cleans it itself.
    is_correct = evaluate_answer_with_judge(sample_question, sample_answer, result.answer, judge_lm)
    print(f"Final evaluation result: {is_correct}")
except Exception as e:
    print(f"Error during evaluation: {e}")
    # Imported here, only on failure, to print the full stack trace.
    import traceback
    traceback.print_exc()

Question: تهران پایتخت کدام کشور است؟
Expected Answer: ایران
Model Answer: تهران پایتخت ایران است.
Cleaned Model Answer: تهران پایتخت ایران است.

Testing LLM Judge:
Judge response: TRUE
Final evaluation result: True


In [13]:
# Negative check: give the judge a deliberately wrong answer ("فرانسه") for the same
# question. This cell reuses sample_question / sample_answer from the previous test
# cell, so that cell must be run first.
try:
    judge_lm = create_llm_judge()
    is_correct = evaluate_answer_with_judge(sample_question, sample_answer, "فرانسه", judge_lm)
    print(f"Final evaluation result: {is_correct}")
except Exception as e:
    print(f"Error during evaluation: {e}")
    # Imported here, only on failure, to print the full stack trace.
    import traceback
    traceback.print_exc()

Judge response: FALSE
Final evaluation result: False


# MHQA Dataset Experiments

We'll first work with the Persian Multi-Hop Question Answering dataset, optimizing prompts for both reasoning and non-reasoning approaches.

In [None]:
# Prepare MHQA data for DSPy
def prepare_dspy_examples(data_list):
    """Convert raw records into DSPy examples.

    Args:
        data_list: Iterable of mappings, each with a 'question' and an 'answer' key.

    Returns:
        A list of `dspy.Example` objects, in input order, with `question` marked as
        the only input field (so `answer` acts as the label).
    """
    return [
        dspy.Example(
            question=record['question'],
            answer=record['answer'],
        ).with_inputs('question')
        for record in data_list
    ]

# Only the first 50 training records are given to the optimizer; the whole test
# file is kept for evaluation.
mhqa_train_examples = prepare_dspy_examples(train_data[:50])  # Use first 50 for training
mhqa_test_examples = prepare_dspy_examples(test_data)

print(f"MHQA Train examples: {len(mhqa_train_examples)}")
print(f"MHQA Test examples: {len(mhqa_test_examples)}")

# Print one converted example to check that question/answer were mapped correctly.
print(f"\nSample train example:")
print(f"Question: {mhqa_train_examples[0].question}")
print(f"Answer: {mhqa_train_examples[0].answer}")

MHQA Train examples: 50
MHQA Test examples: 152

Sample train example:
Question: در مقابل تبی که بنتونیت با نام گل ارمنی در آن معروف است چه چیز قرار دارد ؟
Answer:  پزشکی مبتنی بر شواهد


## MHQA - No Reasoning Approach

In [None]:
print("Starting MHQA No Reasoning optimization...")

# Starting program: plain Predict module, no chain-of-thought
mhqa_no_reasoning_model = PersianQAModule()

# Few-shot bootstrapping optimizer, scored by the LLM-judge metric
teleprompter = dspy.BootstrapFewShot(
    max_rounds=2,
    max_labeled_demos=4,
    max_bootstrapped_demos=4,
    metric=accuracy_metric,
)

# Compile the program against the MHQA training examples
print("Optimizing prompts for MHQA no reasoning...")
mhqa_optimized_no_reasoning = teleprompter.compile(
    mhqa_no_reasoning_model,
    trainset=mhqa_train_examples,
)

print("MHQA No Reasoning optimization completed!")

# Report each compiled predictor's signature and a preview of its first two demos
print("\n" + "="*50)
print("OPTIMIZED PROMPT FOR MHQA NO REASONING:")
print("="*50)
for predictor_no, predictor in enumerate(mhqa_optimized_no_reasoning.predictors(), start=1):
    print(f"\nPredictor {predictor_no}:")
    print(predictor.signature)
    demos = getattr(predictor, 'demos', None)
    if not demos:
        continue
    print(f"Number of demonstrations: {len(demos)}")
    for demo_no, demo in enumerate(demos[:2], start=1):
        print(f"Demo {demo_no}: Q: {demo.question[:100]}...")
        print(f"         A: {demo.answer}")
print("="*50)

Starting MHQA No Reasoning optimization...
Optimizing prompts for MHQA no reasoning...


 18%|█▊        | 9/50 [00:25<01:56,  2.83s/it]

Bootstrapped 4 full traces after 9 examples for up to 2 rounds, amounting to 14 attempts.
MHQA No Reasoning optimization completed!

OPTIMIZED PROMPT FOR MHQA NO REASONING:

Predictor 1:
PersianQASignature(question -> answer
    instructions='Answer Persian questions concisely and accurately.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Concise Persian answer', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Number of demonstrations: 4
Demo 1: Q: آیا سایتو دوسان یک نجیب زاده نظامی است ؟...
         A: بله، سایتو دوسان یک نجیب زاده نظامی است.
Demo 2: Q: کدام یک از مواردی که سشات در آن به عنوان ایزدبانو شناخته می شد فن محاسبه اعداد می باشد ؟...
         A: ریاضیات





In [18]:
# Evaluate MHQA no reasoning model
print("Evaluating MHQA No Reasoning model...")

mhqa_no_reasoning_results = []
judge_lm = create_llm_judge()

for example in tqdm(mhqa_test_examples, desc="Testing MHQA No Reasoning"):
    try:
        prediction = mhqa_optimized_no_reasoning(question=example.question)
        model_answer = prediction.answer
    except Exception as e:
        # Keep the error text so the failure is visible in the saved results.
        model_answer = f"Error: {e}"
        generation_ok = False
    else:
        # Guard against a missing answer so the string cleaning below cannot raise.
        generation_ok = model_answer is not None
        if not generation_ok:
            model_answer = ""

    # A failed generation is counted as incorrect without spending a judge call on
    # the error text; real answers are graded by the LLM judge.
    is_correct = generation_ok and evaluate_answer_with_judge(
        example.question,
        example.answer,
        model_answer,
        judge_lm
    )

    mhqa_no_reasoning_results.append({
        'question': example.question,
        'answer': example.answer,
        'model_answer': model_answer,
        'clean_model_answer': clean_model_answer(model_answer),
        'is_correct': is_correct,
        'id': getattr(example, 'id', None)
    })

# Save results
mhqa_no_reasoning_df = pd.DataFrame(mhqa_no_reasoning_results)
mhqa_no_reasoning_df.to_csv('mhqa_dspy_no_reasoning_results.csv', index=False)

# Calculate accuracy (count once; an empty test set reports 0.0 instead of crashing)
n_correct = sum(row['is_correct'] for row in mhqa_no_reasoning_results)
n_total = len(mhqa_no_reasoning_results)
accuracy = n_correct / n_total if n_total else 0.0
print(f"\nMHQA No Reasoning Accuracy: {accuracy:.3f}")
print(f"Correct answers: {n_correct}/{n_total}")

Evaluating MHQA No Reasoning model...


Testing MHQA No Reasoning: 100%|██████████| 152/152 [06:47<00:00,  2.68s/it]


MHQA No Reasoning Accuracy: 0.441
Correct answers: 67/152





## MHQA - With Reasoning Approach

In [None]:
print("Starting MHQA With Reasoning optimization...")

# Starting program: ChainOfThought-based module
mhqa_reasoning_model = PersianQAWithReasoningModule()

# Few-shot bootstrapping optimizer, scored by the LLM-judge metric
teleprompter_reasoning = dspy.BootstrapFewShot(
    max_rounds=2,
    max_labeled_demos=4,
    max_bootstrapped_demos=4,
    metric=accuracy_metric,
)

# Compile the program against the MHQA training examples
print("Optimizing prompts for MHQA with reasoning...")
mhqa_optimized_reasoning = teleprompter_reasoning.compile(
    mhqa_reasoning_model,
    trainset=mhqa_train_examples,
)

print("MHQA With Reasoning optimization completed!")

# Report each compiled predictor's signature and a preview of its first two demos
print("\n" + "="*50)
print("OPTIMIZED PROMPT FOR MHQA WITH REASONING:")
print("="*50)
for predictor_no, predictor in enumerate(mhqa_optimized_reasoning.predictors(), start=1):
    print(f"\nPredictor {predictor_no}:")
    print(predictor.signature)
    demos = getattr(predictor, 'demos', None)
    if not demos:
        continue
    print(f"Number of demonstrations: {len(demos)}")
    for demo_no, demo in enumerate(demos[:2], start=1):
        print(f"Demo {demo_no}: Q: {demo.question[:100]}...")
        print(f"         A: {demo.answer}")
print("="*50)

Starting MHQA With Reasoning optimization...
Optimizing prompts for MHQA with reasoning...


 16%|█▌        | 8/50 [00:47<04:11,  6.00s/it]

Bootstrapped 4 full traces after 8 examples for up to 2 rounds, amounting to 13 attempts.
MHQA With Reasoning optimization completed!

OPTIMIZED PROMPT FOR MHQA WITH REASONING:

Predictor 1:
StringSignature(question -> reasoning, answer
    instructions='Answer Persian questions with step-by-step reasoning.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Concise Persian answer after reasoning', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Number of demonstrations: 4
Demo 1: Q: آیا سایتو دوسان یک نجیب زاده نظامی است ؟...
         A: بله، سایتو دوسان یک نجیب‌زاده نظامی است.
Demo 2: Q: کدام یک از بازیگرا




In [None]:
print("Evaluating MHQA With Reasoning model...")

mhqa_reasoning_results = []
judge_lm = create_llm_judge()

for example in tqdm(mhqa_test_examples, desc="Testing MHQA With Reasoning"):
    try:
        prediction = mhqa_optimized_reasoning(question=example.question)
        model_answer = prediction.answer
    except Exception as e:
        # Keep the error text so the failure is visible in the saved results.
        model_answer = f"Error: {e}"
        generation_ok = False
    else:
        # Guard against a missing answer so the string cleaning below cannot raise.
        generation_ok = model_answer is not None
        if not generation_ok:
            model_answer = ""

    # A failed generation is counted as incorrect without spending a judge call on
    # the error text; real answers are graded by the LLM judge.
    is_correct = generation_ok and evaluate_answer_with_judge(
        example.question,
        example.answer,
        model_answer,
        judge_lm
    )

    mhqa_reasoning_results.append({
        'question': example.question,
        'answer': example.answer,
        'model_answer': model_answer,
        'clean_model_answer': clean_model_answer(model_answer),
        'is_correct': is_correct,
        'id': getattr(example, 'id', None)
    })

# Save results
mhqa_reasoning_df = pd.DataFrame(mhqa_reasoning_results)
mhqa_reasoning_df.to_csv('mhqa_dspy_reasoning_results.csv', index=False)

# Calculate accuracy (count once; an empty test set reports 0.0 instead of crashing)
n_correct = sum(row['is_correct'] for row in mhqa_reasoning_results)
n_total = len(mhqa_reasoning_results)
accuracy_reasoning = n_correct / n_total if n_total else 0.0
print(f"\nMHQA With Reasoning Accuracy: {accuracy_reasoning:.3f}")
print(f"Correct answers: {n_correct}/{n_total}")

Evaluating MHQA With Reasoning model...


Testing MHQA With Reasoning: 100%|██████████| 152/152 [09:20<00:00,  3.69s/it]


MHQA With Reasoning Accuracy: 0.454
Correct answers: 69/152





# PQUAD Dataset Experiments

Now we'll work with the PQUAD dataset, following the same optimization approach for both reasoning and non-reasoning methods.

In [None]:
# Convert the PQUAD records (already limited to 150 rows at load time) to DSPy examples.
pquad_examples = prepare_dspy_examples(pquad_data)

# Split PQUAD data for training and testing
# The two slices are disjoint: the optimizer never sees the evaluation questions.
pquad_train_examples = pquad_examples[:30]  # Use first 30 for training
pquad_test_examples = pquad_examples[30:]   # Rest for testing (150-30 =120)

print(f"PQUAD Train examples: {len(pquad_train_examples)}")
print(f"PQUAD Test examples: {len(pquad_test_examples)}")

# Show sample
print(f"\nSample PQUAD example:")
print(f"Question: {pquad_train_examples[0].question}")
print(f"Answer: {pquad_train_examples[0].answer}")

PQUAD Train examples: 30
PQUAD Test examples: 120

Sample PQUAD example:
Question: ساختار آب چیست؟
Answer: اکسید هیدروژن


## PQUAD - No Reasoning Approach

In [None]:
print("Starting PQUAD No Reasoning optimization...")

# Starting program: plain Predict module, no chain-of-thought
pquad_no_reasoning_model = PersianQAModule()

# Few-shot bootstrapping optimizer, scored by the LLM-judge metric
teleprompter_pquad = dspy.BootstrapFewShot(
    max_rounds=2,
    max_labeled_demos=4,
    max_bootstrapped_demos=4,
    metric=accuracy_metric,
)

# Compile the program against the PQUAD training examples
print("Optimizing prompts for PQUAD no reasoning...")
pquad_optimized_no_reasoning = teleprompter_pquad.compile(
    pquad_no_reasoning_model,
    trainset=pquad_train_examples,
)

print("PQUAD No Reasoning optimization completed!")

# Report each compiled predictor's signature and a preview of its first two demos
print("\n" + "="*50)
print("OPTIMIZED PROMPT FOR PQUAD NO REASONING:")
print("="*50)
for predictor_no, predictor in enumerate(pquad_optimized_no_reasoning.predictors(), start=1):
    print(f"\nPredictor {predictor_no}:")
    print(predictor.signature)
    demos = getattr(predictor, 'demos', None)
    if not demos:
        continue
    print(f"Number of demonstrations: {len(demos)}")
    for demo_no, demo in enumerate(demos[:2], start=1):
        print(f"Demo {demo_no}: Q: {demo.question[:100]}...")
        print(f"         A: {demo.answer}")
print("="*50)

Starting PQUAD No Reasoning optimization...
Optimizing prompts for PQUAD no reasoning...


 50%|█████     | 15/30 [01:18<01:18,  5.25s/it]

Bootstrapped 4 full traces after 15 examples for up to 2 rounds, amounting to 27 attempts.
PQUAD No Reasoning optimization completed!

OPTIMIZED PROMPT FOR PQUAD NO REASONING:

Predictor 1:
PersianQASignature(question -> answer
    instructions='Answer Persian questions concisely and accurately.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Concise Persian answer', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Number of demonstrations: 4
Demo 1: Q: ساختار آب چیست؟...
         A: آب از دو اتم هیدروژن و یک اتم اکسیژن تشکیل شده است (H₂O).
Demo 2: Q: موقعیت جغرافیایی کلیسای کاتولیک رم چگونه است؟...
         A: کلیسای کاتولیک رم در شهر واتیکان واقع در ایتالیا قرار دارد و به عنوان مرکز کلیسای کاتولیک نیز شناخته می‌شود.





In [None]:
print("Evaluating PQUAD No Reasoning model...")

pquad_no_reasoning_results = []
judge_lm = create_llm_judge()

for example in tqdm(pquad_test_examples, desc="Testing PQUAD No Reasoning"):
    try:
        prediction = pquad_optimized_no_reasoning(question=example.question)
        model_answer = prediction.answer
    except Exception as e:
        # Keep the error text so the failure is visible in the saved results.
        model_answer = f"Error: {e}"
        generation_ok = False
    else:
        # Guard against a missing answer so the string cleaning below cannot raise.
        generation_ok = model_answer is not None
        if not generation_ok:
            model_answer = ""

    # A failed generation is counted as incorrect without spending a judge call on
    # the error text; real answers are graded by the LLM judge.
    is_correct = generation_ok and evaluate_answer_with_judge(
        example.question,
        example.answer,
        model_answer,
        judge_lm
    )

    pquad_no_reasoning_results.append({
        'question': example.question,
        'answer': example.answer,
        'model_answer': model_answer,
        'clean_model_answer': clean_model_answer(model_answer),
        'is_correct': is_correct,
        'id': getattr(example, 'id', None)
    })

# Save results
pquad_no_reasoning_df = pd.DataFrame(pquad_no_reasoning_results)
pquad_no_reasoning_df.to_csv('pquad_dspy_no_reasoning_results.csv', index=False)

# Calculate accuracy (count once; an empty test set reports 0.0 instead of crashing)
n_correct = sum(row['is_correct'] for row in pquad_no_reasoning_results)
n_total = len(pquad_no_reasoning_results)
pquad_accuracy = n_correct / n_total if n_total else 0.0
print(f"\nPQUAD No Reasoning Accuracy: {pquad_accuracy:.3f}")
print(f"Correct answers: {n_correct}/{n_total}")

Evaluating PQUAD No Reasoning model...


Testing PQUAD No Reasoning: 100%|██████████| 120/120 [06:32<00:00,  3.27s/it]


PQUAD No Reasoning Accuracy: 0.233
Correct answers: 28/120





## PQUAD - With Reasoning Approach

In [None]:
print("Starting PQUAD With Reasoning optimization...")

# Starting program: ChainOfThought-based module
pquad_reasoning_model = PersianQAWithReasoningModule()

# Few-shot bootstrapping optimizer, scored by the LLM-judge metric
teleprompter_pquad_reasoning = dspy.BootstrapFewShot(
    max_rounds=2,
    max_labeled_demos=4,
    max_bootstrapped_demos=4,
    metric=accuracy_metric,
)

# Compile the program against the PQUAD training examples
print("Optimizing prompts for PQUAD with reasoning...")
pquad_optimized_reasoning = teleprompter_pquad_reasoning.compile(
    pquad_reasoning_model,
    trainset=pquad_train_examples,
)

print("PQUAD With Reasoning optimization completed!")

# Report each compiled predictor's signature and a preview of its first two demos
print("\n" + "="*50)
print("OPTIMIZED PROMPT FOR PQUAD WITH REASONING:")
print("="*50)
for predictor_no, predictor in enumerate(pquad_optimized_reasoning.predictors(), start=1):
    print(f"\nPredictor {predictor_no}:")
    print(predictor.signature)
    demos = getattr(predictor, 'demos', None)
    if not demos:
        continue
    print(f"Number of demonstrations: {len(demos)}")
    for demo_no, demo in enumerate(demos[:2], start=1):
        print(f"Demo {demo_no}: Q: {demo.question[:100]}...")
        print(f"         A: {demo.answer}")
print("="*50)

Starting PQUAD With Reasoning optimization...
Optimizing prompts for PQUAD with reasoning...


 53%|█████▎    | 16/30 [02:17<02:00,  8.61s/it]

Bootstrapped 4 full traces after 16 examples for up to 2 rounds, amounting to 29 attempts.
PQUAD With Reasoning optimization completed!

OPTIMIZED PROMPT FOR PQUAD WITH REASONING:

Predictor 1:
StringSignature(question -> reasoning, answer
    instructions='Answer Persian questions with step-by-step reasoning.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${reasoning}', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Concise Persian answer after reasoning', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Number of demonstrations: 4
Demo 1: Q: مهمترین آثار باستانی استان گیلان کدام است؟...
         A: قلعه رودخان، بقعه شیخ زاهد گیلانی، و تپه‌های باستانی چمخاله.
Demo




In [None]:
print("Evaluating PQUAD With Reasoning model...")

pquad_reasoning_results = []
judge_lm = create_llm_judge()

for example in tqdm(pquad_test_examples, desc="Testing PQUAD With Reasoning"):
    try:
        prediction = pquad_optimized_reasoning(question=example.question)
        model_answer = prediction.answer
    except Exception as e:
        # Keep the error text so the failure is visible in the saved results.
        model_answer = f"Error: {e}"
        generation_ok = False
    else:
        # Guard against a missing answer so the string cleaning below cannot raise.
        generation_ok = model_answer is not None
        if not generation_ok:
            model_answer = ""

    # A failed generation is counted as incorrect without spending a judge call on
    # the error text; real answers are graded by the LLM judge.
    is_correct = generation_ok and evaluate_answer_with_judge(
        example.question,
        example.answer,
        model_answer,
        judge_lm
    )

    pquad_reasoning_results.append({
        'question': example.question,
        'answer': example.answer,
        'model_answer': model_answer,
        'clean_model_answer': clean_model_answer(model_answer),
        'is_correct': is_correct,
        'id': getattr(example, 'id', None)
    })

# Save results
pquad_reasoning_df = pd.DataFrame(pquad_reasoning_results)
pquad_reasoning_df.to_csv('pquad_dspy_reasoning_results.csv', index=False)

# Calculate accuracy (count once; an empty test set reports 0.0 instead of crashing)
n_correct = sum(row['is_correct'] for row in pquad_reasoning_results)
n_total = len(pquad_reasoning_results)
pquad_accuracy_reasoning = n_correct / n_total if n_total else 0.0
print(f"\nPQUAD With Reasoning Accuracy: {pquad_accuracy_reasoning:.3f}")
print(f"Correct answers: {n_correct}/{n_total}")

Evaluating PQUAD With Reasoning model...


Testing PQUAD With Reasoning: 100%|██████████| 120/120 [10:28<00:00,  5.24s/it]


PQUAD With Reasoning Accuracy: 0.233
Correct answers: 28/120





# Results Summary

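Let's compare all the results and examine the optimized prompts that DSPy produced. Note that `BootstrapFewShot` leaves the instruction text unchanged; what it optimizes is the set of few-shot demonstrations attached to each predictor.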

In [None]:
# Summary of the four DSPy runs. The accuracy variables are produced by the
# evaluation cells above, so those cells must have been run in this kernel session.
print("="*70)
print("FINAL RESULTS SUMMARY - DSPy Optimized Models")
print("="*70)

print(f"\nMHQA Dataset Results:")
print(f"  • No Reasoning:  {accuracy:.3f} ")
print(f"  • With Reasoning: {accuracy_reasoning:.3f}")

print(f"\nPQUAD Dataset Results:")
print(f"  • No Reasoning:  {pquad_accuracy:.3f} ")
print(f"  • With Reasoning: {pquad_accuracy_reasoning:.3f} ")

print("\n" + "="*70)
print("COMPARISON WITH BASELINE GPT RESULTS")
print("="*70)

# The baseline figures below are hard-coded literals, not computed in this notebook.
# NOTE(review): presumably copied from the baseline GPT notebook — verify they are
# current before quoting this comparison.
print(f"\nMHQA Dataset Results:")
print(f"  • No Reasoning:  {0.453} ")
print(f"  • With Reasoning: {0.447} ")

print(f"\nPQUAD Dataset Results:")
print(f"  • No Reasoning:  {0.187} ")
print(f"  • With Reasoning: {0.194}")



FINAL RESULTS SUMMARY - DSPy Optimized Models

MHQA Dataset Results:
  • No Reasoning:  0.441 
  • With Reasoning: 0.454

PQUAD Dataset Results:
  • No Reasoning:  0.233 
  • With Reasoning: 0.233 

COMPARISON WITH BASELINE GPT RESULTS

MHQA Dataset Results:
  • No Reasoning:  0.453 
  • With Reasoning: 0.447 

PQUAD Dataset Results:
  • No Reasoning:  0.187 
  • With Reasoning: 0.194


In [None]:
def inspect_optimized_model(model, title, max_examples=4):
    """Print the signature and few-shot demonstrations of every predictor in `model`.

    Args:
        model: Object exposing `predictors()`, which returns the predictors to
            report on. Each predictor must have a `signature`; `demos` and
            `extended_signature` are optional.
        title: Label printed in the report header.
        max_examples: Maximum number of demonstrations printed per predictor
            (non-negative integer, default 4). If a predictor has more, a single
            "... and N more examples" line follows the printed ones.
    """
    print(f"\n{'='*50}")
    print(f"DETAILED PROMPT INSPECTION: {title}")
    print(f"{'='*50}")

    for i, predictor in enumerate(model.predictors()):
        print(f"\n--- Predictor {i+1} ---")
        print(f"Signature: {predictor.signature}")

        demos = getattr(predictor, 'demos', None)
        if demos:
            print(f"Demonstrations found: {len(demos)}")
            print("\nFew-shot examples selected by DSPy:")
            for j, demo in enumerate(demos[:max_examples]):
                print(f"\nExample {j+1}:")
                print(f"Q: {demo.question}")
                print(f"A: {demo.answer}")
            # Mention the remainder only when something was actually left out, so a
            # predictor with exactly `max_examples` demos does not report "0 more".
            remaining = len(demos) - max_examples
            if remaining > 0:
                print(f"... and {remaining} more examples")
        else:
            print("No demonstrations found")

        # Not every predictor object carries this attribute; print it when present.
        if hasattr(predictor, 'extended_signature'):
            print(f"\nExtended signature: {predictor.extended_signature}")

# Inspect all optimized models, in the order they were trained
for compiled_model, report_title in (
    (mhqa_optimized_no_reasoning, "MHQA No Reasoning"),
    (mhqa_optimized_reasoning, "MHQA With Reasoning"),
    (pquad_optimized_no_reasoning, "PQUAD No Reasoning"),
    (pquad_optimized_reasoning, "PQUAD With Reasoning"),
):
    inspect_optimized_model(compiled_model, report_title)


DETAILED PROMPT INSPECTION: MHQA No Reasoning

--- Predictor 1 ---
Signature: PersianQASignature(question -> answer
    instructions='Answer Persian questions concisely and accurately.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Concise Persian answer', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Demonstrations found: 4

Few-shot examples selected by DSPy:

Example 1:
Q: آیا سایتو دوسان یک نجیب زاده نظامی است ؟
A: بله، سایتو دوسان یک نجیب زاده نظامی است.

Example 2:
Q: کدام یک از مواردی که سشات در آن به عنوان ایزدبانو شناخته می شد فن محاسبه اعداد می باشد ؟
A: ریاضیات

Example 3:
Q: کارگردان فیلم شب یلدا دانش آموخته چه رشته ای می باشد ؟
A: رشته کارگردانی سینما

Example 4:
Q: آیا جایزه ای که سیدیبه در سال 2007 آن را دریافت کرد اکنون به عنوان یکی از معتبرترین و برجسته‌ترین جوایز صن