# MHQA with Advanced DSPy Optimization

We use different DSPy-based prompt optimization for PersianMHQA and PQUAD datasets using advanced optimization techniques:

In [1]:
import os
import json
import dotenv
from tqdm import tqdm
import pandas as pd
import dspy
import numpy as np
import re
from typing import List, Dict, Any
import random
from collections import defaultdict

dotenv.load_dotenv()

with open("../../../data/test_data.json", "r") as f:
    test_data = json.load(f)

with open("../../../data/train_data.json", "r") as f:
    train_data = json.load(f)

pquad_df = pd.read_csv('../../../data/pquad/pquad_questions.csv', encoding='utf-8')
pquad_data = pquad_df.to_dict(orient='records')[:150]  # Use first 150 samples

print(f"Loaded {len(test_data)} test examples")
print(f"Loaded {len(train_data)} train examples") 
print(f"Loaded {len(pquad_data)} PQUAD examples")

Loaded 152 test examples
Loaded 400 train examples
Loaded 150 PQUAD examples


In [2]:
MODEL_NAME = "gpt-4o-mini"

# Primary LM for generation
lm = dspy.LM(
    model=f"openai/{MODEL_NAME}",
    api_key=os.getenv("METIS_API_KEY"),
    api_base="https://api.metisai.ir/openai/v1",
    max_tokens=300,  
    temperature=0.1   
)

# Secondary LM for evaluation 
lm_reasoning = dspy.LM(
    model=f"openai/{MODEL_NAME}",
    api_key=os.getenv("METIS_API_KEY"),
    api_base="https://api.metisai.ir/openai/v1",
    max_tokens=500,  # More tokens for reasoning
    temperature=0.0   # Deterministic for evaluation
)

dspy.configure(lm=lm)

In [3]:
# Enhanced DSPy signatures with more detailed instructions
class EnhancedPersianQASignature(dspy.Signature):
    """Answer Persian/Farsi questions with high accuracy. Focus on providing precise, concise answers that directly address the question. Consider Persian language nuances and cultural context."""
    question = dspy.InputField(desc="Persian question requiring a factual answer")
    answer = dspy.OutputField(desc="Precise, concise Persian answer (one to three words when possible)")

class EnhancedPersianQAWithReasoningSignature(dspy.Signature):
    """Answer Persian/Farsi questions using step-by-step reasoning. Break down multi-hop questions into logical steps."""
    question = dspy.InputField(desc="Persian question requiring multi-step reasoning")
    reasoning = dspy.OutputField(desc="Step-by-step reasoning process in Persian")
    answer = dspy.OutputField(desc="Final precise Persian answer based on reasoning")

class PersianQAWithContextSignature(dspy.Signature):
    """Answer Persian questions by first analyzing the question type and required information."""
    question = dspy.InputField(desc="Persian question to analyze and answer")
    question_type = dspy.OutputField(desc="Type of question (factual, comparison, temporal, etc.)")
    key_entities = dspy.OutputField(desc="Key entities or concepts mentioned in the question")
    answer = dspy.OutputField(desc="Accurate Persian answer")

In [None]:
# DSPy modules with multi-stage processing
class EnhancedPersianQAModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.Predict(EnhancedPersianQASignature)
    
    def forward(self, question):
        result = self.generate_answer(question=question)
        return dspy.Prediction(answer=result.answer)

class EnhancedPersianQAWithReasoningModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_answer = dspy.ChainOfThought(EnhancedPersianQAWithReasoningSignature)
    
    def forward(self, question):
        result = self.generate_answer(question=question)
        return dspy.Prediction(answer=result.answer, reasoning=getattr(result, 'reasoning', ''))

class MultiStagePersianQAModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.analyze_question = dspy.Predict(PersianQAWithContextSignature)
        self.refine_answer = dspy.Predict(EnhancedPersianQASignature)
    
    def forward(self, question):
        # first analyze question
        analysis = self.analyze_question(question=question)
        
        # then generate refined answer
        refined = self.refine_answer(question=question)
        
        return dspy.Prediction(
            answer=refined.answer,
            question_type=getattr(analysis, 'question_type', ''),
            key_entities=getattr(analysis, 'key_entities', '')
        )

class EnsemblePersianQAModule(dspy.Module):
    def __init__(self):
        super().__init__()
        self.qa_direct = dspy.Predict(EnhancedPersianQASignature)
        self.qa_reasoning = dspy.ChainOfThought(EnhancedPersianQAWithReasoningSignature)
        self.qa_context = dspy.Predict(PersianQAWithContextSignature)
    
    def forward(self, question):
        # Get predictions from all approaches
        direct = self.qa_direct(question=question)
        reasoning = self.qa_reasoning(question=question)
        context = self.qa_context(question=question)
        
        # Simple voting/selection mechanism 
        answers = [direct.answer, reasoning.answer, context.answer]
        
        final_answer = reasoning.answer if hasattr(reasoning, 'answer') else direct.answer
        
        return dspy.Prediction(
            answer=final_answer,
            direct_answer=direct.answer,
            reasoning_answer=reasoning.answer,
            context_answer=context.answer
        )


In [None]:
def clean_model_answer(model_answer: str) -> str:
    if not model_answer:
        return ""
    
    # Remove various tags and formatting
    cleaned = re.sub(r'<ANSWER>(.*?)</ANSWER>', r'\1', model_answer, flags=re.DOTALL|re.IGNORECASE)
    cleaned = re.sub(r'<[^>]+>', '', cleaned)  # Remove any remaining tags
    cleaned = re.sub(r'\s+', ' ', cleaned)     # Normalize whitespace
    cleaned = cleaned.strip()
    
    prefixes = ['پاسخ:', 'جواب:', 'Answer:', 'Response:']
    for prefix in prefixes:
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()
    
    return cleaned

def evaluate_answer_with_judge(question: str, correct_answer: str, model_answer: str, judge_lm) -> bool:
    clean_answer = clean_model_answer(model_answer)
    
    prompt = f"""
        شما یک قاضی خبره هستید که پاسخ‌های فارسی را ارزیابی می‌کنید. 
        تعیین کنید که آیا پاسخ مدل از نظر معنایی معادل پاسخ صحیح است یا خیر.
        در نظر بگیرید که تغییرات جزئی در املا و عبارات معادل قابل قبول هستند.

        سوال: {question}

        پاسخ صحیح: {correct_answer}
        پاسخ مدل: {clean_answer}

        اگر پاسخ مدل از نظر معنایی معادل پاسخ صحیح است، فقط "TRUE" بنویسید.
        در غیر این صورت فقط "FALSE" بنویسید.

        پاسخ:
    """
    
    try:
        response = judge_lm(prompt)
        if isinstance(response, list) and len(response) > 0:
            response_text = str(response[0])
        else:
            response_text = str(response)
        
        return "TRUE" in response_text.upper()
    except Exception as e:
        print(f"Error in judge evaluation: {e}")
        return False

evaluation_cache = {}

def enhanced_accuracy_metric(gold, pred, trace=None): 
    cache_key = (gold.question, gold.answer, pred.answer)
    
    if cache_key in evaluation_cache:
        return evaluation_cache[cache_key]
    
    judge_lm = lm_reasoning
    result = evaluate_answer_with_judge(gold.question, gold.answer, pred.answer, judge_lm)
    
    evaluation_cache[cache_key] = result
    return result

In [None]:
# Enhanced data preparation with stratified sampling
def prepare_enhanced_dspy_examples(data_list, sample_size=None):
    examples = []
    
    for item in data_list:
        example = dspy.Example(
            question=item['question'],
            answer=item['answer'],
            question_type=item.get('type', 'unknown'),
            answer_type=item.get('answer_type', 'unknown')
        ).with_inputs('question')
        examples.append(example)
    
    # Stratified sampling if needed
    if sample_size and sample_size < len(examples):
        # Group by answer type for balanced sampling
        type_groups = defaultdict(list)
        for ex in examples:
            type_groups[ex.answer_type].append(ex)
        
        sampled = []
        samples_per_type = max(1, sample_size // len(type_groups))
        
        for type_name, type_examples in type_groups.items():
            n_samples = min(samples_per_type, len(type_examples))
            sampled.extend(random.sample(type_examples, n_samples))
        
        # Fill remaining slots randomly
        remaining = sample_size - len(sampled)
        if remaining > 0:
            remaining_examples = [ex for ex in examples if ex not in sampled]
            if remaining_examples:
                sampled.extend(random.sample(remaining_examples, min(remaining, len(remaining_examples))))
        
        return sampled[:sample_size]
    
    return examples

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Prepare enhanced datasets
mhqa_train_examples = prepare_enhanced_dspy_examples(train_data, sample_size=80)  # Increased training size
mhqa_test_examples = prepare_enhanced_dspy_examples(test_data)

print(f"MHQA Train examples: {len(mhqa_train_examples)}")
print(f"MHQA Test examples: {len(mhqa_test_examples)}")

# Show distribution
train_types = defaultdict(int)
for ex in mhqa_train_examples:
    train_types[ex.answer_type] += 1

print(f"\nTraining data distribution:")
for type_name, count in train_types.items():
    print(f"  {type_name}: {count}")

MHQA Train examples: 80
MHQA Test examples: 152

Training data distribution:
  اسامی عام: 8
  شخص: 9
  بلی/خیر: 13
  تاریخ: 7
  رویداد: 4
  مکان: 6
  اسامی خاص دیگر: 6
  شماره: 6
  کار هنری: 7
  گروه یا سازمان: 7
  صفت: 6
  : 1


# MHQA Advanced Optimization Experiments

We'll use multiple optimization strategies and compare their effectiveness.

## Strategy 1: Enhanced BootstrapFewShot with Random Search

In [7]:
# Strategy 1: Enhanced BootstrapFewShot
print("Strategy 1: Enhanced BootstrapFewShot Optimization...")

# Create enhanced model
mhqa_enhanced_model = EnhancedPersianQAModule()

# Enhanced teleprompter with more aggressive search
teleprompter_enhanced = dspy.BootstrapFewShotWithRandomSearch(
    metric=enhanced_accuracy_metric,
    max_bootstrapped_demos=8,    # Increased
    max_labeled_demos=6,         # Increased  
    max_rounds=3,                # More rounds
    num_candidate_programs=16,   # More candidates
    num_threads=4                # Parallel processing
)

# Optimize
print("Optimizing with enhanced BootstrapFewShot...")
mhqa_enhanced_optimized = teleprompter_enhanced.compile(
    mhqa_enhanced_model,
    trainset=mhqa_train_examples,
    valset=mhqa_train_examples[:20]  # Use subset for validation
)

print("Strategy 1 optimization completed!")

# Show optimized prompt
print("\n" + "="*60)
print("STRATEGY 1 - ENHANCED BOOTSTRAPFEWSHOT RESULTS:")
print("="*60)
for i, predictor in enumerate(mhqa_enhanced_optimized.predictors()):
    print(f"\nPredictor {i+1}:")
    print(f"Signature: {predictor.signature}")
    if hasattr(predictor, 'demos') and predictor.demos:
        print(f"Demonstrations: {len(predictor.demos)}")
        for j, demo in enumerate(predictor.demos[:3]):
            print(f"  Demo {j+1}: {demo.question[:80]}... -> {demo.answer}")
print("="*60)

Strategy 1: Enhanced BootstrapFewShot Optimization...
Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 16 candidate sets.
Optimizing with enhanced BootstrapFewShot...
Average Metric: 5.00 / 20 (25.0%): 100%|██████████| 20/20 [00:15<00:00,  1.26it/s]

2025/09/06 11:48:50 INFO dspy.evaluate.evaluate: Average Metric: 5 / 20 (25.0%)



New best score: 25.0 for seed -3
Scores so far: [25.0]
Best score so far: 25.0
Average Metric: 6.00 / 20 (30.0%): 100%|██████████| 20/20 [00:10<00:00,  1.93it/s]

2025/09/06 11:49:00 INFO dspy.evaluate.evaluate: Average Metric: 6 / 20 (30.0%)



New best score: 30.0 for seed -2
Scores so far: [25.0, 30.0]
Best score so far: 30.0


 34%|███▍      | 27/80 [01:37<03:12,  3.63s/it]


Bootstrapped 8 full traces after 27 examples for up to 3 rounds, amounting to 66 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:09<00:00,  2.19it/s]

2025/09/06 11:50:48 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



New best score: 40.0 for seed -1
Scores so far: [25.0, 30.0, 40.0]
Best score so far: 40.0


 14%|█▍        | 11/80 [00:46<04:53,  4.26s/it]


Bootstrapped 7 full traces after 11 examples for up to 3 rounds, amounting to 20 attempts.
Average Metric: 7.00 / 20 (35.0%): 100%|██████████| 20/20 [00:09<00:00,  2.21it/s]

2025/09/06 11:51:44 INFO dspy.evaluate.evaluate: Average Metric: 7 / 20 (35.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0]
Best score so far: 40.0


  8%|▊         | 6/80 [00:23<04:55,  4.00s/it]


Bootstrapped 3 full traces after 6 examples for up to 3 rounds, amounting to 13 attempts.
Average Metric: 7.00 / 20 (35.0%): 100%|██████████| 20/20 [00:08<00:00,  2.42it/s]

2025/09/06 11:52:16 INFO dspy.evaluate.evaluate: Average Metric: 7 / 20 (35.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0]
Best score so far: 40.0


  2%|▎         | 2/80 [00:07<04:40,  3.59s/it]


Bootstrapped 1 full traces after 2 examples for up to 3 rounds, amounting to 4 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:08<00:00,  2.47it/s]

2025/09/06 11:52:31 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0]
Best score so far: 40.0


  8%|▊         | 6/80 [00:17<03:31,  2.86s/it]


Bootstrapped 4 full traces after 6 examples for up to 3 rounds, amounting to 10 attempts.
Average Metric: 7.00 / 20 (35.0%): 100%|██████████| 20/20 [00:08<00:00,  2.34it/s]

2025/09/06 11:52:57 INFO dspy.evaluate.evaluate: Average Metric: 7 / 20 (35.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0]
Best score so far: 40.0


 14%|█▍        | 11/80 [00:49<05:11,  4.51s/it]


Bootstrapped 4 full traces after 11 examples for up to 3 rounds, amounting to 25 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:14<00:00,  1.40it/s]

2025/09/06 11:54:01 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0]
Best score so far: 40.0


 12%|█▎        | 10/80 [01:04<07:32,  6.46s/it]


Bootstrapped 5 full traces after 10 examples for up to 3 rounds, amounting to 20 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:06<00:00,  2.86it/s]

2025/09/06 11:55:12 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0]
Best score so far: 40.0


  5%|▌         | 4/80 [00:15<04:51,  3.84s/it]


Bootstrapped 2 full traces after 4 examples for up to 3 rounds, amounting to 8 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:09<00:00,  2.02it/s]

2025/09/06 11:55:38 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0]
Best score so far: 40.0


 15%|█▌        | 12/80 [00:46<04:22,  3.86s/it]


Bootstrapped 6 full traces after 12 examples for up to 3 rounds, amounting to 24 attempts.
Average Metric: 7.00 / 20 (35.0%): 100%|██████████| 20/20 [00:11<00:00,  1.81it/s]

2025/09/06 11:56:35 INFO dspy.evaluate.evaluate: Average Metric: 7 / 20 (35.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0]
Best score so far: 40.0


 16%|█▋        | 13/80 [01:12<06:15,  5.61s/it]


Bootstrapped 4 full traces after 13 examples for up to 3 rounds, amounting to 31 attempts.
Average Metric: 7.00 / 20 (35.0%): 100%|██████████| 20/20 [00:07<00:00,  2.73it/s]

2025/09/06 11:57:55 INFO dspy.evaluate.evaluate: Average Metric: 7 / 20 (35.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0]
Best score so far: 40.0


 21%|██▏       | 17/80 [01:10<04:20,  4.13s/it]


Bootstrapped 8 full traces after 17 examples for up to 3 rounds, amounting to 35 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:15<00:00,  1.30it/s]

2025/09/06 11:59:21 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0]
Best score so far: 40.0


  2%|▎         | 2/80 [00:11<07:31,  5.79s/it]


Bootstrapped 1 full traces after 2 examples for up to 3 rounds, amounting to 4 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:08<00:00,  2.31it/s]

2025/09/06 11:59:41 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0, 40.0]
Best score so far: 40.0


 25%|██▌       | 20/80 [01:20<04:00,  4.01s/it]


Bootstrapped 8 full traces after 20 examples for up to 3 rounds, amounting to 44 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:07<00:00,  2.51it/s]

2025/09/06 12:01:09 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0, 40.0, 40.0]
Best score so far: 40.0


 29%|██▉       | 23/80 [01:32<03:48,  4.00s/it]


Bootstrapped 8 full traces after 23 examples for up to 3 rounds, amounting to 53 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:07<00:00,  2.63it/s]

2025/09/06 12:02:49 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0, 40.0, 40.0, 40.0]
Best score so far: 40.0


 10%|█         | 8/80 [00:35<05:17,  4.41s/it]


Bootstrapped 5 full traces after 8 examples for up to 3 rounds, amounting to 14 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:09<00:00,  2.04it/s]

2025/09/06 12:03:34 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0, 40.0, 40.0, 40.0, 40.0]
Best score so far: 40.0


  8%|▊         | 6/80 [00:22<04:41,  3.80s/it]


Bootstrapped 2 full traces after 6 examples for up to 3 rounds, amounting to 14 attempts.
Average Metric: 9.00 / 20 (45.0%): 100%|██████████| 20/20 [00:08<00:00,  2.31it/s]

2025/09/06 12:04:06 INFO dspy.evaluate.evaluate: Average Metric: 9 / 20 (45.0%)



New best score: 45.0 for seed 14
Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0, 40.0, 40.0, 40.0, 40.0, 45.0]
Best score so far: 45.0


 12%|█▎        | 10/80 [00:54<06:19,  5.42s/it]


Bootstrapped 4 full traces after 10 examples for up to 3 rounds, amounting to 22 attempts.
Average Metric: 8.00 / 20 (40.0%): 100%|██████████| 20/20 [00:08<00:00,  2.45it/s]

2025/09/06 12:05:08 INFO dspy.evaluate.evaluate: Average Metric: 8 / 20 (40.0%)



Scores so far: [25.0, 30.0, 40.0, 35.0, 35.0, 40.0, 35.0, 40.0, 40.0, 40.0, 35.0, 35.0, 40.0, 40.0, 40.0, 40.0, 40.0, 45.0, 40.0]
Best score so far: 45.0
19 candidate programs found.
Strategy 1 optimization completed!

STRATEGY 1 - ENHANCED BOOTSTRAPFEWSHOT RESULTS:

Predictor 1:
Signature: EnhancedPersianQASignature(question -> answer
    instructions='Answer Persian/Farsi questions with high accuracy. Focus on providing precise, concise answers that directly address the question. Consider Persian language nuances and cultural context.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question requiring a factual answer', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Precise, concise Persian answer (one to three words when possible)', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Demonstrations: 6
  Demo 1: کدام یک از سبک‌هایی که فرانک وینسنت زاپا در آن آه

## Strategy 2: Multi-Stage Processing

In [8]:
# Strategy 2: Multi-Stage Processing
print("Strategy 2: Multi-Stage Processing Optimization...")

# Create multi-stage model
mhqa_multistage_model = MultiStagePersianQAModule()

# Optimize multi-stage model
teleprompter_multistage = dspy.BootstrapFewShot(
    metric=enhanced_accuracy_metric,
    max_bootstrapped_demos=6,
    max_labeled_demos=4,
    max_rounds=3
)

print("Optimizing multi-stage model...")
mhqa_multistage_optimized = teleprompter_multistage.compile(
    mhqa_multistage_model,
    trainset=mhqa_train_examples
)

print("Strategy 2 optimization completed!")

# Show results
print("\n" + "="*60)
print("STRATEGY 2 - MULTI-STAGE PROCESSING RESULTS:")
print("="*60)
for i, predictor in enumerate(mhqa_multistage_optimized.predictors()):
    print(f"\nPredictor {i+1}:")
    print(f"Signature: {predictor.signature}")
    if hasattr(predictor, 'demos') and predictor.demos:
        print(f"Demonstrations: {len(predictor.demos)}")
print("="*60)

Strategy 2: Multi-Stage Processing Optimization...
Optimizing multi-stage model...


 21%|██▏       | 17/80 [02:25<09:00,  8.58s/it]

Bootstrapped 6 full traces after 17 examples for up to 3 rounds, amounting to 40 attempts.
Strategy 2 optimization completed!

STRATEGY 2 - MULTI-STAGE PROCESSING RESULTS:

Predictor 1:
Signature: PersianQAWithContextSignature(question -> question_type, key_entities, answer
    instructions='Answer Persian questions by first analyzing the question type and required information.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question to analyze and answer', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    question_type = Field(annotation=str required=True json_schema_extra={'desc': 'Type of question (factual, comparison, temporal, etc.)', '__dspy_field_type': 'output', 'prefix': 'Question Type:'})
    key_entities = Field(annotation=str required=True json_schema_extra={'desc': 'Key entities or concepts mentioned in the question', '__dspy_field_type': 'output', 'prefix': 'Key Entities:'})
    answer = Field(annotation=str required=True jso




## Strategy 3: Ensemble Approach

In [9]:
# Strategy 3: Ensemble Approach
print("Strategy 3: Ensemble Approach Optimization...")

# Create ensemble model
mhqa_ensemble_model = EnsemblePersianQAModule()

# Optimize ensemble
teleprompter_ensemble = dspy.BootstrapFewShot(
    metric=enhanced_accuracy_metric,
    max_bootstrapped_demos=5,
    max_labeled_demos=3,
    max_rounds=2
)

print("Optimizing ensemble model...")
mhqa_ensemble_optimized = teleprompter_ensemble.compile(
    mhqa_ensemble_model,
    trainset=mhqa_train_examples
)

print("Strategy 3 optimization completed!")

# Show results
print("\n" + "="*60)
print("STRATEGY 3 - ENSEMBLE APPROACH RESULTS:")
print("="*60)
for i, predictor in enumerate(mhqa_ensemble_optimized.predictors()):
    print(f"\nPredictor {i+1}:")
    print(f"Signature: {predictor.signature}")
print("="*60)

Strategy 3: Ensemble Approach Optimization...
Optimizing ensemble model...


 15%|█▌        | 12/80 [02:23<13:32, 11.95s/it]

Bootstrapped 5 full traces after 12 examples for up to 2 rounds, amounting to 19 attempts.
Strategy 3 optimization completed!

STRATEGY 3 - ENSEMBLE APPROACH RESULTS:

Predictor 1:
Signature: EnhancedPersianQASignature(question -> answer
    instructions='Answer Persian/Farsi questions with high accuracy. Focus on providing precise, concise answers that directly address the question. Consider Persian language nuances and cultural context.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question requiring a factual answer', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Precise, concise Persian answer (one to three words when possible)', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)

Predictor 2:
Signature: StringSignature(question -> reasoning, answer
    instructions='Answer Persian/Farsi questions using step-by-step reasoning. Break down multi-hop questio




## Strategy 4: Enhanced Reasoning with CoT

In [10]:
# Strategy 4: Enhanced Chain-of-Thought Reasoning
print("Strategy 4: Enhanced CoT Reasoning Optimization...")

# Create enhanced reasoning model
mhqa_enhanced_reasoning_model = EnhancedPersianQAWithReasoningModule()

# More intensive optimization for reasoning
teleprompter_reasoning = dspy.BootstrapFewShotWithRandomSearch(
    metric=enhanced_accuracy_metric,
    max_bootstrapped_demos=10,   # More demos for reasoning
    max_labeled_demos=8,
    max_rounds=4,                # More rounds
    num_candidate_programs=20,   # More candidates
    num_threads=4
)

print("Optimizing enhanced reasoning model...")
mhqa_enhanced_reasoning_optimized = teleprompter_reasoning.compile(
    mhqa_enhanced_reasoning_model,
    trainset=mhqa_train_examples,
    valset=mhqa_train_examples[:25]
)

print("Strategy 4 optimization completed!")

# Show results
print("\n" + "="*60)
print("STRATEGY 4 - ENHANCED COT REASONING RESULTS:")
print("="*60)
for i, predictor in enumerate(mhqa_enhanced_reasoning_optimized.predictors()):
    print(f"\nPredictor {i+1}:")
    print(f"Signature: {predictor.signature}")
    if hasattr(predictor, 'demos') and predictor.demos:
        print(f"Demonstrations: {len(predictor.demos)}")
        for j, demo in enumerate(predictor.demos[:2]):
            print(f"  Demo {j+1}: {demo.question[:60]}...")
            print(f"           -> {demo.answer}")
print("="*60)

Strategy 4: Enhanced CoT Reasoning Optimization...
Going to sample between 1 and 10 traces per predictor.
Will attempt to bootstrap 20 candidate sets.
Optimizing enhanced reasoning model...
Average Metric: 7.00 / 14 (50.0%):  52%|█████▏    | 13/25 [00:22<00:16,  1.33s/it]



Average Metric: 12.00 / 25 (48.0%): 100%|██████████| 25/25 [00:32<00:00,  1.29s/it]

2025/09/06 12:10:29 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)



New best score: 48.0 for seed -3
Scores so far: [48.0]
Best score so far: 48.0
Average Metric: 13.00 / 25 (52.0%): 100%|██████████| 25/25 [00:24<00:00,  1.02it/s]

2025/09/06 12:10:54 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)



New best score: 52.0 for seed -2
Scores so far: [48.0, 52.0]
Best score so far: 52.0


 21%|██▏       | 17/80 [01:39<06:10,  5.88s/it]


Bootstrapped 10 full traces after 17 examples for up to 4 rounds, amounting to 38 attempts.
Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:26<00:00,  1.05s/it]

2025/09/06 12:13:00 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)



Scores so far: [48.0, 52.0, 40.0]
Best score so far: 52.0


 14%|█▍        | 11/80 [01:46<11:08,  9.69s/it]


Bootstrapped 7 full traces after 11 examples for up to 4 rounds, amounting to 24 attempts.
Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:26<00:00,  1.07s/it]

2025/09/06 12:15:13 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0]
Best score so far: 52.0


  6%|▋         | 5/80 [01:01<15:25, 12.34s/it]


Bootstrapped 3 full traces after 5 examples for up to 4 rounds, amounting to 14 attempts.
Average Metric: 14.00 / 25 (56.0%): 100%|██████████| 25/25 [00:27<00:00,  1.12s/it]

2025/09/06 12:16:43 INFO dspy.evaluate.evaluate: Average Metric: 14 / 25 (56.0%)



New best score: 56.0 for seed 1
Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0]
Best score so far: 56.0


  2%|▎         | 2/80 [00:21<13:48, 10.63s/it]


Bootstrapped 1 full traces after 2 examples for up to 4 rounds, amounting to 5 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:23<00:00,  1.08it/s]

2025/09/06 12:17:28 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0]
Best score so far: 56.0


  8%|▊         | 6/80 [00:43<08:58,  7.28s/it]


Bootstrapped 4 full traces after 6 examples for up to 4 rounds, amounting to 12 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:26<00:00,  1.06s/it]

2025/09/06 12:18:38 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0]
Best score so far: 56.0


  8%|▊         | 6/80 [00:59<12:07,  9.83s/it]


Bootstrapped 4 full traces after 6 examples for up to 4 rounds, amounting to 13 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:24<00:00,  1.01it/s]

2025/09/06 12:20:02 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0]
Best score so far: 56.0


 31%|███▏      | 25/80 [04:57<10:55, 11.92s/it]


Bootstrapped 10 full traces after 25 examples for up to 4 rounds, amounting to 75 attempts.
Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]

2025/09/06 12:25:21 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0]
Best score so far: 56.0


 16%|█▋        | 13/80 [01:47<09:14,  8.28s/it]


Bootstrapped 10 full traces after 13 examples for up to 4 rounds, amounting to 27 attempts.
Average Metric: 12.00 / 25 (48.0%): 100%|██████████| 25/25 [00:20<00:00,  1.23it/s]

2025/09/06 12:27:29 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0]
Best score so far: 56.0


 22%|██▎       | 18/80 [04:01<13:51, 13.42s/it]


Bootstrapped 6 full traces after 18 examples for up to 4 rounds, amounting to 56 attempts.
Average Metric: 13.00 / 25 (52.0%): 100%|██████████| 25/25 [00:26<00:00,  1.08s/it]

2025/09/06 12:31:57 INFO dspy.evaluate.evaluate: Average Metric: 13 / 25 (52.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0]
Best score so far: 56.0


 16%|█▋        | 13/80 [02:48<14:28, 12.97s/it]


Bootstrapped 4 full traces after 13 examples for up to 4 rounds, amounting to 43 attempts.
Average Metric: 9.00 / 25 (36.0%): 100%|██████████| 25/25 [00:23<00:00,  1.08it/s]

2025/09/06 12:35:09 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0]
Best score so far: 56.0


 18%|█▊        | 14/80 [02:14<10:33,  9.59s/it]


Bootstrapped 8 full traces after 14 examples for up to 4 rounds, amounting to 35 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:30<00:00,  1.21s/it]

2025/09/06 12:37:54 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0]
Best score so far: 56.0


 30%|███       | 24/80 [04:29<10:28, 11.22s/it]


Bootstrapped 10 full traces after 24 examples for up to 4 rounds, amounting to 67 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:20<00:00,  1.21it/s]

2025/09/06 12:42:44 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0]
Best score so far: 56.0


 19%|█▉        | 15/80 [02:34<11:09, 10.30s/it]


Bootstrapped 8 full traces after 15 examples for up to 4 rounds, amounting to 40 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:22<00:00,  1.11it/s]

2025/09/06 12:45:41 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0]
Best score so far: 56.0


 28%|██▊       | 22/80 [04:15<11:13, 11.61s/it]


Bootstrapped 8 full traces after 22 examples for up to 4 rounds, amounting to 68 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:25<00:00,  1.02s/it]

2025/09/06 12:50:22 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0]
Best score so far: 56.0


 10%|█         | 8/80 [01:29<13:22, 11.15s/it]


Bootstrapped 5 full traces after 8 examples for up to 4 rounds, amounting to 17 attempts.
Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:23<00:00,  1.05it/s]

2025/09/06 12:52:15 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0]
Best score so far: 56.0


  8%|▊         | 6/80 [01:03<13:08, 10.65s/it]


Bootstrapped 2 full traces after 6 examples for up to 4 rounds, amounting to 18 attempts.
Average Metric: 9.00 / 25 (36.0%): 100%|██████████| 25/25 [00:21<00:00,  1.16it/s]

2025/09/06 12:53:40 INFO dspy.evaluate.evaluate: Average Metric: 9 / 25 (36.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0, 36.0]
Best score so far: 56.0


 12%|█▎        | 10/80 [01:53<13:17, 11.39s/it]


Bootstrapped 4 full traces after 10 examples for up to 4 rounds, amounting to 30 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:27<00:00,  1.09s/it]

2025/09/06 12:56:01 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0, 36.0, 44.0]
Best score so far: 56.0


 16%|█▋        | 13/80 [02:01<10:26,  9.34s/it]


Bootstrapped 6 full traces after 13 examples for up to 4 rounds, amounting to 34 attempts.
Average Metric: 12.00 / 25 (48.0%): 100%|██████████| 25/25 [00:21<00:00,  1.18it/s]

2025/09/06 12:58:24 INFO dspy.evaluate.evaluate: Average Metric: 12 / 25 (48.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0, 36.0, 44.0, 48.0]
Best score so far: 56.0


 24%|██▍       | 19/80 [02:46<08:54,  8.77s/it]


Bootstrapped 9 full traces after 19 examples for up to 4 rounds, amounting to 49 attempts.
Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:20<00:00,  1.21it/s]

2025/09/06 13:01:31 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0, 36.0, 44.0, 48.0, 40.0]
Best score so far: 56.0


  9%|▉         | 7/80 [01:23<14:27, 11.88s/it]


Bootstrapped 3 full traces after 7 examples for up to 4 rounds, amounting to 22 attempts.
Average Metric: 11.00 / 25 (44.0%): 100%|██████████| 25/25 [00:23<00:00,  1.05it/s]

2025/09/06 13:03:18 INFO dspy.evaluate.evaluate: Average Metric: 11 / 25 (44.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0, 36.0, 44.0, 48.0, 40.0, 44.0]
Best score so far: 56.0


  4%|▍         | 3/80 [00:42<18:12, 14.19s/it]


Bootstrapped 1 full traces after 3 examples for up to 4 rounds, amounting to 9 attempts.
Average Metric: 10.00 / 25 (40.0%): 100%|██████████| 25/25 [00:22<00:00,  1.12it/s]

2025/09/06 13:04:23 INFO dspy.evaluate.evaluate: Average Metric: 10 / 25 (40.0%)



Scores so far: [48.0, 52.0, 40.0, 40.0, 56.0, 44.0, 44.0, 44.0, 40.0, 48.0, 52.0, 36.0, 44.0, 44.0, 44.0, 44.0, 40.0, 36.0, 44.0, 48.0, 40.0, 44.0, 40.0]
Best score so far: 56.0
23 candidate programs found.
Strategy 4 optimization completed!

STRATEGY 4 - ENHANCED COT REASONING RESULTS:

Predictor 1:
Signature: StringSignature(question -> reasoning, answer
    instructions='Answer Persian/Farsi questions using step-by-step reasoning. Break down multi-hop questions into logical steps.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question requiring multi-step reasoning', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    reasoning = Field(annotation=str required=True json_schema_extra={'desc': 'Step-by-step reasoning process in Persian', '__dspy_field_type': 'output', 'prefix': 'Reasoning:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Final precise Persian answer based on reasoning', '__dspy_field_type': '

# Comprehensive Evaluation

Now we'll evaluate all four strategies and compare their performance.

In [None]:
def evaluate_model_comprehensive(model, model_name, test_examples, max_examples=None):
    print(f"\nEvaluating {model_name}...")
    
    if max_examples:
        test_examples = test_examples[:max_examples]
    
    results = []
    judge_lm = lm_reasoning
    
    # Track performance by answer type
    type_performance = defaultdict(lambda: {'correct': 0, 'total': 0})
    
    for example in tqdm(test_examples, desc=f"Testing {model_name}"):
        try:
            prediction = model(question=example.question)
            model_answer = prediction.answer
        except Exception as e:
            model_answer = f"Error: {e}"
        
        is_correct = evaluate_answer_with_judge(
            example.question,
            example.answer, 
            model_answer,
            judge_lm
        )
        
        # Track by type
        answer_type = getattr(example, 'answer_type', 'unknown')
        type_performance[answer_type]['total'] += 1
        if is_correct:
            type_performance[answer_type]['correct'] += 1
        
        results.append({
            'question': example.question,
            'expected_answer': example.answer,
            'model_answer': model_answer,
            'clean_model_answer': clean_model_answer(model_answer),
            'is_correct': is_correct,
            'answer_type': answer_type,
            'question_type': getattr(example, 'question_type', 'unknown')
        })
    
    # Calculate metrics
    total_correct = sum(1 for r in results if r['is_correct'])
    total_questions = len(results)
    accuracy = total_correct / total_questions if total_questions > 0 else 0
    
    print(f"\n{model_name} Results:")
    print(f"  Overall Accuracy: {accuracy:.3f} ({total_correct}/{total_questions})")
    
    print(f"\n  Performance by Answer Type:")
    for answer_type, perf in type_performance.items():
        if perf['total'] > 0:
            type_acc = perf['correct'] / perf['total']
            print(f"    {answer_type}: {type_acc:.3f} ({perf['correct']}/{perf['total']})")
    
    return results, accuracy, type_performance



In [None]:
print("\n" + "="*80)
print("COMPREHENSIVE MHQA EVALUATION - ALL STRATEGIES")
print("="*80)

# Use subset for faster evaluation and reduced costs
eval_subset_size = 100  

strategies = [
    (mhqa_enhanced_optimized, "Strategy 1: Enhanced BootstrapFewShot"),
    (mhqa_multistage_optimized, "Strategy 2: Multi-Stage Processing"),
    (mhqa_ensemble_optimized, "Strategy 3: Ensemble Approach"),
    (mhqa_enhanced_reasoning_optimized, "Strategy 4: Enhanced CoT Reasoning")
]

all_results = {}
all_accuracies = {}

for model, name in strategies:
    results, accuracy, type_perf = evaluate_model_comprehensive(
        model, name, mhqa_test_examples, max_examples=eval_subset_size
    )
    
    all_results[name] = results
    all_accuracies[name] = accuracy
    
    filename = f"mhqa_dspy2_{name.lower().replace(' ', '_').replace(':', '')}_results.csv"
    pd.DataFrame(results).to_csv(filename, index=False)
    print(f"  Saved to: {filename}")

print("\n" + "="*80)
print("FINAL MHQA COMPARISON:")
print("="*80)
for name, accuracy in sorted(all_accuracies.items(), key=lambda x: x[1], reverse=True):
    print(f"{accuracy:.3f} - {name}")
print("="*80)


COMPREHENSIVE MHQA EVALUATION - ALL STRATEGIES

Evaluating Strategy 1: Enhanced BootstrapFewShot...


Testing Strategy 1: Enhanced BootstrapFewShot: 100%|██████████| 100/100 [04:03<00:00,  2.44s/it]



Strategy 1: Enhanced BootstrapFewShot Results:
  Overall Accuracy: 0.490 (49/100)

  Performance by Answer Type:
    بلی/خیر: 0.762 (32/42)
    شخص: 0.364 (4/11)
    اسامی خاص دیگر: 0.167 (1/6)
    مکان: 0.667 (2/3)
    صفت: 0.000 (0/2)
    کار هنری: 0.167 (1/6)
    تاریخ: 0.167 (1/6)
    گروه یا سازمان: 0.600 (6/10)
    رویداد: 1.000 (1/1)
    اسامی عام: 0.000 (0/10)
    شماره: 0.333 (1/3)
  Saved to: mhqa_dspy2_strategy_1_enhanced_bootstrapfewshot_results.csv

Evaluating Strategy 2: Multi-Stage Processing...


Testing Strategy 2: Multi-Stage Processing: 100%|██████████| 100/100 [06:39<00:00,  3.99s/it]



Strategy 2: Multi-Stage Processing Results:
  Overall Accuracy: 0.480 (48/100)

  Performance by Answer Type:
    بلی/خیر: 0.690 (29/42)
    شخص: 0.364 (4/11)
    اسامی خاص دیگر: 0.333 (2/6)
    مکان: 0.667 (2/3)
    صفت: 0.000 (0/2)
    کار هنری: 0.500 (3/6)
    تاریخ: 0.167 (1/6)
    گروه یا سازمان: 0.600 (6/10)
    رویداد: 0.000 (0/1)
    اسامی عام: 0.000 (0/10)
    شماره: 0.333 (1/3)
  Saved to: mhqa_dspy2_strategy_2_multi-stage_processing_results.csv

Evaluating Strategy 3: Ensemble Approach...


Testing Strategy 3: Ensemble Approach: 100%|██████████| 100/100 [13:16<00:00,  7.97s/it]



Strategy 3: Ensemble Approach Results:
  Overall Accuracy: 0.500 (50/100)

  Performance by Answer Type:
    بلی/خیر: 0.762 (32/42)
    شخص: 0.273 (3/11)
    اسامی خاص دیگر: 0.500 (3/6)
    مکان: 1.000 (3/3)
    صفت: 0.000 (0/2)
    کار هنری: 0.167 (1/6)
    تاریخ: 0.500 (3/6)
    گروه یا سازمان: 0.500 (5/10)
    رویداد: 0.000 (0/1)
    اسامی عام: 0.000 (0/10)
    شماره: 0.000 (0/3)
  Saved to: mhqa_dspy2_strategy_3_ensemble_approach_results.csv

Evaluating Strategy 4: Enhanced CoT Reasoning...


Testing Strategy 4: Enhanced CoT Reasoning: 100%|██████████| 100/100 [06:40<00:00,  4.01s/it]


Strategy 4: Enhanced CoT Reasoning Results:
  Overall Accuracy: 0.500 (50/100)

  Performance by Answer Type:
    بلی/خیر: 0.762 (32/42)
    شخص: 0.364 (4/11)
    اسامی خاص دیگر: 0.667 (4/6)
    مکان: 0.667 (2/3)
    صفت: 0.000 (0/2)
    کار هنری: 0.167 (1/6)
    تاریخ: 0.333 (2/6)
    گروه یا سازمان: 0.500 (5/10)
    رویداد: 0.000 (0/1)
    اسامی عام: 0.000 (0/10)
    شماره: 0.000 (0/3)
  Saved to: mhqa_dspy2_strategy_4_enhanced_cot_reasoning_results.csv

FINAL MHQA COMPARISON:
0.500 - Strategy 3: Ensemble Approach
0.500 - Strategy 4: Enhanced CoT Reasoning
0.490 - Strategy 1: Enhanced BootstrapFewShot
0.480 - Strategy 2: Multi-Stage Processing





# PQUAD Advanced Optimization

Now let's apply the best performing strategy to PQUAD dataset.

In [None]:
pquad_examples = prepare_enhanced_dspy_examples(pquad_data)
pquad_train_examples = pquad_examples[:50]  # selected more compared to first notebook for better optimization
pquad_test_examples = pquad_examples[50:]   

print(f"PQUAD Train examples: {len(pquad_train_examples)}")
print(f"PQUAD Test examples: {len(pquad_test_examples)}")

# Determine best strategy from results
best_strategy_name = max(all_accuracies.keys(), key=lambda k: all_accuracies[k])
print(f"\nApplying best strategy to PQUAD: {best_strategy_name}")
print(f"Best MHQA accuracy: {all_accuracies[best_strategy_name]:.3f}")

PQUAD Train examples: 50
PQUAD Test examples: 100

Applying best strategy to PQUAD: Strategy 3: Ensemble Approach
Best MHQA accuracy: 0.500


In [None]:
# Apply best strategy to PQUAD
print("Optimizing PQUAD with Strategy 3: Ensemble Approach...")

pquad_model = EnsemblePersianQAModule()
pquad_teleprompter = dspy.BootstrapFewShot(
    metric=enhanced_accuracy_metric,
    max_bootstrapped_demos=5,
    max_labeled_demos=3,
    max_rounds=2
)

print("Optimizing PQUAD model...")
if hasattr(pquad_teleprompter, 'num_candidate_programs'):  # BootstrapFewShotWithRandomSearch
    pquad_optimized = pquad_teleprompter.compile(
        pquad_model,
        trainset=pquad_train_examples,
        valset=pquad_train_examples[:15] if len(pquad_train_examples) > 15 else None
    )
else:  # Regular BootstrapFewShot
    pquad_optimized = pquad_teleprompter.compile(
        pquad_model,
        trainset=pquad_train_examples
    )

print("PQUAD optimization completed!")

# Show PQUAD optimization results
print("\n" + "="*60)
print(f"PQUAD OPTIMIZATION RESULTS ({best_strategy_name}):")
print("="*60)
for i, predictor in enumerate(pquad_optimized.predictors()):
    print(f"\nPredictor {i+1}:")
    print(f"Signature: {predictor.signature}")
    if hasattr(predictor, 'demos') and predictor.demos:
        print(f"Demonstrations: {len(predictor.demos)}")
print("="*60)


Optimizing PQUAD with Strategy 3: Ensemble Approach...
Optimizing PQUAD model...


 30%|███       | 15/50 [03:34<08:20, 14.29s/it]

Bootstrapped 5 full traces after 15 examples for up to 2 rounds, amounting to 26 attempts.
PQUAD optimization completed!

PQUAD OPTIMIZATION RESULTS (Strategy 3: Ensemble Approach):

Predictor 1:
Signature: EnhancedPersianQASignature(question -> answer
    instructions='Answer Persian/Farsi questions with high accuracy. Focus on providing precise, concise answers that directly address the question. Consider Persian language nuances and cultural context.'
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian question requiring a factual answer', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Precise, concise Persian answer (one to three words when possible)', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Demonstrations: 5

Predictor 2:
Signature: StringSignature(question -> reasoning, answer
    instructions='Answer Persian/Farsi questions using step-by-step reason




In [None]:
print("\n" + "="*60)
print("PQUAD EVALUATION")
print("="*60)

pquad_results, pquad_accuracy, pquad_type_perf = evaluate_model_comprehensive(
    pquad_optimized, f"PQUAD {best_strategy_name}", pquad_test_examples, max_examples=80
)

# Save PQUAD results
pquad_filename = f"pquad_dspy2_{best_strategy_name.lower().replace(' ', '_').replace(':', '')}_results.csv"
pd.DataFrame(pquad_results).to_csv(pquad_filename, index=False)
print(f"\nPQUAD results saved to: {pquad_filename}")


PQUAD EVALUATION

Evaluating PQUAD Strategy 3: Ensemble Approach...


Testing PQUAD Strategy 3: Ensemble Approach: 100%|██████████| 80/80 [12:26<00:00,  9.33s/it]


PQUAD Strategy 3: Ensemble Approach Results:
  Overall Accuracy: 0.375 (30/80)

  Performance by Answer Type:
    unknown: 0.375 (30/80)

PQUAD results saved to: pquad_dspy2_strategy_3_ensemble_approach_results.csv





# Final Results Summary and Analysis

In [None]:
print("\n" + "="*80)
print("FINAL COMPREHENSIVE RESULTS - ADVANCED DSPY OPTIMIZATION")
print("="*80)

print(f"\nMHQA Dataset Results (Top {len(all_accuracies)} Strategies):")
for i, (name, accuracy) in enumerate(sorted(all_accuracies.items(), key=lambda x: x[1], reverse=True), 1):
    improvement = "" if i == len(all_accuracies) else f" (+{accuracy - min(all_accuracies.values()):.3f})"
    print(f"  {i}. {accuracy:.3f}{improvement} - {name}")

print(f"\nPQUAD Dataset Results:")
print(f"  Best Strategy: {pquad_accuracy:.3f} - {best_strategy_name}")




FINAL COMPREHENSIVE RESULTS - ADVANCED DSPY OPTIMIZATION

MHQA Dataset Results (Top 4 Strategies):
  1. 0.500 (+0.020) - Strategy 3: Ensemble Approach
  2. 0.500 (+0.020) - Strategy 4: Enhanced CoT Reasoning
  3. 0.490 (+0.010) - Strategy 1: Enhanced BootstrapFewShot
  4. 0.480 - Strategy 2: Multi-Stage Processing

PQUAD Dataset Results:
  Best Strategy: 0.375 - Strategy 3: Ensemble Approach


In [None]:
print("\n" + "="*60)
print("SAMPLE PREDICTIONS FROM BEST MODEL")
print("="*60)

# Get best model
best_model = None
for model, name in strategies:
    if name == best_strategy_name:
        best_model = model
        break

if best_model:
    sample_examples = mhqa_test_examples[:5]  
    
    for i, example in enumerate(sample_examples, 1):
        try:
            prediction = best_model(question=example.question)
            model_answer = prediction.answer
        except Exception as e:
            model_answer = f"Error: {e}"
        
        print(f"\nExample {i}:")
        print(f"Q: {example.question}")
        print(f"Expected: {example.answer}")
        print(f"Predicted: {clean_model_answer(model_answer)}")
        
        is_correct = evaluate_answer_with_judge(
            example.question, example.answer, model_answer, lm_reasoning
        )
        print(f"Correct: {'True' if is_correct else 'False'}")


SAMPLE PREDICTIONS FROM BEST MODEL

Example 1:
Q: آیا کاترین لانگفورد نقش دختر بچه نوجوانی که هانا بیکر نام داشت را در سریال ۱۳ دلیل برای اینکه بازی کرده است ؟
Expected: بله
Predicted: بله، کاترین لانگفورد نقش هانا بیکر را در سریال "۱۳ دلیل برای اینکه" بازی کرده است.
Correct: True

Example 2:
Q: کدام یک از بازیگران فیلم اثر پروانه‌ای در فیلم آینه‌ها نیز ایفای نقش کرده است ؟
Expected: ایمی اسمارت
Predicted: آشتون کچر
Correct: False

Example 3:
Q: کدام بازی ویدیویی شرکتی ژاپنی منبع الهام طراحی بازی سوپرتاکس بوده است ؟
Expected: برادران سوپر ماریو
Predicted: بازی سوپر ماریو
Correct: True

Example 4:
Q: یکی از بازی های معروف شرکتی که اولین کنسول بازی ویدیویی خود را در سال ۱۹۹۷ توزیع کرد ؟
Expected: برادران سوپر ماریو
Predicted: سونی
Correct: False

Example 5:
Q: آیا جایزه ای که سیدیبه در سال 2007 آن را دریافت کرد برای افرادی است که نقش مهمی در سینما داشته‌اند ؟
Expected: بله
Predicted: بله، جایزه‌ای که سیدیبه در سال 2007 دریافت کرد برای افرادی است که نقش مهمی در سینما داشته‌اند.
Correct: 