# HippoRAG with DSPy Optimization - PersianMHQA Dataset


In [None]:
import json
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
import igraph as ig
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Dict, Any
import dotenv
import os
import pandas as pd
import dspy
import re
from collections import defaultdict
import random

dotenv.load_dotenv()


True

In [None]:
# --- Notebook-wide configuration -------------------------------------------
# JSON file with the graph's "nodes" and "edges"; read by HippoRAGRetriever._load_data.
GRAPH_INDEX_PATH = "hippocampal_index.json"
# JSON list of per-passage records (id, entities, triples, passage); read by _load_data.
SOURCE_TRIPLES_PATH = "knowledge_graph_triples_structured.json"
# JSON file with the original passages; only its length is used by the retriever.
ORIGINAL_CONTEXT_PATH = "all_context.json"
# SentenceTransformer checkpoint used to embed node labels and query entities.
ENCODER_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
# Chat model name shared by the DSPy LMs and the LangChain entity extractor.
MODEL_NAME = "gpt-4o-mini"
# Passed as `damping` to igraph's personalized_pagerank in HippoRAGRetriever.retrieve.
PPR_DAMPING_FACTOR = 0.5

import atexit
import weakref

# Weak references to LM objects whose underlying HTTP client should be closed
# when the interpreter exits. Weak references are used so that this registry
# never keeps an LM alive on its own.
_dspy_clients = []


def cleanup_dspy_clients():
    """Best-effort close of every registered client that is still alive.

    Each entry of ``_dspy_clients`` is a ``weakref.ref``. For every referent
    that has not been garbage-collected, ``referent.client.close()`` is called
    when both attributes exist. Errors raised while closing are ignored on
    purpose: this runs from an ``atexit`` hook, where a failing close must not
    prevent the remaining clients from being cleaned up.
    """
    for client_ref in _dspy_clients:
        client = client_ref()
        if client is None:
            # Referent was already collected; nothing to close.
            continue
        try:
            inner = getattr(client, 'client', None)
            close = getattr(inner, 'close', None)
            if callable(close):
                close()
        except Exception:
            # Deliberately swallowed (shutdown path). Unlike a bare `except:`,
            # this still lets KeyboardInterrupt / SystemExit propagate.
            pass


atexit.register(cleanup_dspy_clients)

# Connection settings shared by the answering LM and the judge LM; the two
# only differ in their token budget and sampling temperature.
_lm_connection = {
    "model": f"openai/{MODEL_NAME}",
    "api_key": os.getenv("METIS_API_KEY"),
    "api_base": "https://api.metisai.ir/openai/v1",
}

# LM used by the QA modules (and as the DSPy default below).
lm = dspy.LM(max_tokens=400, temperature=0.1, **_lm_connection)

# LM used only by the TRUE/FALSE judge in the evaluation metric.
lm_judge = dspy.LM(max_tokens=200, temperature=0.0, **_lm_connection)

# Store weak references for cleanup
_dspy_clients.extend(weakref.ref(model) for model in (lm, lm_judge))

dspy.configure(lm=lm)

# HippoRAG Retriever (Pre-built)

We'll use the same HippoRAG retriever from the original KG_PPR_MHQA.ipynb notebook.

In [None]:
# Structured-output schema for query entity extraction: it is handed to
# `self.llm.with_structured_output(...)` in HippoRAGRetriever._extract_query_entities,
# and the parsed `named_entities` list is what that method returns.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose - pydantic presumably copies a model docstring into the generated
# schema description, which would change what is sent to the LLM. Confirm
# before converting this into a docstring.
class QueryEntities(BaseModel):
    # Required field (`...` = no default); the description text is part of the schema.
    named_entities: List[str] = Field(
        ...,
        description="A list of named entities that are important for solving the user's question."
    )


class HippoRAGRetriever:
    """Passage retriever that ranks passages with Personalized PageRank (PPR).

    Construction loads a pre-built graph index and the per-passage triples,
    builds a weighted undirected igraph graph, computes a per-node specificity
    score and embeds every node label. ``retrieve`` then:

    1. asks the LLM for the named entities in the query,
    2. maps each entity to its most similar graph node (cosine similarity),
    3. runs PPR seeded on those nodes (weighted by node specificity),
    4. scores every passage as the sum of the PPR scores of its nodes.

    NOTE(review): the code uses ``node['id']`` as the igraph vertex index and
    uses the row position in ``graph_data['nodes']`` as a node id in
    ``_find_query_nodes``. Both only agree when node ids are ``0..N-1`` in file
    order - confirm against the index builder.
    """

    def __init__(self):
        """Load data, build the graph and pre-encode all node labels."""
        print("Initializing HippoRAGRetriever for PersianMHQA...")
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self._load_data()

        self.encoder_model = SentenceTransformer(ENCODER_MODEL_NAME, device=self.device)
        self.llm = ChatOpenAI(
            model=MODEL_NAME,
            temperature=0,
            api_key=os.getenv("METIS_API_KEY"),
            base_url="https://api.metisai.ir/openai/v1",
        )

        self._build_graph()
        self._compute_node_specificity()
        self._create_passage_to_node_mapping()

        print("Pre-encoding all graph nodes for retrieval...")
        self.node_labels = [node['label'] for node in self.graph_data['nodes']]
        self.node_embeddings = self.encoder_model.encode(
            self.node_labels, show_progress_bar=True, convert_to_tensor=True
        )

    def _load_data(self):
        """Read the graph index, the source triples and the original passages.

        Raises:
            FileNotFoundError: if any of the three JSON files is missing. The
                missing path is printed first, then the error is re-raised
                (previously this called ``exit()``, which terminates the whole
                interpreter/kernel instead of reporting a normal error).
        """
        print("Loading graph index and source data...")  # DEBUG
        try:
            with open(GRAPH_INDEX_PATH, 'r', encoding='utf-8') as f:
                self.graph_data = json.load(f)
            with open(SOURCE_TRIPLES_PATH, 'r', encoding='utf-8') as f:
                self.source_triples = json.load(f)
            with open(ORIGINAL_CONTEXT_PATH, 'r', encoding='utf-8') as f:
                self.passages = json.load(f)
        except FileNotFoundError as e:
            print(f"Error: Could not find a file: {e.filename}")
            raise

    def _build_graph(self):
        """Build the undirected weighted graph from ``self.graph_data``.

        Edges whose source or target label is unknown are skipped. Synonymy
        edges take their ``score`` as weight (0.8 when absent); every other
        edge type gets weight 1.0.
        """
        print("Building weighted igraph object...")  # DEBUG
        self.g = ig.Graph(directed=False)
        self.g.add_vertices(len(self.graph_data['nodes']))

        self.node_label_to_id = {node['label']: node['id'] for node in self.graph_data['nodes']}

        edges = []
        weights = []
        for edge_data in self.graph_data['edges']:
            source_id = self.node_label_to_id.get(edge_data['source'])
            target_id = self.node_label_to_id.get(edge_data['target'])
            if source_id is None or target_id is None:
                continue

            edges.append((source_id, target_id))
            if edge_data['type'] == 'synonymy':
                weights.append(edge_data.get('score', 0.8))
            else:
                weights.append(1.0)

        self.g.add_edges(edges)
        self.g.es['weight'] = weights
        print(f"Graph built with {self.g.vcount()} vertices and {self.g.ecount()} edges.")

    @staticmethod
    def _labels_in_item(item):
        """Return the set of node labels mentioned by one source-triples record.

        That is the record's ``entities`` plus the subject and object of each
        of its ``triples``.
        """
        labels = set(item.get('entities', []))
        for triple in item.get('triples', []):
            labels.add(triple['subject'])
            labels.add(triple['object'])
        return labels

    def _compute_node_specificity(self):
        """Set ``self.node_specificity[node_id] = 1 / (#distinct passages mentioning it)``.

        Nodes that appear in no passage get specificity 0.
        """
        print("Computing node specificity...")  # DEBUG
        passages_per_node = defaultdict(set)
        for item in self.source_triples:
            passage_id = item['id']
            for node_label in self._labels_in_item(item):
                node_id = self.node_label_to_id.get(node_label)
                if node_id is not None:
                    passages_per_node[node_id].add(passage_id)

        # Every known node starts at 0; nodes seen in passages are overwritten.
        self.node_specificity = {node_id: 0 for node_id in range(len(self.node_label_to_id))}
        for node_id, passage_ids in passages_per_node.items():
            self.node_specificity[node_id] = 1.0 / len(passage_ids)

    def _create_passage_to_node_mapping(self):
        """Map each passage id to the node ids it mentions.

        Also records ``self.passage_text_by_id`` (passage id -> passage text)
        so that ``retrieve`` can return the text belonging to the id it
        ranked, independent of the record order in the source file.
        """
        self.passage_to_nodes = {i: [] for i in range(len(self.passages))}
        self.passage_text_by_id = {}
        for item in self.source_triples:
            passage_idx = item['id']
            node_ids_in_passage = {
                self.node_label_to_id.get(label) for label in self._labels_in_item(item)
            }
            self.passage_to_nodes[passage_idx] = [
                nid for nid in node_ids_in_passage if nid is not None
            ]
            if 'passage' in item:
                self.passage_text_by_id[passage_idx] = item['passage']

    def _passage_text(self, passage_idx):
        """Return the text of the passage identified by ``passage_idx``.

        Looks the id up in ``passage_text_by_id`` first; if it is not there,
        falls back to the original positional lookup in ``source_triples``.
        """
        texts = getattr(self, 'passage_text_by_id', None)
        if texts and passage_idx in texts:
            return texts[passage_idx]
        return self.source_triples[passage_idx]["passage"]

    def _extract_query_entities(self, query: str) -> List[str]:
        """Ask the LLM for the named entities in ``query``.

        Returns an empty list (after printing the error) when the LLM call or
        the structured-output parsing fails.
        """
        system_prompt = """
        Instruction:
        Your task is to extract named entities from the given paragraph in the user's message.
        Respond with a JSON list of entities.

        One-Shot Demonstration:
        If the user provides the paragraph:
        "Radio City is India's first private FM radio station and was started on 3 July 2001. It plays Hindi, English and regional songs. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features."

        Your output should be a JSON object with a list of the extracted entities:
        {{"named_entities": ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]}}
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "Question: {query}")
        ])
        chain = prompt | self.llm.with_structured_output(QueryEntities)
        try:
            response = chain.invoke({"query": query})
            return response.named_entities
        except Exception as e:
            # Best-effort: the caller treats "no entities" as "no results".
            print(f"Error extracting query entities: {e}")
            return []

    def _find_query_nodes(self, query_entities: List[str]) -> List[int]:
        """Return the distinct best-matching node index for each query entity.

        Each entity is embedded and matched to the node label with the highest
        cosine similarity. Duplicates are removed, keeping first-seen order.
        """
        if not query_entities:
            return []

        query_embeddings = self.encoder_model.encode(query_entities, convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embeddings, self.node_embeddings)

        best_node_per_entity = torch.argmax(cos_scores, dim=1).tolist()
        return list(dict.fromkeys(best_node_per_entity))

    def retrieve(self, query: str, top_k: int = 5) -> List[dict]:
        """Return the ``top_k`` highest-scoring passages for ``query``.

        Returns:
            A list of dicts with keys ``passage_index``, ``passage_text`` and
            ``score``, sorted by descending score. Empty when no entities are
            extracted or no graph nodes are matched.
        """
        query_entities = self._extract_query_entities(query)
        if not query_entities:
            return []

        query_node_ids = self._find_query_nodes(query_entities)
        if not query_node_ids:
            return []

        personalization_vector = np.zeros(self.g.vcount())
        for node_id in query_node_ids:
            personalization_vector[node_id] = self.node_specificity.get(node_id, 0)

        total_specificity = personalization_vector.sum()
        if total_specificity > 0:
            personalization_vector /= total_specificity
        else:
            # Every matched node has specificity 0 (it occurs in no passage).
            # An all-zero reset vector is not a valid PPR seed, so seed the
            # matched nodes uniformly instead of failing the whole query.
            personalization_vector[query_node_ids] = 1.0 / len(query_node_ids)

        ppr_scores = self.g.personalized_pagerank(
            vertices=None,
            directed=False,
            damping=PPR_DAMPING_FACTOR,
            reset=personalization_vector.tolist(),
            weights=self.g.es['weight']
        )

        # A passage's score is the total PPR mass of the nodes it mentions.
        passage_scores = {
            passage_idx: sum(ppr_scores[node_id] for node_id in node_ids)
            for passage_idx, node_ids in self.passage_to_nodes.items()
        }
        sorted_passages = sorted(passage_scores.items(), key=lambda item: item[1], reverse=True)

        return [
            {
                "passage_index": passage_idx,
                "passage_text": self._passage_text(passage_idx),
                "score": score,
            }
            for passage_idx, score in sorted_passages[:top_k]
        ]

In [None]:
# Build the single retriever instance shared by all DSPy modules below.
# Construction reads the JSON index files, builds the graph and encodes every
# node label, so this cell is the expensive one-time setup step.
retriever = HippoRAGRetriever()

Initializing HippoRAGRetriever for PersianMHQA...
Loading graph index and source data...
Building weighted igraph object...
Graph built with 4644 vertices and 5980 edges.
Computing node specificity...
Pre-encoding all graph nodes for retrieval...


Batches:   0%|          | 0/146 [00:00<?, ?it/s]

Initialization complete.


# DSPy QA Module

Now we'll implement the Question Answering system using DSPy for automatic prompt optimization.

In [None]:
# Plain (context, question) -> answer signature. Wrapped in dspy.Predict by
# DSPyHippoRAGMHQAModule and by EnsemblePersianMHQAModule (as `qa_direct`).
# The class docstring is runtime text: DSPy reports it as the signature's
# `instructions` (see the Strategy 1 output), so it must not be reworded
# casually. The `desc` strings are likewise part of the signature.
class PersianMHQASignature(dspy.Signature):
    """Answer Persian/Farsi multi-hop questions based on provided context passages. Handle bridge questions requiring multiple reasoning steps."""
    context = dspy.InputField(desc="Retrieved context passages relevant to the multi-hop question")
    question = dspy.InputField(desc="Persian multi-hop question requiring complex reasoning")
    answer = dspy.OutputField(desc="Precise, concise Persian answer based on multi-hop reasoning from context")

# (context, question) -> (reasoning, answer) signature. Wrapped in
# dspy.ChainOfThought by DSPyHippoRAGMHQAWithReasoningModule and by
# EnsemblePersianMHQAModule (as `qa_reasoning`).
# The docstring and `desc` strings are runtime prompt text - keep them verbatim.
# NOTE(review): this signature declares its own `reasoning` output while also
# being used with ChainOfThought; confirm how the installed DSPy version
# handles a signature that already has a field of that name.
class PersianMHQAWithReasoningSignature(dspy.Signature):
    """Answer Persian/Farsi multi-hop questions using step-by-step reasoning based on context passages."""
    context = dspy.InputField(desc="Retrieved context passages relevant to the multi-hop question")
    question = dspy.InputField(desc="Persian multi-hop question requiring complex reasoning")
    reasoning = dspy.OutputField(desc="Step-by-step multi-hop reasoning process in Persian")
    answer = dspy.OutputField(desc="Final precise Persian answer based on multi-hop reasoning")

# (context, question) -> (relevant_evidence, answer) signature. Wrapped in
# dspy.Predict by EnhancedDSPyHippoRAGMHQAModule and by
# EnsemblePersianMHQAModule (as `qa_enhanced`).
# The docstring and `desc` strings are runtime prompt text - keep them verbatim.
class EnhancedPersianMHQASignature(dspy.Signature):
    """Advanced Persian multi-hop QA with context analysis and evidence-based answers."""
    context = dspy.InputField(desc="Retrieved context passages with relevance scores for multi-hop reasoning")
    question = dspy.InputField(desc="Persian multi-hop question to answer comprehensively")
    relevant_evidence = dspy.OutputField(desc="Key evidence from context that supports the multi-hop answer")
    answer = dspy.OutputField(desc="Accurate Persian answer with supporting evidence from multi-hop reasoning")

In [None]:
class DSPyHippoRAGMHQAModule(dspy.Module):
    """Retrieve passages with HippoRAG, then answer with a single dspy.Predict call.

    ``forward`` returns a ``dspy.Prediction`` whose ``answer`` is the model's
    answer, or the sentinel ``"Error"`` / ``"No Context found"`` when retrieval
    or generation fails or nothing is retrieved.
    """

    def __init__(self, retriever: HippoRAGRetriever):
        super().__init__()
        self.retriever = retriever
        self.generate_answer = dspy.Predict(PersianMHQASignature)

    def forward(self, question, top_k_retrieval=5):
        # Step 1: retrieval (any failure is reported and mapped to "Error").
        try:
            passages = self.retriever.retrieve(question, top_k=top_k_retrieval)
        except Exception as e:
            print(f"Error in retrieval: {e}")
            return dspy.Prediction(answer="Error")

        if not passages:
            return dspy.Prediction(answer="No Context found")

        # Step 2: one numbered, score-annotated entry per retrieved passage.
        context_string = "".join(
            f"متن {rank} (امتیاز: {doc['score']:.4f}):\n{doc['passage_text']}\n\n"
            for rank, doc in enumerate(passages, start=1)
        )

        # Step 3: generation with the module-level answering LM.
        try:
            with dspy.context(lm=lm):
                result = self.generate_answer(
                    context=context_string.strip(),
                    question=question,
                )
            return dspy.Prediction(answer=result.answer, context=context_string)
        except Exception as e:
            print(f"Error in answer generation: {e}")
            return dspy.Prediction(answer="Error")

class DSPyHippoRAGMHQAWithReasoningModule(dspy.Module):
    """Retrieve passages with HippoRAG, then answer via dspy.ChainOfThought.

    ``forward`` returns a ``dspy.Prediction`` with ``answer``, ``reasoning``
    (empty string when the result has none) and the formatted ``context``; on
    failure only ``answer`` is set, to ``"Error"`` or ``"No Context found"``.
    """

    def __init__(self, retriever: HippoRAGRetriever):
        super().__init__()
        self.retriever = retriever
        self.generate_answer = dspy.ChainOfThought(PersianMHQAWithReasoningSignature)

    def forward(self, question, top_k_retrieval=5):
        # Step 1: retrieval (any failure is reported and mapped to "Error").
        try:
            passages = self.retriever.retrieve(question, top_k=top_k_retrieval)
        except Exception as e:
            print(f"Error in retrieval: {e}")
            return dspy.Prediction(answer="Error")

        if not passages:
            return dspy.Prediction(answer="No Context found")

        # Step 2: one numbered, score-annotated entry per retrieved passage.
        context_string = "".join(
            f"متن {rank} (امتیاز: {doc['score']:.4f}):\n{doc['passage_text']}\n\n"
            for rank, doc in enumerate(passages, start=1)
        )

        # Step 3: generation with the module-level answering LM.
        try:
            with dspy.context(lm=lm):
                result = self.generate_answer(
                    context=context_string.strip(),
                    question=question,
                )
            reasoning_text = getattr(result, 'reasoning', '')
            return dspy.Prediction(
                answer=result.answer,
                reasoning=reasoning_text,
                context=context_string,
            )
        except Exception as e:
            print(f"Error in answer generation: {e}")
            return dspy.Prediction(answer="Error")

class EnhancedDSPyHippoRAGMHQAModule(dspy.Module):
    """Retrieve passages with HippoRAG, then produce an answer plus its evidence.

    ``forward`` returns a ``dspy.Prediction`` with ``answer``,
    ``relevant_evidence`` (empty string when the result has none) and the
    formatted ``context``; on failure only ``answer`` is set, to ``"Error"`` or
    ``"No Context found"``.
    """

    def __init__(self, retriever: HippoRAGRetriever):
        super().__init__()
        self.retriever = retriever
        self.generate_answer = dspy.Predict(EnhancedPersianMHQASignature)

    def forward(self, question, top_k_retrieval=5):
        # Step 1: retrieval (any failure is reported and mapped to "Error").
        try:
            passages = self.retriever.retrieve(question, top_k=top_k_retrieval)
        except Exception as e:
            print(f"Error in retrieval: {e}")
            return dspy.Prediction(answer="Error")

        if not passages:
            return dspy.Prediction(answer="No Context found")

        # Step 2: numbered entries labelled with the passage's relevance score.
        context_string = "".join(
            f"متن {rank} (امتیاز مرتبط بودن: {doc['score']:.4f}):\n{doc['passage_text']}\n\n"
            for rank, doc in enumerate(passages, start=1)
        )

        # Step 3: generation with the module-level answering LM.
        try:
            with dspy.context(lm=lm):
                result = self.generate_answer(
                    context=context_string.strip(),
                    question=question,
                )
            evidence_text = getattr(result, 'relevant_evidence', '')
            return dspy.Prediction(
                answer=result.answer,
                relevant_evidence=evidence_text,
                context=context_string,
            )
        except Exception as e:
            print(f"Error in answer generation: {e}")
            return dspy.Prediction(answer="Error")

class EnsemblePersianMHQAModule(dspy.Module):
    """Run the direct, chain-of-thought and evidence predictors on one context.

    All three predictors see the same retrieved context. The final ``answer``
    prefers the chain-of-thought answer and falls back to the direct answer,
    then to the evidence-based answer, when an earlier one is missing.

    ``forward`` returns a ``dspy.Prediction`` with ``answer``,
    ``direct_answer``, ``reasoning_answer``, ``enhanced_answer``, ``reasoning``
    and ``context``; on retrieval/generation failure only ``answer`` is set,
    to ``"Error"`` or ``"No Context found"``.
    """

    def __init__(self, retriever: HippoRAGRetriever):
        super().__init__()
        self.retriever = retriever
        self.qa_direct = dspy.Predict(PersianMHQASignature)
        self.qa_reasoning = dspy.ChainOfThought(PersianMHQAWithReasoningSignature)
        self.qa_enhanced = dspy.Predict(EnhancedPersianMHQASignature)

    def forward(self, question, top_k_retrieval=5):
        # Retrieve context using HippoRAG
        try:
            retrieved_passages = self.retriever.retrieve(question, top_k=top_k_retrieval)
        except Exception as e:
            print(f"Error in retrieval: {e}")
            return dspy.Prediction(answer="Error")

        if not retrieved_passages:
            return dspy.Prediction(answer="No Context found")

        # Format context: one numbered, score-annotated entry per passage.
        context_string = "".join(
            f"متن {i} (امتیاز: {doc['score']:.4f}):\n{doc['passage_text']}\n\n"
            for i, doc in enumerate(retrieved_passages, start=1)
        )
        prompt_context = context_string.strip()

        # Get predictions from all approaches
        try:
            with dspy.context(lm=lm):
                direct = self.qa_direct(context=prompt_context, question=question)
                reasoning = self.qa_reasoning(context=prompt_context, question=question)
                enhanced = self.qa_enhanced(context=prompt_context, question=question)
        except Exception as e:
            print(f"Error in ensemble generation: {e}")
            return dspy.Prediction(answer="Error")

        # Read each answer defensively. The previous version built an unused
        # list with `.answer` of all three predictions before its `hasattr`
        # check, so a missing field raised before the fallback could apply.
        direct_answer = getattr(direct, 'answer', None)
        reasoning_answer = getattr(reasoning, 'answer', None)
        enhanced_answer = getattr(enhanced, 'answer', None)

        # Simple selection mechanism (prefer the reasoning answer for
        # multi-hop questions, then the direct one, then the enhanced one).
        final_answer = reasoning_answer
        if final_answer is None:
            final_answer = direct_answer if direct_answer is not None else enhanced_answer

        return dspy.Prediction(
            answer=final_answer,
            direct_answer=direct_answer,
            reasoning_answer=reasoning_answer,
            enhanced_answer=enhanced_answer,
            reasoning=getattr(reasoning, 'reasoning', ''),
            context=context_string
        )



# Load and Prepare PersianMHQA Data

In [None]:
# Load the PersianMHQA test and train splits (JSON lists of question records)
with open('../../../data/test_data.json', 'r', encoding='utf-8') as test_file:
    test_dataset = json.load(test_file)

with open('../../../data/train_data.json', 'r', encoding='utf-8') as train_file:
    train_dataset = json.load(train_file)

# DataFrame views of both splits for ad-hoc inspection.
df_test = pd.DataFrame(test_dataset)
df_train = pd.DataFrame(train_dataset)

print(f"Loaded {len(train_dataset)} PersianMHQA train examples")
print(f"Loaded {len(test_dataset)} PersianMHQA test examples")

# Show the first test record as a sanity check of the expected fields.
first_test_record = test_dataset[0]
print(f"Sample question: {first_test_record['question']}")
print(f"Sample answer: {first_test_record['answer']}")
print(f"Question type: {first_test_record.get('type', 'N/A')}")
print(f"Answer type: {first_test_record.get('answer_type', 'N/A')}")

Loaded 400 PersianMHQA train examples
Loaded 152 PersianMHQA test examples
Sample question: آیا کاترین لانگفورد نقش دختر بچه نوجوانی که هانا بیکر نام داشت را در سریال ۱۳ دلیل برای اینکه بازی کرده است ؟
Sample answer: بله
Question type: مقایسه‌ای
Answer type: بلی/خیر


In [None]:
def prepare_dspy_mhqa_examples(data_list, sample_size=None):
    """Convert raw dataset records into ``dspy.Example`` objects.

    Each example carries ``question``, ``answer``, ``question_type`` (from the
    record's ``type`` key, ``''`` if absent) and ``answer_type`` (``''`` if
    absent); only ``question`` is marked as an input.

    Args:
        data_list: iterable of dict records with at least ``question`` and
            ``answer`` keys.
        sample_size: when truthy and smaller than the number of examples, a
            random sample of that size is returned instead of the full list.

    Returns:
        A list of ``dspy.Example`` objects.
    """
    examples = [
        dspy.Example(
            question=record['question'],
            answer=record['answer'],
            question_type=record.get('type', ''),
            answer_type=record.get('answer_type', ''),
        ).with_inputs('question')
        for record in data_list
    ]

    if not sample_size or sample_size >= len(examples):
        return examples

    return random.sample(examples, sample_size)

# Fix the RNG seed so any sampling done with `random` is repeatable.
random.seed(42)

# Small fixed slices keep optimisation and evaluation cheap:
# the first 20 training records and the first 50 test records.
mhqa_train_examples = prepare_dspy_mhqa_examples(train_dataset[:20])
mhqa_test_examples = prepare_dspy_mhqa_examples(test_dataset[:50])

print(f"Train examples: {len(mhqa_train_examples)}")
print(f"Test examples: {len(mhqa_test_examples)}")

first_train_example = mhqa_train_examples[0]
print("Sample training example:")
print(f"  Question: {first_train_example.question}")
print(f"  Answer: {first_train_example.answer}")
print(f"  Type: {first_train_example.question_type}")

Train examples: 20
Test examples: 50
Sample training example:
  Question: در مقابل تبی که بنتونیت با نام گل ارمنی در آن معروف است چه چیز قرار دارد ؟
  Answer:  پزشکی مبتنی بر شواهد
  Type: پل


# Enhanced Evaluation Function for MHQA

In [None]:
def clean_model_answer(model_answer: str) -> str:
    """Normalise a raw model answer before it is judged.

    Unwraps ``<ANSWER>...</ANSWER>``, removes any remaining ``<...>`` tags,
    collapses whitespace runs to single spaces, trims the ends, and strips the
    known leading labels (checked once each, in the listed order).

    Returns:
        The cleaned answer, or ``""`` for an empty/``None`` input.
    """
    if not model_answer:
        return ""

    # Remove various tags and formatting
    cleaned = re.sub(r'<ANSWER>(.*?)</ANSWER>', r'\1', model_answer, flags=re.DOTALL|re.IGNORECASE)
    cleaned = re.sub(r'<[^>]+>', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip()

    # Remove common prefixes
    prefixes = ['پاسخ:', 'جواب:', 'Answer:', 'Response:']
    for prefix in prefixes:
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()

    return cleaned

def evaluate_answer_with_judge(question: str, correct_answer: str, model_answer: str) -> bool:
    """Decide whether ``model_answer`` is semantically equal to ``correct_answer``.

    The cleaned model answer is sent to the judge LM, which is asked to reply
    with only TRUE or FALSE. The first of those two words found in the reply
    is the verdict; a reply containing neither counts as not correct.

    If the judge call fails, the error is printed and a lexical fallback is
    used: case-insensitive containment in either direction. An empty answer
    never matches a non-empty reference in that fallback (the empty string is
    a substring of every string, which previously graded empty answers as
    correct whenever the judge was unavailable).

    Returns:
        ``True`` when the answer is accepted, ``False`` otherwise.
    """
    clean_answer = clean_model_answer(model_answer)


    prompt = f"""شما یک قاضی خبره هستید که پاسخ‌های فارسی را ارزیابی می‌کنید. 
تعیین کنید که آیا پاسخ مدل از نظر معنایی معادل پاسخ صحیح است یا خیر.
این سوالات چندمرحله‌ای هستند که ممکن است به استدلال پیچیده نیاز داشته باشند.
در نظر بگیرید که تغییرات جزئی در املا و عبارات معادل قابل قبول هستند.

سوال: {question}

پاسخ صحیح: {correct_answer}
پاسخ مدل: {clean_answer}

اگر پاسخ مدل از نظر معنایی معادل پاسخ صحیح است، فقط "TRUE" بنویسید.
در غیر این صورت فقط "FALSE" بنویسید.

پاسخ:"""

    try:
        with dspy.context(lm=lm_judge):
            response = lm_judge(prompt)

        if isinstance(response, list) and len(response) > 0:
            response_text = str(response[0])
        else:
            response_text = str(response)

        # Use the first verdict word so a reply such as
        # "FALSE ... it is not TRUE that ..." is not read as a pass.
        verdict = re.search(r'TRUE|FALSE', response_text.upper())
        return verdict is not None and verdict.group(0) == "TRUE"
    except Exception as e:
        print(f"Error in judge evaluation: {e}")
        candidate = clean_answer.lower().strip()
        reference = correct_answer.lower().strip()
        if not candidate or not reference:
            # Containment is meaningless with an empty side; require equality.
            return candidate == reference
        return candidate in reference or reference in candidate

# Memo of judge verdicts keyed by (question, gold answer, predicted answer),
# so repeated evaluations of the same triple do not call the judge again.
evaluation_cache = {}

def persian_mhqa_accuracy_metric(gold, pred, trace=None):
    """DSPy metric: judge whether ``pred.answer`` matches ``gold.answer``.

    Args:
        gold: example exposing ``question`` and ``answer``.
        pred: prediction exposing ``answer``.
        trace: accepted for the DSPy metric signature; not used.

    Returns:
        The (possibly cached) verdict of ``evaluate_answer_with_judge``.
    """
    triple = (gold.question, gold.answer, pred.answer)

    if triple not in evaluation_cache:
        evaluation_cache[triple] = evaluate_answer_with_judge(
            gold.question, gold.answer, pred.answer
        )

    return evaluation_cache[triple]



# DSPy Optimization Experiments - PersianMHQA

We'll test multiple optimization strategies and compare their effectiveness on multi-hop Persian questions.

## Strategy 1: Basic DSPy HippoRAG for MHQA

In [None]:
print("Strategy 1: Basic DSPy HippoRAG MHQA Optimization...")

# Unoptimised program: HippoRAG retrieval + a single Predict call.
basic_mhqa_model = DSPyHippoRAGMHQAModule(retriever)

# Few-shot bootstrapping scored with the LLM-judge metric.
basic_bootstrap_settings = {
    "metric": persian_mhqa_accuracy_metric,
    "max_bootstrapped_demos": 3,
    "max_labeled_demos": 2,
    "max_rounds": 1,
}
basic_teleprompter = dspy.BootstrapFewShot(**basic_bootstrap_settings)

print("Optimizing basic MHQA model...")
print(f"Training examples: {len(mhqa_train_examples)}")
print("Starting DSPy optimization for multi-hop questions...")

# If compilation fails for any reason, fall back to the unoptimised program
# so the rest of the notebook can still run.
try:
    with dspy.context(lm=lm):
        basic_mhqa_optimized = basic_teleprompter.compile(
            basic_mhqa_model,
            trainset=mhqa_train_examples,
        )
    print("Strategy 1 optimization completed!")
except Exception as e:
    print(f"Error in basic optimization: {e}")
    basic_mhqa_optimized = basic_mhqa_model

# Report each predictor's signature and up to two of its demonstrations.
strategy_1_rule = "=" * 60
print("\n" + strategy_1_rule)
print("STRATEGY 1 - BASIC DSPY HIPPORAG MHQA RESULTS:")
print(strategy_1_rule)
for predictor_no, predictor in enumerate(basic_mhqa_optimized.predictors(), start=1):
    print(f"\nPredictor {predictor_no}:")
    print(f"Signature: {predictor.signature}")
    predictor_demos = getattr(predictor, 'demos', None)
    if predictor_demos:
        print(f"Demonstrations: {len(predictor_demos)}")
        for demo_no, demo in enumerate(predictor_demos[:2], start=1):
            print(f"  Demo {demo_no}: {demo.question[:80]}... -> {demo.answer}")
print(strategy_1_rule)

Strategy 1: Basic DSPy HippoRAG MHQA Optimization...
Optimizing basic MHQA model...
Training examples: 20
Starting DSPy optimization for multi-hop questions...


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Strategy 1 optimization completed!

STRATEGY 1 - BASIC DSPY HIPPORAG MHQA RESULTS:

Predictor 1:
Signature: PersianMHQASignature(context, question -> answer
    instructions='Answer Persian/Farsi multi-hop questions based on provided context passages. Handle bridge questions requiring multiple reasoning steps.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'Retrieved context passages relevant to the multi-hop question', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian multi-hop question requiring complex reasoning', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Precise, concise Persian answer based on multi-hop reasoning from context', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
Demonstr




## Strategy 2: Enhanced DSPy with Random Search for MHQA

In [None]:
print("Strategy 2: Enhanced DSPy with Random Search for MHQA...")

# Unoptimised program: HippoRAG retrieval + evidence-producing Predict call.
enhanced_mhqa_model = EnhancedDSPyHippoRAGMHQAModule(retriever)

# Bootstrapping plus random search over candidate demo sets, scored with the
# LLM-judge metric.
enhanced_search_settings = {
    "metric": persian_mhqa_accuracy_metric,
    "max_bootstrapped_demos": 4,
    "max_labeled_demos": 3,
    "max_rounds": 2,
    "num_candidate_programs": 6,
    "num_threads": 2,
}
enhanced_teleprompter = dspy.BootstrapFewShotWithRandomSearch(**enhanced_search_settings)

print("Optimizing enhanced MHQA model...")
print(f"Training examples: {len(mhqa_train_examples)}")
print("Starting enhanced DSPy optimization with random search for MHQA...")

# NOTE(review): the validation set is the first 10 training examples, so the
# candidate scores are measured on data that is also available for
# bootstrapping - treat them as optimistic.
# If compilation fails for any reason, fall back to the unoptimised program.
try:
    with dspy.context(lm=lm):
        enhanced_mhqa_optimized = enhanced_teleprompter.compile(
            enhanced_mhqa_model,
            trainset=mhqa_train_examples,
            valset=mhqa_train_examples[:10],
        )
    print("Strategy 2 optimization completed!")
except Exception as e:
    print(f"Error in enhanced optimization: {e}")
    enhanced_mhqa_optimized = enhanced_mhqa_model

# Report each predictor's signature and how many demonstrations it holds.
strategy_2_rule = "=" * 60
print("\n" + strategy_2_rule)
print("STRATEGY 2 - ENHANCED DSPY WITH RANDOM SEARCH FOR MHQA:")
print(strategy_2_rule)
for predictor_no, predictor in enumerate(enhanced_mhqa_optimized.predictors(), start=1):
    print(f"\nPredictor {predictor_no}:")
    print(f"Signature: {predictor.signature}")
    predictor_demos = getattr(predictor, 'demos', None)
    if predictor_demos:
        print(f"Demonstrations: {len(predictor_demos)}")
print(strategy_2_rule)

Strategy 2: Enhanced DSPy with Random Search for MHQA...
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 6 candidate sets.
Optimizing enhanced MHQA model...
Training examples: 20
Starting enhanced DSPy optimization with random search for MHQA...


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Average Metric: 6.00 / 10 (60.0%): 100%|██████████| 10/10 [00:11<00:00,  1.13s/it]

2025/09/06 16:44:56 INFO dspy.evaluate.evaluate: Average Metric: 6 / 10 (60.0%)



New best score: 60.0 for seed -3
Scores so far: [60.0]
Best score so far: 60.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Average Metric: 7.00 / 10 (70.0%): 100%|██████████| 10/10 [00:08<00:00,  1.23it/s]

2025/09/06 16:45:08 INFO dspy.evaluate.evaluate: Average Metric: 7 / 10 (70.0%)



New best score: 70.0 for seed -2
Scores so far: [60.0, 70.0]
Best score so far: 70.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 4 full traces after 5 examples for up to 2 rounds, amounting to 6 attempts.
Average Metric: 6.00 / 10 (60.0%): 100%|██████████| 10/10 [00:21<00:00,  2.19s/it]

2025/09/06 16:45:47 INFO dspy.evaluate.evaluate: Average Metric: 6 / 10 (60.0%)



Scores so far: [60.0, 70.0, 60.0]
Best score so far: 70.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 4 full traces after 6 examples for up to 2 rounds, amounting to 8 attempts.
Average Metric: 8.00 / 10 (80.0%): 100%|██████████| 10/10 [00:21<00:00,  2.18s/it]

2025/09/06 16:46:28 INFO dspy.evaluate.evaluate: Average Metric: 8 / 10 (80.0%)



New best score: 80.0 for seed 0
Scores so far: [60.0, 70.0, 60.0, 80.0]
Best score so far: 80.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 2 full traces after 3 examples for up to 2 rounds, amounting to 4 attempts.
Average Metric: 6.00 / 10 (60.0%): 100%|██████████| 10/10 [00:33<00:00,  3.30s/it]

2025/09/06 16:47:18 INFO dspy.evaluate.evaluate: Average Metric: 6 / 10 (60.0%)



Scores so far: [60.0, 70.0, 60.0, 80.0, 60.0]
Best score so far: 80.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 1 full traces after 1 examples for up to 2 rounds, amounting to 1 attempts.
Average Metric: 7.00 / 10 (70.0%): 100%|██████████| 10/10 [00:11<00:00,  1.19s/it]

2025/09/06 16:47:39 INFO dspy.evaluate.evaluate: Average Metric: 7 / 10 (70.0%)



Scores so far: [60.0, 70.0, 60.0, 80.0, 60.0, 70.0]
Best score so far: 80.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 2 full traces after 3 examples for up to 2 rounds, amounting to 4 attempts.
Average Metric: 7.00 / 10 (70.0%): 100%|██████████| 10/10 [00:33<00:00,  3.31s/it]

2025/09/06 16:48:35 INFO dspy.evaluate.evaluate: Average Metric: 7 / 10 (70.0%)



Scores so far: [60.0, 70.0, 60.0, 80.0, 60.0, 70.0, 70.0]
Best score so far: 80.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 2 full traces after 2 examples for up to 2 rounds, amounting to 2 attempts.
Average Metric: 7.00 / 10 (70.0%): 100%|██████████| 10/10 [00:20<00:00,  2.10s/it]

2025/09/06 16:49:26 INFO dspy.evaluate.evaluate: Average Metric: 7 / 10 (70.0%)



Scores so far: [60.0, 70.0, 60.0, 80.0, 60.0, 70.0, 70.0, 70.0]
Best score so far: 80.0


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 3 full traces after 3 examples for up to 2 rounds, amounting to 3 attempts.
Average Metric: 7.00 / 10 (70.0%): 100%|██████████| 10/10 [00:22<00:00,  2.27s/it]

2025/09/06 16:50:09 INFO dspy.evaluate.evaluate: Average Metric: 7 / 10 (70.0%)



Scores so far: [60.0, 70.0, 60.0, 80.0, 60.0, 70.0, 70.0, 70.0, 70.0]
Best score so far: 80.0
9 candidate programs found.
Strategy 2 optimization completed!

STRATEGY 2 - ENHANCED DSPY WITH RANDOM SEARCH FOR MHQA:

Predictor 1:
Signature: EnhancedPersianMHQASignature(context, question -> relevant_evidence, answer
    instructions='Advanced Persian multi-hop QA with context analysis and evidence-based answers.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'Retrieved context passages with relevance scores for multi-hop reasoning', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian multi-hop question to answer comprehensively', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    relevant_evidence = Field(annotation=str required=True json_schema_extra={'desc': 'Key evidence from context that supports the multi-hop answer', '__dspy_field_type': 'output', 'prefix': 'R

## Strategy 3: Ensemble Approach for MHQA

In [None]:
# Strategy 3: bootstrap few-shot demonstrations for the ensemble module.
# If compilation fails, the un-optimized ensemble is used so later cells
# still have an `ensemble_mhqa_optimized` object to evaluate.
print("Strategy 3: Ensemble Approach Optimization for MHQA...")

ensemble_mhqa_model = EnsemblePersianMHQAModule(retriever)
print(f"{len(ensemble_mhqa_model.predictors())} predictors")

# Demo budget for the bootstrapper: up to 3 bootstrapped and 2 labeled
# demonstrations, generated in a single round.
ensemble_teleprompter = dspy.BootstrapFewShot(
    metric=persian_mhqa_accuracy_metric,
    max_bootstrapped_demos=3,
    max_labeled_demos=2,
    max_rounds=1,
)

print("Optimizing ensemble MHQA model...")
print(f"Training examples: {len(mhqa_train_examples)}")
print("Starting ensemble DSPy optimization with teleprompter for multi-hop questions...")

try:
    # Compile with the generation LM explicitly selected for this context.
    with dspy.context(lm=lm):
        ensemble_mhqa_optimized = ensemble_teleprompter.compile(
            ensemble_mhqa_model, trainset=mhqa_train_examples
        )
    optimized_predictor_count = len(ensemble_mhqa_optimized.predictors())
    print(f" Optimized ensemble has {optimized_predictor_count} optimized predictors")

except Exception as e:
    # Best-effort: report the failure and continue with the raw ensemble.
    print(f"Error in ensemble optimization: {e}")
    ensemble_mhqa_optimized = ensemble_mhqa_model
    print("Using unoptimized ensemble as fallback")


# Summarize each predictor's signature and preview its first demonstration.
print("\nOptimized Predictors:")
for i, predictor in enumerate(ensemble_mhqa_optimized.predictors()):
    print(f"  Predictor {i + 1}: {predictor.signature}")
    predictor_demos = getattr(predictor, 'demos', None)
    if not predictor_demos:
        continue
    print(f"    Demonstrations: {len(predictor_demos)}")
    # Only the first demo is shown to keep the cell output short.
    for j, demo in enumerate(predictor_demos[:1]):
        print(f"      Demo {j+1}: {demo.question[:60]}... -> {demo.answer[:40]}...")

print("=" * 60)

Strategy 3: Ensemble Approach Optimization for MHQA...
This will optimize the EnsemblePersianMHQAModule with teleprompters for multi-hop questions!
3 predictors
Optimizing ensemble MHQA model...
Training examples: 20
Starting ensemble DSPy optimization with teleprompter for multi-hop questions...


Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' object has no attribute '_state'
Exception ignored in: <function SyncHttpxClientWrapper.__del__ at 0x316a7b920>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/openai/_base_client.py", line 811, in __del__
    if self.is_closed:
       ^^^^^^^^^^^^^^
  File "/opt/miniconda3/envs/llms/lib/python3.11/site-packages/httpx/_client.py", line 228, in is_closed
    return self._state == ClientState.CLOSED
           ^^^^^^^^^^^
AttributeError: 'SyncHttpxClientWrapper' obj

Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
 Optimized ensemble has 3 optimized predictors

Optimized Predictors:
  Predictor 1: PersianMHQASignature(context, question -> answer
    instructions='Answer Persian/Farsi multi-hop questions based on provided context passages. Handle bridge questions requiring multiple reasoning steps.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'Retrieved context passages relevant to the multi-hop question', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'desc': 'Persian multi-hop question requiring complex reasoning', '__dspy_field_type': 'input', 'prefix': 'Question:'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Precise, concise Persian answer based on multi-hop reasoning from context', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
)
    Demonstrations: 3
      Dem




In [None]:
# Smoke-test a freshly constructed (not compiled) ensemble on one sample
# Persian question and print every answer variant it exposes.
test_ensemble = EnsemblePersianMHQAModule(retriever)

test_question = "آیا کاترین لانگفورد نقش دختر بچه نوجوانی که هانا بیکر نام داشت را در سریال ۱۳ دلیل برای اینکه بازی کرده است؟"
print(f"Test question: {test_question}")

try:
    prediction = test_ensemble(question=test_question, top_k_retrieval=3)

    print("\nEnsemble MHQA Prediction Results:")
    print(f"Final Answer: {prediction.answer}")

    # The per-branch answers are optional; print whichever ones are present.
    for label, attr in (
        ("Direct Answer", "direct_answer"),
        ("Reasoning Answer", "reasoning_answer"),
        ("Enhanced Answer", "enhanced_answer"),
    ):
        if hasattr(prediction, attr):
            print(f"{label}: {getattr(prediction, attr)}")

    # The reasoning trace can be long, so only its first 200 characters are shown.
    if hasattr(prediction, 'reasoning'):
        reasoning_preview = prediction.reasoning[:200]
        print(f"Reasoning Process: {reasoning_preview}...")

except Exception as e:
    print(f"Error testing ensemble: {e}")

print("\n" + "=" * 60)

Test question: آیا کاترین لانگفورد نقش دختر بچه نوجوانی که هانا بیکر نام داشت را در سریال ۱۳ دلیل برای اینکه بازی کرده است؟

Ensemble MHQA Prediction Results:
Final Answer: بله، کاترین لانگفورد نقش هانا بیکر را در سریال "۱۳ دلیل برای اینکه" بازی کرده است.
Direct Answer: بله، کاترین لانگفورد نقش هانا بیکر، دختر نوجوانی را در سریال ۱۳ دلیل برای اینکه بازی کرده است.
Reasoning Answer: بله، کاترین لانگفورد نقش هانا بیکر را در سریال "۱۳ دلیل برای اینکه" بازی کرده است.
Enhanced Answer: بله، کاترین لانگفورد نقش دختر نوجوانی به نام هانا بیکر را در سریال ۱۳ دلیل برای اینکه بازی کرده است. او در این سریال در دو فصل اول و دوم نقش این شخصیت را ایفا کرد.
Reasoning Process: بله، کاترین لانگفورد نقش هانا بیکر، دختر نوجوانی که در سریال "۱۳ دلیل برای اینکه" داستانش روایت می‌شود، را بازی کرده است. این شخصیت توسط نویسنده جی اشر خلق شده و در اقتباس سریالی آن که در سال ۲۰۱۷ توس...


Ensemble MHQA Prediction Results:
Final Answer: بله، کاترین لانگفورد نقش هانا بیکر را در سریال "۱۳ دلیل برای اینکه" بازی کرده ا

# Comprehensive Evaluation for MHQA

In [None]:
def evaluate_hipporag_mhqa_model(model, model_name, test_examples, max_examples=None):
    """Run ``model`` over the test examples and score each answer with the judge.

    Args:
        model: Callable invoked as ``model(question=..., top_k_retrieval=5)``;
            the returned object must expose an ``answer`` attribute and may
            expose ``context``, ``reasoning`` and ``relevant_evidence``.
        model_name: Label used in progress output and the printed summary.
        test_examples: Sliceable sequence of examples exposing ``question``
            and ``answer`` (optionally ``question_type`` / ``answer_type``).
        max_examples: If truthy, only the first ``max_examples`` examples are
            evaluated.

    Returns:
        A ``(results, accuracy)`` tuple: ``results`` is a list with one dict per
        example, and ``accuracy`` is the fraction judged correct (0 when there
        are no examples).
    """
    import time

    print(f"\nEvaluating {model_name} on PersianMHQA...")

    if max_examples:
        test_examples = test_examples[:max_examples]

    results = []

    for i, example in enumerate(tqdm(test_examples, desc=f"Testing {model_name}")):
        # Reset per example: without this, a failed call either raised
        # NameError (first example) or leaked the previous example's
        # context/reasoning/evidence into this row.
        prediction = None
        prediction_failed = False
        try:
            with dspy.context(lm=lm):
                prediction = model(question=example.question, top_k_retrieval=5)
            model_answer = prediction.answer
        except Exception as e:
            print(f"Error in prediction {i}: {e}")
            model_answer = f"Error: {e}"
            prediction_failed = True

        if prediction_failed:
            # No real answer was produced; skip the judge LM call.
            is_correct = False
        else:
            try:
                is_correct = evaluate_answer_with_judge(
                    example.question,
                    example.answer,
                    model_answer
                )
            except Exception as e:
                print(f"Error in evaluation {i}: {e}")
                is_correct = False

        results.append({
            'question': example.question,
            'expected_answer': example.answer,
            'model_answer': model_answer,
            'clean_model_answer': clean_model_answer(model_answer),
            'is_correct': is_correct,
            'question_type': getattr(example, 'question_type', ''),
            'answer_type': getattr(example, 'answer_type', ''),
            # getattr on None falls back to '' when the prediction failed.
            'context': getattr(prediction, 'context', ''),
            'reasoning': getattr(prediction, 'reasoning', ''),
            'evidence': getattr(prediction, 'relevant_evidence', '')
        })

        # Pause for a second after every fifth example to avoid overwhelming the API.
        if i % 5 == 4:
            time.sleep(1)

    total_correct = sum(1 for r in results if r['is_correct'])
    total_questions = len(results)
    accuracy = total_correct / total_questions if total_questions > 0 else 0

    print(f"\n{model_name} Results:")
    print(f"  Overall Accuracy: {accuracy:.3f} ({total_correct}/{total_questions})")

    return results, accuracy

In [None]:
# Number of test questions each strategy is scored on.
eval_subset_size = 50

# (compiled program, display name) pairs; the display name also drives the CSV filename.
mhqa_strategies = [
    (basic_mhqa_optimized, "Strategy 1: Basic DSPy HippoRAG MHQA"),
    (enhanced_mhqa_optimized, "Strategy 2: Enhanced with Random Search MHQA"),
    (ensemble_mhqa_optimized, "Strategy 3: Ensemble Approach MHQA"),
]

all_mhqa_results, all_mhqa_accuracies = {}, {}

for model, name in mhqa_strategies:
    try:
        results, accuracy = evaluate_hipporag_mhqa_model(
            model,
            name,
            mhqa_test_examples,
            max_examples=eval_subset_size,
        )

        all_mhqa_results[name] = results
        all_mhqa_accuracies[name] = accuracy

        # Derive a filesystem-friendly slug: lowercase, no colons, spaces -> underscores.
        slug = name.lower().replace(':', '').replace(' ', '_')
        filename = f"hipporag_dspy_mhqa_{slug}_results.csv"
        pd.DataFrame(results).to_csv(filename, index=False)
        print(f"  Saved to: {filename}")
    except Exception as e:
        # Any failure (evaluation or saving) is reported and scored as 0.0.
        print(f"Error evaluating {name}: {e}")
        all_mhqa_accuracies[name] = 0.0

# Leaderboard, best accuracy first.
ranked_accuracies = sorted(
    all_mhqa_accuracies.items(), key=lambda item: item[1], reverse=True
)
for name, accuracy in ranked_accuracies:
    print(f"{accuracy:.3f} - {name}")
print("=" * 80)


Evaluating Strategy 1: Basic DSPy HippoRAG MHQA on PersianMHQA...


Testing Strategy 1: Basic DSPy HippoRAG MHQA: 100%|██████████| 50/50 [03:16<00:00,  3.94s/it]



Strategy 1: Basic DSPy HippoRAG MHQA Results:
  Overall Accuracy: 0.760 (38/50)
  Saved to: hipporag_dspy_mhqa_strategy_1_basic_dspy_hipporag_mhqa_results.csv

Evaluating Strategy 2: Enhanced with Random Search MHQA on PersianMHQA...


Testing Strategy 2: Enhanced with Random Search MHQA: 100%|██████████| 50/50 [04:15<00:00,  5.10s/it]



Strategy 2: Enhanced with Random Search MHQA Results:
  Overall Accuracy: 0.740 (37/50)
  Saved to: hipporag_dspy_mhqa_strategy_2_enhanced_with_random_search_mhqa_results.csv

Evaluating Strategy 3: Ensemble Approach MHQA on PersianMHQA...


Testing Strategy 3: Ensemble Approach MHQA: 100%|██████████| 50/50 [06:58<00:00,  8.37s/it]


Strategy 3: Ensemble Approach MHQA Results:
  Overall Accuracy: 0.760 (38/50)
  Saved to: hipporag_dspy_mhqa_strategy_3_ensemble_approach_mhqa_results.csv
0.760 - Strategy 1: Basic DSPy HippoRAG MHQA
0.760 - Strategy 3: Ensemble Approach MHQA
0.740 - Strategy 2: Enhanced with Random Search MHQA





# Compare with Baseline MHQA Results

In [None]:
# Compare the DSPy-optimized strategies against the stored baseline run.
# Deltas use the "+" format sign so a regression prints as "-0.003" rather
# than the malformed "+-0.003" produced by a hard-coded "+" prefix.
try:
    baseline_mhqa = pd.read_csv('../evaluated_results_no_reasoning_RAG.csv')
    baseline_mhqa_accuracy = baseline_mhqa['is_correct'].mean()

    print(f"Baseline HippoRAG MHQA (No DSPy): {baseline_mhqa_accuracy:.3f}")
    print("\nDSPy Optimized MHQA Results:")
    for name, accuracy in sorted(all_mhqa_accuracies.items(), key=lambda x: x[1], reverse=True):
        improvement = accuracy - baseline_mhqa_accuracy
        improvement_str = f" ({improvement:+.3f})"
        print(f"{accuracy:.3f}{improvement_str} - {name}")

    if all_mhqa_accuracies:
        best_dspy_accuracy = max(all_mhqa_accuracies.values())
        total_improvement = best_dspy_accuracy - baseline_mhqa_accuracy
        # A relative change is only meaningful for a positive baseline; this
        # also skips a zero or NaN baseline instead of printing inf/nan.
        if baseline_mhqa_accuracy > 0:
            relative_pct = total_improvement / baseline_mhqa_accuracy * 100
            relative_str = f" ({relative_pct:+.1f}% relative)"
        else:
            relative_str = ""
        print(f"\nBest DSPy MHQA Improvement: {total_improvement:+.3f}{relative_str}")
    print("="*80)

except FileNotFoundError:
    print("Baseline MHQA results not found. Skipping comparison.")

Baseline HippoRAG MHQA (No DSPy): 0.763

DSPy Optimized MHQA Results:
0.760 (-0.003) - Strategy 1: Basic DSPy HippoRAG MHQA
0.760 (-0.003) - Strategy 3: Ensemble Approach MHQA
0.740 (-0.023) - Strategy 2: Enhanced with Random Search MHQA

Best DSPy MHQA Improvement: +-0.003 (-0.4% relative)
