# MHQA

## No Reasoning

In [None]:
import os
import json
import dotenv
from tqdm import tqdm
import pandas as pd

# Load variables from a local .env file into the process environment; the
# API key fetched later with os.getenv("METIS_API_KEY") is expected to come
# from there.
dotenv.load_dotenv()

# Load the MHQA test set.
# The file holds Persian text, so the encoding is pinned to UTF-8 rather than
# left to the platform default, which is not UTF-8 on every OS and would
# garble or reject the file. This also matches the explicit encoding used
# when the PQuAD CSV is read.
with open("../../data/test_data.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Answering model used by every inference run in this notebook.
MODEL_NAME = "gpt-4o-mini"  # baseline GPT model

# Connection and sampling settings gathered in one place so they are easy to
# compare with the judge model's settings.
_llm_settings = {
    "model": MODEL_NAME,
    "temperature": 0.2,
    "max_tokens": 200,
    "api_key": os.getenv("METIS_API_KEY"),  # None when the variable is unset
    "base_url": "https://api.metisai.ir/openai/v1",
}
llm = ChatOpenAI(**_llm_settings)



In [None]:
# Output schema for the answering chains: the model has to return exactly one
# string field, `final_answer`.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as schema metadata by `with_structured_output`, so
# they act as prompt text — confirm before rewording either of them.
class StructuredAnswer(BaseModel):
    """A structured final answer."""
    final_answer: str = Field(
        description="The final, concise answer to the user's question."
    )

# Semantic answer types (Persian labels). They are interpolated into the
# system prompts only to tell the model NOT to answer with a type name.
answer_types_guidance = [
    "اسامی عام",        # common nouns
    "شخص",              # person
    "بلی/خیر",          # yes / no
    "تاریخ",            # date
    "رویداد",           # event
    "مکان",             # location
    "اسامی خاص دیگر",   # other proper nouns
    "شماره",            # number
    "کار هنری",         # work of art
    "گروه یا سازمان",   # group or organisation
    "صفت",              # adjective
]

# Single display string: the labels separated by a Persian comma and a space.
_persian_separator = "، "
categories_text = _persian_separator.join(answer_types_guidance)

# System instructions for the direct-answer ("no reasoning") run, followed by
# six few-shot examples.
# Brace handling: this is an f-string, so `{categories_text}` is filled in
# right here and every `{{{{` / `}}}}` collapses to `{{` / `}}`; the prompt
# template then treats those doubled braces as literal `{` / `}` in the JSON
# examples.
# NOTE(review): confirm the few-shot questions are not also present in the
# evaluation data, otherwise the scores are inflated.
_NO_REASONING_SYSTEM = f"""
You are an expert Persian Question Answering assistant.
Your output MUST be a JSON object with a single field named final_answer.
- final_answer must be the concise actual answer value in Persian.
- Do NOT output category names (e.g., {categories_text}); they are semantic types of answers, not options to choose.
- Do NOT include explanations or extra text.

Examples:
Q: در مقابل تبی که بنتونیت با نام گل ارمنی در آن معروف است چه چیز قرار دارد ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "پزشکی مبتنی بر شواهد"}}}}

Q: کدام یک از ستارگان فیلم زنی پشت پنجره بازیکن هاکی نیز می باشد ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "وایـت راسل"}}}}

Q: آیا سایتو دوسان یک نجیب زاده نظامی است ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "بلی"}}}}

Q: داده های ماهواره ای که نخستین‌ بار نقشه‌بردار موضوعی بر روی آن نصب شد در چه سالی به فضا پرتاب شد ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "سال ۱۹۸۲"}}}}

Q: جایزه ای که سیدیبه در سال 2007 آن را دریافت کرد متعلق به کدام جشنواره است ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "جشنوارهٔ فیلم ونیز"}}}}

Q: نام دیگر قومی که پیروز یکم آن ها را به طور قطعی شکست داد چیست ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "قوم کیدار"}}}}
"""

# Two-message chat prompt: the fixed system text plus the question slot.
qa_prompt_no_reasoning = ChatPromptTemplate.from_messages(
    [
        ("system", _NO_REASONING_SYSTEM),
        ("human", "QUESTION: {question}\n\nYOUR STRUCTURED OUTPUT:"),
    ]
)

# System instructions for the "reasoning" run: the model is told to think
# step by step internally, then emit only final_answer. Four few-shot
# examples follow.
# Brace handling: this is an f-string, so `{categories_text}` is filled in
# right here and every `{{{{` / `}}}}` collapses to `{{` / `}}`; the prompt
# template then treats those doubled braces as literal `{` / `}` in the JSON
# examples.
# NOTE(review): the chains built from this prompt use StructuredAnswer, whose
# only field is final_answer, so no step-by-step text can appear in the
# output — confirm this variant differs from the baseline as intended.
_REASONING_SYSTEM = f"""
You are an expert Persian Question Answering assistant.
Think step-by-step internally and then produce a JSON object with a single field final_answer.
- final_answer must be the concise actual answer value in Persian.
- Do NOT output category names (e.g., {categories_text}); they are semantic types of answers, not options to choose.
- Do NOT include explanations or intermediate reasoning in the output.

Examples:
Q: در مقابل تبی که بنتونیت با نام گل ارمنی در آن معروف است چه چیز قرار دارد ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "پزشکی مبتنی بر شواهد"}}}}

Q: کدام یک از ستارگان فیلم زنی پشت پنجره بازیکن هاکی نیز می باشد ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "وایـت راسل"}}}}

Q: آیا سایتو دوسان یک نجیب زاده نظامی است ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "بلی"}}}}

Q: داده های ماهواره ای که نخستین‌ بار نقشه‌بردار موضوعی بر روی آن نصب شد در چه سالی به فضا پرتاب شد ؟
YOUR STRUCTURED OUTPUT:
{{{{"final_answer": "سال ۱۹۸۲"}}}}
"""

# Two-message chat prompt: the fixed system text plus the question slot.
qa_prompt_reasoning = ChatPromptTemplate.from_messages(
    [
        ("system", _REASONING_SYSTEM),
        ("human", "QUESTION: {question}\n\nYOUR STRUCTURED OUTPUT:"),
    ]
)

In [None]:
# Direct-answer pass over the MHQA test set.
# The prompt is piped into the LLM with its output constrained to StructuredAnswer.
structured_no_reason_chain = qa_prompt_no_reasoning | llm.with_structured_output(StructuredAnswer)

results = []
for item in tqdm(test_data):
    # Field lookups stay outside the try block so a malformed record still fails loudly.
    question, answer, _id = item["question"], item["answer"], item["id"]

    # A failed call is kept in-band as an "Error: ..." answer so the row is not lost.
    try:
        parsed = structured_no_reason_chain.invoke({"question": question})
        model_answer = parsed.final_answer
    except Exception as e:
        model_answer = f"Error: {e}"

    results.append(
        dict(question=question, answer=answer, model_answer=model_answer, id=_id)
    )

# Persist the raw predictions before judging them.
pd.DataFrame(results).to_csv("results.csv", index=False)

100%|██████████| 152/152 [04:50<00:00,  1.91s/it]


## Evaluate

In [20]:
import re
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Verdict schema for the LLM judge: a single boolean, `is_correct`.
# NOTE(review): documented with comments rather than a docstring because a
# class docstring is presumably forwarded to the model as part of the schema,
# which would change the judge prompt — confirm before adding one.
class AnswerEvaluation(BaseModel):
    is_correct: bool = Field(description="True if the model answer is semantically equivalent to the correct answer, False otherwise")

# Judge model: reads the same METIS_API_KEY variable and base URL as the
# answering model, with its own temperature and token budget.
_judge_settings = dict(
    model="gpt-4o-mini",
    temperature=0.1,
    max_tokens=300,
    api_key=os.getenv("METIS_API_KEY"),
    base_url='https://api.metisai.ir/openai/v1',
)

# Every judge reply is constrained to the AnswerEvaluation schema.
llm_judge = ChatOpenAI(**_judge_settings).with_structured_output(AnswerEvaluation)

def clean_model_answer(model_answer: str) -> str:
    """Strip ``<ANSWER>…</ANSWER>`` wrappers and surrounding whitespace.

    Each ``<ANSWER>…</ANSWER>`` pair (case-insensitive, may span lines) is
    replaced by its inner text; text outside the tags is kept.

    Args:
        model_answer: Raw answer text. ``None`` and float NaN (what pandas
            yields for an empty CSV cell) are treated as an empty answer;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned answer with leading/trailing whitespace removed.
    """
    if model_answer is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(model_answer, float) and model_answer != model_answer:
        return ""
    if not isinstance(model_answer, str):
        model_answer = str(model_answer)

    cleaned = re.sub(
        r'<ANSWER>(.*?)</ANSWER>',
        r'\1',
        model_answer,
        flags=re.DOTALL | re.IGNORECASE,
    )
    return cleaned.strip()

# Judge prompt, built once instead of on every call: it does not depend on
# the call's arguments.
_JUDGE_PROMPT = ChatPromptTemplate.from_messages([
    ("system", """You are an expert judge evaluating Persian/Farsi question-answer pairs. Determine if the model's answer is semantically equivalent to the correct answer. Be strict but fair about minor spelling variations and equivalent expressions."""),
    ("human", """Question: {question}\n\nCorrect Answer: {correct_answer}\nModel Answer: {model_answer}\n\nEvaluate if the model answer is semantically equivalent to the correct answer."""),
])


def evaluate_single_answer(question: str, correct_answer: str, model_answer: str) -> AnswerEvaluation:
    """Ask the LLM judge whether ``model_answer`` matches ``correct_answer``.

    Args:
        question: The question that was asked.
        correct_answer: The reference answer.
        model_answer: The raw model output; it is passed through
            ``clean_model_answer`` before being shown to the judge.

    Returns:
        The judge's ``AnswerEvaluation``. If the judge call raises, or returns
        something that is not an ``AnswerEvaluation``, the answer is counted
        as incorrect (``is_correct=False``) and a warning is logged so judge
        failures can be told apart from genuinely wrong answers.
    """
    import logging  # local import keeps this cell self-contained

    log = logging.getLogger(__name__)
    msgs = _JUDGE_PROMPT.format_messages(
        question=question,
        correct_answer=correct_answer,
        model_answer=clean_model_answer(model_answer),
    )
    try:
        verdict = llm_judge.invoke(msgs)
    except Exception as exc:
        # Best-effort: keep the run going, but leave a trace of the failure.
        log.warning("Judge call failed for question %r; counting as incorrect: %r", question, exc)
        return AnswerEvaluation(is_correct=False)

    # Guard against an unparsed result so callers can always read `.is_correct`.
    if not isinstance(verdict, AnswerEvaluation):
        log.warning("Judge returned no usable verdict for question %r; counting as incorrect", question)
        return AnswerEvaluation(is_correct=False)
    return verdict

# Judge every MHQA prediction and keep the verdict next to the original row.
evaluated_results = []
for item in tqdm(results, desc="Evaluating answers"):
    verdict = evaluate_single_answer(item['question'], item['answer'], item['model_answer'])

    # Copy the prediction row, then append the cleaned answer and the verdict.
    scored_row = dict(item)
    scored_row['clean_model_answer'] = clean_model_answer(item['model_answer'])
    scored_row['is_correct'] = verdict.is_correct
    evaluated_results.append(scored_row)

# Tabulate and persist the judged rows.
df_eval = pd.DataFrame(evaluated_results)
df_eval.to_csv('evaluated_results.csv', index=False)

Evaluating answers:   0%|          | 0/152 [00:00<?, ?it/s]

Evaluating answers: 100%|██████████| 152/152 [03:20<00:00,  1.32s/it]


In [21]:
import numpy as np
# Accuracy on MHQA (no reasoning): judged-correct rows over all rows.
_n_correct = int(df_eval['is_correct'].eq(True).sum())
_n_correct / len(df_eval)

0.45394736842105265

## With Reasoning

In [24]:
# "Reasoning" pass over the MHQA test set: same flow as the baseline run, but
# driven by qa_prompt_reasoning.
# NOTE(review): output is still constrained to StructuredAnswer (final_answer
# only), so no reasoning text is returned — confirm that is intended.
structured_reason_chain = qa_prompt_reasoning | llm.with_structured_output(StructuredAnswer)

results_reasoning = []
for item in tqdm(test_data):
    # Field lookups stay outside the try block so a malformed record still fails loudly.
    question, answer, _id = item['question'], item['answer'], item['id']

    # A failed call is kept in-band as an "Error: ..." answer so the row is not lost.
    try:
        parsed = structured_reason_chain.invoke({"question": question})
        model_answer = parsed.final_answer
    except Exception as e:
        model_answer = f"Error: {e}"

    results_reasoning.append(
        dict(question=question, answer=answer, model_answer=model_answer, id=_id)
    )

# Persist the raw predictions before judging them.
pd.DataFrame(results_reasoning).to_csv('results_reasoning.csv', index=False)

100%|██████████| 152/152 [03:33<00:00,  1.41s/it]


## Evaluate

In [27]:
# Judge the MHQA reasoning-run predictions and store the verdict with each row.
evaluated_results_reasoning = []
for item in tqdm(results_reasoning, desc="Evaluating answers (reasoning)"):
    verdict = evaluate_single_answer(item['question'], item['answer'], item['model_answer'])
    evaluated_results_reasoning.append(
        dict(
            item,
            clean_model_answer=clean_model_answer(item['model_answer']),
            is_correct=verdict.is_correct,
        )
    )

# Tabulate and persist the judged rows.
df_eval_reasoning = pd.DataFrame(evaluated_results_reasoning)
df_eval_reasoning.to_csv('evaluated_results_reasoning.csv', index=False)

Evaluating answers (reasoning): 100%|██████████| 152/152 [05:26<00:00,  2.15s/it]


In [28]:
# Accuracy on MHQA (reasoning prompt): judged-correct rows over all rows.
_n_correct_reasoning = int(df_eval_reasoning['is_correct'].eq(True).sum())
_n_correct_reasoning / len(df_eval_reasoning)

0.4473684210526316

# PQUAD

## No Reasoning

In [None]:
# Read the PQuAD question table and keep only the first 150 rows as the
# evaluation sample, one dict per row.
pquad_df = pd.read_csv('../../data/pquad/pquad_questions.csv', encoding='utf-8')
pquad_data = pquad_df.head(150).to_dict(orient='records')

In [36]:
# Direct-answer pass over the PQuAD sample. The chain is rebuilt here so this
# cell does not depend on the MHQA inference cell having been run.
structured_no_reason_chain = qa_prompt_no_reasoning | llm.with_structured_output(StructuredAnswer)

results_pquad = []
for item in tqdm(pquad_data):
    # Required fields are read outside the try block; context_id is optional.
    question, answer, _id = item['question'], item['answer'], item['id']
    context_id = item.get('context_id')

    # A failed call is kept in-band as an "Error: ..." answer so the row is not lost.
    try:
        parsed = structured_no_reason_chain.invoke({"question": question})
        model_answer = parsed.final_answer
    except Exception as e:
        model_answer = f"Error: {e}"

    results_pquad.append(
        dict(
            question=question,
            answer=answer,
            model_answer=model_answer,
            id=_id,
            context_id=context_id,
        )
    )

# Persist the raw predictions before judging them.
pd.DataFrame(results_pquad).to_csv('pquad_results.csv', index=False)

  0%|          | 0/150 [00:00<?, ?it/s]

100%|██████████| 150/150 [04:49<00:00,  1.93s/it]


## Evaluate

In [37]:
# Judge the PQuAD baseline predictions and store the verdict with each row.
evaluated_results_pquad = []
for item in tqdm(results_pquad, desc="Evaluating answers (PQUAD)"):
    ev = evaluate_single_answer(item['question'], item['answer'], item['model_answer'])
    evaluated_results_pquad.append({**item, 'clean_model_answer': clean_model_answer(item['model_answer']), 'is_correct': ev.is_correct})

pquad_eval_df = pd.DataFrame(evaluated_results_pquad)
# Filename corrected from the misspelled 'pqaud_…' so it matches the other
# 'pquad_…' artifacts written by this notebook.
# NOTE(review): update any downstream reader that still expects the old name.
pquad_eval_df.to_csv('pquad_evaluated_results_baseline.csv', index=False)

# Accuracy on the PQuAD sample (no reasoning): judged-correct rows over all rows.
len(pquad_eval_df[pquad_eval_df['is_correct'] == True]) / len(pquad_eval_df)

Evaluating answers (PQUAD): 100%|██████████| 150/150 [03:24<00:00,  1.37s/it]


0.18666666666666668

## With Reasoning

In [38]:
# "Reasoning" pass over the PQuAD sample, driven by qa_prompt_reasoning.
results_pquad_reasoning = []
structured_reason_chain = qa_prompt_reasoning | llm.with_structured_output(StructuredAnswer)
for item in tqdm(pquad_data):
    question = item['question']
    answer = item['answer']
    _id = item['id']
    # Carried through for parity with the no-reasoning PQuAD run, which also
    # records context_id; .get() keeps rows without the column working.
    context_id = item.get('context_id')

    # A failed call is kept in-band as an "Error: ..." answer so the row is not lost.
    try:
        resp = structured_reason_chain.invoke({"question": question})
        model_answer = resp.final_answer
    except Exception as e:
        model_answer = f"Error: {e}"

    results_pquad_reasoning.append({
        'question': question,
        'answer': answer,
        'model_answer': model_answer,
        'id': _id,
        'context_id': context_id,
    })

# Persist the raw predictions before judging them.
pd.DataFrame(results_pquad_reasoning).to_csv('pquad_results_reasoning.csv', index=False)

100%|██████████| 150/150 [03:50<00:00,  1.53s/it]


In [39]:
# Judge the PQuAD reasoning-run predictions and store the verdict with each row.
evaluated_results_pquad_reasoning = []
for item in tqdm(results_pquad_reasoning, desc="Evaluating answers (PQUAD reasoning)"):
    verdict = evaluate_single_answer(item['question'], item['answer'], item['model_answer'])

    # Copy the prediction row, then append the cleaned answer and the verdict.
    scored_row = dict(item)
    scored_row['clean_model_answer'] = clean_model_answer(item['model_answer'])
    scored_row['is_correct'] = verdict.is_correct
    evaluated_results_pquad_reasoning.append(scored_row)

# Tabulate and persist the judged rows.
pquad_eval_reasoning_df = pd.DataFrame(evaluated_results_pquad_reasoning)
pquad_eval_reasoning_df.to_csv('pquad_evaluated_results_reasoning.csv', index=False)

# Accuracy on the PQuAD sample (reasoning prompt): judged-correct rows over all rows.
int(pquad_eval_reasoning_df['is_correct'].eq(True).sum()) / len(pquad_eval_reasoning_df)

Evaluating answers (PQUAD reasoning): 100%|██████████| 150/150 [03:12<00:00,  1.28s/it]


0.19333333333333333