In [1]:
import json
import os

import pandas as pd
import numpy as np
from collections import defaultdict

from sqlalchemy.testing.suite.test_reflection import metadata

# Seed NumPy's global RNG so the random sampling done later is repeatable.
np.random.seed(42)

# Parse the raw JSON dataset and tabulate it in a DataFrame.
with open('taji_1.json', 'r', encoding='utf-8') as dataset_file:
    data = json.load(dataset_file)

df = pd.DataFrame(data)

# Data Preprocessing

In [3]:
# Extract topics from context to new column
def extract_topics(context_list):
    """Return the topic title of every entry in ``context_list``.

    The title is taken as the first element of each context entry.
    Empty entries are skipped instead of raising ``IndexError``, which
    matches the ``len(context_item) > 0`` guard used when topic coverage
    is verified on the training split.

    Args:
        context_list: Iterable of context entries (sequences whose first
            element is the topic title).

    Returns:
        A list of titles, in the same order as the non-empty entries.
    """
    return [context_item[0] for context_item in context_list if len(context_item) > 0]

# Helper column: the list of topic titles attached to each row.
df['topics'] = df['context'].apply(extract_topics)

# Get all unique topics
all_topics = {topic for row_topics in df['topics'] for topic in row_topics}

print(f"Total unique topics: {len(all_topics)}")


Total unique topics: 388


In [4]:
# Invert the per-row topic lists: topic title -> positions of the rows that mention it.
topic_to_samples = defaultdict(list)
for row_position, row_topics in enumerate(df['topics']):
    for topic_title in row_topics:
        topic_to_samples[topic_title].append(row_position)

In [5]:
# Inspect the topic -> row-position mapping (the whole mapping is echoed as the cell output).
topic_to_samples

defaultdict(list,
            {'۱۳ دلیل برای اینکه ': [0, 1, 2, 256, 257, 258],
             'هانا بیکر ': [0, 1, 2, 510, 511],
             'سامورایی ': [3, 4],
             'سایتو دوسان ': [3, 4],
             'سوپرتاکس ': [5, 6, 246, 247, 248],
             'نینتندو ': [5, 6, 236, 237, 242],
             'مالک سیدیبه ': [7, 8, 9],
             'شیر طلایی ': [7, 8, 9, 241],
             'مسابقه آواز یوروویژن ۲۰۱۹ ': [10],
             'مسابقه آواز یوروویژن ۲۰۱۸ ': [10],
             'نامه فرهنگستان ': [11, 12, 13, 14, 15, 436],
             'سردبیر ': [11, 12, 13, 14, 15],
             'خاطرات بریجت جونز (فیلم) ': [16, 17, 18, 19],
             'جما جونز ': [16, 17, 18, 19],
             'لامپ نئون ': [20, 21, 22, 23],
             'جیوه ': [20, 21, 22, 23, 225, 226, 227, 228, 433, 434, 455],
             'زابیواکا (برنامه تلویزیونی) ': [24, 25, 26, 27, 28, 29, 30, 31],
             'جام جهانی فوتبال ۲۰۱۸ ': [24, 25, 26, 27, 28, 29, 30, 31],
             'استفانی مک\u200cمن ': [32, 3

In [6]:
# Select one sample per topic for training (to guarantee coverage).
# Topics are visited in dictionary order, one random draw per topic; a row
# picked for several topics is stored only once because the result is a set.
guaranteed_train_indices = {
    np.random.choice(candidate_rows)
    for candidate_rows in topic_to_samples.values()
}

print(f"Guaranteed training samples: {len(guaranteed_train_indices)}")

Guaranteed training samples: 280


In [7]:
# Remaining samples to allocate
# Every row position that was not already reserved for training.
remaining_indices = set(range(len(df))) - guaranteed_train_indices
# NOTE(review): the order of list(set) is an interpreter implementation detail, and
# np.random.choice below depends on that order — confirm the split is reproducible
# across Python versions if that matters.
remaining_indices = list(remaining_indices)

# 400 is the target size of the training split; the guaranteed rows count toward it.
# NOTE(review): if more than 400 rows were guaranteed this becomes negative and the
# np.random.choice call below would presumably fail — confirm this cannot happen.
additional_train_needed = 400 - len(guaranteed_train_indices)

# Top up the training split with rows drawn without replacement from the remainder.
additional_train_indices = np.random.choice(
    remaining_indices,
    size=additional_train_needed,
    replace=False
)
train_indices = list(guaranteed_train_indices) + list(additional_train_indices)
all_indices = set(range(len(df)))
# Everything not selected for training becomes the test split.
test_indices = list(all_indices - set(train_indices))

# Create train and test splits
train_df = df.iloc[train_indices].reset_index(drop=True)
test_df = df.iloc[test_indices].reset_index(drop=True)

# Remove the helper 'topics' column
train_df = train_df.drop('topics', axis=1)
test_df = test_df.drop('topics', axis=1)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 400
Test size: 152


In [8]:
# Verify topic coverage in training: collect the title of every non-empty
# context entry that appears in the training split.
train_topics = {
    context_item[0]
    for context in train_df['context']
    for context_item in context
    if len(context_item) > 0
}

print(f"Topics in training set: {len(train_topics)}")

# Save the splits
for split_df, split_path in ((train_df, 'train_data.json'), (test_df, 'test_data.json')):
    split_df.to_json(split_path, orient='records', force_ascii=False, indent=2)

Topics in training set: 388


# RAG (PersianMHQA)

In [1]:
import pandas as pd

# Reload the persisted splits (train first, then test) from their JSON files.
train_df, test_df = (
    pd.read_json(split_path, orient='records', encoding='utf-8')
    for split_path in ('train_data.json', 'test_data.json')
)

## Prepare data for Retrieval

In [2]:
from langchain_core.documents import Document

def _row_to_documents(row):
    """Yield one Document per sentence of every context entry of a test row."""
    for entry in row['context']:
        title = entry[0]
        sentences = entry[1]
        for index_sent, sentence in enumerate(sentences):
            yield Document(
                # The title is prepended so each sentence carries its topic.
                page_content=title + " : " + sentence,
                metadata={
                    'title': title,
                    'index': index_sent,
                    'question': row['question'],
                    'answer': row['answer'],
                },
            )


# Retrieval corpus: every context sentence of every test row, in row order.
documents: list[Document] = [
    document
    for _, row in test_df.iterrows()
    for document in _row_to_documents(row)
]

print("Length of documents: ", len(documents))

Length of documents:  1316


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunk length is measured with len() (characters).
splitter_settings = {
    "chunk_size": 1024,
    "chunk_overlap": 64,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunks = text_splitter.split_documents(documents)

print(f"Number of chunks: {len(chunks)}")

Number of chunks: 1316


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model loaded from the local Hugging Face cache only (local_files_only).
embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    cache_folder=r"Q:\hf_models",
    model_kwargs={"local_files_only": True},
)

# Embed every chunk into a FAISS index and expose it as a top-3 retriever.
vector_store = FAISS.from_documents(chunks, embedding_model)
semantic_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [6]:
# Persist the FAISS index under the local folder "qwen3-0.6B-embs".
vector_store.save_local("qwen3-0.6B-embs")

In [None]:
# Load the FAISS index previously written with vector_store.save_local(...)
# instead of re-embedding all chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="Qwen/Qwen3-Embedding-0.6B",
    cache_folder=r"Q:\hf_models",
    model_kwargs={"local_files_only": True},
)
# SECURITY: FAISS.load_local unpickles the stored docstore, so current
# langchain_community releases refuse to load unless the caller opts in.
# Only enable this for index folders you created yourself.
vector_store = FAISS.load_local(
    "qwen3-0.6B-embs",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)
# Rebuild the retriever so it points at the reloaded store; otherwise this
# load path leaves `semantic_retriever` undefined on a fresh kernel.
semantic_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [7]:
from langchain_core.prompts import SystemMessagePromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate

# System turn: task instructions followed by the retrieved context.
system_instruction = SystemMessagePromptTemplate.from_template(
    "You are a helpful assistant. Use the provided context to answer the user's question as accurately as possible in few words in Persian.\nContext:\n{context}"
)
# Human turn: the raw question.
user_question = HumanMessagePromptTemplate.from_template("{question}")

prompt = ChatPromptTemplate.from_messages([system_instruction, user_question])

## API

### Avval AI

In [8]:
import dotenv
import os
# Load settings from a local .env file; the return value is echoed as the cell output.
dotenv.load_dotenv()

True

In [36]:
from langchain_openai import ChatOpenAI


# Chat model reached through an OpenAI-compatible endpoint; the API key is
# read from the AVVALAI environment variable.
llm = ChatOpenAI(
    model="gemini-2.5-flash-preview-05-20",
    base_url="https://api.avalapis.ir/v1",
    api_key=os.environ.get("AVVALAI"),
)

### Gemini

In [None]:
import getpass

# Ask for the API key interactively so it is never stored in the notebook source.
api_key = getpass.getpass("Enter your API key: ")

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Alternative backend: call Gemini directly with the key entered via getpass.
llm = ChatGoogleGenerativeAI(
    api_key=api_key,
    model="gemini-2.5-flash-preview-05-20",
)

## Semantic RAG

In [29]:
from langchain_core.runnables import RunnableMap, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()

def generate_response(inputs):
    """Answer a question with the LLM, using the retrieved documents as context.

    Args:
        inputs: Mapping with a "question" string and a "context" list of
            documents (each exposing ``page_content``).

    Returns:
        A dict with the parsed model output under "response" and the
        unchanged documents under "context".
    """
    # Format prompt with context and question
    context_text = "\n\n".join(doc.page_content for doc in inputs["context"])
    formatted_prompt = prompt.format_prompt(
        question=inputs["question"],
        context=context_text,
    ).to_messages()
    # Calling the model object directly (`llm(...)`) is deprecated and emits a
    # warning on every request; `invoke` is the supported entry point.
    result = llm.invoke(formatted_prompt)
    parsed_result = output_parser.invoke(result)
    return {
        "response": parsed_result,
        "context": inputs["context"],
    }
def get_context_dense(inputs, num_queries=3):
    """Attach dense (embedding-based) retrieval results to the pipeline payload.

    Args:
        inputs: Mapping that contains the "question" to search for.
        num_queries: Number of documents requested from the retriever.

    Returns:
        A new dict with the retrieved documents under "context" plus every
        key of ``inputs`` (a "context" key already present in ``inputs``
        takes precedence, as before).
    """
    # `get_relevant_documents` is deprecated in LangChain; `invoke` is the
    # supported call and accepts the same extra keyword arguments.
    docs = semantic_retriever.invoke(inputs["question"], k=num_queries)
    return {"context": docs, **inputs}

# Pipeline stages, named individually so the chain reads top to bottom.
keep_question = RunnableMap({
    "question": lambda payload: payload["question"],
})
retrieve_context = RunnableLambda(get_context_dense)
select_generation_inputs = RunnableMap({
    "context": lambda payload: payload["context"],
    "question": lambda payload: payload.get("question", ""),
})
answer_question = RunnableLambda(generate_response)

semantic_rag = keep_question | retrieve_context | select_generation_inputs | answer_question

# Now, let's test the semantic RAG pipeline with a sample query.
response = semantic_rag.invoke({"question": "کارگردان فیلم شب یلدا دانش آموخته چه رشته ای می باشد ؟"})
print(response["response"])

  result = llm(formatted_prompt)


مهندسی کشاورزی


In [30]:
def convert_digits_en2fa(text):
    """Return ``text`` with ASCII digits 0-9 replaced by Persian digits ۰-۹."""
    english_digits = '0123456789'
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'

    translation_table = str.maketrans(english_digits, persian_digits)
    return text.translate(translation_table)

def parse_result(gold, generated_text) -> bool:
    """Decide whether ``generated_text`` contains the gold answer.

    Both strings are normalised first: ASCII digits become Persian digits,
    and Arabic yeh/kaf (U+064A, U+0643) become their Persian forms
    (U+06CC, U+06A9) so look-alike spellings compare equal. The gold answer
    is then searched as a substring of the generated text.

    Args:
        gold: Reference answer.
        generated_text: Model output to check.

    Returns:
        True when the normalised gold answer occurs in the normalised output,
        or when both express "yes" using either spelling ("بله" / "بلی").
        An empty or whitespace-only gold answer never matches, because an
        empty string is a substring of everything.
    """
    # Arabic yeh/kaf -> Persian yeh/kaf; written as escapes because the
    # glyphs are visually indistinguishable.
    letter_table = str.maketrans('\u064a\u0643', '\u06cc\u06a9')

    gold = convert_digits_en2fa(gold).translate(letter_table).strip()
    generated_text = convert_digits_en2fa(generated_text).translate(letter_table)

    if not gold:
        return False
    if gold in generated_text:
        return True

    # Accept either spelling of "yes", whichever side uses which.
    yes_variants = ("بله", "بلی")
    if gold in yes_variants:
        return any(variant in generated_text for variant in yes_variants)

    return False

In [37]:
import time
# Run the RAG pipeline on every test question, keeping each raw response and
# counting how many are judged correct by parse_result.
responses = []
correct = 0

for i, row in test_df.iterrows():
    # NOTE(review): `responses` is re-initialised above, so this resume guard only
    # has an effect if that initialisation is skipped — confirm the intent.
    if i<len(responses):
        continue
    try:
        response = semantic_rag.invoke({"question": row["question"]})
    except Exception as e:
        # One retry after a 20 s pause; a second failure is not caught and
        # aborts the loop.
        print(e)
        time.sleep(20)
        response = semantic_rag.invoke({"question": row["question"]})
    if parse_result(row["answer"], response["response"]):
        correct += 1
    print(response["response"])
    print("Gold", row["answer"])
    responses.append(response)
    print("-"*25)
    # Fixed pause between questions (presumably to respect API rate limits — confirm).
    time.sleep(20)

برای نگاناسان‌ها بله، اما درباره مردم انتس اطلاعاتی در متن موجود نیست.
Gold بله
-------------------------
خیر. طبق متن، فقط نگاناسان‌ها به این زبان تکلم می‌کنند.
Gold خیر
-------------------------
خیر، متن فقط درباره مردم انتس اطلاعات دارد و به نگاناسان‌ها اشاره‌ای نمی‌کند.
Gold خیر
-------------------------
مایکل کین
Gold مایکل کین
-------------------------
اطلاعات مربوط به تأسیس شرکت تکمو در متن ارائه نشده است.
Gold خیر
-------------------------
در متن ارائه شده به گروه نقاشان تصویرگر اشاره نشده است.
Gold پیشارافائلی
-------------------------
تب
Gold تب
-------------------------
خیر، نینتندو مجموعه بازی برادران سوپر اسمش را منتشر کرده است.
Gold خیر
-------------------------
این اطلاعات در متن ارائه نشده است.
Gold سال 1953
-------------------------
طراحی شخصیت
Gold انیمه و مانگا
-------------------------
اطلاعاتی در این زمینه در متن ارائه نشده است.
Gold مغول‌ها
-------------------------
اطلاعاتی در این مورد در متن ارائه نشده است.
Gold جیوه
-------------------------
خیر، در متن به آن ا

## Evaluation:

### Accuracy:

In [38]:
# Accuracy: share of test questions that parse_result counted as correct.
print(correct / len(test_df))

0.4342105263157895


### Accuracy LLM as Judge:

In [None]:
from pydantic import BaseModel, Field
from langchain.schema import HumanMessage
import time

# Structured verdict requested from the judge LLM for one question.
# (Documented with comments rather than a class docstring so the model's
# schema is left unchanged.)
class QuestionAnsweringJudgement(BaseModel):
    # True when the judge considers the generated answer equivalent to the true answer.
    same_result: bool = Field(description="Analyzes if results are same set true")
    # Free-text justification for the verdict.
    reason: str = Field(description="Reason if results are same or not same")

# Wrap the chat model so its replies are parsed into QuestionAnsweringJudgement.
llm_with_structure = llm.with_structured_output(QuestionAnsweringJudgement)

In [None]:
# Ask the judge LLM, for every test row, whether the generated answer matches
# the true answer. `llm_as_judge_responses` stays aligned with `test_df`:
# a row whose judgement failed is recorded as None.
llm_as_judge_responses = []
llm_as_judge_correct = 0
for i, row in test_df.iterrows():
    # Use a dedicated name here: rebinding the global `prompt` would replace the
    # ChatPromptTemplate that generate_response() formats and break the RAG
    # pipeline for any later run in the same kernel.
    judge_prompt = f"""Judge question for generated answer with true answer and output result in json format in true false format with reason
    question: {row['question']}
    true answer: {row['answer']}
    generated answer: {responses[i]['response']}
    context for true answer: {row['context']}
    """

    try:
        response = llm_with_structure.invoke([HumanMessage(content=judge_prompt)])
    except Exception as e:
        print(e)
        time.sleep(10)
        # Keep a placeholder so list positions still correspond to test rows.
        llm_as_judge_responses.append(None)
        continue

    llm_as_judge_responses.append(response)
    # A missing (None) judgement is counted as "not the same".
    if response is not None and response.same_result:
        llm_as_judge_correct += 1

In [None]:
# Judge-based accuracy; rows whose judge call failed still count in the denominator.
print(llm_as_judge_correct / len(test_df))

### F1

In [39]:
import collections
import string

def compute_f1_metrics(generated_response: str, ground_truth_answer: str):
    """
    Token-overlap precision, recall and F1 between a response and its reference.

    Both texts have punctuation removed (ASCII punctuation plus the Persian
    comma), are lower-cased and split on whitespace. The overlap counts
    every shared token as often as it occurs in both texts.

    Args:
        generated_response: The text generated by the model.
        ground_truth_answer: The correct, reference text.

    Returns:
        A dictionary with the keys 'f1', 'precision' and 'recall'. All three
        are 0.0 when either text yields no tokens.
    """
    stripped_chars = set(string.punctuation+"،")

    def tokenize(text):
        # Drop punctuation first, then normalise case and split into words.
        cleaned = ''.join(ch for ch in text if ch not in stripped_chars)
        return cleaned.lower().split()

    prediction_tokens = tokenize(generated_response)
    ground_truth_tokens = tokenize(ground_truth_answer)

    # Nothing to compare when either side is empty.
    if not (prediction_tokens and ground_truth_tokens):
        return {'f1': 0.0, 'precision': 0.0, 'recall': 0.0}

    # Multiset intersection keeps the minimum count of every shared token.
    overlap = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens)
    num_common_words = sum(overlap.values())

    precision = num_common_words / len(prediction_tokens)
    recall = num_common_words / len(ground_truth_tokens)

    # Harmonic mean, defined as 0 when there is no overlap at all.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {'f1': f1, 'precision': precision, 'recall': recall}

In [40]:
# Score every stored response against its gold answer, then split the
# per-question results into one list per metric.
per_question_scores = [
    compute_f1_metrics(responses[i]["response"], row["answer"])
    for i, row in test_df.iterrows()
]

precisions = [score["precision"] for score in per_question_scores]
recalls = [score["recall"] for score in per_question_scores]
f1s = [score["f1"] for score in per_question_scores]


In [41]:
import numpy as np
# Report the mean of each per-question metric.
for metric_label, metric_values in (("precision", precisions), ("recall", recalls), ("F1", f1s)):
    print(f"Average {metric_label}:", np.mean(metric_values))

Average precision: 0.20543545698873594
Average recall: 0.4469298245614035
Average F1: 0.2324362339338101


### HotpotQA Evaluation scripts:

In [43]:
import sys
import json
import re
import string
from collections import Counter
import pickle

def normalize_answer(s):
    """Normalise an answer string before comparison.

    Steps, in order: lower-case, remove punctuation (ASCII punctuation plus
    the Persian comma), replace the English articles a/an/the with a space,
    and collapse all whitespace runs into single spaces.
    """
    punctuation = set(string.punctuation+"،")

    text = s.lower()
    text = ''.join(ch for ch in text if ch not in punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())


def f1_score(prediction, ground_truth):
    """Return ``(f1, precision, recall)`` of token overlap after normalisation.

    When either normalised string is one of the special answers
    ('بله', 'خیر', 'noanswer'), only an exact match is scored; any other
    pairing yields all zeros. All zeros are also returned when the two
    answers share no token.
    """
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0, 0, 0)
    special_answers = ['بله', 'خیر', 'noanswer']

    # Special answers are all-or-nothing: partial token overlap earns no credit.
    involves_special = (
        normalized_prediction in special_answers
        or normalized_ground_truth in special_answers
    )
    if involves_special and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()

    # Multiset intersection: shared tokens counted with multiplicity.
    shared = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(shared.values())
    if num_same == 0:
        return ZERO_METRIC

    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def exact_match_score(prediction, ground_truth):
    """Return True when both answers are identical after normalisation."""
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_ground_truth

def update_answer(metrics, prediction, gold):
    """Add one question's answer scores to the running totals in ``metrics``.

    Updates the 'em', 'f1', 'prec' and 'recall' entries in place and returns
    ``(em, prec, recall)`` for this question.
    """
    em = exact_match_score(prediction, gold)
    f1, prec, recall = f1_score(prediction, gold)

    contributions = (('em', float(em)), ('f1', f1), ('prec', prec), ('recall', recall))
    for metric_name, value in contributions:
        metrics[metric_name] += value

    return em, prec, recall

def update_sp(metrics, prediction, gold):
    """Add one question's supporting-fact scores to the totals in ``metrics``.

    ``prediction`` and ``gold`` are iterables of facts; each fact is turned
    into a tuple and the two collections are compared as sets. The 'sp_em',
    'sp_f1', 'sp_prec' and 'sp_recall' entries are updated in place.

    Returns:
        ``(em, prec, recall)`` for this question.
    """
    predicted_facts = {tuple(fact) for fact in prediction}
    gold_facts = {tuple(fact) for fact in gold}

    # Set algebra gives the confusion counts directly.
    tp = len(predicted_facts & gold_facts)
    fp = len(predicted_facts - gold_facts)
    fn = len(gold_facts - predicted_facts)

    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
    # Exact match means no spurious and no missing facts.
    em = 1.0 if fp + fn == 0 else 0.0

    for metric_name, value in (('sp_em', em), ('sp_f1', f1), ('sp_prec', prec), ('sp_recall', recall)):
        metrics[metric_name] += value
    return em, prec, recall

def eval(prediction, gold_file):
    """Score predictions against a gold JSON file and print the averaged metrics.

    NOTE: this function's name shadows the built-in ``eval``.

    Args:
        prediction: Dict with two sub-dicts keyed by sample id: 'answer'
            (predicted answer text) and 'sp' (predicted supporting facts).
        gold_file: Path to a JSON file holding a list of records, each with
            'id', 'answer' and 'supporting_facts'.

    Returns:
        None. The metrics dict (answer, supporting-fact and joint scores,
        each averaged over the evaluated records) is printed.
    """
    with open(gold_file, 'r', encoding="utf-8") as f:
        gold = json.load(f)

    # Running sums; turned into averages at the end.
    metrics = {'em': 0, 'f1': 0, 'prec': 0, 'recall': 0,
        'sp_em': 0, 'sp_f1': 0, 'sp_prec': 0, 'sp_recall': 0,
        'joint_em': 0, 'joint_f1': 0, 'joint_prec': 0, 'joint_recall': 0}
    # Number of gold records that had a predicted answer.
    count = 0
    for dp in gold:
        cur_id = dp['id']

        # Flatten each gold entry [title, [indices...]] into [title, index] pairs.
        supporting_facts = []
        for t in dp['supporting_facts']:
            for item in t[1]:
                supporting_facts.append([t[0], int(item)])
        dp['supporting_facts'] = supporting_facts

        can_eval_joint = True
        if cur_id not in prediction['answer']:
            # print('missing answer {}'.format(cur_id))
            can_eval_joint = False
            # Records without a predicted answer are skipped and not counted.
            continue
        else:
            em, prec, recall = update_answer(
                metrics, prediction['answer'][cur_id], dp['answer'])
        if cur_id not in prediction['sp']:
            # Answer metrics were already added; only the joint metrics are skipped.
            print('missing sp fact {}'.format(cur_id))
            can_eval_joint = False
        else:
            sp_em, sp_prec, sp_recall = update_sp(
                metrics, prediction['sp'][cur_id], dp['supporting_facts'])
        count += 1

        if can_eval_joint:
            # Joint scores multiply the answer score by the supporting-fact score.
            joint_prec = prec * sp_prec
            joint_recall = recall * sp_recall
            if joint_prec + joint_recall > 0:
                joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
            else:
                joint_f1 = 0.
            joint_em = em * sp_em

            metrics['joint_em'] += joint_em
            metrics['joint_f1'] += joint_f1
            metrics['joint_prec'] += joint_prec
            metrics['joint_recall'] += joint_recall

    # N = len(gold)
    # Average over the evaluated records only (not over all gold records).
    for k in metrics.keys():
        # metrics[k] /= N
        metrics[k] = metrics[k] / count if count > 0 else 0.0

    print(metrics)

In [42]:
# prepare prediction format for eval: answers and retrieved supporting facts,
# both keyed by the integer sample id of the matching test row.
prediction = {'answer': {}, 'sp': {}}
sps_count = 0
for position, result in enumerate(responses):
    # Each retrieved chunk contributes one [title, sentence index] pair.
    retrieved_facts = [
        [chunk.metadata["title"], chunk.metadata["index"]]
        for chunk in result["context"]
    ]
    sample_id = int(test_df.iloc[position]["id"])

    prediction['answer'][sample_id] = result["response"]
    prediction['sp'][sample_id] = retrieved_facts
    sps_count += len(retrieved_facts)

print(f'Average number of supporting facts: {sps_count/len(responses)}')

Average number of supporting facts: 3.0


In [44]:
# Score the predictions against the gold records in test_data.json (eval prints the metrics).
eval(prediction, "test_data.json")

{'em': 0.13157894736842105, 'f1': 0.18629385964912282, 'prec': 0.17866019214703424, 'recall': 0.2166666666666667, 'sp_em': 0.02631578947368421, 'sp_f1': 0.5220238095238092, 'sp_prec': 0.7214912280701758, 'sp_recall': 0.4521929824561404, 'joint_em': 0.013157894736842105, 'joint_f1': 0.12197037726816125, 'joint_prec': 0.150717940685046, 'joint_recall': 0.12682748538011693}


## PQUAD