In [7]:
# خلية 1: استيراد المكتبات اللازمة

import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import inflect
import re
from bs4 import BeautifulSoup
import unicodedata
import contractions
import json
import asyncio
import httpx

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

#---

# خلية 2: TextProcessor كما سبق (مع تعديل دالة number_to_words)

class TextProcessor:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.inflect_engine = inflect.engine()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def cleaned_text(self, text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def normalization_example(self, text):
        return text.lower()

    def stemming_example(self, text):
        words = self.tokenizer.tokenize(text)
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    def lemmatization_example(self, text):
        words = self.tokenizer.tokenize(text)
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)

    def remove_stopwords(self, text):
        words = self.tokenizer.tokenize(text)
        filtered_words = [word for word in words if word.lower() not in self.stop_words]
        return ' '.join(filtered_words)

    def number_to_words(self, text):
        words = self.tokenizer.tokenize(text)
        converted_words = []
        for word in words:
            # تحقق من أن الكلمة أرقام عادية فقط
            if word.isdecimal() and word.isascii():
                try:
                    num = int(word)
                    if num <= 999999999999999:
                        converted_word = self.inflect_engine.number_to_words(word)
                        converted_words.append(converted_word)
                    else:
                        converted_words.append("[Number Out of Range]")
                except (ValueError, inflect.NumOutOfRangeError):
                    converted_words.append("[Number Out of Range]")
            else:
                converted_words.append(word)
        return ' '.join(converted_words)

    def expand_contractions(self, text):
        return contractions.fix(text)

    def normalize_unicode(self, text):
        return unicodedata.normalize("NFKD", text)

    def handle_negations(self, text):
        words = self.tokenizer.tokenize(text)
        negated_text = []
        negate = False
        for word in words:
            if word.lower() in ['not', "n't"]:
                negate = True
            elif negate:
                negated_text.append(f"NOT_{word}")
                negate = False
            else:
                negated_text.append(word)
        return ' '.join(negated_text)

    def remove_urls(self, text):
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def preprocess(self, text):
        if text is None:
            return text
        text = self.cleaned_text(text)
        text = self.normalization_example(text)
        text = self.stemming_example(text)
        text = self.lemmatization_example(text)
        text = self.remove_stopwords(text)
        text = self.number_to_words(text)
        text = self.expand_contractions(text)
        text = self.normalize_unicode(text)
        text = self.handle_negations(text)
        text = self.remove_urls(text)
        return text

processor = TextProcessor()

#---

# خلية 3: جلب البيانات من MongoDB

def get_data_from_mongo(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection_name = dataset_path.replace("/", "_")
    collection = db[collection_name]

    pids = []
    texts = []
    cursor = collection.find({}, {"_id": 0, "doc_id": 1, "text": 1})
    for doc in cursor:
        if "doc_id" in doc and "text" in doc and isinstance(doc["text"], str):
            pids.append(str(doc["doc_id"]))
            texts.append(doc["text"])

    df = pd.DataFrame({"pid": pids, "text": texts})
    df.dropna(subset=['text'], inplace=True)
    return df

#---

# خلية 4: بناء TF-IDF في الذاكرة

def build_tfidf_in_memory(df):
    vectorizer = TfidfVectorizer(preprocessor=processor.preprocess, max_df=0.5, min_df=1)
    tfidf_matrix = vectorizer.fit_transform(df['text'])
    return vectorizer, tfidf_matrix

#---

# خلية 5: البحث في TF-IDF

def search_in_tfidf(query, vectorizer, tfidf_matrix, df, top_n=10):
    processed_query = processor.preprocess(query)
    query_vector = vectorizer.transform([processed_query])
    cosine_similarities = cosine_similarity(tfidf_matrix, query_vector).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    top_docs = df.iloc[top_indices]
    results = {
        "top_documents": top_docs.to_dict(orient="records"),
        "cosine_similarities": cosine_similarities[top_indices].tolist(),
        "top_documents_indices": top_indices.tolist()
    }
    return results

#---

# خلية 6: دوال التقييم

all_precisions = []
all_recalls = []
all_map_scores = []
all_mrrs = []

def calculate_precision_recall(relevantOrNot, retrievedDocument, threshold=0.5):
    binaryResult = (retrievedDocument >= threshold).astype(int)
    precision = precision_score(relevantOrNot, binaryResult, average='micro')
    recall = recall_score(relevantOrNot, binaryResult, average='micro')
    return precision, recall

def calculate_map_score(relevantOrNot, retrievedDocument):
    return average_precision_score(relevantOrNot, retrievedDocument, average='micro')

def calculate_mrr(y_true):
    rank_position = np.where(y_true == 1)[0]
    if len(rank_position) == 0:
        return 0
    else:
        return 1 / (rank_position[0] + 1)

def load_queries(queries_paths):
    queries = []
    for file_path in queries_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    query = json.loads(line.strip())
                    if 'query' in query:
                        queries.append(query)
                except json.JSONDecodeError:
                    print(f"Skipping invalid line in {file_path}: {line}")
    return queries

#---

# خلية 7: تقييم بحث TF-IDF (يرجى تعديل search_function حسب حاجتك)

def evaluate_search(dataset_path, search_function):
    import time
    start_time = time.time()
    
    df = get_data_from_mongo(dataset_path)
    
    queries_paths = ''
    if dataset_path == 'lotte/lifestyle/dev/forum':
        queries_paths = r'C:\Users\USER\.ir_datasets\lotte\lotte_extracted\lotte\lifestyle\dev\qas.search.jsonl'
    elif dataset_path == 'antique/train':
        queries_paths = r'C:\Users\USER\.ir_datasets\antique\test\Answers.jsonl'
    else:
        print("Warning: No queries path configured for this dataset.")
        return
    
    queries = load_queries([queries_paths])

    for query in queries:
        if 'query' not in query:
            continue
        
        # استدعاء دالة البحث مع الباراميترات المناسبة
        response_json = search_function(query['query'], top_n=10)
        
        top_documents = response_json["top_documents"]
        cosine_similarities = np.array(response_json["cosine_similarities"])
        top_documents_indices = response_json["top_documents_indices"]

        relevance = np.zeros(len(df))

        for pid in query.get('answer_pids', []):
            pid_str = str(pid)
            indices = np.where(df['pid'] == pid_str)[0]
            relevance[indices] = 1

        retrievedDocument = cosine_similarities
        relevantOrNot = relevance[top_documents_indices]

        if relevantOrNot.sum() == 0:
            continue

        precision, recall = calculate_precision_recall(relevantOrNot, retrievedDocument)
        all_precisions.append(precision)
        all_recalls.append(recall)

        map_score = calculate_map_score(relevantOrNot, retrievedDocument)
        all_map_scores.append(map_score)

        mrr = calculate_mrr(relevantOrNot)
        all_mrrs.append(mrr)

    if len(all_precisions) == 0:
        print("⚠️ No valid queries evaluated. Check PIDs matching and dataset content.")
        return

    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_map_score = np.mean(all_map_scores)
    avg_mrr = np.mean(all_mrrs)

    elapsed_time = time.time() - start_time

    print(f"Evaluation results for dataset: {dataset_path}")
    print(f"Execution Time (seconds): {elapsed_time:.2f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average MAP Score: {avg_map_score:.4f}")
    print(f"Average MRR: {avg_mrr:.4f}")

#---

# خلية 8: مثال كامل للاستخدام (تعديل حسب حاجتك)

dataset_path = "lotte/lifestyle/dev/forum"  # غير هذا حسب بياناتك

df = get_data_from_mongo(dataset_path)
vectorizer, tfidf_matrix = build_tfidf_in_memory(df)

def search_function(query, top_n=10):
    return search_in_tfidf(query, vectorizer, tfidf_matrix, df, top_n)

# تشغيل التقييم (في حال أردت)

evaluate_search(dataset_path, search_function)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluation results for dataset: lotte/lifestyle/dev/forum
Execution Time (seconds): 301.16
Average Precision: 0.4971
Average Recall: 0.4971
Average MAP Score: 0.5649
Average MRR: 0.5903


In [6]:
# خلية 1: استيراد المكتبات اللازمة

import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import inflect
import re
from bs4 import BeautifulSoup
import unicodedata
import contractions
import json
import os
import joblib
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

#---

# خلية 2: TextProcessor كما سبق (مع تعديل دالة number_to_words)

class TextProcessor:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.inflect_engine = inflect.engine()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def cleaned_text(self, text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def normalization_example(self, text):
        return text.lower()

    def stemming_example(self, text):
        words = self.tokenizer.tokenize(text)
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    def lemmatization_example(self, text):
        words = self.tokenizer.tokenize(text)
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)

    def remove_stopwords(self, text):
        words = self.tokenizer.tokenize(text)
        filtered_words = [word for word in words if word.lower() not in self.stop_words]
        return ' '.join(filtered_words)

    def number_to_words(self, text):
        words = self.tokenizer.tokenize(text)
        converted_words = []
        for word in words:
            # تحقق من أن الكلمة أرقام عادية فقط
            if word.isdecimal() and word.isascii():
                try:
                    num = int(word)
                    if num <= 999999999999999:
                        converted_word = self.inflect_engine.number_to_words(word)
                        converted_words.append(converted_word)
                    else:
                        converted_words.append("[Number Out of Range]")
                except (ValueError, inflect.NumOutOfRangeError):
                    converted_words.append("[Number Out of Range]")
            else:
                converted_words.append(word)
        return ' '.join(converted_words)

    def expand_contractions(self, text):
        return contractions.fix(text)

    def normalize_unicode(self, text):
        return unicodedata.normalize("NFKD", text)

    def handle_negations(self, text):
        words = self.tokenizer.tokenize(text)
        negated_text = []
        negate = False
        for word in words:
            if word.lower() in ['not', "n't"]:
                negate = True
            elif negate:
                negated_text.append(f"NOT_{word}")
                negate = False
            else:
                negated_text.append(word)
        return ' '.join(negated_text)

    def remove_urls(self, text):
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def preprocess(self, text):
        if text is None:
            return text
        text = self.cleaned_text(text)
        text = self.normalization_example(text)
        text = self.stemming_example(text)
        text = self.lemmatization_example(text)
        text = self.remove_stopwords(text)
        text = self.number_to_words(text)
        text = self.expand_contractions(text)
        text = self.normalize_unicode(text)
        text = self.handle_negations(text)
        text = self.remove_urls(text)
        return text

processor = TextProcessor()

#---

# خلية 3: جلب البيانات من MongoDB

def get_data_from_mongo(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection_name = dataset_path.replace("/", "_")
    collection = db[collection_name]

    pids = []
    texts = []
    cursor = collection.find({}, {"_id": 0, "doc_id": 1, "text": 1})
    for doc in cursor:
        if "doc_id" in doc and "text" in doc and isinstance(doc["text"], str):
            pids.append(str(doc["doc_id"]))
            texts.append(doc["text"])

    df = pd.DataFrame({"pid": pids, "text": texts})
    df.dropna(subset=['text'], inplace=True)
    return df

#---

# تحميل الموديلات خارج الدالة حتى تكون مرة وحدة
retrieval_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
processor = TextProcessor()

# تحميل FAISS index والـ doc_ids
def get_faiss_index_and_doc_ids(dataset_path: str):
    db_dir = os.path.join(r"C:\Users\USER\Desktop\IR_Final_Project\db", dataset_path.replace("/", "__"))
    index = faiss.read_index(os.path.join(db_dir, "bert_faiss.index"))
    doc_ids = joblib.load(os.path.join(db_dir, "bert_doc_ids.joblib"))
    return index, doc_ids

# تحميل كل الوثائق كماب: doc_id → text
def load_documents_map(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection = db[dataset_path.replace("/", "_")]
    cursor = collection.find({}, {"_id": 0, "doc_id": 1, "text": 1})
    doc_map = {}
    for doc in cursor:
        if "doc_id" in doc and "text" in doc:
            doc_map[str(doc["doc_id"])] = doc["text"]
    return doc_map

# دالة البحث باستخدام BERT + FAISS
def search_in_bert(query, dataset_path, top_k=50, rerank_k=10):
    index, doc_ids = get_faiss_index_and_doc_ids(dataset_path)
    doc_map = load_documents_map(dataset_path)
    doc_id_to_index = {str(doc_id): i for i, doc_id in enumerate(doc_ids)}

    query_processed = processor.preprocess(query)
    query_vec = retrieval_model.encode(query_processed, normalize_embeddings=True).astype(np.float32).reshape(1, -1)
    faiss.normalize_L2(query_vec)

    scores, indices = index.search(query_vec, top_k)
    top_doc_ids = [doc_ids[i] for i in indices[0]]
    top_docs = [(str(doc_id), doc_map.get(str(doc_id), "")) for doc_id in top_doc_ids]

    filtered_docs = [(doc_id, text) for doc_id, text in top_docs if text.strip()]
    pairs = [(query, text) for _, text in filtered_docs]
    rerank_scores = cross_encoder.predict(pairs)

    ranked = sorted(zip(filtered_docs, rerank_scores), key=lambda x: x[1], reverse=True)
    reranked = ranked[:rerank_k]

    top_documents = []
    cosine_similarities = []
    top_documents_indices = []

    for (doc_id, text), score in reranked:
        top_documents.append({
            "doc_id": doc_id,
            "score": float(score),
            "text": text
        })
        cosine_similarities.append(float(score))
        top_documents_indices.append(doc_id_to_index.get(doc_id, -1))

    return {
        "top_documents": top_documents,
        "cosine_similarities": cosine_similarities,
        "top_documents_indices": top_documents_indices
    }


# خلية 6: دوال التقييم

all_precisions = []
all_recalls = []
all_map_scores = []
all_mrrs = []

def calculate_precision_recall(relevantOrNot, retrievedDocument, threshold=0.5):
    binaryResult = (retrievedDocument >= threshold).astype(int)
    precision = precision_score(relevantOrNot, binaryResult, average='micro')
    recall = recall_score(relevantOrNot, binaryResult, average='micro')
    return precision, recall

def calculate_map_score(relevantOrNot, retrievedDocument):
    return average_precision_score(relevantOrNot, retrievedDocument, average='micro')

def calculate_mrr(y_true):
    rank_position = np.where(y_true == 1)[0]
    if len(rank_position) == 0:
        return 0
    else:
        return 1 / (rank_position[0] + 1)

def load_queries(queries_paths):
    queries = []
    for file_path in queries_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    query = json.loads(line.strip())
                    if 'query' in query:
                        queries.append(query)
                except json.JSONDecodeError:
                    print(f"Skipping invalid line in {file_path}: {line}")
    return queries

#---

# خلية 7: تقييم بحث TF-IDF (يرجى تعديل search_function حسب حاجتك)

def evaluate_search(dataset_path, search_function):
    import time
    start_time = time.time()
    
    df = get_data_from_mongo(dataset_path)
    
    queries_paths = ''
    if dataset_path == 'lotte/lifestyle/dev/forum':
        queries_paths = r'C:\Users\USER\.ir_datasets\lotte\lotte_extracted\lotte\lifestyle\dev\qas.search.jsonl'
    elif dataset_path == 'antique/train':
        queries_paths = r'C:\Users\USER\.ir_datasets\antique\test\Answers.jsonl'
    else:
        print("Warning: No queries path configured for this dataset.")
        return
    
    queries = load_queries([queries_paths])

    for query in queries:
        if 'query' not in query:
            continue
        
        # استدعاء دالة البحث مع الباراميترات المناسبة
        response_json = search_function(query['query'], top_n=10)
        
        top_documents = response_json["top_documents"]
        cosine_similarities = np.array(response_json["cosine_similarities"])
        top_documents_indices = response_json["top_documents_indices"]

        relevance = np.zeros(len(df))

        for pid in query.get('answer_pids', []):
            pid_str = str(pid)
            indices = np.where(df['pid'] == pid_str)[0]
            relevance[indices] = 1

        retrievedDocument = cosine_similarities
        relevantOrNot = relevance[top_documents_indices]

        if relevantOrNot.sum() == 0:
            continue

        precision, recall = calculate_precision_recall(relevantOrNot, retrievedDocument)
        all_precisions.append(precision)
        all_recalls.append(recall)

        map_score = calculate_map_score(relevantOrNot, retrievedDocument)
        all_map_scores.append(map_score)

        mrr = calculate_mrr(relevantOrNot)
        all_mrrs.append(mrr)

    if len(all_precisions) == 0:
        print("⚠️ No valid queries evaluated. Check PIDs matching and dataset content.")
        return

    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_map_score = np.mean(all_map_scores)
    avg_mrr = np.mean(all_mrrs)

    elapsed_time = time.time() - start_time

    print(f"Evaluation results for dataset: {dataset_path}")
    print(f"Execution Time (seconds): {elapsed_time:.2f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average MAP Score: {avg_map_score:.4f}")
    print(f"Average MRR: {avg_mrr:.4f}")

#---

# خلية 8: مثال كامل للاستخدام (تعديل حسب حاجتك)

dataset_path = "lotte/lifestyle/dev/forum"  

def search_function(query, top_n=10):
    return search_in_bert(query, dataset_path, top_k=50, rerank_k=top_n)

# تشغيل التقييم
evaluate_search(dataset_path, search_function)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluation results for dataset: lotte/lifestyle/dev/forum
Execution Time (seconds): 7989.27
Average Precision: 0.4691
Average Recall: 0.4691
Average MAP Score: 0.7245
Average MRR: 0.7826


In [1]:
# خلية 1: استيراد المكتبات اللازمة

import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import inflect
import re
from bs4 import BeautifulSoup
import unicodedata
import contractions
import json
import os
import joblib
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

#---

# خلية 2: TextProcessor كما سبق (مع تعديل دالة number_to_words)

class TextProcessor:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.inflect_engine = inflect.engine()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def cleaned_text(self, text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def normalization_example(self, text):
        return text.lower()

    def stemming_example(self, text):
        words = self.tokenizer.tokenize(text)
        stemmed_words = [self.stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    def lemmatization_example(self, text):
        words = self.tokenizer.tokenize(text)
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)

    def remove_stopwords(self, text):
        words = self.tokenizer.tokenize(text)
        filtered_words = [word for word in words if word.lower() not in self.stop_words]
        return ' '.join(filtered_words)

    def number_to_words(self, text):
        words = self.tokenizer.tokenize(text)
        converted_words = []
        for word in words:
            # تحقق من أن الكلمة أرقام عادية فقط
            if word.isdecimal() and word.isascii():
                try:
                    num = int(word)
                    if num <= 999999999999999:
                        converted_word = self.inflect_engine.number_to_words(word)
                        converted_words.append(converted_word)
                    else:
                        converted_words.append("[Number Out of Range]")
                except (ValueError, inflect.NumOutOfRangeError):
                    converted_words.append("[Number Out of Range]")
            else:
                converted_words.append(word)
        return ' '.join(converted_words)

    def expand_contractions(self, text):
        return contractions.fix(text)

    def normalize_unicode(self, text):
        return unicodedata.normalize("NFKD", text)

    def handle_negations(self, text):
        words = self.tokenizer.tokenize(text)
        negated_text = []
        negate = False
        for word in words:
            if word.lower() in ['not', "n't"]:
                negate = True
            elif negate:
                negated_text.append(f"NOT_{word}")
                negate = False
            else:
                negated_text.append(word)
        return ' '.join(negated_text)

    def remove_urls(self, text):
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def preprocess(self, text):
        if text is None:
            return text
        text = self.cleaned_text(text)
        text = self.normalization_example(text)
        text = self.stemming_example(text)
        text = self.lemmatization_example(text)
        text = self.remove_stopwords(text)
        text = self.number_to_words(text)
        text = self.expand_contractions(text)
        text = self.normalize_unicode(text)
        text = self.handle_negations(text)
        text = self.remove_urls(text)
        return text

processor = TextProcessor()

#---
def build_tfidf_in_memory(df):
    vectorizer = TfidfVectorizer(preprocessor=processor.preprocess, max_df=0.5, min_df=1)
    tfidf_matrix = vectorizer.fit_transform(df['text'])
    return vectorizer, tfidf_matrix


# خلية 3: جلب البيانات من MongoDB

def get_data_from_mongo(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection_name = dataset_path.replace("/", "_")
    collection = db[collection_name]

    pids = []
    texts = []
    cursor = collection.find({}, {"_id": 0, "doc_id": 1, "text": 1})
    for doc in cursor:
        if "doc_id" in doc and "text" in doc and isinstance(doc["text"], str):
            pids.append(str(doc["doc_id"]))
            texts.append(doc["text"])

    df = pd.DataFrame({"pid": pids, "text": texts})
    df.dropna(subset=['text'], inplace=True)
    return df

from functools import lru_cache
from sklearn.decomposition import TruncatedSVD  # إذا مش موجود ضمن ملف joblib، لكن ما رح نعمل import جديد هنا حسب طلبك

# كاش لتحميل المودل
@lru_cache(maxsize=1)
def load_bert_model():
    return SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# كاش عام للملفات الثقيلة
_loaded_cache = {}

def load_cached(path, loader=joblib.load):
    if path not in _loaded_cache:
        _loaded_cache[path] = loader(path)
    return _loaded_cache[path]


def search_in_hybrid(query, dataset_path, tfidf_vectorizer, svd,top_n=10, tfidf_weight=0.4, bert_weight=0.6):
    db_dir = os.path.join(r"C:\Users\USER\Desktop\IR_Final_Project\db", dataset_path.replace("/", "__"))
    docs_df = get_data_from_mongo(dataset_path)

    query_processed = processor.preprocess(query)

    tfidf_q = tfidf_vectorizer.transform([query_processed])
    tfidf_q_reduced = svd.transform(tfidf_q)

    model = load_bert_model()
    bert_q = model.encode([query_processed], normalize_embeddings=True)

    min_dim = min(tfidf_q_reduced.shape[1], bert_q.shape[1])
    tfidf_q_reduced_cut = tfidf_q_reduced[:, :min_dim]
    bert_q_cut = bert_q[:, :min_dim]

    hybrid_query = tfidf_weight * tfidf_q_reduced_cut + bert_weight * bert_q_cut
    hybrid_query = np.ascontiguousarray(hybrid_query.astype(np.float32))
    faiss.normalize_L2(hybrid_query)

    index = faiss.read_index(os.path.join(db_dir, "hybrid_faiss.index"))
    D, I = index.search(hybrid_query, top_n)

    results = []
    for score, idx in zip(D[0], I[0]):
        results.append({
            "doc_id": docs_df.iloc[idx]["pid"],
            "score": float(score),
            "text": docs_df.iloc[idx]["text"]
        })

    return {
        "query": query,
        "top_documents": results,
        "cosine_similarities": D[0].tolist(),
        "top_documents_indices": I[0].tolist()
    }




# خلية 6: دوال التقييم

all_precisions = []
all_recalls = []
all_map_scores = []
all_mrrs = []

def calculate_precision_recall(relevantOrNot, retrievedDocument, threshold=0.5):
    binaryResult = (retrievedDocument >= threshold).astype(int)
    precision = precision_score(relevantOrNot, binaryResult, average='micro')
    recall = recall_score(relevantOrNot, binaryResult, average='micro')
    return precision, recall

def calculate_map_score(relevantOrNot, retrievedDocument):
    return average_precision_score(relevantOrNot, retrievedDocument, average='micro')

def calculate_mrr(y_true):
    rank_position = np.where(y_true == 1)[0]
    if len(rank_position) == 0:
        return 0
    else:
        return 1 / (rank_position[0] + 1)

def load_queries(queries_paths):
    queries = []
    for file_path in queries_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    query = json.loads(line.strip())
                    if 'query' in query:
                        queries.append(query)
                except json.JSONDecodeError:
                    print(f"Skipping invalid line in {file_path}: {line}")
    return queries

#---

# خلية 7: تقييم بحث TF-IDF (يرجى تعديل search_function حسب حاجتك)

def evaluate_search(dataset_path, search_function):
    import time
    start_time = time.time()
    
    df = get_data_from_mongo(dataset_path)
    
    queries_paths = ''
    if dataset_path == 'lotte/lifestyle/dev/forum':
        queries_paths = r'C:\Users\USER\.ir_datasets\lotte\lotte_extracted\lotte\lifestyle\dev\qas.search.jsonl'
    elif dataset_path == 'antique/train':
        queries_paths = r'C:\Users\USER\.ir_datasets\antique\test\Answers.jsonl'
    else:
        print("Warning: No queries path configured for this dataset.")
        return
    
    queries = load_queries([queries_paths])

    for query in queries:
        if 'query' not in query:
            continue
        
        # استدعاء دالة البحث مع الباراميترات المناسبة
        response_json = search_function(query['query'], top_n=10)
        
        top_documents = response_json["top_documents"]
        cosine_similarities = np.array(response_json["cosine_similarities"])
        top_documents_indices = response_json["top_documents_indices"]

        relevance = np.zeros(len(df))

        for pid in query.get('answer_pids', []):
            pid_str = str(pid)
            indices = np.where(df['pid'] == pid_str)[0]
            relevance[indices] = 1

        retrievedDocument = cosine_similarities
        relevantOrNot = relevance[top_documents_indices]

        if relevantOrNot.sum() == 0:
            continue

        precision, recall = calculate_precision_recall(relevantOrNot, retrievedDocument)
        all_precisions.append(precision)
        all_recalls.append(recall)

        map_score = calculate_map_score(relevantOrNot, retrievedDocument)
        all_map_scores.append(map_score)

        mrr = calculate_mrr(relevantOrNot)
        all_mrrs.append(mrr)

    if len(all_precisions) == 0:
        print("⚠️ No valid queries evaluated. Check PIDs matching and dataset content.")
        return

    avg_precision = np.mean(all_precisions)
    avg_recall = np.mean(all_recalls)
    avg_map_score = np.mean(all_map_scores)
    avg_mrr = np.mean(all_mrrs)

    elapsed_time = time.time() - start_time

    print(f"Evaluation results for dataset: {dataset_path}")
    print(f"Execution Time (seconds): {elapsed_time:.2f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average MAP Score: {avg_map_score:.4f}")
    print(f"Average MRR: {avg_mrr:.4f}")

#---

# خلية 8: مثال كامل للاستخدام (تعديل حسب حاجتك)

dataset_path = "lotte/lifestyle/dev/forum"  
df = get_data_from_mongo(dataset_path)
tfidf_vectorizer, tfidf_matrix = build_tfidf_in_memory(df)
svd = TruncatedSVD(n_components=300)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)
def search_function(query, top_n=10):
    return search_in_hybrid(query, dataset_path, tfidf_vectorizer, svd, top_n=top_n, tfidf_weight=0.4, bert_weight=0.6)

# تشغيل التقييم
evaluate_search(dataset_path, search_function)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluation results for dataset: lotte/lifestyle/dev/forum
Execution Time (seconds): 1247.31
Average Precision: 0.4807
Average Recall: 0.4807
Average MAP Score: 0.5537
Average MRR: 0.5892


In [5]:
# خلية 1: استيراد المكتبات اللازمة

import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import inflect
import re
from bs4 import BeautifulSoup
import unicodedata
import contractions
import json
import os
import joblib
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder
from collections import defaultdict

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

#--- دوال المونجو والكاش

def get_mongo_connection():
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    return db

stop_words = set(stopwords.words('english'))

def bm25_processed_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

def get_data_from_mongo(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection_name = dataset_path.replace("/", "_")
    collection = db[collection_name]

    pids = []
    texts = []
    cursor = collection.find({}, {"_id": 0, "doc_id": 1, "text": 1})
    for doc in cursor:
        if "doc_id" in doc and "text" in doc and isinstance(doc["text"], str):
            pids.append(str(doc["doc_id"]))
            texts.append(doc["text"])

    df = pd.DataFrame({"pid": pids, "text": texts})
    df.dropna(subset=['text'], inplace=True)
    return df

from functools import lru_cache

@lru_cache(maxsize=1)
def load_bert_model():
    return SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

_loaded_cache = {}
def load_cached(path, loader=joblib.load):
    if path not in _loaded_cache:
        _loaded_cache[path] = loader(path)
    return _loaded_cache[path]

bm25_cache = {}

def get_bm25_components(dataset_path):
    safe_name = os.path.join(r"C:\Users\USER\Desktop\IR_Final_Project\db", dataset_path.replace("/", "__"))
    if safe_name not in bm25_cache:
        base_path = os.path.join("db", safe_name)
        bm25_model = joblib.load(os.path.join(base_path, "bm25_model.joblib"))
        doc_ids = joblib.load(os.path.join(base_path, "doc_ids.joblib"))
        tokenized_texts = joblib.load(os.path.join(base_path, "all_tokenized_texts.joblib"))
        bm25_cache[safe_name] = (bm25_model, doc_ids, tokenized_texts)
    return bm25_cache[safe_name]

def load_documents_by_ids(dataset_path: str, doc_ids):
    db = get_mongo_connection()
    collection = db[dataset_path.replace("/", "_")]
    cursor = collection.find({"doc_id": {"$in": list(doc_ids)}}, {"_id": 0, "doc_id": 1, "text": 1})
    return {doc["doc_id"]: doc["text"] for doc in cursor if "text" in doc}

cross_encoder = None
def get_cross_encoder():
    global cross_encoder
    if cross_encoder is None:
        cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    return cross_encoder

#--- كاش في الذاكرة للـ BM25 weighted inverted index

bm25_weighted_index_cache = {}

def create_bm25_weighted_inverted_index_in_memory(dataset_path):
    db = get_mongo_connection()
    collection = db[dataset_path.replace("/", "_")]

    inverted_index = defaultdict(list)
    total_docs = 0

    print(f"🔄 Creating BM25-style weighted inverted index in memory for dataset: {dataset_path}")

    cursor = collection.find({}, {"_id": 0, "doc_id": 1, "text": 1})

    for doc in cursor:
        if "doc_id" in doc and "text" in doc:
            total_docs += 1
            doc_id = doc["doc_id"]
            tokens = bm25_processed_text(doc["text"])
            token_freq = defaultdict(int)
            for token in tokens:
                token_freq[token] += 1
            for token, freq in token_freq.items():
                inverted_index[token].append({
                    "doc_id": doc_id,
                    "weight": freq
                })

    bm25_weighted_index_cache[dataset_path] = dict(inverted_index)

    print(f"✅ BM25-style weighted inverted index created and stored in cache for dataset: {dataset_path}")
    return {
        "status": "BM25 weighted inverted index created in memory",
        "terms_count": len(inverted_index),
        "documents_indexed": total_docs
    }

def get_weighted_index(dataset_path):
    if dataset_path not in bm25_weighted_index_cache:
        create_bm25_weighted_inverted_index_in_memory(dataset_path)
    return bm25_weighted_index_cache[dataset_path]

def expand_query(tokens):
    expanded = set(tokens)
    for token in tokens:
        for syn in wordnet.synsets(token):
            for lemma in syn.lemmas():
                expanded.add(lemma.name().replace("_", " "))
    return list(expanded)

def search_in_bm25(query, dataset_path, top_k=10, initial_k=30):

    bm25_model, doc_ids, tokenized_texts = get_bm25_components(dataset_path)

    query_tokens = bm25_processed_text(query)
    if not query_tokens:
        return {"top_documents": [], "cosine_similarities": [], "top_documents_indices": []}

    expanded_query = expand_query(query_tokens)

    weighted_index = get_weighted_index(dataset_path)

    candidate_doc_ids = set()
    for term in expanded_query:
        if term in weighted_index:
            for entry in weighted_index[term]:
                if isinstance(entry, dict):
                    candidate_doc_ids.add(entry["doc_id"])
                else:
                    candidate_doc_ids.add(entry)

    if not candidate_doc_ids:
        return {"top_documents": [], "cosine_similarities": [], "top_documents_indices": []}

    doc_id_to_index = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
    candidate_indices = [doc_id_to_index[doc_id] for doc_id in candidate_doc_ids if doc_id in doc_id_to_index]

    if not candidate_indices:
        return {"top_documents": [], "cosine_similarities": [], "top_documents_indices": []}

    scores = bm25_model.get_scores(expanded_query)
    candidate_scores = [(i, scores[i]) for i in candidate_indices]
    candidate_scores.sort(key=lambda x: x[1], reverse=True)

    top_candidates = candidate_scores[:initial_k]
    top_indices = [i for i, _ in top_candidates]
    top_doc_ids = [doc_ids[i] for i in top_indices]
    doc_texts = load_documents_by_ids(dataset_path, top_doc_ids)

    cross_encoder = get_cross_encoder()
    cross_inputs = [(query, doc_texts[doc_ids[i]]) for i in top_indices]
    rerank_scores = cross_encoder.predict(cross_inputs, batch_size=16)

    reranked = sorted(zip(top_doc_ids, rerank_scores, top_indices), key=lambda x: x[1], reverse=True)[:top_k]

    top_documents = []
    cosine_similarities = []
    top_documents_indices = []

    for doc_id, score, idx in reranked:
        top_documents.append({
            "doc_id": doc_id,
            "score": float(score),
            "text": doc_texts[doc_id]
        })
        cosine_similarities.append(float(score))
        top_documents_indices.append(idx)

    return {
        "top_documents": top_documents,
        "cosine_similarities": cosine_similarities,
        "top_documents_indices": top_documents_indices
    }

#--- دوال التقييم

all_precisions = []
all_recalls = []
all_map_scores = []
all_mrrs = []

def calculate_precision_recall(relevantOrNot, retrievedDocument, threshold=0.5):
    binaryResult = (retrievedDocument >= threshold).astype(int)
    precision = precision_score(relevantOrNot, binaryResult, average='micro')
    recall = recall_score(relevantOrNot, binaryResult, average='micro')
    return precision, recall

def calculate_map_score(relevantOrNot, retrievedDocument):
    return average_precision_score(relevantOrNot, retrievedDocument, average='micro')

def calculate_mrr(y_true):
    rank_position = np.where(y_true == 1)[0]
    if len(rank_position) == 0:
        return 0
    else:
        return 1 / (rank_position[0] + 1)

def load_queries(queries_paths):
    queries = []
    for file_path in queries_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    query = json.loads(line.strip())
                    if 'query' in query:
                        queries.append(query)
                except json.JSONDecodeError:
                    print(f"Skipping invalid line in {file_path}: {line}")
    return queries

#--- مثال كامل للاستخدام

dataset_path = "lotte/lifestyle/dev/forum"

def search_function(query, top_n=10):
    return search_in_bm25(query, dataset_path, top_k=top_n, initial_k=30)

evaluate_search(dataset_path, search_function)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔄 Creating BM25-style weighted inverted index in memory for dataset: lotte/lifestyle/dev/forum


  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


✅ BM25-style weighted inverted index created and stored in cache for dataset: lotte/lifestyle/dev/forum


  text = BeautifulSoup(text, "html.parser").get_text()


Evaluation results for dataset: lotte/lifestyle/dev/forum
Execution Time (seconds): 9489.99
Average Precision: 0.5324
Average Recall: 0.5324
Average MAP Score: 0.7502
Average MRR: 0.7833


In [2]:
_loaded_cache.clear()
load_bert_model.cache_clear()


In [5]:
# خلية 1: استيراد المكتبات اللازمة
# ----------------------------------------------------------
import pandas as pd
import numpy as np
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, average_precision_score
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
import inflect
import re
from bs4 import BeautifulSoup   # موجودة أصلاً إن احتجتها لاحقاً
import unicodedata
import contractions
import json
import os
import joblib
from sentence_transformers import SentenceTransformer, CrossEncoder

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# ----------------------------------------------------------


# خلية 2: كلاس TextProcessor
# ----------------------------------------------------------
class TextProcessor:
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.inflect_engine = inflect.engine()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def cleaned_text(self, text):
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def normalization_example(self, text):
        return text.lower()

    def stemming_example(self, text):
        words = self.tokenizer.tokenize(text)
        return ' '.join(self.stemmer.stem(w) for w in words)

    def lemmatization_example(self, text):
        words = self.tokenizer.tokenize(text)
        return ' '.join(self.lemmatizer.lemmatize(w) for w in words)

    def remove_stopwords(self, text):
        words = self.tokenizer.tokenize(text)
        return ' '.join(w for w in words if w.lower() not in self.stop_words)

    def number_to_words(self, text):
        words = self.tokenizer.tokenize(text)
        converted = []
        for w in words:
            if w.isdecimal() and w.isascii():
                try:
                    num = int(w)
                    if num <= 999_999_999_999_999:
                        converted.append(self.inflect_engine.number_to_words(w))
                    else:
                        converted.append('[Number Out of Range]')
                except (ValueError, inflect.NumOutOfRangeError):
                    converted.append('[Number Out of Range]')
            else:
                converted.append(w)
        return ' '.join(converted)

    def expand_contractions(self, text):
        return contractions.fix(text)

    def normalize_unicode(self, text):
        return unicodedata.normalize("NFKD", text)

    def handle_negations(self, text):
        words, out, negate = self.tokenizer.tokenize(text), [], False
        for w in words:
            if w.lower() in ['not', "n't"]:
                negate = True
            elif negate:
                out.append(f'NOT_{w}')
                negate = False
            else:
                out.append(w)
        return ' '.join(out)

    def remove_urls(self, text):
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def preprocess(self, text):
        if text is None:
            return text
        text = self.cleaned_text(text)
        text = self.normalization_example(text)
        text = self.stemming_example(text)
        text = self.lemmatization_example(text)
        text = self.remove_stopwords(text)
        text = self.number_to_words(text)
        text = self.expand_contractions(text)
        text = self.normalize_unicode(text)
        text = self.handle_negations(text)
        text = self.remove_urls(text)
        return text

processor = TextProcessor()
# ----------------------------------------------------------


# خلية 3: جلب البيانات من MongoDB
# ----------------------------------------------------------
def get_data_from_mongo(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection = db[dataset_path.replace("/", "_")]

    pids, texts = [], []
    for doc in collection.find({}, {"_id": 0, "doc_id": 1, "text": 1}):
        if "doc_id" in doc and "text" in doc and isinstance(doc["text"], str):
            pids.append(str(doc["doc_id"]))
            texts.append(doc["text"])

    df = pd.DataFrame({"pid": pids, "text": texts})
    df.dropna(subset=['text'], inplace=True)
    return df
# ----------------------------------------------------------


# خلية 4: تحميل النماذج وتعريف وظائف مساعدة
# ----------------------------------------------------------
retrieval_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
cross_encoder   = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
processor       = TextProcessor()

def load_documents_map(dataset_path):
    client = MongoClient("mongodb://localhost:27017")
    db = client["information_retrieval"]
    collection = db[dataset_path.replace("/", "_")]
    doc_map = {}
    for d in collection.find({}, {"_id": 0, "doc_id": 1, "text": 1}):
        if "doc_id" in d and "text" in d:
            doc_map[str(d["doc_id"])] = d["text"]
    return doc_map
# ----------------------------------------------------------


# خلية 5: search_in_bert (جديدة بدون FAISS)
# ----------------------------------------------------------
# كاش محلي للإيمبدنغز والـ doc_ids
embedding_cache = {}

def load_embeddings_and_doc_ids(dataset_path):
    if dataset_path in embedding_cache:
        return embedding_cache[dataset_path]

    db_dir = os.path.join(r"C:\Users\USER\Desktop\IR_Final_Project\db",
                          dataset_path.replace("/", "__"))
    embeddings = joblib.load(os.path.join(db_dir, "bert_embeddings.joblib"))
    doc_ids    = joblib.load(os.path.join(db_dir, "bert_doc_ids.joblib"))

    embedding_cache[dataset_path] = (embeddings, doc_ids)
    return embeddings, doc_ids


def search_in_bert(query, dataset_path, top_k=50, rerank_k=10):
    embeddings, doc_ids = load_embeddings_and_doc_ids(dataset_path)
    doc_map             = load_documents_map(dataset_path)
    doc_id_to_index     = {str(d): i for i, d in enumerate(doc_ids)}

    query_vec = retrieval_model.encode(
        processor.preprocess(query),
        normalize_embeddings=True
    ).reshape(1, -1)

    similarities = cosine_similarity(query_vec, embeddings)[0]        # (N,)
    top_idx      = np.argsort(similarities)[::-1][:top_k]
    top_doc_ids  = [doc_ids[i] for i in top_idx]
    top_docs     = [(str(d), doc_map.get(str(d), "")) for d in top_doc_ids]

    filtered_docs = [(d, t) for d, t in top_docs if t.strip()]
    pairs         = [(query, t) for _, t in filtered_docs]
    rerank_scores = cross_encoder.predict(pairs)

    ranked   = sorted(zip(filtered_docs, rerank_scores),
                      key=lambda x: x[1],
                      reverse=True)[:rerank_k]

    top_documents, cos_sims, top_doc_indices = [], [], []
    for (d, t), s in ranked:
        top_documents.append({"doc_id": d, "score": float(s), "text": t})
        cos_sims.append(float(s))
        top_doc_indices.append(doc_id_to_index.get(d, -1))

    return {
        "top_documents":         top_documents,
        "cosine_similarities":   cos_sims,
        "top_documents_indices": top_doc_indices
    }
# ----------------------------------------------------------


# خلية 6: دوال التقييم
# ----------------------------------------------------------
all_precisions, all_recalls, all_map_scores, all_mrrs = [], [], [], []

def calculate_precision_recall(y_true, y_score, thresh=0.5):
    bin_res = (y_score >= thresh).astype(int)
    return (precision_score(y_true, bin_res, average='micro'),
            recall_score(y_true, bin_res, average='micro'))

def calculate_map_score(y_true, y_score):
    return average_precision_score(y_true, y_score, average='micro')

def calculate_mrr(y_true):
    pos = np.where(y_true == 1)[0]
    return 0 if len(pos) == 0 else 1 / (pos[0] + 1)

def load_queries(paths):
    out = []
    for p in paths:
        with open(p, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    q = json.loads(line.strip())
                    if 'query' in q: out.append(q)
                except json.JSONDecodeError:
                    print(f"Skipping invalid line in {p}: {line}")
    return out
# ----------------------------------------------------------


# خلية 7: evaluate_search
# ----------------------------------------------------------
def evaluate_search(dataset_path, search_function):
    import time
    start = time.time()

    df = get_data_from_mongo(dataset_path)

    if dataset_path == 'lotte/lifestyle/dev/forum':
        q_path = r'C:\Users\USER\.ir_datasets\lotte\lotte_extracted\lotte\lifestyle\dev\qas.search.jsonl'
    elif dataset_path == 'antique/train':
        q_path = r'C:\Users\USER\.ir_datasets\antique\test\Answers.jsonl'
    else:
        print('⚠️ لم يُضبط مسار الاستعلامات لهذا الـ dataset.')
        return

    queries = load_queries([q_path])

    for q in queries:
        if 'query' not in q: continue

        res  = search_function(q['query'], top_n=10)
        sims = np.array(res['cosine_similarities'])
        idxs = res['top_documents_indices']
        rel  = np.zeros(len(df))
        for pid in q.get('answer_pids', []):
            rel[np.where(df['pid'] == str(pid))[0]] = 1

        y_true = rel[idxs]
        if y_true.sum() == 0: continue

        p, r = calculate_precision_recall(y_true, sims)
        all_precisions.append(p)
        all_recalls.append(r)
        all_map_scores.append(calculate_map_score(y_true, sims))
        all_mrrs.append(calculate_mrr(y_true))

    if not all_precisions:
        print('⚠️ No valid queries evaluated.')
        return

    print(f'نتائج التقييم لـ {dataset_path}')
    print(f'Execution Time: {time.time() - start:.2f} s')
    print(f'Avg Precision  : {np.mean(all_precisions):.4f}')
    print(f'Avg Recall     : {np.mean(all_recalls):.4f}')
    print(f'Avg MAP        : {np.mean(all_map_scores):.4f}')
    print(f'Avg MRR        : {np.mean(all_mrrs):.4f}')
# ----------------------------------------------------------


# خلية 8: مثال تشغيل
# ----------------------------------------------------------
dataset_path = 'lotte/lifestyle/dev/forum'

def search_function(query, top_n=10):
    return search_in_bert(query, dataset_path, top_k=50, rerank_k=top_n)

# لتشغيل التقييم:
evaluate_search(dataset_path, search_function)
# ----------------------------------------------------------


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


نتائج التقييم لـ lotte/lifestyle/dev/forum
Execution Time: 9178.94 s
Avg Precision  : 0.4691
Avg Recall     : 0.4691
Avg MAP        : 0.7245
Avg MRR        : 0.7826


<span style="color:red;">Note!</span>
<span style="color:yellow;">cell evaluation ordering</span>

<ol style="line-height:1.4">
  <li>tfidf</li>
  <li>bert with vector store</li>
  <li>hybrid</li>
  <li>bm25</li>
  <li>bert</li>
</ol>
 