In [4]:
pip install ir_datasets



In [5]:
import ir_datasets
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from collections import defaultdict
import re
from nltk.stem import WordNetLemmatizer
import csv
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# load data


In [6]:
def loadData(dataset):
    documents = [{'doc_id': doc.doc_id, 'text': doc.text} for doc in dataset.docs_iter() if doc.text.strip() ]
    qrels = defaultdict(list, {qrel.query_id: [(qrel.doc_id, qrel.relevance)] for qrel in dataset.qrels_iter()})
    queries = {query.query_id: query.text for query in dataset.queries_iter() if query.text.strip() }
    return qrels,queries,documents


In [7]:
# load data
dataset1 = ir_datasets.load('antique/test/non-offensive')
dataset2 = ir_datasets.load('beir/quora/test')

In [8]:
qrels1,queries1,documents1 =loadData(dataset1)
qrels2,queries2,documents2 =loadData(dataset2)


In [9]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# preprocess data


In [10]:
def remove_numbers_and_dates(text):
        if not text:
           return ''
        if not isinstance(text, str):
           return ''
        number_pattern = r'\d+'
        date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'
        symbol_pattern = r'[^\w\s]'
        text = re.sub(number_pattern, '', text)
        text = re.sub(date_pattern, '', text)
        text = re.sub(symbol_pattern, '', text)

        return text

In [11]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # remove numbers and dates
    text = remove_numbers_and_dates(text)


    # Tokenize the text into individual words
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a single string
    processed_text = " ".join(tokens)

    return processed_text

In [12]:
import os
import pandas as pd

def preprocessData(documents, queries, folder_name):
    processed_docs_file = os.path.join(folder_name, 'processed_documents.csv')
    processed_queries_file = os.path.join(folder_name, 'processed_queries.csv')

    # Check if the processed files already exist
    if os.path.exists(processed_docs_file) and os.path.exists(processed_queries_file):
        # Load the processed data from the CSV files
        processed_docs = pd.read_csv(processed_docs_file, header=None).values.tolist()
        processed_queries = pd.read_csv(processed_queries_file, header=None).values.tolist()
    else:
        # Create the folder if it doesn't exist
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)

        # Process the documents and queries
        processed_docs = [preprocess_text(doc['text']) for doc in documents]
        processed_queries = [preprocess_text(query) for query in queries.values()]

        # Save the processed data to CSV files
        doc_df = pd.DataFrame(processed_docs)
        doc_df.to_csv(processed_docs_file, index=False, header=False)

        query_df = pd.DataFrame(processed_queries)
        query_df.to_csv(processed_queries_file, index=False, header=False)

    return processed_docs, processed_queries

In [13]:
processed_docs1,processed_queries1 = preprocessData(documents1,queries1,'folder_data1')
processed_docs2,processed_queries2 = preprocessData(documents2,queries2,'folder_data2')


# vectorizer Data

In [14]:
import os
from scipy.sparse import save_npz, load_npz
import pickle

def vectorizerData(processed_docs, processed_queries, folder_name):
    vectorizer = TfidfVectorizer()
    # Define file paths
    doc_vectors_file = os.path.join(folder_name, 'doc_vectors.npz')
    query_vectors_file = os.path.join(folder_name, 'query_vectors.npz')
    vectorizer_file = os.path.join(folder_name, 'vectorizer.pkl')

    # Check if the files already exist
    if os.path.exists(doc_vectors_file) and os.path.exists(query_vectors_file) and os.path.exists(vectorizer_file):
        # Load existing vectors and vectorizer
        doc_vectors = load_npz(doc_vectors_file)
        query_vectors = load_npz(query_vectors_file)
        with open(vectorizer_file, 'rb') as f:
            vectorizer = pickle.load(f)
    else:
        # Transform documents and queries
        doc_vectors = vectorizer.fit_transform(processed_docs)
        query_vectors = vectorizer.transform(processed_queries)

        # Save vectors and vectorizer
        save_npz(doc_vectors_file, doc_vectors)
        save_npz(query_vectors_file, query_vectors)
        with open(vectorizer_file, 'wb') as f:
            pickle.dump(vectorizer, f)

    return doc_vectors, query_vectors, vectorizer


## TFIDF vectorizer

In [15]:
doc_vectors1,query_vectors1,vectorizer1 = vectorizerData(processed_docs1,processed_queries1,'folder_data1')
doc_vectors2,query_vectors2,vectorizer2 = vectorizerData(processed_docs2,processed_queries2,'folder_data2')



In [16]:

# load tfdif from file and calculate cosine similarities
def match_document_query(query_tfidf_vector,tfidf_matrix):
        cosine_similarities = cosine_similarity(
            query_tfidf_vector,
            tfidf_matrix
        ).flatten()
        doc_indices_sorted = np.argsort(cosine_similarities)[::-1]
        doc_indices_sorted = doc_indices_sorted[cosine_similarities[doc_indices_sorted] > 0]
        return doc_indices_sorted


#  Match Rank Documents

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(query_vectors,doc_vectors):
        return cosine_similarity(query_vectors, doc_vectors)

def rank_documents(similarity_scores,queries,documents):
        query_results = {}
        for qid, scores in zip(queries.keys(), similarity_scores):
            sorted_docs = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
            query_results[qid] = [documents[idx]['doc_id'] for idx, _ in sorted_docs]
        return query_results

# Evaluation

In [18]:
def average_precision(retrieved_docs, relevant_docs):
        relevant_set = set(relevant_docs)
        hits = 0
        sum_precisions = 0
        for i, doc_id in enumerate(retrieved_docs):
            if doc_id in relevant_set:
                hits += 1
                sum_precisions += hits / (i + 1)
        return sum_precisions / len(relevant_docs) if relevant_docs else 0

def reciprocal_rank(retrieved_docs, relevant_docs):
        relevant_set = set(relevant_docs)
        for i, doc_id in enumerate(retrieved_docs):
            if doc_id in relevant_set:
                return 1 / (i + 1)
        return 0

def precision_at_k(retrieved_docs, relevant_docs, k=10):
        relevant_set = set(relevant_docs)
        retrieved_set = set(retrieved_docs[:k])
        true_positives = len(relevant_set & retrieved_set)
        return true_positives / k

def recall(retrieved_docs, relevant_docs):
        relevant_set = set(relevant_docs)
        retrieved_set = set(retrieved_docs)
        true_positives = len(relevant_set & retrieved_set)
        return true_positives / len(relevant_set) if relevant_set else 0



def calculate_metrics(qrels,queries,documents,query_vectors,doc_vectors):
        similarity_scores = compute_similarity(query_vectors,doc_vectors)
        query_results = rank_documents(similarity_scores,queries,documents)

        map_scores = []
        mrr_scores = []
        recall_scores = []
        precision_at_10_scores = []

        for qid, rel_docs in qrels.items():
            relevant_docs = [doc_id for doc_id, score in rel_docs if score > 0]
            retrieved_docs = query_results.get(qid, [])

            ap = average_precision(retrieved_docs, relevant_docs)
            rr = reciprocal_rank(retrieved_docs, relevant_docs)
            rec = recall(retrieved_docs, relevant_docs)
            prec_10 = precision_at_k(retrieved_docs, relevant_docs)

            map_scores.append(ap)
            mrr_scores.append(rr)
            recall_scores.append(rec)
            precision_at_10_scores.append(prec_10)

        return {
            'MAP': (sum(map_scores) / len(map_scores))*100  if map_scores else 0,
            'MRR': (sum(mrr_scores) / len(mrr_scores))*100 if mrr_scores else 0,
            'Average Recall': (sum(recall_scores) / len(recall_scores))*100 if recall_scores else 0,
            'Precision@10': (sum(precision_at_10_scores) / len(precision_at_10_scores))*100 if precision_at_10_scores else 0,
        }


In [19]:
calculate_metrics(qrels1,queries1,documents1,query_vectors1,doc_vectors1)

{'MAP': 2.6760579671677447,
 'MRR': 2.6760579671677447,
 'Average Recall': 100.0,
 'Precision@10': 0.6818181818181818}

In [None]:
calculate_metrics(qrels2,queries2,documents2,query_vectors2,doc_vectors2)

TypeError: sparse array length is ambiguous; use getnnz() or shape[0]

# Optimizer

## Clustering

In [20]:
def process_query(query, vectorizer):
    processed_query = preprocess_text(query)
    query_vector = vectorizer.transform([processed_query])
    return processed_query,query_vector


In [21]:
processed_query,query_vector = process_query("hi man",vectorizer1)


In [22]:
from sklearn.cluster import KMeans
import numpy as np

def perform_clustering(doc_vectors, n_clusters, folder_name):
    # Define file paths
    kmeans_file = os.path.join(folder_name, 'kmeans_model.pkl')

    # Check if the model already exists
    if os.path.exists(kmeans_file):
        # Load existing model
        with open(kmeans_file, 'rb') as f:
            kmeans = pickle.load(f)
    else:
        # Perform clustering
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(doc_vectors)
        # Save the model
        with open(kmeans_file, 'wb') as f:
            pickle.dump(kmeans, f)

    return kmeans

# Specify the number of clusters
n_clusters = 2
kmeans1 = perform_clustering(doc_vectors1, n_clusters, 'folder_data1')
kmeans2 = perform_clustering(doc_vectors2, n_clusters, 'folder_data2')


In [20]:

 # load tfdif from file and clustering file and calculate cosine similarities
def match_document_query_with_clustering(doc_vectors1temp,query_tfidf_vector,kmeans):
        tfidf_matrix = doc_vectors1temp
        label = kmeans.predict(query_tfidf_vector)[0]
        cluster_indices = [i for i, l in enumerate(kmeans.labels_) if l == label]
        similarity_scores = cosine_similarity(query_tfidf_vector, tfidf_matrix[cluster_indices]).flatten()
        doc_indices_sorted = np.argsort(similarity_scores)[::-1]
        doc_indices_sorted = doc_indices_sorted[similarity_scores[doc_indices_sorted] > 0]
        return doc_indices_sorted


In [21]:
query_results = match_document_query_with_clustering(doc_vectors1,query_vector, kmeans1)


In [50]:
query_results

array([ 27188, 180284, 268909, ...,  46780, 130171, 232023])

In [22]:
def calculate_metrics_with_clustering(qrels, queries, documents, query_vectors, doc_vectors, kmeans):
    similarity_scores = []  # This will hold similarity scores for each query
    query_results = {}  # This will hold the final ranked document IDs for each query

    for qid, query_vector in zip(queries.keys(), query_vectors):
        # Find cluster indices for the current query vector
        doc_indices_sorted = match_document_query_with_clustering(doc_vectors, query_vector, kmeans)

        # Extract the document IDs based on sorted indices
        ranked_docs = [documents[idx]['doc_id'] for idx in doc_indices_sorted]

        query_results[qid] = ranked_docs
        # We store similarity scores if needed for further processing (not used in this example)
        # similarity_scores.append(cosine_similarity(query_vector, doc_vectors[doc_indices_sorted]).flatten())

    # Now calculate the evaluation metrics
    map_scores = []
    mrr_scores = []
    recall_scores = []
    precision_at_10_scores = []

    for qid, rel_docs in qrels.items():
        relevant_docs = [doc_id for doc_id, score in rel_docs if score > 0]
        retrieved_docs = query_results.get(qid, [])

        ap = average_precision(retrieved_docs, relevant_docs)
        rr = reciprocal_rank(retrieved_docs, relevant_docs)
        rec = recall(retrieved_docs, relevant_docs)
        prec_10 = precision_at_k(retrieved_docs, relevant_docs)

        map_scores.append(ap)
        mrr_scores.append(rr)
        recall_scores.append(rec)
        precision_at_10_scores.append(prec_10)

    return {
        'MAP': (sum(map_scores) / len(map_scores)) * 100 if map_scores else 0,
        'MRR': (sum(mrr_scores) / len(mrr_scores)) * 100 if mrr_scores else 0,
        'Average Recall': (sum(recall_scores) / len(recall_scores)) * 100 if recall_scores else 0,
        'Precision@10': (sum(precision_at_10_scores) / len(precision_at_10_scores)) * 100 if precision_at_10_scores else 0,
    }


## *kmeans* *5*

In [52]:
calculate_metrics_with_clustering(qrels1, queries1, documents1, query_vectors1, doc_vectors1, kmeans1)

{'MAP': 0.0002995300616742081,
 'MRR': 0.0002995300616742081,
 'Average Recall': 2.272727272727273,
 'Precision@10': 0.0}

## *kmeans* *3*

In [54]:
calculate_metrics_with_clustering(qrels1, queries1, documents1, query_vectors1, doc_vectors1, kmeans1)

{'MAP': 0.001150058140634537,
 'MRR': 0.001150058140634537,
 'Average Recall': 3.977272727272727,
 'Precision@10': 0.0}

## *kmeans* *2*

In [21]:
calculate_metrics_with_clustering(qrels1, queries1, documents1, query_vectors1, doc_vectors1, kmeans1)

{'MAP': 0.000884231204417897,
 'MRR': 0.000884231204417897,
 'Average Recall': 4.545454545454546,
 'Precision@10': 0.0}

# vector Store

In [23]:
pip install faiss-cpu  # For CPU




In [23]:
from sklearn.decomposition import TruncatedSVD

def reduce_dimensions(doc_vectors, n_components=100):
    svd = TruncatedSVD(n_components=n_components)
    reduced_vectors = svd.fit_transform(doc_vectors)
    return reduced_vectors.astype(np.float32)


In [24]:
import faiss

def build_faiss_index(reduced_vectors, dimension, nlist=100):
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
    index.train(reduced_vectors)
    index.add(reduced_vectors)
    return index


In [25]:
# مثال على تحميل بيانات
from scipy.sparse import csr_matrix

# تحويل مصفوفة المستندات إلى CSR وتقليل الأبعاد
reduced_vectors = reduce_dimensions(doc_vectors1)

# بناء المؤشر
dimension = 100  # الأبعاد بعد التقليل
faiss_index = build_faiss_index(reduced_vectors, dimension)

In [26]:
def search_with_faiss(query_vector, index, k=10):
    # Assure query vector is in the correct shape and dtype
    query_vector = query_vector.reshape(1, -1).astype(np.float32)
    distances, indices = index.search(query_vector, k)
    return indices


In [53]:
# استخدام الاستعلام للبحث في المؤشر
query_vector = reduced_vectors[0]  # مثال على استخدام أول متجه كاستعلام
matched_indices = search_with_faiss(query_vector, faiss_index).flatten()

print("Indices of matched documents:", matched_indices)
matched_indices

Indices of matched documents: [     0 286643 196005   1468 130607 381796 130609 130614 281751 375478]


array([     0, 286643, 196005,   1468, 130607, 381796, 130609, 130614,
       281751, 375478])

In [27]:
def search_with_faiss(query_vectors, index, k=10):
    # Assumes query_vectors is a scipy sparse matrix or numpy array
    distances, indices = index.search(query_vectors, k)
    return indices


In [41]:
def calculate_metrics_with_faiss(qrels, queries, index, query_vectors,documents):
    map_scores = []
    mrr_scores = []
    recall_scores = []
    precision_at_10_scores = []

    # Assume query_vectors is already a matrix where each row is a vectorized query
    # تحويل مصفوفة المستندات إلى CSR وتقليل الأبعاد
    query_vectors = reduce_dimensions(query_vectors)

    for qid, query_vector in zip(queries.keys(), query_vectors):
        # Search with FAISS - adjust k as necessary

        retrieved_indices = search_with_faiss(query_vector.reshape(1, -1), index, k=100)  # Make sure k is sufficient
        retrieved_docs = [documents[idx]['doc_id'] for idx in retrieved_indices.flatten()]  # documents should be a list where index corresponds to doc IDs

        # Extract relevant documents from qrels
        relevant_docs = [doc_id for doc_id, rel in qrels[qid] if rel > 0]

        # Calculate metrics for this query
        ap = average_precision(retrieved_docs, relevant_docs)
        rr = reciprocal_rank(retrieved_docs, relevant_docs)
        rec = recall(retrieved_docs, relevant_docs)
        prec_10 = precision_at_k(retrieved_docs, relevant_docs)

        map_scores.append(ap)
        mrr_scores.append(rr)
        recall_scores.append(rec)
        precision_at_10_scores.append(prec_10)

    # Compute final scores
    return {
        'MAP': sum(map_scores) / len(map_scores) * 100 if map_scores else 0,
        'MRR': sum(mrr_scores) / len(mrr_scores) * 100 if mrr_scores else 0,
        'Average Recall': sum(recall_scores) / len(recall_scores) * 100 if recall_scores else 0,
        'Precision@10': sum(precision_at_10_scores) / len(precision_at_10_scores) * 100 if precision_at_10_scores else 0,
    }


In [42]:
calculate_metrics_with_faiss(qrels1, queries1, faiss_index, query_vectors1,documents1)

{'MAP': 0.0, 'MRR': 0.0, 'Average Recall': 0.0, 'Precision@10': 0.0}