In [216]:
#| default_exp semantic_search_evaluator

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [217]:
#| export

import numpy as np
import pandas as pd
from typing import Dict, Set, List, Tuple
from transformers import AutoTokenizer, AutoModel
from enum import Enum
from torch import Tensor
import logging
import torch
import torch.nn.functional as F

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


In [229]:
base_dir = "../data"
questions_file = f"{base_dir}/evaluation_dataset/sample_queries_cleaned.jsonl"

In [230]:
df_queries = pd.read_json(questions_file, lines=True)

In [231]:
df_queries.head(n=3)

Unnamed: 0,text,id,query
0,ARTIOM DRUMEA Sent You a Message Behance Basic...,<010001809eb828f6-b2888a42-bd74-4f16-b366-cf0e...,"""message from ARTIOM DRUMEA on Behance"""
1,Fwd: CIIS Commencement 2021: Join us on May 1!...,<BY5PR22MB208353712894331A58D4D4C6E72F9@BY5PR2...,"""CIIS Commencement 2021 details"""
2,Your Digest: From the Mac Startup Tone to the ...,<esuh8huNSw2sRLJpnIw4HQ@ismtpd0012p1iad2.sendg...,"""OneZero digital digest"""


In [232]:
embeddings_index_path = f"{base_dir}/embeddings_distilbert_base_uncased_mean_pooling/embeddings_index.npy"
embeddings_index = np.load(embeddings_index_path, allow_pickle=True)

In [233]:
# contruct the ground truth dictionary from the dataframe

ground_truth = dict[int, Set[int]]()
for idx, row in df_queries.iterrows():
    email_id = row["id"]
    print(f"idx: {idx}, query: {row['query']}, id: {email_id}")
    # find in embeddings index it's index
    emb_index_set = set(np.where(embeddings_index == email_id)[0])
    ground_truth[idx] = emb_index_set

idx: 0, query: "message from ARTIOM DRUMEA on Behance", id: <010001809eb828f6-b2888a42-bd74-4f16-b366-cf0e2ff63842-000000@email.amazonses.com>
idx: 1, query: "CIIS Commencement 2021 details", id: <BY5PR22MB208353712894331A58D4D4C6E72F9@BY5PR22MB2083.namprd22.prod.outlook.com>
idx: 2, query: "OneZero digital digest", id: <esuh8huNSw2sRLJpnIw4HQ@ismtpd0012p1iad2.sendgrid.net>
idx: 3, query: "ASP.NET Core features email", id: <RL8EBdcFTSSO7q2dFdUCxw@ismtpd0023p1iad2.sendgrid.net>
idx: 4, query: "Outerbounds ML Platform improvement email from Will", id: <CAKmoF_9cZB-5TjiELdaSpKsAsQ2q2pgpisO_kb+7o=q+F-y81A@mail.gmail.com>
idx: 5, query: "scan srtnica mama", id: <20240122115635.1ab5b942d614a95a@sndmail.mail.io>
idx: 6, query: "Design Within Reach recommendation survey", id: <01000176860d2f7a-b6ea6de5-ab4d-40a5-b800-4bc46870cff0-000000@email.amazonses.com>
idx: 7, query: "Uber account deletion confirmation", id: <7948fea5-a34b-4993-a6ea-55742f2e4729@mail.uber.com>
idx: 8, query: "Pixlr Genesi

In [234]:
print(ground_truth[19])
# embeddings_index[2712]
print(np.where(embeddings_index == "<CPZP284MB058472D9136C52B8957CB8B192C69@CPZP284MB0584.BRAP284.PROD.OUTLOOK.COM>")[0])

{6391}
[1320 1321 1322]


In [238]:
#| export

class SimilarityFunction(Enum):
    COSINE = "cosine"
    DOT_PRODUCT = "dot"
    EUCLIDEAN = "euclidean"

class MailioInformationRetrievalEvaluator:
    """
    Insipired by on https://github.com/UKPLab/sentence-transformers/blob/v3.4-release/sentence_transformers/evaluation/InformationRetrievalEvaluator.py
    Gives me a bit more le-way to customize creation of query embeddings and corpus embeddings
    Also simplifies the code a bit

    Given a set of queries and a large corpus set. It will retrieve for each query the top-k most similar document. It measures
    Mean Reciprocal Rank (MRR), Recall@k, and Normalized Discounted Cumulative Gain (NDCG)
    """
    def __init__(self, 
        ground_truth:Dict[int, Set[int]],  # query_index => Set[corpus_index]
        corpus_embeddings: Tensor, # embeddingns of corpus_index
        query_embeddings: Tensor, # embeddings of query_index
        mrr_at_k: List[int] = [10],
        ndcg_at_k: List[int] = [10],
        accuracy_at_k: List[int] = [1, 3, 5, 10],
        precision_recall_at_k: List[int] = [1, 3, 5, 10],
        map_at_k: List[int] = [100],
        similarity_functions = [SimilarityFunction.COSINE],
        ) -> None:
        """
        Initializes the InformationRetrievalEvaluator.
        Args:
            corpus_embeddings (Tensor): A tensor of shape (N, D) containing the document embeddings.
            query_embeddings (Tensor): A tensor of shape (M, D) containing the query embeddings.
            ground_truth (Dict[str, Set[str]]): A dictionary mapping query index to a set of relevant document indexes.
            mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
            ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
            accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
            precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
            map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
        """
        self.corpus_embeddings = corpus_embeddings
        self.query_embeddings = query_embeddings
        self.ground_truth = ground_truth
        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k
        self.similarity_functions = similarity_functions
        self.problematic_queries = set()

    def run(self):
        return self.compute_metrices()

    def get_problematic_queries(self):
        return self.problematic_queries

    def compute_metrices(self):
        """
        Computes the evaluation metrics.
        Args:
            top_k (int): The number of retrieved documents for which to compute the evaluation metrics. Defaults to 10.
        Returns:
            Dict[str, Dict[str, float]]: A dictionary mapping metric names to dictionaries mapping metric values to scores.
        """
        max_k = max(
            max(self.mrr_at_k),
            max(self.ndcg_at_k),
            max(self.accuracy_at_k),
            max(self.precision_recall_at_k),
            max(self.map_at_k),
        )
        # prepare the query result list for each query and each score function
        self.queries_result_list = {}
        
        metrics = {}
        for sim_fn in self.similarity_functions:
            similarity_name = str(sim_fn.value)
            queries_results = self.compute_similarity_function_product(sim_fn, top_k=max_k)
            similarity_metrics = self.compute_metrics(queries_results)
            metrics[similarity_name] = similarity_metrics
        return metrics
    
    def compute_similarity_function_product(self, similarity_function: SimilarityFunction, top_k: int = 100) -> Dict[int, List[Tuple[float, int]]]:
        """
        Computes the evaluation metrics for a given similarity function.
        Args:
            similarity_function (SimilarityFunction): The similarity function to use for computing the similarity between queries and documents.
            top_k (int): The number of retrieved documents for which to compute the evaluation metrics. Defaults to 10.
        Returns:
            Dict[int, List[Tuple[float, int]]] : A dictionary mapping query indexes to a list of tuples containing the similarity score and the document index.
        """

        query_results = {}
        
        # compute the similarity between each query and each document
        if similarity_function == SimilarityFunction.COSINE:
            for query_index in range(len(self.query_embeddings)):
                query_embedding = self.query_embeddings[query_index]
                similarity = F.cosine_similarity(query_embedding, self.corpus_embeddings, dim=1)
                scores, indices = similarity.topk(top_k, dim=0)
                s = scores.cpu().numpy().ravel()
                i = indices.cpu().numpy().ravel()
                query_results[query_index] = [(s, i) for s, i in zip(s, i)]
                
        return query_results    

    def compute_metrics(self, queries_results: Dict[int, List[Tuple[float, int]]]):
        """
        Computes the evaluation metrics for a given similarity function.
        Args:
            queries_results (Dict[int, List[Tuple[float, int]]]): A dictionary mapping query indexes to a list of tuples containing the similarity score and the document index.
        Returns:
            Dict[str, Dict[str, float]]: A dictionary mapping metric names to dictionaries mapping metric values to scores.
        """
        # Init score computation values
        num_hits_at_k = {k: 0 for k in self.accuracy_at_k}
        precisions_at_k = {k: [] for k in self.precision_recall_at_k}
        recall_at_k = {k: [] for k in self.precision_recall_at_k}
        MRR = {k: 0 for k in self.mrr_at_k}
        ndcg = {k: [] for k in self.ndcg_at_k}
        AveP_at_k = {k: [] for k in self.map_at_k}

        # queries not in top 3
        self.problematic_queries = set()

        # Compute scores on results
        for query_index, results in queries_results.items():
            # Sort scores (probably unecessary but just in case)
            top_hits = sorted(results, key=lambda x: x[0], reverse=True)

            relevant_docs_ids = self.ground_truth[query_index]
            # Accuracy@k - We count the result correct, if at least one relevant doc is across the top-k documents
            found_any_acc_k = False
            for k_val in self.accuracy_at_k:
                for hit in top_hits[0:k_val]:
                    if hit[1] in relevant_docs_ids:
                        num_hits_at_k[k_val] += 1
                        found_any_acc_k = True
                        break
            if not found_any_acc_k:
                self.problematic_queries.add(query_index)

            # Precision and Recall@k
            for k_val in self.precision_recall_at_k:
                num_correct = 0
                for hit in top_hits[0:k_val]:
                    if hit[1] in relevant_docs_ids:
                        num_correct += 1
                
                precisions_at_k[k_val].append(num_correct / k_val)
                recall_at_k[k_val].append(num_correct / len(relevant_docs_ids))

            # @Mean Reciprocal Rank
            for k_val in self.mrr_at_k:
                for rank, hit in enumerate(top_hits[0:k_val]):
                    if hit[1] in relevant_docs_ids:
                        MRR[k_val] += 1.0 / (rank + 1)
                        break

            # NDCG@k (normalized discounted cumulative gain at k)
            for k_val in self.ndcg_at_k:
                dcg = 0
                idcg = 0
                for i in range(k_val):
                    if i < len(top_hits):
                        if top_hits[i][1] in relevant_docs_ids:
                            dcg += 1 / np.log2(i + 2)
                    idcg += 1 / np.log2(i + 2)
                ndcg[k_val].append(dcg / idcg)
            
            # Map@k
            for k_val in self.map_at_k:
                num_correct = 0
                sum_precisions = 0
                for i, hit in enumerate(top_hits[0:k_val]):
                    if hit[1] in relevant_docs_ids:
                        num_correct += 1
                        sum_precisions += num_correct / (i + 1)
                avg_precision = sum_precisions / min(k_val, len(relevant_docs_ids))
                AveP_at_k[k_val].append(avg_precision)

            
        # Compute averages
        for k in num_hits_at_k:
            num_hits_at_k[k] /= len(self.query_embeddings)

        for k in precisions_at_k:
            precisions_at_k[k] = np.mean(precisions_at_k[k])
        
        for k in recall_at_k:
            recall_at_k[k] = np.mean(recall_at_k[k])

        for k in MRR:
            MRR[k] /= len(self.query_embeddings)
            MRR[k] = 1/MRR[k]

        for k in ndcg:
            ndcg[k] = np.mean(ndcg[k])
        
        for k in AveP_at_k:
            AveP_at_k[k] = np.mean(AveP_at_k[k])
        
        return {
            "accuracy@k": num_hits_at_k,
            "precision@k": precisions_at_k,
            "recall@k": recall_at_k,
            "ndcg@k": ndcg,
            "mrr@k": MRR,
            "map@k": AveP_at_k,
        }
    
    def output_scores(self, scores):
        """
        Outputs the evaluation metrics.
        Args:
            metrics (Dict[str, Dict[str, float]]): A dictionary mapping metric names to dictionaries mapping metric values to scores.
        """
        for k in scores["accuracy@k"]:
            logger.info("Accuracy@{}: {:.2f}%".format(k, scores["accuracy@k"][k] * 100))

        for k in scores["precision@k"]:
            logger.info("Precision@{}: {:.2f}%".format(k, scores["precision@k"][k] * 100))

        for k in scores["recall@k"]:
            logger.info("Recall@{}: {:.2f}%".format(k, scores["recall@k"][k] * 100))

        for k in scores["mrr@k"]:
            logger.info("MRR@{}: {:.4f} rank from top".format(k, scores["mrr@k"][k]))

        for k in scores["ndcg@k"]:
            logger.info("NDCG@{}: {:.4f}% as good as ideal ranking".format(k, scores["ndcg@k"][k] * 100))

        for k in scores["map@k"]:
            logger.info("MAP@{}: {:.4f}".format(k, scores["map@k"][k]))
    


In [239]:
embeddings_path = f"{base_dir}/embeddings_distilbert_base_uncased_mean_pooling/embeddings.npy"
query_embeddings_path = f"{base_dir}/evaluation_dataset/query_embeddings.npy"

corpus_embeddings = np.load(embeddings_path)
query_embeddings = np.load(query_embeddings_path)
# convert to tensor
corpus_embeddings = torch.from_numpy(corpus_embeddings)
query_embeddings = torch.from_numpy(query_embeddings)
#normalize
corpus_embeddings = F.normalize(corpus_embeddings, p=2, dim=1)
query_embeddings = F.normalize(query_embeddings, p=2, dim=1)

In [241]:
ir_evaluator = MailioInformationRetrievalEvaluator(ground_truth, corpus_embeddings, query_embeddings)
metrics = ir_evaluator.run()
for name, score in metrics.items():
    print(f"Similarity function: {name}")
    ir_evaluator.output_scores(score)
    print("\n")
print(f"Found {len(ir_evaluator.get_problematic_queries())} highly problematic queries")
for problem_query_index in ir_evaluator.get_problematic_queries():
    print(df_queries.iloc[problem_query_index]["query"])
# print(metrics[SimilarityFunction.COSINE.name])

2025-01-28 14:49:37,664 - INFO - Accuracy@1: 61.36%
2025-01-28 14:49:37,665 - INFO - Accuracy@3: 70.45%
2025-01-28 14:49:37,665 - INFO - Accuracy@5: 80.68%
2025-01-28 14:49:37,665 - INFO - Accuracy@10: 87.50%
2025-01-28 14:49:37,666 - INFO - Precision@1: 61.36%
2025-01-28 14:49:37,666 - INFO - Precision@3: 25.38%
2025-01-28 14:49:37,666 - INFO - Precision@5: 17.27%
2025-01-28 14:49:37,666 - INFO - Precision@10: 10.11%
2025-01-28 14:49:37,667 - INFO - Recall@1: 53.27%
2025-01-28 14:49:37,667 - INFO - Recall@3: 63.83%
2025-01-28 14:49:37,667 - INFO - Recall@5: 72.92%
2025-01-28 14:49:37,667 - INFO - Recall@10: 80.49%
2025-01-28 14:49:37,668 - INFO - MRR@10: 1.4532 rank from top
2025-01-28 14:49:37,668 - INFO - NDCG@10: 17.4391% as good as ideal ranking
2025-01-28 14:49:37,668 - INFO - MAP@100: 0.6326


Similarity function: cosine


Found 11 highly problematic queries
"Zoom invitation on April 21, 2021"
"Pat asking how i'm doing with Ukrain war"
"United Airlines eTicket receipt for confirmation OY874S"
"Customer Acquisition and Retention Weekly Issue 281 February 2020"
"Mailio NFTs daily summary September 2022"
"GLS 508912991 delivery notice"
"Budget Rent A Car reservation reminder September 2021"
"Akontacija DDPO junij 2020"
"racun za april 2020"
"CueMateAPISpec API changes from Pat"
"San Francisco Java User Group workshop December 2023"


In [243]:
#| hide

import nbdev; nbdev.nbdev_export()