EVALUATION METRICS

In [2]:
import json
import math
import os
import re
import collections
from collections import defaultdict
from array import array
import numpy as np
import pandas as pd
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # one level up from part_2
sys.path.append(project_root)
from part_2.indexing_evaluation import load_processed_docs,create_index_tfidf,search_tf_idf

In [None]:
def precision_k(y_true,y_score,k=10):
    """
    Precision@K: the fraction of the K highest-scored items that are relevant.

    Args:
        y_true: ground-truth binary labels (1 for relevant, 0 for not relevant)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10); it is capped
           at the number of available items

    Returns:
        Precision at rank k (between 0 and 1 for binary labels), or 0.0 when
        the effective cutoff is not positive (e.g. empty input)
    """
    # Indices of the items from highest to lowest score
    ranking = np.argsort(np.asarray(y_score))[::-1]
    ranked_labels = np.asarray(y_true).take(ranking)

    # Never look further down the ranking than there are items
    cutoff = min(k, len(ranked_labels))
    if cutoff <= 0:
        return 0.0
    return np.sum(ranked_labels[:cutoff]) / cutoff

In [None]:
def recall_k(y_true,y_score,k=10):
    """
    Recall@K: the fraction of all relevant items that appear among the
    K highest-scored items.

    Args:
        y_true: ground-truth binary labels (1 for relevant, 0 for not relevant)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10)

    Returns:
        Recall at rank k, or 0.0 when there are no relevant items at all
    """
    ranking = np.argsort(np.asarray(y_score))[::-1]
    ranked_labels = np.asarray(y_true).take(ranking)

    total_relevant = np.sum(ranked_labels)
    hits_in_top_k = np.sum(ranked_labels[:k])

    # Guard against division by zero when nothing is relevant
    if total_relevant > 0:
        return hits_in_top_k / total_relevant
    return 0.0

In [7]:
def f1_k(y_true,y_score,k=10):
    """
    F1@K: the harmonic mean of Precision@K and Recall@K.

    Args:
        y_true: ground-truth binary labels (1 for relevant, 0 for not relevant)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10)

    Returns:
        F1-score at rank k, or 0.0 when precision and recall are both zero
    """
    precision = precision_k(y_true,y_score,k)
    recall = recall_k(y_true,y_score,k)

    denominator = precision + recall
    # Harmonic mean is undefined when both components are zero
    if denominator > 0:
        return (2*precision*recall)/denominator
    return 0.0

In [8]:
def average_precision(y_true,y_score,k=10):
    """
    Average Precision@K (AP@K).

    Walks down the top K ranked items and, at every position holding a
    relevant item (label == 1), records the precision at that position.
    The result is the mean of those recorded precisions, i.e. the sum is
    divided by the number of relevant items found within the top K.

    Args:
        y_true: ground-truth binary labels (1 for relevant, 0 for not relevant)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10)

    Returns:
        Average precision at rank k, or 0.0 when no relevant item is found
        within the top k
    """
    ranked_labels = np.asarray(y_true).take(np.argsort(y_score)[::-1])
    depth = min(k, len(ranked_labels))

    hits = 0
    precisions_at_hits = []
    # rank is 1-based, so precision at this position is hits / rank
    for rank in range(1, depth + 1):
        if ranked_labels[rank - 1] == 1:
            hits += 1
            precisions_at_hits.append(hits / rank)

    if hits > 0:
        return np.sum(precisions_at_hits) / hits
    return 0.0

In [9]:
def map_k(dt,k):
    """
    Mean Average Precision@K (MAP@K) over all queries in a DataFrame.

    Args:
        dt: a DataFrame containing the columns "query_id", "labels" and
            "predicted_relevance"
        k: number of top-ranked documents to consider for each query

    Returns:
        A tuple (mean of the per-query AP@K values, list of the per-query
        AP@K values in order of first appearance of each query_id)
    """
    query_column = dt["query_id"]

    def _ap_for(query):
        # Restrict to the rows judged for this query and score them
        rows = dt.loc[query_column == query]
        return average_precision(
            rows["labels"].values,
            rows["predicted_relevance"].values,
            k,
        )

    per_query_ap = [_ap_for(query) for query in query_column.unique()]
    return np.sum(per_query_ap) / len(per_query_ap), per_query_ap

In [None]:
def rr_k(y_true,y_score,k=10):
    """
    Reciprocal Rank@K (RR@K).

    Measures how far down the ranking the *first relevant* item appears
    within the top K results: 1 / (1-based rank of the first relevant item).

    Args:
        y_true: ground-truth relevance labels (0 for not relevant; any value
                greater than 0 counts as relevant, so binary labels behave
                as usual)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10)

    Returns:
        Reciprocal rank at rank k as a float; 0.0 when no relevant item
        appears within the top k
    """
    order = np.argsort(np.asarray(y_score))[::-1]
    top_labels = np.take(np.asarray(y_true), order)[:k]

    # Positions (0-based) of relevant items in the top k. Testing "> 0"
    # rather than taking argmax finds the first relevant item even when
    # a later item carries a higher label.
    relevant_positions = np.flatnonzero(top_labels > 0)
    if relevant_positions.size == 0:
        # Float for consistency with the other metrics in this notebook
        return 0.0
    return 1.0 / (relevant_positions[0] + 1)

In [10]:
def dcg_k(y_true,y_score,k=10):
    """
    Discounted Cumulative Gain at rank K (DCG@K).

    Each of the top K ranked items contributes a gain of 2**label - 1,
    divided by log2(rank + 1) with rank starting at 1, so relevant items
    placed earlier in the ranking contribute more.

    Args:
        y_true: ground-truth relevance labels (1 for relevant, 0 for not relevant)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10)

    Returns:
        Discounted Cumulative Gain at rank k
    """
    top_k_indices = np.argsort(y_score)[::-1][:k]
    top_labels = np.take(y_true, top_k_indices)

    gains = 2**top_labels - 1
    # Discount arguments start at 2 because log2(1) is 0 (would divide by zero)
    discount_args = np.arange(2, len(top_labels) + 2)
    return np.sum(gains / np.log2(discount_args))

In [11]:
def ndcg_k(y_true,y_score,k=10):
    """
    Normalized Discounted Cumulative Gain at rank K (NDCG@K).

    Divides the DCG@K of the predicted ranking by the ideal DCG@K, which is
    obtained by ranking the items by their true labels.

    Args:
        y_true: ground-truth relevance labels (1 for relevant, 0 for not relevant)
        y_score: predicted scores; items are ranked by descending score
        k: number of top-ranked items to consider (default 10)

    Returns:
        NDCG at rank k rounded to 4 decimal places, or 0 when the ideal
        DCG is zero (no relevant items)
    """
    achieved = dcg_k(y_true,y_score,k)
    # Ideal ordering: rank the items by their own true labels
    ideal = dcg_k(y_true,y_true,k)
    return round(achieved/ideal,4) if ideal else 0


In [None]:
# Main code

if __name__=="__main__":

    # Rank cutoff shared by every metric below, so the column names and the
    # computed values cannot drift apart
    TOP_K=5

    # get the current working directory
    base_dir=os.getcwd()

    # define path files for the processed documents and the validation labels
    proc_doc_path = os.path.join(base_dir,'..', '..', 'data', 'processed_docs.jsonl')
    validation_path = os.path.join(base_dir,'..', '..', 'data', 'validation_labels.csv')

    # load processed documents from the path
    docs=load_processed_docs(proc_doc_path)

    # create TF-IDF index components
    index,tf,df_counts,idf,title_index=create_index_tfidf(docs)

    # load validation data
    validation_df=pd.read_csv(validation_path)

    # queries that we will be using (keys must match "query_id" in the validation file)
    queries={1:"women full sleeve sweatshirt cotton",
             2:"men slim jeans blue"}

    result=[]

    # new column to store the predicted relevance
    validation_df["predicted_relevance"]=0

    # loop through each query to perform retrieving and scoring
    for q, query in queries.items():
        # presumably returns pids ordered best-first; verify against search_tf_idf
        retrieve_pid=search_tf_idf(query,index,tf,idf,title_index)

        # position i in the retrieved list gets score len - i, so earlier
        # results get larger scores; pids not retrieved fall back to 0 below
        rank_score = {pid: len(retrieve_pid) - i for i, pid in enumerate(retrieve_pid)}

        # update predicted relevance in the validation set (mask computed once)
        query_rows = validation_df["query_id"] == q
        validation_df.loc[query_rows, "predicted_relevance"] = validation_df.loc[query_rows, "pid"].apply(lambda pid: rank_score.get(pid, 0))

    # evaluate ranking metrics for each query
    for i in queries.keys():
        query_doc=validation_df[validation_df["query_id"]==i]
        y_true=query_doc["labels"].values
        y_score=query_doc["predicted_relevance"].values

        # compute the different statistics for this query
        # ("MRR" holds this query's reciprocal rank and "NDCG" its NDCG,
        # both at the TOP_K cutoff)
        result.append({
            "query_id":i,
            f"Precision@{TOP_K}": round(precision_k(y_true,y_score,TOP_K),3),
            f"Recall@{TOP_K}": round(recall_k(y_true,y_score,TOP_K),3),
            f"AP@{TOP_K}": round(average_precision(y_true,y_score,TOP_K),3),
            f"F1@{TOP_K}": round(f1_k(y_true,y_score,TOP_K),3),
            "MRR": round(rr_k(y_true,y_score,TOP_K),3),
            "NDCG": round(ndcg_k(y_true,y_score,TOP_K),3),
        })

    # create a DataFrame with all the results and print them
    results_df=pd.DataFrame(result)
    print(results_df)

    # Compute Mean Average Precision across all queries at the same cutoff
    mean_avp,avp_list=map_k(validation_df,k=TOP_K)
    print("MAP:",round(mean_avp,3))
 

   query_id  Precision@5  Recalln@5  AP@5   F1@5  MRR   NDCG
0         1          1.0      0.385  1.00  0.556  1.0  1.000
1         2          0.8      0.400  0.95  0.533  1.0  0.854
MAP: 0.975
