EVALUATION METRICS

In [1]:
import json
import math
import os
import re
import collections
from collections import defaultdict
from array import array
import numpy as np
import pandas as pd
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # one level up from part_2
sys.path.append(project_root)
from part_2.indexing_evaluation import load_processed_docs,create_index_tfidf,search_tf_idf

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xfa12\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def precision_k(y_true,y_score,k=10):
    """Precision over the k highest-scored items.

    Items are ranked by descending ``y_score`` and the labels of the first
    ``k`` of them are summed. The cutoff is clamped to the number of items,
    so the denominator is ``min(k, len(y_true))`` rather than always ``k``.

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff.

    Returns:
        Sum of the top-k labels divided by the effective cutoff, or 0.0 when
        the effective cutoff is not positive.
    """
    ranking=np.argsort(np.asarray(y_score))[::-1]
    ranked_labels=np.take(np.asarray(y_true),ranking)
    # Never look further than the number of items actually available.
    cutoff=min(k,len(ranked_labels))
    if cutoff<=0:
        return 0.0
    return np.sum(ranked_labels[:cutoff])/cutoff

In [3]:
def recall_k(y_true,y_score,k=10):
    """Recall over the k highest-scored items.

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff.

    Returns:
        Sum of the labels in the top k divided by the sum of all labels, or
        0.0 when the labels sum to nothing positive.
    """
    ranking=np.argsort(np.asarray(y_score))[::-1]
    ranked_labels=np.take(np.asarray(y_true),ranking)
    total_relevant=np.sum(ranked_labels)
    if total_relevant>0:
        found_in_top_k=np.sum(ranked_labels[:k])
        return found_in_top_k/total_relevant
    return 0.0

In [4]:
def f1_k(y_true,y_score,k=10):
    """Harmonic mean of ``precision_k`` and ``recall_k`` at the same cutoff.

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff passed through to both underlying metrics.

    Returns:
        ``2*P*R/(P+R)``, or 0.0 when precision plus recall is not positive.
    """
    precision=precision_k(y_true,y_score,k)
    recall=recall_k(y_true,y_score,k)
    combined=precision+recall
    if combined>0:
        return (2*precision*recall)/combined
    return 0.0

In [5]:
def average_precision(y_true,y_score,k=10):
    """Average precision over the k highest-scored items.

    Walks the ranking from the top and, at every position whose label equals
    1, records the precision reached so far. The recorded precisions are
    averaged over the number of hits found inside the cutoff.

    Args:
        y_true: Relevance labels, one per item (a hit is a label equal to 1).
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff.

    Returns:
        Mean of the precisions at each hit position, or 0.0 when the top k
        contains no hit.
    """
    ranking=np.argsort(y_score)[::-1]
    ranked_labels=np.take(np.asarray(y_true),ranking)
    cutoff=min(k,len(ranking))
    hits=0
    precisions_at_hits=[]
    # Ranks are 1-based so that hits/rank is the precision at that rank.
    for rank in range(1,cutoff+1):
        if ranked_labels[rank-1]==1:
            hits+=1
            precisions_at_hits.append(hits/rank)
    if hits>0:
        return np.sum(precisions_at_hits)/hits
    return 0.0

In [6]:
def map_k(dt,k):
    """Mean average precision across every query in ``dt``.

    Args:
        dt: DataFrame with the columns ``query_id``, ``labels`` and
            ``predicted_relevance``.
        k: Rank cutoff forwarded to ``average_precision``.

    Returns:
        A ``(mean, per_query)`` tuple, where ``per_query`` lists the average
        precision of each query in order of first appearance. When ``dt``
        holds no queries the result is ``(0.0, [])`` rather than NaN.
    """
    avp = []
    for q in dt["query_id"].unique():
        curr_data = dt[dt["query_id"] == q]
        y_true=curr_data["labels"].values
        y_score=curr_data["predicted_relevance"].values
        avp.append(average_precision(y_true,y_score,k))
    # No queries: dividing by len(avp) would yield NaN with a RuntimeWarning.
    if not avp:
        return 0.0, avp
    return np.sum(avp) / len(avp), avp

In [7]:
def rr_k(y_true,y_score,k=10):
    """Reciprocal rank of the first relevant item within the top k.

    An item counts as relevant when its label is greater than zero, so graded
    labels are handled as well as binary ones. (Taking ``np.argmax`` of the
    labels would instead locate the highest grade, which is not necessarily
    the first relevant position.)

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff.

    Returns:
        ``1/rank`` of the first relevant item (rank is 1-based), or 0.0 when
        no relevant item appears in the top k.
    """
    order=np.argsort(np.asarray(y_score))[::-1]
    top_labels=np.take(np.asarray(y_true),order)[:k]
    # Positions (0-based) of every relevant item inside the cutoff.
    relevant_positions=np.flatnonzero(top_labels>0)
    if relevant_positions.size==0:
        return 0.0
    return 1/(relevant_positions[0]+1)

In [8]:
def dcg_k(y_true,y_score,k=10):
    """Discounted cumulative gain of the k highest-scored items.

    Uses the exponential gain ``2**label - 1`` and a ``log2(rank + 1)``
    discount, with 1-based ranks.

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff.

    Returns:
        Sum of gain/discount over the top-k items.
    """
    ranking=np.argsort(y_score)[::-1]
    top_labels=np.take(y_true,ranking[:k])
    # Zero-based positions are shifted by 2 so the first discount is log2(2)=1.
    positions=np.arange(len(top_labels))
    discounts=np.log2(positions+2)
    gains=2**top_labels-1
    return np.sum(gains/discounts)

In [9]:
def ndcg_k(y_true,y_score,k=10):
    """DCG at k normalised by the ideal DCG at k.

    The ideal DCG is obtained by ranking the items by their own labels.

    Args:
        y_true: Relevance labels, one per item.
        y_score: Predicted scores aligned with ``y_true``.
        k: Rank cutoff.

    Returns:
        ``dcg/idcg`` rounded to 4 decimals, or 0 when the ideal DCG is zero.
    """
    achieved=dcg_k(y_true,y_score,k)
    ideal=dcg_k(y_true,y_true,k)
    return round(achieved/ideal,4) if ideal else 0


In [12]:

if __name__=="__main__":
    # Input files live in a `data` directory two levels above the working directory.
    base_dir=os.getcwd()
    proc_doc_path=os.path.join(base_dir,'..','..','data','processed_docs.jsonl')
    validation_path=os.path.join(base_dir,'..','..','data','validation_labels.csv')

    # Load the corpus and build the TF-IDF structures used by the search call.
    docs=load_processed_docs(proc_doc_path)
    index,tf,df_counts,idf,title_index=create_index_tfidf(docs)

    validation_df=pd.read_csv(validation_path)

    queries={
        1:"women full sleeve sweatshirt cotton",
        2:"men slim jeans blue",
    }

    # Turn each search result list into a per-document score: the first
    # returned pid gets the largest value, pids that were not returned get 0.
    # NOTE(review): assumes search_tf_idf returns pids ordered best-first — confirm.
    validation_df["predicted_relevance"]=0
    for q,query in queries.items():
        retrieve_pid=search_tf_idf(query,index,tf,idf,title_index)
        n_retrieved=len(retrieve_pid)
        rank_score={}
        for position,pid in enumerate(retrieve_pid):
            rank_score[pid]=n_retrieved-position
        query_rows=validation_df["query_id"]==q
        validation_df.loc[query_rows,"predicted_relevance"]=validation_df.loc[query_rows,"pid"].apply(lambda doc_pid: rank_score.get(doc_pid,0))

    # Evaluate every query at a cutoff of 5 and collect one record per query.
    result=[]
    for i in queries:
        query_doc=validation_df[validation_df["query_id"]==i]
        y_true=query_doc["labels"].values
        y_score=query_doc["predicted_relevance"].values

        metrics={"query_id":i}
        metrics["Precision@5"]=round(precision_k(y_true,y_score,5),3)
        metrics["Recalln@5"]=round(recall_k(y_true,y_score,5),3)
        metrics["AP@5"]=round(average_precision(y_true,y_score,5),3)
        metrics["F1@5"]=round(f1_k(y_true,y_score,5),3)
        metrics["MRR"]=round(rr_k(y_true,y_score,5),3)
        metrics["NDCG"]=round(ndcg_k(y_true,y_score,5),3)
        result.append(metrics)

    results_df=pd.DataFrame(result)
    print(results_df)

    # Mean of the per-query AP@5 values over the whole validation frame.
    mean_avp,avp_list=map_k(validation_df,k=5)
    print("MAP:",round(mean_avp,3))
 

   query_id  Precision@5  Recalln@5  AP@5   F1@5  MRR   NDCG
0         1          1.0      0.385  1.00  0.556  1.0  1.000
1         2          0.8      0.400  0.95  0.533  1.0  0.854
MAP: 0.975
