In [11]:
import numpy as np

# sample query: system output and ground truth
# top 10 doc ids as ranked by the vsm for this query
vsm_ranked_list_doc_ids = [1245, 1191, 1124, 1136, 571, 1190, 309, 341, 701, 1132]

# assumed graded relevance judgements for the query, keyed by doc id
relevance_map = {
    1245: 3,  # highly relevant
    1191: 2,  # relevant
    1124: 1,  # moderately relevant
    # the remaining documents are judged not relevant for this sample
    1136: 0,
    571: 0,
    1190: 0,
    309: 0,
    341: 0,
    701: 0,
    1132: 0,
}

# binary view of the judgements: any positive grade counts as relevant
binary_relevant_set = set(filter(lambda doc_id: relevance_map[doc_id] > 0, relevance_map))
total_relevant_docs = len(binary_relevant_set)  # 3 relevant docs in this sample

In [12]:

# precision and recall function
def precision_recall_at_k(ranked_list, relevant_set, k):
    """Calculate precision and recall at cut-off k for a single query.

    Args:
        ranked_list: doc ids in the order the system ranked them.
        relevant_set: collection of doc ids judged relevant for the query.
        k: cut-off rank; must be a positive integer.

    Returns:
        A ``(precision, recall)`` tuple. Recall is 0.0 when ``relevant_set``
        is empty.

    Raises:
        ValueError: if ``k`` is not positive (precision@k is undefined and a
            negative k would silently slice from the end of the list).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    # count relevant documents retrieved in top k
    relevant_retrieved = sum(1 for doc_id in ranked_list[:k] if doc_id in relevant_set)

    # precision@k: relevant retrieved / k (a list shorter than k is still divided by k)
    precision = relevant_retrieved / k

    # recall@k: relevant retrieved / total relevant for THIS query; derived from
    # the argument rather than a notebook-level global so the function is reusable
    total_relevant = len(relevant_set)
    recall = relevant_retrieved / total_relevant if total_relevant > 0 else 0.0

    return precision, recall

# average precision (ap) function; the mean of ap over several queries gives map
def average_precision(ranked_list, relevant_set):
    """Calculate average precision (AP) for a single query.

    Args:
        ranked_list: doc ids in the order the system ranked them.
        relevant_set: collection of doc ids judged relevant for the query.

    Returns:
        Sum of precision values taken at each rank holding a relevant
        document, divided by the number of relevant documents. Returns 0.0
        when ``relevant_set`` is empty.
    """
    # normalise by the relevant docs of THIS query, not a notebook-level global
    total_relevant = len(relevant_set)
    if total_relevant == 0:
        return 0.0

    sum_precisions = 0.0
    relevant_count = 0
    for rank, doc_id in enumerate(ranked_list, 1):
        if doc_id in relevant_set:
            relevant_count += 1
            # precision at the rank of the current relevant document
            sum_precisions += relevant_count / rank

    # relevant docs that were never retrieved contribute 0 to the sum
    return sum_precisions / total_relevant

# normalized discounted cumulative gain (ndcg) function
def _dcg(scores):
    """Return the discounted cumulative gain of graded scores given in rank order."""
    # dcg formula: $\text{dcg} = \sum_{i=1}^{n} \frac{2^{\text{rel}_i} - 1}{\log_2(i+1)}$
    gains = 2.0 ** scores - 1.0
    # ranks start at 1, so the first discount is log2(1 + 1) = 1
    discounts = np.log2(np.arange(2, scores.size + 2))
    return float(np.sum(gains / discounts))


def ndcg_at_k(ranked_list, relevance_map, k):
    """Calculate nDCG at cut-off k using graded relevance.

    Args:
        ranked_list: doc ids in the order the system ranked them.
        relevance_map: mapping of doc id -> graded relevance score; doc ids
            missing from the mapping are scored 0.
        k: cut-off rank; must be non-negative.

    Returns:
        dcg / idcg, or 0.0 when the ideal dcg is 0 (no positively graded
        documents, or k == 0).

    Raises:
        ValueError: if ``k`` is negative (a negative slice bound would drop
            items from the end of the list instead of truncating it).
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # graded scores of the retrieved documents, in system rank order;
    # float dtype keeps 2**rel from overflowing integer arithmetic
    rel_scores = np.array(
        [relevance_map.get(doc_id, 0) for doc_id in ranked_list[:k]], dtype=float
    )

    # ideal ordering: the k highest true relevance scores, best first
    ideal_scores = np.array(
        sorted(relevance_map.values(), reverse=True)[:k], dtype=float
    )

    # each list gets discounts sized to its own length: the retrieved list and
    # the ideal list can have different lengths (short ranking, or fewer than
    # k judged documents), which previously caused a broadcasting error
    dcg = _dcg(rel_scores)
    idcg = _dcg(ideal_scores)

    # ndcg = dcg / idcg
    return dcg / idcg if idcg > 0 else 0.0

In [13]:
# run the evaluation for the sample query
K_value = 5  # cut-off: score only the top 5 results

# graded metric first, then the binary-relevance metrics (the calls are independent)
nDCG_at_K = ndcg_at_k(vsm_ranked_list_doc_ids, relevance_map, K_value)
AP_score = average_precision(vsm_ranked_list_doc_ids, binary_relevant_set)
P_at_K, R_at_K = precision_recall_at_k(vsm_ranked_list_doc_ids, binary_relevant_set, K_value)

# one report, one line per metric
print(
    "\n--- VSM Evaluation Metrics Output (Sample) ---",
    f"Total Relevant Docs (in sample): {total_relevant_docs}",
    f"Precision@{K_value}: {P_at_K:.4f}",
    f"Recall@{K_value}: {R_at_K:.4f}",
    f"Average Precision (AP) for Query: {AP_score:.4f}",
    f"nDCG@{K_value}: {nDCG_at_K:.4f}",
    sep="\n",
)


--- VSM Evaluation Metrics Output (Sample) ---
Total Relevant Docs (in sample): 3
Precision@5: 0.6000
Recall@5: 1.0000
Average Precision (AP) for Query: 1.0000
nDCG@5: 1.0000
