In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
import numpy as np
import math

# =========================
# 1. Dataset
# =========================
documents = [
    "AI is transforming healthcare through deep learning for medical image diagnosis.",   #1 relevant
    "COVID-19 pandemic disrupted global healthcare systems and accelerated telemedicine.",#2 partial
    "Machine learning helps in drug discovery and personalized treatment.",               #3 relevant
    "Travel restrictions during COVID affected mental health of many people.",            #4 not relevant to AI
    "Deep learning applications improve disease prediction in modern healthcare.",        #5 relevant
    "AI-powered robotic surgery increases precision and reduces recovery time.",          #6 relevant
    "Wearable health devices use machine learning to monitor patient vitals in real-time.",#7 relevant
    "Telemedicine platforms use AI chatbots to provide initial health consultations.",    #8 relevant
    "Big data analytics helps hospitals predict outbreaks and optimize resources.",       #9 partially relevant
    "Genomics research uses deep learning to identify gene mutations linked to diseases.",#10 relevant
    "AI assists doctors in early cancer detection using image classification models.",    #11 relevant
    "COVID-19 accelerated the adoption of remote patient monitoring technologies.",       #12 not relevant (no AI)
    "Natural language processing extracts clinical insights from electronic health records.", #13 relevant
    "AI-based drug discovery reduces research time from years to months.",                #14 relevant
    "Mental health chatbots use deep learning to provide emotional support to patients."  #15 relevant
]


query = "AI in healthcare"
# Ground truth for query: "AI in healthcare"
ground_truth = [
    1, 0, 1, 0, 1,
    1, 1, 1, 0, 1,
    1, 0, 1, 1, 1
]
ground_truth = np.array(ground_truth)  # ✅ FIX: convert to numpy array
  # ground truth relevance

# =========================
# 2. TF-IDF Search
# =========================
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)
query_vec = tfidf.transform([query])
tfidf_scores = (tfidf_matrix * query_vec.T).toarray().flatten()
tfidf_ranked = np.argsort(-tfidf_scores)  # descending

# =========================
# 3. BM25 Search
# =========================
tokenized_docs = [doc.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)
bm25_scores = bm25.get_scores(query.lower().split())
bm25_ranked = np.argsort(-bm25_scores)

# =========================
# 4. Evaluation Metrics
# =========================
def precision_at_k(ranked_idx, ground_truth, k):
    rel = ground_truth[ranked_idx[:k]]
    return np.sum(rel) / k

def average_precision(ranked_idx, ground_truth):
    score = 0
    relevant = 0
    for i, idx in enumerate(ranked_idx, start=1):
        if ground_truth[idx] == 1:
            relevant += 1
            score += relevant / i
    return score / np.sum(ground_truth)

def dcg(scores):
    return sum((r / math.log2(i+2)) for i, r in enumerate(scores))

def ndcg(ranked_idx, ground_truth):
    ideal = sorted(ground_truth, reverse=True)
    return dcg(ground_truth[ranked_idx]) / dcg(ideal)

# =========================
# 5. Calculate Metrics
# =========================
models = {
    "TF-IDF": tfidf_ranked,
    "BM25": bm25_ranked
}

print(f"Query: {query}\n")
for name, ranked in models.items():
    print(f"=== {name} Search Result Ranking ===")
    print("Document order (by relevance):", ranked)
    p_at_3 = precision_at_k(ranked, ground_truth, 3)
    map_score = average_precision(ranked, ground_truth)
    ndcg_score = ndcg(ranked, ground_truth)
    print(f"Precision@3 = {p_at_3:.2f}")
    print(f"MAP = {map_score:.4f}")
    print(f"NDCG = {ndcg_score:.4f}\n")


Query: AI in healthcare

=== TF-IDF Search Result Ranking ===
Document order (by relevance): [ 4  0 10  1  2  6  7 13  5  3  8  9 11 12 14]
Precision@3 = 1.00
MAP = 0.8593
NDCG = 0.9539

=== BM25 Search Result Ranking ===
Document order (by relevance): [ 0 10  1  7  2  4  6  3  5  8  9 11 12 13 14]
Precision@3 = 0.67
MAP = 0.8078
NDCG = 0.9303

