# Standard data

# Preprocess

In [None]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U gdown
import gdown

# import all articles
file_id = '165wV72OUUHYDO3avmcrVOI2QGkoGTrL-'
gdown.download(f"https://drive.google.com/uc?id={file_id}", "pubmed_metadata_sample_full.csv", quiet=False)




Downloading...
From (original): https://drive.google.com/uc?id=165wV72OUUHYDO3avmcrVOI2QGkoGTrL-
From (redirected): https://drive.google.com/uc?id=165wV72OUUHYDO3avmcrVOI2QGkoGTrL-&confirm=t&uuid=6deb043a-00de-455e-9bde-a7edaaa25eef
To: /content/pubmed_metadata_sample_full.csv
100%|██████████| 294M/294M [00:01<00:00, 209MB/s]


'pubmed_metadata_sample_full.csv'

In [None]:
pip install faiss-cpu



In [None]:
import json
import math
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import faiss

##############################################
# 1. Preprocess the PubMed Metadata (Corpus)
##############################################

# Read the CSV file with columns: pmid, title, abstract, keywords
df = pd.read_csv("pubmed_metadata_sample_full.csv", usecols=[0, 1, 2, 3])
df.columns = ['pmid', 'title', 'abstract', 'keywords']
df = df.dropna(subset=['title', 'abstract'])

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # merge whitespace
    text = re.sub(r'[^a-zA-Z0-9., ]', '', text)  # remove special characters
    return text.strip()

df['title'] = df['title'].apply(clean_text)
df['abstract'] = df['abstract'].apply(clean_text)
df['keywords'] = df['keywords'].fillna("").apply(lambda x: clean_text(x.lower()))
# Combine text fields as full text
df['full_text'] = df['title'] + " " + df['abstract'] + " " + df['keywords']
df.to_csv("cleaned_clinical_trials.csv", index=False)
print(f"✅ Cleaned dataset: {df.shape[0]} articles")

##############################################
# 2. Build Corpus from Cleaned CSV and Create Embeddings
##############################################
# We use the cleaned CSV file to create our corpus.
df['pmid'] = df['pmid'].astype(int)

# Create a dictionary mapping pmid -> full_text
corpus_text = {row['pmid']: row['full_text'] for _, row in df.iterrows()}

# Build a list of PMIDs and texts (order matters for FAISS index)
all_pmids = list(corpus_text.keys())
all_texts = [corpus_text[pid] for pid in all_pmids]

# Choose a SentenceTransformer model among the following three.
#model = SentenceTransformer('paraphrase-mpnet-base-v2')
#model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-mpnet-base-v2')

# Encode all texts into embeddings.
embeddings = model.encode(all_texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# Normalize embeddings so cosine similarity equals inner product.
faiss.normalize_L2(embeddings)

# Build a FAISS index (inner product based, which works as cosine similarity for normalized vectors)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Save the FAISS index (optional)
faiss.write_index(index, "clinical_trials_faiss.index")

# Create a mapping from FAISS index position to PMIDs.
index_to_pmid = {i: pid for i, pid in enumerate(all_pmids)}

##############################################
# 3. Load RELISH Labels and Build Ground-Truth Mapping
##############################################
# The RELISH JSON file contains query PMIDs and their candidate relevance information.
# It is assumed that each entry has a 'pmid' and a 'response' field,
# where response contains lists under keys 'relevant', 'partial', and 'irrelevant'.

def load_labeled_data(json_file_path, num_entries=100):
    with open(json_file_path, 'r') as f:
        labeled_data = json.load(f)
    return labeled_data[:num_entries]

def extract_pmid_and_responses(labeled_data):
    queries = []
    for entry in labeled_data:
        pmid = entry['pmid']
        response = entry['response']
        queries.append({
            'pmid': pmid,
            'relevant': response.get('relevant', []),
            'partial': response.get('partial', []),
            'irrelevant': response.get('irrelevant', [])
        })
    return queries

# Update the file path as needed.
json_file_path = '/content/drive/MyDrive/RELISH_v1.json'
labeled_data = load_labeled_data(json_file_path)
queries_list = extract_pmid_and_responses(labeled_data)

# Build ground_truth mapping: for each query pmid (as int), map candidate pmid -> relevance score
# We assign: fully relevant: 2, partial: 1, irrelevant: 0
ground_truth = {}  # {query_pmid: {candidate_pmid: score}}
for entry in queries_list:
    qid = int(entry['pmid'])
    ground_truth[qid] = {}
    for pmid in entry['relevant']:
        ground_truth[qid][int(pmid)] = 2
    for pmid in entry['partial']:
        # If a candidate already exists with score 2, keep it.
        ground_truth[qid][int(pmid)] = max(ground_truth[qid].get(int(pmid), 0), 1)
    for pmid in entry['irrelevant']:
        # irrelevant explicitly scored as 0 (optional, since absence can be treated as 0)
        ground_truth[qid][int(pmid)] = 0

##############################################
# 4. Recommendation Function
##############################################
def recommend_articles(query_title, query_abstract, query_keywords, top_n=5):
    """
    Compute query embedding from title, abstract, keywords and search FAISS index.
    Excludes the query itself if present.
    Returns a list of recommended PMIDs.
    """
    query_text = " ".join((query_title + " " + query_abstract + " " + query_keywords).split())
    query_embedding = model.encode([query_text], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)
    # Retrieve more than top_n to allow filtering.
    D, I = index.search(query_embedding, top_n + 5)
    candidate_pmids = [index_to_pmid[int(idx)] for idx in I[0]]

    # Optionally, if the query article's PMID is known, filter it out.
    # Here, we do a simple heuristic: if the query text is very similar to a candidate's text, skip it.
    filtered = []
    for pid in candidate_pmids:
        # If the query is already in the corpus and the candidate text contains similar words, skip.
        # (Alternatively, if you know the query pmid, you can pass it in and filter exactly.)
        if query_title.lower() in corpus_text.get(pid, "").lower():
            continue
        filtered.append(pid)
        if len(filtered) == top_n:
            break
    return filtered

##############################################
# 5. Ranking Metrics Functions
##############################################
def average_precision_at_k(relevant_pmids, recommended_pmids, k):
    """
    Compute Average Precision at k.
    Treat any candidate with a relevance score >= 1 as relevant.
    """
    if not relevant_pmids:
        return 0.0
    relevant_set = set(relevant_pmids)
    num_relevant = 0.0
    ap_sum = 0.0
    for i, pid in enumerate(recommended_pmids[:k], start=1):
        # binary relevance: score >= 1 is relevant
        if pid in relevant_set:
            num_relevant += 1
            ap_sum += num_relevant / i
    return ap_sum / min(len(relevant_set), k)

def mean_average_precision(all_relevant_list, all_recommended_list, k):
    ap_scores = []
    for rels, recs in zip(all_relevant_list, all_recommended_list):
        ap = average_precision_at_k(rels, recs, k)
        ap_scores.append(ap)
    return np.mean(ap_scores) if ap_scores else 0.0

def reciprocal_rank(recommended_pmids, relevant_set):
    for i, pid in enumerate(recommended_pmids, start=1):
        if pid in relevant_set:
            return 1.0 / i
    return 0.0

def dcg_at_k(recommended_pmids, ground_truth_dict, k):
    dcg = 0.0
    for i, pid in enumerate(recommended_pmids[:k], start=1):
        # Use the graded relevance score (if missing, 0)
        score = ground_truth_dict.get(pid, 0)
        dcg += score / math.log2(i + 1)
    return dcg

def ndcg_at_k(recommended_pmids, ground_truth_dict, k):
    dcg = dcg_at_k(recommended_pmids, ground_truth_dict, k)
    # Ideal DCG: sort the relevance scores of the candidates in descending order.
    ideal_scores = sorted(ground_truth_dict.values(), reverse=True)[:k]
    idcg = sum(score / math.log2(i + 1) for i, score in enumerate(ideal_scores, start=1))
    return dcg / idcg if idcg > 0 else 0.0

##############################################
# 6. Evaluation Over Multiple Queries
##############################################
# Evaluate only queries that are in our ground_truth and also appear in our corpus.
query_ids = [qid for qid in ground_truth if qid in corpus_text]
K = 5

all_AP = []
all_RR = []
all_NDCG = []
per_query_results = {}

for qid in query_ids:
    # Get query text from corpus_text
    query_text = corpus_text[qid]
    # Here, we assume that the query's title, abstract and keywords can be recovered
    # by splitting or using the df if available. Otherwise, we use the full text.
    # For simplicity, we use the full_text from corpus.
    # In a real system, you would retrieve the original title, abstract, keywords.
    # Below, we simply split the full_text assuming the title is the first sentence.
    parts = query_text.split(".")
    query_title = parts[0] if parts else query_text
    # Use the remainder for abstract (keywords might be embedded)
    query_abstract = " ".join(parts[1:]) if len(parts) > 1 else ""
    query_keywords = ""  # If not separately available

    recommended_pmids = recommend_articles(query_title, query_abstract, query_keywords, top_n=K)
    per_query_results[qid] = recommended_pmids

    # For binary metrics (AP and RR), consider candidates with score>=1 as relevant.
    true_relevant_set = {pid for pid, score in ground_truth[qid].items() if score >= 1}

    ap = average_precision_at_k(list(true_relevant_set), recommended_pmids, K)
    rr = reciprocal_rank(recommended_pmids, true_relevant_set)
    ndcg = ndcg_at_k(recommended_pmids, ground_truth[qid], K)

    all_AP.append(ap)
    all_RR.append(rr)
    all_NDCG.append(ndcg)

MAP5 = np.mean(all_AP) * 100
MRR = np.mean(all_RR) * 100
NDCG5 = np.mean(all_NDCG) * 100

print(f"Overall MAP@5: {MAP5:.2f}%")
print(f"Overall MRR: {MRR:.2f}%")
print(f"Overall NDCG@5: {NDCG5:.2f}%")

# Optionally, print some per-query results.
for i, qid in enumerate(query_ids[:5]):
    print(f"\nQuery PMID: {qid}")
    print(f"Recommended PMIDs: {per_query_results[qid]}")
    binary_truth = [pid for pid, score in ground_truth[qid].items() if score >= 1]
    print(f"Ground truth relevant PMIDs: {binary_truth}")


✅ Cleaned dataset: 162360 articles


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/5074 [00:00<?, ?it/s]

Overall MAP@5: 77.23%
Overall MRR: 93.88%
Overall NDCG@5: 74.62%

Query PMID: 22569528
Recommended PMIDs: [18562239, 19282669, 19242111, 22177953, 21730285]
Ground truth relevant PMIDs: [17928366, 18562239, 19052640, 19060905, 19242111, 19244124, 19414607, 19805545, 19816936, 20079430, 20811985, 22028468, 22177953, 23549785, 23712012, 24089523, 25350931, 26235619, 27376062, 28474232, 29454854]

Query PMID: 23613754
Recommended PMIDs: [27924572, 25533345, 29304842, 26224636, 20675210]
Ground truth relevant PMIDs: [18818436, 20022960, 20675210, 22085933, 25533345, 25690936, 29061959, 29304842, 22307056]

Query PMID: 29409062
Recommended PMIDs: [23281855, 26355502, 21103052, 20487513, 16447990]
Ground truth relevant PMIDs: [18443018, 19772615, 22916718, 23281855, 24931993, 26355502, 28570104, 18593717, 19087303, 19237334, 20637083, 21609501, 21846404, 22080466, 22761950, 22927994, 22962469, 23229795, 23514199, 23868775, 24726865, 26455801, 27153661, 27506132, 27571416, 28113697, 28937982]