In [4]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [7]:
# Build a small hand-written evaluation set and persist it as JSON.
#
# Each entry carries:
#   question             - the query text
#   ground_truth_source  - path of the document expected to answer it
#   ground_truth_answer  - reference answer text
#   relevance_mapping    - {document path: integer relevance grade}; every
#                          entry below grades its single source document as 3

test_set = [
    {
        "question": "What are the early symptoms of Acute Lymphoblastic Leukemia?",
        "ground_truth_source": "processed-text/documents/CancerGov_0000001_1.txt",
        "ground_truth_answer": (
            "Early symptoms include fatigue, fever, easy bruising or bleeding, petechiae, "
            "shortness of breath, weight loss, and frequent infections."
        ),
        "relevance_mapping": {
            "processed-text/documents/CancerGov_0000001_1.txt": 3
        }
    },
    {
        "question": "How is Acute Lymphoblastic Leukemia diagnosed?",
        "ground_truth_source": "processed-text/documents/CancerGov_0000001_1.txt",
        "ground_truth_answer": (
            "Diagnosis is made using tests such as complete blood count (CBC), blood chemistry, "
            "bone marrow biopsy, cytogenetic analysis, immunophenotyping, and lumbar puncture."
        ),
        "relevance_mapping": {
            "processed-text/documents/CancerGov_0000001_1.txt": 3
        }
    },
    {
        "question": "What are the treatment phases of Acute Lymphoblastic Leukemia?",
        "ground_truth_source": "processed-text/documents/CancerGov_0000001_1.txt",
        "ground_truth_answer": (
            "The treatment includes remission induction therapy, consolidation therapy, and CNS prophylaxis, "
            "with chemotherapy and sometimes radiation."
        ),
        "relevance_mapping": {
            "processed-text/documents/CancerGov_0000001_1.txt": 3
        }
    },
    {
        "question": "Which genetic disorders are associated with a higher risk of childhood ALL?",
        "ground_truth_source": "processed-text/documents/CancerGov_0000001_6.txt",
        "ground_truth_answer": (
            "Genetic disorders like Down syndrome, neurofibromatosis type 1, Bloom syndrome, "
            "Fanconi anemia, ataxia-telangiectasia, and Li-Fraumeni syndrome are associated with higher risk."
        ),
        "relevance_mapping": {
            "processed-text/documents/CancerGov_0000001_6.txt": 3
        }
    },
    {
        "question": "What is the role of BRCA1 and BRCA2 in prostate cancer?",
        "ground_truth_source": "processed-text/documents/GHR_0000836.txt",
        "ground_truth_answer": (
            "BRCA1 and BRCA2 genes help repair damaged DNA. Mutations impair this function, increasing prostate cancer risk."
        ),
        "relevance_mapping": {
            "processed-text/documents/GHR_0000836.txt": 3
        }
    },
    {
        "question": "What causes Protein C Deficiency?",
        "ground_truth_source": "processed-text/documents/GHR_0000837.txt",
        "ground_truth_answer": (
            "Protein C deficiency is caused by mutations in the PROC gene, leading to reduced or altered protein C, affecting blood clot regulation."
        ),
        "relevance_mapping": {
            "processed-text/documents/GHR_0000837.txt": 3
        }
    }

]

# NOTE(review): these imports sit below the data literal, and `os` is not
# used anywhere in this cell.
import json
import os

# Save the test_set to a JSON file for later evaluation
output_path = "test_set.json"
with open(output_path, "w") as f:
    json.dump(test_set, f, indent=2)

# Bare expression: the notebook echoes the written path as the cell result.
output_path


'test_set.json'

In [8]:
import time
import json
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import faiss

# Paths and model names
# FAISS index file read by load_faiss(); the "384" in the file name is
# presumably the embedding dimension -- TODO confirm.
FAISS_INDEX_PATH = "faiss_doc_index_384.bin"
# JSON metadata file that load_faiss() reads alongside the index.
FAISS_METADATA_PATH = "faiss_doc_metadata.json"
# Test-set JSON that the driver code at the bottom of this cell loads.
TEST_SET_PATH = "test_set.json"
# Model id passed to AutoTokenizer / AutoModelForCausalLM in evaluate().
LLM_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# SentenceTransformer model used to embed queries in evaluate().
BI_ENCODER_LOCAL = "sentence-transformers/all-MiniLM-L6-v2"
# CrossEncoder model used to rerank retrieved chunks in evaluate().
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Load FAISS index and metadata
def load_faiss():
    """Read the FAISS index and its companion metadata from disk.

    Returns:
        tuple: ``(index, metadata)`` where ``index`` is what
        ``faiss.read_index`` returns for ``FAISS_INDEX_PATH`` and
        ``metadata`` is the decoded JSON content of ``FAISS_METADATA_PATH``.
    """
    vector_index = faiss.read_index(FAISS_INDEX_PATH)
    with open(FAISS_METADATA_PATH, "r", encoding="utf-8") as meta_file:
        return vector_index, json.load(meta_file)

# Embedding and reranking
def get_embedding(text, encoder):
    """Encode ``text`` with ``encoder`` and return a float32 array.

    Args:
        text: Input handed unchanged to ``encoder.encode``.
        encoder: Object exposing ``encode(text, convert_to_numpy=True)``.

    Returns:
        The encoder output cast to ``float32``. A 1-D result is promoted to
        a single-row 2-D array; any other rank is passed through as-is.
    """
    vector = encoder.encode(text, convert_to_numpy=True)
    # Only a flat vector gets a leading axis added.
    batch = vector[None, :] if vector.ndim == 1 else vector
    return batch.astype("float32")

def rerank_local(query, candidates, cross_encoder, top_n=None):
    """Score candidates against ``query`` with a cross-encoder and sort them.

    Args:
        query: Query string paired with each candidate's text.
        candidates: List of dicts, each with a ``"text"`` key. Every dict is
            annotated in place with a float ``"rerank_score"``.
        cross_encoder: Object exposing ``predict(pairs)`` that returns one
            score per ``[query, text]`` pair.
        top_n: If given, keep only the ``top_n`` best candidates; ``None``
            returns all of them.

    Returns:
        A new list of the candidate dicts ordered by descending
        ``rerank_score``. An empty list when there are no candidates.
    """
    # Nothing to score: skip the model call, which may not accept empty input.
    if not candidates:
        return []

    pairs = [[query, c["text"]] for c in candidates]
    scores = cross_encoder.predict(pairs)
    for c, score in zip(candidates, scores):
        c["rerank_score"] = float(score)

    sorted_hits = sorted(candidates, key=lambda x: x["rerank_score"], reverse=True)
    return sorted_hits if top_n is None else sorted_hits[:top_n]

def search_faiss(query, faiss_index, metadata, bi_encoder, cross_encoder, top_k=10, rerank_top_k=5):
    """Retrieve chunks for ``query`` from FAISS and rerank them.

    Args:
        query: Query string.
        faiss_index: Index exposing ``search(embeddings, k)``.
        metadata: Collection indexed by the ids the index returns; each item
            provides ``"source"``, ``"chunk_id"`` and ``"text"`` via ``.get``.
        bi_encoder: Encoder passed to ``get_embedding`` for the query.
        cross_encoder: Reranker passed to ``rerank_local``.
        top_k: Number of neighbours requested from the index.
        rerank_top_k: Number of results kept after reranking.

    Returns:
        List of result dicts (``rank``, ``score``, ``source``, ``chunk_id``,
        ``text``, plus the ``rerank_score`` added by ``rerank_local``),
        best first. Empty when the index yields no valid hit.
    """
    q_emb = get_embedding(query, bi_encoder)
    distances, indices = faiss_index.search(q_emb, top_k)

    seen = set()
    results = []
    for raw_idx, dist in zip(indices[0], distances[0]):
        idx = int(raw_idx)
        # FAISS pads missing neighbours with -1; indexing metadata with it
        # would silently pick the last entry instead of skipping the slot.
        if idx < 0:
            continue
        meta = metadata[idx]
        doc_id = meta.get("source")
        chunk_id = meta.get("chunk_id")
        key = (doc_id, chunk_id)
        # Keep only the first (closest) occurrence of each chunk.
        if key in seen:
            continue
        seen.add(key)
        results.append({
            "rank": len(results) + 1,
            "score": float(dist),
            "source": doc_id,
            "chunk_id": chunk_id,
            "text": meta.get("text")
        })

    # No valid hits: nothing to rerank.
    if not results:
        return []

    return rerank_local(query, results, cross_encoder, top_n=rerank_top_k)

# Answer generation
def answer(question, context, model, tokenizer):
    """Generate an answer to ``question`` from ``context`` with the LLM.

    A text-generation pipeline is built from ``model`` and ``tokenizer`` on
    every call, and the prompt is completed with sampling (up to 150 new
    tokens).

    Args:
        question: Question text inserted into the prompt.
        context: Retrieved context text inserted into the prompt.
        model: Model handed to ``pipeline``.
        tokenizer: Tokenizer handed to ``pipeline``.

    Returns:
        The stripped text following the first assistant marker in the
        generated output, or ``"Could not extract answer"`` when the marker
        is not found there.
    """
    assistant_marker = "<|assistant|>The answer is:"
    prompt = (
        "<|system|>You are a helpful and knowledgeable medical assistant.<|end|>\n"
        f"<|user|>Context:\n{context}\n\nQuestion: {question}<|end|>\n"
        + assistant_marker
    )

    sampling_args = {
        "max_new_tokens": 150,
        "temperature": 0.7,
        "do_sample": True,
        "top_p": 0.9,
        "top_k": 40,
    }
    text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    outputs = text_generator(prompt, **sampling_args)

    # The generated text is split on the marker to separate prompt from answer.
    pieces = outputs[0]["generated_text"].split(assistant_marker)
    if len(pieces) > 1:
        return pieces[1].strip()
    return "Could not extract answer"

# Evaluation metrics
def compute_mrr(results, ground_truth_source):
    """Return the reciprocal rank of the first matching result.

    Args:
        results: Ordered sequence of dicts with a ``"source"`` key.
        ground_truth_source: Source value to look for.

    Returns:
        ``1 / position`` (1-based) of the first result whose ``"source"``
        equals ``ground_truth_source``, or ``0`` if none matches.
    """
    return next(
        (
            1 / position
            for position, hit in enumerate(results, start=1)
            if hit["source"] == ground_truth_source
        ),
        0,
    )

def ndcg(relevances, k=5):
    dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevances[:k]))
    ideal = sorted(relevances, reverse=True)
    idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal[:k]))
    return dcg / idcg if idcg > 0 else 0

def compute_bleu(reference, candidate):
    """Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both strings are tokenized by whitespace splitting, and NLTK's
    ``method4`` smoothing is applied.

    Args:
        reference: Reference answer text.
        candidate: Generated answer text.

    Returns:
        The score returned by ``sentence_bleu``.
    """
    smoothing = SmoothingFunction().method4
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothing,
    )

# Run evaluation
def evaluate(test_set):
    """Run retrieval and generation over ``test_set`` and aggregate metrics.

    Loads the FAISS index, both encoders and the LLM, then for every test
    item retrieves and reranks chunks, generates an answer from the top three
    results (each truncated to 500 characters) and records latency, MRR,
    NDCG and BLEU. Per-question records are written to
    ``detailed_eval_log.json``.

    Args:
        test_set: Iterable of dicts with ``"question"``,
            ``"ground_truth_source"``, ``"ground_truth_answer"`` and
            ``"relevance_mapping"`` keys.

    Returns:
        dict with the mean of each metric under the keys
        ``"Average Latency"``, ``"MRR"``, ``"NDCG"`` and ``"BLEU"``.
    """
    faiss_index, metadata = load_faiss()
    bi_encoder = SentenceTransformer(BI_ENCODER_LOCAL)
    cross_encoder = CrossEncoder(CROSS_ENCODER_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID)

    latencies, mrr_scores, ndcg_scores, bleu_scores = [], [], [], []
    eval_logs = []

    for q in test_set:
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        results = search_faiss(q["question"], faiss_index, metadata, bi_encoder, cross_encoder)
        context = "\n\n".join([r["text"][:500] for r in results[:3]])
        generated = answer(q["question"], context, model, tokenizer)
        # Covers retrieval, reranking and generation for this question.
        latency = time.perf_counter() - start

        latencies.append(latency)
        mrr_scores.append(compute_mrr(results, q["ground_truth_source"]))
        # Sources missing from the mapping count as relevance 0.
        relevances = [q["relevance_mapping"].get(r["source"], 0) for r in results]
        ndcg_scores.append(ndcg(relevances))
        bleu_scores.append(compute_bleu(q["ground_truth_answer"], generated))

        eval_logs.append({
            "question": q["question"],
            "answer": generated,
            "reference": q["ground_truth_answer"],
            "latency": latency,
            "mrr": mrr_scores[-1],
            "ndcg": ndcg_scores[-1],
            "bleu": bleu_scores[-1]
        })

    summary = {
        "Average Latency": np.mean(latencies),
        "MRR": np.mean(mrr_scores),
        "NDCG": np.mean(ndcg_scores),
        "BLEU": np.mean(bleu_scores)
    }

    # Save detailed logs
    with open("detailed_eval_log.json", "w") as f:
        json.dump(eval_logs, f, indent=2)

    return summary

# Load test set and run evaluation
# Re-read the test set from disk; this rebinds the module-level `test_set`.
with open(TEST_SET_PATH, "r") as f:
    test_set = json.load(f)

# Runs the full evaluation; evaluate() also writes detailed_eval_log.json.
metrics_summary = evaluate(test_set)
print(metrics_summary)
# Persist the aggregate metrics as JSON.
metrics_summary_path = "metrics_summary.json"
with open(metrics_summary_path, "w") as f:
    json.dump(metrics_summary, f, indent=2)

# Bare expression so the notebook displays the written path as the cell result.
metrics_summary_path


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


{'Average Latency': np.float64(3.528817892074585), 'MRR': np.float64(0.7000000000000001), 'NDCG': np.float64(0.7257201317872851), 'BLEU': np.float64(0.04633737565877292)}


'metrics_summary.json'