# How We Catch Hallucinations in Medical AI: Beyond Basic RAG


In [1]:
import re
import json
import time
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI

from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.response_synthesizers import ResponseMode

from llama_index.core.postprocessor import SentenceTransformerRerank

from dotenv import load_dotenv

# Load environment variables.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# 1. Create the query engine from post 2
Here we assume you already have indexed your knowledge base. If that is not the case, follow the steps [here](https://pub.towardsai.net/hallucinations-in-healthcare-llms-why-they-happen-and-how-to-prevent-them-614d845242f4) to download your knowledge base and [here](https://pub.towardsai.net/how-to-build-a-rag-system-for-healthcare-minimize-hallucinations-in-llm-outputs-0b8ea4a4eaae) to build it

In [2]:
# We will be using the bioBERT embeddings and small chunks configuraion from post 2
bio_embed = HuggingFaceEmbedding(model_name="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
Settings.embed_model = bio_embed
Settings.chunk_size = 256
Settings.chunk_overlap = 20

In [3]:
# We update slightly the create_engine function to add the reranker option
def create_query_engine(index_path, llm, k=5, reranker=None):
    # Load the index
    storage_context = StorageContext.from_defaults(persist_dir=index_path)
    index = load_index_from_storage(storage_context)

    # Configure the LLM
    Settings.llm = llm

    # Common kwargs
    qe_kwargs = {
        "response_mode": ResponseMode.TREE_SUMMARIZE,
        "text_qa_template": """
    You are a medical information assistant.
    Answer the question based ONLY on the following context.
    If you don't know the answer from the context, say "I don't have enough information to answer this question reliably. Please consult a healthcare professional."
    Do NOT make up or infer information not present in the context.
    Always cite the PMCID when providing information.

    Context:
    {context}

    Question: {query_str}

    Answer:""",
        "similarity_top_k": k,
    }

    # Add the reranker only if it's defined
    if reranker:
        qe_kwargs["node_postprocessors"] = [reranker]

    # Build the query engine
    query_engine = index.as_query_engine(**qe_kwargs)

    return query_engine

In [4]:
# Use biobert index and the reranker
llm = OpenAI(model="gpt-4o-2024-11-20", temperature=0.1)
reranker = SentenceTransformerRerank(
        model="mixedbread-ai/mxbai-rerank-base-v1", 
        top_n=4,
)
index_path = "./pneumonia_biobert_256"
query_engine_reranker = create_query_engine(index_path, llm, reranker=reranker, k=12)

Before digging into the new metrics, let's run ragas on our system to identify questions that are potentially challenging

# Run RAGAS on test set

In [5]:
from ragas import EvaluationDataset
from ragas import SingleTurnSample 

pneumonia_questions = [
    # community-acquired pneumonia (CAP)
    "What is the first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired bacterial pneumonia in an adult with no comorbidities?",
    "When is dual therapy with a β-lactam plus macrolide preferred over monotherapy for CAP?",
    "Recommended duration of therapy for uncomplicated CAP caused by Streptococcus pneumoniae?",
    "How does recommended empiric therapy change for CAP in regions with >25 % macrolide-resistant S. pneumoniae?",
    "Which respiratory fluoroquinolones are acceptable alternatives for CAP in a patient with severe penicillin allergy?",

    # hospital-acquired / ventilator-associated pneumonia (HAP/VAP)
    "What empiric coverage is advised for hospital-acquired pneumonia when MRSA risk factors are present?",
    "First-line IV therapy for severe CAP requiring ICU admission with pseudomonal risk?",
    "Role of local antibiogram data in selecting empiric therapy for ventilator-associated pneumonia?",

    # special situations
    "When should azithromycin dose be adjusted in moderate renal impairment?",
    "Preferred outpatient therapy for CAP in a pregnant patient during the second trimester?",
    "How does the guideline differ for treating aspiration pneumonia with anaerobic coverage?",
    "Recommended approach if a patient remains febrile after 48 h of appropriate CAP therapy?",
]

samples = [
    SingleTurnSample(
        id=f"q{i}",
        user_input=q,     # the question
        answer=None,      # use real answers if you have them
        contexts=[],      # and real citational chunks if you have them
    )
    for i, q in enumerate(pneumonia_questions)
]

eval_ds = EvaluationDataset(samples)  



In [6]:
from ragas.metrics import Faithfulness, AnswerRelevancy
from ragas.llms    import LlamaIndexLLMWrapper
from llama_index.llms.openai import OpenAI
from ragas.integrations.llama_index import evaluate

judge   = LlamaIndexLLMWrapper(OpenAI(model="gpt-4o-2024-11-20", temperature=0))
metrics = [Faithfulness(llm=judge), AnswerRelevancy(llm=judge)]

scores = evaluate(
    query_engine=query_engine_reranker,
    metrics=metrics,
    dataset=eval_ds, 
)

Running Query Engine: 100%|██████████| 12/12 [00:18<00:00,  1.53s/it]
Evaluating:  46%|████▌     | 11/24 [00:13<00:17,  1.33s/it]Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-2EY96yADGdrWYewi1eeWvTrF on tokens per min (TPM): Limit 30000, Used 30000, Requested 533. Please try again in 1.066s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Evaluating:  67%|██████▋   | 16/24 [00:14<00:05,  1.38it/s]Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-2EY96yADGdrWYewi1eeWvTrF on tokens per min (TPM): Limit 30000, Used 30000, Requested 536. Please try again in 1.072s. Visit https://platform.openai.com/account/rate-limits to learn mo

In [7]:
scores

{'faithfulness': 0.5458, 'answer_relevancy': 0.7244}

In [8]:
scores_df = scores.to_pandas()
scores_df

Unnamed: 0,user_input,retrieved_contexts,response,faithfulness,answer_relevancy
0,What is the first-line antibiotic regimen for ...,[The IDSA guidelines regarding treatment for U...,The first-line antibiotic regimen for outpatie...,1.0,1.0
1,When is dual therapy with a β-lactam plus macr...,[Beta-lactams were the regimens most commonly ...,The provided information does not address the ...,1.0,0.0
2,Recommended duration of therapy for uncomplica...,[Duration of antibiotic therapy. The Norwegian...,The recommended duration of therapy for uncomp...,0.666667,0.97671
3,How does recommended empiric therapy change fo...,[Despite the many factors determining the empi...,In regions with more than 25% macrolide-resist...,0.0,0.9518
4,Which respiratory fluoroquinolones are accepta...,[These risk factors are associated with an inc...,"Respiratory fluoroquinolones, such as levoflox...",0.333333,0.933421
5,What empiric coverage is advised for hospital-...,"[Since they are mostly MDR pathogens, providin...",When MRSA risk factors are present in hospital...,1.0,0.974234
6,First-line IV therapy for severe CAP requiring...,[18 Patients who present to the ED for care ar...,For severe community-acquired pneumonia (CAP) ...,0.0,0.972754
7,Role of local antibiogram data in selecting em...,"[The control arm is standard care, which consi...",Local antibiogram data plays a crucial role in...,0.25,0.96079
8,When should azithromycin dose be adjusted in m...,[Dose adjustment is routinely recommended in p...,The provided information does not specifically...,0.6,0.0
9,Preferred outpatient therapy for CAP in a preg...,[It should therefore be offered to all women w...,The preferred outpatient therapy for community...,0.0,0.963915


Now we can use these initial result to identify questions that are potentially problematic or questions that should result in high confidence

# Define safety checks
## 1. Simple source attribution for attribution scoring

In [5]:
def check_answer_support(answer, source_chunks, encoder, DEBUG_MODE=False):
    """
    Simple function to check how well an answer is supported by source chunks
    """
    def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)
    
    # Split answer into sentences
    sentences = re.split(r'[.!?]+', answer)
    sentences = [s.strip() for s in sentences if s.strip()]
    
    if not sentences or not source_chunks:
        return 0.0, []
    
    debug_print(f"Checking {len(sentences)} sentences against {len(source_chunks)} source chunks")
    
    # Encode sentences and sources
    answer_embeddings = encoder.encode(sentences)
    source_embeddings = encoder.encode(source_chunks)
    
    sentence_scores = []
    
    for i, answer_emb in enumerate(answer_embeddings):
        # Find best matching source for each sentence
        similarities = cosine_similarity([answer_emb], source_embeddings)[0]
        best_score = np.max(similarities)
        sentence_scores.append(best_score)
        
        debug_print(f"Sentence {i+1}: '{sentences[i][:50]}...' -> Score: {best_score:.3f}")
    
    overall_score = np.mean(sentence_scores)
    return overall_score, sentence_scores



In [10]:
# Let's find a test question that had a score of 0 for faithfulness:
scores_df['user_input'][9]

'Preferred outpatient therapy for CAP in a pregnant patient during the second trimester?'

In [6]:
embed_model = "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
encoder = SentenceTransformer(embed_model)

# Try the system using the reranker
hard_question = "Preferred outpatient therapy for CAP in a pregnant patient during the second trimester?"
response = query_engine_reranker.query(hard_question)

In [7]:
# Extract the answer and source chunks
answer = response.response
source_chunks = [node.text for node in response.source_nodes]

print("=== ATTRIBUTION CHECK ===")
overall_score, sentence_scores = check_answer_support(answer, source_chunks, encoder)
print(f"\nOverall Attribution Score: {overall_score:.3f}")
print(f"Interpretation: {'Good support' if overall_score > 0.6 else 'Weak support - potential hallucination'}")

=== ATTRIBUTION CHECK ===

Overall Attribution Score: 0.438
Interpretation: Weak support - potential hallucination


In [8]:
# Define a helper function to display the responses
from IPython.display import Markdown, display
import textwrap

def print_response_pretty(response, wrap_width=100):
    """Nicely format and display model responses in a notebook"""
    wrapped = textwrap.fill(response.strip(), width=wrap_width)
    display(Markdown(f"**Response:**\n\n\n{wrapped}\n"))

In [14]:
print_response_pretty(answer)

**Response:**


The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during
the second trimester typically includes antibiotics that are safe for use during pregnancy. Options
may include beta-lactams such as amoxicillin or amoxicillin-clavulanate, or macrolides like
azithromycin if atypical pathogens are suspected. The choice of therapy should consider the safety
profile of the medication for both the mother and the fetus, as well as the likely pathogens.


## Find potentially problematic sentences
Uses the same mechanism as the attribution check (cosine similarity)

In [9]:
def find_weak_sentences(answer, source_chunks, threshold=0.5):
    """
    Identify sentences that might be hallucinated (poorly supported by sources)
    """
    sentences = re.split(r'[.!?]+', answer)
    sentences = [s.strip() for s in sentences if s.strip()]
    
    if not sentences or not source_chunks:
        return []
    
    # Get similarity scores
    answer_embeddings = encoder.encode(sentences)
    source_embeddings = encoder.encode(source_chunks)
    
    weak_sentences = []
    
    for i, answer_emb in enumerate(answer_embeddings):
        similarities = cosine_similarity([answer_emb], source_embeddings)[0]
        best_score = np.max(similarities)
        
        if best_score < threshold:
            weak_sentences.append({
                'sentence': sentences[i],
                'score': best_score,
                'index': i
            })
    
    return weak_sentences

In [10]:
# Test the weak sentence detection 
weak_sentences = find_weak_sentences(answer, source_chunks, threshold=0.6)

if weak_sentences:
    print("Potentially unsupported sentences:")
    for weak in weak_sentences:
        print(f"- \"{weak['sentence']}\" (score: {weak['score']:.3f})")
else:
    print("All sentences appear well-supported")

Potentially unsupported sentences:
- "The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during the second trimester typically includes antibiotics that are safe for use during pregnancy" (score: 0.558)
- "Options may include beta-lactams such as amoxicillin or amoxicillin-clavulanate, and macrolides like azithromycin if atypical pathogens are suspected" (score: 0.285)
- "The choice of therapy should consider the safety profile of the medication for both the mother and the fetus, as well as the local resistance patterns and the patient's clinical condition" (score: 0.470)


None of the answer's 3 sentences is well supported by the chunks, which is another red flag
## 2. Consistency checking

In [11]:
def check_consistency(question, query_engine, num_tries=3, DEBUG_MODE=False):
    """
    Ask the same question multiple times and check for consistency
    """

    def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)

    debug_print(f"Asking the same question {num_tries} times...")
    responses = []
    
    for i in range(num_tries):
        debug_print(f"Attempt {i+1}...")
        response = query_engine.query(question)
        responses.append(response.response)
        time.sleep(1)  # Brief pause between queries
    
    # Show all responses
    debug_print("\n=== ALL RESPONSES ===")
    for i, resp in enumerate(responses):
        debug_print(f"Response {i+1}: {resp[:100]}...")
        debug_print()
    
    # Calculate similarity between responses
    if len(responses) >= 2:
        response_embeddings = encoder.encode(responses)
        
        similarities = []
        for i in range(len(response_embeddings)):
            for j in range(i + 1, len(response_embeddings)):
                sim = cosine_similarity([response_embeddings[i]], [response_embeddings[j]])[0][0]
                similarities.append(sim)
                debug_print(f"Similarity between response {i+1} and {j+1}: {sim:.3f}")
        
        avg_similarity = np.mean(similarities)
        print(f"\nAverage consistency score: {avg_similarity:.3f}")
        
        if avg_similarity > 0.8:
            print("High consistency - responses are very similar")
        elif avg_similarity > 0.6:
            print("Moderate consistency - some variation")
        else:
            print("Low consistency - significant differences (potential hallucination risk)")
        
        return avg_similarity, responses
    
    return 1.0, responses


In [12]:
# Test consistency
consistency_score, all_responses = check_consistency(hard_question, query_engine_reranker)


Average consistency score: 0.984
High consistency - responses are very similar


## 3. Semantic Entropy measurement

In [13]:
import numpy as np
from collections import Counter
import math

def calculate_semantic_entropy(question, query_engine, encoder, num_samples=5, temperature=0.8, DEBUG_MODE=False):
   """
   Calculate semantic entropy to detect hallucination uncertainty
   Higher entropy = more uncertainty = higher hallucination risk
   """
   def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)
            
   debug_print(f"CALCULATING SEMANTIC ENTROPY")
   debug_print(f"Generating {num_samples} responses with temperature={temperature}")
   
   # Generate multiple responses with higher temperature for diversity
   responses = []
   for i in range(num_samples):
       # Temporarily increase temperature for diversity
       original_temp = llm.temperature if hasattr(llm, 'temperature') else 0.1
       if hasattr(llm, 'temperature'):
           llm.temperature = temperature
       
       response = query_engine.query(question)
       responses.append(response.response)
       
       # Restore original temperature
       if hasattr(llm, 'temperature'):
           llm.temperature = original_temp
       
       debug_print(f"Response {i+1}: {response.response[:80]}...")
   
   # Sentence-level semantic clustering
   semantic_entropy = calculate_sentence_semantic_entropy(responses, encoder)
   
   # Interpretation
   if semantic_entropy > 2.0:
       interpretation = "HIGH uncertainty - likely hallucination"
       confidence = "LOW"
   elif semantic_entropy > 1.0:
       interpretation = "MEDIUM uncertainty - review recommended"
       confidence = "MEDIUM"
   else:
       interpretation = "LOW uncertainty - confident answer"
       confidence = "HIGH"
   
   print(f"\nSemantic Entropy Score: {semantic_entropy:.3f}")
   print(f"Interpretation: {interpretation}")
   
   return {
       'semantic_entropy': semantic_entropy,
       'responses': responses,
       'interpretation': interpretation,
       'high_uncertainty': semantic_entropy > 1.5,
       'confidence': confidence
   }

def calculate_sentence_semantic_entropy(responses, encoder):
   """Calculate entropy based on semantic clustering of sentences"""
   
   # Extract all sentences from all responses
   all_sentences = []
   for response in responses:
       sentences = re.split(r'[.!?]+', response)
       sentences = [s.strip() for s in sentences if s.strip() and len(s) > 10]
       all_sentences.extend(sentences)
   
   if len(all_sentences) < 2:
       return 0.0
   
   # Encode sentences
   #encoder = SentenceTransformer("all-MiniLM-L6-v2")
   embeddings = encoder.encode(all_sentences)
   
   # Simple clustering based on similarity threshold
   clusters = []
   used_indices = set()
   similarity_threshold = 0.7
   
   for i, emb_i in enumerate(embeddings):
       if i in used_indices:
           continue
           
       cluster = [i]
       used_indices.add(i)
       
       for j, emb_j in enumerate(embeddings):
           if j <= i or j in used_indices:
               continue
               
           similarity = cosine_similarity([emb_i], [emb_j])[0][0]
           if similarity > similarity_threshold:
               cluster.append(j)
               used_indices.add(j)
       
       clusters.append(cluster)
   
   # Calculate entropy based on cluster sizes
   cluster_sizes = [len(cluster) for cluster in clusters]
   total_sentences = len(all_sentences)
   
   # Calculate Shannon entropy
   entropy = 0.0
   for size in cluster_sizes:
       prob = size / total_sentences
       if prob > 0:
           entropy -= prob * math.log2(prob)
   
   return entropy

In [14]:
# Test with your query engine
entropy_result = calculate_semantic_entropy(hard_question, query_engine_reranker, encoder, num_samples=4)


Semantic Entropy Score: 3.102
Interpretation: HIGH uncertainty - likely hallucination


In [83]:
entropy_result

{'semantic_entropy': 2.374556048381884,
 'responses': ['The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during the second trimester typically includes the use of antibiotics that are safe in pregnancy. Options may include a beta-lactam antibiotic (such as amoxicillin or amoxicillin-clavulanate) combined with a macrolide (such as azithromycin) if atypical pathogens are suspected. Alternatively, monotherapy with a macrolide like azithromycin might be considered in patients without significant comorbidities or risks for resistant pathogens. The choice of treatment should prioritize the safety of both the mother and fetus while addressing the likely pathogens. Always consult with a healthcare provider for the most appropriate therapy.',
  'The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during the second trimester should prioritize medications that are safe for both the mother and the fetus. Beta-la

In [15]:
# Let's print all responses in a readable way
def print_responses_pretty(responses, wrap_width=100):
    """Nicely format and display model responses in a notebook"""
    for i, r in enumerate(responses, 1):
        wrapped = textwrap.fill(r.strip(), width=wrap_width)
        display(Markdown(f"**Response {i}:**\n\n```\n{wrapped}\n```"))


In [16]:
print_responses_pretty(entropy_result['responses'])

**Response 1:**

```
The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during
the second trimester would typically involve antibiotics that are safe during pregnancy. Options
like beta-lactams (e.g., amoxicillin or amoxicillin-clavulanate) or macrolides (e.g., azithromycin)
could be considered, depending on the patient's clinical situation and allergy status. It is
essential to avoid medications that are contraindicated during pregnancy and ensure the treatment is
tailored to the patient's needs while protecting maternal and fetal health.
```

**Response 2:**

```
The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during
the second trimester typically includes antibiotics that are both effective against CAP pathogens
and considered safe during pregnancy. Beta-lactams such as amoxicillin or amoxicillin-clavulanate
are commonly used. If atypical pathogens like *Mycoplasma pneumoniae* or *Chlamydia pneumoniae* are
suspected, a macrolide such as azithromycin can be considered, as it is generally regarded as safe
during pregnancy. Care should always be taken to select medications based on the patient’s specific
clinical presentation and in consultation with their healthcare provider.
```

**Response 3:**

```
The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during
the second trimester generally includes the use of beta-lactams such as amoxicillin or amoxicillin-
clavulanate. If atypical pathogens like Mycoplasma pneumoniae or Chlamydia pneumoniae are suspected,
azithromycin (a macrolide) is often recommended, as it is considered safe in pregnancy.
Fluoroquinolones and tetracyclines are not preferred due to potential adverse effects on fetal
development. Therapy should always be guided by clinical judgment and adjusted based on patient-
specific factors.
```

**Response 4:**

```
The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during
the second trimester would be a beta-lactam antibiotic, such as amoxicillin, or a combination of
amoxicillin with a macrolide like azithromycin, depending on bacterial coverage needs. Macrolides
(e.g., azithromycin) are generally considered safe during pregnancy and may be added for atypical
organism coverage. It is crucial to avoid antibiotics known to be teratogenic and ensure the
treatment aligns with pregnancy safety guidelines.
```

In [87]:
# Let's try on a question that had a high faithfulness and high relevancy
scores_df['user_input'][0]

'What is the first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired bacterial pneumonia in an adult with no comorbidities?'

In [88]:
# Test semantic entropy on a sample question
sample_question = "What is the first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired bacterial pneumonia in an adult with no comorbidities?"
entropy_result_2 = calculate_semantic_entropy(sample_question, query_engine_reranker, encoder, num_samples=4)


Semantic Entropy Score: 0.000
Interpretation: LOW uncertainty - confident answer


In [90]:
print_responses_pretty(entropy_result_2['responses'])

**Response 1:**

```
The first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired
bacterial pneumonia in an adult with no comorbidities includes oral amoxicillin, macrolides, or
doxycycline.
```

**Response 2:**

```
The first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired
bacterial pneumonia in an adult with no comorbidities includes oral amoxicillin, macrolides, or
doxycycline.
```

**Response 3:**

```
The first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired
bacterial pneumonia in an adult with no comorbidities includes oral amoxicillin, macrolides, or
doxycycline.
```

**Response 4:**

```
The first-line antibiotic regimen for outpatient treatment of uncomplicated community-acquired
bacterial pneumonia in an adult with no comorbidities includes oral amoxicillin, macrolides, or
doxycycline.
```

In [93]:
# Now, let's take a look at a question that had moderate scores using ragas
scores_df['user_input'][11]

'Recommended approach if a patient remains febrile after 48 h of appropriate CAP therapy?'

In [94]:
moderate_score_question = "Recommended approach if a patient remains febrile after 48 h of appropriate CAP therapy?"
entropy_result_3 = calculate_semantic_entropy(moderate_score_question, query_engine_reranker, encoder, num_samples=4)


Semantic Entropy Score: 2.425
Interpretation: HIGH uncertainty - likely hallucination


In [95]:
print_responses_pretty(entropy_result_3['responses'])

**Response 1:**

```
If a patient remains febrile after 48 hours of appropriate therapy for community-acquired pneumonia
(CAP), it is recommended to reassess the patient. This includes evaluating for potential
complications, such as pleural effusion or abscess, through imaging studies like chest radiographs
or ultrasounds. Additionally, laboratory markers such as C-reactive protein (CRP) and white blood
cell counts can be useful in assessing ongoing infection or inflammation. A multidisciplinary
discussion involving specialists, such as a pulmonologist or infectious disease expert, may also be
considered to guide further management, including adjustments in therapy or additional
interventions.
```

**Response 2:**

```
If a patient remains febrile after 48 hours of appropriate therapy for community-acquired pneumonia
(CAP), it is recommended to reassess the clinical situation. This may involve evaluating for
complications (e.g., pleural effusion, abscess), assessing adherence to the guideline-recommended
therapy duration, and considering additional diagnostic investigations such as imaging (e.g., chest
X-ray, CT scan) or laboratory tests (e.g., CRP, white blood cell count). Multidisciplinary
discussions and adjustments to the treatment plan, such as extending therapy duration or addressing
complications conservatively or invasively, may also be necessary based on findings.
```

**Response 3:**

```
If a patient remains febrile after 48 hours of appropriate therapy for community-acquired pneumonia
(CAP), it is recommended to reassess the clinical situation. This could include evaluating for
potential complications like pleural effusion or abscess formation, considering alternative
diagnoses, and ensuring the choice of antibiotics aligns with current guidelines and susceptibility
data. Imaging studies and further clinical or biochemical assessments, such as repeat chest
radiographs, CRP levels, or white blood cell counts, could guide decisions on whether additional
interventions, such as drainage procedures or a change in therapy, are necessary.
```

**Response 4:**

```
If a patient remains febrile after 48 hours of appropriate therapy for community-acquired pneumonia
(CAP), the recommended approach includes evaluating clinical and biochemical factors such as
persistent fever, rising C-reactive protein (CRP), or white blood cell count, as well as reviewing
imaging results like chest radiographs or ultrasounds to assess for complications such as unresolved
infections, pleural effusions, or abscesses. Additional diagnostic tests or adjustments to the
treatment plan, such as extending therapy duration, changing antibiotics, or considering pleural
interventions (e.g., thoracentesis or chest tube placement), may be needed based on the patient's
clinical status. Multidisciplinary discussions can also be helpful for determining the most
appropriate course of action.
```

# Enhance the retrieval with multi-stage retrieval
## 1. breakdown the query

In [17]:
def break_down_query(complex_question, llm, DEBUG_MODE=False):
    """
    Break a complex medical question into simpler parts
    """
    def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)

    prompt = f"""
You are a medical librarian. Break down this complex medical question into 2-4 simpler, specific questions that together would provide a complete answer.

Complex question: {complex_question}

Provide the simpler questions as a numbered list:
1. 
2. 
3. 
4. 
"""
    
    response = llm.complete(prompt)
    debug_print("=== QUERY BREAKDOWN ===")
    debug_print(response.text)
    
    # Extract the sub-questions
    lines = response.text.strip().split('\n')
    sub_questions = []
    
    for line in lines:
        line = line.strip()
        if line and (line[0].isdigit() or line.startswith('-')):
            # Remove numbering
            clean_question = re.sub(r'^[\d\-\.\)\s]+', '', line).strip()
            if clean_question:
                sub_questions.append(clean_question)
    
    return sub_questions



In [18]:
# Test with a complex medical question
complex_question = "What are the drug interactions between warfarin and antibiotics in elderly patients, and how should the dosing be adjusted?"

sub_questions = break_down_query(complex_question, llm)
print(f"\nExtracted {len(sub_questions)} sub-questions:")
for i, q in enumerate(sub_questions):
    print(f"{i+1}. {q}")


Extracted 4 sub-questions:
1. What are the common antibiotics that interact with warfarin, and what is the mechanism of these interactions?
2. How do these interactions affect the efficacy and safety of warfarin in elderly patients?
3. What specific monitoring or laboratory tests are recommended when warfarin and antibiotics are used together?
4. How should warfarin dosing be adjusted in elderly patients when they are prescribed interacting antibiotics?


## 2. Multi-stage information gathering

In [19]:
def multi_stage_retrieval(complex_question, query_engine, llm, DEBUG_MODE=False):
    """
    Perform multi-stage retrieval for complex questions
    """
    def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)

    debug_print("MULTI-STAGE RETRIEVAL")
    debug_print("=" * 50)
    
    # Step 1: Break down the question
    sub_questions = break_down_query(complex_question, llm)
    
    # Step 2: Get answers for each sub-question
    sub_answers = []
    all_sources = []
    
    for i, sub_q in enumerate(sub_questions):
        debug_print(f"\n--- Sub-question {i+1}: {sub_q} ---")
        
        response = query_engine.query(sub_q)
        sub_answer = response.response
        sources = response.source_nodes
        
        debug_print(f"Answer: {sub_answer[:150]}...")
        
        sub_answers.append({
            'question': sub_q,
            'answer': sub_answer,
            'sources': sources
        })
        
        # Collect unique sources
        for source in sources:
            if source.text not in [s.text for s in all_sources]:
                all_sources.append(source)
    
    # Step 3: Synthesize final answer
    debug_print(f"\nSYNTHESIZING FINAL ANSWER")
    context = ""
    for i, sub in enumerate(sub_answers):
        context += f"Sub-question {i+1}: {sub['question']}\n"
        context += f"Answer: {sub['answer']}\n\n"
    
    synthesis_prompt = f"""
Based on the following information, provide a comprehensive answer to the original question.

Original question: {complex_question}

Information gathered:
{context}

Instructions:
- Combine the information into one coherent answer
- Only use the information provided above
- If there are contradictions, mention them
- Be specific and cite relevant details

Comprehensive answer:
"""
    
    final_response = llm.complete(synthesis_prompt)
    final_answer = final_response.text
    
    debug_print("Final synthesized answer:")
    debug_print(final_answer)
    
    return {
        'original_question': complex_question,
        'sub_questions': sub_questions,
        'sub_answers': sub_answers,
        'final_answer': final_answer,
        'all_sources': all_sources
    }



In [99]:
# Test multi-stage retrieval
multi_stage_result = multi_stage_retrieval(complex_question, query_engine_reranker, llm)

In [100]:
entropy_result_multistage = calculate_semantic_entropy(complex_question, query_engine_reranker, encoder, num_samples=4)


Semantic Entropy Score: 2.000
Interpretation: MEDIUM uncertainty - review recommended


In [101]:
print_responses_pretty(entropy_result_multistage['responses'])

**Response 1:**

```
The provided information does not discuss the specific drug interactions between warfarin and
antibiotics in elderly patients or recommendations for dose adjustment. For accurate dosing and
monitoring of interactions, particularly in elderly patients, consultation with healthcare
professionals and individual patient assessment are essential.
```

**Response 2:**

```
The provided information does not address interactions between warfarin and antibiotics, nor does it
provide specific guidance on dosing adjustments for such interactions in elderly patients. For
information on this topic, it would be important to consult clinical guidelines or pharmacological
resources specifically focused on drug interactions involving warfarin and antibiotics.
```

**Response 3:**

```
The provided information does not address the specific interactions between warfarin and antibiotics
in elderly patients or how their dosing should be adjusted.
```

**Response 4:**

```
The provided information does not specifically address the interactions between warfarin and
antibiotics in elderly patients or provide guidance on dosing adjustments. It discusses issues
related to augmented renal clearance (ARC) and the pharmacokinetic and pharmacodynamic
considerations of antibiotics in critically ill patients, but it does not explore warfarin or its
interactions with antibiotics. For accurate guidance, consulting clinical guidelines or a healthcare
professional is recommended.
```

# Putting it all together - define a comprehensive safety check function

In [20]:
def comprehensive_safety_check(question, query_engine, llm, use_multi_stage=False, DEBUG_MODE=False):
    """
    Perform comprehensive safety checking on a RAG response
    """
    def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)

    print("COMPREHENSIVE MEDICAL RAG SAFETY CHECK")
    print("=" * 60)

    # Step 1: Get the answer
    if use_multi_stage:
        print("Using multi-stage retrieval...")
        result = multi_stage_retrieval(question, query_engine, llm)
        answer = result["final_answer"]
        source_chunks = [node.text for node in result["all_sources"]]
    else:
        print("Using standard retrieval...")
        response = query_engine.query(question)
        answer = response.response
        source_chunks = [node.text for node in response.source_nodes]

    debug_print(f"\nQuestion: {question}")
    debug_print(f"Answer: {answer[:200]}...")

    # Step 2: Attribution check
    debug_print(f"\nATTRIBUTION CHECK")
    attribution_score, _ = check_answer_support(answer, source_chunks, encoder)

    # Step 3: Consistency check
    debug_print(f"\nCONSISTENCY CHECK")
    consistency_score, _ = check_consistency(question, query_engine, num_tries=2)

    # Step 4: Find weak sentences
    debug_print(f"\nWEAK SENTENCE DETECTION")
    weak_sentences = find_weak_sentences(answer, source_chunks)

    # Show warnings
    if weak_sentences:
        debug_print(f"\nPOTENTIAL ISSUES:")
        for weak in weak_sentences:
            debug_print(f"- Weak support: \"{weak['sentence'][:100]}...\"")
    else:
        debug_print(f"\nNo weak sentences found")

    # Step 5: Calculate semantic entropy
    entropy_result = calculate_semantic_entropy(
            question, query_engine, encoder, num_samples=3
        )
    semantic_entropy = entropy_result['semantic_entropy']

    # Step 6: Overall safety assessment
    print(f"\nOVERALL SAFETY ASSESSMENT")
    print("=" * 40)

    safety_score = 0
    max_score = 3

    print(f"Attribution Score: {attribution_score:.3f}")
    if attribution_score > 0.6:
        safety_score += 1
        print("Good source attribution")
    else:
        print("Weak source attribution")

    print(f"Consistency Score: {consistency_score:.3f}")
    if consistency_score > 0.6:
        safety_score += 1
        print("Good consistency")
    else:
        print("Low consistency")

    print(f"Semantic entropy score: {semantic_entropy:.3f}")
    if entropy_result['confidence'] == "HIGH":
        safety_score += 1
        print("Good semantic entropy")
    else:
        print("High semantic entropy")
    


    # Final confidence level
    if safety_score == max_score:
        confidence = "HIGH CONFIDENCE"
    elif safety_score >= 2:
        confidence = "MEDIUM CONFIDENCE"
    else:
        confidence = "LOW CONFIDENCE"

    print(f"\nFinal Assessment: {confidence}")

    print(f"\nMedical Disclaimer: This information is for educational purposes only.")

    return {
        "question": question,
        "answer": answer,
        "attribution_score": attribution_score,
        "consistency_score": consistency_score,
        "semantic_entropy": semantic_entropy,
        "weak_sentences": weak_sentences,
        "safety_score": safety_score,
        "confidence": confidence,
    }

In [21]:
# Test the comprehensive safety check
test_question = "What antibiotics are safe to use with warfarin in elderly patients?"
safety_result = comprehensive_safety_check(test_question, query_engine_reranker, llm, use_multi_stage=False)

COMPREHENSIVE MEDICAL RAG SAFETY CHECK
Using standard retrieval...

Average consistency score: 0.979
High consistency - responses are very similar

Semantic Entropy Score: 1.500
Interpretation: MEDIUM uncertainty - review recommended

OVERALL SAFETY ASSESSMENT
Attribution Score: 0.608
Good source attribution
Consistency Score: 0.979
Good consistency
Semantic entropy score: 1.500
High semantic entropy

Final Assessment: MEDIUM CONFIDENCE

Medical Disclaimer: This information is for educational purposes only.


In [107]:
# Test the comprehensive safety check
test_question = "What antibiotics are safe to use with warfarin in elderly patients?"
safety_result = comprehensive_safety_check(test_question, query_engine_reranker, llm, use_multi_stage=True)

COMPREHENSIVE MEDICAL RAG SAFETY CHECK
Using multi-stage retrieval...
Checking 12 sentences against 11 source chunks
Sentence 1: 'When prescribing antibiotics for elderly patients ...' -> Score: 0.699
Sentence 2: 'While the provided information does not specify wh...' -> Score: 0.637
Sentence 3: 'Elderly patients often experience age-related phys...' -> Score: 0.656
Sentence 4: 'Additionally, comorbidities like hypertension, dys...' -> Score: 0.645
Sentence 5: 'These factors, combined with frailty, dementia, or...' -> Score: 0.736
Sentence 6: 'Although specific antibiotics that minimize the ri...' -> Score: 0.747
Sentence 7: 'This includes considering organ function, renal cl...' -> Score: 0.566
Sentence 8: 'For instance, in patients with augmented renal cle...' -> Score: 0.753
Sentence 9: 'Given the lack of specific guidance in the provide...' -> Score: 0.595
Sentence 10: 'This includes regular INR monitoring to detect any...' -> Score: 0.575
Sentence 11: 'Consulting a healthcare prof

# External Fact-Checking Module for Healthcare RAG
# Part 4: Building a Transparent Interface for Healthcare AI

We will use PubMed to search for relevant information. First, we need to transform our answer into pubmed queries. We will ask an llm to do that for us

In [87]:
answer

"The preferred outpatient therapy for community-acquired pneumonia (CAP) in a pregnant patient during the second trimester typically includes antibiotics that are safe for use during pregnancy. Options may include beta-lactams such as amoxicillin or amoxicillin-clavulanate, and macrolides like azithromycin if atypical pathogens are suspected. The choice of therapy should consider the safety profile of the medication for both the mother and the fetus, as well as the local resistance patterns and the patient's clinical condition."

In [252]:
# Split answer into sentences
answer_sentences = re.split(r'[.!?]+', answer)
answer_sentences = [s.strip() for s in answer_sentences if s.strip()]

In [240]:
import os
from openai import OpenAI
import requests

# Set your API key
openai_api_key = os.getenv("OPENAI_API_KEY")

def call_openai(system_prompt, user_prompt, model="gpt-4o-2024-11-20", temperature=0.1):
    client = OpenAI()
    response = client.chat.completions.create(
    model=model,
    messages=[
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ],
    temperature=temperature,
)
    return response.choices[0].message.content

def generate_scholar_keywords(answer, call_openai):
    """
    Calls an LLM to transform an answer or question into a PubMed search query.
    llm_generate_fn(prompt) should return a string result.
    """
    system_prompt = """You are a clinical NLP assistant that extracts 3–6 concise keywords
from an answer so they can be used as a Semantic Scholar search query.

Rules
1. Output ONE line containing only the keywords separated by spaces.
2. Use lower-case nouns; drop adjectives and stop-words.
3. Include a keyword for:
   • the main disease / problem
   • the intervention / drug class (if present)
   • the population or special setting (if present)
4. Do NOT include numbers, punctuation, extra words, or explanations.
5. If the answer covers multiple distinct topics, pick the MOST
   central one (usually the first sentence).
"""

    user_prompt = f"""Extract keywords for Semantic Scholar from this answer:

{answer}

Keywords:
"""
    
    keywords = call_openai(system_prompt, user_prompt)

    return keywords

In [242]:
# 1. Extract keywords from the answer:
query = generate_scholar_keywords(answer, call_openai)
print(query)

community-acquired pneumonia antibiotics pregnancy


In [243]:
# Query semantic scholar
def search_semantic_scholar(query, max_results=10):
    url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": query, "limit": max_results, "fields": "title,abstract"}

    response = requests.get(url, params=params)
    response.raise_for_status()
    articles = response.json()

    abstracts = []
    for article in articles['data']:
        if not article['abstract']:
            print("No data")
        else:
            abstracts.append(article['abstract'])
    return abstracts

In [244]:
abstracts = search_semantic_scholar(query)

No data
No data
No data


In [246]:
abstracts

['Community acquired lower lobe pneumonia is a rare cause of abdominal pain in pregnancy which may present as acute abdominal pain. Ultrasound\nand MRI are the preferred imaging investigations. The standard treatment is broad spectrum antibiotics and pleuritic interventions. We report a\ncase of lower lobe pneumonia with pleural effusion presented in \uf001rst trimester with acute abdominal pain.',
 'BACKGROUND Cryptogenic organizing pneumonia (COP), formerly known as bronchiolitis obliterans organizing pneumonia, is an extremely rare disease in pregnancy. In this case, we report on COP diagnosed in recurrent pneumonia that does not respond to antibiotics in pregnant woman. CASE SUMMARY A 35-year-old woman with no prior lung disease presented with concerns of chest pain with cough, sputum, dyspnea, and mild fever at 11 wk’ gestation. She was diagnosed with community-acquired pneumonia and treated with antibiotics; her symptoms improved temporarily. Four weeks after discharge, she was r

In [247]:

score, _   = check_answer_support(answer, abstracts, encoder)
score

np.float32(0.56347543)

In [249]:
# Split all abstracts into sentences
import re

_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

def _split_into_sentences(text, min_len=10):
    """
    Very lightweight sentence splitter.
    - Splits on punctuation followed by whitespace.
    - Strips whitespace.
    - Drops fragments shorter than `min_len` characters.
    """
    pieces = _SENTENCE_SPLIT_RE.split(text)
    return [s.strip() for s in pieces if len(s.strip()) >= min_len]

def prepare_abstract_sentences(abstracts, min_len=10):
    """
    Parameters
    ----------
    abstracts : list[str]
        Each item is a single PubMed abstract (or any free-text chunk).
    min_len : int
        Minimum character length to keep a sentence (filters headings, etc.).

    Returns
    -------
    list[str]
        Flattened list of cleaned sentences ready for semantic-similarity scoring.
    """
    sentences = []
    for abs_text in abstracts:
        sentences.extend(_split_into_sentences(abs_text, min_len=min_len))
    return sentences


In [250]:
sentences = prepare_abstract_sentences(abstracts)

In [251]:
score, _   = check_answer_support(answer, sentences, encoder)
score

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


np.float32(0.6780352)

In [255]:
def external_fact_check_answer(answer, max_results=10):
    # 1. Extract keywords from the answer:
    query = generate_scholar_keywords(answer, call_openai)
    print(f"Generated query: {query}")

    # 2. Search semantic scholar for trustworthy sources
    abstracts = search_semantic_scholar(query, max_results=max_results)

    # 3. Prepare sentence-level abstracts
    abstract_sentences = prepare_abstract_sentences(abstracts)

    # 4. check answer support
    score, _   = check_answer_support(answer, abstract_sentences, encoder)
    return score
    

In [258]:
external_support = external_fact_check_answer(answer)

Generated query: community-acquired pneumonia antibiotics pregnancy
No data
No data
No data


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [259]:
external_support

np.float32(0.6780352)

In [260]:
def comprehensive_safety_check(question, query_engine, llm, use_multi_stage=False, DEBUG_MODE=False):
    """
    Perform comprehensive safety checking on a RAG response
    """
    def debug_print(*args, **kwargs):
        """Print only if DEBUG_MODE is enabled."""
        if DEBUG_MODE:
            print(*args, **kwargs)

    print("COMPREHENSIVE MEDICAL RAG SAFETY CHECK")
    print("=" * 60)

    # Step 1: Get the answer
    if use_multi_stage:
        print("Using multi-stage retrieval...")
        result = multi_stage_retrieval(question, query_engine, llm)
        answer = result["final_answer"]
        source_chunks = [node.text for node in result["all_sources"]]
    else:
        print("Using standard retrieval...")
        response = query_engine.query(question)
        answer = response.response
        source_chunks = [node.text for node in response.source_nodes]

    debug_print(f"\nQuestion: {question}")
    debug_print(f"Answer: {answer[:200]}...")

    # Step 2: Attribution check
    debug_print(f"\nATTRIBUTION CHECK")
    attribution_score, _ = check_answer_support(answer, source_chunks, encoder)

    # Step 3: Consistency check
    debug_print(f"\nCONSISTENCY CHECK")
    consistency_score, _ = check_consistency(question, query_engine, num_tries=2)

    # Step 4: Find weak sentences
    debug_print(f"\nWEAK SENTENCE DETECTION")
    weak_sentences = find_weak_sentences(answer, source_chunks)

    # Show warnings
    if weak_sentences:
        debug_print(f"\nPOTENTIAL ISSUES:")
        for weak in weak_sentences:
            debug_print(f"- Weak support: \"{weak['sentence'][:100]}...\"")
    else:
        debug_print(f"\nNo weak sentences found")

    # Step 5: Calculate semantic entropy
    entropy_result = calculate_semantic_entropy(
            question, query_engine, encoder, num_samples=3
        )
    semantic_entropy = entropy_result['semantic_entropy']

    # Step 6: Calculate support from external sources
    external_support = external_fact_check_answer(answer)

    # Step 7: Overall safety assessment
    print(f"\nOVERALL SAFETY ASSESSMENT")
    print("=" * 40)

    safety_score = 0
    max_score = 4

    print(f"Attribution Score: {attribution_score:.3f}")
    if attribution_score > 0.6:
        safety_score += 1
        print("Good source attribution")
    else:
        print("Weak source attribution")

    print(f"Consistency Score: {consistency_score:.3f}")
    if consistency_score > 0.6:
        safety_score += 1
        print("Good consistency")
    else:
        print("Low consistency")

    print(f"Semantic entropy score: {semantic_entropy:.3f}")
    if entropy_result['confidence'] == "HIGH":
        safety_score += 1
        print("Good semantic entropy")
    else:
        print("High semantic entropy")

    print(f"External Fact Checking Score: {external_support:.3f}")
    if external_support > 0.6:
        safety_score += 1
        print("Good external support")
    else:
        print("Low external support")
    


    # Final confidence level
    if safety_score == max_score:
        confidence = "HIGH CONFIDENCE"
    elif safety_score >= 2:
        confidence = "MEDIUM CONFIDENCE"
    else:
        confidence = "LOW CONFIDENCE"

    print(f"\nFinal Assessment: {confidence}")

    print(f"\nMedical Disclaimer: This information is for educational purposes only.")

    return {
        "question": question,
        "answer": answer,
        "attribution_score": attribution_score,
        "consistency_score": consistency_score,
        "semantic_entropy": semantic_entropy,
        "weak_sentences": weak_sentences,
        "safety_score": safety_score,
        "confidence": confidence,
    }

In [261]:
# Test the comprehensive safety check
test_question = "What antibiotics are safe to use with warfarin in elderly patients?"
safety_result = comprehensive_safety_check(test_question, query_engine_reranker, llm, use_multi_stage=False)

COMPREHENSIVE MEDICAL RAG SAFETY CHECK
Using standard retrieval...

Average consistency score: 0.985
High consistency - responses are very similar

Semantic Entropy Score: 1.811
Interpretation: MEDIUM uncertainty - review recommended
Generated query: warfarin antibiotics elderly
No data
No data
No data
No data
No data
No data
No data

OVERALL SAFETY ASSESSMENT
Attribution Score: 0.612
Good source attribution
Consistency Score: 0.985
Good consistency
Semantic entropy score: 1.811
High semantic entropy
External Fact Checking Score: 0.571
Low external support

Final Assessment: MEDIUM CONFIDENCE

Medical Disclaimer: This information is for educational purposes only.


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
