# From Black Box to Dashboard: How We Built a Transparent Interface for Healthcare AI

## Pubmed Corpus Download

In [None]:
# Quick demo of the PubMed corpus module

from src.corpus import PubMedDownloader, DataProcessor
import os
from dotenv import load_dotenv
import config as cfg # Import all the parameters from the config to stay consistent with the app

load_dotenv()



True

In [None]:
# Expanded query to grab more variety of articles for our RAGAS evaluation
expanded_query = """(
  "Pneumonia, Bacterial"[MeSH Terms]                     
    OR "Community-Acquired Infections"[MeSH Terms]
    OR (pneumonia[tiab] AND (community-acquired OR hospital-acquired OR ventilator-associated OR aspiration))
)
AND
(
      guideline[pt] OR practice guideline[pt]    
  OR  (("antibacterial agents"[MeSH Terms] OR antibiotic*[tiab] OR antimicrobial*[tiab])
       AND (therapy[sh] OR treatment[tiab] OR empiric[tiab] OR empiric* therapy[tiab]))
)
AND
(
      ((outpatient*[tiab] OR ambulatory[tiab]) AND (macrolide*[tiab] OR "β-lactam"[tiab] OR dual[tiab]))
  OR 
      (duration[tiab] AND (course[tiab] OR day*[tiab] OR "7-day"[tiab] OR "5-day"[tiab]))
  OR
      (macrolide[tiab] AND resistance[tiab] AND (threshold*[tiab] OR percent*[tiab]))
  OR
      (fluoroquinolone*[tiab] AND (alternative[tiab] OR substitution[tiab] OR "β-lactam allergy"[tiab]))
  OR 
      (hospital-acquired[tiab] OR ventilator-associated[tiab])
        AND (MRSA[tiab] OR methicillin-resistant[tiab] OR pseudomonas[tiab])
  OR 
      antibiogram*[tiab]
  OR
      ((azithromycin[tiab] OR macrolide*[tiab]) AND (renal[tiab] OR kidney[tiab]) AND (dose[tiab] OR dosing[tiab]))
  OR
      (pregnan*[tiab] AND (CAP[tiab] OR pneumonia[tiab]))
  OR
      (aspiration[tiab] AND anaerob*[tiab])
  OR
      ((persistent[tiab] OR refractory[tiab] OR non-responsive[tiab]) AND (fever[tiab] OR febrile[tiab]))
)
AND
("2015/01/01"[PDAT] : "3000"[PDAT])  
AND
humans[MeSH] AND english[lang]
"""

### Initialize and download

In [13]:
# Initialize and download
downloader = PubMedDownloader(
    api_key=os.getenv("NCBI_API_KEY"), 
    email=os.getenv("EMAIL")
)

# Download small test corpus
articles = downloader.download_articles(
    query=expanded_query, 
    max_results=1000,
    batch_size=200
)



Searching PMC for: (
  "Pneumonia, Bacterial"[MeSH Terms]                     
    OR "Community-Acquired Infections"[MeSH Terms]
    OR (pneumonia[tiab] AND (community-acquired OR hospital-acquired OR ventilator-associated OR aspiration))
)
AND
(
      guideline[pt] OR practice guideline[pt]    
  OR  (("antibacterial agents"[MeSH Terms] OR antibiotic*[tiab] OR antimicrobial*[tiab])
       AND (therapy[sh] OR treatment[tiab] OR empiric[tiab] OR empiric* therapy[tiab]))
)
AND
(
      ((outpatient*[tiab] OR ambulatory[tiab]) AND (macrolide*[tiab] OR "β-lactam"[tiab] OR dual[tiab]))
  OR 
      (duration[tiab] AND (course[tiab] OR day*[tiab] OR "7-day"[tiab] OR "5-day"[tiab]))
  OR
      (macrolide[tiab] AND resistance[tiab] AND (threshold*[tiab] OR percent*[tiab]))
  OR
      (fluoroquinolone*[tiab] AND (alternative[tiab] OR substitution[tiab] OR "β-lactam allergy"[tiab]))
  OR 
      (hospital-acquired[tiab] OR ventilator-associated[tiab])
        AND (MRSA[tiab] OR methicillin-resista

In [14]:
len(articles)

595

In [None]:
# Process and save
cleaned = DataProcessor.clean_articles(articles)
DataProcessor.save_articles_jsonl(cleaned, cfg.CORPUS_PATH)

# Quick stats
stats = DataProcessor.get_corpus_stats(cleaned)
print(f"Downloaded {stats['total_articles']} articles")
print(f"Licenses: {stats['license_distribution']}")

Skipping invalid article: PMC11161024
Skipping invalid article: PMC10755665
Skipping invalid article: PMC10695481
Skipping invalid article: PMC10468848
Skipping invalid article: PMC10108815
Skipping invalid article: PMC9885521
Skipping invalid article: PMC9379091
Skipping invalid article: PMC8237391
Cleaned 587 valid articles out of 595
Saved 587 articles to data/processed/expanded_pneumonia.jsonl
Downloaded 587 articles
Licenses: {'cc-by': 579, 'cc0': 8}


## Build the index

In [None]:
from src.rag import DocumentProcessor, create_index, create_query_engine, query_medical_rag
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI


In [None]:
# Load and process articles
articles = DocumentProcessor.load_medical_articles("data/processed/expanded_pneumonia.jsonl")
documents = DocumentProcessor.process_articles(articles)

In [None]:
# Create embedding model and index
embed_model = HuggingFaceEmbedding(model_name=cfg.EMBEDDING_MODEL)
index = create_index(documents, embed_model, chunk_size=cfg.CHUNK_SIZE, chunk_overlap=cfg.CHUNK_OVERLAP, index_path=cfg.INDEX_PATH)

Created 34346 nodes using sentence splitting
Created and saved index with 34346 nodes


### Test the index with a simple query

In [None]:
# Create query engine and ask questions
llm = OpenAI(model=cfg.OPENAI_MODEL_NAME, temperature=cfg.DEFAULT_TEMPERATURE)
embed_model = HuggingFaceEmbedding(model_name=cfg.EMBEDDING_MODEL)
query_engine = create_query_engine(cfg.INDEX_PATH, llm, embed_model)
answer = query_medical_rag("What are common treatments for pneumonia?", query_engine, embed_model)

In [20]:
print(answer)

Common treatments for pneumonia include antibiotics, particularly when bacterial infection is suspected or confirmed. For severe cases, parenteral antibiotics such as flucloxacillin, gentamicin, or ceftriaxone may be used. In cases of atypical pathogens, macrolides may be added. Antiviral treatments like oseltamivir can be important for influenza-related pneumonia. Corticosteroids are sometimes used to manage inflammation, though their use remains controversial due to potential adverse effects. In some regions, Chinese herbal injections are also utilized as adjunctive treatments for severe pneumonia.

Sources:
[1] PMCID PMC9077980 - Comparative efficacy of Chinese herbal injections for treating severe pneumonia: A protocol for systematic review and Bayesian network meta-analysis of randomized controlled trials
[2] PMCID PMC9682719 - Can child pneumonia in low-resource settings be treated without antibiotics? A systematic review & meta-analysis
[3] PMCID PMC11706086 - D-PRISM: a global 