In [1]:
import os, sys
from pathlib import Path

# Repository root. MEDCLIP_REPO_ROOT overrides the location so the notebook is
# not tied to a single machine; when the variable is unset the original
# checkout path is used.
repo_root = Path(os.environ.get("MEDCLIP_REPO_ROOT", "/home/obadah/code/MedCLIP")).expanduser()

# Relative paths used further down (e.g. "cfg/paths.yml") and the `lib` package
# import both rely on the working directory / sys.path pointing at the repo.
os.chdir(repo_root)
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
print(f"Working dir set to: {Path.cwd()}")


Working dir set to: /home/obadah/code/MedCLIP


### FAISS Vector Explorer

Load the SapBERT FAISS index and its id-to-CUI mapping, sample vectors, then visualize and cluster them.

In [2]:
from pathlib import Path
import json
import yaml
import numpy as np
import faiss
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Resolve artefact locations from cfg/paths.yml.
# The entries may sit under a top-level "umls" section or directly at the root
# of the file; both layouts are accepted.
paths_cfg = yaml.safe_load(Path("cfg/paths.yml").read_text())
umls_cfg = paths_cfg["umls"] if "umls" in paths_cfg else paths_cfg

# The FAISS index entry is required (KeyError if absent); the id->CUI mapping
# entry is optional and falls back to an empty path.
faiss_index_entry = umls_cfg["faiss_index"]
id2cui_entry = umls_cfg.get("sapbert_id2cui", "")
index_path = Path(faiss_index_entry).expanduser()
mapping_path = Path(id2cui_entry).expanduser()

print(f"Index path: {index_path}")
print(f"Mapping path: {mapping_path}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/home/obadah/anaconda3/envs/medclip/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/obadah/anaconda3/envs/medclip/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/obadah/anaconda3/envs/medclip/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/obadah/anaconda3/envs/medclip/lib/python3.9/site-packages/traitlets/config/application.py", line 10

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [None]:
# Load FAISS index and id2cui mapping
if not index_path.exists():
    raise FileNotFoundError(f"FAISS index not found: {index_path}")
index = faiss.read_index(str(index_path))

# The mapping is optional. Use is_file() rather than exists(): when the config
# has no "sapbert_id2cui" entry, mapping_path is Path("") (i.e. "."), which
# exists as a directory and would make read_text() fail.
id2cui = {}
if mapping_path.is_file():
    raw_mapping = json.loads(mapping_path.read_text())
    # Normalise keys to str so lookups use one key type.
    id2cui = {str(k): v for k, v in raw_mapping.items()}

print(index)
print(f"Loaded {len(id2cui)} id->CUI mappings")

<faiss.swigfaiss.IndexFlat; proxy of <Swig Object of type 'faiss::IndexFlat *' at 0x7cd95c1f17e0> >
Loaded 250225 id->CUI mappings


In [None]:
# Extract vectors (sample if too large)
num = index.ntotal
if num == 0:
    raise ValueError("Index is empty")

# Choose one:
all_vectors = True  # True: load every vector (memory heavy on large indexes)
max_points = 5000    # upper bound on sampled vectors; used only when all_vectors is False

if all_vectors:
    vecs = np.zeros((num, index.d), dtype='float32')
    index.reconstruct_n(0, num, vecs)
    print(f"Loaded all {num} vectors")
else:
    # Ceiling division keeps the sample at or below max_points
    # (floor division could overshoot, e.g. 250225 // 5000 -> 5005 rows).
    step = max(1, -(-num // max_points))
    idxs = np.arange(0, num, step, dtype='int64')
    vecs = np.zeros((len(idxs), index.d), dtype='float32')
    # Reconstruct the strided ids themselves. reconstruct_n(0, len(idxs)) would
    # return the first len(idxs) contiguous vectors, not an even sample.
    for row, vec_id in enumerate(idxs):
        vecs[row] = index.reconstruct(int(vec_id))
    print(f"Sampled {len(vecs)} / {num} vectors for analysis")


Loaded all 250225 vectors


In [None]:
# # PCA to 2D
# pca = PCA(n_components=2, random_state=42)
# vecs_2d = pca.fit_transform(vecs)
# print("Explained variance ratio:", pca.explained_variance_ratio_)

In [None]:
# # Optional clustering (k-means)
# K = 10
# kmeans = KMeans(n_clusters=K, random_state=42, n_init='auto')
# labels = kmeans.fit_predict(vecs)

In [None]:
# # Plot
# plt.figure(figsize=(8,6))
# scatter = plt.scatter(vecs_2d[:,0], vecs_2d[:,1], c=labels, cmap='tab10', s=8, alpha=0.6)
# plt.title("FAISS vectors (PCA 2D, k-means labels)")
# plt.xlabel("PC1")
# plt.ylabel("PC2")
# plt.colorbar(scatter, label='Cluster')
# plt.show()

## 4. Sketch: flag CUIs with visual signal using MedCLIP
Idea: pair SapBERT CUIs with a MedCLIP text encoder, and compare to MedCLIP image embeddings from a sample image pool.
1) Build CUI texts (preferred names or surface strings).
2) Encode CUIs with MedCLIP text encoder; encode a batch of images with the MedCLIP image encoder.
3) Score cosine similarity between each CUI and the image pool (max/mean). High scores → likely visual.
4) Optionally gate by semantic type (findings/anatomy) using UMLS STYs.

Below is a lightweight scaffold to plug in your MedCLIP model and image sampler. Replace the TODOs with real loaders/paths.

In [4]:

import json
import numpy as np
import torch
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image

from lib import constants as C
from lib.medclip_utils import load_medclip

# Inputs for the visualness sketch
max_images = 128  # sample to keep things cheap
concept_bank_dir = (C.OUTPUTS / "snomed_mimic_bank").resolve()
inventory_path = concept_bank_dir / "concept_inventory.json"

# CUI -> display name, read from the concept bank inventory.
# Entries without a CUI are skipped; entries without a canonical name fall
# back to the CUI itself.
if not inventory_path.exists():
    raise FileNotFoundError(f"Concept inventory not found at {inventory_path} (cwd={Path.cwd()})")
inventory = json.loads(inventory_path.read_text())
cui_names = {
    record["cui"]: record.get("canonical_name", record["cui"])
    for record in inventory.values()
    if record.get("cui")
}

# MedCLIP ("resnet" variant) on the GPU when one is available, else on CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
processor, model = load_medclip(device=device, variant="resnet")

# Utility to sample a small batch of images from a root

def sample_image_batch(root: Path, max_images: int):
    """Load up to ``max_images`` images found under ``root`` as RGB PIL images.

    Files are discovered recursively by extension (.jpg, .jpeg, .png,
    case-insensitive) and the ``max_images`` smallest paths in sorted order
    are loaded, so repeated calls on an unchanged tree return the same batch.

    Args:
        root: Directory to search recursively.
        max_images: Maximum number of images to load; must be at least 1.

    Returns:
        A list of ``PIL.Image.Image`` objects converted to RGB.

    Raises:
        ValueError: If ``max_images`` is less than 1.
        FileNotFoundError: If no image file is found under ``root``.
    """
    import heapq  # local import keeps the helper self-contained in this cell

    if max_images < 1:
        # A zero or negative limit would otherwise surface as a misleading
        # "No images found" error, or silently slice from the end.
        raise ValueError(f"max_images must be >= 1, got {max_images}")

    exts = {".jpg", ".jpeg", ".png"}
    # is_file() skips directories that happen to carry an image suffix.
    candidates = (
        p for p in root.rglob("*")
        if p.suffix.lower() in exts and p.is_file()
    )
    # Same result as sorted(candidates)[:max_images] without sorting every path.
    paths = heapq.nsmallest(max_images, candidates)
    if not paths:
        raise FileNotFoundError(f"No images found under {root}")

    images = []
    for p in paths:
        # convert() returns a new in-memory image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(p) as img:
            images.append(img.convert("RGB"))
    return images

# Image pool: a small sample drawn from the MIMIC-CXR jpg root (C.MIMIC_JPG_ROOT)
image_root = C.MIMIC_JPG_ROOT
image_batch = sample_image_batch(root=image_root, max_images=max_images)

# Preprocess, then embed the whole batch in a single forward pass.
image_inputs = processor(images=image_batch, return_tensors="pt")
pixel_values = image_inputs["pixel_values"].to(device)
with torch.no_grad():
    image_feats = model.encode_image(pixel_values=pixel_values)
    image_embs = image_feats.cpu().numpy()

# Scale every row to unit L2 norm.
image_norms = np.linalg.norm(image_embs, axis=1, keepdims=True)
image_embs = image_embs / image_norms

# Quick pass: only the first 512 (CUI, name) pairs are encoded
sample_cuis = list(cui_names.items())[:512]
texts = [pair[1] for pair in sample_cuis]

# Tokenise, then keep only the two tensors that are forwarded to encode_text.
encoded = processor(text=texts, return_tensors="pt", padding=True, truncation=True)
wanted_keys = ("input_ids", "attention_mask")
text_inputs = {}
for key, tensor in encoded.items():
    if key in wanted_keys:
        text_inputs[key] = tensor.to(device)

with torch.no_grad():
    text_feats = model.encode_text(**text_inputs)
    text_embs = text_feats.cpu().numpy()

# Scale every row to unit L2 norm.
text_norms = np.linalg.norm(text_embs, axis=1, keepdims=True)
text_embs = text_embs / text_norms

# Visualness score: similarity of each CUI text to the image pool.
# sims[i, j] is the dot product of text embedding i and image embedding j.
text_t = torch.from_numpy(text_embs).to(device)
image_t = torch.from_numpy(image_embs).to(device)
sims = torch.matmul(text_t, image_t.T).cpu().numpy()

# One record per CUI with its best and average similarity over the pool.
visual_scores = [
    {
        "cui": cui,
        "name": name,
        "score_max": float(row.max()),
        "score_mean": float(row.mean()),
    }
    for (cui, name), row in zip(sample_cuis, sims)
]

# Highest max-similarity first; show the top ten.
visual_scores.sort(key=lambda rec: rec["score_max"], reverse=True)
visual_scores[:10]


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


load model weight from: ./pretrained/medclip-resnet


[{'cui': 'C1542803',
  'name': '(Pleural effusion NOS) or (haemothorax) or (hydrothorax)',
  'score_max': 0.0030659381300210953,
  'score_mean': -0.043003715574741364},
 {'cui': 'C1540450',
  'name': '(Congenital) or (acquired) aberrant thyroid gland (& [retrosternal])',
  'score_max': 0.0013944581151008606,
  'score_mean': -0.038078922778367996},
 {'cui': 'C1534919',
  'name': '(Arthritis/arthrosis) or (arthropathy) or (joint disorders)',
  'score_max': -0.006883405148983002,
  'score_mean': -0.055048618465662},
 {'cui': 'C2239286',
  'name': '(Regurgitates food) or (regurgitation)',
  'score_max': -0.007574917748570442,
  'score_mean': -0.04612813889980316},
 {'cui': 'C0340079',
  'name': 'Allergic alveolitis and pneumonitis NOS',
  'score_max': -0.008778903633356094,
  'score_mean': -0.0570792593061924},
 {'cui': 'C1537278',
  'name': 'Adrenal hypofunction (& [corticoadrenal insufficiency NOS] or [insufficiency NEC])',
  'score_max': -0.009787686169147491,
  'score_mean': -0.0459371