# MIMIC-CXR SapBERT Concept Bank Analysis

This notebook operates on an existing SapBERT concept bank (e.g. `generated/concept_bank_sapbert_mimic/`) and:

- counts, for each minimum-frequency threshold, how many distinct concepts occur at least that often under a chosen assertion label (default: `present`)
- lists the top-K most frequent concepts with their CUIs and semantic types

You can adapt the paths or assertion filter below to analyze other banks.

In [5]:
from pathlib import Path
import json
from collections import Counter

import pandas as pd

# Resolve the repo root by walking up from the working directory until a
# directory containing 'generated/' is found, so the notebook works whether it
# is launched from the repository root, from analysis/, or from any deeper
# subdirectory. Falls back to the working directory when nothing matches.
CWD = Path.cwd()
REPO_ROOT = next(
    (candidate for candidate in (CWD, *CWD.parents) if (candidate / 'generated').exists()),
    CWD,
)

BANK_DIR = REPO_ROOT / 'generated' / 'concept_bank_sapbert_mimic'
STUDY_CONCEPTS_PATH = BANK_DIR / 'study_concepts.jsonl'
INVENTORY_PATH = BANK_DIR / 'concept_inventory.json'
SCHEMA_PATH = BANK_DIR / 'concept_schema.json'

ASSERTION = 'present'  # one of: 'present', 'absent', 'uncertain', 'any'
THRESHOLDS = [1, 3, 5, 10, 25, 50, 100]
TOP_K = 50

print(f'Repo root: {REPO_ROOT}')
print(f'Using study concepts from: {STUDY_CONCEPTS_PATH}')
if not STUDY_CONCEPTS_PATH.exists():
    raise FileNotFoundError(f'study_concepts.jsonl not found at {STUDY_CONCEPTS_PATH}')

# Optional concept metadata, looked up by concept name further down.
# The curated schema takes priority over the raw inventory. A file whose
# top-level JSON value is not an object is skipped with an explicit message
# instead of being reported as a successful load with 0 entries.
inventory = {}
for description, metadata_path in (
    ('curated schema', SCHEMA_PATH),
    ('inventory', INVENTORY_PATH),
):
    if not metadata_path.exists():
        continue
    with metadata_path.open('r', encoding='utf-8') as handle:
        loaded = json.load(handle)
    if isinstance(loaded, dict):
        inventory = loaded
        print(f'Loaded {description} with {len(inventory):,} entries from {metadata_path}')
        break
    print(f'Ignoring {metadata_path}: expected a JSON object, got {type(loaded).__name__}.')
else:
    # Reached only when no candidate file produced a usable mapping.
    print('No usable concept_schema.json or concept_inventory.json found; CUI and semantic_type columns will be empty.')


Repo root: /home/obadah/code/MedCLIP
Using study concepts from: /home/obadah/code/MedCLIP/generated/concept_bank_sapbert_mimic/study_concepts.jsonl
Loaded curated schema with 6,163 entries from /home/obadah/code/MedCLIP/generated/concept_bank_sapbert_mimic/concept_schema.json


In [6]:
# Tally how often each concept name appears across all study records,
# keeping only mentions whose assertion equals ASSERTION ('any' keeps all).
counts = Counter()
total_studies = 0

with STUDY_CONCEPTS_PATH.open("r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in the JSONL file
        record = json.loads(line)
        total_studies += 1
        # `or []` also covers records that store "concepts": null, which
        # would otherwise raise TypeError when iterated.
        for item in record.get("concepts") or []:
            name = item.get("concept")
            if not name:
                continue
            if ASSERTION != "any" and item.get("assertion") != ASSERTION:
                continue
            # Every matching mention is counted, so a concept listed several
            # times in one record contributes several times.
            counts[name] += 1

# Distinguish "nothing to read" from "nothing matched the filter" so the
# error message points at the actual problem.
if total_studies == 0:
    raise RuntimeError(f"No study records found in {STUDY_CONCEPTS_PATH}.")

if not counts:
    raise RuntimeError(
        f"No concepts matched assertion={ASSERTION!r}. "
        "Try a different ASSERTION value or inspect the concept bank."
    )

print(f"Analyzed {total_studies:,} studies.")
print(f"Distinct concepts (assertion={ASSERTION!r}): {len(counts):,}")

Analyzed 7,071 studies.
Distinct concepts (assertion='present'): 5,671


In [7]:
# For every cutoff in THRESHOLDS, report how many distinct concepts have a
# count of at least that cutoff.
concept_frequencies = list(counts.values())
summary_rows = [
    {
        "min_frequency": cutoff,
        "num_concepts": sum(frequency >= cutoff for frequency in concept_frequencies),
    }
    for cutoff in THRESHOLDS
]

summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values("min_frequency")
summary_df = summary_df.reset_index(drop=True)
summary_df

Unnamed: 0,min_frequency,num_concepts
0,1,5671
1,3,2521
2,5,1857
3,10,1239
4,25,733
5,50,458
6,100,275


In [9]:
# Top-K most frequent concepts with CUI and semantic type.
# Metadata comes from `inventory`; concepts without a usable entry keep their
# canonical name as the display label and get empty cui/semantic_type cells.
TOP_COLUMNS = ["rank", "count", "concept", "canonical_name", "cui", "semantic_type"]

rows = []
for rank, (name, freq) in enumerate(counts.most_common(TOP_K), start=1):
    meta = inventory.get(name) if isinstance(inventory, dict) else None
    if not isinstance(meta, dict):
        # Covers missing entries as well as null / non-object entries, which
        # would otherwise fail on the .get() calls below.
        meta = {}
    label = meta.get("label") or name
    rows.append(
        {
            "rank": rank,
            "count": freq,
            "concept": label,
            "canonical_name": name,
            "cui": meta.get("cui"),
            "semantic_type": meta.get("semantic_type"),
        }
    )

# Passing the columns explicitly keeps the table shape stable even when
# there are no rows (e.g. TOP_K = 0).
top_df = pd.DataFrame(rows, columns=TOP_COLUMNS)
top_df


Unnamed: 0,rank,count,concept,canonical_name,cui,semantic_type
0,1,3811,Lung,Lung part,C1268107,BodyStructure
1,2,3372,Heart,"Heart, NOS",C0018787,BodyStructure
2,3,2452,Pleural,Pleural,C1522720,BodyStructure
3,4,2181,Patient condition unchanged,Patient condition unchanged,C0184763,Finding
4,5,1917,Mediastinum,"Mediastinum, NOS",C0025066,BodyStructure
5,6,1767,Symptom mild,Symptom mild,C0436343,Finding
6,7,1611,Right,Right,C0205090,BodyStructure
7,8,1542,large effusion,Pleural effusion NOS (disorder),C0032227,Finding
8,9,1515,pul,PUL (body structure),C0449197,BodyStructure
9,10,1492,Size,Size finding,C1265584,BodyStructure
