# Report to CUIs (RadGraph + SapBERT)
This notebook sketches how to turn free-text reports into linked concepts using the same components as `concept_extraction/convert_reports.py`: RadGraph for span/attribute detection and SapBERT + FAISS for CUI linking. Cells are structured so you can fill in real paths/resources and run end-to-end if the dependencies are installed.


### 1. Setup
Make sure RadGraph (spaCy pipeline) and SapBERT + FAISS resources are available.
- RadGraph model: e.g., `en_core_sci_scibert` or RadGraph-XL pipeline.
- SapBERT checkpoint and FAISS index built from UMLS strings.


In [1]:
from pathlib import Path
import sys, yaml

def _load_yaml_mapping(path):
    """Parse the YAML file at ``path`` and return its content, or ``{}`` if the document is empty.

    ``yaml.safe_load`` yields ``None`` for an empty document, which would make
    the ``.get`` lookups below fail with ``AttributeError``.
    """
    # Hand PyYAML the raw bytes so it performs its own encoding detection
    # instead of depending on the platform's default text encoding.
    return yaml.safe_load(path.read_bytes()) or {}


# The repository root is taken to be the parent of the current working
# directory (i.e. the notebook is expected to run from a direct subdirectory).
repo_root = Path.cwd().resolve().parent
# Put the repository root on sys.path so its packages can be imported below.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

cfg = _load_yaml_mapping(repo_root / "cfg" / "paths.yml")
semantic_cfg = _load_yaml_mapping(repo_root / "cfg" / "umls_sapbert.yml")

# Use the "umls" section when present; if it is missing or null, fall back to
# treating the whole paths file as the UMLS configuration.
umls_cfg = cfg.get("umls")
if umls_cfg is None:
    umls_cfg = cfg

# Optional overrides read from cfg/umls_sapbert.yml.
allowed_tuis_override = semantic_cfg.get("allowed_tuis")
allowed_sources_override = semantic_cfg.get("sources") or semantic_cfg.get("allowed_sources")
radlex_csv_path = semantic_cfg.get("radlex_csv_path") or umls_cfg.get("radlex_csv_path")

# SapBERT / FAISS resource paths (empty string when not configured)
sapbert_checkpoint = umls_cfg.get("sapbert_checkpoint", "") or umls_cfg.get("sapbert_model_id", "")
faiss_index = umls_cfg.get("faiss_index", "")
umls_strings = umls_cfg.get("mrconso_path", "")
semantic_types_tsv = umls_cfg.get("mrsty_path", "")

# # Optional stop terms to filter trivial mentions
# stop_terms = ["no change", "normal"]


### 2. Load pipelines
Build the ClinicalEntityLinker using SapBERT + FAISS paths from `cfg/paths.yml`.


In [2]:
import tempfile
from concept_extraction.concepts import ner as concept_ner

# Resolve the SapBERT model identifier: explicit model id first, then a
# configured checkpoint, and finally the linker class's built-in default.
sapbert_model_id = umls_cfg.get("sapbert_model_id") or umls_cfg.get("sapbert_checkpoint")
if not sapbert_model_id:
    sapbert_model_id = concept_ner.ClinicalEntityLinker.SAPBERT_MODEL_ID

index_file = umls_cfg.get("faiss_index")
mapping_file = umls_cfg.get("sapbert_id2cui")
if not mapping_file:
    mapping_file = umls_cfg.get("mapping_file")

# Start from a copy of the UMLS config and layer the resolved values on top.
payload = {**umls_cfg, "sapbert_model_id": sapbert_model_id}

# Each entry is (value, should_apply). The TUI/source overrides are applied
# whenever they are set (even if empty); the path-like values only when truthy.
overrides = {
    "allowed_tuis": (allowed_tuis_override, allowed_tuis_override is not None),
    "sources": (allowed_sources_override, allowed_sources_override is not None),
    "radlex_csv_path": (radlex_csv_path, bool(radlex_csv_path)),
    "index_file": (index_file, bool(index_file)),
    "mapping_file": (mapping_file, bool(mapping_file)),
}
payload.update({key: value for key, (value, apply) in overrides.items() if apply})

# create_linker is given a config file path, so write the merged payload to a
# temporary YAML file. delete=False keeps the file on disk after the handle closes.
with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp:
    tmp.write(yaml.safe_dump(payload))
    linker_cfg_path = Path(tmp.name)

print("Instantiating linker (will download SapBERT or build FAISS if missing)")
# Optional: pass stop_terms=stop_terms here once stop terms are defined.
linker = concept_ner.create_linker(linker_cfg_path)
print("Linker ready")


  from .autonotebook import tqdm as notebook_tqdm


Instantiating linker (will download SapBERT or build FAISS if missing)


  if LooseVersion(numpy.__version__) >= "1.19":
  other = LooseVersion(other)
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Using device: cuda:0
model_type not provided, defaulting to radgraph-xl


`torch_dtype` is deprecated! Use `dtype` instead!


[info] loading RadLex synonyms from /mnt/e/RadLex/RADLEX.csv
[info] loaded FAISS index on GPU (dtype=float32, transposed=True)
Linker ready


### 3. Run extraction on a sample report
We mirror the logic in `convert_reports.py`: detect mentions with RadGraph, then link with SapBERT.


In [9]:
from concept_extraction.dataset_iter import lookup_report_text
from concept_extraction.identifiers import normalize_study_id
from concept_extraction.concepts import aggregation as concept_agg

# Set dataset and IDs
# dataset options: "mimic_cxr" or "chexpert_plus"
dataset = "mimic_cxr"
study_id = "50225296"  # replace with your study
patient_id = None       # required for chexpert_plus; set patient id there

report_text = lookup_report_text(dataset, patient_id, normalize_study_id(study_id))
# Strip before validating so that a whitespace-only report is rejected here
# instead of being passed to the linker as an empty string.
sample_report = (report_text or "").strip()
if not sample_report:
    raise ValueError(f"No report found for dataset={dataset}, study_id={study_id}, patient_id={patient_id}")

# Run the linker on the report text, then aggregate mentions per concept.
mentions = linker(sample_report, record_id=str(study_id))
concept_summary, mention_payload = concept_agg.build_concept_summary(mentions, linker.cui2str)

# Flatten the per-assertion summary into one list of linked concepts,
# skipping entries that carry no CUI.
cui_list = [
    {
        "cui": entry["cui"],
        "preferred_name": entry["preferred_name"],
        "assertion": assertion,
    }
    for assertion, entries in concept_summary.items()
    for entry in entries
    if entry.get("cui")
]
cui_list


[{'cui': 'RID114', 'preferred_name': 'gaster', 'assertion': 'present'},
 {'cui': 'RID117', 'preferred_name': 'gastric body', 'assertion': 'present'},
 {'cui': 'RID13173', 'preferred_name': 'unremarkable', 'assertion': 'present'},
 {'cui': 'RID1385', 'preferred_name': 'heart', 'assertion': 'present'},
 {'cui': 'RID96',
  'preferred_name': 'gastro-esophageal junction',
  'assertion': 'present'},
 {'cui': 'RID1363', 'preferred_name': 'pleural cavity', 'assertion': 'absent'},
 {'cui': 'RID34539',
  'preferred_name': 'pleural effusion',
  'assertion': 'absent'},
 {'cui': 'RID4863',
  'preferred_name': 'venous congestion',
  'assertion': 'absent'},
 {'cui': 'RID49914', 'preferred_name': 'vascular', 'assertion': 'absent'},
 {'cui': 'RID5350', 'preferred_name': 'pneumonia', 'assertion': 'absent'},
 {'cui': 'RID5702', 'preferred_name': 'focal', 'assertion': 'absent'},
 {'cui': 'RID5718', 'preferred_name': 'acute', 'assertion': 'absent'}]

In [None]:
from concept_extraction.dataset_iter import lookup_report_text
from concept_extraction.identifiers import normalize_study_id
from concept_extraction.concepts import aggregation as concept_agg

# Set your dataset and IDs
# dataset options: "mimic_cxr" or "chexpert_plus"
dataset = "mimic_cxr"
study_id = "12345678"
patient_id = None       # required for chexpert_plus; set patient id there

report_text = lookup_report_text(dataset, patient_id, normalize_study_id(study_id))
# Strip before validating so that a whitespace-only report is rejected here
# instead of being passed to the linker as an empty string.
sample_report = (report_text or "").strip()
if not sample_report:
    raise ValueError(f"No report found for dataset={dataset}, study_id={study_id}, patient_id={patient_id}")

# Run the linker on the report text, then aggregate mentions per concept.
mentions = linker(sample_report, record_id=str(study_id))
concept_summary, mention_payload = concept_agg.build_concept_summary(mentions, linker.cui2str)

# Flatten the per-assertion summary into one list of linked concepts,
# skipping entries that carry no CUI.
cui_list = [
    {
        "cui": entry["cui"],
        "preferred_name": entry["preferred_name"],
        "assertion": assertion,
    }
    for assertion, entries in concept_summary.items()
    for entry in entries
    if entry.get("cui")
]
cui_list


[{'cui': 'C0018800', 'preferred_name': 'cardiomegaly', 'assertion': 'present'},
 {'cui': 'C0032227',
  'preferred_name': 'Pleural effusion NOS (disorder)',
  'assertion': 'present'},
 {'cui': 'C0748864', 'preferred_name': 'SMALL SIZE', 'assertion': 'present'},
 {'cui': 'C1268108', 'preferred_name': 'Pleural part', 'assertion': 'present'},
 {'cui': 'C3842382',
  'preferred_name': 'Mild (low grade)',
  'assertion': 'present'},
 {'cui': 'C0032326', 'preferred_name': 'pneumothorax', 'assertion': 'absent'}]

### 4. Assemble JSON-ready record
This matches the structure written by `convert_reports.py` for each report row.


In [4]:
import json

# Bundle the flattened concept list and the per-mention payload under a
# fixed demo identifier, then pretty-print the result as JSON.
record = dict(
    record_id="demo_report",
    cuis=cui_list,
    mentions=mention_payload,
)
print(json.dumps(record, indent=2))


{
  "record_id": "demo_report",
  "cuis": [
    {
      "cui": "RID1363",
      "preferred_name": "pleural cavity",
      "assertion": "present"
    },
    {
      "cui": "RID34539",
      "preferred_name": "pleural effusion",
      "assertion": "present"
    },
    {
      "cui": "RID5671",
      "preferred_name": "mild",
      "assertion": "present"
    },
    {
      "cui": "RID5774",
      "preferred_name": "small",
      "assertion": "present"
    },
    {
      "cui": "RID5352",
      "preferred_name": "pneumo",
      "assertion": "absent"
    }
  ],
  "mentions": [
    {
      "text": "Small",
      "mods": [],
      "assertion": "present",
      "category": "Observation::definitely present",
      "span": [
        10,
        15
      ],
      "cui": "RID5774",
      "cui_surface": "RID5774",
      "cui_text": "RID5774",
      "score": 0.9995694160461426,
      "score_surface": 0.9995694160461426,
      "score_text": 0.9995694160461426,
      "preferred_name": "small"
    },
 

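View only the mentions that were linked to a CUI (non-empty `cui_surface` or `cui_text`), then rebuild the concept summary from them.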

In [5]:
# Keep a mention only if it was linked to a CUI through its surface form,
# its text, or both; unlinked mentions are discarded.
mentions = list(
    filter(lambda mention: mention.cui_surface or mention.cui_text, mentions)
)

# Re-aggregate using only the linked mentions.
concept_summary, mention_payload = concept_agg.build_concept_summary(
    mentions,
    linker.cui2str,
)

(concept_summary, mention_payload)


({'present': [{'cui': 'RID1363',
    'preferred_name': 'pleural cavity',
    'mention_texts': ['pleural'],
    'score_max': 0.8536310195922852,
    'score_mean': 0.8536310195922852},
   {'cui': 'RID34539',
    'preferred_name': 'pleural effusion',
    'mention_texts': ['effusion'],
    'score_max': 0.9994068145751953,
    'score_mean': 0.9994068145751953},
   {'cui': 'RID5671',
    'preferred_name': 'mild',
    'mention_texts': ['Mild'],
    'score_max': 0.999603807926178,
    'score_mean': 0.999603807926178},
   {'cui': 'RID5774',
    'preferred_name': 'small',
    'mention_texts': ['Small'],
    'score_max': 0.9995694160461426,
    'score_mean': 0.9995694160461426}],
  'absent': [{'cui': 'RID5352',
    'preferred_name': 'pneumo',
    'mention_texts': ['pneumothorax'],
    'score_max': 0.9999706745147705,
    'score_mean': 0.9999706745147705}]},
 [{'text': 'Small',
   'mods': [],
   'assertion': 'present',
   'category': 'Observation::definitely present',
   'span': (10, 15),
   'cui'