In [2]:
# from openai import AsyncOpenAI
import motor.motor_asyncio
from dotenv import dotenv_values

In [3]:
# Connection settings come from the local .env file; ATLAS_DSN is the MongoDB
# connection string handed to the Motor (asyncio) client.
config = dotenv_values(".env")
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Round-trip to the server to confirm the connection works before going further
print(await client.server_info())

{'version': '7.0.11', 'gitVersion': 'f451220f0df2b9dfe073f1521837f8ec5c208a8c', 'modules': ['enterprise'], 'allocator': 'tcmalloc', 'javascriptEngine': 'mozjs', 'sysInfo': 'deprecated', 'versionArray': [7, 0, 11, 0], 'bits': 64, 'debug': False, 'maxBsonObjectSize': 16777216, 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1718477718, 38), 'signature': {'hash': b'@\xd202\xf4\xccl\x98E\xb6\xe3\xf3\xd4;\x08\x9b\xc4\x08\x00t', 'keyId': 7327016299177967635}}, 'operationTime': Timestamp(1718477718, 38)}


## Anatomic Locations


In [5]:
anatomic_locations_raw = client["ontologies"]["anatomic_locations_raw"]
count = await anatomic_locations_raw.count_documents({})
count

2901

### Loading JSON/CSV


In [45]:
import json

# Load the exported anatomic locations; the records live under the top-level "bodyParts" key.
# JSON text is UTF-8, so state the encoding instead of relying on the platform default.
with open("openimagingdatamodel/ontology_tools/data/updated_anatomic_locations.json", encoding="utf-8") as f:
    data = json.load(f)
body_parts = data["bodyParts"]
len(body_parts)

2901

In [46]:
# Load the openimagingdatamodel/ontology_tools/data/common_ids.csv file
import csv

# Maps RadLex ID (third column) -> common ID (first column, whitespace-stripped).
# When a RadLex ID appears with conflicting common IDs, the first one seen is kept
# and every later conflict is reported.
common_ids = {}
# newline="" is what the csv module asks for so that it does its own line-ending handling
with open("openimagingdatamodel/ontology_tools/data/common_ids.csv", newline="") as f:
    reader = csv.reader(f)
    # Skip the header (tolerating a completely empty file)
    next(reader, None)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; nothing to record
            continue
        common_id = row[0].strip()
        # NOTE(review): the RadLex ID is used as-is while the common ID is stripped --
        # confirm the CSV never pads this column, or lookups by radlexId will miss.
        radlex_id = row[2]
        previous = common_ids.setdefault(radlex_id, common_id)
        if previous != common_id:
            print(f"Duplicate common ID: {radlex_id} ({row[1]}), {previous} and {common_id}")

Duplicate common ID: RID342 (genital system of male human body), 4013890 and 4014361
Duplicate common ID: RID342 (genital system of male human body), 4013890 and 4014403
Duplicate common ID: RID34819 (adrenal vein), 4013447 and 4014863
Duplicate common ID: RID1275 (left main bronchus), 4014246 and 4015017


In [47]:
import random

In [48]:
random.sample(body_parts, 3)

[{'radlexId': 'RID2142',
  'description': 'brachioradialis muscle',
  'region': 'Upper extremity',
  'containedById': 'RID2107',
  'leftId': 'RID43109',
  'rightId': 'RID43108',
  'synonyms': ['musculus brachioradialis', 'Musculus brachioradialis'],
  'codes': [{'system': 'FMA', 'code': '38485'},
   {'system': 'SNOMED', 'code': '72111005'}]},
 {'radlexId': 'RID48457',
  'description': 'left tibialis anterior muscle',
  'region': 'Lower extremity',
  'containedById': 'RID2869_RID5824',
  'unsidedId': 'RID2912',
  'rightId': 'RID48456',
  'synonyms': ['left tibialis anterior'],
  'codes': [{'system': 'FMA', 'code': '22545'}]},
 {'radlexId': 'RID3129_RID2996_RID5825',
  'description': 'right metatarsophalangeal joint of second toe',
  'region': 'Lower extremity',
  'containedById': 'RID28829_RID5825',
  'unsidedId': 'RID3129_RID2996',
  'leftId': 'RID3129_RID2996_RID5824',
  'codes': [{'system': 'SNOMED', 'code': '23311000087100'}]}]

In [49]:
# Attach the common ID to every body part whose RadLex ID appears in common_ids,
# and tally how many records did / did not receive one.
count_added = 0
for body_part in body_parts:
    # Remove the misspelled "commonIdd" key if a record carries it
    body_part.pop("commonIdd", None)
    matched_id = common_ids.get(body_part["radlexId"])
    if matched_id is None:
        continue
    body_part["commonId"] = matched_id
    count_added += 1
count_not_added = len(body_parts) - count_added
count_added, count_not_added

(2025, 876)

In [None]:
result = await anatomic_locations_raw.insert_many(body_parts)
len(result.inserted_ids)

### Processing Raw


In [6]:
import random
import re
from pprint import pprint
from typing import Annotated, Any, Literal

from annotated_types import MinLen
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, ValidationError
from pydantic.alias_generators import to_camel

In [39]:
def check_anatomic_location_id(v: str) -> str:
    """Validate and normalize an anatomic location ID.

    Surrounding whitespace is stripped. The result must be ``RID`` followed by at
    least two digits, optionally followed by further ``_RID<digits>`` segments
    (e.g. ``RID2142`` or ``RID3129_RID2996_RID5825``).

    Args:
        v: Candidate identifier.

    Returns:
        The stripped identifier.

    Raises:
        ValueError: If ``v`` is not a string or does not match the pattern.
    """
    # Reject non-strings explicitly: calling .strip() on them would raise
    # AttributeError, which pydantic does not turn into a ValidationError.
    if not isinstance(v, str):
        raise ValueError("Invalid anatomic location ID")
    v = v.strip()
    if re.fullmatch(r"RID\d{2,}(_RID\d{2,})*", v) is None:
        raise ValueError("Invalid anatomic location ID")
    return v


def check_numeric_string(v: str) -> str:
    """Validate and normalize a string made only of digits.

    Surrounding whitespace is stripped; the result must consist of three or more digits.

    Args:
        v: Candidate value.

    Returns:
        The stripped digit string.

    Raises:
        ValueError: If ``v`` is not a string or is not three or more digits.
    """
    # Reject non-strings explicitly: calling .strip() on them would raise
    # AttributeError, which pydantic does not turn into a ValidationError.
    if not isinstance(v, str):
        raise ValueError("Invalid numeric string")
    v = v.strip()
    if re.fullmatch(r"\d{3,}", v) is None:
        raise ValueError("Invalid numeric string")
    return v


def check_compound_numeric_string(v: str) -> str:
    """Validate and normalize one or more underscore-joined digit groups.

    Surrounding whitespace is stripped; the result must be a group of three or more
    digits, optionally followed by further ``_<digits>`` groups of the same form
    (e.g. ``772137004`` or ``123_4567``).

    Args:
        v: Candidate value.

    Returns:
        The stripped string.

    Raises:
        ValueError: If ``v`` is not a string or does not match the pattern.
    """
    # Reject non-strings explicitly: calling .strip() on them would raise
    # AttributeError, which pydantic does not turn into a ValidationError.
    if not isinstance(v, str):
        raise ValueError("Invalid compound numeric strings")
    v = v.strip()
    if re.fullmatch(r"\d{3,}(_\d{3,})*", v) is None:
        raise ValueError("Invalid compound numeric strings")
    return v

In [40]:
# Constrained string types shared by the models below. The BeforeValidator hooks run
# on the raw input, so each check_* function sees the value before pydantic's own str handling.

# "RID" + digits, optionally with further "_RID" + digits segments; whitespace is stripped.
AnatomicLocationId = Annotated[str, BeforeValidator(check_anatomic_location_id)]
# Three or more digits.
NumericString = Annotated[str, BeforeValidator(check_numeric_string)]
# One or more underscore-joined groups of three or more digits.
CompoundNumericString = Annotated[str, BeforeValidator(check_compound_numeric_string)]
# NOTE(review): despite the name this requires at least 3 characters, not merely a
# non-empty string -- confirm that is intended (it also applies to Code.code).
NonEmptyString = Annotated[str, MinLen(3)]
# Body region. The input is title-cased first, so a raw value such as "Upper extremity"
# matches the "Upper Extremity" literal.
Region = Annotated[
    Literal["Body", "Head", "Neck", "Thorax", "Upper Extremity", "Breast", "Abdomen", "Pelvis", "Lower Extremity"],
    BeforeValidator(lambda v: v.title()),
]

In [41]:
class AnatomicLocationRef(BaseModel):
    """Reference to another anatomic location: its ID plus an optional display label."""

    # Field aliases are generated with to_camel; fields may also be populated by their
    # Python names, and values are re-validated on attribute assignment.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, validate_assignment=True)

    # ID of the referenced location (validated by check_anatomic_location_id)
    id: AnatomicLocationId
    # Label for the referenced location; may be omitted
    display: NonEmptyString | None = None

In [42]:
class Code(BaseModel):
    """A code for a location in an external coding system (e.g. system "FMA", code "26002")."""

    # Field aliases are generated with to_camel; fields may also be populated by their
    # Python names, and values are re-validated on attribute assignment.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True, validate_assignment=True)

    # Name of the coding system
    system: NonEmptyString
    # The code within that system (NonEmptyString enforces a minimum length of 3)
    code: NonEmptyString
    # Optional label for the code
    display: NonEmptyString | None = None

In [43]:
class AnatomicLocation(BaseModel):
    """Processed anatomic location record, as written to the ``anatomic_locations`` collection.

    Optional fields default to ``None``; the list-valued fields, when present, must hold
    at least one element (``MinLen(1)``).
    """

    # Field aliases are generated with to_camel (except where a field sets its own alias);
    # fields may also be populated by their Python names, and values are re-validated on
    # attribute assignment.
    model_config = ConfigDict(
        alias_generator=to_camel,
        populate_by_name=True,
        validate_assignment=True,
    )

    # Location ID; aliased to "_id" so it becomes the MongoDB document key
    id: AnatomicLocationId = Field(alias="_id")
    # Filled from the raw record's "commonId" by raw_json_to_anatomic_location
    acr_common_id: NumericString | None = None
    # First SNOMED code of the raw record and its name from snomed_names
    snomed_id: CompoundNumericString | None = None
    snomed_display: NonEmptyString | None = None
    description: NonEmptyString
    region: Region
    # Containment relation: the containing location, and the locations this one contains
    contained_by_ref: AnatomicLocationRef | None = None
    contains_refs: Annotated[list[AnatomicLocationRef], MinLen(1)] | None = None
    synonyms: Annotated[list[str], MinLen(1)] | None = None
    # Part-of relation: the whole this location is part of, and this location's parts
    part_of_ref: AnatomicLocationRef | None = None
    has_parts_refs: Annotated[list[AnatomicLocationRef], MinLen(1)] | None = None
    # Laterality relatives (left / right / unsided variants of the same structure)
    left_ref: AnatomicLocationRef | None = None
    right_ref: AnatomicLocationRef | None = None
    unsided_ref: AnatomicLocationRef | None = None
    sex_specific: Literal["Male", "Female"] | None = None
    # Codes from systems other than SNOMED (SNOMED is held in snomed_id / snomed_display)
    codes: Annotated[list[Code], MinLen(1)] | None = None

In [80]:
anatomic_locations_data = {}
async for anatomic_location in anatomic_locations_raw.find({}):
    anatomic_locations_data[anatomic_location["radlexId"]] = anatomic_location

In [81]:
# Build the inverse relationships. Each location is recorded, as a
# (radlexId, description) tuple, in the "contains" list of its containedById target and
# in the "hasParts" list of its partOfId target. Targets that are not present in
# anatomic_locations_data are reported and skipped.
for anatomic_location in anatomic_locations_data.values():
    this_ref = (anatomic_location["radlexId"], anatomic_location["description"])
    for parent_key, children_key, label in (
        ("containedById", "contains", "Contained by ID"),
        ("partOfId", "hasParts", "Part of ID"),
    ):
        if parent_key not in anatomic_location:
            continue
        parent_id = anatomic_location[parent_key]
        if parent_id not in anatomic_locations_data:
            print(f"{label} {parent_id} for {anatomic_location['radlexId']} not found")
            continue
        children = anatomic_locations_data[parent_id].setdefault(children_key, [])
        # This cell mutates the shared records in place; skipping refs that are already
        # listed keeps a re-run of the cell from appending every child a second time.
        if this_ref not in children:
            children.append(this_ref)

In [84]:
# Normalize and check region values: "Back" is rewritten to "Thorax" in place, and any
# record whose region is missing or not one of the accepted names (compared after
# title-casing) is reported.
valid_regions = (
    "Body",
    "Head",
    "Neck",
    "Thorax",
    "Upper Extremity",
    "Breast",
    "Abdomen",
    "Pelvis",
    "Lower Extremity",
)
for anatomic_location in anatomic_locations_data.values():
    region = anatomic_location.get("region", None)
    if region == "Back":
        # Update the local as well, so the check below sees the corrected value instead
        # of reporting the just-fixed record as invalid.
        region = anatomic_location["region"] = "Thorax"
    if not region or region.title() not in valid_regions:
        # Use the local rather than indexing the record: a missing "region" key must be
        # reported here, not raise KeyError.
        print(
            f"Invalid region value {region} for {anatomic_location['radlexId']} ({anatomic_location['description']})"
        )

In [86]:
raw_data = anatomic_locations_data["RID10109_RID5825"]
pprint(raw_data)

{'_id': ObjectId('6669acb587b7161a0c6d2736'),
 'codes': [{'code': '772137004', 'system': 'SNOMED'}],
 'containedById': 'RID13303_RID5825',
 'description': 'right middle ear cavity',
 'leftId': 'RID10109_RID5824',
 'radlexId': 'RID10109_RID5825',
 'region': 'Head',
 'unsidedId': 'RID10109'}


In [58]:
snomed_collection = client["ontologies"]["snomedct"]

In [50]:
# Collect, into a set, the first SNOMED code listed on each location (if it has one)
snomed_ids = set()
for anatomic_location in anatomic_locations_data.values():
    for entry in anatomic_location.get("codes", ()):
        if entry["system"] != "SNOMED":
            continue
        snomed_ids.add(entry["code"])
        # Only the first SNOMED entry per location is taken
        break

In [52]:
# Get the SNOMED preferredTerm for each SNOMED ID and put them in a dictionary
snomed_names = {}
cursor = snomed_collection.find({"conceptId": {"$in": list(snomed_ids)}})
async for snomed_data in cursor:
    snomed_names[snomed_data["conceptId"]] = snomed_data["preferredTerm"]

In [64]:
# Get the snomed codes in snomed_ids but not in snomed_names
missing_snomed_ids = snomed_ids - set(snomed_names.keys())
missing_snomed_ids

{'16217661000119100', '16217701000119100', '75095003'}

In [90]:
# Collect the raw records that carry at least one SNOMED code listed in missing_snomed_ids
missing_snomed_data = [
    anatomic_location
    for anatomic_location in anatomic_locations_data.values()
    if any(
        code["system"] == "SNOMED" and code["code"] in missing_snomed_ids
        for code in anatomic_location.get("codes", ())
    )
]
missing_snomed_data

[]

In [88]:
async def get_snomed_name_from_id(snomed_id: str) -> str:
    """Look up the preferred term stored for a SNOMED concept ID.

    Args:
        snomed_id: Value matched against the ``conceptId`` field of ``snomed_collection``.

    Returns:
        The ``preferredTerm`` of the matching document.

    Raises:
        LookupError: If no document has that ``conceptId``.
    """
    snomed_data = await snomed_collection.find_one({"conceptId": snomed_id})
    if snomed_data is None:
        # find_one yields None when nothing matches; name the offending ID instead of
        # failing later with "'NoneType' object is not subscriptable".
        raise LookupError(f"No SNOMED concept found for conceptId {snomed_id!r}")
    return snomed_data["preferredTerm"]


left_deltoid = await get_snomed_name_from_id("16217701000119102")
right_deltoid = await get_snomed_name_from_id("16217661000119109")
c56_disc = await get_snomed_name_from_id("73959003")
print(left_deltoid, right_deltoid, c56_disc)

Structure of left deltoid muscle Structure of right deltoid muscle Intervertebral disc structure of fifth cervical vertebra


In [89]:
# Manual SNOMED corrections: (RadLex ID, replacement SNOMED concept ID, preferred term
# fetched in the previous cell).
# NOTE(review): presumably these are the records behind the missing IDs reported
# earlier -- confirm.
snomed_errors = [
    ("RID43059", "16217661000119109", right_deltoid),
    ("RID43060", "16217701000119102", left_deltoid),
    ("RID50392_RID6185", "73959003", c56_disc),
]
for radlex_id, snomed_id, snomed_name in snomed_errors:
    location = anatomic_locations_data[radlex_id]
    for code in location["codes"]:
        if code["system"] == "SNOMED":
            # Rewrites every SNOMED entry on the record, mutating the shared dict in place
            code["code"] = snomed_id
    # Register the term so lookups in snomed_names succeed for the replacement ID
    snomed_names[snomed_id] = snomed_name

In [99]:
anatomic_location_ids = list(anatomic_locations_data.keys())


def random_anatomic_location_data():
    """Return the raw record of one location chosen at random from ``anatomic_location_ids``."""
    chosen_id = random.choice(anatomic_location_ids)
    return anatomic_locations_data[chosen_id]


pprint(random_anatomic_location_data())

{'_id': ObjectId('6669acb587b7161a0c6d2998'),
 'codes': [{'code': '74633', 'system': 'FMA'},
           {'code': '37512009', 'system': 'SNOMED'},
           {'code': 'A06.300.747.875 | A06.688.178.875 | A06.688.357.750.875 | '
                    'A08.186.211.180.497.352.435.500.875 | '
                    'A08.186.211.200.317.357.352.435.500.875 | A08.713.049.875 '
                    '| A08.713.357.750.875',
            'system': 'MESH'},
           {'code': 'C1184201', 'system': 'UMLS'}],
 'commonId': '4015539',
 'containedById': 'RID6537',
 'description': 'zone of neurohypophysis',
 'partOfId': 'RID38674',
 'radlexId': 'RID15527',
 'region': 'Head',
 'synonyms': ['subdivision of posterior lobe of pituitary gland',
              'neurohypophysis zone']}


In [100]:
pprint(anatomic_locations_data["RID39518"])

{'_id': ObjectId('6669acb587b7161a0c6d224d'),
 'codes': [{'code': '25202', 'system': 'FMA'},
           {'code': '16982005', 'system': 'SNOMED'},
           {'code': 'A01.378.800.750', 'system': 'MESH'}],
 'commonId': '4015252',
 'containedById': 'RID1850',
 'contains': [('RID1860', 'scapula'),
              ('RID40098', 'body of scapula'),
              ('RID1854', 'clavicle'),
              ('RID1862', 'acromion'),
              ('RID1863', 'coracoid process'),
              ('RID1905', 'glenohumeral joint'),
              ('RID1895', 'acromioclavicular joint'),
              ('RID1884', 'sternoclavicular joint'),
              ('RID1892', 'costoclavicular ligament'),
              ('RID1915', 'coracoacromial ligament'),
              ('RID1901', 'coracoclavicular ligament'),
              ('RID1944', 'tendon of long head of biceps brachii'),
              ('RID1945', 'tendon of short head of biceps brachii'),
              ('RID860', 'axillary artery'),
              ('RID1014', 'ax

In [105]:
def make_ref_from_key_if_key(data: dict[str, Any], key: str) -> AnatomicLocationRef | None:
    """Build a reference to the location whose ID is stored under ``key`` in ``data``.

    Args:
        data: Raw location record.
        key: Name of the field holding the referenced location's ID (e.g. ``"leftId"``).

    Returns:
        An ``AnatomicLocationRef`` whose display text is the referenced record's
        description, or ``None`` when ``key`` is absent from ``data`` or holds a falsy value.

    Raises:
        KeyError: If the referenced ID is not present in ``anatomic_locations_data``.
    """
    ref_id = data.get(key)
    if not ref_id:
        return None
    return AnatomicLocationRef(id=ref_id, display=anatomic_locations_data[ref_id]["description"])


def raw_json_to_anatomic_location(raw_data: dict[str, Any]) -> AnatomicLocation:
    """Convert one raw location record into a validated ``AnatomicLocation``.

    The required fields (``radlexId``, ``description``, ``region``) build the model;
    every optional field is then assigned only when the raw record holds a truthy value
    for it. Reads the module-level ``snomed_names`` mapping and, through
    ``make_ref_from_key_if_key``, ``anatomic_locations_data``.

    Args:
        raw_data: Raw record, including any ``contains`` / ``hasParts`` lists of
            ``(id, description)`` pairs added to it beforehand.

    Returns:
        The populated ``AnatomicLocation``.

    Raises:
        ValidationError: If a value fails model validation. When the required fields
            fail, the raw record is printed first.
        KeyError: If a required key is missing, the first SNOMED code has no entry in
            ``snomed_names``, or a referenced location ID is unknown.
    """
    try:
        location = AnatomicLocation(
            id=raw_data["radlexId"], description=raw_data["description"], region=raw_data["region"]
        )
    except ValidationError:
        # Show the offending record, then re-raise the same exception unchanged
        print(raw_data)
        raise
    if common_id := raw_data.get("commonId"):
        location.acr_common_id = common_id
    if raw_codes := raw_data.get("codes"):
        snomed_code_list = [code["code"] for code in raw_codes if code["system"] == "SNOMED"]
        if snomed_code_list:
            # Only the first SNOMED code is kept; it is held in dedicated fields
            location.snomed_id = snomed_code_list[0]
            location.snomed_display = snomed_names[snomed_code_list[0]]
        # Everything that is not SNOMED goes into the generic codes list
        codes = [Code(**code) for code in raw_codes if code["system"] != "SNOMED"]
        if codes:
            location.codes = codes
    if contained_by := make_ref_from_key_if_key(raw_data, "containedById"):
        location.contained_by_ref = contained_by
    if contains := raw_data.get("contains"):
        location.contains_refs = [AnatomicLocationRef(id=ref[0], display=ref[1]) for ref in contains]
    if part_of := make_ref_from_key_if_key(raw_data, "partOfId"):
        # NOTE(review): an earlier version wrapped this assignment in a check against the
        # IDs already used (self / containedBy) but then assigned unconditionally right
        # after, so the check never had any effect. The dead check and its bookkeeping
        # list are removed; the effective behaviour (always set part_of_ref) is kept.
        # Confirm whether suppressing a partOf that duplicates containedBy was intended.
        location.part_of_ref = part_of
    if has_parts := raw_data.get("hasParts"):
        location.has_parts_refs = [AnatomicLocationRef(id=ref[0], display=ref[1]) for ref in has_parts]
    if left := make_ref_from_key_if_key(raw_data, "leftId"):
        location.left_ref = left
    if right := make_ref_from_key_if_key(raw_data, "rightId"):
        location.right_ref = right
    if unsided := make_ref_from_key_if_key(raw_data, "unsidedId"):
        location.unsided_ref = unsided
    if sex_specific := raw_data.get("sexSpecific"):
        location.sex_specific = sex_specific
    if synonyms := raw_data.get("synonyms"):
        location.synonyms = synonyms
    return location

In [108]:
raw_data = random_anatomic_location_data()
pprint(raw_data)
location = raw_json_to_anatomic_location(raw_data)
print(location.model_dump_json(by_alias=True, indent=2, exclude_none=True))

{'_id': ObjectId('6669acb587b7161a0c6d228a'),
 'codes': [{'code': '26002', 'system': 'FMA'}],
 'commonId': '4014905',
 'containedById': 'RID39518_RID5825',
 'description': 'right coracoacromial ligament',
 'leftId': 'RID40843',
 'radlexId': 'RID40842',
 'region': 'Upper extremity',
 'synonyms': ['right coraco-acromial ligament'],
 'unsidedId': 'RID1915'}
{
  "_id": "RID40842",
  "acrCommonId": "4014905",
  "description": "right coracoacromial ligament",
  "region": "Upper Extremity",
  "containedByRef": {
    "id": "RID39518_RID5825",
    "display": "right shoulder"
  },
  "synonyms": [
    "right coraco-acromial ligament"
  ],
  "leftRef": {
    "id": "RID40843",
    "display": "left coracoacromial ligament"
  },
  "unsidedRef": {
    "id": "RID1915",
    "display": "coracoacromial ligament"
  },
  "codes": [
    {
      "system": "FMA",
      "code": "26002"
    }
  ]
}


In [115]:
anatomic_locations_collection = client["ontologies"]["anatomic_locations"]
count = await anatomic_locations_collection.count_documents({})
count

2901

In [110]:
anatomic_locations = [raw_json_to_anatomic_location(data) for data in anatomic_locations_data.values()]
len(anatomic_locations)

2901

In [112]:
# model_dump() is the pydantic v2 method (.dict() is deprecated there). by_alias=True
# emits the alias keys, including "_id", and exclude_none drops the unset optional fields.
anatomic_locations_dicts = [location.model_dump(by_alias=True, exclude_none=True) for location in anatomic_locations]
# With Motor, insert_many hands back a pending Future (see the output below); the
# InsertManyResult is obtained by awaiting `result` in the next cell.
result = anatomic_locations_collection.insert_many(anatomic_locations_dicts)
pprint(result)

<Future pending cb=[_chain_future.<locals>._call_check_cancel() at /opt/homebrew/Cellar/python@3.11/3.11.9/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/futures.py:387]>


In [114]:
pprint(await result)

InsertManyResult(['RID56', 'RID905', 'RID294', 'RID294_RID5824', 'RID294_RID5825', 'RID935', 'RID935_RID5824', 'RID935_RID5825', 'RID88', 'RID30324', 'RID30325', 'RID164', 'RID480', 'RID581', 'RID1394', 'RID168', 'RID580', 'RID34819', 'RID1183', 'RID1185', 'RID157', 'RID33652', 'RID237', 'RID582', 'RID1011', 'RID1011_RID5824', 'RID1011_RID5825', 'RID29895', 'RID29896', 'RID29897', 'RID29898', 'RID29899', 'RID29900', 'RID29903', 'RID29904', 'RID29905', 'RID29906', 'RID29907', 'RID29908', 'RID29914', 'RID29915', 'RID29916', 'RID29917', 'RID29918', 'RID29919', 'RID29928', 'RID29929', 'RID29930', 'RID29931', 'RID29932', 'RID29933', 'RID29934', 'RID29935', 'RID29936', 'RID29937', 'RID29938', 'RID29939', 'RID29940', 'RID29941', 'RID29942', 'RID29943', 'RID29944', 'RID29945', 'RID29946', 'RID29947', 'RID29948', 'RID29949', 'RID29950', 'RID29951', 'RID29952', 'RID29953', 'RID29954', 'RID29955', 'RID1246', 'RID1248', 'RID154', 'RID906', 'RID34571', 'RID2468', 'RID510', 'RID199', 'RID909', 'RID9

In [117]:
location_data = await anatomic_locations_collection.find_one({"_id": "RID10109_RID5825"})
pprint(location_data)

{'_id': 'RID10109_RID5825',
 'containedByRef': {'display': 'right middle ear', 'id': 'RID13303_RID5825'},
 'description': 'right middle ear cavity',
 'leftRef': {'display': 'left middle ear cavity', 'id': 'RID10109_RID5824'},
 'region': 'Head',
 'snomedDisplay': 'Structure of tympanic cavity of right ear',
 'snomedId': '772137004',
 'unsidedRef': {'display': 'middle ear cavity', 'id': 'RID10109'}}


In [118]:
location = AnatomicLocation(**location_data)
print(location.model_dump_json(by_alias=True, indent=2, exclude_none=True))

{
  "_id": "RID10109_RID5825",
  "snomedId": "772137004",
  "snomedDisplay": "Structure of tympanic cavity of right ear",
  "description": "right middle ear cavity",
  "region": "Head",
  "containedByRef": {
    "id": "RID13303_RID5825",
    "display": "right middle ear"
  },
  "leftRef": {
    "id": "RID10109_RID5824",
    "display": "left middle ear cavity"
  },
  "unsidedRef": {
    "id": "RID10109",
    "display": "middle ear cavity"
  }
}


In [119]:
pprint(location)

AnatomicLocation(id='RID10109_RID5825', acr_common_id=None, snomed_id='772137004', snomed_display='Structure of tympanic cavity of right ear', description='right middle ear cavity', region='Head', contained_by_ref=AnatomicLocationRef(id='RID13303_RID5825', display='right middle ear'), contains_refs=None, synonyms=None, part_of_ref=None, has_parts_refs=None, left_ref=AnatomicLocationRef(id='RID10109_RID5824', display='left middle ear cavity'), right_ref=None, unsided_ref=AnatomicLocationRef(id='RID10109', display='middle ear cavity'), sex_specific=None, codes=None)


### Adding Definitions


In [None]:
from openimagingdatamodel.ontology_tools.radlex_concept_repo import RadLexConceptRepo

# NOTE(review): `db` is not defined anywhere in this notebook, so this cell fails on a
# fresh kernel. Presumably it should be a database handle such as client["ontologies"] -- confirm.
radlex_collection = db["radlex"]
radlex_repo = RadLexConceptRepo(radlex_collection)
# NOTE(review): called without await, while the client created at the top of the
# notebook is Motor's async one -- confirm which kind of collection RadLexConceptRepo expects.
count = radlex_repo.get_count()
count

46761

In [None]:
# Get all the IDs in the anatomic locations collection
# NOTE(review): `collection` is not defined anywhere in this notebook (the handle created
# earlier is `anatomic_locations_collection`), and find() is iterated with a plain `for`
# here whereas the Motor cursors above are consumed with `async for` -- presumably this
# cell was written against a synchronous PyMongo collection; confirm.
# This also rebinds anatomic_location_ids, previously the list of anatomic_locations_data keys.
anatomic_location_ids = [location["_id"] for location in collection.find({})]
len(anatomic_location_ids)

2901

In [None]:
# Get all the RadLex concepts with IDs in the list of anatomic_location_ids and have a definition field set
# The projection limits each returned document to _id and definition; concepts without
# a definition field are filtered out by the $exists condition.
# NOTE(review): iterated synchronously below -- see the note on `radlex_collection` about
# sync vs. Motor handles.
radlex_concepts = radlex_collection.find(
    {"_id": {"$in": anatomic_location_ids}, "definition": {"$exists": True}}, {"_id": 1, "definition": 1}
)
# Location ID -> definition text
definitions = {concept["_id"]: concept["definition"] for concept in radlex_concepts}

In [None]:
from pymongo import UpdateOne

# One $set operation per location that has a definition, sent as a single bulk write
updates = [
    UpdateOne({"_id": location_id}, {"$set": {"definition": definition}})
    for location_id, definition in definitions.items()
]

result = collection.bulk_write(updates)
result.modified_count

### Create ancestorRefs
