# Ontology Tools Experiment Notebook

- This notebook contains experiments using the Ontology Tools.

- Demo `transform_radlex()` function

  1. Accessing the "ontologies" database in MongoDB.
  2. Demo the `transform_radlex()` function on 1 JSON document.
  3. Demo the `transform_radlex()` function on multiple JSON documents.

- Demo JSON dump using the `batch_transform_and_write_to_db` function

  1. Demo 100 JSON documents dump.

- Final Script in notebook
  1. Run 100 document batches => write back to `radlex` collection in `ontologies` database within MongoDB.


In [1]:
# import local modules
# import MongoDB modules
import motor.motor_asyncio
from dotenv import dotenv_values
from openimagingdatamodel.ontology_tools import RadLexConcept, transform_radlex

In [3]:
# Establish the MongoDB connection; the connection string is read from the
# ATLAS_DSN entry of the local .env file rather than being hard-coded.
config = dotenv_values(".env")
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Database and collection handles used by the cells that follow
db = client["ontologies"]
collection = db["radlex"]
# Get the count of documents in the collection to confirm that the data was loaded
count = await collection.count_documents({})
print(f"Count of documents in the collection: {count}")

Count of documents in the collection: 46761


In [4]:
# Access the ontologies database and "RadLex" collection
doc = await collection.find_one()

In [5]:
print(doc)

{'_id': 'RID38411', 'preferredLabel': 'postcentral branch of spinal branch of left third lumbar artery', 'parent': 'RID38409', 'radlexProperties': {'fmaid': '15206', 'preferredNameGerman': 'Ramus postcentralis des Ramus spinalis der linken Arteria lumbalis III'}}


In [6]:
concept = RadLexConcept.model_validate(doc)

In [7]:
print(concept.model_dump_json(indent=2, by_alias=True))

{
  "_id": "RID38411",
  "preferredLabel": "postcentral branch of spinal branch of left third lumbar artery",
  "synonyms": null,
  "parent": "RID38409",
  "definition": null,
  "radlexProperties": {
    "fmaid": "15206",
    "preferredNameGerman": "Ramus postcentralis des Ramus spinalis der linken Arteria lumbalis III"
  }
}


In [13]:
collection = db["snomedct"]
SNOMED_CT_COUNT = await collection.count_documents({})

In [23]:
import random

random_doc = await collection.find_one({}, skip=random.randint(0, SNOMED_CT_COUNT))
print(random_doc)

{'_id': '23381000087109', 'conceptId': '23381000087109', 'effectiveDate': '2018-07-31', 'modules': ['SNOMED-CT-core'], 'languageCode': 'en', 'preferredTerm': 'Structure of proximal interphalangeal joint of left little finger', 'terms': ['Proximal interphalangeal joint of left little finger', 'Proximal interphalangeal joint of fifth digit of left hand'], 'semanticTags': ['body structure'], 'caseSignificance': 'insensitive', 'definitions': None}


In [18]:
from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class SnomedCTConcept(BaseModel):
    """Pydantic model for one SNOMED CT concept document.

    Fields are declared in snake_case; ``alias_generator=to_camel`` gives each a
    camelCase alias (e.g. ``preferred_term`` -> ``preferredTerm``), which is the
    key style of the stored documents and of ``model_dump_json(by_alias=True)``.
    """

    model_config = ConfigDict(
        populate_by_name=True,  # accept the snake_case field name as well as the alias
        coerce_numbers_to_str=True,  # numeric input for a str field is converted to str
        alias_generator=to_camel,  # snake_case field name -> camelCase alias
        validate_assignment=True,  # re-validate when an attribute is assigned
    )
    concept_id: str
    effective_date: str
    modules: list[str] | None = None
    language_code: str
    preferred_term: str
    terms: list[str] | None = None
    case_significance: str
    definitions: list[str] | None = None

In [24]:
doc = SnomedCTConcept.model_validate(random_doc)
print(doc.model_dump_json(indent=2, by_alias=True))

{
  "conceptId": "23381000087109",
  "effectiveDate": "2018-07-31",
  "modules": [
    "SNOMED-CT-core"
  ],
  "languageCode": "en",
  "preferredTerm": "Structure of proximal interphalangeal joint of left little finger",
  "terms": [
    "Proximal interphalangeal joint of left little finger",
    "Proximal interphalangeal joint of fifth digit of left hand"
  ],
  "caseSignificance": "insensitive",
  "definitions": null
}


In [32]:
class SnomedCTConceptRepo:
    """Thin repository that reads SNOMED CT concepts from a Motor collection."""

    def __init__(self, collection: motor.motor_asyncio.AsyncIOMotorCollection):
        """Remember the collection that lookups are issued against."""
        self.collection = collection

    async def get_concept(self, concept_id: str) -> SnomedCTConcept:
        """Fetch the document whose ``conceptId`` equals ``concept_id`` and validate it.

        The ``find_one`` result is handed to ``SnomedCTConcept.model_validate``
        unchanged; there is no separate handling for a lookup that matches nothing.
        """
        query = {"conceptId": concept_id}
        raw_doc = await self.collection.find_one(query)
        return SnomedCTConcept.model_validate(raw_doc)

In [33]:
repo = SnomedCTConceptRepo(collection)

In [34]:
doc = await repo.get_concept("23381000087109")

In [35]:
doc.preferred_term

'Structure of proximal interphalangeal joint of left little finger'

In [30]:
import motor.motor_asyncio

# Report whether `collection` is an instance of motor.MotorCollection
print(
    "collection is of type motor.MotorCollection"
    if isinstance(collection, motor.MotorCollection)
    else "collection is not of type motor.MotorCollection"
)

collection is not of type motor.MotorCollection


### Demo `transform_radlex()` function before writing to MongoDB


#### Demo function on 1 document


In [5]:
t_doc: RadLexConcept = transform_radlex(doc)
print(t_doc.model_dump_json(indent=2, by_alias=True, exclude_unset=True, exclude_none=True))

{
  "_id": "RID38411",
  "preferredLabel": "postcentral branch of spinal branch of left third lumbar artery",
  "parent": "RID38409",
  "radlexProperties": {
    "fmaid": "15206",
    "preferredNameGerman": "Ramus postcentralis des Ramus spinalis der linken Arteria lumbalis III"
  }
}


#### Demo function on multiple documents


In [6]:
# Transform up to 100 documents and print each result as JSON,
# omitting fields that are unset or None.
docs = await collection.find({}).to_list(length=100)
for doc in docs:
    t_doc = transform_radlex(doc)
    print(t_doc.model_dump_json(indent=2, by_alias=True, exclude_unset=True, exclude_none=True))

{
  "_id": "RID38411",
  "preferredLabel": "postcentral branch of spinal branch of left third lumbar artery",
  "parent": "RID38409",
  "radlexProperties": {
    "fmaid": "15206",
    "preferredNameGerman": "Ramus postcentralis des Ramus spinalis der linken Arteria lumbalis III"
  }
}
{
  "_id": "RID35591",
  "preferredLabel": "string-of-pearls sign of bowel",
  "parent": "RID29023",
  "definition": "Oblique or horizontal row of air bubbles visible on abdominal radiograph; almost always indicates small bowel obstruction; air is trapped between valvulae conniventes along the superior wall of the intestine.",
  "radlexProperties": {
    "anatomicalSite": "RID132",
    "comment": "157",
    "mayBeCausedBy": "RID4962",
    "preferredNameGerman": "string-of-pearls sign of bowel (EN)",
    "relatedModality": "RID10345",
    "source": "Radiology 2000; 214:157-158"
  }
}
{
  "_id": "RID22326",
  "preferredLabel": "trunk of superficial transverse perineal muscle branch of perineal nerve",
  "pa

#### Code to check if "|" was removed from JSON docs


In [18]:
# check multiple docs if any instance of not transformed "|"
import json

docs = await collection.find({}).to_list(length=10)
for doc in docs:
    t_doc = transform_radlex(doc)
    t_doc_json = json.dumps(t_doc.dict())
    if "|" in t_doc_json:
        print(f"Error in transforming document {t_doc.id}: '|' character found.")

# Code to write transformed RadLex to new 'radlex' collection in MongoDB


In [19]:
doc = await collection.find_one()

print(doc)

{'_id': ObjectId('65f84ed1f80fad5323c79c1f'), 'Class ID': 'http://radlex.org/RID/RID35591', 'Preferred Label': 'string-of-pearls sign of bowel', 'Definitions': 'Oblique or horizontal row of air bubbles visible on abdominal radiograph; almost always indicates small bowel obstruction; air is trapped between valvulae conniventes along the superior wall of the intestine.', 'Obsolete': False, 'Parents': 'http://radlex.org/RID/RID29023', 'http://data': {'bioontology': {'org/metadata/prefixIRI': 'RID35591'}}, 'http://radlex': {'org/RID/Anatomical_Site': 'http://radlex.org/RID/RID132', 'org/RID/Comment': 'http://radiology.rsna.org/cgi/content/full/214/1/157', 'org/RID/Definition': 'Oblique or horizontal row of air bubbles visible on abdominal radiograph; almost always indicates small bowel obstruction; air is trapped between valvulae conniventes along the superior wall of the intestine.', 'org/RID/May_Be_Caused_By': 'http://radlex.org/RID/RID4962', 'org/RID/Preferred_name': 'string-of-pearls s

## Demo fetch docs from 'RadLex' collection in batches of 100 => demo JSON dumps


#### Define and run `fetch_and_demo_docs()` to print the transformed documents


In [None]:
async def fetch_and_demo_docs():
    """Print the transformed JSON for up to 100 documents read from ``collection``."""
    sample = await collection.find({}).to_list(length=100)

    for raw in sample:
        # Transform, then dump with aliases and without None-valued fields
        print(transform_radlex(raw).model_dump_json(indent=2, by_alias=True, exclude_none=True))


# Run the async function
await fetch_and_demo_docs()

In [17]:
from typing import AsyncGenerator

from motor.motor_asyncio import AsyncIOMotorCollection


async def get_documents_in_batches(
    collection: AsyncIOMotorCollection, batch_size: int = 100, limit: int | None = None, skip: int = 0
) -> AsyncGenerator[list[dict], None]:
    """Yield the collection's documents as consecutive lists of at most ``batch_size``.

    Args:
        collection: Motor collection to read from.
        batch_size: Maximum number of documents per yielded list; must be positive.
        limit: Maximum total number of documents to yield, counted from ``skip``.
            ``None`` (or 0) reads through to the end of the collection.
        skip: Number of leading documents (in ``_id`` order) to pass over.

    Yields:
        Non-empty lists of raw documents.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised on first iteration).
    """
    if batch_size <= 0:
        # A non-positive batch size would never advance the paging offset.
        raise ValueError("batch_size must be a positive integer")

    # Get the total count of documents in the collection
    total_count = await collection.count_documents({})
    print(f"Total count of documents in the collection: {total_count}")

    # `limit` counts documents starting at `skip`, so the stop offset is relative to it.
    end = min(total_count, skip + limit) if limit else total_count

    # Loop through the documents in batches
    while skip < end:
        # Never read past `end`, so the final batch honours `limit`.
        page_size = min(batch_size, end - skip)
        # skip/limit paging is only consistent over a fixed order; without a sort
        # the server does not guarantee the same order from one query to the next.
        docs = await collection.find({}, skip=skip, limit=page_size, sort=[("_id", 1)]).to_list(
            length=page_size
        )
        if not docs:
            # The collection shrank after it was counted; nothing is left to read.
            break
        yield docs

        # Advance by what was actually returned
        skip += len(docs)

In [18]:
from pydantic import ValidationError

new_collection = db["radlex"]

async for docs in get_documents_in_batches(collection, batch_size=250, skip=24250):
    transformed_docs = []
    for doc in docs:
        try:
            transformed_docs.append(transform_radlex(doc))
        except ValidationError as e:
            print(f"Error transforming document {doc['_id']}: {e}")
            print(doc)
    await new_collection.insert_many([doc.model_dump(by_alias=True, exclude_none=True) for doc in transformed_docs])
    print(f"Inserted {len(transformed_docs)} documents into the new collection.")

Total count of documents in the collection: 46761
Error transforming document 65f84ee1f80fad5323c7fb78: 1 validation error for RadLexConcept
preferred_label
  Input should be a valid string [type=string_type, input_value=False, input_type=bool]
    For further information visit https://errors.pydantic.dev/2.6/v/string_type
{'_id': ObjectId('65f84ee1f80fad5323c7fb78'), 'Class ID': 'http://radlex.org/RID/RID49835', 'Preferred Label': False, 'Obsolete': False, 'Parents': 'http://radlex.org/RID/RID49833', 'http://data': {'bioontology': {'org/metadata/prefixIRI': 'RID49835'}}, 'http://radlex': {'org/RID/Created': '2015-03-12T00:00:00', 'org/RID/Preferred_name': False, 'org/RID/Preferred_name_German': 'falsch', 'org/RID/Radlex_version_of_class_change': 3.13, 'org/RID/Source': 'Playbook'}}
Inserted 249 documents into the new collection.
Inserted 250 documents into the new collection.
Inserted 250 documents into the new collection.
Inserted 250 documents into the new collection.
Inserted 250 d

In [19]:
from bson import ObjectId

# Hand-corrected copies of two source documents (RID49834 and RID49835) with
# "Preferred Label" and "org/RID/Preferred_name" given as the strings
# "true"/"false". The batch run above logged RID49835 failing validation because
# its "Preferred Label" held the boolean False instead of a string.
# NOTE(review): RID49834 presumably failed the same way with True — confirm.
true_false_docs = [
    {
        "_id": ObjectId("65f84eedf80fad5323c83bb2"),
        "Class ID": "http://radlex.org/RID/RID49834",
        "Preferred Label": "true",
        "Obsolete": False,
        "Parents": "http://radlex.org/RID/RID49833",
        "http://data": {"bioontology": {"org/metadata/prefixIRI": "RID49834"}},
        "http://radlex": {
            "org/RID/Created": "2015-03-12T00:00:00",
            "org/RID/Preferred_name": "true",
            "org/RID/Preferred_name_German": "echt",
            "org/RID/Radlex_version_of_class_change": 3.13,
            "org/RID/Source": "Playbook",
        },
    },
    {
        "_id": ObjectId("65f84ee1f80fad5323c7fb78"),
        "Class ID": "http://radlex.org/RID/RID49835",
        "Preferred Label": "false",
        "Obsolete": False,
        "Parents": "http://radlex.org/RID/RID49833",
        "http://data": {"bioontology": {"org/metadata/prefixIRI": "RID49835"}},
        "http://radlex": {
            "org/RID/Created": "2015-03-12T00:00:00",
            "org/RID/Preferred_name": "false",
            "org/RID/Preferred_name_German": "falsch",
            "org/RID/Radlex_version_of_class_change": 3.13,
            "org/RID/Source": "Playbook",
        },
    },
]

In [22]:
t_docs = [transform_radlex(doc) for doc in true_false_docs]
await new_collection.insert_many([doc.model_dump(by_alias=True, exclude_none=True) for doc in t_docs])

InsertManyResult(['RID49834', 'RID49835'], acknowledged=True)

### Now run these scripts to access the new 'radlex' collection within the 'ontologies' database and then run batch script


In [8]:
# establish connection to MongoDB
config = dotenv_values(".env")
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
db = client["ontologies"]


##### change to the new collection "radlex"
collection_new = db["radLex"]


# Get the count of documents in the collection to confirm that the data was loaded—should be zero before we load data
count = await collection.count_documents({})
print(f"Count of documents in the collection: {count}")

Count of documents in the collection: 0


In [None]:
# import the batch_transform_and_write_to_db function
from openimagingdatamodel.ontology_tools import batch_transform_and_write_to_db

# Final Script to Write Transformed Documents to MongoDB `radlex` collection in `ontologies` database


In [None]:
async def fetch_transform_and_write_docs(source=None, target=None, batch_size: int = 100):
    """Transform every document of a source collection and write the results in batches.

    Args:
        source: Collection to read raw documents from. Defaults to the
            notebook-level ``collection``.
        target: Collection handed to ``batch_transform_and_write_to_db``. Defaults
            to ``db["radlex"]`` so that results are not written back into the
            collection that is being iterated.
        batch_size: Number of transformed documents accumulated per write.
    """
    import inspect

    # Resolve the defaults at call time so the current notebook globals are used.
    source = collection if source is None else source
    target = db["radlex"] if target is None else target

    async def flush(pending: list[dict]) -> None:
        # NOTE(review): batch_transform_and_write_to_db is imported from
        # openimagingdatamodel and its definition is not visible here. If it is a
        # coroutine function, a bare call only creates a coroutine object and
        # writes nothing, so await the result whenever it is awaitable.
        outcome = batch_transform_and_write_to_db(target, pending)
        if inspect.isawaitable(outcome):
            await outcome

    # Fetch documents in batches of `batch_size`
    cursor = source.find({}).batch_size(batch_size)

    batch = []
    async for doc in cursor:
        # Transform the document
        transformed_doc = transform_radlex(doc)

        # Print JSON dump of the transformed document (demo)
        print(transformed_doc.model_dump_json(indent=2))

        # Serialise the same way as the other write cells in this notebook:
        # by_alias emits the aliased keys (the dumps shown earlier use "_id" and
        # camelCase names), exclude_none drops empty fields.
        batch.append(transformed_doc.model_dump(by_alias=True, exclude_none=True))

        # Once the batch is full, write it to the database
        if len(batch) >= batch_size:
            await flush(batch)
            batch = []

    # Write any remaining documents in the last batch
    if batch:
        await flush(batch)


# Run the async function
await fetch_transform_and_write_docs()