In [5]:
import random
from openai import AsyncOpenAI
import motor.motor_asyncio
from dotenv import dotenv_values
from openimagingdatamodel.ontology_tools.snomedct_concept import SnomedCTConcept

Load Env and Database


In [2]:
# Read the key/value settings from the local .env file.
config = dotenv_values(".env")
# Create the async MongoDB client from the ATLAS_DSN connection string.
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Request the server info to confirm the connection works, and print it.
print(await client.server_info())

{'version': '7.0.11', 'gitVersion': 'f451220f0df2b9dfe073f1521837f8ec5c208a8c', 'modules': ['enterprise'], 'allocator': 'tcmalloc', 'javascriptEngine': 'mozjs', 'sysInfo': 'deprecated', 'versionArray': [7, 0, 11, 0], 'bits': 64, 'debug': False, 'maxBsonObjectSize': 16777216, 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1717764150, 6), 'signature': {'hash': b'\x1d3\xce?\xb6\xd3P\xd9x\x94\xc0\x0b\n\xf1\x17y\xec!L\x96', 'keyId': 7327016299177967635}}, 'operationTime': Timestamp(1717764150, 6)}


In [10]:
# Select the "ontologies" database and its "snomedct" collection.
db = client["ontologies"]
snomedct_collection = db["snomedct"]
# Count all documents (empty filter); the sampling cell further down uses this
# total to bound its random skip offset.
SNOMED_CONCEPT_COUNT = await snomedct_collection.count_documents({})
print(f"SNOMED CT COUNT: {SNOMED_CONCEPT_COUNT}")

SNOMED CT COUNT: 508540


Generate Embedding


In [7]:
# Async OpenAI client, authenticated with the OPENAI_API_KEY value from the .env config.
openai_client = AsyncOpenAI(api_key=config["OPENAI_API_KEY"])

In [8]:
async def get_embedding(text, model="text-embedding-3-large", dimensions: int = 1024):
    """
    Request an embedding vector for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed. Newlines are replaced with spaces before sending.
        model: Name of the embedding model to use.
        dimensions: Value passed through as the ``dimensions`` request parameter.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Flatten the input to a single line before sending it.
    single_line = text.replace("\n", " ")
    # The request carries exactly one input, so only the first result is relevant.
    result = await openai_client.embeddings.create(
        input=[single_line],
        model=model,
        dimensions=dimensions,
    )
    first_item = result.data[0]
    return first_item.embedding

Get SNOMED CT Combined Text


In [9]:
def text_for_embedding(document):
    """
    Build the single text string that is embedded for one document.

    Up to three labelled segments are produced, in this order, and joined
    with "; ":

    * "Preferred term: ..." from ``document["preferredTerm"]``
    * "Other terms: ..." from ``document["terms"]``, joined with ", "
    * "Definition: ..." from ``document["definitions"]``, joined with " "

    A field that is missing or falsy (empty string, empty list, None) is
    left out entirely. If no field contributes, the result is "".
    """
    preferred = document.get("preferredTerm")
    alternates = document.get("terms")
    definitions = document.get("definitions")

    segments = []
    if preferred:
        segments.append(f"Preferred term: {preferred}")
    if alternates:
        segments.append("Other terms: " + ", ".join(alternates))
    if definitions:
        segments.append("Definition: " + " ".join(definitions))

    return "; ".join(segments)

In [13]:
cursor = (
    snomedct_collection.find({}, {"preferredTerm": 1, "terms": 1, "definitions": 1})
    .limit(15)
    .skip(random.randint(0, SNOMED_CONCEPT_COUNT - 15))
)

# Get the combined text for embedding
async for document in cursor:
    embedding_text = text_for_embedding(document)
    print(embedding_text)
    vector = await get_embedding(embedding_text)
    print("Got vector length ", len(vector))

Preferred term: Family history: Mother; Other terms: Family history with explicit context pertaining to mother, Family history with explicit context pertaining to mother (situation), FH: Mother
Got vector length  1024
Preferred term: Family history: Mother alive and well; Other terms: FH: Mother alive and well
Got vector length  1024
Preferred term: Family history: Mother alive with problem; Other terms: FH: Mother alive with problem
Got vector length  1024
Preferred term: Mother deceased; Other terms: Family history: Mother dead (situation), Family history: Mother dead (context-dependent category), FH: Mother dead, Family history: Mother dead
Got vector length  1024
Preferred term: Leishmania donovani donovani
Got vector length  1024
Preferred term: Family history: Mother unwell; Other terms: FH: Mother unwell
Got vector length  1024
Preferred term: Family history: Mother NOS; Other terms: FH: Mother NOS
Got vector length  1024
Preferred term: Family history: Father; Other terms: Fami