In [1]:
import random
from openai import AsyncOpenAI
import motor.motor_asyncio
from dotenv import dotenv_values
from openimagingdatamodel.ontology_tools.snomedct_concept import SnomedCTConcept
from openimagingdatamodel.ontology_tools.snomedct_concept_repo import SnomedCTConceptRepo
from openimagingdatamodel.ontology_tools.utils import create_embedding_for_snomedctconcept

Load Env and Database


In [2]:
config = dotenv_values(".env")
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Ping the client to confirm
print(await client.server_info())  #

{'version': '7.0.11', 'gitVersion': 'f451220f0df2b9dfe073f1521837f8ec5c208a8c', 'modules': ['enterprise'], 'allocator': 'tcmalloc', 'javascriptEngine': 'mozjs', 'sysInfo': 'deprecated', 'versionArray': [7, 0, 11, 0], 'bits': 64, 'debug': False, 'maxBsonObjectSize': 16777216, 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1718035051, 18), 'signature': {'hash': b'),\xc2\x1d<\xa9\x17\xad.o\x92\xeb\x8b\xc2\xd2\xddI\x02L\x94', 'keyId': 7327016299177967635}}, 'operationTime': Timestamp(1718035051, 18)}


In [3]:
db = client["ontologies"]
snomedct_collection = db["snomedct"]
SNOMED_CONCEPT_COUNT = await snomedct_collection.count_documents({})
print(f"SNOMED CT COUNT: {SNOMED_CONCEPT_COUNT}")

SNOMED CT COUNT: 508540


Generate Embedding


In [4]:
# Async OpenAI client; the API key is read from the .env configuration.
openai_client = AsyncOpenAI(
    api_key=config["OPENAI_EMBEDDING_API_KEY"],
)

In [5]:
async def get_embedding(text, model="text-embedding-3-large", dimensions: int = 1024):
    """
    Request an embedding for ``text`` through the module-level ``openai_client``.

    Newlines in ``text`` are replaced with spaces before the request is made.
    ``model`` and ``dimensions`` are forwarded unchanged to
    ``openai_client.embeddings.create``. The text is sent as a one-element
    input list and the ``embedding`` of the first returned item is returned.
    """
    # Same result as replacing every "\n" with a single space.
    single_line = " ".join(text.split("\n"))
    result = await openai_client.embeddings.create(
        model=model,
        input=[single_line],
        dimensions=dimensions,
    )
    first_item = result.data[0]
    return first_item.embedding

Get SNOMED CT Combined Text


In [None]:
def text_for_embedding(document):
    """
    Build the single text string that is embedded for one document.

    Each field that is present and non-empty contributes one labelled segment,
    in this order:

    - ``preferredTerm`` -> "Preferred term: <value>"
    - ``terms``         -> "Other terms: " followed by the terms joined with ", "
    - ``definitions``   -> "Definition: " followed by the definitions joined with " "

    Segments are joined with "; ". If no field has content the result is "".
    """
    segments = []

    preferred = document.get("preferredTerm")
    if preferred:
        segments.append(f"Preferred term: {preferred}")

    alternates = document.get("terms")
    if alternates:
        segments.append("Other terms: " + ", ".join(alternates))

    definitions = document.get("definitions")
    if definitions:
        segments.append("Definition: " + " ".join(definitions))

    return "; ".join(segments)

In [None]:
cursor = (
    snomedct_collection.find({}, {"preferredTerm": 1, "terms": 1, "definitions": 1})
    .limit(15)
    .skip(random.randint(0, SNOMED_CONCEPT_COUNT - 15))
)

# Get the combined text for embedding
async for document in cursor:
    embedding_text = text_for_embedding(document)
    print(embedding_text)
    vector = await get_embedding(embedding_text)
    print("Got vector length ", len(vector))

Initialize SnomedCTConcept 

In [5]:
#raw_doc = await snomedct_collection.find_one({}, skip=random.randint(0, snomedct_concept))
one_raw_doc = await snomedct_collection.find_one({'_id': '100001001'})
print(one_raw_doc)
concept = SnomedCTConcept(**one_raw_doc)

{'_id': '100001001', 'conceptId': '100001001', 'effectiveDate': '2003-07-31', 'modules': ['SNOMED-CT-core'], 'languageCode': 'en', 'preferredTerm': 'BIZOLIN 200', 'terms': [], 'semanticTags': ['product', 'substance'], 'caseSignificance': 'sensitive', 'definitions': None}


Create the Embedding for the Concept


In [6]:

embedding_text =  await create_embedding_for_snomedctconcept(concept)

print(embedding_text)


concept_id='100001001' effective_date=datetime.date(2003, 7, 31) modules=[<SnomedCTModule.SNOMED_CT_CORE: 'SNOMED-CT-core'>] embedding_vector=[-0.02206601, 0.055774968, -0.04891891, 0.016692342, -0.040148094, -0.035052374, 0.014152204, 0.029200017, 0.00070210913, -0.040487807, -0.010145115, -0.0038372313, -0.0031442908, 0.0031944758, 0.07782554, 0.03412588, 0.011210583, 0.005670918, -0.030805942, -0.028180873, 0.016460719, -0.0207998, -0.04098194, -0.0069950335, -0.002148309, -0.040673107, 0.012677533, -0.065101676, -0.008655002, -0.063989885, 0.06818999, -0.05982066, 0.09129059, -0.022436608, -0.037121546, -0.0140672745, 0.025077118, -0.014793029, -0.038109805, -0.0062229545, -0.064484015, 0.012831949, 0.017325446, -0.0054431553, 0.030435344, 0.02538595, -0.023810908, 0.027964693, 0.065842874, -0.0056207334, -0.029354434, -0.023949882, -0.0006963185, 0.07183421, -0.0059565874, 0.010724174, -0.030250045, 0.05324255, -0.018298266, 0.016290862, 0.019471826, -0.031686112, 0.04277316, -0.0

Update the SnomedCT Embedding Vector Value


In [7]:
snomed_ct_repo = SnomedCTConceptRepo(snomedct_collection)
new_value = await snomed_ct_repo.update_concept(concept)
print(new_value)


concept_id='100001001' effective_date=datetime.date(2003, 7, 31) modules=[<SnomedCTModule.SNOMED_CT_CORE: 'SNOMED-CT-core'>] embedding_vector=[-0.02206601, 0.055774968, -0.04891891, 0.016692342, -0.040148094, -0.035052374, 0.014152204, 0.029200017, 0.00070210913, -0.040487807, -0.010145115, -0.0038372313, -0.0031442908, 0.0031944758, 0.07782554, 0.03412588, 0.011210583, 0.005670918, -0.030805942, -0.028180873, 0.016460719, -0.0207998, -0.04098194, -0.0069950335, -0.002148309, -0.040673107, 0.012677533, -0.065101676, -0.008655002, -0.063989885, 0.06818999, -0.05982066, 0.09129059, -0.022436608, -0.037121546, -0.0140672745, 0.025077118, -0.014793029, -0.038109805, -0.0062229545, -0.064484015, 0.012831949, 0.017325446, -0.0054431553, 0.030435344, 0.02538595, -0.023810908, 0.027964693, 0.065842874, -0.0056207334, -0.029354434, -0.023949882, -0.0006963185, 0.07183421, -0.0059565874, 0.010724174, -0.030250045, 0.05324255, -0.018298266, 0.016290862, 0.019471826, -0.031686112, 0.04277316, -0.0