In [5]:
import random
from openai import AsyncOpenAI
import motor.motor_asyncio
from dotenv import dotenv_values
from openimagingdatamodel.ontology_tools.snomedct_concept import SnomedCTConcept
from openimagingdatamodel.ontology_tools.snomedct_concept_repo import SnomedCTConceptRepo
from openimagingdatamodel.ontology_tools.utils import create_embedding_for_snomedctconcept

Load Env and Database


In [2]:
config = dotenv_values(".env")
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Ping the client to confirm
print(await client.server_info())  #

{'version': '7.0.11', 'gitVersion': 'f451220f0df2b9dfe073f1521837f8ec5c208a8c', 'modules': ['enterprise'], 'allocator': 'tcmalloc', 'javascriptEngine': 'mozjs', 'sysInfo': 'deprecated', 'versionArray': [7, 0, 11, 0], 'bits': 64, 'debug': False, 'maxBsonObjectSize': 16777216, 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1718032354, 17), 'signature': {'hash': b'\x9a\xe2\x020\xe4\t,~\xb0\x07F\xd5B\xe9])\x7fue9', 'keyId': 7327016299177967635}}, 'operationTime': Timestamp(1718032354, 17)}


In [10]:
db = client["ontologies"]
snomedct_collection = db["snomedct"]
SNOMED_CONCEPT_COUNT = await snomedct_collection.count_documents({})
print(f"SNOMED CT COUNT: {SNOMED_CONCEPT_COUNT}")

SNOMED CT COUNT: 508540


Generate Embedding


In [11]:
# Async OpenAI client shared by the embedding helper below; the API key comes
# from the .env-backed `config` mapping, so it is never written in the notebook.
openai_client = AsyncOpenAI(api_key=config["OPENAI_API_KEY"])

In [5]:
async def get_embedding(text, model="text-embedding-3-large", dimensions: int = 1024):
    """
    Request an embedding vector for a single piece of text.

    Args:
        text: String to embed. Newline characters are replaced with spaces
            before the request is sent.
        model: Name of the embedding model passed to the API.
        dimensions: Value forwarded as the ``dimensions`` argument of the request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Flatten the text onto one line (same effect as replacing "\n" with " ")
    single_line_text = " ".join(text.split("\n"))

    # The API takes a list of inputs; we send exactly one and read back exactly one
    request = {"input": [single_line_text], "model": model, "dimensions": dimensions}
    result = await openai_client.embeddings.create(**request)
    first_item = result.data[0]
    return first_item.embedding

Get SNOMED CT Combined Text


In [9]:
def text_for_embedding(document):
    """
    Build the single text string that represents a concept document for embedding.

    Each of the three fields is included only when it is present and non-empty:

    * ``preferredTerm`` -> ``"Preferred term: <value>"``
    * ``terms``         -> ``"Other terms: "`` followed by the terms joined with ``", "``
    * ``definitions``   -> ``"Definition: "`` followed by the definitions joined with ``" "``

    The sections that were produced are joined with ``"; "``. When none of the
    fields has a value the result is an empty string.
    """
    preferred_term = document.get("preferredTerm")
    other_terms = document.get("terms")
    definitions = document.get("definitions")

    sections = []
    if preferred_term:
        sections.append(f"Preferred term: {preferred_term}")
    if other_terms:
        # Alternate terms are separated by a comma and a space
        sections.append("Other terms: " + ", ".join(other_terms))
    if definitions:
        # Multiple definitions are run together, separated by a single space
        sections.append("Definition: " + " ".join(definitions))

    return "; ".join(sections)

In [13]:
cursor = (
    snomedct_collection.find({}, {"preferredTerm": 1, "terms": 1, "definitions": 1})
    .limit(15)
    .skip(random.randint(0, SNOMED_CONCEPT_COUNT - 15))
)

# Get the combined text for embedding
async for document in cursor:
    embedding_text = text_for_embedding(document)
    print(embedding_text)
    vector = await get_embedding(embedding_text)
    print("Got vector length ", len(vector))

Combined Text: BITTER-3
BITTER-3
[-0.01693664, -0.00039217295, 0.009699929, 0.0064478363, -0.06481645, 0.0075305253, 0.033100504, 0.018965172, -0.018820276, 0.013748945, -0.015479637, -0.025694748, -0.0061177975, -0.018997371, 0.027127601, 0.020381924, 0.05193688, 0.0020687815, -0.013169364, 0.005976927, 0.053611223, 0.0421806, 0.003950407, 0.03525783, 0.031216865, 0.0271437, -0.008307325, -0.042953376, 0.03232773, -0.04800861, 0.018820276, -0.035708617, 0.027304696, -0.028882442, -0.010698096, 0.005312824, -0.016759545, 0.04134343, -0.008790309, 0.0042744083, -0.04189081, -0.0049063126, -0.006516259, 0.043661755, -0.023956006, -0.025533754, -0.011382323, 0.031200767, 0.0021211046, 0.008230852, -0.016856141, 0.043822747, -0.009901172, 0.1116015, -0.005349048, -0.009530884, -0.003877959, 0.039733484, 0.039057307, -0.020575117, -0.032488722, -0.0131371645, 0.025839644, -0.0013634235, -0.031828646, 0.0046648206, 0.014199729, 0.015745278, 0.014739062, 0.03139396, -0.009257194, 0.019802343,

Initialize SnomedCTConcept 

In [7]:
raw_doc = await snomedct_collection.find_one({}, skip=random.randint(0, snomedct_concept))
one_raw_doc = await snomedct_collection.find_one({'_id': '10000006'})
print(one_raw_doc)
concept = SnomedCTConcept(**one_raw_doc)

{'_id': '10000006', 'conceptId': '10000006', 'effectiveDate': '2017-07-31', 'modules': ['SNOMED-CT-core'], 'languageCode': 'en', 'preferredTerm': 'Radiating chest pain', 'terms': [], 'semanticTags': ['finding'], 'caseSignificance': 'insensitive', 'definitions': None}


Create Embedding for the SnomedCTConcept


In [8]:

# Run the project helper that creates an embedding for the concept built above.
# NOTE(review): despite the name `embedding_text`, the captured output of this
# cell is a concept repr that includes an `embedding_vector` field, not a text
# string -- confirm what the helper returns and consider a clearer name.
embedding_text =  await create_embedding_for_snomedctconcept(concept)

# NOTE(review): this prints the full embedding vector; consider clearing the
# output before sharing the notebook.
print(embedding_text)


concept_id='10000006' effective_date=datetime.date(2017, 7, 31) modules=[<SnomedCTModule.SNOMED_CT_CORE: 'SNOMED-CT-core'>] embedding_vector=[-0.007235075, -0.012823928, 0.03021301, 0.05201507, -0.02205107, 0.027128072, -0.024527319, -0.013349612, 0.008037435, -0.035054836, -0.03333945, 0.032592423, 0.005609604, -0.000466026, 0.048999302, 0.0014473608, -0.0001151015, -0.029023303, -0.045568522, 0.041529052, 0.020225009, 0.029604321, 0.048362948, -0.0670109, 0.00318869, 0.008639205, 0.023711124, -0.0058759046, 0.013612455, -0.008514701, -0.024361314, -0.03281376, 0.035193175, -0.02882963, -0.02835928, 0.017430583, -0.058599956, 0.04316144, 0.010748859, -0.0119316485, 0.030130006, 0.003259588, -0.03754492, 0.050465684, -0.0027909682, -0.0061041624, -0.029133974, -0.014304144, 0.048086274, -0.0001597371, 0.0036486636, 0.00063203153, 0.0075463355, 0.034529153, -0.0020785278, -0.033671457, -0.020584688, 0.02260442, 0.033035103, 0.022784261, -0.005246467, -0.029327646, -0.030019335, -0.03571

Update SnomedCT Embedding Vector Value


In [9]:
# Wrap the SNOMED CT collection in the project's repository class.
snomed_ct_repo = SnomedCTConceptRepo(snomedct_collection)
# NOTE(review): presumably this writes the concept (including its embedding
# vector) back to the collection -- confirm against SnomedCTConceptRepo before
# running against a shared database.
new_value = await snomed_ct_repo.update_concept(concept)
# The captured output shows the returned value printed as a concept repr.
print(new_value)


concept_id='10000006' effective_date=datetime.date(2017, 7, 31) modules=[<SnomedCTModule.SNOMED_CT_CORE: 'SNOMED-CT-core'>] embedding_vector=[-0.007235075, -0.012823928, 0.03021301, 0.05201507, -0.02205107, 0.027128072, -0.024527319, -0.013349612, 0.008037435, -0.035054836, -0.03333945, 0.032592423, 0.005609604, -0.000466026, 0.048999302, 0.0014473608, -0.0001151015, -0.029023303, -0.045568522, 0.041529052, 0.020225009, 0.029604321, 0.048362948, -0.0670109, 0.00318869, 0.008639205, 0.023711124, -0.0058759046, 0.013612455, -0.008514701, -0.024361314, -0.03281376, 0.035193175, -0.02882963, -0.02835928, 0.017430583, -0.058599956, 0.04316144, 0.010748859, -0.0119316485, 0.030130006, 0.003259588, -0.03754492, 0.050465684, -0.0027909682, -0.0061041624, -0.029133974, -0.014304144, 0.048086274, -0.0001597371, 0.0036486636, 0.00063203153, 0.0075463355, 0.034529153, -0.0020785278, -0.033671457, -0.020584688, 0.02260442, 0.033035103, 0.022784261, -0.005246467, -0.029327646, -0.030019335, -0.03571