In [1]:
import random
from openai import OpenAI
import motor.motor_asyncio
from dotenv import dotenv_values
from openimagingdatamodel.ontology_tools.snomedct_concept import SnomedCTConcept

Load Env and Database


In [2]:
config = dotenv_values(".env")
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Ping the client to confirm
print(await client.server_info())  #

{'version': '7.0.11', 'gitVersion': 'f451220f0df2b9dfe073f1521837f8ec5c208a8c', 'modules': ['enterprise'], 'allocator': 'tcmalloc', 'javascriptEngine': 'mozjs', 'sysInfo': 'deprecated', 'versionArray': [7, 0, 11, 0], 'bits': 64, 'debug': False, 'maxBsonObjectSize': 16777216, 'storageEngines': ['devnull', 'inMemory', 'queryable_wt', 'wiredTiger'], 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1717756190, 19), 'signature': {'hash': b"\xeb\xeb\x00\x08\xb8+f\x06\xeb'jF\xf4\xdd\x8c\xe8\x13\xa7Q\x81", 'keyId': 7327016299177967635}}, 'operationTime': Timestamp(1717756190, 19)}


In [3]:
db = client["ontologies"]
snomedct_collection = db["snomedct"]
snomedct_concept = await snomedct_collection.count_documents({})
print(f"SNOMED CT COUNT: {snomedct_concept}")

SNOMED CT COUNT: 508540


Generate Embedding

In [4]:
openai_client = OpenAI(api_key=config["OPENAI_EMBEDDING_API_KEY"])

def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return openai_client.embeddings.create(input = [text], model=model) #.data[0].embedding

# text =  "SnomedCT Preferred Term: Sprain"

# response = get_embedding(text)

# # Extract the embedding from the response
# embedding = response.data[0].embedding

# print(embedding)

[-0.012995994, 0.0327384, 0.021742795, 0.018748747, -0.02417464, 0.01472182, -0.008897156, -0.06254812, -0.032320015, -0.008341493, -0.023220206, 0.013270557, 0.020278458, 0.042413484, 0.028293088, 0.022383442, 0.054337375, 0.0028959885, -0.06866696, 0.06029932, 0.03169244, 0.0061482554, 0.0033241764, -0.05423278, 0.01098253, -0.014068098, 0.000985486, -0.029835872, 0.03195393, -0.01507483, 0.034255035, -0.03877879, 0.0030822994, -0.0048669605, -0.018591855, 0.024383832, -0.02735173, 0.086186714, -0.02537749, -0.013819683, 0.018291142, -0.008452626, -0.07342606, 0.009269779, -0.029077556, -0.013871981, 0.02391315, 0.019546289, 0.018264994, 0.0093613, -0.014617224, -0.023403248, 0.038595747, 0.016094636, -0.040975295, -0.044740736, -0.015035606, 0.05491265, 0.011479358, -0.041445974, 0.032503057, -0.027194835, -0.029129853, -0.00968816, 0.012348809, -0.020239234, 0.019480916, 0.053369865, 0.022605708, -0.044949926, -0.02172972, -0.012799877, -0.008838322, 0.028633025, -0.044113163, 0.00

Get Sonomed Combined Text

In [8]:
def text_for_embedding(document):
    """
    Combine preferred term, alternate terms, and definition into a single text string.
    
    """
    # Combine the terms and definition into a single string
    # Alternate terms are joined by a comma and a space
    text_components = []
    
    # Add the preferred term if it exists
    if 'preferredTerm' in document and document['preferredTerm']:
        text_components.append(document['preferredTerm'])
    
    # Add the alternate terms if they exist
    if 'terms' in document and document['terms']:
        text_components.append(', '.join(document['terms']))
    
    # Add the definition if it exists
    if 'definitions' in document and document['definitions']:
        text_components.append(' '.join(document['definitions']))
    
    # Combine the available fields into a single string for embedding
    combined_text = ', '.join(text_components)
    if combined_text:  # Ensure the combined text is not empty
        print(f"Combined Text: {combined_text}")
        return combined_text

# Snomed CT Collection:
cursor = snomedct_collection.find({}, {'preferredTerm': 1, 'terms': 1, 'definitions': 1}).limit(15)

# Get the combined text for embedding
async for document in cursor:
    embedding_text = text_for_embedding(document)
    print(embedding_text)
    response = get_embedding(embedding_text)

    # Extract the embedding from the response
    embedding = response.data[0].embedding

    print(embedding)

Combined Text: BITTER-3
BITTER-3
[-0.01693664, -0.00039217295, 0.009699929, 0.0064478363, -0.06481645, 0.0075305253, 0.033100504, 0.018965172, -0.018820276, 0.013748945, -0.015479637, -0.025694748, -0.0061177975, -0.018997371, 0.027127601, 0.020381924, 0.05193688, 0.0020687815, -0.013169364, 0.005976927, 0.053611223, 0.0421806, 0.003950407, 0.03525783, 0.031216865, 0.0271437, -0.008307325, -0.042953376, 0.03232773, -0.04800861, 0.018820276, -0.035708617, 0.027304696, -0.028882442, -0.010698096, 0.005312824, -0.016759545, 0.04134343, -0.008790309, 0.0042744083, -0.04189081, -0.0049063126, -0.006516259, 0.043661755, -0.023956006, -0.025533754, -0.011382323, 0.031200767, 0.0021211046, 0.008230852, -0.016856141, 0.043822747, -0.009901172, 0.1116015, -0.005349048, -0.009530884, -0.003877959, 0.039733484, 0.039057307, -0.020575117, -0.032488722, -0.0131371645, 0.025839644, -0.0013634235, -0.031828646, 0.0046648206, 0.014199729, 0.015745278, 0.014739062, 0.03139396, -0.009257194, 0.019802343,