In [None]:
import motor.motor_asyncio
import pymongo
from dotenv import dotenv_values
from openai import AsyncOpenAI, OpenAI
from openimagingdatamodel.ontology_tools.embedding_creator import AsyncEmbeddingCreator, EmbeddingCreator
from openimagingdatamodel.ontology_tools.snomedct_concept import SnomedCTConcept
from openimagingdatamodel.ontology_tools.snomedct_concept_repo import AsyncSnomedCTConceptRepo, SnomedCTConceptRepo

In [None]:
# Load settings from the local .env file.
config = dotenv_values(".env")

# Fail fast with a readable message: the cells below index config["ATLAS_DSN"] and
# config["OPENAI_API_KEY"] directly, so a missing or empty entry would otherwise
# only surface later as a bare KeyError (or a confusing connection/auth error).
REQUIRED_CONFIG_KEYS = ("ATLAS_DSN", "OPENAI_API_KEY")
missing_config_keys = [key for key in REQUIRED_CONFIG_KEYS if not config.get(key)]
if missing_config_keys:
    raise RuntimeError(f"Missing required settings in .env: {', '.join(missing_config_keys)}")

## Asynchronous


### Load Env and Database


In [None]:
client = motor.motor_asyncio.AsyncIOMotorClient(config["ATLAS_DSN"])
# Ping the client to confirm
print(await client.server_info())

In [None]:
# Handles for the "ontologies" database and its "snomedct" collection.
db = client["ontologies"]
snomedct_collection = db["snomedct"]
# Async repository wrapper built on top of the collection.
snomed_concept_repo = AsyncSnomedCTConceptRepo(snomedct_collection)
# Concept count as reported by the repository.
SNOMED_CONCEPT_COUNT = await snomed_concept_repo.get_count()
print(f"SNOMED CT COUNT: {SNOMED_CONCEPT_COUNT}")

Fetch a Random SnomedCTConcept


In [None]:
concept: SnomedCTConcept = (await snomed_concept_repo.get_random_concepts(1))[0]
print(concept)

Create the Asynchronous Embedding Creator


In [None]:
# One OpenAI client and one embedding creator serve the whole asynchronous section.
async_openai_client = AsyncOpenAI(api_key=config["OPENAI_API_KEY"])
async_embedding_creator = AsyncEmbeddingCreator(async_openai_client)

# Later cells use the short names `repo` and `embedding_creator`. Point them at the
# objects that already exist rather than constructing duplicate instances over the
# same collection / client.
repo = snomed_concept_repo
embedding_creator = async_embedding_creator

# Single-concept example, left disabled so the cell makes no API call by default:
# embedding_vector = await async_embedding_creator.create_embedding_for_snomedctconcept(concept)
# print(embedding_vector)

Get SnomedCT Count


In [None]:
# Ask the async repo for its concept count and display it.
count = await repo.get_count()
print(f"Count: {count}")

In [None]:
snomed_concepts = [SnomedCTConcept(**concept) async for concept in snomedct_collection.find({'embedding_vector': { '$exists': False }})]
print(snomed_concepts[101])

In [None]:
# Show the text that would be sent for embedding for the sample concept. Index 101 is
# an arbitrary spot-check; when the list is shorter the cell shows nothing instead of
# raising IndexError.
snomed_concepts[101].text_for_embedding() if len(snomed_concepts) > 101 else None

In [None]:
repo = AsyncSnomedCTConceptRepo(snomedct_collection)
BATCH_SIZE = 50
for i in range(50, len(snomed_concepts), BATCH_SIZE):
    batch = snomed_concepts[i : i + BATCH_SIZE]
    vectors = await embedding_creator.create_embeddings_for_concepts(batch)
    if await repo.bulk_write_embedding_vectors(batch, vectors):
        print(f"Batch {i} done")
    else:
        print(f"Batch {i} failed")

## Synchronous


Set Up Database/Repo


In [None]:
# The synchronous section uses a pymongo client rather than a motor client.
# Note that this rebinds the `client` name used by the asynchronous section.
client = pymongo.MongoClient(config["ATLAS_DSN"])

# Round-trip to the server as a connectivity check, then show what it reported.
server_info = client.server_info()
print(server_info)

In [None]:
# Database side: "ontologies" database -> "snomedct" collection -> synchronous repo.
db = client["ontologies"]
collection = db["snomedct"]
repo = SnomedCTConceptRepo(collection)

# OpenAI side: synchronous client and the embedding creator built on it.
llm = OpenAI(api_key=config["OPENAI_API_KEY"])
embedding_creator = EmbeddingCreator(llm)

In [None]:
# Ask the synchronous repo for its concept count and display it.
count = repo.get_count()
print(f"Count: {count}")

In [None]:
# Load only the documents that still lack an `embedding_vector` field, matching the
# asynchronous section and the final "remaining" check. An unfiltered find({}) would
# pull every concept into memory and re-embed ones that already have a vector.
missing_embedding_filter = {"embedding_vector": {"$exists": False}}
snomed_concepts = [SnomedCTConcept(**doc) for doc in collection.find(missing_embedding_filter)]
print(f"Concepts without embeddings: {len(snomed_concepts)}")

# Spot-check one concept. The index is arbitrary, so guard it instead of raising
# IndexError when fewer concepts were loaded.
PREVIEW_INDEX = 101
if len(snomed_concepts) > PREVIEW_INDEX:
    print(snomed_concepts[PREVIEW_INDEX])
else:
    print(f"Fewer than {PREVIEW_INDEX + 1} concepts loaded; nothing to preview at index {PREVIEW_INDEX}")

In [None]:
# Show the text that would be sent for embedding for the sample concept. Index 101 is
# an arbitrary spot-check; when the list is shorter the cell shows nothing instead of
# raising IndexError.
snomed_concepts[101].text_for_embedding() if len(snomed_concepts) > 101 else None

In [None]:
# Create and store embeddings for every loaded concept, one fixed-size batch at a time.
repo = SnomedCTConceptRepo(collection)
BATCH_SIZE = 50
# Start at offset 0. Starting at 50 silently skipped the first batch, so those
# concepts never had their embeddings written.
for i in range(0, len(snomed_concepts), BATCH_SIZE):
    batch = snomed_concepts[i : i + BATCH_SIZE]
    vectors = embedding_creator.create_embeddings_for_concepts(batch)
    # `i` is the starting offset of the batch, not a batch counter.
    if repo.bulk_write_embedding_vectors(batch, vectors):
        print(f"Batch {i} done")
    else:
        print(f"Batch {i} failed")

In [None]:
# Embed only the first 50 loaded concepts.
# NOTE(review): `result` is not written back to the collection in this cell — confirm
# whether this is meant as a dry run or whether a bulk write is missing.
first_batch = snomed_concepts[:50]
result = embedding_creator.create_embeddings_for_concepts(first_batch)

In [None]:
# Fetch the documents that still have no `embedding_vector` field and report how
# many are left.
missing_embedding_filter = {"embedding_vector": {"$exists": False}}
empty_snomedct = list(collection.find(missing_embedding_filter))
print(len(empty_snomedct))