In [1]:
from dotenv import dotenv_values
from openai import OpenAI
from openimagingdatamodel.ontology_tools.anatomic_location import AnatomicLocation
from openimagingdatamodel.ontology_tools.anatomic_location_repo import AnatomicLocationRepo
from openimagingdatamodel.ontology_tools.embedding_creator import EmbeddingCreator
from pymongo import MongoClient

In [2]:
# Settings are read from the local ".env" file as a plain key/value mapping.
config = dotenv_values(".env")

# OpenAI side: the API client and the embedding helper that is built on it.
llm = OpenAI(api_key=config["OPENAI_API_KEY"])
embedding_creator = EmbeddingCreator(llm)

# MongoDB side: connect with the configured DSN, then drill down to the
# collection that the repository object wraps.
client = MongoClient(config["ATLAS_DSN"])
db = client["ontologies"]
collection = db["anatomic_locations"]
repo = AnatomicLocationRepo(collection)

In [3]:
# Fetch the count reported by the repository and print it as a quick sanity check.
count = repo.get_count()
print(f"Count: {count}")

Count: 2901


In [4]:
# Query the collection with an empty filter (i.e. every document) and build one
# AnatomicLocation per returned document, passing its fields as keyword arguments.
cursor = collection.find({})
locations = [AnatomicLocation(**document) for document in cursor]

In [5]:
# Spot-check: display the text_for_embedding() output of one arbitrary entry.
sample_location = locations[101]
sample_location.text_for_embedding()

'gastroduodenal artery (synonyms: arteria gastroduodenalis; arteria gastroduodenalis)'

In [6]:
BATCH_SIZE = 50

# Start offsets of the slices to process.
# NOTE(review): iteration begins at index 50, so locations[:50] are never
# written by this loop — presumably they are handled separately; confirm.
batch_starts = range(50, len(locations), BATCH_SIZE)

for i in batch_starts:
    # Slice out the next group of locations and compute their embeddings.
    batch = locations[i : i + BATCH_SIZE]
    vectors = embedding_creator.create_embeddings_for_concepts(batch)

    # Store the vectors; stop at the first write that reports failure so the
    # printed offset shows where processing halted.
    succeeded = repo.bulk_write_embedding_vectors(batch, vectors)
    if not succeeded:
        print(f"Batch {i} failed")
        break
    print(f"Batch {i} done")

Batch 50 done
Batch 100 done
Batch 150 done
Batch 200 done
Batch 250 done
Batch 300 done
Batch 350 done
Batch 400 done
Batch 450 done
Batch 500 done
Batch 550 done
Batch 600 done
Batch 650 done
Batch 700 done
Batch 750 done
Batch 800 done
Batch 850 done
Batch 900 done
Batch 950 done
Batch 1000 done
Batch 1050 done
Batch 1100 done
Batch 1150 done
Batch 1200 done
Batch 1250 done
Batch 1300 done
Batch 1350 done
Batch 1400 done
Batch 1450 done
Batch 1500 done
Batch 1550 done
Batch 1600 done
Batch 1650 done
Batch 1700 done
Batch 1750 done
Batch 1800 done
Batch 1850 done
Batch 1900 done
Batch 1950 done
Batch 2000 done
Batch 2050 done
Batch 2100 done
Batch 2150 done
Batch 2200 done
Batch 2250 done
Batch 2300 done
Batch 2350 done
Batch 2400 done
Batch 2450 done
Batch 2500 done
Batch 2550 done
Batch 2600 done
Batch 2650 done
Batch 2700 done
Batch 2750 done
Batch 2800 done
Batch 2850 done
Batch 2900 done


In [7]:
# Compute embeddings for the first 50 locations only and keep them in `result`.
# NOTE(review): nothing visible in this cell persists `result` to the
# repository — confirm that a later step writes these vectors.
first_batch = locations[:50]
result = embedding_creator.create_embeddings_for_concepts(first_batch)