In [1]:
import asyncio
from pprint import pprint

from dotenv import dotenv_values
from motor.motor_asyncio import AsyncIOMotorClient
from openai import AsyncOpenAI, OpenAI
from openimagingdatamodel.ontology_tools.anatomic_location import AnatomicLocation
from openimagingdatamodel.ontology_tools.anatomic_location_repo import AnatomicLocationRepo, AsyncAnatomicLocationRepo
from openimagingdatamodel.ontology_tools.embedding_creator import AsyncEmbeddingCreator, EmbeddingCreator
from pymongo import MongoClient

In [2]:
config = dotenv_values(".env")

### Common Definitions


In [3]:
def combine_text_vector_results(text_results, vector_results):
    combined_results = {}
    for result in vector_results:
        combined_results[(result.system, result.code)] = {
            "display": result.display,
            "vector_score": result.score,
        }
    for result in text_results:
        if (result.system, result.code) in combined_results:
            combined_results[(result.system, result.code)]["text_score"] = result.score
        else:
            combined_results[(result.system, result.code)] = {
                "display": result.display,
                "text_score": result.score,
            }
    return combined_results

## Synchronous


In [4]:
llm = OpenAI(api_key=config["OPENAI_API_KEY"])
client = MongoClient(config["ATLAS_DSN"])
db = client["ontologies"]
collection = db["anatomic_locations"]
embedding_creator = EmbeddingCreator(llm)
repo = AnatomicLocationRepo(collection, embedding_creator)

In [4]:
count = repo.get_count()
print(f"Count: {count}")

Count: 2901


### Creating Embeddings


In [4]:
locations = [AnatomicLocation(**location) for location in collection.find({})]

In [5]:
locations[101].text_for_embedding()

'gastroduodenal artery (synonyms: arteria gastroduodenalis; arteria gastroduodenalis)'

In [6]:
BATCH_SIZE = 50
for i in range(50, len(locations), BATCH_SIZE):
    batch = locations[i : i + BATCH_SIZE]
    vectors = embedding_creator.create_embeddings_for_concepts(batch)
    if repo.bulk_write_embedding_vectors(batch, vectors):
        print(f"Batch {i} done")
    else:
        print(f"Batch {i} failed")
        break

Batch 50 done
Batch 100 done
Batch 150 done
Batch 200 done
Batch 250 done
Batch 300 done
Batch 350 done
Batch 400 done
Batch 450 done
Batch 500 done
Batch 550 done
Batch 600 done
Batch 650 done
Batch 700 done
Batch 750 done
Batch 800 done
Batch 850 done
Batch 900 done
Batch 950 done
Batch 1000 done
Batch 1050 done
Batch 1100 done
Batch 1150 done
Batch 1200 done
Batch 1250 done
Batch 1300 done
Batch 1350 done
Batch 1400 done
Batch 1450 done
Batch 1500 done
Batch 1550 done
Batch 1600 done
Batch 1650 done
Batch 1700 done
Batch 1750 done
Batch 1800 done
Batch 1850 done
Batch 1900 done
Batch 1950 done
Batch 2000 done
Batch 2050 done
Batch 2100 done
Batch 2150 done
Batch 2200 done
Batch 2250 done
Batch 2300 done
Batch 2350 done
Batch 2400 done
Batch 2450 done
Batch 2500 done
Batch 2550 done
Batch 2600 done
Batch 2650 done
Batch 2700 done
Batch 2750 done
Batch 2800 done
Batch 2850 done
Batch 2900 done


In [7]:
first_batch = locations[:50]
result = embedding_creator.create_embeddings_for_concepts(first_batch)

### Testing Vector Search


In [35]:
vector_results = repo.vector_search("right kidney", 10)

In [36]:
text_results = repo.text_search("right kidney", 10)

In [37]:
combined_results = combine_text_vector_results(text_results, vector_results)
pprint(combined_results)

{('ANTOMICLOCATIONS', 'RID1302'): {'display': 'right lung',
                                   'text_score': 5.479655742645264},
 ('ANTOMICLOCATIONS', 'RID205'): {'display': 'kidney',
                                  'text_score': 16.591510772705078},
 ('ANTOMICLOCATIONS', 'RID211'): {'display': 'cortex of kidney',
                                  'text_score': 17.129283905029297},
 ('ANTOMICLOCATIONS', 'RID228'): {'display': 'renal pelvis',
                                  'text_score': 5.765583038330078},
 ('ANTOMICLOCATIONS', 'RID2639_RID5825'): {'display': 'right hip',
                                           'vector_score': 0.8282985687255859},
 ('ANTOMICLOCATIONS', 'RID29662'): {'display': 'right kidney',
                                    'text_score': 23.062929153442383,
                                    'vector_score': 0.8300362825393677},
 ('ANTOMICLOCATIONS', 'RID29663'): {'display': 'left kidney',
                                    'text_score': 19.24749183654785},

## Asynchronous


In [5]:
async_llm = AsyncOpenAI(api_key=config["OPENAI_API_KEY"])
async_client = AsyncIOMotorClient(config["ATLAS_DSN"])
async_db = async_client["ontologies"]
async_collection = async_db["anatomic_locations"]
async_embedding_creator = AsyncEmbeddingCreator(async_llm)
async_repo = AsyncAnatomicLocationRepo(async_collection, async_embedding_creator)

In [6]:
count = await async_repo.get_count()
print(f"Count: {count}")

Count: 2901


In [7]:
# Do the text and vector searches in parallel and gather the results
text_results, vector_results = await asyncio.gather(
    async_repo.text_search("right kidney", 10), async_repo.vector_search("right kidney", 10)
)
combined_results = combine_text_vector_results(text_results, vector_results)
pprint(combined_results)

{('ANTOMICLOCATIONS', 'RID1302'): {'display': 'right lung',
                                   'text_score': 5.479655742645264},
 ('ANTOMICLOCATIONS', 'RID205'): {'display': 'kidney',
                                  'text_score': 16.591510772705078},
 ('ANTOMICLOCATIONS', 'RID211'): {'display': 'cortex of kidney',
                                  'text_score': 17.129283905029297},
 ('ANTOMICLOCATIONS', 'RID228'): {'display': 'renal pelvis',
                                  'text_score': 5.765583038330078},
 ('ANTOMICLOCATIONS', 'RID2639_RID5825'): {'display': 'right hip',
                                           'vector_score': 0.8282985687255859},
 ('ANTOMICLOCATIONS', 'RID29662'): {'display': 'right kidney',
                                    'text_score': 23.062929153442383,
                                    'vector_score': 0.8300362825393677},
 ('ANTOMICLOCATIONS', 'RID29663'): {'display': 'left kidney',
                                    'text_score': 19.24749183654785},