#### Python Version

In [None]:
from platform import python_version
print(python_version())

#ToDO - Need VM with python 3.10

3.10.12


#### Install Libraries

In [23]:
# pip install ydata_profiling dtale
from IPython.display import clear_output

! pip install openai
! pip install chromadb

clear_output()

#### Restart Kernel

In [6]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

#### Formatting

In [1]:
bold_s = "\033[1m"
bold_e = "\033[0m"

#### Imports and set Path

In [2]:
import pandas as pd
import time
from ast import literal_eval

import chromadb
from chromadb.config import Settings

from keys.keys import OPENAI_KEY
import openai

In [17]:
# Set up your API credentials
openai.api_key = OPENAI_KEY

#### Global params

In [21]:
EMBEDDINGS_PATH = 'gs://emopti_shared/aiipem_deidentified_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv'
EMBEDDINGS_MODEL_NAME = "text-embedding-ada-002"

print(f"EMBEDDINGS_PATH: {EMBEDDINGS_PATH}")
print(f"EMBEDDINGS_MODEL_NAME: {EMBEDDINGS_MODEL_NAME}")

COPY_TO_LOCAL = False

#### Only if Copy to Local is True - Run in case reading from GCS doesn't work
if COPY_TO_LOCAL:
    LOCAL_PATH = './data'
    LOCAL_FILE_PATH = EMBEDDINGS_PATH.replace("gs://emopti_shared", LOCAL_PATH)
    print(f"LOCAL_FILE_PATH: {EMBEDDINGS_MODEL_NAME}")

    #### Copy embeddings to local folder
    ! gsutil -m cp {EMBEDDINGS_PATH} {LOCAL_PATH}

    INPUT_FILE_PATH = LOCAL_FILE_PATH

else:
    INPUT_FILE_PATH = EMBEDDINGS_PATH

print(f"INPUT_FILE_PATH: {INPUT_FILE_PATH}")


EMBEDDINGS_PATH: gs://emopti_shared/aiipem_deidentified_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv
EMBEDDINGS_MODEL_NAME: text-embedding-ada-002
INPUT_FILE_PATH: gs://emopti_shared/aiipem_deidentified_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv


#### Read Embeddings

In [11]:
start_time = time.time()
df = pd.read_csv(INPUT_FILE_PATH)

print(f"time taken in reading data: {time.time() - start_time}")
print("Sample Data: ")
display(df.head(2))

start_time= time.time()
df["embedding"] = df.embedding.apply(literal_eval)
print(f"time taken for literal_eval: {time.time()- start_time}")

print(f"DF Shape : {df.shape}")

time taken in reading data: 4.4795825481414795
Sample Data: 


Unnamed: 0,PT_Visit_ID_Hashed,combined,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Desc,embedding
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,Complaint: FLANK PAIN; Arrival_Method: Ambulan...,Maternal care for other conditions predominant...,Abdominal and pelvic pain,,"[0.0032147050369530916, 0.002123788231983781, ..."
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,Complaint: DIZZINESS; Arrival_Method: Car; Gro...,Volume depletion,Acute posthemorrhagic anemia,Hematuria,"[0.010029399767518044, 0.003674413776025176, 0..."


time taken for literal_eval: 89.83346438407898
DF Shape : (10000, 6)


#### Chroma DB

In [12]:
# PersistentClient
print(f"\nCreate Chroma Client and Collection")
client = chromadb.PersistentClient(path="chroma_openai", settings=Settings(allow_reset=True))
client.reset() # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

collection = client.get_or_create_collection(name="emopti_search", metadata={"hnsw:space": "cosine"})

# print(f"\nlist of the first 10 items in the collection: {collection.peek()}")
print(f"\nnumber of items in the collection: {collection.count()}")



Create Chroma Client and Collection



number of items in the collection: 0


#### Add Embeddings to Collection

In [13]:
start_time= time.time()

ids = df['PT_Visit_ID_Hashed'].values.tolist()
embeddings = df['embedding'].values.tolist()

collection.add(
    embeddings=embeddings,
    ids=ids
)

# print(f"\nlist of the first 1 items in the collection: {collection.peek(1)}")
print(f"\nnumber of items in the collection: {collection.count()}")

print(f"\ntime taken in adding embeddings to collection: {time.time()- start_time}")


number of items in the collection: 10000

time taken in adding embeddings to collection: 20.8240385055542


#### Query DB

In [14]:
from typing import List
def get_embedding(text: str, model="text-embedding-ada-002", **kwargs) -> List[float]:

    # replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    response = openai.embeddings.create(input=[text], model=model, **kwargs)

    return response.data[0].embedding

In [18]:
# Test query
patient_info = """Complaint: 'cramping pain;"""
test_embeddings = get_embedding(patient_info, model=EMBEDDINGS_MODEL_NAME)

In [19]:
search_results = collection.query(
    query_embeddings=[test_embeddings],
    n_results=5
)

In [20]:
print(f"\n{bold_s}Search Patient{bold_e}: {patient_info}")

print(f"\nTop Matched Patients: \n")

for id, distance in zip(search_results['ids'][0], search_results['distances'][0]):
        
    print(f"{bold_s}ID{bold_e}: {id}")
    print(f"{bold_s}Similariry{bold_e}: {(1-distance):.4f}")
    print()



[1mSearch Patient[0m: Complaint: 'cramping pain;

Top Matched Patients: 

[1mID[0m: 6c0a3480a8f7c8a1c5c552cfe2adddfad6a3b7c15f6fa839cf2df9370badd364
[1mSimilariry[0m: 0.8340

[1mID[0m: b2b0e44c052944ac55b4136de913a75bedc5a8aa31dbd4c36fc66b08a1e2f656
[1mSimilariry[0m: 0.8314

[1mID[0m: 9de4bab565738986855321f3fed7b7e4d91daff43024924f0820862a4cde9866
[1mSimilariry[0m: 0.8314

[1mID[0m: 0899222f45307e430cbd2d2f886e698aba2af86a880665e526bdf16bcb5231e2
[1mSimilariry[0m: 0.8301

[1mID[0m: 71fd3e9f934d4179371425994bdbc46a15a520eb5e29186e856453fa8fd45b82
[1mSimilariry[0m: 0.8288

