#### Python Version

In [6]:
from platform import python_version

# TODO: this notebook needs a VM with Python 3.10.
running_python = python_version()
print(running_python)

3.10.12


#### Install Libraries

In [23]:
# pip install ydata_profiling dtale
from IPython.display import clear_output

! pip install --upgrade google-cloud-aiplatform
! pip install chromadb

clear_output()

#### Restart Kernel

In [6]:
# Disabled by default: uncomment the three statements below to run the
# kernel-restart snippet for this section.
# NOTE(review): presumably intended to be run once after the install cell so
# newly installed packages are importable; confirm before enabling.
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

#### Formatting

In [3]:
# ANSI escape sequences used to wrap text printed in bold:
# bold_s starts the bold run, bold_e resets the terminal attributes.
bold_s, bold_e = "\033[1m", "\033[0m"

#### Imports and set Path

In [4]:
import os
import subprocess
import time
from ast import literal_eval

import pandas as pd

import chromadb
from chromadb.config import Settings

from vertexai.language_models import TextEmbeddingModel


#### Global params

In [1]:
# Location of the precomputed embeddings CSV and the embedding model used for queries.
EMBEDDINGS_PATH = 'gs://emopti_shared/aiipem_deidentified_palm2_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv'
EMBEDDINGS_MODEL_NAME = "textembedding-gecko@001"
# NOTE(review): this constant is not referenced by the cells shown in this
# notebook, and the Chroma client cell persists to "chroma_palm2" - confirm
# which directory name is intended.
CHROMA_DB_NAME = "chroma_openai"

print(f"EMBEDDINGS_PATH: {EMBEDDINGS_PATH}")
print(f"EMBEDDINGS_MODEL_NAME: {EMBEDDINGS_MODEL_NAME}")

# Set to True to download the CSV first, in case reading directly from GCS doesn't work.
COPY_TO_LOCAL = False

if COPY_TO_LOCAL:
    LOCAL_PATH = './data'
    # `gsutil cp <object> <dir>` stores the object under its base name inside
    # <dir>, so derive the local path from the base name rather than from a
    # hard-coded bucket prefix.
    LOCAL_FILE_PATH = os.path.join(LOCAL_PATH, os.path.basename(EMBEDDINGS_PATH))
    print(f"LOCAL_FILE_PATH: {LOCAL_FILE_PATH}")

    # The target directory must exist; otherwise gsutil would write a plain
    # file named `data` instead of copying into a directory.
    os.makedirs(LOCAL_PATH, exist_ok=True)

    # Copy the embeddings to the local folder. check=True raises if the copy
    # fails, instead of letting the later read fail on a missing file.
    subprocess.run(["gsutil", "-m", "cp", EMBEDDINGS_PATH, LOCAL_PATH], check=True)

    INPUT_FILE_PATH = LOCAL_FILE_PATH
else:
    INPUT_FILE_PATH = EMBEDDINGS_PATH

print(f"INPUT_FILE_PATH: {INPUT_FILE_PATH}")


EMBEDDINGS_PATH: gs://emopti_shared/aiipem_deidentified_palm2_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv
LOCAL_FILE_PATH: ./data/aiipem_deidentified_palm2_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv
EMBEDDINGS_MODEL_NAME: textembedding-gecko@001


#### Read Embeddings

In [11]:
# Load the embeddings CSV and report how long the read took.
start_time = time.time()
df = pd.read_csv(INPUT_FILE_PATH)
read_seconds = time.time() - start_time
print(f"time taken in reading data: {read_seconds}")

print("Sample Data: ")
display(df.head(2))

# Parse every value of the "embedding" column with literal_eval and time it
# (the sample output suggests each value is a list literal stored as text).
start_time = time.time()
df["embedding"] = df["embedding"].map(literal_eval)
parse_seconds = time.time() - start_time
print(f"time taken for literal_eval: {parse_seconds}")

print(f"DF Shape : {df.shape}")

time taken in reading data: 29.59541893005371
Sample Data: 


Unnamed: 0,PT_Visit_ID_Hashed,combined,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Desc,embedding
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,Complaint: FLANK PAIN; Arrival_Method: Ambulan...,Maternal care for other conditions predominant...,Abdominal and pelvic pain,,"[0.006166818551719189, -6.569196557393298e-05,..."
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,Complaint: DIZZINESS; Arrival_Method: Car; Gro...,Volume depletion,Acute posthemorrhagic anemia,Hematuria,"[0.003573720809072256, -0.0013320287689566612,..."


time taken for literal_eval: 34.90341067314148
DF Shape : (10000, 6)


#### Chroma DB

In [12]:
# Create an on-disk (persistent) Chroma client and a fresh collection.
# NOTE(review): the path is hard-coded to "chroma_palm2" while CHROMA_DB_NAME
# is set to "chroma_openai" in the global params - confirm which is intended.
print("\nCreate Chroma Client and Collection")
chroma_settings = Settings(allow_reset=True)
client = chromadb.PersistentClient(path="chroma_palm2", settings=chroma_settings)

# WARNING: reset() empties and completely resets the database.
# This is destructive and not reversible.
client.reset()

# The collection is configured for cosine distance.
collection = client.get_or_create_collection(
    name="emopti_search",
    metadata={"hnsw:space": "cosine"},
)

# print(f"\nlist of the first 10 items in the collection: {collection.peek()}")
item_count = collection.count()
print(f"\nnumber of items in the collection: {item_count}")



Create Chroma Client and Collection

number of items in the collection: 0


#### Add Embeddings to Collection

In [13]:
# Insert every (id, embedding) pair from the dataframe into the collection.
start_time = time.time()

ids = df['PT_Visit_ID_Hashed'].tolist()
embeddings = df['embedding'].tolist()

# Ids must be unique. Check up front so that batching below cannot hide a
# duplicate that falls into a different batch.
if len(set(ids)) != len(ids):
    raise ValueError("PT_Visit_ID_Hashed contains duplicate ids; collection ids must be unique")

# Chroma rejects a single add() call larger than its maximum batch size
# (which depends on the installed version / SQLite build), so insert in
# fixed-size chunks instead of one 10k-row call.
ADD_BATCH_SIZE = 5000
for batch_start in range(0, len(ids), ADD_BATCH_SIZE):
    batch_end = batch_start + ADD_BATCH_SIZE
    collection.add(
        embeddings=embeddings[batch_start:batch_end],
        ids=ids[batch_start:batch_end],
    )

# print(f"\nlist of the first 1 items in the collection: {collection.peek(1)}")
print(f"\nnumber of items in the collection: {collection.count()}")
print(f"\ntime taken in adding embeddings to collection: {time.time()- start_time}")


number of items in the collection: 10000

time taken in adding embeddings to collection: 11.869381427764893


In [None]:
#### Copy the Chroma DB directory to GCS (disabled; the source below is a local directory, the destination a gs:// URL)
# NOTE(review): the command copies `chroma_openai`, but the client in this notebook
# persists to `chroma_palm2` - confirm the directory name before enabling.
# ! gsutil -m cp -r chroma_openai gs://roughdraft/villages/emopti/workshops/llm_patient_search/workers/chroma_vector_db

#### Query DB

In [14]:
# Test query: free-text description of the patient to search for.
patient_info = """Complaint: 'cramping pain;"""

# Embed the query text with the model named in EMBEDDINGS_MODEL_NAME.
# One text is sent, so the first (only) result holds the query vector.
embedding_model = TextEmbeddingModel.from_pretrained(EMBEDDINGS_MODEL_NAME)
query_embedding_results = embedding_model.get_embeddings([patient_info])
test_embeddings = query_embedding_results[0].values

In [15]:
# Ask the collection for the 5 stored embeddings closest to the query vector.
query_kwargs = {"query_embeddings": [test_embeddings], "n_results": 5}
search_results = collection.query(**query_kwargs)

In [16]:
# Show the query text followed by the matched patient ids and their similarity scores.
print(f"\n{bold_s}Search Patient{bold_e}: {patient_info}")

print("\nTop Matched Patients: \n")

# Results are nested one list per query embedding; a single query was sent,
# so the matches are at index 0.
matched_ids = search_results['ids'][0]
matched_distances = search_results['distances'][0]

# `patient_id` (not `id`) so the builtin id() is not shadowed for the rest of
# the kernel session.
for patient_id, distance in zip(matched_ids, matched_distances):
    print(f"{bold_s}ID{bold_e}: {patient_id}")
    # The collection uses cosine distance, so similarity = 1 - distance.
    print(f"{bold_s}Similarity{bold_e}: {(1 - distance):.4f}")
    print()



[1mSearch Patient[0m: Complaint: 'cramping pain;

Top Matched Patients: 

[1mID[0m: 90bcbe99fb36f49051199b964ced8d646c5d0bacc1582e4e7c3e86340c18a2d7
[1mSimilariry[0m: 0.7265

[1mID[0m: 011e7611ec4702acc3a2bf2dd5251b0fdf975216a74ccb8815e833da85659d6f
[1mSimilariry[0m: 0.7257

[1mID[0m: 7a5bc4b2a70297eb2eb39e908d801663abca48ddc76318897745b5999d7138a3
[1mSimilariry[0m: 0.7249

[1mID[0m: 01b37540917cc95ca5414f07e5b5b7e539ee20d722e0a995bc0ca5ef93780352
[1mSimilariry[0m: 0.7243

[1mID[0m: 34a19ef11c0a873ab46d3398e1e5ca10ffb72857f2105591db98c3f3e60f9716
[1mSimilariry[0m: 0.7241

