#### Objective

This notebook is to execute below tasks:
1. Fetch the data from GCS bucket
2. Filter key columns, required for Embeddings
3. Clean Columns and generate String Concating columns
4. Save output file back to GCS
5. Create another file with Diagnostic columns - to be used for recommendations and save to GCS
6. DEploy index to Vertex AI

#### Python Version

In [None]:
from platform import python_version
print(python_version())

#ToDO - Need VM with python 3.10

3.10.12


#### Imports and set Path

In [1]:
import pandas as pd
import time
import numpy as np

import sys
sys.path.append("./keys")
sys.path.append("./utils")

from keys import OPENAI_KEY
from embeddings_utils import get_embedding, cosine_similarity, get_embeddings

import openai

# aiplatform package
from google.cloud import aiplatform

In [2]:
# Set up your API credentials
openai.api_key = OPENAI_KEY

#### Global params

In [3]:
BUCKET_NAME = "emopti-vector-search-icd-codes"
BUCKET_URI =  f"gs://{BUCKET_NAME}"

# embedding model parameters
embedding_model = "text-embedding-ada-002"

### Uncomment if using tiktoken
# embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002

input_file_loc = 'gs://emopti_shared/aiipem_deidentified_data_20231208.csv'
output_json_loc = './data/deidentified/aiipem_deidentified_data_20231208_with_vitals_lbel_and_icd_codes.json'

icd_code_desc_loc = 'gs://emopti_shared/filtered_extracted_diag_name_desc.csv'
vital_signs_cond_loc = 'gs://emopti_shared/vital_signs_conditions.csv'
vital_signs_raw_val_loc = 'gs://emopti_shared/vital_signs_raw_values.csv'

PROJECT_ID = "healthlab-genesis"
LOCATION = "us-central1"

#### Dataframe Columns

In [4]:
id_column = 'PT_Visit_ID_Hashed'

columns_for_embeddings = ['Pt_Complaint', 'Pt_Arrival_Method', 'grouped_arrival_method',  'Pt_Triage_Acuity',
           'Pt_Gender', 'Pt_Race', 'Pt_Ethnicity', 'Pulse', 'Resp', 'SpO2', 'Temp', 
           'Arrival_TimeOfDay', 'Age_Group',
           'Systolic_BP', 'Diastolic_BP', 'bp_group']

#### Read Data From BQ

In [5]:
start_time = time.time()
df = pd.read_csv("./data/healthlab/aiipem_deidentified_data_1000_20231208.csv")

print(f"time taken in reading data from BQ: {time.time() - start_time}")
print("Sample Data: ")
display(df.head(2))

print(f"DF Shape : {df.shape}")

time taken in reading data from BQ: 0.02946329116821289
Sample Data: 


Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Temp,Visit_Duration_Hrs,Arrival_TimeOfDay,Age_Group,Pt_DOB_Year,Systolic_BP,Diastolic_BP,emopti_grouped_disposition,grouped_arrival_method,bp_group
0,3c3fee2b39d09271071cead1a9a5ad35187e9a3181225f...,91df64b2d9f1e855ceb4341c19d975b075c5b8f8cea66d...,Car,DIARRHEA,3.0,M,UNKNOWN,Choose not to disclose,E87.1,,...,98.1,5.5,12pm-6pm,40-60,1975.0,145.0,58.0,Not Discharged,private transport,high
1,9311b49a3774354d0ccbd749b44f9fb9117728788cc50d...,4a9b0578ab8d7b8444e930854b0b11805c804fa009d761...,Car,DELIRIUM TREMENS (DTS),3.0,M,UNKNOWN,Choose not to disclose,,,...,98.1,1.0,12pm-6pm,20-40,1997.0,124.0,98.0,Left Without Being Seen,private transport,at_risk


DF Shape : (1000, 27)


#### Filter required columns

In [6]:
#### all columns
all_columns = df.columns.to_list()
        
print(f"\n\nall_columns :{all_columns}")
print(f"\n\nID Column :{id_column}")
print(f"\nColumns For Embeddings :{columns_for_embeddings}")
print("\n\nFiltered DF: ")

#### Save everything
display(df.head(2))



all_columns :['PT_Visit_ID_Hashed', 'PT_ID_Hashed', 'Pt_Arrival_Method', 'Pt_Complaint', 'Pt_Triage_Acuity', 'Pt_Gender', 'Pt_Race', 'Pt_Ethnicity', 'Pt_Prime_ICD10', 'Pt_Secondary_ICD10', 'Pt_Third_ICD10', 'EHR_Disposition', 'd2i_Grouped_Disposition', 'BP', 'Pulse', 'Resp', 'SpO2', 'Temp', 'Visit_Duration_Hrs', 'Arrival_TimeOfDay', 'Age_Group', 'Pt_DOB_Year', 'Systolic_BP', 'Diastolic_BP', 'emopti_grouped_disposition', 'grouped_arrival_method', 'bp_group']


ID Column :PT_Visit_ID_Hashed

Columns For Embeddings :['Pt_Complaint', 'Pt_Arrival_Method', 'grouped_arrival_method', 'Pt_Triage_Acuity', 'Pt_Gender', 'Pt_Race', 'Pt_Ethnicity', 'Pulse', 'Resp', 'SpO2', 'Temp', 'Arrival_TimeOfDay', 'Age_Group', 'Systolic_BP', 'Diastolic_BP', 'bp_group']


Filtered DF: 


Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Temp,Visit_Duration_Hrs,Arrival_TimeOfDay,Age_Group,Pt_DOB_Year,Systolic_BP,Diastolic_BP,emopti_grouped_disposition,grouped_arrival_method,bp_group
0,3c3fee2b39d09271071cead1a9a5ad35187e9a3181225f...,91df64b2d9f1e855ceb4341c19d975b075c5b8f8cea66d...,Car,DIARRHEA,3.0,M,UNKNOWN,Choose not to disclose,E87.1,,...,98.1,5.5,12pm-6pm,40-60,1975.0,145.0,58.0,Not Discharged,private transport,high
1,9311b49a3774354d0ccbd749b44f9fb9117728788cc50d...,4a9b0578ab8d7b8444e930854b0b11805c804fa009d761...,Car,DELIRIUM TREMENS (DTS),3.0,M,UNKNOWN,Choose not to disclose,,,...,98.1,1.0,12pm-6pm,20-40,1997.0,124.0,98.0,Left Without Being Seen,private transport,at_risk


#### EDA

In [7]:
df['grouped_arrival_method'].value_counts()[0:20]

grouped_arrival_method
private transport    804
ambulance            158
helicopter            28
police                 5
other                  5
Name: count, dtype: int64

In [11]:
# for col in df.columns:
#     print(f"\n\nColumn: {col}")
#     print(f"\n{df[col].value_counts()}")

#### Concate Columns

In [8]:
df.columns

Index(['PT_Visit_ID_Hashed', 'PT_ID_Hashed', 'Pt_Arrival_Method',
       'Pt_Complaint', 'Pt_Triage_Acuity', 'Pt_Gender', 'Pt_Race',
       'Pt_Ethnicity', 'Pt_Prime_ICD10', 'Pt_Secondary_ICD10',
       'Pt_Third_ICD10', 'EHR_Disposition', 'd2i_Grouped_Disposition', 'BP',
       'Pulse', 'Resp', 'SpO2', 'Temp', 'Visit_Duration_Hrs',
       'Arrival_TimeOfDay', 'Age_Group', 'Pt_DOB_Year', 'Systolic_BP',
       'Diastolic_BP', 'emopti_grouped_disposition', 'grouped_arrival_method',
       'bp_group'],
      dtype='object')

In [9]:
print(f"DF Shape : {df.shape}")

DF Shape : (1000, 27)


#### Modify DataFrame for values

In [10]:
vital_signs_raw_val_loc = "./data/deidentified/vital_signs_raw_modified.csv"
icd_code_desc_loc = "./data/deidentified/filtered_extracted_diag_name_desc.csv"

In [None]:
def apply_labels(df, df_raw_values):

    vitals = ['Temp', 'Pulse', 'Systolic_BP', 'Diastolic_BP', 'Resp', 'SpO2']
    age_list = ['>13', '2-13', '1-2', '0.5-1', '<0.5']

    processed_df_list = []

    for age in age_list:
        print(f"\nProcessing age : {age}")

        df_age_temp = df[df['Age_Group'] == age].copy()

        for vital in vitals:
            print(f"Processing Vital : {vital}")

            df_vital = df_raw_values[((df_raw_values['Vital']==vital) & (df_raw_values['Age']==age))].iloc[0]

            labels = [ 'Nonsense_Low', 'Very_Bad_Low', 'Normal_Low', 'Normal_High', 'Bad_High', 'Very_Bad_High', 'Nonsense_High']

            bins = [min(df_age_temp['Temp'].min()-1, df_vital['Nonsense_Low']-1), df_vital['Nonsense_Low'], df_vital['Very_Bad_Low'], 
                    df_vital['Normal_Low'], df_vital['Normal_High'], df_vital['Bad_High'], 
                    df_vital['Very_Bad_High'], np.inf]

            if vital == "SpO2":
                labels.remove("Bad_High")
                labels.remove("Very_Bad_High")

                bins.remove(df_vital['Bad_High'])
                bins.remove(df_vital['Very_Bad_High'])
                
            df_age_temp[f"{vital}_Bin"] = pd.cut(df_age_temp[f"{vital}"], bins=bins, labels=labels)
        processed_df_list.append(df_age_temp)
                
    df_processed = pd.concat(processed_df_list)

    return df_processed


In [11]:
print("\nReading DF: ")

#### Read DF
df_raw_values = pd.read_csv(vital_signs_raw_val_loc)
display(df_raw_values.head(2))

#### ICD 10 Descriptions
df_icd_desc = pd.read_csv(icd_code_desc_loc)
df_icd_desc.columns = ['Code', "Desc"]
display(df_icd_desc.head(2))

################################################################
################################################################
#### Age group Mappimg
print("\nApply Age mapping: ")
Age_Group = {"0-.5": "<0.5", ".5-1": "0.5-1", "1-2": "1-2", "2-13": "2-13", "13-20": ">13", "20-40": ">13", "40-60": ">13", "60-80": ">13", "80-100": ">13"}
df['Age_Group_Old'] = df['Age_Group'].values
df['Age_Group'] = df["Age_Group"].map(Age_Group)
print(f"DF Shape : {df.shape}")

################################################################
################################################################
print("\nApply Age Vitals Conditions and add Desc: ")
df = apply_labels(df, df_raw_values)

################################################################
################################################################
print("\nAdd ICD10 Desc: ")

#### IDC 10 Descriptions
for col in ["Pt_Prime_ICD10", "Pt_Secondary_ICD10", "Pt_Third_ICD10"]:
    print(f"Col: {col}")
    df[f"{col}_Code"] = df[col].str[:3]
    df = df.merge(df_icd_desc, left_on=f"{col}_Code", right_on="Code", how="left").drop(columns = ['Code']).rename(columns={"Desc": f"{col}_Desc"})

print(f"DF Shape : {df.shape}")
################################################################
################################################################

df = df.fillna("NA")
display(df.head(2))
print(f"DF Shape : {df.shape}")

################################################################
################################################################


Reading DF: 


Unnamed: 0,Vital,Age,Normal_High,Normal_Low,Bad_High,Very_Bad_Low,Very_Bad_High,Nonsense_High,Nonsense_Low
0,Temp,>13,100,97,104.0,96,107.0,>107,93
1,Temp,2-13,100,97,104.0,96,107.0,>107,93


Unnamed: 0,Code,Desc
0,A00,Cholera
1,A01,Typhoid and paratyphoid fevers



Apply Age mapping: 
DF Shape : (1000, 28)


#### Backup

In [84]:
vitals = ['Temp', 'Pulse', 'Systolic_BP', 'Diastolic_BP', 'Resp', 'SpO2']
df_condition = pd.DataFrame()

for vital in vitals:

    age_list = []
    value_list = []
    label_list = []

    df_vital = df_raw_values[df_raw_values['Vital'] ==  vital]

    for row_idx in range(0, df_vital.shape[0]):
        row = df_vital.iloc[row_idx]

        value_list.append(f"<={int(row['Nonsense_Low'])}")
        label_list.append('Nonsense_Low')

        value_list.append(f"{int(row['Nonsense_Low'])+1}-{int(row['Very_Bad_Low'])}")
        label_list.append('Very_Bad_Low')

        value_list.append(f"{int(row['Normal_Low'])}")
        label_list.append('Normal_Low')

        value_list.append(f"{int(row['Normal_Low'])+1}-{int(row['Normal_High'])}")
        label_list.append('Normal_High')

        if vital != 'SpO2':
            value_list.append(f"{int(row['Normal_High'])+1}-{int(row['Bad_High'])}")
            label_list.append('Bad_High')

            value_list.append(f"{int(row['Bad_High'])+1}-{int(row['Very_Bad_High'])}")
            label_list.append('Very_Bad_High')

            value_list.append(f"{row['Nonsense_High']}")
            label_list.append('Nonsense_High')
            age_list.extend([row['Age']]*7)

        else:
            value_list.append(f">{row['Normal_High']}")
            label_list.append('Nonsense_High')
            age_list.extend([row['Age']]*5)

    df_temp = pd.DataFrame({"age": age_list, f"{vital}": value_list, "label": label_list})

    if len(df_condition) == 0:
        df_condition = df_temp
    else:
        df_condition = df_condition.merge(df_temp, on=['age', 'label'], how='left')

print(f"df_condition shape: {df_condition.shape}")
display(df_condition.head(2))

df_condition shape: (35, 8)


Unnamed: 0,age,Temp,label,Pulse,Systolic_BP,Diastolic_BP,Resp,SpO2
0,>13,<=93,Nonsense_Low,<=25,<=50,<=30,<=4,<=75
1,>13,94-96,Very_Bad_Low,26-40,51-75,31-40,5-8,76-88



Processing age : >13
Processing Vital : Temp
Processing Vital : Pulse
Processing Vital : Systolic_BP
Processing Vital : Diastolic_BP
Processing Vital : Resp
Processing Vital : SpO2

Processing age : 2-13
Processing Vital : Temp
Processing Vital : Pulse
Processing Vital : Systolic_BP
Processing Vital : Diastolic_BP
Processing Vital : Resp
Processing Vital : SpO2

Processing age : 1-2
Processing Vital : Temp
Processing Vital : Pulse
Processing Vital : Systolic_BP
Processing Vital : Diastolic_BP
Processing Vital : Resp
Processing Vital : SpO2

Processing age : 0.5-1
Processing Vital : Temp
Processing Vital : Pulse
Processing Vital : Systolic_BP
Processing Vital : Diastolic_BP
Processing Vital : Resp
Processing Vital : SpO2

Processing age : <0.5
Processing Vital : Temp
Processing Vital : Pulse
Processing Vital : Systolic_BP
Processing Vital : Diastolic_BP
Processing Vital : Resp
Processing Vital : SpO2


In [14]:
#### Convert columns to right values for label generation
def apply_conditions(x, col_values):
    if (x <= col_values[0]):
        return f"<={col_values[0]}"
    elif (x > col_values[0]) and (x <= col_values[1]):
        return f"{col_values[0]+1}-{col_values[1]}"
    elif (x == col_values[2]):
        return f"{col_values[2]}"
    elif (x > col_values[2]) and (x <= col_values[4]):
        return f"{col_values[2]+1}-{col_values[4]}"
    elif (x > col_values[4]) and (x <= col_values[5]):
        return f"{col_values[4]+1}-{col_values[5]}"
    elif (x > col_values[5]):
        return f">{col_values[5]}"
    else:
        return "na"

################################################################
################################################################

print("\nReading DF: ")

#### Read DF
df_raw_values = pd.read_csv(vital_signs_raw_val_loc)
display(df_raw_values)

#### condition_df
df_condition = pd.read_csv(vital_signs_cond_loc)
display(df_condition.head(5))

#### ICD 10 Descriptions
df_icd_desc = pd.read_csv(icd_code_desc_loc)
df_icd_desc.columns = ['Code', "Desc"]
display(df_icd_desc.head(2))

################################################################
################################################################
#### Age group Mappimg
print("\nApply Age mapping: ")
Age_Group = {"0-.5": "0-.5", ".5-1": ".5-1", "1-2": "1-2", "2-13": "2-13", "13-20": ">13", "20-40": ">13", "40-60": ">13", "60-80": ">13", "80-100": ">13"}
df['Age_Group_Old'] = df['Age_Group'].values
df['Age_Group'] = df["Age_Group"].map(Age_Group)
print(f"DF Shape : {df.shape}")

################################################################
#### Vital mapping to apply Age and Vitals conditions
print("\nApply Vitals mapping: ")
for col in df_raw_values.columns:
    col_values = df_raw_values[col].values
    col_values = col_values.astype(int)

    print(f"{col}: {col_values}")
    df[f'{col}_old'] = df[f"{col}"].values
    df[f'{col}'] = df[f"{col}"].apply(lambda x: apply_conditions(x, col_values))

print(f"DF Shape : {df.shape}")
################################################################
#### Apply Conditions and create Labels for Vitals
print("\nApply Age Vitals Conditions and add Desc: ")
age_col = "Age_Group"
label_col = "label"
columns = ['Temp', 'Pulse', 'Resp', 'SpO2', 'Systolic_BP', 'Diastolic_BP']

for col in columns:
    print(f"col: {col}")
    df_condition[f"{col}_Cat"] = df_condition[label_col]
    df = pd.merge(df, df_condition[[age_col, col, f"{col}_Cat"]], on=[age_col, col], how="left")

print(f"DF Shape : {df.shape}")
################################################################

print("\nAdd ICD10 Desc: ")

#### IDC 10 Descriptions
for col in ["Pt_Prime_ICD10", "Pt_Secondary_ICD10", "Pt_Third_ICD10"]:
    print(f"Col: {col}")
    df[f"{col}_Code"] = df[col].str[:3]
    df = df.merge(df_icd_desc, left_on=f"{col}_Code", right_on="Code", how="left").drop(columns = ['Code']).rename(columns={"Desc": f"{col}_Desc"})

print(f"DF Shape : {df.shape}")
################################################################

df = df.fillna("NA")
display(df.head(2))
print(f"DF Shape : {df.shape}")

################################################################


Reading DF: 


Unnamed: 0,Temp,Pulse,Resp,SpO2,Systolic_BP,Diastolic_BP
0,93.0,25.0,4.0,75.0,50.0,30.0
1,96.0,40.0,8.0,88.0,75.0,40.0
2,97.0,60.0,12.0,94.0,90.0,55.0
3,,,,,,
4,100.0,100.0,24.0,100.0,135.0,90.0
5,104.0,120.0,32.0,101.0,180.0,120.0


Unnamed: 0,Age_Group,Temp,Pulse,Resp,SpO2,Systolic_BP,Diastolic_BP,label
0,>13,<=93,<=25,<=4,<=75,<=50,<=30,Nonsense Low
1,>13,94-96,26-40,5-8,76-88,51-75,31-40,Very Bad Low
2,>13,97,60,12,94,90,55,Normal Low
3,>13,98-100,61-100,13-24,95-100,91-135,56-90,Normal High
4,>13,101-104,101-120,25-32,101-101,136-180,91-120,Very Bad High


Unnamed: 0,Code,Desc
0,A00,Cholera
1,A01,Typhoid and paratyphoid fevers



Apply Age mapping: 
DF Shape : (535441, 28)

Apply Vitals mapping: 
Temp: [                  93                   96                   97
 -9223372036854775808                  100                  104]
Pulse: [                  25                   40                   60
 -9223372036854775808                  100                  120]
Resp: [                   4                    8                   12
 -9223372036854775808                   24                   32]
SpO2: [                  75                   88                   94
 -9223372036854775808                  100                  101]
Systolic_BP: [                  50                   75                   90
 -9223372036854775808                  135                  180]
Diastolic_BP: [                  30                   40                   55
 -9223372036854775808                   90                  120]
DF Shape : (535441, 34)

Apply Age Vitals Conditions and add Desc: 
col: Temp
col: Pulse
col: Resp
col: S

Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Resp_Cat,SpO2_Cat,Systolic_BP_Cat,Diastolic_BP_Cat,Pt_Prime_ICD10_Code,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Code,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Code,Pt_Third_ICD10_Desc
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Ambulance,FLANK PAIN,3,F,UNKNOWN,Choose not to disclose,O26.893,R10.2,...,Normal High,Normal High,Normal High,Normal High,O26,Maternal care for other conditions predominant...,R10,Abdominal and pelvic pain,,
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Car,DIZZINESS,3,F,UNKNOWN,Choose not to disclose,E86.0,D62,...,Normal High,Normal High,Normal High,Normal High,E86,Volume depletion,D62,Acute posthemorrhagic anemia,R31,Hematuria


DF Shape : (535460, 46)


#### Concat

In [15]:
df["combined"] = ("Complaint: " + df['Pt_Complaint'].str.strip() + 
                  "; Arrival_Method: " + df['Pt_Arrival_Method'].str.strip() + 
                  "; Grouped_Arrival_Method: " + df['grouped_arrival_method'].str.strip() + 
                  "; Triage_Acuity: " + df['Pt_Triage_Acuity'].astype(str).str.strip() + 
                  "; Gender: " + df['Pt_Gender'].str.strip() + 
                  "; Race: " + df['Pt_Race'].str.strip() + 
                  "; Ethnicity: " + df['Pt_Ethnicity'].str.strip() + 
                  
                  "; Pulse: " + df['Pulse'].astype(str).str.strip() + 
                  "; Pulse_Label: " + df['Pulse_Cat'].astype(str).str.strip() + 
                  
                  "; Resp: " + df['Resp'].astype(str).str.strip() + 
                  "; Resp_Label: " + df['Resp_Cat'].astype(str).str.strip() + 
                  
                  "; SpO2: " + df['SpO2'].astype(str).str.strip() + 
                  "; SpO2_Label: " + df['SpO2_Cat'].astype(str).str.strip() +
                  
                  "; Temp: " + df['Temp'].astype(str).str.strip() + 
                  "; Temp_Label: " + df['Temp_Cat'].astype(str).str.strip() + 
                  
                  "; Arrival_TimeOfDay: " + df['Arrival_TimeOfDay'].str.strip() + 
                  "; Age_Group: " + df['Age_Group'].str.strip() + 
                  
                  "; Systolic_BP: " + df['Systolic_BP'].astype(str).str.strip() + 
                  "; Systolic_BP_Label: " + df['Systolic_BP_Cat'].astype(str).str.strip() + 
                  
                  "; Diastolic_BP: " + df['Diastolic_BP'].astype(str).str.strip() + 
                  "; Diastolic_BP_Label: " + df['Diastolic_BP_Cat'].astype(str).str.strip() +
                  
                  "; BP_Group: " + df['bp_group'].str.strip() +
                  
                  "; Prime Diagnosis: " + df['Pt_Prime_ICD10_Desc'].astype(str).str.strip() + 
                  "; Secondary Diagnosis: " + df['Pt_Secondary_ICD10_Desc'].astype(str).str.strip() + 
                  "; Third Diagnosis: " + df['Pt_Third_ICD10_Desc'].astype(str).str.strip()
                  )
display(df.head(2))

Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,SpO2_Cat,Systolic_BP_Cat,Diastolic_BP_Cat,Pt_Prime_ICD10_Code,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Code,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Code,Pt_Third_ICD10_Desc,combined
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Ambulance,FLANK PAIN,3,F,UNKNOWN,Choose not to disclose,O26.893,R10.2,...,Normal High,Normal High,Normal High,O26,Maternal care for other conditions predominant...,R10,Abdominal and pelvic pain,,,Complaint: FLANK PAIN; Arrival_Method: Ambulan...
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Car,DIZZINESS,3,F,UNKNOWN,Choose not to disclose,E86.0,D62,...,Normal High,Normal High,Normal High,E86,Volume depletion,D62,Acute posthemorrhagic anemia,R31,Hematuria,Complaint: DIZZINESS; Arrival_Method: Car; Gro...


In [16]:
print(f"Sample Patient info: \n{df['combined'].iloc[0]}")

Sample Patient info: 
Complaint: FLANK PAIN; Arrival_Method: Ambulance; Grouped_Arrival_Method: ambulance; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: 12pm-6pm; Age_Group: >13; Systolic_BP: 91-135; Systolic_BP_Label: Normal High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: normal; Prime Diagnosis: Maternal care for other conditions predominantly related to pregnancy; Secondary Diagnosis: Abdominal and pelvic pain; Third Diagnosis: NA


In [17]:
print(f"DF shape before drop_NA: {df.shape}")

df.dropna(subset = ['combined'], inplace=True)
print(f"DF shape after drop_NA: {df.shape}")


DF shape before drop_NA: (535460, 47)
DF shape after drop_NA: (535460, 47)


In [19]:
#### test only
start_time= time.time()
print(f"\nsaving df to disk:")
print(f"df shape: {df.shape}")
df.to_csv('gs://emopti_shared/aiipem_deidentified_data_20231208_with_labels_and_desc.csv', index= False)
print(f"time taken for saving df : {time.time()- start_time}")


saving df to disk:
df shape: (535460, 47)
time taken for saving df : 29.388294458389282


In [14]:
# df = pd.read_csv('gs://emopti_shared/aiipem_deidentified_data_20231208_with_labels_and_desc.csv')

In [15]:
display(df.head(2))

Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,SpO2_Cat,Systolic_BP_Cat,Diastolic_BP_Cat,Pt_Prime_ICD10_Code,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Code,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Code,Pt_Third_ICD10_Desc,combined
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,O26.893,R10.2,...,Normal High,Normal High,Normal High,O26,Maternal care for other conditions predominant...,R10,Abdominal and pelvic pain,,,Complaint: FLANK PAIN; Arrival_Method: Ambulan...
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,E86.0,D62,...,Normal High,Normal High,Normal High,E86,Volume depletion,D62,Acute posthemorrhagic anemia,R31,Hematuria,Complaint: DIZZINESS; Arrival_Method: Car; Gro...


#### Approximate Tokens and Cost calculation

In [44]:
# import tiktoken

# encoding = tiktoken.get_encoding(embedding_encoding)
# n_tokens_list = []

# # omit reviews that are too long to embed
# df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))

# print(f"df len: {len(df)}")
# display(df.head(2))

# print(f"# Tokens: {df['n_tokens'].sum()}")



In [45]:
# #### Cost
# n_tokens = df['n_tokens'].sum()
# per_1k_price = 0.0001 
# print(f"approximate cost: ${round(n_tokens * per_1k_price/1000, 2)}")

# #### Cost for 100k comments
# n_tokens = df[0:100000]['n_tokens'].sum()
# per_1k_price = 0.0001 
# print(f"approximate cost: ${round(n_tokens * per_1k_price/1000, 2)}")

#### Generate Embeddings

In [86]:
#### Time taken in sequential order for 10k patients --> 50 seconds

# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
start_time= time.time()

#### test only
df_copy = df[["PT_Visit_ID_Hashed", 'combined', 'Pt_Prime_ICD10_Desc', 'Pt_Secondary_ICD10_Desc', 'Pt_Third_ICD10_Desc']].iloc[0:10000].copy()

# df_copy = df[[id_column, 'combined']].copy()

embedding_list = []
batch_size = 2048

for idx in range(0, len(df_copy)-1, batch_size):
    start_idx = idx
    end_idx = min(len(df_copy), idx+batch_size)
    print(f"embeddings for: {start_idx, end_idx}")
    txt_list = df_copy['combined'].iloc[start_idx:end_idx]
    embedding_values = get_embeddings(txt_list, model=embedding_model)
    embedding_list.extend(embedding_values)

print(f"time taken for embeddings : {time.time()- start_time}")

df_copy["embedding"]  = embedding_list

#### test only
start_time= time.time()
print(f"\nsaving embedings to disk:")
print(f"embedings df shape: {df_copy.shape}")
df_copy.to_csv("./data/deidentified/aiipem_deidentified_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv.gz", index= False, compression='gzip')
print(f"time taken for saving embeddings : {time.time()- start_time}")



saving embedings to disk:
embedings df shape: (10000, 6)
time taken for saving embeddings : 93.76497602462769


In [58]:
display(df_copy.head(2))
df_copy.shape

Unnamed: 0,PT_Visit_ID_Hashed,combined,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Desc,embedding
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,Complaint: FLANK PAIN; Arrival_Method: Ambulan...,Maternal care for other conditions predominant...,Abdominal and pelvic pain,,"[0.0010272093350067735, 0.0016156438505277038,..."
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,Complaint: DIZZINESS; Arrival_Method: Car; Gro...,Volume depletion,Acute posthemorrhagic anemia,Hematuria,"[0.007763882167637348, 0.0021794575732201338, ..."


(10000, 6)

#### Read Embeddings from Disk

In [16]:
# import pandas as pd
# import numpy as np
# import time
# from ast import literal_eval

# datafile_path = "./data/deidentified/aiipem_deidentified_embeddings_batch_0_to_10k_vitals_labels_and_icd_desc.csv.gz"

# start_time= time.time()
# df_copy = pd.read_csv(datafile_path)
# print(f"time taken to read csv: {time.time()- start_time}")

# start_time= time.time()
# df_copy["embedding"] = df_copy.embedding.apply(literal_eval).apply(np.array)
# print(f"time taken for literal_eval: {time.time()- start_time}")

# display(df_copy.head(2))
# df_copy.shape

time taken to read csv: 6.112082481384277
time taken for literal_eval: 73.16566777229309


Unnamed: 0,PT_Visit_ID_Hashed,combined,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Desc,embedding
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,Complaint: FLANK PAIN; Arrival_Method: Ambulan...,Maternal care for other conditions predominant...,Abdominal and pelvic pain,,"[0.0010272093350067735, 0.0016156438505277038,..."
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,Complaint: DIZZINESS; Arrival_Method: Car; Gro...,Volume depletion,Acute posthemorrhagic anemia,Hematuria,"[0.007763882167637348, 0.0021794575732201338, ..."


(10000, 6)

#### Cosine Similarity

In [18]:

# search through the patients for a specific patient
def search_patients(df, patient_info):
    product_embedding = get_embedding(patient_info, model=embedding_model)
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = df.sort_values("similarity", ascending=False)
    return results

bold_s = "\033[1m"
bold_e = "\033[0m"

#### Example 1

In [19]:
top_k = 5
patient_id = 10005
df_patient_search = df_copy.copy()

patient_info = df.iloc[patient_id]
patient_text = patient_info['combined'].split(' Prime Diagnosis:')[0]
results = search_patients(df_patient_search, patient_text)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_text}")
print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {patient_info['Pt_Prime_ICD10_Desc']}")
print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {patient_info['Pt_Secondary_ICD10_Desc']}")
print(f"{bold_s}Pt_Third_ICD10{bold_e}: {patient_info['Pt_Third_ICD10_Desc']}")

print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: BEHAVIORAL HEALTH CONCERN; Arrival_Method: Car; Grouped_Arrival_Method: private transport; Triage_Acuity: 2.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: >13; Systolic_BP: 91-135; Systolic_BP_Label: Normal High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: at_risk;
[1mPt_Prime_ICD10[0m: nan
[1mPt_Secondary_ICD10[0m: Depressive episode
[1mPt_Third_ICD10[0m: nan

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: BEHAVIORAL HEALTH CONCERN; Arrival_Method: Car; Grouped_Arrival_Method: private transport; Triage_Acuity: 2.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 101-120; Pulse_Label: Very Bad High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High;

#### Example 2

In [69]:
df_patient_search = df_copy.copy()

patient_info = "Complaint: EMESIS"
results = search_patients(df_patient_search, patient_info)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_info}")
print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: EMESIS

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: EMESIS; Arrival_Method: Car; Grouped_Arrival_Method: private transport; Triage_Acuity: 4.0; Gender: F; Race: BLACK OR AFRICAN AMERICAN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: Midnight-6am; Age_Group: >13; Systolic_BP: 91-135; Systolic_BP_Label: Normal High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: normal; Prime Diagnosis: Other and unspecified noninfective gastroenteritis and colitis; Secondary Diagnosis: NA; Third Diagnosis: NA
[1mSimilariry[0m: 0.8752226600665103
[1mPt_Prime_ICD10[0m: Other and unspecified noninfective gastroenteritis and colitis
[1mPt_Secondary_ICD10[0m: NA
[1mPt_Third_ICD10[0m: NA

[1mMatched Patient 2[0m: Complaint: EMESIS; Arrival_Method: Car; Grouped_Arri

In [21]:
df_patient_search = df_copy.copy()
top_k = 20
patient_info = "Complaint: Crash"
results = search_patients(df_patient_search, patient_info)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_info}")
print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: Crash

Top 20 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: MOTOR VEHICLE CRASH; Arrival_Method: Car; Grouped_Arrival_Method: private transport; Triage_Acuity: 3.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: >13; Systolic_BP: 136-180; Systolic_BP_Label: Very Bad High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: high; Prime Diagnosis: Motor- or nonmotor-vehicle accident, type of vehicle unspecified; Secondary Diagnosis: NA; Third Diagnosis: NA
[1mSimilariry[0m: 0.8203691313492745
[1mPt_Prime_ICD10[0m: Motor- or nonmotor-vehicle accident, type of vehicle unspecified
[1mPt_Secondary_ICD10[0m: nan
[1mPt_Third_ICD10[0m: nan

[1mMatched Patient 2[0m: Complaint: MOTOR VEHICLE CRASH; Arrival_Method: Am

#### Example 3

In [70]:
df_patient_search = df_copy.copy()

patient_info = "Complaint: EMESIS; Pulse: 130.0"
results = search_patients(df_patient_search, patient_info)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_info}")
print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: EMESIS; Pulse: 130.0

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: EMESIS; Arrival_Method: Walked In; Grouped_Arrival_Method: private transport; Triage_Acuity: 3.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: 12pm-6pm; Age_Group: >13; Systolic_BP: 136-180; Systolic_BP_Label: Very Bad High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: at_risk; Prime Diagnosis: Other symptoms and signs involving the digestive system and abdomen; Secondary Diagnosis: NA; Third Diagnosis: NA
[1mSimilariry[0m: 0.8948611013861237
[1mPt_Prime_ICD10[0m: Other symptoms and signs involving the digestive system and abdomen
[1mPt_Secondary_ICD10[0m: NA
[1mPt_Third_ICD10[0m: NA

[1mMatched Patient 2[0m: Complaint: EMESIS; Arrival_Method: Car; 

#### Example 4

In [80]:
top_k = 5
patient_id = 10011
df_patient_search = df_copy.copy()

patient_info = df.iloc[patient_id]
patient_text = patient_info['combined'].split('; Prime Diagnosis:')[0]
results = search_patients(df_patient_search, patient_text)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_text}")
print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {patient_info['Pt_Prime_ICD10_Desc']}")
print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {patient_info['Pt_Secondary_ICD10_Desc']}")
print(f"{bold_s}Pt_Third_ICD10{bold_e}: {patient_info['Pt_Third_ICD10_Desc']}")

print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: SEIZURES; Arrival_Method: Walked In; Grouped_Arrival_Method: private transport; Triage_Acuity: 4.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: >13; Systolic_BP: 91-135; Systolic_BP_Label: Normal High; Diastolic_BP: 91-120; Diastolic_BP_Label: Very Bad High; BP_Group: at_risk
[1mPt_Prime_ICD10[0m: Open wound of head
[1mPt_Secondary_ICD10[0m: NA
[1mPt_Third_ICD10[0m: NA

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: SEIZURES; Arrival_Method: Walked In; Grouped_Arrival_Method: private transport; Triage_Acuity: 3.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label

#### Example 5

In [81]:
top_k = 5
patient_id = 10112
df_patient_search = df_copy.copy()

patient_info = df.iloc[patient_id]
patient_text = patient_info['combined'].split('; Prime Diagnosis:')[0]
results = search_patients(df_patient_search, patient_text)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_text}")
print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {patient_info['Pt_Prime_ICD10_Desc']}")
print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {patient_info['Pt_Secondary_ICD10_Desc']}")
print(f"{bold_s}Pt_Third_ICD10{bold_e}: {patient_info['Pt_Third_ICD10_Desc']}")

print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: ALLERGIC REACTION; Arrival_Method: Walked In; Grouped_Arrival_Method: private transport; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: 6am-12pm; Age_Group: >13; Systolic_BP: 91-135; Systolic_BP_Label: Normal High; Diastolic_BP: na; Diastolic_BP_Label: NA; BP_Group: at_risk
[1mPt_Prime_ICD10[0m: Adverse effects, not elsewhere classified
[1mPt_Secondary_ICD10[0m: NA
[1mPt_Third_ICD10[0m: NA

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: ALLERGIC REACTION; Arrival_Method: Walk-In; Grouped_Arrival_Method: private transport; Triage_Acuity: 4.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp

#### Example 6

In [82]:
top_k = 5
patient_id = 20000
df_patient_search = df_copy.copy()

patient_info = df.iloc[patient_id]
patient_text = patient_info['combined'].split('; Prime Diagnosis:')[0]
results = search_patients(df_patient_search, patient_text)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_text}")
print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {patient_info['Pt_Prime_ICD10_Desc']}")
print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {patient_info['Pt_Secondary_ICD10_Desc']}")
print(f"{bold_s}Pt_Third_ICD10{bold_e}: {patient_info['Pt_Third_ICD10_Desc']}")

print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: NASAL CONGESTION; Arrival_Method: Car; Grouped_Arrival_Method: private transport; Triage_Acuity: 4.0; Gender: M; Race: BLACK OR AFRICAN AMERICAN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arrival_TimeOfDay: Midnight-6am; Age_Group: >13; Systolic_BP: 136-180; Systolic_BP_Label: Very Bad High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: high
[1mPt_Prime_ICD10[0m: Viral infection of unspecified site
[1mPt_Secondary_ICD10[0m: Pain in throat and chest
[1mPt_Third_ICD10[0m: NA

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: NASAL CONGESTION; Arrival_Method: Car; Grouped_Arrival_Method: private transport; Triage_Acuity: 4.0; Gender: M; Race: BLACK OR AFRICAN AMERICAN; Ethnicity: Choose not to disclose; Pulse: >120; Pulse_Label: Nonsense High; Resp: 13-24; Resp_Label: 

#### Example 7

In [83]:
top_k = 5
patient_id = 50000
df_patient_search = df_copy.copy()

patient_info = df.iloc[patient_id]
patient_text = patient_info['combined'].split('; Prime Diagnosis:')[0]
results = search_patients(df_patient_search, patient_text)

print(f"\n{bold_s}Search Patient{bold_e}: {patient_text}")
print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {patient_info['Pt_Prime_ICD10_Desc']}")
print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {patient_info['Pt_Secondary_ICD10_Desc']}")
print(f"{bold_s}Pt_Third_ICD10{bold_e}: {patient_info['Pt_Third_ICD10_Desc']}")

print(f"\nTop {top_k} Matched Patients: \n")

for idx in range(0, top_k):
    
    matched_patient = results.iloc[idx]
    print(f"{bold_s}Matched Patient {idx+1}{bold_e}: {matched_patient['combined']}")
    print(f"{bold_s}Similariry{bold_e}: {matched_patient['similarity']}")
    print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {matched_patient['Pt_Prime_ICD10_Desc']}")
    print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {matched_patient['Pt_Secondary_ICD10_Desc']}")
    print(f"{bold_s}Pt_Third_ICD10{bold_e}: {matched_patient['Pt_Third_ICD10_Desc']}")
    print()



[1mSearch Patient[0m: Complaint: DIZZINESS; Arrival_Method: Ambulance; Grouped_Arrival_Method: ambulance; Triage_Acuity: 2.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 97; Temp_Label: Normal Low; Arrival_TimeOfDay: 6am-12pm; Age_Group: >13; Systolic_BP: 136-180; Systolic_BP_Label: Very Bad High; Diastolic_BP: 56-90; Diastolic_BP_Label: Normal High; BP_Group: high
[1mPt_Prime_ICD10[0m: Dizziness and giddiness
[1mPt_Secondary_ICD10[0m: NA
[1mPt_Third_ICD10[0m: NA

Top 5 Matched Patients: 

[1mMatched Patient 1[0m: Complaint: DIZZINESS; Arrival_Method: Ambulance; Grouped_Arrival_Method: ambulance; Triage_Acuity: 2.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 61-100; Pulse_Label: Normal High; Resp: 13-24; Resp_Label: Normal High; SpO2: 95-100; SpO2_Label: Normal High; Temp: 98-100; Temp_Label: Normal High; Arriva

### GCS Vector Search Deployement

#### Save the embeddings in a JSON file

In [59]:
output_json_loc

'./data/deidentified/aiipem_deidentified_data_20231208_with_vitals_lbel_and_icd_codes.json'

In [61]:
print("Data with Embeddings: ")
display(df_copy.head(2))

print("\nFilter df for vector search: ")
df_filter = df_copy[[id_column, "embedding"]]

df_filter.columns = ["id", "embedding"]
display(df_filter.head(2))
print(f"df_filter shape : {df_filter.shape}")

# save id and embedding as a json file
jsonl_string = df_filter.to_json(orient = 'records', lines = True)
with open(output_json_loc, 'w') as f:
    f.write(jsonl_string)


Data with Embeddings: 


Unnamed: 0,PT_Visit_ID_Hashed,combined,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Desc,embedding
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,Complaint: FLANK PAIN; Arrival_Method: Ambulan...,Maternal care for other conditions predominant...,Abdominal and pelvic pain,,"[0.0010272093350067735, 0.0016156438505277038,..."
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,Complaint: DIZZINESS; Arrival_Method: Car; Gro...,Volume depletion,Acute posthemorrhagic anemia,Hematuria,"[0.007763882167637348, 0.0021794575732201338, ..."



Filter df for vector search: 


Unnamed: 0,id,embedding
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,"[0.0010272093350067735, 0.0016156438505277038,..."
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,"[0.007763882167637348, 0.0021794575732201338, ..."


df_filter shape : (10000, 2)


#### Copy the file to a new GCS bucket

In [85]:
! gsutil -m cp {output_json_loc} {BUCKET_URI}

Copying file://./data/deidentified/aiipem_deidentified_data_20231208_with_vitals_lbel_and_icd_codes.json [Content-Type=application/json]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1/1 files][197.1 MiB/197.1 MiB] 100% Done                                    
Operation completed over 1 objects/197.1 MiB.                                    


#### Create an Index

The parameters for creating index
- contents_delta_uri: The URI of Cloud Storage directory where you stored the embedding JSON files
- dimensions: Dimension size of each embedding. In this case, it is 768 as we are using the embeddings from the Text Embeddings API.
- approximate_neighbors_count: how many similar items we want to retrieve in typical cases
- distance_measure_type: what metrics to measure distance/similarity between embeddings. In this case it's DOT_PRODUCT_DISTANCE

In [63]:
aiplatform.init(project=PROJECT_ID, location = LOCATION)

In [19]:
# create index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
  display_name = f"{BUCKET_NAME}-index",
  contents_delta_uri = BUCKET_URI,
  dimensions = 1536,
  approximate_neighbors_count = 20,
  distance_measure_type = "DOT_PRODUCT_DISTANCE",
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/1090622561506/locations/us-central1/indexes/2302399338795499520/operations/7274772381413933056
MatchingEngineIndex created. Resource name: projects/1090622561506/locations/us-central1/indexes/2302399338795499520
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/1090622561506/locations/us-central1/indexes/2302399338795499520')


#### Create Index Endpoint and deploy the Index

In [20]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
  display_name = f"{BUCKET_NAME}-index-endpoint",
  public_endpoint_enabled = True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960/operations/2705870559446564864
MatchingEngineIndexEndpoint created. Resource name: projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960')


In [28]:
DEPLOYED_INDEX_ID = f"{BUCKET_NAME.replace('-', '_')}_deployed"
print(f"DEPLOYED_INDEX_ID: {DEPLOYED_INDEX_ID}")

# deploy the Index to the Index Endpoint
my_index_endpoint.deploy_index(
  index = my_index, deployed_index_id = DEPLOYED_INDEX_ID
)

DEPLOYED_INDEX_ID: emopti_vector_search_deployed_test1
Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960/operations/2086625610683121664
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7f2d874dac50> 
resource name: projects/1090622561506/locations/us-central1/indexEndpoints/4269909426002984960

#### Run Query

In [88]:
# top_k = 20
# patient_id = 1001

# patient_info = df_copy.iloc[patient_id]
# test_embeddings = [get_embedding(patient_info['combined'], model=embedding_model)]

# print(f"\n{bold_s}Search Patient{bold_e}: {patient_info['combined']}")
# print(f"{bold_s}Pt_Prime_ICD10{bold_e}: {patient_info['Pt_Prime_ICD10']}")
# print(f"{bold_s}Pt_Secondary_ICD10{bold_e}: {patient_info['Pt_Secondary_ICD10']}")
# print(f"{bold_s}Pt_Third_ICD10{bold_e}: {patient_info['Pt_Third_ICD10']}")


In [89]:
# # Test query
# response = my_index_endpoint.find_neighbors(
#   deployed_index_id = DEPLOYED_INDEX_ID,
#   queries = test_embeddings,
#   num_neighbors = 20,
# )

# # show the result
# import numpy as np
# for idx, neighbor in enumerate(response[0]):
#     id = neighbor.id
#     similar = df_copy.query("PT_Visit_ID_Hashed == @id", engine = "python")
#     print(f"{neighbor.distance:.4f} {similar.combined.values[0]}")

#### IMPORTANT: Cleaning Up
In case you are using your own Cloud project, not a temporary project on Qwiklab, please make sure to delete all the Indexes, Index Endpoints and Cloud Storage buckets after finishing this tutorial. Otherwise the remaining objects would incur unexpected costs.

In [90]:
# # wait for a confirmation
# input("Press Enter to delete Index Endpoint, Index and Cloud Storage bucket:")

# # delete Index Endpoint
# my_index_endpoint.undeploy_all()
# my_index_endpoint.delete(force = True)

# # delete Index
# my_index.delete()

# # delete Cloud Storage bucket
# ! gsutil rm -r {BUCKET_URI}