#### Objective

This notebook is to execute below tasks:
1. Fetch the data from GCS bucket
2. Filter key columns, required for Embeddings
3. Clean Columns and generate String Concating columns
4. Save output file back to GCS
5. Create another file with Diagnostic columns - to be used for recommendations and save to GCS

#### Imports

In [3]:
# from utils.bq_utils import BQClient
import pandas as pd
import dtale
from ydata_profiling import ProfileReport

#### Create BQ Client Object

In [2]:
# bq_client = BQClient()

#### Read Data From BQ and Store to Local

In [3]:
# # The ID of your GCS bucket
# bucket_name = "emopti_shared"

# # The ID of your GCS object
# source_blob_name = "aiipem_deidentified_data_1000_20231208.csv"
# # source_blob_name = "aiipem_deidentified_data_20231208.csv"
# # source_blob_name = "d2i_LOF_091123.txt"

# # The path to which the file should be downloaded
# destination_file_name = f"./data/{source_blob_name}"

# bq_client.download_blob(bucket_name, source_blob_name, destination_file_name)

#### Read data from Local and Filter required columns

In [4]:
print("Original DF: ")
df = pd.read_csv("./data/deidentified/aiipem_deidentified_data_20231208.csv")
display(df.head(2))

#### all columns
all_columns = df.columns.to_list()
        
id_column = ['PT_Visit_ID_Hashed']

columns_for_embeddings = ['Pt_Arrival_Method', 'Pt_Complaint', 'Pt_Triage_Acuity',
           'Pt_Gender', 'Pt_Race', 'Pt_Ethnicity', 'Pulse', 'Resp', 'SpO2', 'Temp', 
           'Arrival_TimeOfDay', 'Age_Group',
           'grouped_arrival_method', 'bp_group']

columns_for_output = ['EHR_Disposition', 'd2i_Grouped_Disposition', 'emopti_grouped_disposition']
columns_to_keep = id_column + columns_for_embeddings + columns_for_output
columns_dropped = list(set(all_columns) - set(columns_to_keep))

print(f"\n\nID Columns :{id_column}")
print(f"\nColumns For Embeddings :{columns_for_embeddings}")
print(f"\nColumns for output :{columns_for_output}")
print(f"\nColumns dropped :{columns_dropped}")
print(f"\nColumns to keep :{columns_to_keep}")

print("\n\nFiltered DF: ")
display(df.head(2))

Original DF: 


Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Temp,Visit_Duration_Hrs,Arrival_TimeOfDay,Age_Group,Pt_DOB_Year,Systolic_BP,Diastolic_BP,emopti_grouped_disposition,grouped_arrival_method,bp_group
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,O26.893,R10.2,...,97.9,1.5,12pm-6pm,20-40,1988.0,111.0,62.0,Left Against Medical Advice,ambulance,normal
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,E86.0,D62,...,97.9,4.5,6pm-Midnight,20-40,1988.0,115.0,56.0,Discharged,private transport,normal




ID Columns :['PT_Visit_ID_Hashed']

Columns For Embeddings :['Pt_Arrival_Method', 'Pt_Complaint', 'Pt_Triage_Acuity', 'Pt_Gender', 'Pt_Race', 'Pt_Ethnicity', 'Pulse', 'Resp', 'SpO2', 'Temp', 'Arrival_TimeOfDay', 'Age_Group', 'grouped_arrival_method', 'bp_group']

Columns for output :['EHR_Disposition', 'd2i_Grouped_Disposition', 'emopti_grouped_disposition']

Columns dropped :['Systolic_BP', 'Visit_Duration_Hrs', 'Pt_Secondary_ICD10', 'BP', 'Diastolic_BP', 'Pt_Prime_ICD10', 'Pt_Third_ICD10', 'PT_ID_Hashed', 'Pt_DOB_Year']

Columns to keep :['PT_Visit_ID_Hashed', 'Pt_Arrival_Method', 'Pt_Complaint', 'Pt_Triage_Acuity', 'Pt_Gender', 'Pt_Race', 'Pt_Ethnicity', 'Pulse', 'Resp', 'SpO2', 'Temp', 'Arrival_TimeOfDay', 'Age_Group', 'grouped_arrival_method', 'bp_group', 'EHR_Disposition', 'd2i_Grouped_Disposition', 'emopti_grouped_disposition']


Filtered DF: 


Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Temp,Visit_Duration_Hrs,Arrival_TimeOfDay,Age_Group,Pt_DOB_Year,Systolic_BP,Diastolic_BP,emopti_grouped_disposition,grouped_arrival_method,bp_group
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,O26.893,R10.2,...,97.9,1.5,12pm-6pm,20-40,1988.0,111.0,62.0,Left Against Medical Advice,ambulance,normal
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,E86.0,D62,...,97.9,4.5,6pm-Midnight,20-40,1988.0,115.0,56.0,Discharged,private transport,normal


In [5]:
df_copy = df.copy()

In [6]:
#### Convert columns to right values for label generation

def apply_conditions(x, col_values):
    if (x <= col_values[0]):
        return f"<={col_values[0]}"
    elif (x > col_values[0]) and (x <= col_values[1]):
        return f"{col_values[0]+1}-{col_values[1]}"
    elif (x == col_values[2]):
        return f"{col_values[2]}"
    elif (x > col_values[2]) and (x <= col_values[4]):
        return f"{col_values[2]+1}-{col_values[4]}"
    elif (x > col_values[4]) and (x <= col_values[5]):
        return f"{col_values[4]+1}-{col_values[5]}"
    elif (x > col_values[5]):
        return f">{col_values[5]}"
    else:
        return "na"

Age_Group = {"0-.5": "0-.5", ".5-1": ".5-1", "1-2": "1-2", "2-13": "2-13", "13-20": ">13", "20-40": ">13", "40-60": ">13", "60-80": ">13", "80-100": ">13"}

df_raw_values = pd.read_csv("./data/deidentified/vital_signs_raw_values.csv")
display(df_raw_values)

df_copy['Age_Group'] = df_copy["Age_Group"].map(Age_Group)

for col in df_raw_values.columns:
    col_values = df_raw_values[col].values
    col_values = col_values.astype(int)

    print(f"{col}: {col_values}")
    df_copy[f'{col}_old'] = df_copy[f"{col}"].values
    df_copy[f'{col}'] = df_copy[f"{col}"].apply(lambda x: apply_conditions(x, col_values))

#### condition_df
df_condition = pd.read_csv("./data/deidentified/vital_signs_conditions.csv")
display(df_condition.head(5))

age_col = "Age_Group"
label_col = "label"
columns = ['Temp', 'Pulse', 'Resp', 'SpO2', 'Systolic_BP', 'Diastolic_BP']

for col in columns:
    print(f"col: {col}")
    df_condition[f"{col}_Cat"] = df_condition[label_col]
    df_copy = pd.merge(df_copy, df_condition[[age_col, col, f"{col}_Cat"]], on=[age_col, col], how="left")

#### IDC 10 Descriptions
df_icd_desc = pd.read_csv("./data/deidentified/filtered_extracted_diag_name_desc.csv")
df_icd_desc.columns = ['Code', "Desc"]
display(df_icd_desc.head(2))

for col in ["Pt_Prime_ICD10", "Pt_Secondary_ICD10", "Pt_Third_ICD10"]:
    print(f"Col: {col}")
    df_copy[f"{col}_Code"] = df_copy[col].str[:3]
    df_copy = df_copy.merge(df_icd_desc, left_on=f"{col}_Code", right_on="Code").drop(columns = ['Code']).rename(columns={"Desc": f"{col}_Desc"})

df_copy = df_copy.fillna("NA")

Unnamed: 0,Temp,Pulse,Resp,SpO2,Systolic_BP,Diastolic_BP
0,93.0,25.0,4.0,75.0,50.0,30.0
1,96.0,40.0,8.0,88.0,75.0,40.0
2,97.0,60.0,12.0,94.0,90.0,55.0
3,,,,,,
4,100.0,100.0,24.0,100.0,135.0,90.0
5,104.0,120.0,32.0,101.0,180.0,120.0



invalid value encountered in cast



Temp: [                  93                   96                   97
 -9223372036854775808                  100                  104]



invalid value encountered in cast



Pulse: [                  25                   40                   60
 -9223372036854775808                  100                  120]



invalid value encountered in cast



Resp: [                   4                    8                   12
 -9223372036854775808                   24                   32]



invalid value encountered in cast



SpO2: [                  75                   88                   94
 -9223372036854775808                  100                  101]



invalid value encountered in cast



Systolic_BP: [                  50                   75                   90
 -9223372036854775808                  135                  180]



invalid value encountered in cast



Diastolic_BP: [                  30                   40                   55
 -9223372036854775808                   90                  120]


In [7]:
df_copy.head(2)

Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Diastolic_BP,emopti_grouped_disposition,grouped_arrival_method,bp_group,Temp_old,Pulse_old,Resp_old,SpO2_old,Systolic_BP_old,Diastolic_BP_old
0,c1cf913b612ffce254b33e996aa49b0def330b033058a7...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,O26.893,R10.2,...,56-90,Left Against Medical Advice,ambulance,normal,97.9,90.0,20.0,99.0,111.0,62.0
1,094a742f233f2699fe3125b34ee61421e3f5ae54e26ca2...,e1866701283c7c46f57bff8b5d23c1d44fbbfca0a076a1...,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,E86.0,D62,...,56-90,Discharged,private transport,normal,97.9,104.0,18.0,100.0,115.0,56.0


Unnamed: 0,Code,Desc
0,A00,Cholera
1,A01,Typhoid and paratyphoid fevers


In [38]:
# df_copy = df_copy.drop(columns=[col for col in df_copy.columns if "_ICD10_Code" in col])

Col: Pt_Prime_ICD10
Col: Pt_Secondary_ICD10
Col: Pt_Third_ICD10


In [40]:
df_copy.head(2)

Unnamed: 0,PT_Visit_ID_Hashed,PT_ID_Hashed,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pt_Prime_ICD10,Pt_Secondary_ICD10,...,Resp_Cat,SpO2_Cat,Systolic_BP_Cat,Diastolic_BP_Cat,Pt_Prime_ICD10_Code,Pt_Prime_ICD10_Desc,Pt_Secondary_ICD10_Code,Pt_Secondary_ICD10_Desc,Pt_Third_ICD10_Code,Pt_Third_ICD10_Desc
0,b0f5fbe02cee3ca3ee0c5d7ef5dc433617803baf248a0e...,90a13556febf5811661108be8dd15c515664630671dc07...,Car,PREGNANCY PROBLEM,3.0,F,UNKNOWN,Choose not to disclose,O26.891,R10.31,...,Normal High,Normal High,Normal High,Normal High,O26,Maternal care for other conditions predominant...,R10,Abdominal and pelvic pain,R10,Abdominal and pelvic pain
1,848e2e7591cff12635e11de484b2b6c455dd85827ce6f3...,4658f71559fa809c22ef977f2657b62dffec5a4a2b88f2...,Car,ABDOMINAL PAIN,3.0,F,UNKNOWN,Choose not to disclose,O26.891,R10.31,...,Normal High,Normal High,Normal High,Normal High,O26,Maternal care for other conditions predominant...,R10,Abdominal and pelvic pain,R10,Abdominal and pelvic pain


#### Clean and Prepare data for Embeddings

In [4]:
print("DF for Embeddings: ")
df_for_embeddings = df[columns_for_embeddings]
display(df_for_embeddings.head(2))

DF for Embeddings: 


Unnamed: 0,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pulse,Resp,SpO2,Temp,Arrival_TimeOfDay,Age_Group,grouped_arrival_method,bp_group
0,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,90.0,20.0,99.0,97.9,12pm-6pm,20-40,ambulance,normal
1,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,104.0,18.0,100.0,97.9,6pm-Midnight,20-40,private transport,normal


#### EDA

In [5]:
df_for_embeddings[['Pt_Arrival_Method', 'grouped_arrival_method']].value_counts()[0:20]

Pt_Arrival_Method            grouped_arrival_method
Car                          private transport         313829
Walked In                    private transport          65973
Ambulance                    ambulance                  45842
Walk-In                      private transport          14959
Ambulatory                   private transport          14817
Amb- UC Health EMS           ambulance                  14320
Wheelchair                   helicopter                 13709
Life EMS                     ambulance                  12739
Lifecare EMS                 ambulance                   7385
Ambulatory (Walk-in)         private transport           4707
Amb- Thompson Valley         ambulance                   3975
Carried by parent/caregiver  private transport           2764
Police                       police                      2491
Amb- AMR                     ambulance                   2126
Van Buren EMS                ambulance                   1735
Pride Care EMS    

In [12]:
# d = dtale.show(df_for_embeddings)
# print(d._main_url)


In [17]:
# ProfileReport(df_for_embeddings)

In [7]:
for col in df_for_embeddings.columns:
    print(f"\n\nColumn: {col}")
    print(f"\n{df_for_embeddings[col].value_counts()}")



Column: Pt_Arrival_Method

Pt_Arrival_Method
Car                               313829
Walked In                          65973
Ambulance                          45842
Walk-In                            14959
Ambulatory                         14817
                                   ...  
Amb- Fountain Fire                     1
Amb- Franktown Fire and Rescue         1
Thornapple Twp Fire                    1
Amb- South Platte Fire                 1
Amb- Northglenn                        1
Name: count, Length: 114, dtype: int64


Column: Pt_Complaint

Pt_Complaint
ABDOMINAL PAIN         53163
CHEST PAIN             29726
SHORTNESS OF BREATH    21877
FALL                   21718
BACK PAIN              14556
                       ...  
DAYTIME SLEEPINESS         1
PTSD                       1
DIAGNOSTIC TESTING         1
BURN WOUND RETURN          1
WEIGHT CHANGE              1
Name: count, Length: 984, dtype: int64


Column: Pt_Triage_Acuity

Pt_Triage_Acuity
3.0    262292
4.0    13

#### Include 'Diastolic_BP' and 'Systolic_BP', "Arrival Method' (could be tried)

#### Concate Columns

In [8]:
df_for_embeddings.columns

Index(['Pt_Arrival_Method', 'Pt_Complaint', 'Pt_Triage_Acuity', 'Pt_Gender',
       'Pt_Race', 'Pt_Ethnicity', 'Pulse', 'Resp', 'SpO2', 'Temp',
       'Arrival_TimeOfDay', 'Age_Group', 'grouped_arrival_method', 'bp_group'],
      dtype='object')

In [9]:
df_for_embeddings["combined"] = ("Complaint: " + df_for_embeddings['Pt_Complaint'].str.strip() + 
                #   "; Arrival_Method: " + df_for_embeddings['Pt_Arrival_Method'].str.strip() + 
                  "; Triage_Acuity: " + df_for_embeddings['Pt_Triage_Acuity'].astype(str).str.strip() + 
                  "; Gender: " + df_for_embeddings['Pt_Gender'].str.strip() + 
                  "; Race: " + df_for_embeddings['Pt_Race'].str.strip() + 
                  "; Ethnicity: " + df_for_embeddings['Pt_Ethnicity'].str.strip() + 
                  "; Pulse: " + df_for_embeddings['Pulse'].astype(str).str.strip() + 
                  "; Resp: " + df_for_embeddings['Resp'].astype(str).str.strip() + 
                  "; SpO2: " + df_for_embeddings['SpO2'].astype(str).str.strip() + 
                  "; Temp: " + df_for_embeddings['Temp'].astype(str).str.strip() + 
                  "; Arrival_TimeOfDay: " + df_for_embeddings['Arrival_TimeOfDay'].str.strip() + 
                  "; Age_Group: " + df_for_embeddings['Age_Group'].str.strip() + 
                  "; Grouped_Arrival_Method: " + df_for_embeddings['grouped_arrival_method'].str.strip() + 
                  "; BP_Group: " + df_for_embeddings['bp_group'].str.strip()
                  )
display(df_for_embeddings.head(2))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pulse,Resp,SpO2,Temp,Arrival_TimeOfDay,Age_Group,grouped_arrival_method,bp_group,combined
0,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,90.0,20.0,99.0,97.9,12pm-6pm,20-40,ambulance,normal,Complaint: FLANK PAIN; Triage_Acuity: 3.0; Gen...
1,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,104.0,18.0,100.0,97.9,6pm-Midnight,20-40,private transport,normal,Complaint: DIZZINESS; Triage_Acuity: 3.0; Gend...


In [41]:
print(f"Sample Patient info: \n{df_for_embeddings['combined'].iloc[0]}")

Sample Patient info: 
Complaint: FLANK PAIN; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 90.0; Resp: 20.0; SpO2: 99.0; Temp: 97.9; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 20-40; Grouped_Arrival_Method: ambulance; BP_Group: normal


In [10]:
print(f"DF shape before drop_NA: {df_for_embeddings.shape}")

df_for_embeddings.dropna(subset = ['combined'], inplace=True)
print(f"DF shape after drop_NA: {df_for_embeddings.shape}")


DF shape before drop_NA: (535441, 15)
DF shape after drop_NA: (531127, 15)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [48]:
# imports
from keys.keys import OPENAI_KEY
from utils.embeddings_utils import get_embedding, cosine_similarity, get_embeddings

import openai
# Set up your API credentials
openai.api_key = OPENAI_KEY

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [12]:
import tiktoken

encoding = tiktoken.get_encoding(embedding_encoding)
n_tokens_list = []

# omit reviews that are too long to embed
df_for_embeddings["n_tokens"] = df_for_embeddings.combined.apply(lambda x: len(encoding.encode(x)))

print(f"df len: {len(df_for_embeddings)}")
display(df_for_embeddings.head(2))

print(f"# Tokens: {df_for_embeddings['n_tokens'].sum()}")



df len: 531127




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pulse,Resp,SpO2,Temp,Arrival_TimeOfDay,Age_Group,grouped_arrival_method,bp_group,combined,n_tokens
0,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,90.0,20.0,99.0,97.9,12pm-6pm,20-40,ambulance,normal,Complaint: FLANK PAIN; Triage_Acuity: 3.0; Gen...,97
1,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,104.0,18.0,100.0,97.9,6pm-Midnight,20-40,private transport,normal,Complaint: DIZZINESS; Triage_Acuity: 3.0; Gend...,98


# Tokens: 52866627


In [13]:
#### Cost
n_tokens = df_for_embeddings['n_tokens'].sum()
per_1k_price = 0.0001 
print(f"approximate cost: ${round(n_tokens * per_1k_price/1000, 2)}")

#### Cost for 100k comments
n_tokens = df_for_embeddings[0:100000]['n_tokens'].sum()
per_1k_price = 0.0001 
print(f"approximate cost: ${round(n_tokens * per_1k_price/1000, 2)}")

approximate cost: $5.29
approximate cost: $1.0


In [14]:
df_for_embeddings.to_csv("./data/healthlab/aiipem_deidentified_before_embeddings.csv", index= False)

In [30]:
#### Time taken in sequential order for 10k patients --> 85 Mins

# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
df_copy = df_for_embeddings[0:10000]
embedding_list = []

for idx in range(len(df_copy)):
    txt = df_copy['combined'].iloc[idx]
    embedding_value = get_embedding(txt, model=embedding_model)
    embedding_list.append(embedding_value)

    if(idx % 500 == 0):
        print(f"idx: {idx}")

df_copy["embedding"] = embedding_list
df_copy.to_csv("./data/healthlab/aiipem_deidentified_embeddings.csv", index= False)

idx: 0
idx: 5
idx: 10
idx: 15
idx: 20
idx: 25
idx: 30
idx: 35
idx: 40
idx: 45
idx: 50
idx: 55
idx: 60
idx: 65
idx: 70
idx: 75
idx: 80
idx: 85
idx: 90
idx: 95
idx: 100
idx: 105
idx: 110
idx: 115
idx: 120
idx: 125
idx: 130
idx: 135
idx: 140
idx: 145
idx: 150
idx: 155
idx: 160
idx: 165
idx: 170
idx: 175
idx: 180
idx: 185
idx: 190
idx: 195
idx: 200
idx: 205
idx: 210
idx: 215
idx: 220
idx: 225
idx: 230
idx: 235
idx: 240
idx: 245
idx: 250
idx: 255
idx: 260
idx: 265
idx: 270
idx: 275
idx: 280
idx: 285
idx: 290
idx: 295
idx: 300
idx: 305
idx: 310
idx: 315
idx: 320
idx: 325
idx: 330
idx: 335
idx: 340
idx: 345
idx: 350
idx: 355
idx: 360
idx: 365
idx: 370
idx: 375
idx: 380
idx: 385
idx: 390
idx: 395
idx: 400
idx: 405
idx: 410
idx: 415
idx: 420
idx: 425
idx: 430
idx: 435
idx: 440
idx: 445
idx: 450
idx: 455
idx: 460
idx: 465
idx: 470
idx: 475
idx: 480
idx: 485
idx: 490
idx: 495
idx: 500
idx: 505
idx: 510
idx: 515
idx: 520
idx: 525
idx: 530
idx: 535
idx: 540
idx: 545
idx: 550
idx: 555
idx: 560
idx: 

In [32]:
display(df_copy.head(2))
df_copy.shape

Unnamed: 0,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pulse,Resp,SpO2,Temp,Arrival_TimeOfDay,Age_Group,grouped_arrival_method,bp_group,combined,n_tokens,embedding
0,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,90.0,20.0,99.0,97.9,12pm-6pm,20-40,ambulance,normal,Complaint: FLANK PAIN; Triage_Acuity: 3.0; Gen...,97,"[0.009560050442814827, 0.002757575362920761, 0..."
1,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,104.0,18.0,100.0,97.9,6pm-Midnight,20-40,private transport,normal,Complaint: DIZZINESS; Triage_Acuity: 3.0; Gend...,98,"[0.011276663281023502, 0.003159793559461832, 0..."


(10000, 17)

In [16]:
import pandas as pd
import numpy as np
from ast import literal_eval

datafile_path = "./data/healthlab/aiipem_deidentified_embeddings.csv"

df_copy = pd.read_csv(datafile_path)
df_copy["embedding"] = df_copy.embedding.apply(literal_eval).apply(np.array)

display(df_copy.head(2))
df_copy.shape

Unnamed: 0,Pt_Arrival_Method,Pt_Complaint,Pt_Triage_Acuity,Pt_Gender,Pt_Race,Pt_Ethnicity,Pulse,Resp,SpO2,Temp,Arrival_TimeOfDay,Age_Group,grouped_arrival_method,bp_group,combined,n_tokens,embedding
0,Ambulance,FLANK PAIN,3.0,F,UNKNOWN,Choose not to disclose,90.0,20.0,99.0,97.9,12pm-6pm,20-40,ambulance,normal,Complaint: FLANK PAIN; Triage_Acuity: 3.0; Gen...,97,"[0.009560050442814827, 0.002757575362920761, 0..."
1,Car,DIZZINESS,3.0,F,UNKNOWN,Choose not to disclose,104.0,18.0,100.0,97.9,6pm-Midnight,20-40,private transport,normal,Complaint: DIZZINESS; Triage_Acuity: 3.0; Gend...,98,"[0.011276663281023502, 0.003159793559461832, 0..."


(10000, 17)

In [42]:
# for idx in range(0, 10):
#     print(f"Text1: {df_copy['combined'].iloc[idx]}")
#     print(f"Text2: {df_copy['combined'].iloc[idx+1]}")
#     sim= cosine_similarity(df_copy['embedding'].iloc[idx], df_copy['embedding'].iloc[idx+1])
#     print(f"Similariry: {sim}")
#     print()

In [18]:
from utils.embeddings_utils import get_embedding, cosine_similarity

# search through the patients for a specific patient
def search_patients(df, patient_info):
    product_embedding = get_embedding(patient_info, model=embedding_model)
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))

    results = df.sort_values("similarity", ascending=False)
    return results


In [32]:
top_k = 5
patient_id = 10005
df_patient_search = df_copy.copy()

patient_info = df_for_embeddings['combined'].iloc[patient_id]
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 108.0; Resp: 28.0; SpO2: 100.0; Temp: 98.4; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 2-13; Grouped_Arrival_Method: private transport; BP_Group: normal

top 5 matched patients: 

Matched Patient 1: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 139.0; Resp: 35.0; SpO2: 97.0; Temp: 99.3; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 2-13; Grouped_Arrival_Method: private transport; BP_Group: normal
Similariry: 0.999298727129631

Matched Patient 2: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 99.0; Resp: 16.0; SpO2: 98.0; Temp: 99.6; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 20-40; Grouped_Arrival_Method: private transport; BP_Group: normal
Similariry: 0.9983084518939789

Matched Patient 3: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: UNKNOWN; Et

In [83]:
top_k = 5
patient_id = 10005
df_patient_search = df_copy.copy()

patient_info = "Complaint: EMESIS"
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: EMESIS

top 5 matched patients: 

Matched Patient 1: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: WHITE OR CAUCASIAN; Ethnicity: Other; Pulse: 81.0; Resp: 24.0; SpO2: 100.0; Temp: 98.0; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: 60-80; Grouped_Arrival_Method: private transport; BP_Group: high
Similariry: 0.874638939954148

Matched Patient 2: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: WHITE OR CAUCASIAN; Ethnicity: American; Pulse: 75.0; Resp: 16.0; SpO2: 100.0; Temp: 98.3; Arrival_TimeOfDay: 6am-12pm; Age_Group: 20-40; Grouped_Arrival_Method: private transport; BP_Group: normal
Similariry: 0.8741879664208697

Matched Patient 3: Complaint: EMESIS; Triage_Acuity: 2.0; Gender: M; Race: WHITE OR CAUCASIAN; Ethnicity: American; Pulse: 82.0; Resp: 18.0; SpO2: 100.0; Temp: 98.0; Arrival_TimeOfDay: Midnight-6am; Age_Group: 60-80; Grouped_Arrival_Method: private transport; BP_Group: high
Similariry: 0.8729831678097482

Matched Patient 4: Comp

In [84]:
top_k = 5
patient_id = 10005
df_patient_search = df_copy.copy()

patient_info = "Complaint: EMESIS; Pulse: 130.0"
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: EMESIS; Pulse: 130.0

top 5 matched patients: 

Matched Patient 1: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: M; Race: SOMETHING ELSE; Ethnicity: Other; Pulse: 75.0; Resp: 18.0; SpO2: 100.0; Temp: 98.2; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 13-20; Grouped_Arrival_Method: private transport; BP_Group: normal
Similariry: 0.9019182906831928

Matched Patient 2: Complaint: EMESIS; Triage_Acuity: 3.0; Gender: F; Race: WHITE OR CAUCASIAN; Ethnicity: American; Pulse: 75.0; Resp: 16.0; SpO2: 100.0; Temp: 98.3; Arrival_TimeOfDay: 6am-12pm; Age_Group: 20-40; Grouped_Arrival_Method: private transport; BP_Group: normal
Similariry: 0.9016371016596785

Matched Patient 3: Complaint: EMESIS; Triage_Acuity: 4.0; Gender: M; Race: SOMETHING ELSE; Ethnicity: Choose not to disclose; Pulse: 120.0; Resp: 40.0; SpO2: 100.0; Temp: 97.7; Arrival_TimeOfDay: 12pm-6pm; Age_Group: .5-1; Grouped_Arrival_Method: private transport; BP_Group: normal
Similariry: 0.9010984030439315

Matche

Mike 
1. to add code snippets to clean the vitals like if Pulse is More than 100 add text "Tachycardic"
2. 'Pt_Prime_ICD10', 'Pt_Secondary_ICD10',  'Pt_Third_ICD10' - text mapping

Narayan
1. Move the Code to Worshop and Ping Mike
1.2 Include 'Diastolic_BP' and 'Systolic_BP', "Arrival Method' (could be tried)
2. Store the embeddings to GCS
3. Create Google Vector search pipeline
4. Create two pipelines
    4.1 To Create embedings and store in GCS
    4.2 Code to search the nearest patient - to return 2 information, patient ids of top_k patients, (what information can we get from ANN search (how far the matched patients are from the current patient), cosine similarity or something else which tells how close the match are, embdeings as well if possible

Omar
1. Create a frontend app steamlite app or any other 
2. What Diagnostics was taken for the patients 


# Help Dr. Pickup and Not miss Rare info

5k with Heart attacks
and only 10 with 'diasection'




In [33]:
top_k = 5
patient_id = 10010
df_patient_search = df_copy.copy()

patient_info = df_for_embeddings['combined'].iloc[patient_id]
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: DENTAL PAIN; Triage_Acuity: 5.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 72.0; Resp: 18.0; SpO2: 100.0; Temp: 97.5; Arrival_TimeOfDay: Midnight-6am; Age_Group: 13-20; Grouped_Arrival_Method: private transport; BP_Group: high

top 5 matched patients: 

Matched Patient 1: Complaint: DENTAL PAIN; Triage_Acuity: 5.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 74.0; Resp: 16.0; SpO2: 99.0; Temp: 97.7; Arrival_TimeOfDay: Midnight-6am; Age_Group: 20-40; Grouped_Arrival_Method: private transport; BP_Group: high
Similariry: 0.9987692902490611

Matched Patient 2: Complaint: DENTAL PAIN; Triage_Acuity: 5.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 52.0; Resp: 20.0; SpO2: 100.0; Temp: 98.0; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: 13-20; Grouped_Arrival_Method: private transport; BP_Group: high
Similariry: 0.9982120577001438

Matched Patient 3: Complaint: DENTAL PAIN; Triage_Acuity: 3.0; 

In [34]:
top_k = 5
patient_id = 100110
df_patient_search = df_copy.copy()

patient_info = df_for_embeddings['combined'].iloc[patient_id]
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: SEIZURE; Triage_Acuity: 2.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 100.0; Resp: 22.0; SpO2: 94.0; Temp: 98.2; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: 40-60; Grouped_Arrival_Method: ambulance; BP_Group: high

top 5 matched patients: 

Matched Patient 1: Complaint: SEIZURE; Triage_Acuity: 2.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 99.0; Resp: 16.0; SpO2: 95.0; Temp: 99.9; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: 20-40; Grouped_Arrival_Method: ambulance; BP_Group: high
Similariry: 0.9982161096281713

Matched Patient 2: Complaint: SEIZURE; Triage_Acuity: 2.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose not to disclose; Pulse: 59.0; Resp: 18.0; SpO2: 100.0; Temp: 97.5; Arrival_TimeOfDay: 6am-12pm; Age_Group: 80-100; Grouped_Arrival_Method: ambulance; BP_Group: high
Similariry: 0.9972988503420792

Matched Patient 3: Complaint: SEIZURE; Triage_Acuity: 2.0; Gender: M; Race: UNKNOWN; Ethnicity: Choose

In [35]:
top_k = 5
patient_id = 20000
df_patient_search = df_copy.copy()

patient_info = df_for_embeddings['combined'].iloc[patient_id]
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: CHEST PAIN; Triage_Acuity: 2.0; Gender: M; Race: WHITE OR CAUCASIAN; Ethnicity: Choose not to disclose; Pulse: 76.0; Resp: 15.0; SpO2: 98.0; Temp: 99.3; Arrival_TimeOfDay: Midnight-6am; Age_Group: 60-80; Grouped_Arrival_Method: ambulance; BP_Group: normal

top 5 matched patients: 

Matched Patient 1: Complaint: CHEST PAIN; Triage_Acuity: 2.0; Gender: F; Race: WHITE OR CAUCASIAN; Ethnicity: Choose not to disclose; Pulse: 77.0; Resp: 14.0; SpO2: 100.0; Temp: 98.4; Arrival_TimeOfDay: Midnight-6am; Age_Group: 80-100; Grouped_Arrival_Method: ambulance; BP_Group: high
Similariry: 0.99630456870025

Matched Patient 2: Complaint: CHEST PAIN; Triage_Acuity: 2.0; Gender: M; Race: WHITE OR CAUCASIAN; Ethnicity: Choose not to disclose; Pulse: 105.0; Resp: 19.0; SpO2: 95.0; Temp: 97.6; Arrival_TimeOfDay: 6pm-Midnight; Age_Group: 40-60; Grouped_Arrival_Method: ambulance; BP_Group: at_risk
Similariry: 0.9958701681690126

Matched Patient 3: Complaint: CHEST PAIN; Triage_Acui

In [36]:
top_k = 5
patient_id = 20001
df_patient_search = df_copy.copy()

patient_info = df_for_embeddings['combined'].iloc[patient_id]
results = search_patients(df_patient_search, patient_info)

print(f"\nSearch Patient:\n{patient_info}")
print(f"\ntop {top_k} matched patients: \n")

for idx in range(0, top_k):
    print(f"Matched Patient {idx+1}: {results['combined'].iloc[idx]}")
    print(f"Similariry: {results['similarity'].iloc[idx]}")
    print()



Search Patient:
Complaint: BACK PAIN; Triage_Acuity: 4.0; Gender: M; Race: WHITE OR CAUCASIAN; Ethnicity: Choose not to disclose; Pulse: 114.0; Resp: 18.0; SpO2: 97.0; Temp: 97.9; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 60-80; Grouped_Arrival_Method: private transport; BP_Group: high

top 5 matched patients: 

Matched Patient 1: Complaint: BACK PAIN; Triage_Acuity: 4.0; Gender: M; Race: WHITE OR CAUCASIAN; Ethnicity: Choose not to disclose; Pulse: 118.0; Resp: 18.0; SpO2: 95.0; Temp: 98.8; Arrival_TimeOfDay: 6am-12pm; Age_Group: 40-60; Grouped_Arrival_Method: private transport; BP_Group: high
Similariry: 0.9991282198684766

Matched Patient 2: Complaint: BACK PAIN; Triage_Acuity: 4.0; Gender: M; Race: WHITE OR CAUCASIAN; Ethnicity: Choose not to disclose; Pulse: 94.0; Resp: 16.0; SpO2: 100.0; Temp: 98.9; Arrival_TimeOfDay: 12pm-6pm; Age_Group: 40-60; Grouped_Arrival_Method: private transport; BP_Group: high
Similariry: 0.9991091130845825

Matched Patient 3: Complaint: BACK PAIN; Triage

In [79]:
# # Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\
# import time

# start_time= time.time()
# df_temp = df_for_embeddings[0:50]
# embedding_list1 = []

# for idx in range(len(df_temp)-1):
#     txt = df_temp['combined'].iloc[idx]
#     embedding_value = get_embedding(txt, model=embedding_model)
#     embedding_list1.append(embedding_value)

#     if(idx % 10 == 0):
#         print(f"idx: {idx}")

# print(f"time taken in one txt at a time: {time.time()- start_time}")

In [78]:
# # Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage\
# import time

# start_time= time.time()
# df_temp = df_for_embeddings[0:100]
# embedding_list1 = []

# embedding_value = get_embeddings(df_temp['combined'].values.tolist(), model=embedding_model)
# embedding_list1.extend(embedding_value)

# print(f"time taken in one txt at a time: {time.time()- start_time}")

In [77]:

# cntr = 0
# for idx in range(0, len(df_copy_2)-1, 2048):
#     start_idx = idx
#     end_idx = min(10000, idx+2048)
#     print(start_idx, end_idx)
#     cntr += len(df_copy_2['combined'].iloc[start_idx:end_idx])
#     print(cntr)

In [76]:
#### Time taken in sequential order for 10k patients --> 50 seconds
import time

# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
start_time= time.time()
df_copy_2 = df_for_embeddings[10000:20000].copy()
embedding_list_2 = []
batch_size = 2048

for idx in range(0, len(df_copy_2)-1, batch_size):
    start_idx = idx
    end_idx = min(10000, idx+batch_size)
    print(f"embeddings for: {start_idx, end_idx}")
    txt_list = df_copy_2['combined'].iloc[start_idx:end_idx]
    embedding_values = get_embeddings(txt_list, model=embedding_model)
    embedding_list_2.extend(embedding_values)

print(f"time taken in one txt at a time: {time.time()- start_time}")

df_copy_2["embedding"]  = embedding_list_2

print(f"saving embedings to disk:")
df_copy_2.to_csv("./data/healthlab/aiipem_deidentified_embeddings_batch_2.csv", index= False)

embeddings for: (0, 2048)
embeddings for: (2048, 4096)
embeddings for: (4096, 6144)
embeddings for: (6144, 8192)
embeddings for: (8192, 10000)
time taken in one txt at a time: 50.03306984901428


In [82]:
len(embedding_values[0])

1536