# Patient Similarity Search using Clinical Embeddings
This notebook demonstrates how to find similar patients based on clinical embeddings.

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast  # For safely evaluating strings containing Python literals

In [5]:
# Load the data
df = pd.read_csv('csv_files/ProviderPatientDashboard.csv')
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (1504, 29)


Unnamed: 0,PatientID,ProviderID,VisitDate,Status_x,ProviderName,Name,Gender,BirthDate,ObservationDate,Category,...,ClinicalStatus,OnsetDate,EncounterID,Start,End,Status_y,Class,CombinedText,ClinicalFocusText,ClinicalFocusEmbedding
0,patient-0001,P004,2025-06-21,Pending,Dr. Ahmed Khan,Mark Johnson,male,1971-11-24,2025-03-17T23:46:53.561861,vital-signs,...,active,2019-06-03T23:46:53.561861,encounter-0001,2025-03-17T23:46:53.561861,2025-06-07T23:46:53.561861,finished,AMB,PatientID: patient-0001 | Name: Mark Johnson |...,Medication: Lisinopril | NoteText: SOAP Note:\...,"[-0.0699615404009819, -0.15284700691699982, 0...."
1,patient-0001,P001,2025-08-28,Completed,Dr. Alice Smith,Mark Johnson,male,1971-11-24,2025-03-17T23:46:53.561861,vital-signs,...,active,2019-06-03T23:46:53.561861,encounter-0001,2025-03-17T23:46:53.561861,2025-06-07T23:46:53.561861,finished,AMB,PatientID: patient-0001 | Name: Mark Johnson |...,Medication: Lisinopril | NoteText: SOAP Note:\...,"[-0.0699615404009819, -0.15284700691699982, 0...."
2,patient-0002,P001,2025-06-01,Cancelled,Dr. Alice Smith,David Taylor,male,1948-06-01,2025-06-16T23:46:53.562317,vital-signs,...,active,2019-09-04T23:46:53.562317,encounter-0002,2025-06-16T23:46:53.562317,2025-04-10T23:46:53.562317,finished,AMB,PatientID: patient-0002 | Name: David Taylor |...,Medication: Lisinopril | NoteText: SOAP Note:\...,"[-0.024824656546115875, -0.10685153305530548, ..."
3,patient-0002,P002,2025-06-12,Pending,Dr. John Patel,David Taylor,male,1948-06-01,2025-06-16T23:46:53.562317,vital-signs,...,active,2019-09-04T23:46:53.562317,encounter-0002,2025-06-16T23:46:53.562317,2025-04-10T23:46:53.562317,finished,AMB,PatientID: patient-0002 | Name: David Taylor |...,Medication: Lisinopril | NoteText: SOAP Note:\...,"[-0.024824656546115875, -0.10685153305530548, ..."
4,patient-0003,P002,2025-06-24,No-show,Dr. John Patel,Michael Mcclain,male,1946-04-19,2025-04-26T23:46:53.562601,vital-signs,...,active,2017-05-13T23:46:53.562601,encounter-0003,2025-04-26T23:46:53.562601,2025-05-08T23:46:53.562601,finished,AMB,PatientID: patient-0003 | Name: Michael Mcclai...,Medication: Lisinopril | NoteText: SOAP Note:\...,"[-0.02864755317568779, -0.09952106326818466, 0..."


In [6]:
# Function to safely convert string representation of list to numpy array
def parse_embedding(embedding_str):
    try:
        # Try to evaluate the string as a Python literal
        return np.array(ast.literal_eval(embedding_str))
    except (ValueError, SyntaxError):
        # If evaluation fails, try to handle common formatting issues
        cleaned = embedding_str.strip('[]').split(',')
        return np.array([float(x.strip()) for x in cleaned if x.strip()])

In [7]:
# Convert embeddings to numpy arrays
df['embedding_array'] = df['ClinicalFocusEmbedding'].apply(parse_embedding)

# Create a matrix of all embeddings
embedding_matrix = np.vstack(df['embedding_array'].values)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (1504, 768)


In [8]:
def find_similar_patients(patient_id, top_n=5):
    """
    Find similar patients based on clinical embeddings.
    
    Args:
        patient_id (str): The ID of the patient to find similar patients for
        top_n (int): Number of similar patients to return
        
    Returns:
        DataFrame: DataFrame containing the top_n similar patients
    """
    # Find the index of the patient
    patient_idx = df[df['PatientID'] == patient_id].index
    
    if len(patient_idx) == 0:
        return f"Patient with ID {patient_id} not found."
    
    # Get the embedding of the query patient
    query_embedding = embedding_matrix[patient_idx[0]]
    
    # Calculate cosine similarity with all patients
    similarities = cosine_similarity(
        query_embedding.reshape(1, -1),  # Reshape to 2D
        embedding_matrix
    ).flatten()
    
    # Add similarities to the dataframe
    df['similarity'] = similarities
    
    # Get top N most similar patients (excluding the query patient itself)
    similar_patients = (
        df[df['PatientID'] != patient_id]  # Exclude the query patient
        .sort_values('similarity', ascending=False)
        .head(top_n)
    )
    
    # Get the query patient's data
    query_patient = df[df['PatientID'] == patient_id].iloc[0]
    
    # Drop the embedding and combined text columns
    columns_to_drop = ['ClinicalFocusEmbedding', 'CombinedText', 'embedding_array', 'similarity']
    similar_patients = similar_patients.drop(columns=[col for col in columns_to_drop if col in similar_patients.columns])
    
    return query_patient, similar_patients

In [9]:
# Example usage
patient_id = "patient-0001"  # Replace with actual patient ID
query_patient, similar_patients = find_similar_patients(patient_id, top_n=5)

print(f"\nQuery Patient (ID: {patient_id}):")
print("-" * 50)
# Drop the embedding and combined text columns for display
columns_to_drop = ['ClinicalFocusEmbedding', 'CombinedText', 'embedding_array', 'similarity']
query_display = query_patient.drop([col for col in columns_to_drop if col in query_patient.index])
print(query_display.to_string())

print(f"\nTop 5 Similar Patients to {patient_id}:")
print("-" * 50)
similar_patients


Query Patient (ID: patient-0001):
--------------------------------------------------
PatientID                                                 patient-0001
ProviderID                                                        P004
VisitDate                                                   2025-06-21
Status_x                                                       Pending
ProviderName                                            Dr. Ahmed Khan
Name                                                      Mark Johnson
Gender                                                            male
BirthDate                                                   1971-11-24
ObservationDate                             2025-03-17T23:46:53.561861
Category                                                   vital-signs
Code                                                    Blood Pressure
Systolic                                                           131
Diastolic                                                     

Unnamed: 0,PatientID,ProviderID,VisitDate,Status_x,ProviderName,Name,Gender,BirthDate,ObservationDate,Category,...,NoteText,ConditionCode,ClinicalStatus,OnsetDate,EncounterID,Start,End,Status_y,Class,ClinicalFocusText
483,patient-0315,P002,2025-06-26,No-show,Dr. John Patel,Carlos Ryan,male,1984-04-04,2025-05-26T23:46:53.651768,vital-signs,...,SOAP Note:\nSubjective: Carlos reports fatigue...,Hypertension,active,2018-11-25T23:46:53.651768,encounter-0315,2025-05-26T23:46:53.651768,2025-06-04T23:46:53.651768,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
284,patient-0187,P003,2025-07-05,Follow-up Needed,Dr. Maya Lee,Gregory Patterson,male,1984-11-25,2025-04-30T23:46:53.614806,vital-signs,...,SOAP Note:\nSubjective: Gregory reports fatigu...,Hypertension,active,2022-09-06T23:46:53.614806,encounter-0187,2025-04-30T23:46:53.614806,2025-04-18T23:46:53.614806,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
283,patient-0187,P001,2025-07-01,Follow-up Needed,Dr. Alice Smith,Gregory Patterson,male,1984-11-25,2025-04-30T23:46:53.614806,vital-signs,...,SOAP Note:\nSubjective: Gregory reports fatigu...,Hypertension,active,2022-09-06T23:46:53.614806,encounter-0187,2025-04-30T23:46:53.614806,2025-04-18T23:46:53.614806,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
1325,patient-0882,P005,2025-07-23,Cancelled,Dr. Emily Zhang,Juan Greene,male,1973-03-30,2025-04-05T23:46:53.855741,vital-signs,...,SOAP Note:\nSubjective: Juan reports fatigue.\...,Hypertension,active,2018-10-12T23:46:53.855741,encounter-0882,2025-04-05T23:46:53.855741,2025-03-29T23:46:53.855741,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
1324,patient-0882,P002,2025-07-17,Completed,Dr. John Patel,Juan Greene,male,1973-03-30,2025-04-05T23:46:53.855741,vital-signs,...,SOAP Note:\nSubjective: Juan reports fatigue.\...,Hypertension,active,2018-10-12T23:46:53.855741,encounter-0882,2025-04-05T23:46:53.855741,2025-03-29T23:46:53.855741,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...


In [10]:
# Function to test with different patient IDs
def test_similarity_search(patient_id):
    try:
        query_patient, similar_patients = find_similar_patients(patient_id, top_n=5)
        
        print(f"\nQuery Patient (ID: {patient_id}):")
        print("-" * 50)
        # Drop the embedding and combined text columns for display
        columns_to_drop = ['ClinicalFocusEmbedding', 'CombinedText', 'embedding_array', 'similarity']
        query_display = query_patient.drop([col for col in columns_to_drop if col in query_patient.index])
        print(query_display.to_string())
        
        print(f"\nTop 5 Similar Patients to {patient_id}:")
        print("-" * 50)
        return similar_patients
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

In [11]:
# Test with a different patient
test_similarity_search("patient-0002")


Query Patient (ID: patient-0002):
--------------------------------------------------
PatientID                                                 patient-0002
ProviderID                                                        P001
VisitDate                                                   2025-06-01
Status_x                                                     Cancelled
ProviderName                                           Dr. Alice Smith
Name                                                      David Taylor
Gender                                                            male
BirthDate                                                   1948-06-01
ObservationDate                             2025-06-16T23:46:53.562317
Category                                                   vital-signs
Code                                                    Blood Pressure
Systolic                                                           116
Diastolic                                                     

Unnamed: 0,PatientID,ProviderID,VisitDate,Status_x,ProviderName,Name,Gender,BirthDate,ObservationDate,Category,...,NoteText,ConditionCode,ClinicalStatus,OnsetDate,EncounterID,Start,End,Status_y,Class,ClinicalFocusText
365,patient-0236,P001,2025-08-03,Pending,Dr. Alice Smith,Susan Serrano,female,1944-03-08,2025-06-09T23:46:53.630764,vital-signs,...,SOAP Note:\nSubjective: Susan reports fatigue....,Asthma,active,2019-01-02T23:46:53.630764,encounter-0236,2025-06-09T23:46:53.630764,2025-06-07T23:46:53.630764,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
553,patient-0358,P003,2025-08-14,Completed,Dr. Maya Lee,Benjamin Miller,male,1954-08-27,2025-05-12T23:46:53.662093,vital-signs,...,SOAP Note:\nSubjective: Benjamin reports fatig...,Asthma,active,2018-12-19T23:46:53.662093,encounter-0358,2025-05-12T23:46:53.662093,2025-04-16T23:46:53.662093,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
552,patient-0358,P002,2025-06-14,Cancelled,Dr. John Patel,Benjamin Miller,male,1954-08-27,2025-05-12T23:46:53.662093,vital-signs,...,SOAP Note:\nSubjective: Benjamin reports fatig...,Asthma,active,2018-12-19T23:46:53.662093,encounter-0358,2025-05-12T23:46:53.662093,2025-04-16T23:46:53.662093,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
1365,patient-0910,P001,2025-07-25,Completed,Dr. Alice Smith,George Patterson,male,2005-10-05,2025-06-01T23:46:53.863551,vital-signs,...,SOAP Note:\nSubjective: George reports fatigue...,Asthma,active,2017-04-27T23:46:53.863551,encounter-0910,2025-06-01T23:46:53.863551,2025-05-03T23:46:53.863551,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
978,patient-0650,P005,2025-07-08,Cancelled,Dr. Emily Zhang,Kristin Chambers,female,1976-05-29,2025-06-04T23:46:53.786954,vital-signs,...,SOAP Note:\nSubjective: Kristin reports headac...,Asthma,active,2021-04-27T23:46:53.786954,encounter-0650,2025-06-04T23:46:53.786954,2025-04-15T23:46:53.786954,finished,AMB,Medication: Lisinopril | NoteText: SOAP Note:\...
