<a href="https://colab.research.google.com/github/logannye/research/blob/main/Discharge_Notes2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook transforms clinical discharge notes from patient records into vector embeddings. It is based on the following model from Hugging Face:
https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT

In [7]:
# Step 0: Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

# Step 1: Install the Transformers and Pandas libraries
!pip install transformers pandas

# Step 2: Import libraries
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
import os

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Step 3: Load the Bio_ClinicalBERT tokenizer and model.
# Both are loaded from the same checkpoint, so its identifier is written once.
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Function to vectorize a single clinical note
def vectorize_clinical_note(text):
    """Embed a clinical note as the mean of its final-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The note text handed to the tokenizer. Input longer than
            512 tokens is truncated.

    Returns:
        A ``torch.Tensor`` with one row per input sequence (a single row for
        a single string), holding the average of ``last_hidden_state`` over
        the non-padding token positions.
    """
    # padding=True pads only up to the longest sequence in the call, so a single
    # note is not padded at all. Padding every note to 512 tokens made the model
    # process (and the mean below include) hundreds of padding positions.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    # Weight each position by the attention mask so padding never contributes
    # to the average, whatever padding strategy the tokenizer applied.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero for a sequence with no unmasked tokens
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = summed / token_counts
    return embeddings

In [9]:
# Locations of the saved subject-ID arrays for the two patient subsets
# (pancan = patients with pancreatic cancer, safe = control group).
pancan_subj_path, safe_subj_path = (
    '/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/pancan_subj.npy',
    '/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/safe_subj.npy',
)

In [10]:
# Read both subject-ID arrays and convert each one with .tolist().
# NOTE(review): allow_pickle=True lets np.load unpickle objects stored in the
# file, which can execute arbitrary code; only use it on .npy files you trust.
pancan_subj, safe_subj = (
    np.load(subj_path, allow_pickle=True).tolist()
    for subj_path in (pancan_subj_path, safe_subj_path)
)

In [11]:
# Read the discharge notes CSV into a DataFrame.
# Adjust the path below if your discharge notes CSV file is located elsewhere.
csv_file_path = '/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/discharge.csv'
df_notes = pd.read_csv(csv_file_path)

# Name of the column that holds the note text; change it if your CSV uses a different name
note_column = 'text'

# Keep only the rows whose subject_id belongs to each patient subset
pancan_notes = df_notes.loc[df_notes['subject_id'].isin(pancan_subj)]
safe_notes = df_notes.loc[df_notes['subject_id'].isin(safe_subj)]

# Function to process and vectorize notes for a given DataFrame
def process_and_vectorize_notes(df, note_column='text'):
    """Vectorize every note in one column of ``df``.

    Args:
        df: DataFrame containing the notes.
        note_column: Name of the column to read the notes from.

    Returns:
        A NumPy array built from the per-note results of
        ``vectorize_clinical_note``, in the row order of ``df``.
    """
    # Each returned tensor is converted to a NumPy array for easier handling
    return np.array([vectorize_clinical_note(note).numpy() for note in df[note_column]])

# Embed the notes of the patients with pancreatic cancer
pancan_vectors = process_and_vectorize_notes(df=pancan_notes, note_column=note_column)

# Embed the notes of the control group patients
safe_vectors = process_and_vectorize_notes(df=safe_notes, note_column=note_column)

# pancan_vectors and safe_vectors now hold the vector representations of the two patient groups

In [12]:
import numpy as np

# Write both embedding arrays to .npy files in the same Drive folder as the notes
np.save(file='/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/pancan_vectors.npy', arr=pancan_vectors)
np.save(file='/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/safe_vectors.npy', arr=safe_vectors)


Downloading the vector files

In [13]:
from google.colab import files

# Download each saved .npy file in turn
for vector_file in (
    '/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/pancan_vectors.npy',
    '/content/drive/MyDrive/Galen-Health/Datasets/mimic-iv-note/note/safe_vectors.npy',
):
    files.download(vector_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>