# ClinicalBERT

In [None]:
# General imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sys
from tqdm import tqdm
import ast
from scipy.signal import resample
import json
import warnings
import time

# Matlab/WFDB files
import scipy.io as sio

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
In

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path to training folder within PhysioNet dataset
PhysioNet_PATH = '/content/drive/MyDrive/ECG Project (Shared Folder)/Datasets/physionet.org/files/challenge-2021/1.0.3/training'
PhysioNet_PATH

'/content/drive/MyDrive/ECG Project (Shared Folder)/Datasets/physionet.org/files/challenge-2021/1.0.3/training'

In [None]:
PyFiles_PATH = '/content/drive/MyDrive/ECG Project (Shared Folder)/PyFiles'
PyFiles_PATH

'/content/drive/MyDrive/ECG Project (Shared Folder)/PyFiles'

In [None]:
sys.path.append(PyFiles_PATH)

In [None]:
from helper_functions import *
from dataset import PhysioNetDataset

In [None]:
train_set = PhysioNetDataset(PhysioNet_PATH, train = True)
val_set = PhysioNetDataset(PhysioNet_PATH, train = False)

KeyboardInterrupt: ignored

In [None]:
# Header data + mapped SNOMED CT codes
processed_train_df = pd.read_csv('/content/drive/MyDrive/ECG Project (Shared Folder)/processed_train_set_records.csv')
processed_val_df = pd.read_csv('/content/drive/MyDrive/ECG Project (Shared Folder)/processed_val_set_records.csv')

In [None]:
processed_train_df['dx_modality'][0]

"['atrial fibrillation', 'right bundle branch block', 't wave abnormal']"

In [None]:
processed_train_df.head(2)

Unnamed: 0,recording_number,recording_file,num_leads,sampling_frequency,num_samples,age,sex,dx,rx,hx,...,lead_10_lead_name,lead_11_file,lead_11_adc_gain,lead_11_units,lead_11_adc_resolution,lead_11_adc_zero,lead_11_initial_value,lead_11_checksum,lead_11_lead_name,dx_modality
0,JS00001,JS00001.mat,12,500,5000,85.0,Male,"['164889003', '59118001', '164934002']",Unknown,Unknown,...,0,JS00001.mat,1000.0,mV,16,0,527,32579,0,"['atrial fibrillation', 'right bundle branch b..."
1,JS00002,JS00002.mat,12,500,5000,59.0,Female,"['426177001', '164934002']",Unknown,Unknown,...,0,JS00002.mat,1000.0,mV,16,0,0,31542,0,"['sinus bradycardia', 't wave abnormal']"


# TextEncoder()

Create a class, ```TextEncoder()``` that is used to convert the description of the (dx_modality) diagnosis class into embeddings using the ClinicalBERT model.

- Input should be a single string per ECG signal: that signal's diagnoses (dx_modality) concatenated with commas or blank spaces.
- Use processed CSV files (dx_modality vs dx_modality, age, etc together)
- Frozen weights (since it's already pretrained)

In [None]:
class TextEncoder:
    """Frozen ClinicalBERT encoder turning one record's diagnoses into one vector.

    The tokenizer and model are loaded from the
    ``emilyalsentzer/Bio_ClinicalBERT`` checkpoint. The model is used as a
    fixed feature extractor: it is put in evaluation mode and gradients are
    disabled on all of its parameters.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        # Freeze explicitly so the encoder stays fixed even if its parameters
        # are ever handed to an optimizer by mistake.
        self.model.eval()
        self.model.requires_grad_(False)

    def encode(self, text_list):
        """Embed the diagnoses of a single record.

        Args:
            text_list: Either a list/tuple of diagnosis descriptions, the
                string representation of such a list (as stored in the
                processed CSV, e.g. ``"['a', 'b']"``), or a plain description
                string.

        Returns:
            Tensor with one row: the mean of the model's last hidden states
            over the token dimension.
        """
        if isinstance(text_list, str):
            try:
                parsed = ast.literal_eval(text_list)
            except (ValueError, SyntaxError):
                # Not a Python literal: treat the string itself as the description.
                parsed = None
            # Only a parsed list/tuple is split into items; anything else
            # (plain text, a quoted scalar, a number) is kept as one description
            # so it is never joined character by character.
            text_list = list(parsed) if isinstance(parsed, (list, tuple)) else [text_list]
        # Convert list of descriptions to a single string
        text = ', '.join(str(item) for item in text_list)
        # Tokenize text
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # Get embeddings from ClinicalBERT model without building a graph
        with torch.no_grad():
            embeddings = self.model(**inputs).last_hidden_state
        # Average over the token dimension to get a single vector per input
        return torch.mean(embeddings, dim=1)

In [None]:
text_encoder = TextEncoder()
embeddings = text_encoder.encode(processed_train_df['dx_modality'][0])
print(embeddings.size())

(…)io_ClinicalBERT/resolve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

(…)/Bio_ClinicalBERT/resolve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

torch.Size([1, 768])


In [None]:
embeddings

tensor([[ 1.8975e-01,  3.0864e-01, -4.8170e-02,  1.9149e-01,  2.7236e-01,
          1.8960e-01, -6.3694e-02,  1.6969e-01,  1.7664e-01,  1.3202e-01,
         -1.5042e-01,  5.0794e-01, -7.3957e-02,  2.9992e-01, -2.1831e-01,
         -1.3088e-01,  1.1179e-01,  1.9628e-02, -1.9350e-01, -1.6476e-01,
          8.1238e-02, -4.2498e-01, -3.8750e-01, -4.7243e-02, -1.5559e-02,
         -2.4554e-01, -1.0310e-01,  5.7794e-01,  1.4504e-01,  1.6382e-01,
         -2.1807e-02,  7.4254e-03, -2.5811e-01,  3.3975e-01,  1.6474e-01,
          1.2412e-01, -8.9964e-02, -6.1442e-03,  1.3661e-01,  1.2153e-01,
          3.6225e-01,  6.1826e-02, -3.7872e-01,  2.8434e-01, -1.3252e-01,
         -2.9651e-01,  6.7283e-02, -6.6552e-02, -2.9479e-01, -1.2505e-02,
          1.6438e-01,  3.5855e-01,  3.9261e-01,  5.9356e-02,  3.6884e-01,
         -1.4099e-01, -3.9989e-01, -3.2891e-01, -3.4848e-01,  3.2186e-01,
          1.5459e-02,  8.9985e-02, -1.2571e-01,  1.9153e-01,  2.4583e-01,
         -7.6325e-02,  3.0472e-01,  1.

# ECGEncoder()

- Input is ECG signal, output will be embeddings of ECG signal
- This is going to be the model in model.py
- Model weights are updated iteratively
- optimizer = torch.optim.Adam(clip_model.ECGEncoder.parameters())

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
class OneDimCNN(nn.Module):
    """Four-stage 1-D CNN followed by a two-layer classifier head.

    Each stage is Conv1d(kernel 3, padding 1) -> BatchNorm1d -> ReLU ->
    AvgPool1d(2), so every stage keeps the length through the convolution and
    floor-halves it in the pool.

    Args:
        num_classes: Size of the final output.
        in_channels: Number of input channels (default 3, as before).
        input_length: Number of samples per channel (default 5000, which
            yields the previous flattened size of 79872).

    Raises:
        ValueError: If ``input_length`` is too short to survive four poolings.
    """

    def __init__(self, num_classes, in_channels=3, input_length=5000):
        super().__init__()

        # Layer 1
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(32)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.AvgPool1d(kernel_size=2, stride=2)

        # Layer 2
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.AvgPool1d(kernel_size=2, stride=2)

        # Layer 3
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.AvgPool1d(kernel_size=2, stride=2)

        # Layer 4
        self.conv4 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm1d(256)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.AvgPool1d(kernel_size=2, stride=2)

        # Length left after the four stride-2 pools (each one floor-halves it).
        pooled_length = input_length
        for _ in range(4):
            pooled_length //= 2
        if pooled_length < 1:
            raise ValueError(
                f"input_length={input_length} is too short for four pooling stages"
            )

        # Fully Connected Layer 1 (256 channels * pooled length; 79872 by default)
        self.fc1 = nn.Linear(256 * pooled_length, 128)
        self.relu5 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)

        # Fully Connected Layer 2
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a ``(batch, in_channels, input_length)`` tensor to ``(batch, num_classes)``."""
        # Convolutional stages
        x = self.pool1(self.relu1(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu2(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu3(self.bn3(self.conv3(x))))
        x = self.pool4(self.relu4(self.bn4(self.conv4(x))))

        # Flatten everything except the batch dimension
        x = x.flatten(start_dim=1)

        # Classifier head
        x = self.dropout1(self.relu5(self.fc1(x)))
        return self.fc2(x)

In [None]:
class ECGEncoder(OneDimCNN):
    """OneDimCNN with an extra linear projection into the text-embedding space.

    Args:
        num_classes: Output size of the inherited CNN head.
        embedding_dim: Size of the projected embedding (default 768, the
            width of the embeddings printed for the text encoder above).
    """

    def __init__(self, num_classes, embedding_dim=768):
        super().__init__(num_classes)
        # Input width follows num_classes instead of a hard-coded 126, so the
        # projection matches the CNN head for any class count.
        self.fc3 = nn.Linear(num_classes, embedding_dim)

    def encode(self, signal):
        """Embed a single, unbatched signal.

        Args:
            signal: Array-like or tensor for one recording; a batch dimension
                is added in front before the forward pass.

        Returns:
            Tensor of shape ``(1, embedding_dim)``.
        """
        # as_tensor accepts numpy arrays and tensors alike without the
        # copy-construct warning torch.tensor emits for tensor inputs.
        signal = torch.as_tensor(signal, dtype=torch.float).unsqueeze(0)
        embedding = self(signal)
        return self.fc3(embedding)  # Project CNN output into the embedding space

In [None]:
type(train_set[0][1]['val'])

numpy.ndarray

In [None]:
train_set[0][1]['val']

array([[ -49.41174209,  -49.41174209,  -49.41174209, ...,    3.26986681,
           6.65822132,    4.35320015],
       [  39.23760431,   39.23760431,   39.23760431, ...,  -45.89316397,
         -44.94818514,  -42.39476713],
       [  95.17413779,   95.17413779,   95.17413779, ..., -112.12670284,
        -112.96003617, -116.70843301]])

In [None]:
# Smoke test: push one recording through a freshly initialised ECGEncoder.
# Number of outputs of the CNN head (also the input width of ECGEncoder.fc3)
num_classes = 126

# Create an instance of the model (a new nn.Module starts in training mode,
# so dropout is active for the forward pass below)
ecg_encoder = ECGEncoder(num_classes)

# Convert the numpy array of record 60000 to a float32 PyTorch tensor
input_data = torch.from_numpy(train_set[60000][1]['val']).float()

# Add an extra leading dimension for the batch size
input_data = input_data.unsqueeze(0)

# Ensure the model's weights are float32 to match the input
# (presumably already the case with PyTorch's default dtype)
ecg_encoder = ecg_encoder.float()

# Calling the module runs forward(), i.e. the inherited CNN only; the fc3
# projection is applied by encode(), not here
output = ecg_encoder(input_data)

print(output)

tensor([[ 0.0782,  0.3507,  0.4124, -0.0518, -0.2030, -0.3175,  0.5087,  0.0060,
         -0.7707,  0.2363, -0.0559, -0.5044, -0.5819, -0.0828,  0.0036,  0.1545,
          0.2404,  0.3532, -0.0037, -0.2324,  0.0248, -0.0734,  0.0640, -0.1215,
          0.1172,  0.2116, -0.4930, -0.4360,  0.1286, -0.2000,  0.3085,  0.7351,
          0.1531,  0.3013, -0.1597,  0.2680, -0.3177,  0.1918, -0.5725, -0.0113,
         -0.2665,  0.0122,  0.1794, -0.2652, -0.0538, -0.0325,  0.1387,  0.1452,
          0.0522,  0.2749,  0.4686, -0.1010, -0.3231,  0.0459,  0.0438, -0.1950,
         -0.4089,  0.3040, -0.0319, -0.1998, -0.1212,  0.1492,  0.8501,  0.1538,
          0.0512,  0.0891, -0.0682, -0.2538, -0.1318,  0.0538, -0.2148,  0.5314,
         -0.3847,  0.2059, -0.2431, -0.2679, -0.0384, -0.1250,  0.1306,  0.1732,
         -0.5033,  0.1928,  0.1803,  0.1711,  0.3875,  0.4904,  0.2149,  0.2390,
         -0.4808,  0.1219,  0.3235,  0.3111, -0.1651, -0.3142,  0.2434,  0.2042,
          0.0359,  0.3012, -

In [None]:
# Ensure the model's weights are float32 (same safeguard as in the previous cell)
ecg_encoder = ecg_encoder.float()

# Set the model in evaluation mode: dropout is disabled and batch norm uses
# its running statistics, which is why this output differs from the one above
ecg_encoder.eval()

# Pass the same input through the model again
output = ecg_encoder(input_data)

print(output)

tensor([[ 0.0089,  0.0508,  0.0747,  0.0004, -0.0709, -0.0121, -0.0174,  0.0108,
         -0.0159,  0.0286,  0.0462, -0.1531,  0.0912,  0.0005,  0.0661,  0.0893,
          0.0438,  0.1017, -0.0086, -0.0439,  0.0205, -0.0163,  0.0093,  0.0031,
          0.0023,  0.1072, -0.0517,  0.1196,  0.0267,  0.0081,  0.0102, -0.1114,
         -0.0709, -0.0173,  0.0147, -0.1080,  0.0417, -0.0744, -0.0630,  0.1390,
          0.0279, -0.0136, -0.0302,  0.0075, -0.0056, -0.0309,  0.0641,  0.0728,
         -0.0152,  0.0285, -0.0273,  0.0432,  0.1088, -0.0714, -0.0908, -0.0648,
         -0.0714, -0.0346, -0.0180, -0.0974, -0.0882, -0.0979, -0.0625, -0.0864,
          0.0301, -0.0385, -0.0454, -0.1070,  0.0440,  0.0862,  0.0046,  0.0805,
         -0.0724, -0.0606, -0.1373, -0.0198,  0.0376,  0.0421,  0.0445, -0.0783,
          0.1198, -0.0871, -0.0126, -0.0576, -0.0755, -0.0223, -0.0407,  0.0511,
          0.1379,  0.0229, -0.0920, -0.0784,  0.0149,  0.0016,  0.0568, -0.0786,
          0.0563,  0.1018,  

# InstanceSelector()

- positive_instances are where the ECG embedding and dx_modality embedding align (from the same file/reading)
- negative_instances are where these two embeddings do not align
- filter out text embeddings that are the same or equal to the positive_instances

In [None]:
from tqdm import tqdm

class InstanceSelector:
    """Builds positive and negative (ECG embedding, text embedding) pairs.

    A positive pair couples the ECG embedding of record ``i`` with the text
    embedding of the diagnoses in row ``i`` of ``processed_train_df``.
    A negative pair couples the ECG embedding of record ``i`` with the text
    embedding of a different row whose text embedding is not identical to
    record ``i``'s own.

    NOTE(review): assumes ``train_set[i]`` and row ``i`` (by position) of
    ``processed_train_df`` describe the same recording - confirm against the
    dataset class.

    Args:
        train_set: Indexable dataset; ``train_set[i][1]['val']`` is the signal.
        processed_train_df: DataFrame with a ``dx_modality`` column.
        text_encoder: Object exposing ``encode(diagnoses)``.
        ecg_encoder: Object exposing ``encode(signal)``.
    """

    def __init__(self, train_set, processed_train_df, text_encoder, ecg_encoder):
        self.train_set = train_set
        self.processed_train_df = processed_train_df
        self.text_encoder = text_encoder
        self.ecg_encoder = ecg_encoder

    def _encode_ecg(self, index):
        """Return the ECG embedding of the record at position ``index``."""
        return self.ecg_encoder.encode(self.train_set[index][1]['val'])

    def _encode_texts(self):
        """Return one text embedding per row of the frame, each row encoded once."""
        diagnoses = self.processed_train_df['dx_modality']
        # .iloc keeps the lookup positional, so it also works on sliced or
        # re-indexed frames.
        return [self.text_encoder.encode(diagnoses.iloc[row]) for row in range(len(diagnoses))]

    def get_positive_instances(self):
        """Return ``[(ecg_embedding_i, text_embedding_i), ...]`` for every record."""
        diagnoses = self.processed_train_df['dx_modality']
        positive_instances = []
        for i in tqdm(range(len(self.train_set)), desc="Generating positive instances"):
            text_embedding = self.text_encoder.encode(diagnoses.iloc[i])
            positive_instances.append((self._encode_ecg(i), text_embedding))
        return positive_instances

    def get_negative_instances(self):
        """Return every mismatched ``(ecg_embedding_i, text_embedding_j)`` pair.

        Rows whose text embedding equals record ``i``'s own are skipped, so a
        record is never contrasted with a diagnosis text it actually has.
        The result grows quadratically with the number of records.
        """
        # Encode each text once up front instead of once per (i, j) pair.
        text_embeddings = self._encode_texts()
        negative_instances = []
        for i in tqdm(range(len(self.train_set)), desc="Generating negative instances"):
            ecg_embedding = self._encode_ecg(i)
            own_text = text_embeddings[i]
            for j, other_text in enumerate(text_embeddings):
                if j == i or torch.equal(other_text, own_text):
                    continue
                negative_instances.append((ecg_embedding, other_text))
        return negative_instances

In [None]:
text_encoder = TextEncoder()
ecg_encoder = ECGEncoder(num_classes=126)  # Assuming you have this class defined

In [None]:
instance_selector = InstanceSelector(train_set, processed_train_df, text_encoder, ecg_encoder)

In [None]:
positive_instances = instance_selector.get_positive_instances()

In [None]:
negative_instances = instance_selector.get_negative_instances()

# CLIPModel


In [None]:
class CLIPModel(nn.Module):
    """CLIP-style model aligning ECG embeddings with diagnosis-text embeddings.

    The ECG encoder is trainable; the text encoder is only used to produce
    fixed embeddings. The loss is a cosine-similarity triplet margin loss:
    the ECG embedding should be closer to its own diagnosis text than to any
    differing diagnosis text found in ``processed_train_df``.

    Args:
        train_set: Dataset handed to the ``InstanceSelector``.
        processed_train_df: DataFrame with a ``dx_modality`` column; its
            distinct values form the pool of candidate negative texts.
        margin: Required similarity gap between positive and negative texts.
    """

    def __init__(self, train_set, processed_train_df, margin=0.2):
        super().__init__()
        self.ecg_encoder = ECGEncoder(num_classes=126)  # Initialize ECGEncoder
        self.text_encoder = TextEncoder()  # Initialize TextEncoder
        self.instance_selector = InstanceSelector(train_set, processed_train_df, self.text_encoder, self.ecg_encoder)
        self.margin = margin
        # Distinct diagnosis texts in first-seen order (dict keys de-duplicate).
        self._diagnosis_texts = list(dict.fromkeys(processed_train_df['dx_modality']))
        # Built on first use; a plain attribute, so state_dict() is unchanged.
        self._negative_bank = None

    def _text_bank(self):
        """Return the L2-normalised embeddings of all distinct texts, cached.

        Every distinct text is encoded exactly once; this is the expensive
        step and it is paid on the first forward pass only.
        """
        if self._negative_bank is None:
            with torch.no_grad():
                rows = [self.text_encoder.encode(text) for text in self._diagnosis_texts]
            if rows:
                self._negative_bank = F.normalize(torch.cat(rows, dim=0), dim=-1)
            else:
                self._negative_bank = torch.empty(0, 0)
        return self._negative_bank

    def forward(self, ecgs, diagnoses):
        """Compute the alignment loss for one record (or a batch sharing one text).

        Args:
            ecgs: Batched signal tensor accepted by the ECG encoder.
            diagnoses: Diagnoses of the record, in any form accepted by
                ``text_encoder.encode``.

        Returns:
            Scalar loss tensor, differentiable w.r.t. the ECG encoder.
        """
        # forward() of the ECG encoder stops at the CNN head; fc3 projects
        # into the text-embedding width so the two can be compared.
        ecg_embeddings = F.normalize(self.ecg_encoder.fc3(self.ecg_encoder(ecgs)), dim=-1)
        positive = F.normalize(self.text_encoder.encode(diagnoses), dim=-1).to(ecg_embeddings.device)
        positive_similarity = (ecg_embeddings * positive).sum(dim=-1)  # (batch,)

        bank = self._text_bank()
        if bank.size(0) > 0:
            bank = bank.to(ecg_embeddings.device)
            # Drop texts whose embedding matches the positive one.
            is_positive = torch.isclose(bank, positive).all(dim=1)
            negatives = bank[~is_positive]
        else:
            negatives = bank

        if negatives.size(0) == 0:
            # Nothing to contrast against: just pull the matching pair together.
            return (1.0 - positive_similarity).mean()

        negative_similarity = ecg_embeddings @ negatives.t()  # (batch, n_negatives)
        violations = F.relu(self.margin + negative_similarity - positive_similarity.unsqueeze(1))
        return violations.mean()

Example of how the CLIP-like model works:

Creating small data subsets for training example:

- 100 records only for train_set
- 100 records only for processed_train_df

In [None]:
sample_train_set_100 = train_set[:100]
sample_processed_train_df_100 = processed_train_df.iloc[:100]

In [None]:
len(sample_processed_train_df_100) , len(sample_train_set_100)

(100, 100)

In [None]:
# Initialize model on the 100-record subsets
model = CLIPModel(sample_train_set_100, sample_processed_train_df_100)
# Initialize optimizer: only the ECG encoder's parameters are handed to Adam,
# so the text encoder is never updated
optimizer = torch.optim.Adam(model.ecg_encoder.parameters())

# Counts parameters registered on the nn.Module tree. TextEncoder is a plain
# class rather than an nn.Module, so its ClinicalBERT weights are not included.
num_params = sum(p.numel() for p in model.parameters())
print("Number of parameters: ", num_params)

Number of parameters:  10468286


In [None]:
# Training params
num_epochs = 3

In [None]:
# Initialize a list to store the loss at each step
losses = []

# Make sure dropout / batch norm are in training mode even if an earlier
# cell switched the model to eval()
model.train()

# Training loop: one record per optimisation step
for epoch in range(num_epochs):
    # Progress bar for the inner loop; the total comes from num_epochs
    # rather than a hard-coded value, so the label matches the real run
    for i in tqdm(range(len(sample_train_set_100)), desc=f"Training epoch {epoch+1}/{num_epochs}"):
        # Get ECGs and diagnoses from training set; .iloc keeps the lookup
        # positional so it stays aligned with the dataset index i
        ecgs = sample_train_set_100[i][1]['val']
        diagnoses = sample_processed_train_df_100['dx_modality'].iloc[i]

        # Convert ECGs to tensor and add a dimension for batch size
        ecgs = torch.from_numpy(ecgs).float().unsqueeze(0)

        # Forward pass
        loss = model(ecgs, diagnoses)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Save the loss to a variable
        losses.append(loss.item())

    # Reports the loss of the last step of the epoch
    print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

# Save the model checkpoint after training
torch.save(model.state_dict(), 'model.ckpt')

Training epoch 1/2:   0%|          | 0/100 [00:00<?, ?it/s]
Generating positive instances:   0%|          | 0/100 [00:00<?, ?it/s][A
Generating positive instances:   1%|          | 1/100 [00:00<00:29,  3.37it/s][A
Generating positive instances:   2%|▏         | 2/100 [00:00<00:29,  3.32it/s][A
Generating positive instances:   3%|▎         | 3/100 [00:00<00:25,  3.81it/s][A
Generating positive instances:   4%|▍         | 4/100 [00:01<00:27,  3.43it/s][A
Generating positive instances:   5%|▌         | 5/100 [00:01<00:28,  3.33it/s][A
Generating positive instances:   6%|▌         | 6/100 [00:01<00:26,  3.53it/s][A
Generating positive instances:   7%|▋         | 7/100 [00:02<00:26,  3.53it/s][A
Generating positive instances:   8%|▊         | 8/100 [00:02<00:27,  3.36it/s][A
Generating positive instances:   9%|▉         | 9/100 [00:02<00:28,  3.18it/s][A
Generating positive instances:  10%|█         | 10/100 [00:03<00:33,  2.69it/s][A
Generating positive instances:  11%|█        

RuntimeError: ignored

```
class CLIPModel(nn.Module):
def
def __init__(self, ):
Konstantin Egorov8:28 AM
class CLIPModule(nn.Module):
	def __init__(self, ):
		self.ecg_encoder = ECGEncoder()
		self.text_encoder = TextEncoder()
		self.triplet_loss = TripletLoss()

	def forward(self, ecgs, diagnoses):
		ecgs_embeddings = self.ecg_encoder(ecgs)
		diagnoses_embeddings = self.text_encoder(diagnoses)
		loss = self.triplet_loss(ecgs_embeddings, diagnoses_embeddings)
```