# ClinicalBERT

In [1]:
# General imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sys
from tqdm import tqdm
import ast
from scipy.signal import resample

# Matlab/WFDB files
import scipy.io as sio

from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn

c:\Users\navme\AppData\Local\Programs\Python\Python38\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\navme\AppData\Local\Programs\Python\Python38\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
sys.path.append('C:/Users/navme/Desktop/ECG_Project/PyFiles')

In [3]:
from helper_functions import *
from dataset import PhysioNetDataset
from tripletloss import TripletLoss

In [4]:
PhysioNet_PATH = f'C:/Users/navme/Desktop/ECG_Thesis_Local/PhysioNet-2021-Challenge/physionet.org/files/challenge-2021/1.0.3/training'
PhysioNet_PATH

'C:/Users/navme/Desktop/ECG_Thesis_Local/PhysioNet-2021-Challenge/physionet.org/files/challenge-2021/1.0.3/training'

In [5]:
train_set = PhysioNetDataset(PhysioNet_PATH, train = True)
val_set = PhysioNetDataset(PhysioNet_PATH, train = False)

In [6]:
processed_train_csv = pd.read_csv('processed_train_set_records.csv')
processed_val_csv = pd.read_csv('processed_val_set_records.csv')

In [49]:
processed_train_csv.columns.values

array(['recording_number', 'recording_file', 'num_leads',
       'sampling_frequency', 'num_samples', 'age', 'sex', 'dx', 'rx',
       'hx', 'sx', 'lead_0_file', 'lead_0_adc_gain', 'lead_0_units',
       'lead_0_adc_resolution', 'lead_0_adc_zero', 'lead_0_initial_value',
       'lead_0_checksum', 'lead_0_lead_name', 'lead_1_file',
       'lead_1_adc_gain', 'lead_1_units', 'lead_1_adc_resolution',
       'lead_1_adc_zero', 'lead_1_initial_value', 'lead_1_checksum',
       'lead_1_lead_name', 'lead_2_file', 'lead_2_adc_gain',
       'lead_2_units', 'lead_2_adc_resolution', 'lead_2_adc_zero',
       'lead_2_initial_value', 'lead_2_checksum', 'lead_2_lead_name',
       'lead_3_file', 'lead_3_adc_gain', 'lead_3_units',
       'lead_3_adc_resolution', 'lead_3_adc_zero', 'lead_3_initial_value',
       'lead_3_checksum', 'lead_3_lead_name', 'lead_4_file',
       'lead_4_adc_gain', 'lead_4_units', 'lead_4_adc_resolution',
       'lead_4_adc_zero', 'lead_4_initial_value', 'lead_4_checksum',
    

In [82]:
train_set[0][1]

{'val': array([[408.24601882, 408.24601882, 408.24601882, ..., -83.34581329,
         -74.965045  , -63.10339951],
        [-92.07603073, -92.07603073, -92.07603073, ...,  57.20010276,
          54.51591647,  58.88514819],
        [225.08001192, 225.08001192, 225.08001192, ...,  93.39571052,
          97.44912853, 117.96825132]])}

In [8]:
class ECGEncoder(nn.Module):
    def __init__(self):
        super(ECGEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

    def forward(self, ecg_signal):
        # Convert ECG signal to string as BERT takes text as input
        ecg_signal = ' '.join(map(str, ecg_signal))
        inputs = self.tokenizer(ecg_signal, return_tensors="pt")
        outputs = self.bert(**inputs)
        # Use the BERT embeddings for the [CLS] token (first token)
        embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings

In [78]:
class ECGEncoder(nn.Module):
    def __init__(self):
        super(ECGEncoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        
    def forward(self, data):
        metadata, ecg_signal = data
        embeddings = []
        for lead_info, lead_signal in zip(metadata['leads_info'], ecg_signal):
            # Create a textual description of the ECG signal
            description = f"Lead {lead_info['lead_name']} with initial value {lead_info['initial_value']} and checksum {lead_info['checksum']} has signal values {lead_signal}"
            inputs = self.tokenizer(description, return_tensors="pt")
            outputs = self.bert(**inputs)
            embeddings.append(outputs.last_hidden_state[:, 0, :])
        return torch.cat(embeddings, dim=0)

In [79]:
# Instantiate the ECGEncoder
ecg_encoder = ECGEncoder()

# Get the first data point from the training set
data = train_set[0]

# Generate a ClinicalBERT embedding for the data point
embeddings = ecg_encoder(data)

for i, embedding in enumerate(embeddings):
    print(f"Embedding for lead {i}: {embedding}")

Embedding for lead 0: tensor([ 2.6361e-01,  8.8632e-02, -4.2674e-01,  3.2769e-01,  2.7432e-01,
        -7.9961e-02,  4.5787e-01,  2.4446e-01,  7.8162e-02, -2.5685e-01,
        -4.1844e-01, -6.6938e-02, -4.2574e-01,  3.7564e-01, -3.5149e-01,
         1.3120e-01,  3.5058e-01,  2.5360e-01, -4.3578e-01,  1.7420e-01,
         9.3913e-02,  1.9671e-01, -1.7270e-01, -2.4773e-01, -3.5569e-01,
        -5.1214e-01,  9.7906e-01,  4.0506e-01,  2.7232e-02,  4.5981e-01,
         5.2280e-01, -1.4626e-03, -9.2910e-03, -1.2541e-01, -1.5461e-01,
        -5.4510e-02,  1.0910e-01,  2.1561e-01, -9.0366e-02, -6.8356e-02,
        -3.3390e-03,  3.2699e-02,  1.0143e+00, -4.8333e-02,  3.7887e-01,
        -4.9696e-01, -2.7866e-01,  6.3696e-01, -4.6769e-01,  2.4993e-01,
         1.5906e-02,  5.8978e-01, -2.1196e-03, -7.6264e-02,  9.2241e-02,
        -1.0423e-01, -1.0745e-01, -4.0016e-01, -9.6476e-02, -6.6540e-02,
         2.5644e-01, -1.0989e-02,  3.5057e-01, -5.9316e-02, -1.7401e-01,
        -2.0260e-01, -1.2896e

In [None]:
# Instantiate the ECGEncoder
ecg_encoder = ECGEncoder()

# Assume we have an ECG signal from the PhysioNetDataset
_, ecg_signal = train_set[0]  # Get the first sample from the training set

# We'll just use one of the leads for this example
lead_name = list(ecg_signal.keys())[0]  # Get the name of the first lead
lead_ecg_signal = ecg_signal[lead_name]  # Get the ECG signal for that lead

# Convert the ECG signal to a ClinicalBERT embedding
ecg_embedding = ecg_encoder(lead_ecg_signal)

# Now ecg_embedding is a tensor containing the ClinicalBERT embedding of the ECG signal
print(ecg_embedding)

In [16]:
_

{'recording_number': 'JS00001',
 'recording_file': 'JS00001.mat',
 'num_leads': 12,
 'sampling_frequency': 500,
 'num_samples': 5000,
 'leads_info': [{'file': 'JS00001.mat',
   'adc_gain': 1000.0,
   'units': 'mV',
   'adc_resolution': 16,
   'adc_zero': 0,
   'initial_value': -254,
   'checksum': 21756,
   'lead_name': '0'},
  {'file': 'JS00001.mat',
   'adc_gain': 1000.0,
   'units': 'mV',
   'adc_resolution': 16,
   'adc_zero': 0,
   'initial_value': 264,
   'checksum': -599,
   'lead_name': '0'},
  {'file': 'JS00001.mat',
   'adc_gain': 1000.0,
   'units': 'mV',
   'adc_resolution': 16,
   'adc_zero': 0,
   'initial_value': 517,
   'checksum': -22376,
   'lead_name': '0'},
  {'file': 'JS00001.mat',
   'adc_gain': 1000.0,
   'units': 'mV',
   'adc_resolution': 16,
   'adc_zero': 0,
   'initial_value': -5,
   'checksum': 28232,
   'lead_name': '0'},
  {'file': 'JS00001.mat',
   'adc_gain': 1000.0,
   'units': 'mV',
   'adc_resolution': 16,
   'adc_zero': 0,
   'initial_value': -386,


In [39]:
class CLIPModel(nn.Module):
    def __init__(self, margin=1.0):
        super(CLIPModel, self).__init__()
        self.ecg_encoder = ECGEncoder()
        self.triplet_loss = TripletLoss(margin)

    def forward(self, anchor, positive, negative):
        anchor_embedding = self.ecg_encoder(anchor)
        positive_embedding = self.ecg_encoder(positive)
        negative_embedding = self.ecg_encoder(negative)

        loss = self.triplet_loss(anchor_embedding, positive_embedding, negative_embedding)
        return loss

In [34]:
# Get the first data point from the training set
data = train_set[0]

# Print the data point to inspect its structure
print(data)

({'recording_number': 'JS00001', 'recording_file': 'JS00001.mat', 'num_leads': 12, 'sampling_frequency': 500, 'num_samples': 5000, 'leads_info': [{'file': 'JS00001.mat', 'adc_gain': 1000.0, 'units': 'mV', 'adc_resolution': 16, 'adc_zero': 0, 'initial_value': -254, 'checksum': 21756, 'lead_name': '0'}, {'file': 'JS00001.mat', 'adc_gain': 1000.0, 'units': 'mV', 'adc_resolution': 16, 'adc_zero': 0, 'initial_value': 264, 'checksum': -599, 'lead_name': '0'}, {'file': 'JS00001.mat', 'adc_gain': 1000.0, 'units': 'mV', 'adc_resolution': 16, 'adc_zero': 0, 'initial_value': 517, 'checksum': -22376, 'lead_name': '0'}, {'file': 'JS00001.mat', 'adc_gain': 1000.0, 'units': 'mV', 'adc_resolution': 16, 'adc_zero': 0, 'initial_value': -5, 'checksum': 28232, 'lead_name': '0'}, {'file': 'JS00001.mat', 'adc_gain': 1000.0, 'units': 'mV', 'adc_resolution': 16, 'adc_zero': 0, 'initial_value': -386, 'checksum': 16619, 'lead_name': '0'}, {'file': 'JS00001.mat', 'adc_gain': 1000.0, 'units': 'mV', 'adc_resolutio

In [40]:
# Instantiate the model
model = CLIPModel(margin=1.0)

# Define an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Assume we have some data points
anchor_metadata, anchor_data = train_set[0]
positive_metadata, positive_data = train_set[1]
negative_metadata, negative_data = train_set[2]

# Extract the signal values
anchor_signal = anchor_data['val']
positive_signal = positive_data['val']
negative_signal = negative_data['val']

# Convert signal values to tensors
anchor = torch.tensor(anchor_signal, dtype=torch.float32)
positive = torch.tensor(positive_signal, dtype=torch.float32)
negative = torch.tensor(negative_signal, dtype=torch.float32)

# Zero the gradients
optimizer.zero_grad()

# Forward pass
loss = model((anchor_metadata, anchor), (positive_metadata, positive), (negative_metadata, negative))

# Backward pass
loss.backward()

# Update the weights
optimizer.step()

# Print the loss
print(f"Loss: {loss.item()}")

Loss: 1.2122842073440552


In [30]:
# Instantiate the CLIPModel
clip_model = CLIPModel()

# Assume we have an anchor, positive, and negative ECG signals from the PhysioNetDataset
_, anchor_ecg_signal = train_set[0]  # Get the first sample from the training set
_, positive_ecg_signal = train_set[1]  # Get the second sample from the training set
_, negative_ecg_signal = train_set[2]  # Get the third sample from the training set

# We'll just use one of the leads for this example
lead_name = list(anchor_ecg_signal.keys())[0]  # Get the name of the first lead
anchor_lead_ecg_signal = anchor_ecg_signal[lead_name]  # Get the ECG signal for that lead
positive_lead_ecg_signal = positive_ecg_signal[lead_name]  # Get the ECG signal for that lead
negative_lead_ecg_signal = negative_ecg_signal[lead_name]  # Get the ECG signal for that lead

In [31]:
# Compute the triplet loss for these ECG signals
triplet_loss = clip_model(anchor_lead_ecg_signal, positive_lead_ecg_signal, negative_lead_ecg_signal)

# Now triplet_loss is a tensor containing the triplet loss for these ECG signals
print(triplet_loss)

ValueError: too many values to unpack (expected 2)

In [None]:
# Instantiate the CLIPModel
clip_model = CLIPModel()

num_epochs = 2

# Define an optimizer
optimizer = torch.optim.Adam(clip_model.parameters())

# Loop over your training data
for epoch in range(num_epochs):
    for i in range(len(train_set)):
        # Get the anchor, positive, and negative ECG signals
        _, anchor_ecg_signal = train_set[i]
        _, positive_ecg_signal = train_set[(i + 1) % len(train_set)]  # Use the next sample as the positive example
        _, negative_ecg_signal = train_set[(i + 2) % len(train_set)]  # Use the sample after that as the negative example

        # We'll just use one of the leads for this example
        lead_name = list(anchor_ecg_signal.keys())[0]  # Get the name of the first lead
        anchor_lead_ecg_signal = anchor_ecg_signal[lead_name]  # Get the ECG signal for that lead
        positive_lead_ecg_signal = positive_ecg_signal[lead_name]  # Get the ECG signal for that lead
        negative_lead_ecg_signal = negative_ecg_signal[lead_name]  # Get the ECG signal for that lead

        # Compute the triplet loss for these ECG signals
        triplet_loss = clip_model(anchor_lead_ecg_signal, positive_lead_ecg_signal, negative_lead_ecg_signal)

        # Backpropagate the loss and update the model parameters
        optimizer.zero_grad()
        triplet_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} Loss: {triplet_loss.item()}")

In [42]:
# Instantiate the CLIPModel
clip_model = CLIPModel()

num_epochs = 2

# Define an optimizer
optimizer = torch.optim.Adam(clip_model.parameters())

# Loop over your training data
for epoch in range(num_epochs):
    for i in range(len(train_set)):
        # Get the anchor, positive, and negative ECG signals
        anchor_metadata, anchor_ecg_signal = train_set[i]
        positive_metadata, positive_ecg_signal = train_set[(i + 1) % len(train_set)]  # Use the next sample as the positive example
        negative_metadata, negative_ecg_signal = train_set[(i + 2) % len(train_set)]  # Use the sample after that as the negative example

        # Compute the triplet loss for these ECG signals
        triplet_loss = clip_model((anchor_metadata, anchor_ecg_signal), (positive_metadata, positive_ecg_signal), (negative_metadata, negative_ecg_signal))

        # Backpropagate the loss and update the model parameters
        optimizer.zero_grad()
        triplet_loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} Loss: {triplet_loss.item()}")

KeyboardInterrupt: 

1. The `ECGEncoder` is used to convert the raw ECG signals into embeddings using the ClinicalBERT model.

2. The `CLIPModel` uses the `ECGEncoder` to get embeddings for the anchor, positive, and negative examples, and then computes the triplet loss on these embeddings. By training the `CLIPModel` on your dataset, you're effectively learning a new representation for your ECG signals where similar signals are close together and different signals are far apart.

3. Once you've trained the `CLIPModel`, you can use it to transform your ECG signals into this new representation. You can then train a classifier on these transformed signals. This classifier could be any type of model you choose, such as a linear classifier, a support vector machine, a decision tree, a neural network, etc. The hope is that the new representation learned by the `CLIPModel` will be more useful for classification than the raw ECG signals.

In [None]:
import torch.nn as nn
import torch.optim as optim

# Load the data
train_dataset = PhysioNetDataset(processed_train_csv)
val_dataset = PhysioNetDataset(processed_val_csv)

# Instantiate the CLIPModel
clip_model = CLIPModel()

# Define the 1D CNN model
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 64, kernel_size=5, stride=2)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(64, num_classes)  # num_classes is the number of unique values in 'dx_modality'

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = torch.flatten(x, start_dim=1)
        x = self.fc(x)
        return x

cnn = CNN()

# Define a loss function and an optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn.parameters())

# Train the model
for epoch in range(num_epochs):
    for i in range(len(train_dataset)):
        # Get the ECG signal and the label
        _, ecg_signal = train_dataset[i]
        label = processed_train_csv['dx_modality'][i]

        # Transform the ECG signal using the CLIPModel
        ecg_signal = clip_model(ecg_signal)

        # Train the CNN on the transformed ECG signal and the label
        optimizer.zero_grad()
        outputs = cnn(ecg_signal)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs} Loss: {loss.item()}")

# Evaluate the model on the validation set
correct = 0
total = 0
with torch.no_grad():
    for i in range(len(val_dataset)):
        # Get the ECG signal and the label
        _, ecg_signal = val_dataset[i]
        label = processed_val_csv['dx_modality'][i]

        # Transform the ECG signal using the CLIPModel
        ecg_signal = clip_model(ecg_signal)

        # Compute the model's prediction
        outputs = cnn(ecg_signal)
        _, predicted = torch.max(outputs.data, 1)

        total += label.size(0)
        correct += (predicted == label).sum().item()

print(f"Accuracy on the validation set: {100 * correct / total}%")

To predict the class of a new ECG signal using the trained 1D CNN model, you can follow these steps:

1. Load the new ECG signal.
2. Transform the ECG signal into the learned representation using the `CLIPModel`.
3. Pass the transformed ECG signal through the 1D CNN to get the predicted class.

In [None]:
from scipy.io import loadmat

# Load the new ECG signal
new_ecg_signal = loadmat('new_ecg_signal.mat')

# Assuming the ECG signal is stored in a variable named 'ecg' in the .mat file
new_ecg_signal = new_ecg_signal['ecg']

# Transform the ECG signal using the CLIPModel
new_ecg_signal = clip_model(new_ecg_signal)

# Pass the transformed ECG signal through the 1D CNN to get the predicted class
outputs = cnn(new_ecg_signal)
_, predicted_class = torch.max(outputs.data, 1)

print(f"The predicted class for the new ECG signal is: {predicted_class.item()}")

This code loads a new ECG signal, transforms it using the `CLIPModel`, and then passes the transformed signal through the 1D CNN to get the predicted class. The predicted class is then printed to the console. Note that you'll need to replace `'new_ecg_signal.mat'` with the path to your new ECG signal file.

In [83]:
pip install neurokit2

Collecting neurokit2
  Obtaining dependency information for neurokit2 from https://files.pythonhosted.org/packages/b4/22/e7e3b341b80a1e56f270a0137d4a3e6b20a58ae56c77785c0f6d3c6ba653/neurokit2-0.2.7-py2.py3-none-any.whl.metadata
  Downloading neurokit2-0.2.7-py2.py3-none-any.whl.metadata (37 kB)
Downloading neurokit2-0.2.7-py2.py3-none-any.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ------ --------------------------------- 0.2/1.3 MB 3.9 MB/s eta 0:00:01
   ---------------- ----------------------- 0.5/1.3 MB 5.6 MB/s eta 0:00:01
   ---------------------------------------  1.3/1.3 MB 9.0 MB/s eta 0:00:01
   ---------------------------------------- 1.3/1.3 MB 6.8 MB/s eta 0:00:00
Installing collected packages: neurokit2
Successfully installed neurokit2-0.2.7
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [95]:
train_set[0][1]

{'val': array([[408.24601882, 408.24601882, 408.24601882, ..., -83.34581329,
         -74.965045  , -63.10339951],
        [-92.07603073, -92.07603073, -92.07603073, ...,  57.20010276,
          54.51591647,  58.88514819],
        [225.08001192, 225.08001192, 225.08001192, ...,  93.39571052,
          97.44912853, 117.96825132]])}