# Imports and Initial Setup

In [None]:
import os
import random
import shutil
from datetime import datetime

import numpy as np
from PIL import Image

In [None]:
# Creates `mixed_dataset/` in the current working directory.
# NOTE(review): the later cells read and write `bowl/`, not `mixed_dataset/` — confirm which name is intended.
! mkdir mixed_dataset

In [None]:
# Mount Google Drive at /content/drive (Colab only); the EKM archives are read from it later.
from google.colab import drive
drive.mount('/content/drive')

# Downloading dataset(s)

In [None]:
# Downloading the MIT-BIH dataset (PhysioNet `mitdb`, version 1.0.0)
# fs = 360
# Containing Anomaly
# Should get all possible EKMs
# wget flags: -r recursive, -N skip files that are not newer, -c resume partial downloads, -np never ascend to the parent directory
! wget -r -N -c -np https://physionet.org/files/mitdb/1.0.0/

In [None]:
# Downloading the NSRDB dataset (PhysioNet `nsrdb`, version 1.0.0)
# fs = 128
# wget flags: -r recursive, -N skip files that are not newer, -c resume partial downloads, -np never ascend to the parent directory
! wget -r -N -c -np https://physionet.org/files/nsrdb/1.0.0/

In [None]:
# Downloading the PTBDB dataset (PhysioNet `ptbdb`, version 1.0.0)
# fs = 1000
# Containing Anomaly
# Should get all possible EKMs
# wget flags: -r recursive, -N skip files that are not newer, -c resume partial downloads, -np never ascend to the parent directory
! wget -r -N -c -np https://physionet.org/files/ptbdb/1.0.0/

# Labeling (MIT-BIHDB)

In [None]:
def health_checker(path):
  """Classify a header file as "healthy" or "cvd".

  Reads the whole file at `path` as text and returns "healthy" when the
  substring "None" occurs anywhere in it; otherwise returns "cvd".
  """
  with open(path, "r") as header_file:
    contents = header_file.read()

  return "healthy" if "None" in contents else "cvd"

In [None]:
# Sanity check: classify the header file of record 113
health_checker("/content/physionet.org/files/mitdb/1.0.0/113.hea")

'healthy'

In [None]:
# Extracting labels of users (saving in users_labels variable)
# healthy users are labeled 1 and others labeled 0
source_path = "/content/physionet.org/files/mitdb/1.0.0/"
users_labels = {}

for user_file in os.listdir(source_path):
  # The user id is the part of the file name before the first dot.
  user_id = user_file.split(".")[0]
  file_type = user_file.split(".")[-1]

  # Only the .hea header files are inspected.
  if file_type != "hea":
    continue

  user_file_full_path = os.path.join(source_path, user_file)

  # Classify once per file so each header is read a single time.
  health_status = health_checker(user_file_full_path)
  if health_status == "healthy":
    users_labels[user_id] = 1
  elif health_status == "cvd":
    users_labels[user_id] = 0

In [None]:
# Sanity check: label stored for record 113 (1 = healthy, 0 = cvd)
users_labels["113"]

1

In [None]:
# Collect the ids whose label is 1 (healthy), in sorted order
healthy_users = sorted(
    user_id for user_id, label in users_labels.items() if label == 1
)
print(healthy_users)

['113', '115', '116', '117', '208', '210', '212', '215', '223', '231', '234']


The healthy users in MIT-DB are the users with the following IDs:

'113', '115', '116', '117', '208', '210', '212', '215', '223', '231', '234'

In [None]:
# Hard-coded copy of the list printed by the labeling cell
# (presumably so the cells below can run without repeating the download and labeling — confirm).
healthy_users = ['113', '115', '116', '117', '208', '210', '212', '215', '223', '231', '234']

# Mixing datasets

## Mixing (MIT-DB) and (NSRDB)

### Extracting MIT-DB dataset

In [None]:
# List the project folder on Drive; the EKM archives extracted below are stored here
! ls /content/drive/MyDrive/ECG\ project/

 EKM_MIT_DB_5bpf.tar.gz   EKM_PTBDB_5bpf.tar.gz
 EKM_NSRDB_5bpf.tar.gz	 'N. Mokhtari'


In [None]:
# Extracting the MIT-DB dataset's .tar.gz archive
# tar flags: -x extract, -z gunzip, -v list files as they are extracted, -f archive path
! tar -xzvf "/content/drive/MyDrive/ECG project/EKM_MIT_DB_5bpf.tar.gz"

In [None]:
# Just changing the name of the extracted folder to EKM_MITDB_dataset
# (the next archive also extracts to EKM_dataset, so the name must be freed first)
! mv EKM_dataset EKM_MITDB_dataset

In [None]:
# Moving the train and test EKMs into one folder
# (the notebook makes its own train/test split later with train_test_split)
! mv EKM_MITDB_dataset/train/* EKM_MITDB_dataset/
! mv EKM_MITDB_dataset/test/* EKM_MITDB_dataset/

In [None]:
# Remove the train/ and test/ folders after their contents were moved up one level
! rm -r EKM_MITDB_dataset/train/
! rm -r EKM_MITDB_dataset/test/

In [None]:
# Display the ids of the healthy MITDB users
healthy_users

['113', '115', '116', '117', '208', '210', '212', '215', '223', '231', '234']

In [None]:
# Counting the healthy users in MITDB
# (the counts are numbers of EKM files, one per directory entry, not distinct users)
source_path = "/content/EKM_MITDB_dataset"
ekms_files_list = os.listdir(source_path)

# The user id is the second-to-last "-"-separated field of each file name.
healthy_users_counter = sum(
    1 for ekm_name in ekms_files_list if ekm_name.split("-")[-2] in healthy_users
)
cvd_users_count = len(ekms_files_list) - healthy_users_counter

print(f"Number of healthy users in MITDB:\t {healthy_users_counter}")
print(f"Number of users with CVD in MITDB:\t {cvd_users_count}")

Number of healthy users in MITDB:	 5203
Number of users with CVD in MITDB:	 16410


In [None]:
# Copying MITDB dataset into bowl
! cp EKM_MITDB_dataset/* bowl/

### Extracting the NSRDB dataset

In [None]:
# Extracting the NSRDB dataset's .tar.gz archive
# tar flags: -x extract, -z gunzip, -v list files as they are extracted, -f archive path
! tar -xzvf "/content/drive/MyDrive/ECG project/EKM_NSRDB_5bpf.tar.gz"

In [None]:
# Just changing the name of the extracted folder to EKM_NSRDB_dataset
! mv EKM_dataset EKM_NSRDB_dataset

In [None]:
# Moving the train and test EKMs into one folder
! find EKM_NSRDB_dataset/train/ -name '*' -exec mv {} EKM_NSRDB_dataset/ \;
! mv EKM_NSRDB_dataset/test/* EKM_NSRDB_dataset/

In [None]:
# Remove the train/ and test/ folders after their contents were moved up one level
! rm -r EKM_NSRDB_dataset/train/
! rm -r EKM_NSRDB_dataset/test/

### Mixing with same proportion of healthy users and ill users

In [None]:
# NSRDB EKMs are all labeled healthy (1) later on, so adding this many of them
# makes the healthy count (MITDB healthy + NSRDB) equal to the MITDB CVD count.
amount_of_ekms_needed_from_NSRDB = cvd_users_count - healthy_users_counter

In [None]:
# Getting ekms needed from NSRDB randomly
source_path = "/content/EKM_NSRDB_dataset"
# os.listdir returns entries in arbitrary order, so sort them to make the seeded draw repeatable.
all_ekms_in_NSRDB = sorted(os.listdir(source_path))

# A dedicated seeded generator gives the same selection on every run
# without touching the global `random` state.
# random.sample raises ValueError if more EKMs are requested than are available.
random_ekms_from_NSRDB = random.Random(42).sample(all_ekms_in_NSRDB, amount_of_ekms_needed_from_NSRDB)

In [None]:
# Sanity check: number of NSRDB EKMs that were sampled
len(random_ekms_from_NSRDB)

11207

In [None]:
# Copy the sampled NSRDB EKMs into the bowl.
# Make sure the target is a directory; otherwise each copy would overwrite a single file named "bowl".
os.makedirs("/content/bowl", exist_ok=True)

# shutil.copy avoids starting one shell process per file and needs no quoting of file names.
for ekm in random_ekms_from_NSRDB:
  shutil.copy(os.path.join(source_path, ekm), "/content/bowl")

In [None]:
# Number of entries in the bowl (the MITDB files plus the sampled NSRDB EKMs)
len(os.listdir("/content/bowl"))

32821

# Vectorizing the dataset

In [None]:
# We can label the images virtually:
# when converting the labels to numbers,
# we can add the number of users in the previous dataset(s) as an offset
# when we want to authenticate them.

In [None]:
# For labeling cvd/healthy (0 or 1), just use the dataset's name
# and the user's id (key)

In [None]:
def vertorizing_png_imges(address, desired_width=33, desired_height=21):
  """Load an image file and return it as a normalized float32 RGB array.

  Args:
    address: Path of the image file to open.
    desired_width: Width in pixels to resize to (default 33).
    desired_height: Height in pixels to resize to (default 21).

  Returns:
    A NumPy float32 array of shape (desired_height, desired_width, 3)
    with values scaled to the range [0, 1].
  """
  # The context manager closes the underlying file even if decoding fails.
  with Image.open(address) as image:
    # convert() returns a new, fully loaded image, so the result stays
    # usable after the file is closed. Resize to the CNN input size.
    rgb_image = image.convert('RGB').resize((desired_width, desired_height))

  # Scale 8-bit channel values from [0, 255] to [0, 1]
  return np.array(rgb_image).astype('float32') / 255.0

In [None]:
# Accumulators filled by the vectorization loop: image arrays and their 0/1 labels
X_dataset = []
y_dataset = []

In [None]:
from IPython.display import clear_output

def progress_bar(index, total_length=None, bar_length=50):
  """Redraw a text progress bar in the current cell's output.

  Args:
    index: Position of the item currently being processed.
    total_length: Total number of items. When None, the number of entries
      in /content/bowl is used.
    bar_length: Width of the bar in characters (default 50).
  """
  if total_length is None:
    total_length = len(os.listdir("/content/bowl"))

  if total_length > 0:
    # Proportional fill, clamped so the bar never under- or overflows.
    filled = int(round(index * bar_length / total_length))
    filled = max(0, min(bar_length, filled))
  else:
    # Nothing to process: draw an empty bar instead of dividing by zero.
    filled = 0

  # Clear the current cell's output
  clear_output(wait=True)

  print("[" + "*" * filled + "-" * (bar_length - filled) + "]")
  print(f"{index}/{total_length}")

In [None]:
# X data for cnn network input
base_path = "/content/bowl/"
images_names = os.listdir(base_path)

# Start from empty lists so re-running this cell does not append duplicates.
X_dataset = []
y_dataset = []

# Redrawing the bar clears the cell output and re-lists the directory,
# so refresh it only periodically (and once more on the last entry).
progress_every = 500
last_index = len(images_names) - 1

before_run_time = datetime.now()

# Get X_dataset by vectorization
# and y_dataset by name of the datasets and user ids
for index, img_name in enumerate(images_names):

    # Checking if the file is an image or not
    if img_name.split(".")[-1] != "png":
      continue

    img_vector = vertorizing_png_imges(base_path + img_name)
    X_dataset.append(img_vector)

    # File names are "-"-separated: the user id is the second-to-last field
    # and the dataset name is the fourth-to-last.
    name_parts = img_name.split("-")
    user_id = name_parts[-2]
    dataset_name = name_parts[-4]

    # labeling 1 for healthy users and 0 for user with cvd
    if dataset_name == "NSRDB":
      y_dataset.append(1)
    elif (dataset_name == "MITDB") and (user_id in healthy_users):
      y_dataset.append(1)
    else:
      y_dataset.append(0)

    if index % progress_every == 0 or index == last_index:
      progress_bar(index)

after_run_time = datetime.now()
diff = after_run_time - before_run_time
# total_seconds() covers the whole elapsed time; .seconds would drop whole days.
print(f"This cell took {int(diff.total_seconds() / 60)} minutes to run.")

[**************************************************]
32820/32821
This cell took 28 minutes to run.


In [None]:
# Stack the per-image lists into NumPy arrays for scikit-learn and Keras
X_dataset = np.array(X_dataset)
y_dataset = np.array(y_dataset)

In [None]:
# Sanity check: the distinct label values present in the dataset
np.unique(y_dataset)

array([0, 1])

# CNN Architecture

## Model trained on a mixture of MITDB and NSRDB

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle.
# NOTE(review): the split is per image, so images of the same user can end up
# in both the training and the test set — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_dataset, y_dataset, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [None]:
# Creating the CNN model, one layer at a time
model = Sequential()
# Input is (height, width, channels) = (21, 33, 3), matching the vectorized images
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(21, 33, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.7))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
# Two output units, one per class (0 = cvd, 1 = healthy)
model.add(Dense(2, activation='softmax'))

# Setting Adam optimizer
optimizer = Adam(learning_rate=0.001)

# Compiling the model with the optimizer; the sparse loss takes integer labels
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train the model for 10 epochs with batches of 32 samples
# (no validation data is passed, so only training metrics are reported)
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ed31bd32e00>

In [None]:
# Evaluate the model on the held-out 20% test split
# evaluate() returns the loss followed by the compiled metrics (accuracy here)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

319/319 - 2s - loss: 0.0169 - accuracy: 0.9948 - 2s/epoch - 6ms/step
Test Loss: 0.0169
Test Accuracy: 0.9948


In [None]:
# Count how often each label occurs in the test split and turn counts into shares
unique_values, counts = np.unique(y_test, return_counts=True)
proportions = counts / len(y_test)

# Map each label to its share of the test split
value_proportions = {
    label: share for label, share in zip(unique_values, proportions)
}

# Report one line per label
for value in value_proportions:
    proportion = value_proportions[value]
    print(f"Value: {value}, Proportion: {proportion:.2f}")

Value: 0, Proportion: 0.50
Value: 1, Proportion: 0.50


## Testing with PTBDB

In [None]:
# Gathering the per-patient files of the PTB dataset into one flat directory
# and recording which user each file belongs to.

# Recreate the target directory from scratch. An absolute path is used so the
# result does not depend on the current working directory.
ptbdb_dat_dir = "/content/ptbdb_dat_files"
if os.path.isdir(ptbdb_dat_dir):
  shutil.rmtree(ptbdb_dat_dir)
os.makedirs(ptbdb_dat_dir)

ecg_mine_path = "/content/physionet.org/files/ptbdb/1.0.0"
ptb_labeling_dict = {}          # user id -> list of that user's file names
ptb_labeling_file_to_user = {}  # file name -> user id

files_list = os.listdir(ecg_mine_path)
# Extracting ECG files of patients and saving them
# into /content/ptbdb_dat_files directory
for patient_folder in files_list:
  # Patient folders are named "patient" followed by a three-character id;
  # everything else in the download directory is skipped.
  if patient_folder[:-3] != "patient":
    continue

  # key is user's id
  key = patient_folder[-3:]
  ptb_labeling_dict[key] = []
  patient_dir = os.path.join(ecg_mine_path, patient_folder)
  each_patient_files = os.listdir(patient_dir)
  for f in each_patient_files:
    file_path = os.path.join(patient_dir, f)
    # Only regular files are copied; sub-directories are skipped.
    if os.path.isfile(file_path):
      shutil.copy(file_path, ptbdb_dat_dir)
    ptb_labeling_dict[key].append(f)
    ptb_labeling_file_to_user[f] = key

In [None]:
ecg_mine_path = "/content/ptbdb_dat_files"

files_list = os.listdir(ecg_mine_path)
healthy_controls = []

# Record the base name of every .hea file whose text contains "Healthy control"
for f in files_list:
  if f.split(".")[-1] != "hea":
    continue

  with open(f"{ecg_mine_path}/{f}", "r") as hea_file:
    header_text = hea_file.read()

  if "Healthy control" in header_text:
    healthy_controls.append(f.split(".")[0])

In [None]:
# Translate each healthy record name into the id of the user who owns its .dat file
healthy_users_id = [
    ptb_labeling_file_to_user[f"{record_name}.dat"]
    for record_name in healthy_controls
]

In [None]:
# Number of unique healthy users (a user can own several healthy records)
len(set(healthy_users_id))

52

In [None]:
# Peek at one id to check its format
healthy_users_id[0]

'252'

In [None]:
# Drop duplicate ids (the resulting order is arbitrary)
healthy_users_id = list(set(healthy_users_id))

In [None]:
# Extracting the PTBDB dataset's .tar.gz archive
# tar flags: -x extract, -z gunzip, -v list files as they are extracted, -f archive path
! tar -xzvf "/content/drive/MyDrive/ECG project/EKM_PTBDB_5bpf.tar.gz"

In [None]:
# Just changing the name of the extracted folder to EKM_PTBDB_dataset
! mv EKM_dataset EKM_PTBDB_dataset

In [None]:
# Moving the train and test EKMs into one folder
# (all PTBDB EKMs are used for evaluation only, so the archive's own split is not needed)
! mv EKM_PTBDB_dataset/train/* EKM_PTBDB_dataset/
! mv EKM_PTBDB_dataset/test/* EKM_PTBDB_dataset/

In [None]:
# Remove the train/ and test/ folders after their contents were moved up one level
! rm -r EKM_PTBDB_dataset/train/
! rm -r EKM_PTBDB_dataset/test/

In [None]:
# Getting 1000 ekms from PTBDB randomly for test
# NOTE(review): the vectorization loop that follows iterates over `all_ekms_in_PTBDB`,
# so `random_ekms_from_PTBDB` is not used by any cell visible here — confirm intent.
# NOTE(review): the draw is unseeded, so the selection differs between runs.
source_path = "/content/EKM_PTBDB_dataset/"
all_ekms_in_PTBDB = os.listdir(source_path)

random_ekms_from_PTBDB = random.sample(all_ekms_in_PTBDB, 1000)

In [None]:
# Accumulators for the PTBDB evaluation data: image arrays and their 0/1 labels
ptb_test_x = []
ptb_test_y = []

In [None]:
# Start from empty lists so re-running this cell does not append duplicates.
ptb_test_x = []
ptb_test_y = []

# Set membership is a constant-time lookup per image.
healthy_ptb_ids = set(healthy_users_id)

for ekm in all_ekms_in_PTBDB:
  # Skip anything that is not a PNG, as the training-set loop does;
  # a stray non-image entry would otherwise make the image loader fail.
  if ekm.split(".")[-1] != "png":
    continue

  img_vector = vertorizing_png_imges(source_path + ekm)
  ptb_test_x.append(img_vector)

  # The user id is the second-to-last "-"-separated field of the file name;
  # label 1 for healthy controls and 0 for everyone else.
  user_id = ekm.split("-")[-2]
  ptb_test_y.append(1 if user_id in healthy_ptb_ids else 0)

In [None]:
# Stack the per-image lists into NumPy arrays for Keras
ptb_test_x = np.array(ptb_test_x)
ptb_test_y = np.array(ptb_test_y)

In [None]:
# Evaluate the model on the PTBDB EKMs (a dataset that was not part of the training mix)
# evaluate() returns the loss followed by the compiled metrics (accuracy here)
test_loss, test_accuracy = model.evaluate(ptb_test_x, ptb_test_y, verbose=2)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

291/291 - 2s - loss: 0.8868 - accuracy: 0.8242 - 2s/epoch - 6ms/step
Test Loss: 0.8868
Test Accuracy: 0.8242
