In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# move to desginated directory
import os
os.chdir('/content/drive/MyDrive/DLH Final Project/')

Mounted at /content/drive


In [4]:
import csv
import numpy as np
import os as os
import pandas as pd
import scipy.io
import skvideo.io
import tensorflow as tf

ECG Model Training Attempt from resources given from DeepHeartBeat (We were not able to reproduce using the code provided, given the core problem of having different dependencies, as the code largely depended on TF2.2.0)

In [None]:
# physio_data processing
physio_data = dict()

for filename in os.listdir('data/physio_training/'):
  if filename.endswith('.mat'):
    mat_data = scipy.io.loadmat('data/physio_training/'+ filename)
    physio_data[filename[:-4]] = {
        'measurements': mat_data['val'][0],
        'frequency': 300
    }

In [7]:
# from sklearn.model_selection import train_test_split
# from models.ecg import ECGModel

In [None]:
# # Load Physionet ECG data
# print('%i subjects loaded' % len(physio_data))

# # Train-validation split
# ids = np.array(list(physio_data.keys()))
# train_ids, val_ids = train_test_split(ids, test_size=0.2, random_state=38)

# train_data = [physio_data[id] for id in train_ids]
# val_data = [physio_data[id] for id in val_ids]

In [None]:

# trained_model_path = './self_trained_models/physionet'
# model = ECGModel(latent_space_dim=8, batch_size=64, hidden_dim=128, learning_rate=5e-4, log_dir=trained_model_path)
# model.fit(train_data, val_data)

# model.save_weights(trained_model_path)

ECG Model (self reproduction with help of LLM)


* Ablation in a function window_data to normalize


In [31]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

# Constants
WINDOW_SIZE = 256
STRIDE = 128
EPOCHS = 10
BATCH_SIZE = 16
LATENT_DIM = 32  # size of latent vector

# Normalize + Windowing
def window_data(data_dict, window_size, stride):
    all_windows = []
    for sample in data_dict.values():
        signal = sample['measurements']
        for start in range(0, len(signal) - window_size + 1, stride):
            window = signal[start:start + window_size]
            # Normalize: zero mean, unit variance
            window = (window - np.mean(window)) / (np.std(window) + 1e-8)
            all_windows.append(window)
    return tf.ragged.constant(all_windows, dtype=tf.float32)

windows = window_data(physio_data, WINDOW_SIZE, STRIDE)
print(f"Total windows: {windows.shape[0]}")

# Split
def split_train_test(ragged_tensor, train_frac=0.8):
    total = ragged_tensor.shape[0]
    split = int(total * train_frac)
    train = ragged_tensor[:split].to_tensor()
    test = ragged_tensor[split:].to_tensor()
    return train, test

train_x, test_x = split_train_test(windows)

# Model
def build_autoencoder(input_length, latent_dim=LATENT_DIM):
    input_layer = layers.Input(shape=(input_length, 1))

    # Encoder
    x = layers.Conv1D(16, 3, padding='same', activation='relu')(input_layer)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Conv1D(8, 3, padding='same', activation='relu')(x)
    x = layers.MaxPooling1D(2)(x)
    x = layers.Flatten()(x)
    latent = layers.Dense(latent_dim, name="latent_vector")(x)

    # Decoder
    x = layers.Dense((input_length // 4) * 8)(latent)
    x = layers.Reshape((input_length // 4, 8))(x)
    x = layers.UpSampling1D(2)(x)
    x = layers.Conv1D(8, 3, padding='same', activation='relu')(x)
    x = layers.UpSampling1D(2)(x)
    x = layers.Conv1D(1, 3, padding='same', activation='linear')(x)

    return models.Model(input_layer, x)

# Prepare input
train_x = tf.expand_dims(train_x, -1)
test_x = tf.expand_dims(test_x, -1)

# Build and compile
model = build_autoencoder(WINDOW_SIZE)
model.compile(optimizer='adam', loss='mse', metrics=['mae','rmse'])

history = model.fit(
    train_x, train_x,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(test_x, test_x)
)

# Evaluate with LLM generated metrics, MSE and MAE
eval_loss, eval_mae = model.evaluate(test_x, test_x)
print(f"\nTest MSE: {eval_loss:.4f}, Test MAE: {eval_mae:.4f}")

Total windows: 13262
Epoch 1/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 13ms/step - loss: 0.4533 - mae: 0.3778 - val_loss: 0.0595 - val_mae: 0.1570
Epoch 2/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - loss: 0.0609 - mae: 0.1520 - val_loss: 0.0449 - val_mae: 0.1318
Epoch 3/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - loss: 0.0468 - mae: 0.1273 - val_loss: 0.0385 - val_mae: 0.1216
Epoch 4/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - loss: 0.0397 - mae: 0.1148 - val_loss: 0.0337 - val_mae: 0.1109
Epoch 5/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - loss: 0.0358 - mae: 0.1088 - val_loss: 0.0301 - val_mae: 0.1041
Epoch 6/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - loss: 0.0339 - mae: 0.1053 - val_loss: 0.0291 - val_mae: 0.1010
Epoch 7/10
[1m664/664[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

Evaluation with Anamaly Detection

In [33]:
"""
From utils.py in the DeepHeartBeat repo
"""

from tqdm import tqdm

def get_model_results(model, data_dict, window_size=WINDOW_SIZE, stride=STRIDE):
    model_results = dict()

    for rec_id, sample in tqdm(data_dict.items(), desc="Evaluating model"):
        signal = sample['measurements']
        windows = []

        for start in range(0, len(signal) - window_size + 1, stride):
            window = signal[start:start + window_size]
            window = (window - np.mean(window)) / (np.std(window) + 1e-8)
            windows.append(window)

        if not windows:
            continue

        windows = np.array(windows)[..., np.newaxis]
        reconstructions = model.predict(windows, verbose=0)
        errors = np.mean((reconstructions - windows)**2, axis=(1, 2))

        model_results[rec_id] = {
            'reconstruction_error': float(np.mean(errors)),
            'reconstruction_stddev': float(np.mean(np.std(reconstructions, axis=1))),
            'num_windows': len(windows),
        }

    return model_results


In [34]:
full_results_train = get_model_results(model, {k: physio_data[k] for k in physio_data.keys()})

Evaluating model: 100%|██████████| 180/180 [00:24<00:00,  7.31it/s]


In [35]:
"""
How well does the model do when classifying noises: "~" labels
"""

from sklearn.metrics import roc_auc_score

with open('./data/physio_training/REFERENCE.csv', newline='') as label_csv_file:
      csv_reader = csv.reader(label_csv_file, delimiter=',')
      labels = {record_id: label for record_id, label in csv_reader}


noise_labels = [1 if labels[rec_id] == '~' else 0 for rec_id in full_results_train.keys()]
rec_error = [full_results_train[rec_id]['reconstruction_error'] for rec_id in full_results_train.keys()]
print("ROC AUC score: {}".format(roc_auc_score(noise_labels, rec_error)))


ROC AUC score: 0.8116760828625236


EchocardioModel Model Training Attempt (were not able to succeed due to limitations in data processing, need to expand a zip of 8GB and after multiple tries in unzipping, there are still a large portion of missing files that do not match with filelistlabel)

In [None]:
# video_cache_folder = './cache/EchoNet-Dynamic/Videos'

# if not os.path.exists(video_cache_folder):
#     os.makedirs(video_cache_folder)

# data_info = pd.read_csv('./data/EchoNet-Dynamic/FileList.csv')
# data_info['globalID'] = data_info['FileName'].apply(lambda s: s[:-4]).astype('string')
# data_info.set_index('globalID', inplace=True)

# files = dict()
# for index, row in data_info.iterrows():
#     filepath = './content/drive/MyDrive/DLH Final Project/data/EchoNet-Dynamic/Videos/' + index + '.avi'
#     filepath_cached = video_cache_folder + '/' + index + '.npz'

#     if not os.path.exists(filepath_cached):

#         frames = skvideo.io.vread(filepath)

#         frames = [frame[:, :, 0] for frame in frames]

#         time_base = 1/data_info.loc[index]['FPS']
#         times = [i*time_base for i in range(len(frames))]

#         np.savez(filepath_cached, frames=frames, times=times)

#     files[index] = filepath_cached

In [None]:
# echonet_train_ids = data_info[data_info.Split == 'TRAIN'].index.values
# echonet_val_ids = data_info[data_info.Split == 'VAL'].index.values
# ids = list(echonet_train_ids) + list(echonet_val_ids)

# files = np.array([files[id] for id in ids])
# kf = KFold(n_splits=5, shuffle=True, random_state=230)
# for i, (train_index, val_index) in enumerate(kf.split(files)):

#     train_files = files[train_index]
#     val_files = files[val_index]

#     trained_model_path = './self_trained_modles/echonet_dynamic_' + str(i)

#     model = EchocardioModel(latent_space_dim=128, batch_size=32, hidden_dim=128, log_dir=trained_model_path)
#     model.fit(train_files, val_files)

#     model.save_weights(trained_model_path)