In [11]:
import os
import glob
import numpy as np
import tensorflow as tf
import scipy.signal as signal
from scipy.io import loadmat
from sklearn.model_selection import train_test_split

# -------------------------------------------
# 1. Preprocessing Functions
# -------------------------------------------
def bandpass_filter(ecg_signal, lowcut=0.5, highcut=40, fs=500, order=4):
    """Band-pass an ECG trace with a Butterworth filter run forward and backward.

    Args:
        ecg_signal: Samples to filter; filtering runs along the last axis.
        lowcut: Lower cut-off frequency, in the same unit as ``fs``.
        highcut: Upper cut-off frequency, in the same unit as ``fs``.
        fs: Sampling frequency of ``ecg_signal``.
        order: Order passed to the Butterworth design routine.

    Returns:
        The filtered signal, same shape as the input.
    """
    nyquist = 0.5 * fs
    # The digital Butterworth design takes cut-offs as fractions of Nyquist.
    normalized_band = [lowcut / nyquist, highcut / nyquist]
    numerator, denominator = signal.butter(
        order, normalized_band, btype='band', analog=False
    )
    # filtfilt applies the filter in both directions, cancelling phase delay.
    return signal.filtfilt(numerator, denominator, ecg_signal)

def downsample_signal(ecg_signal, factor=5):
    """Keep one sample out of every ``factor``, starting with the first.

    No filtering is done here; the caller is responsible for removing
    content above the new Nyquist frequency beforehand.
    Example: 5000 samples with ``factor=5`` become 1000 samples.
    """
    every_nth = slice(None, None, factor)
    return ecg_signal[every_nth]

def frame_ecg(ecg_signal, frame_size=1000):
    """
    Force a 1D ECG array to a fixed length of ``frame_size`` samples.

    Signals that are already long enough are cut to their first
    ``frame_size`` samples; shorter ones are copied into a zero-filled
    buffer of the same dtype, so the padding sits at the end.
    """
    length = ecg_signal.shape[0]
    if length >= frame_size:
        return ecg_signal[:frame_size]

    frame = np.zeros(frame_size, dtype=ecg_signal.dtype)
    frame[:length] = ecg_signal
    return frame

# -------------------------------------------
# 2. Functions to Handle .hea Files and Build Label Mapping
# -------------------------------------------
def read_hea_file(hea_path):
    """
    Read a .hea header file and return its diagnostic codes.

    The first comment line whose text starts with ``Dx:`` is used. Both the
    ``#Dx:`` and ``# Dx:`` spellings are accepted, as is leading whitespace
    before the ``#``. Codes may be separated by commas and/or whitespace.

    Args:
        hea_path: Path to the header file.

    Returns:
        A list of code strings in file order; empty if no ``Dx`` line exists
        or the line carries no codes.
    """
    # Explicit encoding keeps parsing independent of the platform locale;
    # undecodable bytes elsewhere in the header must not abort label loading.
    with open(hea_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            stripped = line.strip()
            if not stripped.startswith('#'):
                continue
            comment = stripped[1:].lstrip()
            if comment.startswith('Dx:'):
                # Only the first Dx line counts, so stop reading here.
                return comment[len('Dx:'):].replace(',', ' ').split()
    return []

def build_diagnostic_mapping(hea_files):
    """Assign a class index to every diagnostic code found in ``hea_files``.

    Codes are collected from each header via ``read_hea_file``,
    de-duplicated, and numbered in sorted (string) order, so the same set of
    headers always produces the same mapping.
    """
    unique_codes = {
        code
        for hea_path in hea_files
        for code in read_hea_file(hea_path)
    }
    return {code: position for position, code in enumerate(sorted(unique_codes))}

# -------------------------------------------
# 3. File Pairing
# -------------------------------------------
def get_file_pairs(data_dir):
    """
    Find every .mat recording in ``data_dir`` that has a matching .hea header.

    Args:
        data_dir: Directory to scan (not recursive).

    Returns:
        A list of ``(mat_file_path, hea_file_path)`` tuples sorted by path.
        Recordings without a header are left out.
    """
    # glob yields paths in arbitrary, filesystem-dependent order; sorting
    # keeps any seeded split made from this list reproducible across machines.
    mat_files = sorted(glob.glob(os.path.join(data_dir, "*.mat")))
    file_pairs = []
    for mat_file in mat_files:
        hea_file = os.path.splitext(mat_file)[0] + ".hea"
        if os.path.exists(hea_file):
            file_pairs.append((mat_file, hea_file))
    return file_pairs

# -------------------------------------------
# 4. Data Generator Function for tf.data
# -------------------------------------------
def data_generator(file_pairs, diag_mapping, frame_size=1000, fs=500):
    """
    Yield ``(signal, labels)`` pairs, one per record in ``file_pairs``.

    Each lead of a record is band-pass filtered, decimated by 5 and framed to
    ``frame_size`` samples; the leads are then stacked so ``signal`` is a
    float32 array of shape (frame_size, num_leads). ``labels`` is a float32
    multi-hot vector with one slot per entry of ``diag_mapping``; codes that
    are not in the mapping are ignored.
    """
    num_classes = len(diag_mapping)
    for mat_path, hea_path in file_pairs:
        # The recording is stored under the 'val' key of the .mat file.
        leads = np.squeeze(loadmat(mat_path)['val'])
        if leads.ndim == 1:
            # Single-lead record: add a leading channel axis.
            leads = leads[np.newaxis, :]
        # leads is now (num_leads, samples) -- presumably 5000 samples at 500 Hz.

        per_lead = [
            frame_ecg(
                downsample_signal(bandpass_filter(leads[idx], fs=fs), factor=5),
                frame_size=frame_size,
            )
            for idx in range(leads.shape[0])
        ]
        # (num_leads, frame_size) -> (frame_size, num_leads)
        final_signal = np.stack(per_lead, axis=0).transpose(1, 0).astype(np.float32)

        # Multi-hot encode the diagnoses listed in the header.
        label_vector = np.zeros(num_classes, dtype=np.float32)
        known_codes = (c for c in read_hea_file(hea_path) if c in diag_mapping)
        for code in known_codes:
            label_vector[diag_mapping[code]] = 1.0

        yield final_signal, label_vector

# -------------------------------------------
# 5. Custom Keras Layers: BasicResBlock and Attention
# -------------------------------------------
class BasicResBlock(tf.keras.layers.Layer):
    """Two-convolution 1D residual block (Conv-BN-ReLU-Conv-BN, add, ReLU).

    Args:
        out_channels: Number of filters in both convolutions.
        kernel_size: Kernel width of both convolutions.
        strides: Temporal stride of the block. It is applied once, in the
            first convolution, and mirrored on the shortcut so both branches
            end up with the same length.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, out_channels, kernel_size=7, strides=1, **kwargs):
        super().__init__(**kwargs)
        # Kept verbatim so get_config() can reproduce the constructor call.
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.strides = strides

        self.conv1 = tf.keras.layers.Conv1D(out_channels, kernel_size, strides=strides, padding='same')
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.relu = tf.keras.layers.ReLU()
        # Only the first convolution strides; striding both would shrink the
        # main branch twice and make it impossible to add to the shortcut.
        self.conv2 = tf.keras.layers.Conv1D(out_channels, kernel_size, strides=1, padding='same')
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.downsample = None

    def build(self, input_shape):
        # A 1x1 projection is needed whenever the identity shortcut would not
        # match the main branch: different channel count or a temporal stride.
        channels_differ = input_shape[-1] != self.conv1.filters
        is_strided = tuple(self.conv1.strides) != (1,)
        if channels_differ or is_strided:
            self.downsample = tf.keras.Sequential([
                tf.keras.layers.Conv1D(self.conv1.filters, kernel_size=1,
                                       strides=self.conv1.strides, padding='same'),
                tf.keras.layers.BatchNormalization()
            ])
        super().build(input_shape)

    def call(self, inputs, training=False):
        residual = inputs
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x, training=training)
        if self.downsample is not None:
            residual = self.downsample(inputs, training=training)
        x = self.relu(x + residual)
        return x

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            'out_channels': self.out_channels,
            'kernel_size': self.kernel_size,
            'strides': self.strides,
        })
        return config

class Attention(tf.keras.layers.Layer):
    """Collapse the time axis with a learned softmax-weighted average.

    A single-unit dense layer scores every time step; the scores are
    normalised over time and used to sum the input features.
    Input: (batch, timesteps, features). Output: (batch, features).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.attention_dense = tf.keras.layers.Dense(1)

    def call(self, inputs):
        # One scalar score per time step: (batch, timesteps, 1)
        step_scores = self.attention_dense(inputs)
        # Normalise across axis 1 so the weights of each sequence sum to one.
        step_weights = tf.nn.softmax(step_scores, axis=1)
        # Broadcast the weights over the feature axis, then sum out time.
        return tf.reduce_sum(tf.multiply(inputs, step_weights), axis=1)

# -------------------------------------------
# 6. Model Definition
# -------------------------------------------
def build_model(num_classes, frame_size=1000, num_leads=12, res_channels=32, lstm_hidden=128, num_layers=1):
    """
    Assemble the ECG multi-label classifier.

    Pipeline, in order:
      1. Two residual blocks (``res_channels`` then ``2 * res_channels`` filters).
      2. Average pooling with window and stride 10 along time.
      3. ``num_layers`` stacked bidirectional LSTMs returning full sequences.
      4. Attention pooling over the remaining time steps.
      5. A dense layer with ``num_classes`` outputs and no activation.

    The input has shape (frame_size, num_leads). The outputs are raw logits,
    so the loss must be configured with ``from_logits=True``.
    """
    model_input = tf.keras.Input(shape=(frame_size, num_leads))

    # Convolutional feature extractor
    features = model_input
    for width in (res_channels, res_channels * 2):
        features = BasicResBlock(width, kernel_size=7)(features)

    # Shorten the sequence tenfold before the recurrent layers
    features = tf.keras.layers.AveragePooling1D(pool_size=10, strides=10)(features)

    # Recurrent stack; every layer keeps the time axis for the attention below
    for _ in range(num_layers):
        recurrent = tf.keras.layers.LSTM(lstm_hidden, return_sequences=True)
        features = tf.keras.layers.Bidirectional(recurrent)(features)

    pooled = Attention()(features)

    # Logits, one per diagnostic class
    logits = tf.keras.layers.Dense(num_classes)(pooled)

    return tf.keras.Model(inputs=model_input, outputs=logits)

# -------------------------------------------
# 7. Main Function: Data Loading, Model Training, and Saving
# -------------------------------------------
def main():
    """Train the ECG multi-label model on a folder of .mat/.hea records and save it.

    Steps: pair recordings with headers, build the code-to-index mapping,
    split 80/20 into training and validation files, stream both splits
    through ``data_generator``, train, and write the model to ``model_path``.
    Returns early (after printing a message) if no record pairs are found.
    """
    # Set your data directory containing .mat and .hea files
    data_dir = "/Users/Dell/Desktop/Data100"  # Replace with your actual data folder path
    model_path = "/Users/Dell/Desktop/repository/updated_ecg_multilabel_model_tf.h5"

    file_pairs = get_file_pairs(data_dir)
    if not file_pairs:
        print("No file pairs found in the specified directory.")
        return

    # Build diagnostic mapping from all .hea files
    hea_files = [pair[1] for pair in file_pairs]
    diag_mapping = build_diagnostic_mapping(hea_files)
    print("Diagnostic Mapping:", diag_mapping)

    # Split file pairs into training and validation sets
    train_pairs, val_pairs = train_test_split(file_pairs, test_size=0.2, random_state=42)

    # Parameters
    frame_size = 1000
    fs = 500
    num_classes = len(diag_mapping)
    batch_size = 8
    num_epochs = 100

    # Read the lead count from a single record taken straight from the
    # generator. Going through the shuffled dataset instead would load a whole
    # shuffle buffer of files just to inspect one shape.
    first_signal, _ = next(data_generator(train_pairs[:1], diag_mapping, frame_size, fs))
    num_leads = first_signal.shape[-1]
    print("Number of leads detected:", num_leads)

    # With the lead count known, the signature can be fully static.
    output_signature = (
        tf.TensorSpec(shape=(frame_size, num_leads), dtype=tf.float32),
        tf.TensorSpec(shape=(num_classes,), dtype=tf.float32),
    )

    def make_dataset(pairs):
        """Wrap ``data_generator`` over ``pairs`` in a tf.data pipeline."""
        return tf.data.Dataset.from_generator(
            lambda: data_generator(pairs, diag_mapping, frame_size, fs),
            output_signature=output_signature,
        )

    train_dataset = make_dataset(train_pairs).shuffle(buffer_size=100).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    val_dataset = make_dataset(val_pairs).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Build the model with the detected number of leads
    model = build_model(num_classes, frame_size=frame_size, num_leads=num_leads,
                        res_channels=16, lstm_hidden=64, num_layers=1)
    model.summary()

    # Compile the model with binary cross-entropy loss (from_logits=True)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.BinaryAccuracy()])

    # Create the output folder before training so a missing directory cannot
    # throw away a finished run at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Train the model
    model.fit(train_dataset,
              epochs=num_epochs,
              validation_data=val_dataset)

    # Save the model and report the path that was actually written
    model.save(model_path)
    print("Model saved as", model_path)

# Run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Diagnostic Mapping: {'10370003': 0, '106068003': 1, '111288001': 2, '11157007': 3, '111975006': 4, '13640000': 5, '164865005': 6, '164873001': 7, '164889003': 8, '164890007': 9, '164896001': 10, '164909002': 11, '164912004': 12, '164917005': 13, '164930006': 14, '164931005': 15, '164934002': 16, '164937009': 17, '164942001': 18, '164947007': 19, '17338001': 20, '17366009': 21, '195042002': 22, '195060002': 23, '195101003': 24, '233892002': 25, '233897008': 26, '233917008': 27, '251120003': 28, '251146004': 29, '251164006': 30, '251166008': 31, '251170000': 32, '251173003': 33, '251180001': 34, '251187003': 35, '251198002': 36, '251199005': 37, '251205003': 38, '251223006': 39, '270492004': 40, '27885002': 41, '284470004': 42, '29320008': 43, '365413008': 44, '39732003': 45, '418818005': 46, '425856008': 47, '426177001': 48, '426183003': 49, '426627000': 50, '426648003': 51, '426664006': 52, '426761007': 53, '426783006': 54, '426995002': 55, '427084000': 56, '427172004': 57, '427393009'

Epoch 1/100
514/514 ━━━━━━━━━━━━━━━━━━━━ 79s 135ms/step - binary_accuracy: 0.9621 - loss: 0.1980 - val_binary_accuracy: 0.9636 - val_loss: 0.1228
Epoch 2/100
514/514 ━━━━━━━━━━━━━━━━━━━━ 69s 133ms/step - binary_accuracy: 0.9632 - loss: 0.1240 - val_binary_accuracy: 0.9638 - val_loss: 0.1195
Epoch 3/100
514/514 ━━━━━━━━━━━━━━━━━━━━ 66s 126ms/step - binary_accuracy: 0.9637 - loss: 0.1196 - val_binary_accuracy: 0.9641 - val_loss: 0.1141
Epoch 4/100
514/514 ━━━━━━━━━━━━━━━━━━━━ 70s 134ms/step - binary_accuracy: 0.9639 - loss: 0.1137 - val_binary_accuracy: 0.9646 - val_loss: 0.1090
Epoch 5/100
514/514 ━━━━━━━━━━━━━━━━━━━━ 69s 133ms/step - binary_accuracy: 0.9646 - loss: 0.1086 - val_binary_accuracy: 0.9650 - val_loss: 0.1050
Epoch 6/100
514/514 ━━━━━━━━━━━━━━━━━━━━ 70s 134ms/step - binary_a



Model saved as ecg_multilabel_model_tf.h5
