# Install some dependencies

In [6]:
!pip install scipy

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [7]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [8]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


# Process Data

In [1]:
import scipy.io
import numpy as np
import os
import pandas as pd

In [2]:
# Paths to files
mat_files_directory = "./physionet.org/files/ephnogram/1.0.0/MAT/"
pandas_csv_file_path = "./physionet.org/files/ephnogram/1.0.0/ECGPCGSpreadsheet.csv"

In [3]:
# Mapping from 'recording scenario' to label (1 to 6)
scenario_to_label = {
    'rest: laying on bed': 1,
    'rest: sitting on armchair': 2,
    'exercise: walking at constant speed': 3,
    'exercise: pedaling a stationary bicycle': 4,
    'exercise: bicycle stress test': 5,
    'exercise: bruce protocol treadmill stress test': 6,
}

In [4]:
# Read the CSV file into a pandas DataFrame
### Maping file to labels
df = pd.read_csv(pandas_csv_file_path)

# Create a dictionary mapping filenames to labels
file_to_label = {}
for index, row in df.iterrows():
    filename = row['Record Name']  # Adjust column name if necessary
    scenario = row['Recording Scenario']  # Adjust column name if necessary
    # Check if the scenario can be mapped to a label
    if type(scenario) == str:
        if scenario.lower() in scenario_to_label:
            label = scenario_to_label[scenario.lower()]
            file_to_label[filename] = label
    else:
        # Skip scenarios that don't map to a clear label
        continue

In [5]:
file_to_label

{'ECGPCG0001': 4,
 'ECGPCG0002': 4,
 'ECGPCG0003': 2,
 'ECGPCG0004': 2,
 'ECGPCG0005': 2,
 'ECGPCG0006': 2,
 'ECGPCG0007': 2,
 'ECGPCG0008': 2,
 'ECGPCG0009': 2,
 'ECGPCG0010': 2,
 'ECGPCG0011': 2,
 'ECGPCG0012': 2,
 'ECGPCG0013': 1,
 'ECGPCG0014': 1,
 'ECGPCG0015': 1,
 'ECGPCG0016': 1,
 'ECGPCG0020': 1,
 'ECGPCG0021': 1,
 'ECGPCG0022': 1,
 'ECGPCG0023': 1,
 'ECGPCG0024': 4,
 'ECGPCG0025': 4,
 'ECGPCG0026': 4,
 'ECGPCG0027': 4,
 'ECGPCG0028': 4,
 'ECGPCG0029': 4,
 'ECGPCG0030': 4,
 'ECGPCG0031': 4,
 'ECGPCG0032': 4,
 'ECGPCG0033': 4,
 'ECGPCG0034': 4,
 'ECGPCG0035': 6,
 'ECGPCG0036': 6,
 'ECGPCG0037': 6,
 'ECGPCG0038': 6,
 'ECGPCG0039': 6,
 'ECGPCG0040': 2,
 'ECGPCG0046': 6,
 'ECGPCG0047': 6,
 'ECGPCG0052': 6,
 'ECGPCG0054': 6,
 'ECGPCG0055': 6,
 'ECGPCG0056': 6,
 'ECGPCG0059': 5,
 'ECGPCG0060': 5,
 'ECGPCG0061': 5,
 'ECGPCG0062': 5,
 'ECGPCG0064': 5,
 'ECGPCG0065': 5,
 'ECGPCG0066': 5,
 'ECGPCG0067': 5,
 'ECGPCG0068': 5,
 'ECGPCG0069': 5}

In [6]:
def preprocess_data(ecg_signal, pcg_signal):
    # Normalize ECG and PCG signals
    ecg_signal = (ecg_signal - np.min(ecg_signal)) / (np.max(ecg_signal) - np.min(ecg_signal))
    pcg_signal = (pcg_signal - np.min(pcg_signal)) / (np.max(pcg_signal) - np.min(pcg_signal))

    # Combine ECG and PCG signals into one dataset
    combined_signal = np.stack((ecg_signal, pcg_signal), axis=-1)

    return combined_signal

In [7]:
def process_mat_file(file_path, label):
    # Load the .mat file
    mat_data = scipy.io.loadmat(file_path)
    
    # Extract first channel of ECG and the only channel of PCG
    ecg_data = mat_data['ECG'][0]  # Taking only the first channel of ECG
    pcg_data = mat_data['PCG'][0]  # Assuming only one channel for PCG
    
    # Preprocess the signals
    combined_signal = preprocess_data(ecg_data, pcg_data)
    
    # Number of points in the signal
    num_points = combined_signal.shape[0]
    
    # Split data into segments of 10,000 points
    samples = []
    for i in range(0, num_points, 10000):
        if i + 10000 <= num_points:
            sample = combined_signal[i:i+10000]
            samples.append((sample, label))
    
    return samples
    # Split data into segments of 10,000 points
#     samples = []
#     for i in range(0, num_points, 10000):
#         if i + 10000 <= num_points:
#             ecg_sample = ecg_data[i:i+10000]
#             pcg_sample = pcg_data[i:i+10000]
#             samples.append((ecg_sample, pcg_sample, label))
    
#     return samples

In [8]:
import os
from multiprocessing import Pool, cpu_count

In [9]:
def process_single_file(args):
    file_path, label = args
    samples = process_mat_file(file_path, label)
    return samples

def process_files_in_parallel(mat_files_directory, file_to_label):
    all_samples = []
    mat_files = [f for f in os.listdir(mat_files_directory) if f.endswith('.mat')]
    
    # Prepare the arguments for each file
    args_list = []
    for mat_file in mat_files:
        file_path = os.path.join(mat_files_directory, mat_file)
        name = mat_file.split('.')[0]
        if name in file_to_label:
            label = file_to_label[name]
            args_list.append((file_path, label))

    # Use multiprocessing to process files in parallel
    with Pool(processes=70) as pool:
        results = pool.map(process_single_file, args_list)
    
    # Combine the results
    for result in results:
        all_samples.extend(result)
    
    return all_samples

In [10]:
# Usage
all_samples = process_files_in_parallel(mat_files_directory, file_to_label)

In [18]:
# # Process each .mat file and assign labels
# all_samples = []
# mat_files = [f for f in os.listdir(mat_files_directory) if f.endswith('.mat')]
# print(mat_files)
# for mat_file in mat_files:
#     file_path = os.path.join(mat_files_directory, mat_file)
#     name = mat_file.split('.')[0]
#     if name in file_to_label:
#         label = file_to_label[name]

#         samples = process_mat_file(file_path, label)
#         all_samples.extend(samples)  # Store all samples in one list

In [11]:
# Example: Print the number of samples and a sample label
example_sample = all_samples[0] if all_samples else None
if example_sample:
    print(f"Number of samples: {len(all_samples)}, Example sample label: {example_sample[1]}")
else:
    print("No samples were processed.")

Number of samples: 64992, Example sample label: 2


In [12]:
all_samples[0][0].shape

(10000, 2)

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming all_samples is a list of tuples, where each tuple is (signal, label)
# Example structure: [(np.array(10000, 2), label), ...]

def create_datasets(all_samples, test_size=0.2):
    # Separate signals and labels from the all_samples list
    signals = [sample[0] for sample in all_samples]
    labels = [sample[1] for sample in all_samples]
    
    # Convert lists to numpy arrays for better performance
    signals = np.array(signals)
    labels = np.array(labels)
    
    # Use train_test_split to split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(signals, labels, test_size=test_size, random_state=42)
    
    return X_train, X_test, y_train, y_test

# Example usage
X_train, X_test, y_train, y_test = create_datasets(all_samples, test_size=0.1)

# Output the shapes of the created datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (58492, 10000, 2)
X_test shape: (6500, 10000, 2)
y_train shape: (58492,)
y_test shape: (6500,)


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import classification_report, accuracy_score

In [15]:
# convert y_train y_test to binary
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(y_train - 1, num_classes=6)
y_test_categorical = to_categorical(y_test - 1, num_classes=6)

# Build model and setting running on CPU

In [16]:
## build model:

# Step 4: Build CNN-LSTM Model
def build_cnn_lstm(input_shape):
    model = Sequential()

    # CNN layers for feature extraction
    model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))

    # LSTM layer for capturing temporal dependencies
    model.add(LSTM(50, return_sequences=False))

    # Fully connected layers for classification
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(6, activation='softmax'))  # 6 classes for exercise intensity levels

    return model

In [21]:

# Step 5: Train the CNN-LSTM Model
def train_cnn_lstm_model(model, X_train, y_train, X_test, y_test, epochs=40, batch_size=2048):
    model.compile(optimizer=Adam(learning_rate=0.005), loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)
    return history

# Step 6: Classification
def classify_exercise_intensity(model, X_test):
    predictions = model.predict(X_test)
    return np.argmax(predictions, axis=1)

# Step 7: Evaluate and Display Results
def evaluate_model(model, X_test, y_test):
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    y_pred = classify_exercise_intensity(model, X_test)
    print("Loss:", loss)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(np.argmax(y_test, axis=1), y_pred))

In [19]:
X_train.shape

(58492, 10000, 2)

In [20]:
# Build and train the CNN-LSTM model
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_cnn_lstm(input_shape)
train_cnn_lstm_model(model, X_train, y_train_categorical, X_test, y_test_categorical)


Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Build and train the CNN-LSTM model
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_cnn_lstm(input_shape)
train_cnn_lstm_model(model, X_train, y_train_categorical, X_test, y_test_categorical)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

In [None]:
evaluate_model(model, X_test, y_test_categorical)

In [None]:
# Step 6: Evaluate and save the model
model.save('ecg_pcg_tpu_model.h5')

# Run TensorBoard (in a Jupyter notebook)
# %load_ext tensorboard
# %tensorboard --logdir logs/fit