In [1]:
from generate_patient_dataset import GeneratePatientDataset
data = GeneratePatientDataset("207")

readings = data.readings
annotations = data.annotations
mapping = data.mapping

In [2]:
sequences_mapping = [list(pair) for pair in mapping.keys()]
labels = [annotation.annotation for annotation in mapping.values()]

In [3]:
sequences = []
for sequence in sequences_mapping:
    entries = []
    for entry in sequence:
        entries.append([entry.ml_ii, entry.v_1])
    sequences.append(entries)

In [4]:
# Get max. length for Padding
max_length = 0

for key, val in mapping.items():
    current_length = len(key)
    if current_length > max_length:
        max_length = current_length

print("Maximum sequence length:", max_length)

Maximum sequence length: 924


In [None]:
import pandas as pd

# Load the annotations data
annotations_path = '/mnt/data/207annotations.csv'
annotations = pd.read_csv(annotations_path)

# Convert the time column to minutes
annotations['Time_min'] = annotations['Time'] / 60

# Define the intervals for analysis
intervals = [(0, 7.5), (7.5, 15), (15, 22.5), (22.5, 30)]

# Focus on the specified classes
classes_of_interest = ['L', '!', 'A', 'V', 'E']

# Function to count occurrences of classes in each interval
def count_classes_in_interval(df, interval, classes):
    start, end = interval
    interval_data = df[(df['Time_min'] >= start) & (df['Time_min'] < end)]
    return interval_data['Type'].value_counts().reindex(classes, fill_value=0)

# Count occurrences for each interval
class_counts = {f'{start}-{end} min': count_classes_in_interval(annotations, (start, end), classes_of_interest)
                for start, end in intervals}

class_counts_df = pd.DataFrame(class_counts)
class_counts_df


In [32]:
print(sequences[0])

[['981', '1043'], ['981', '1043'], ['981', '1043'], ['981', '1043'], ['981', '1043'], ['981', '1043  '], ['981', '1043'], ['981', '1043'], ['982', '1043'], ['984', '1046'], ['986', '1044'], ['985', '1041'], ['984', '1034'], ['984', '1036'], ['989', '1042'], ['992', '1043'], ['992', '1043'], ['993', '1038'], ['989', '1033'], ['992', '1035'], ['992', '1037'], ['990', '1042'], ['993', '1042'], ['990', '1037'], ['987', '1036'], ['991', '1035'], ['989', '1037'], ['992', '1041'], ['995', '1041'], ['991', '1031'], ['985', '1026'], ['977', '1015'], ['973', '1010'], ['972', '1009'], ['974', '1001'], ['972', '989'], ['970', '977'], ['976', '971'], ['995', '974'], ['1017', '980'], ['1043', '986'], ['1071', '988'], ['1087', '988'], ['1092', '987'], ['1100', '994'], ['1106', '1012'], ['1120', '1047'], ['1142', '1099'], ['1166', '1159'], ['1183', '1212'], ['1187', '1240'], ['1179', '1251'], ['1166', '1261'], ['1155', '1269'], ['1140', '1280'], ['1121', '1285'], ['1098', '1285'], ['1067', '1282'], ['

In [79]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from scipy.signal import butter, filtfilt
import numpy as np

# Pre-Processing
def preprocess_sequence(sequences, labels, max_length):
    # Convert labels to integer indices
    label_to_index = {'L': 0, 'A': 1, 'V': 2, '!': 3, 'E': 4}
    indices = [label_to_index[label] for label in labels]

    # Padding
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post', value=0)

    # Frequency Bandpass Filter
    mlii = [[sublist[0] for sublist in group] for group in padded_sequences]
    v1 = [[sublist[1] for sublist in group] for group in padded_sequences]
    
    fs = 360
    nyquist = 0.5 * fs
    low = 0.4 / nyquist
    high = 45 / nyquist
    
    b, a = butter(N=3, Wn=[low, high], btype='band')
    mlii_filtered = [filtfilt(b, a, seq) for seq in mlii]
    v1_filtered = [filtfilt(b, a, seq) for seq in v1]
    
    filtered_padded_sequences = np.asarray(
        [[[a, b] for a, b in zip(sublist_a, sublist_b)] for sublist_a, sublist_b in zip(v1_filtered, mlii_filtered)]
    )
    
    return filtered_padded_sequences, indices


In [80]:
preprocessed_sequences, indices = preprocess_sequence(sequences, labels, max_length)

In [81]:
print(preprocessed_sequences)
print(indices)

[[[-43.60633593 -29.94817132]
  [-43.1058405  -29.41352707]
  [-42.58037176 -28.8990522 ]
  ...
  [  6.41655621   5.90782623]
  [  6.08749965   5.60433353]
  [  5.76005953   5.30238002]]

 [[-30.57943618 -32.2521962 ]
  [-28.46477264 -27.78166067]
  [-24.94669162 -20.92000789]
  ...
  [-48.54461327 -43.23530732]
  [-46.07037762 -41.00556624]
  [-43.60787319 -38.7888595 ]]

 [[-48.6611364  -37.19817115]
  [-44.11916502 -30.56226524]
  [-37.96495125 -23.01290922]
  ...
  [-31.96818594 -25.54331507]
  [-30.22757227 -24.12041284]
  [-28.50568123 -22.71566061]]

 ...

 [[164.18177664 143.37944742]
  [183.46542635 165.16744098]
  [201.37276664 185.5929892 ]
  ...
  [ -4.29389719  -2.67632084]
  [ -4.13684832  -2.58862548]
  [ -3.97476746  -2.49680497]]

 [[158.34660914 150.13521511]
  [180.35410491 173.36107493]
  [200.28889269 194.8353101 ]
  ...
  [ -4.28284151  -2.69691264]
  [ -4.126216    -2.60879139]
  [ -3.96456969  -2.51648916]]

 [[122.93846788 101.05049273]
  [139.20000059 113.5279

In [40]:
from sklearn.model_selection import train_test_split

# Split data into train/test
train_x, test_x, train_y, test_y = train_test_split(
    sequences, labels, test_size=0.5, random_state=42, stratify=labels
)

In [41]:
import numpy as np

# Check for data balance between train test
train_class_distribution = np.bincount(train_y)
test_class_distribution = np.bincount(test_y)

print("Training Set Class Distribution:", train_class_distribution)
print("Testing Set Class Distribution:", test_class_distribution)

ValueError: invalid literal for int() with base 10: 'L'