# Dataset with scalograms

In [19]:
import os
import numpy as np
import matplotlib.pyplot as plt
import re

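Next, the training and test sets are built. The test set consists of one randomly chosen subject from each of the four classes (control, ALS, Huntington's, Parkinson's), together with every segment belonging to that subject; the segments of all remaining subjects form the training set.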

In [31]:
# Seed NumPy's legacy global generator so the same hold-out patients are
# selected on every run.
np.random.seed(3)

# Draw one patient number per group. randint's upper bound is exclusive, so
# e.g. (1, 14) yields a value in 1..13 (presumably the number of records in
# that group -- confirm against the dataset).
# The draw order (ALS, control, Huntington's, Parkinson's) determines which
# numbers come out of the seeded stream and must not be rearranged.
als_test, control_test, hunt_test, park_test = (
    np.random.randint(1, upper_bound) for upper_bound in (14, 17, 21, 16)
)

# Report the selection (message, drawn patient number).
selection_report = (
    ('ALS patient for test set:', als_test),
    ('Huntington\'s patient for test set:', hunt_test),
    ('Parkinson\'s patient for test set:', park_test),
    ('Control subject for test set:', control_test),
)
for message, patient_number in selection_report:
    print(message, patient_number)

ALS patient for test set: 11
Huntington's patient for test set: 4
Parkinson's patient for test set: 9
Control subject for test set: 9


In [5]:
# Record names (group prefix followed by the drawn patient number) whose
# segments are held out for the test set.
test_filenames = [
    prefix + str(patient_number)
    for prefix, patient_number in (
        ('als', als_test),
        ('hunt', hunt_test),
        ('control', control_test),
        ('park', park_test),
    )
]

# Integer class label for each group prefix: control=0, als=1, hunt=2, park=3.
classes = {
    prefix: label
    for label, prefix in enumerate(('control', 'als', 'hunt', 'park'))
}

# Accumulators filled while scanning the scalogram files: the arrays
# themselves, their class labels, and the segment names they came from.
test_scalograms, test_labels, test_info = [], [], []
train_scalograms, train_labels, train_info = [], [], []

# Input directory, resolved relative to the parent of the working directory.
rootdir = os.pardir
in_dir = os.path.join(rootdir, 'data/interim/scalograms')

# Scan the scalogram directory and route every segment to the test or training
# split. The parsing below implies file names of the form
# '<group prefix><patient number>_<rest>.npy'.
#
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the sample order (and therefore the saved arrays) reproducible across
# runs and machines.
for in_filename in sorted(os.listdir(in_dir)):
    # get segment name (file name without its extension); anything that is not
    # an .npy file (e.g. .DS_Store, .gitkeep, checkpoint folders) is skipped
    # instead of crashing np.load
    segment_name, extension = os.path.splitext(in_filename)
    if extension != '.npy':
        continue

    record_name = in_filename.split('_')[0]
    # get label from record name: the leading lowercase letters are the group
    disease_match = re.match(r'[a-z]+', record_name)
    if disease_match is None or disease_match[0] not in classes:
        # Fail with the offending file name rather than an opaque
        # TypeError/KeyError.
        raise ValueError(
            f'Cannot derive a class label from file name {in_filename!r}; '
            f'expected a prefix in {sorted(classes)}'
        )
    disease = disease_match[0]
    label = classes[disease]
    # load npy file (done last so malformed names fail before any I/O)
    scalogram = np.load(os.path.join(in_dir, in_filename))

    # All segments of a held-out record go to the test split; everything else
    # is training data.
    if record_name in test_filenames:
        test_scalograms.append(scalogram)
        test_labels.append(label)
        test_info.append(segment_name)

    else:
        train_scalograms.append(scalogram)
        train_labels.append(label)
        train_info.append(segment_name)

def _as_column(values):
    """Convert a flat sequence into a 2-D NumPy array with one column.

    The result has shape (n, 1), where n is the length of the first axis of
    `np.asarray(values)`.
    """
    flat = np.asarray(values)
    return flat.reshape((flat.shape[0], 1))


# Test split: scalogram arrays, class labels and segment names.
X_test = np.asarray(test_scalograms)
Y_test = _as_column(test_labels)
Z_test = _as_column(test_info)

# Training split, built the same way.
X_train = np.asarray(train_scalograms)
Y_train = _as_column(train_labels)
Z_train = _as_column(train_info)

In [6]:
# Sanity check: the X, Y and Z arrays of each split should have the same
# number of rows (first dimension).
shape_report = [
    f'Shape of X_test: {X_test.shape}',
    f'Shape of X_train: {X_train.shape}',
    f'Shape of Y_test: {Y_test.shape}',
    f'Shape of Y_train: {Y_train.shape}',
    f'Shape of Z_test: {Z_test.shape}',
    f'Shape of Z_train: {Z_train.shape}',
]
# One line per entry, exactly as six separate print calls would produce.
print(*shape_report, sep='\n')

Shape of X_test: (72, 100, 100)
Shape of X_train: (1032, 100, 100)
Shape of Y_test: (72, 1)
Shape of Y_train: (1032, 1)
Shape of Z_test: (72, 1)
Shape of Z_train: (1032, 1)


In [7]:
# Destination for the assembled dataset, relative to the project root.
out_dir = os.path.join(rootdir, 'data', 'processed', 'scalograms-dataset')
# np.savez does not create missing parent directories, so make sure the
# destination exists (a no-op when it is already there).
os.makedirs(out_dir, exist_ok=True)

# Each archive stores the scalograms (X), class labels (Y) and segment names
# (Z) of one split under the keyword names given here.
np.savez(os.path.join(out_dir, 'train-dev.npz'), X_train=X_train, Y_train=Y_train, Z_train=Z_train)
np.savez(os.path.join(out_dir, 'test.npz'), X_test=X_test, Y_test=Y_test, Z_test=Z_test)