In [1]:
# Make Google Drive visible inside the Colab VM; the data and repo paths
# configured in the following cell all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:

import os

# define input and output dirs
PHYSIONET_DATA_DIR = "/content/drive/MyDrive/DLHProject/data/physionet_finetune"
PHYSIONET_spectrogram_OUT_DIR = "/content/drive/MyDrive/DLHProject/data/physionet_finetune_spectrogram_nodb"
os.makedirs(PHYSIONET_spectrogram_OUT_DIR, exist_ok=True)

# move to my repo
REPO_DIR = "/content/drive/MyDrive/DLHProject/Mylesgitrepo"
%cd "{REPO_DIR}"
!pip install -r requirements.txt

/content/drive/MyDrive/UIUC MCS/CS598 Deep Learning for Healthcare/Project/Mylesgitrepo
Collecting scipy==1.12.0 (from -r requirements.txt (line 6))
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.4/38.4 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting wfdb (from -r requirements.txt (line 8))
  Downloading wfdb-4.1.2-py3-none-any.whl (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.0/160.0 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting nbdime (from -r requirements.txt (line 14))
  Downloading nbdime-4.0.1-py3-none-any.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting colorama (from nbdime->-r requirements.txt (line 14))
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting gitpython!=2.1.4,!=2.1.5,!=2.1.6 (from nbdime->

Install project requirements

Next we prepare the train and test datasets. Before generating them, we make sure they will be saved to the right destination; otherwise the generated data would be written into the git repository, which we don't want. The next few cells therefore set up where these files are read from and saved to.

In [3]:

from finetuning import datasets
from finetuning.utils import train_test_split
from transplant.utils import save_pkl, load_pkl
import numpy as np
from transplant.datasets.icentia11k_spectrogram import *


In [4]:
# Load the PhysioNet train/test splits stored as pickles under PHYSIONET_DATA_DIR.
# Per the original author's note, this data is already preprocessed (resampled to
# 250 Hz and mean/stddev-normalized), so no further signal cleaning is done here.
# Each split exposes the keys 'x', 'y', 'record_ids' and 'classes' (see the next cell).
train_set = load_pkl(f'{PHYSIONET_DATA_DIR}/physionet_train.pkl')
test_set = load_pkl(f'{PHYSIONET_DATA_DIR}/physionet_test.pkl')

In [5]:
# Quick sanity check: list the keys of each split and the shape of its raw signals.
for keys_label, shape_label, split in (
    ("train_set keys: ", "train_set[x] shape: ", train_set),
    ("test_set keys: ", "test_set[x] shape: ", test_set),
):
    print(keys_label, split.keys())
    print(shape_label, split['x'].shape)


train_set keys:  dict_keys(['x', 'y', 'record_ids', 'classes'])
train_set[x] shape:  (6822, 16384, 1)
test_set keys:  dict_keys(['x', 'y', 'record_ids', 'classes'])
test_set[x] shape:  (1706, 16384, 1)


In [6]:
def preprocess_data_to_spectrogram(data, window_size=256, stride=32, n_freqs=128,
                                   fs=250., ref=1, preprocessor=None):
    """Convert every signal in a dataset split into a spectrogram.

    Each record ``data['x'][i]`` is squeezed (singleton axes removed) and passed
    to the spectrogram preprocessor; the per-record results are stacked along a
    new leading axis. In this notebook the input is (n, 16384, 1) and the
    result is (n, 128, 512, 1).

    Args:
        data: mapping with the keys 'x', 'y', 'record_ids' and 'classes'.
            ``data['x']`` must expose ``.shape`` and be indexable along axis 0.
        window_size, stride, n_freqs, fs, ref: forwarded unchanged as keyword
            arguments to the preprocessor. Defaults are the values this
            notebook has always used; their exact meaning is defined by the
            preprocessor itself.
        preprocessor: callable ``f(signal, window_size=..., stride=...,
            n_freqs=..., fs=..., ref=...)`` returning one array-like
            spectrogram. Defaults to ``spectrogram_preprocessor`` (brought in
            by the notebook's import cell).

    Returns:
        A new dict with 'x' holding the stacked spectrograms and 'y',
        'record_ids', 'classes' passed through from ``data`` (same objects,
        not copies).

    Raises:
        ValueError: if ``data['x']`` contains no records.
    """
    if preprocessor is None:
        # Looked up at call time so the notebook-level import stays the default.
        preprocessor = spectrogram_preprocessor

    signals = data['x']
    n_records = signals.shape[0]
    if n_records == 0:
        # Stacking an empty list would fail with an opaque numpy message.
        raise ValueError("data['x'] is empty; nothing to convert to spectrograms")

    spectrograms = []
    for i in range(n_records):
        spectrograms.append(preprocessor(np.squeeze(signals[i]),
                                         window_size=window_size,
                                         stride=stride,
                                         n_freqs=n_freqs,
                                         fs=fs,
                                         ref=ref))
        # print progress every 500 records
        if i % 500 == 0:
            print("at iteration ", i, "of ", n_records)

    # np.stack adds the leading record axis (equivalent to expanding each
    # spectrogram with a new axis 0 and concatenating along it).
    return {
        'x': np.stack(spectrograms, axis=0),
        'y': data['y'],
        'record_ids': data['record_ids'],
        'classes': data['classes'],
    }
# Convert both splits to spectrograms with preprocess_data_to_spectrogram and
# save each result as a pickle under PHYSIONET_spectrogram_OUT_DIR.

# Test split: convert, report the shape before and after, then save.
print('Processing test set...')
test_set_spectrogram = preprocess_data_to_spectrogram(test_set)
print("test_set shape: ", test_set['x'].shape)
print("test_set spectrogram shape: ", test_set_spectrogram['x'].shape)
# The entries of the result dict are handed to save_pkl as keyword arguments.
save_pkl(f'{PHYSIONET_spectrogram_OUT_DIR}/physionet_test.pkl', **test_set_spectrogram)



# Train split: same steps as for the test split.
print('Processing train set...')
train_set_spectrogram= preprocess_data_to_spectrogram(train_set)
print("train_set shape: ", train_set['x'].shape)
print("train_set spectrogram shape: ", train_set_spectrogram['x'].shape)
save_pkl(f'{PHYSIONET_spectrogram_OUT_DIR}/physionet_train.pkl', **train_set_spectrogram)




Processing test set...
at iteration  0 of  1706
at iteration  500 of  1706
at iteration  1000 of  1706
at iteration  1500 of  1706
test_set shape:  (1706, 16384, 1)
test_set spectrogram shape:  (1706, 128, 512, 1)
Processing train set...
at iteration  0 of  6822
at iteration  500 of  6822
at iteration  1000 of  6822
at iteration  1500 of  6822
at iteration  2000 of  6822
at iteration  2500 of  6822
at iteration  3000 of  6822
at iteration  3500 of  6822
at iteration  4000 of  6822
at iteration  4500 of  6822
at iteration  5000 of  6822
at iteration  5500 of  6822
at iteration  6000 of  6822
at iteration  6500 of  6822
train_set shape:  (6822, 16384, 1)
train_set spectrogram shape:  (6822, 128, 512, 1)


In [None]:

import matplotlib.pyplot as plt

# Reload the saved test split to confirm that what was written to disk is readable.
# NOTE: this cell needs load_pkl, np and PHYSIONET_spectrogram_OUT_DIR from the
# earlier cells, so run those first after a kernel restart.
test_set_spectrogram = load_pkl(f'{PHYSIONET_spectrogram_OUT_DIR}/physionet_test.pkl')
print(type(test_set_spectrogram))
print(test_set_spectrogram.keys())
print(test_set_spectrogram['x'][0].shape)

# Plot the first spectrogram as a visual sanity check.
x = test_set_spectrogram['x'][0]
# imshow documents (M, N), (M, N, 3) and (M, N, 4) input, so drop the singleton
# channel axis explicitly instead of relying on how it treats (M, N, 1).
x_image = np.squeeze(x)

fig, ax = plt.subplots(1, 1, figsize=(6, 6))
# aspect='auto' lets the wide image fill the axes instead of a thin strip.
ax.imshow(x_image, cmap='viridis', aspect='auto')
ax.set_title(f"Sample spectrogram of shape {x.shape}")
# Axis meaning inferred from n_freqs=128 matching the first dimension - TODO confirm.
ax.set_xlabel("Time frame")
ax.set_ylabel("Frequency bin")
# Put row 0 at the bottom of the plot.
ax.invert_yaxis()

# TODO: optionally zoom into the last 200 time slices (noted earlier, never implemented).

plt.show()

NameError: name 'load_pkl' is not defined


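**TODO:** record how long the spectrogram conversion above took (the runtime was never filled in; timing the cell with `%%time` would capture it).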