In [1]:
import bisect
import datetime
from dateutil.parser import parse
import itertools
from itertools import product
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import math
import matplotlib.pyplot as plt
from multiprocessing.dummy import Pool as ThreadPool
import nolds
import numpy as np
import os
import pandas as pd
import pathlib
from PreprocessFcns import *
import pywt
import random
import scipy
from scipy.fftpack import fft
from scipy.signal import butter, welch, filtfilt, resample, find_peaks
from scipy.stats import skew, kurtosis, entropy, pearsonr
import seaborn as sns
import sklearn
from sklearn import multiclass
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneGroupOut
import time
%matplotlib inline

Using TensorFlow backend.


In [3]:
path = r'//FS2.smpp.local\\RTO\\CIS-PD Study\MJFF Curation\Finalized Dataset'

In [220]:
# generate task abbreviation dictionary
ClinicTasks = {
    'Stndg'    : 'Standing',
    'Wlkg'     : 'Walking',
    'WlkgCnt'  : 'Walking while counting',
    'FtnR'     : 'Finger to nose--right hand',
    'FtnL'     : 'Finger to nose--left hand',
    'RamR'     : 'Alternating right hand movements',
    'RamL'     : 'Alternating left hand movements',
    'SitStand' : 'Sit to stand',
    'Drwg'     : 'Drawing on a paper',
    'Typg'     : 'Typing on a computer keyboard',
    'NtsBts'   : 'Assembling nuts and bolts',
    'Drnkg'    : 'Taking a glass of water and drinking',
    'Sheets'   : 'Organizing sheets in a folder',
    'Fldg'     : 'Folding towels',
    'Sitng'    : 'Sitting'
}

In [13]:
# generate visit number dictionary
VisitNumber = {
    '2 Weeks: Time 0'   : 0,
    '2 Weeks: Time 30'  : 1,
    '2 Weeks: Time 60'  : 2,
    '2 Weeks: Time 90'  : 3,
    '2 Weeks: Time 120' : 4,
    '2 Weeks: Time 150' : 5,
    '1 Month'           : 6
}

In [281]:
def filterMetaData(tasks):
'''filter metadata file to remove empty data and unnecessary scores
add necessary information including binary tremor scores

tasks: list of tasks for which to retrieve metadata of'''

    # open metadata containing scores for each symptom for each task completed
    metaDataFull = pd.read_csv(os.path.join(path, 'Metadata Tables', 'Table4.csv'))
    # isolate metadata corresponding to tasks of interest specified
    indices = (x for x in range(len(metaDataFull)) if metaDataFull.TaskAbb.values[x] in tasks)
    metaDataFull = metaDataFull.loc[indices]

    SubjID = []
    Visit = []
    TaskAbb = []
    AccFile = []
    Tremor = []
    for record in metaDataFull.iterrows():
        # eliminate rows of metadata that contain nan values
        if (type(record[1]['Side']) == float):
            continue
        if (np.isnan(record[1]['Tremor - ' + record[1]['Side']])):
            continue
        # build file name of the recording related to each piece of metadata
        filename = (str(int(record[1]['SubjID'])) + '_' +
                            str(VisitNumber[record[1]['Visit']]) + '_' + 
                            record[1]['TaskAbb'] + '.csv')
        # add file name to file path for easy access
        filepath = os.path.join(path, 'TaskAcc', filename)
        # test is the recording file exists (not all metadata has related acceleration recording)
        if not os.path.exists(filepath):
            continue
        SubjID = SubjID + [int(record[1]['SubjID'])]
        Visit = Visit + [VisitNumber[record[1]['Visit']]]
        TaskAbb = TaskAbb + [record[1]['TaskAbb']]
        AccFile = AccFile + [filename]
        # only concerned with tremor score on the side of subject wearing the apple watch
        Tremor = Tremor + [int(record[1]['Tremor - ' + record[1]['Side']])]
    # create column with binary tremor scores (symptomatic vs normal)
    TremorBIN = [int(t > 0) for t in Tremor]
    metaData = pd.DataFrame({'SubjID': SubjID, 
                             'Visit': Visit, 
                             'TaskAbb': TaskAbb,
                             'AccFile': AccFile,
                             'Tremor': Tremor,
                             'TremorBIN': TremorBIN})
    print('Records = ' + str(len(metaData)))
    
    return metaData
    
def formatInputNN(tasks, metaData, segment):
'''generate arrays of data, labels, and metadata to train and analyze data
filters to only contain anomalies (does not score non-anomalous clips in any way)

tasks: list of tasks to consider
metaData: variable saved after generation from function above
segment: True or False - whether or not to consider full recording or segment into clips'''
    
    # set the threshold for which to consider clips anomalies
    NormRMSE = 0.01
    Data = []
    Labels = []
    Subjects = []
    for record in metaData.iterrows():
        # load acceleration recording according to information in row of metadata
        recording = pd.read_csv(os.path.join(path, 'TaskAcc', record[1]['AccFile']), 
                                parse_dates = ['timestamp'])[['timestamp', 'x', 'y', 'z']]
        recording.columns = ['Timestamp', 'X', 'Y', 'Z']
        # recording not filtered - simply calculate magnitude from the axes
        recording['Mag'] = np.sqrt((recording.X**2 + recording.Y**2 + recording.Z**2))
        recording = recording.sort_values(by = 'Timestamp', axis = 0)
        
        if segment:
            # similar to random forest model: group sets of data by proximity in timestamp windows
            # to change segment window length, change time after tm.second modulo
            recording['TimeWdw'] = [(tm - datetime.timedelta(minutes = 0,
                                                             seconds = tm.second % 1.5,
                                                             microseconds = tm.microsecond)) 
                                    for tm in recording.Timestamp]
            # set index of data as time in epoch time after first value of recording
            recording['TimeIdx'] = (recording.Timestamp.values - 
                                    recording.Timestamp.values[0]).astype('timedelta64[ms]').astype(int)
            recording = recording.set_index('TimeIdx')
            # isolate each clip according to grouping with 50% overlap between clips
            # to change segment window length, change time in timedelta
            for t in recording.TimeWdw.unique():
                clip = recording.loc[(recording.TimeWdw == t) | 
                                     (recording.TimeWdw == (t + np.timedelta64(1500, 'ms')))]
                # clips clips containing much less data than necessary(50Hz so about 150 expected for 3 seconds)
                if len(clip) < 120:
                    continue
                # if tasks are siting or standing (most of the time they should be) skip if not anomaly
                # only want model to train to tell difference between symptomatic and normal anomaly
                if ((tasks == ['Sitng', 'Stndg']) or 
                    (tasks == ['Stndg', 'Sitng']) or 
                    (tasks == ['Sitng']) or 
                    (tasks == ['Stndg'])) and ((np.sqrt(np.mean((clip.Mag - np.mean(clip.Mag))**2))) < NormRMSE):
                    continue
                # upsample all clips (axes independently) to fit normally into array
                fx = scipy.interpolate.interp1d(range(len(clip)), clip.X.values)
                fy = scipy.interpolate.interp1d(range(len(clip)), clip.Y.values)
                fz = scipy.interpolate.interp1d(range(len(clip)), clip.Z.values)
                clipX = fx(np.linspace(start = 0, stop = len(clip) - 1, num = 500))
                clipY = fy(np.linspace(start = 0, stop = len(clip) - 1, num = 500))
                clipZ = fz(np.linspace(start = 0, stop = len(clip) - 1, num = 500))
                # manually shape resampled clip data into a rectangular array
                datasteps = []
                for dpx, dpy, dpz in zip(clipX, clipY, clipZ):
                    datasteps = datasteps + [[dpx, dpy, dpz]]
                Data = Data + [datasteps]
                # add relevant metadata to correlate to acceleration data in array
                Labels = Labels + [record[1]['TremorBIN']]
                Subjects = Subjects + [record[1]['SubjID']]
                
        else:
            # upsample recordings to 1000
            fx = scipy.interpolate.interp1d(range(len(recording)), recording.X.values)
            fy = scipy.interpolate.interp1d(range(len(recording)), recording.Y.values)
            fz = scipy.interpolate.interp1d(range(len(recording)), recording.Z.values)
            clipX = fx(np.linspace(start = 0, stop = len(recording) - 1, num = 1000))
            clipY = fy(np.linspace(start = 0, stop = len(recording) - 1, num = 1000))
            clipZ = fz(np.linspace(start = 0, stop = len(recording) - 1, num = 1000))
            # manually shape resampled clip data into a rectangular array
            datasteps = []
            for dpx, dpy, dpz in zip(clipX, clipY, clipZ):
                datasteps = datasteps + [[dpx, dpy, dpz]]
            Data = Data + [datasteps]
            # add relevant metadata to correlate to acceleration data in array
            Labels = Labels + [record[1]['TremorBIN']]
            Subjects = Subjects + [record[1]['SubjID']]

    Data = np.array(Data)
    Labels = np.array(Labels)
    Subjects = np.array(Subjects)
    print('(Samples, Timesteps, Features (Axes)) = ' + str(Data.shape))
    print('Labels = ' + str(len(Labels)))
    print('Subjects = ' + str(len(set(Subjects))))
    
    return Data, Labels, Subjects

In [284]:
metaData = filterMetaData(['Wlkg', 'WlkgCnt'])
Data, Labels, Subjects = formatInputNN(['Wlkg', 'WlkgCnt'], metaData, segment = False)

Records = 265
(Samples, Timesteps, Features (Axes)) = (265, 1000, 3)
Labels = 265
Subjects = 23


In [285]:
grouper = LeaveOneGroupOut()
valsubs = []
plt.figure(figsize = (16, 16))
# train and validate neural network by individual subject using leave one group out method
for trainInd, testInd in grouper.split(Data, Labels, groups = Subjects):
    trainData = Data[trainInd]
    trainLab = Labels[trainInd]
    testData = Data[testInd]
    testLab = Labels[testInd]
    subject = int(np.unique(Subjects[testInd]))
    # validation subjects - list of each subject in order of validation (to use as legend of ROC curve plot)
    valsubs = valsubs + [subject]
    
    print('Validation Subject: ' + str(subject))
    
    # initialize sequential neural network
    model = Sequential()
    # add a long short term memory layer specifying the input shape (length of 500 (segmented) with 3 axes)
    model.add(LSTM(50, input_shape = (500, 3)))
    # add a dense layer with a sigmoid activation function
    model.add(Dense(1, activation = 'sigmoid'))
    # compile the neural network with a mae loss function and adam optimizer
    model.compile(loss = 'mae', optimizer = 'adam')
    try:
        # fit the model using the training set and validate on the subject data left out
        history = model.fit(trainData, trainLab, epochs = 10, batch_size = int(len(trainInd) / 20), 
                            validation_data = (testData, testLab))
    # keyboard interrupt exception to plot all data acquried to this point (see plot to this point)
    except(KeyboardInterrupt):
        break
    # prediction label is prediction on test data of fit neural network
    PL = model.predict(testData)
    # true labels are clinician scores from metadata
    TL = testLab
    # ROC curve will spit an error if not positive values are present
    if all(TL == 0):
        continue
    # generate and plot ROC curve of each validation subject
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(TL, PL)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate', fontsize = 12)
    plt.ylabel('True Positive Rate', fontsize = 12)
    plt.title('Receiver Operating Characteristic Curve', fontsize = 15)
    
#     plt.figure(figsize = (8, 8))
#     plt.plot(history.history['loss'], label = 'Train')
#     plt.plot(history.history['val_loss'], label = 'Test')
#     plt.title(str(subject))
#     plt.legend()
#     plt.ylim((0,1))
#     plt.show()
plt.legend(valsubs)
plt.show()

Validation Subject: 1003


ValueError: Error when checking input: expected lstm_231_input to have shape (500, 3) but got array with shape (1000, 3)

<Figure size 1152x1152 with 0 Axes>