In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Masking, Input, Normalization

from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Model, regularizers, layers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix


from sklearn.preprocessing import StandardScaler

In [2]:
raw_data_prep_dir = '../raw_data/pads-parkinsons-disease-smartwatch-dataset-1.0.0/preprocessed/'

In [3]:
file_list = pd.read_csv(raw_data_prep_dir  + "file_list.csv")
file_list

Unnamed: 0,resource_type,id,study_id,condition,disease_comment,age_at_diagnosis,age,height,weight,gender,handedness,appearance_in_kinship,appearance_in_first_grade_kinship,effect_of_alcohol_on_tremor,label
0,patient,1,PADS,Healthy,-,56,56,173,78,male,right,True,True,Unknown,0
1,patient,2,PADS,Other Movement Disorders,Left-Sided resting tremor and hypokinesia with...,69,81,193,104,male,right,False,,No effect,2
2,patient,3,PADS,Healthy,-,45,45,170,78,female,right,False,,Unknown,0
3,patient,4,PADS,Parkinson's,IPS akinetic-rigid type,63,67,161,90,female,right,False,,No effect,1
4,patient,5,PADS,Parkinson's,IPS tremordominant type,65,75,172,86,male,left,False,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,patient,465,PADS,Parkinson's,IPS mixed type,62,65,175,80,male,right,True,False,No effect,1
465,patient,466,PADS,Healthy,-,84,84,172,74,female,right,True,True,No effect,0
466,patient,467,PADS,Parkinson's,"Essential Tremor, starting IPS tremordominant ...",55,57,190,100,male,right,False,,Improvement,1
467,patient,468,PADS,Parkinson's,IPS mixed type,73,76,198,118,male,right,False,,No effect,1


In [4]:
time_data = []
y = []
X_to_split = []
for idx, subject_idx in enumerate(file_list['id']):
    file_name = raw_data_prep_dir + f'movement/{subject_idx:03d}_ml.bin'
    time_idx_data = np.fromfile(file_name, dtype=np.float32).reshape((-1, 976))
    time_data.append(time_idx_data)
    y_idx = file_list[file_list['id']==subject_idx].label.values
    X_id = idx
    y.append(y_idx)
    X_to_split.append(X_id)

X_to_split = np.array(X_to_split)
y = np.array(y).flatten()
print(X_to_split.shape, y.shape)

(469,) (469,)


In [5]:
channels = []
for task in ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold", "HoldWeight",
                "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]:
    for device_location in ["LeftWrist", "RightWrist"]:
        for sensor in ["Acceleration", "Rotation"]:
            for axis in ["X", "Y", "Z"]:
                channel = f"{task}_{sensor}_{device_location}_{axis}"
                channels.append(channel)
len(channels)

132

In [6]:
X_train_idx, X_test_idx, y_train, y_test = train_test_split(X_to_split, y, test_size=0.20, random_state=42, stratify = y)

In [7]:
time_data_array = np.array(time_data)
print(time_data_array.shape)

(469, 132, 976)


In [8]:
X_train = time_data_array[X_train_idx]
X_test = time_data_array[X_test_idx]
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))

375 94
375 94


In [9]:
n_obs, n_chan, n_time =  X_train.shape
print(n_obs, n_chan, n_time)
X_train_nn = X_train.reshape(n_obs, n_time, n_chan)
X_train_nn.shape

n_obs_test, n_chan_test, n_time_test =  X_test.shape
print(n_obs_test, n_chan_test, n_time_test)
X_test_nn = X_test.reshape(n_obs_test, n_time_test, n_chan_test)
X_train_nn.shape

375 132 976
94 132 976


(375, 976, 132)

In [10]:
y_train_cat = to_categorical(y_train)
y_test_cat  = to_categorical(y_test)

In [11]:
X_test_nn[1:]

array([[[-4.43387637e-03, -9.25870240e-03, -9.99780931e-03, ...,
          3.03398957e-03, -8.40053253e-04, -1.81826414e-03],
        [-1.81499508e-03, -1.79384463e-03, -1.79075450e-03, ...,
          1.28893924e-04,  1.12025905e-03,  1.13437920e-04],
        [ 1.61023039e-04,  1.18511112e-03,  1.22500723e-03, ...,
          4.54206392e-03,  2.59471126e-03,  1.65825360e-03],
        ...,
        [-1.48406737e-02,  4.42652963e-05,  2.44781021e-02, ...,
          2.97743306e-02,  4.35587838e-02,  4.14093472e-02],
        [ 2.54550166e-02,  3.14198760e-03, -1.81039125e-02, ...,
         -4.55813408e-02,  1.07074287e-02,  6.06156029e-02],
        [ 9.45774242e-02,  1.03049204e-01,  9.87681076e-02, ...,
         -6.57974631e-02, -8.49236846e-02, -1.09361649e-01]],

       [[ 9.48457528e-05,  1.32816867e-03, -1.29215710e-03, ...,
          9.09855822e-04, -1.06748508e-03, -2.21050973e-03],
        [-1.53201667e-03, -2.81168497e-03, -1.21852895e-03, ...,
          1.53782507e-02,  1.40629690e

In [21]:
def initialize_model_lstm(input_shape:tuple, dropout_rate: float = 0.3) -> Model:
    model = Sequential()
   
    
    normalizer = Normalization(axis = -1)
    normalizer.adapt(X_train_nn)
    model.add(Input(shape=input_shape))
    model.add(normalizer)
 

    #two lstm layers to better capture
    model.add(LSTM(units=128, activation='tanh',
                   return_sequences=True))
    model.add(LSTM(units=64, activation='tanh'))

    #model.add(layers.Dropout(dropout_rate))

    #dense layer with batch normalization
    # model.add(layers.Dense(32, activation='relu'))
    # model.add(layers.BatchNormalization())
    # model.add(layers.Dropout(dropout_rate))

    # model.add(layers.Dense(16, activation='relu'))
    # model.add(layers.Dropout(dropout_rate))

    # #output layer with 3 classification
    # model.add(layers.Dense(3, activation='softmax'))
    
    
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.2))

    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(0.1))

    model.add(layers.Dense(16, activation='relu'))  # Keep this layer
    model.add(layers.Dropout(0.1))

    model.add(layers.Dense(3, activation='softmax'))

    return model

In [28]:
model_lstm = initialize_model_lstm(input_shape=X_train_nn.shape[1:])

model_lstm.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.005),
    metrics=['accuracy', 'precision', 'recall'])

In [29]:
model_lstm.summary()

In [30]:
es = EarlyStopping(
    monitor="val_loss",
    patience=50,
    restore_best_weights=True,
    verbose=0
)

reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=15,
        verbose=1,
        min_lr=0
    )

history = model_lstm.fit(
    X_train_nn,
    y_train_cat,
    validation_split = 0.2,
    epochs=150,
    batch_size = 32,
    callbacks=[es, reduce_lr],
    verbose=1
)

Epoch 1/150
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.3533 - loss: 1.1952 - precision: 0.4022 - recall: 0.1233 - val_accuracy: 0.6267 - val_loss: 1.0391 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 0.0050
Epoch 2/150
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.5567 - loss: 1.0004 - precision: 0.5887 - recall: 0.2767 - val_accuracy: 0.6267 - val_loss: 1.0043 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 0.0050
Epoch 3/150
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1s/step - accuracy: 0.6033 - loss: 0.9123 - precision: 0.6615 - recall: 0.4233 - val_accuracy: 0.5600 - val_loss: 1.0133 - val_precision: 1.0000 - val_recall: 0.0267 - learning_rate: 0.0050
Epoch 4/150
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step - accuracy: 0.5633 - loss: 0.9261 - precision: 0.6541 - recall: 0.4033 - val_accuracy: 0.6000 - va

In [31]:
model_lstm.evaluate(X_test_nn, y_test_cat)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 354ms/step - accuracy: 0.5745 - loss: 0.9988 - precision: 0.5942 - recall: 0.4362


[0.9987884163856506,
 0.5744680762290955,
 0.5942028760910034,
 0.43617022037506104]

In [40]:
model_lstm.save('../spark/lstm_timeseries_model.h5')

