# Data Cleaning Notebook for MJF

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pathlib
import pickle #to save files
from itertools import product
from scipy.stats import skew, kurtosis, pearsonr
from scipy.signal import butter, welch, filtfilt
import nolds

# from PreprocessFcns import gen_clips, powerspectra

%matplotlib inline

#-- For interactive plots--
# from mpl_toolkits.mplot3d import Axes3D
# %matplotlib notebook

In [2]:
#---Pandas version required to load pickle files is 0.20.1 or greater---
pd.__version__

'0.21.0'

In [3]:
#---------------------------------------------------------------------------------------------------------
# Set path to folder containing Subject ID numbers
# path = '/Volumes/RTO/CIS-PD Study/Subjects/' #Mac
path = r'D:\CIS-PD Study\Subjects' #Windows remote path
#---------------------------------------------------------------------------------------------------------
folder_path = r'D:\CIS-PD Study' #generic Windows repo path

#Path where dictionary subject data is stored
#dict_path = 'D:\CIS-PD Study\Data_dict_noErr' #remote repo
dict_path = 'D:\CIS-PD Study\Data_dict'
# dict_path = '../Data_dict' # Mac local path
# dict_path = r'C:\Users\adai\Documents\Data_dict' #Windows local path adai

scores_path = r'D:\CIS-PD Study\Scores' #remote repo
# scores_path = '../Scores/' # Mac local path

#path where feature matrix is saved
features_path = r'D:\CIS-PD Study\FeatureMatrix' #remote repo
# features_path = '../FeatureMatrix' # Mac local path
# features_path = r'C:\Users\adai\Documents\FeatureMatrix' #Windows local path adai


In [4]:
complete = list(['Heart Rate Variability', 'MDS-UPDRS #1: Finger Tapping',
           'MDS-UPDRS #2: Hand Movements', 'MDS-UPDRS #3: Pronation-Supination',
           'MDS-UPDRS #4: Toe Tapping', 'MDS-UPDRS #5: Leg Agility',
           'MDS-UPDRS #6: Arising from Chair', 'MDS-UPDRS #7: Gait',
           'MDS-UPDRS #8: Postural Stability', 'MDS-UPDRS #9: Postural Hand Tremor',
           'MDS-UPDRS #10: Kinetic Hand Tremor', 'MDS-UPDRS #11: Rest Tremor',
           'Motor #1: Standing', 'Motor #2: Walking', 'Motor #3: Walking while Counting',
           'Motor #4: Finger to Nose', 'Motor #5: Alternating Hand Movements',
           'Motor #6: Sit to Stand', 'Motor #7: Drawing on Paper',
           'Motor #8: Typing on a Computer', 'Motor #9: Nuts and Bolts',
           'Motor #10: Drinking Water', 'Motor #11: Organizing Folder',
           'Motor #12: Folding Towels', 'Motor #13: Sitting'])

In [5]:
messy_df = pd.read_excel(os.path.join(folder_path, '142560_cisuabd4_meta_data.xlsx'))
messy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 17 columns):
user_id_intel                           665 non-null int64
user name in MUSC                       665 non-null int64
user name in Fox Insight application    665 non-null object
experiment                              665 non-null object
cohort                                  665 non-null object
Unnamed: 5                              665 non-null object
Unnamed: 6                              665 non-null object
reported timestamp start                665 non-null object
reported timestamp end                  665 non-null object
task name                               665 non-null object
Unnamed: 10                             665 non-null int64
measurement name                        665 non-null object
measurement's related body part         560 non-null object
participant’s state                     665 non-null object
Wearable location                       665 non-null objec

In [6]:
messy_df.head()

Unnamed: 0,user_id_intel,user name in MUSC,user name in Fox Insight application,experiment,cohort,Unnamed: 5,Unnamed: 6,reported timestamp start,reported timestamp end,task name,Unnamed: 10,measurement name,measurement's related body part,participant’s state,Wearable location,Wearable Type,Value
0,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,03JUL2017:16:49:44.000,03JUL2017:16:49:48.000,Shaking,100,overall,,ON,left hand,Apple Watch,
1,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,03JUL2017:16:51:25.000,03JUL2017:16:51:55.000,standing,100,overall,,ON,left hand,Apple Watch,1.0
2,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,03JUL2017:16:51:25.000,03JUL2017:16:51:55.000,standing,101,dyskinesia,left upper limb,ON,left hand,Apple Watch,0.0
3,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,03JUL2017:16:51:25.000,03JUL2017:16:51:55.000,standing,101,dyskinesia,right upper limb,ON,left hand,Apple Watch,0.0
4,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,03JUL2017:16:51:25.000,03JUL2017:16:51:55.000,standing,102,tremor,left upper limb,ON,left hand,Apple Watch,0.0


In [7]:
messy_df['reported timestamp start'] = pd.to_datetime(messy_df['reported timestamp start'], format='%d%b%Y:%H:%M:%S.%f')
messy_df['reported timestamp end'] = pd.to_datetime(messy_df['reported timestamp end'], format='%d%b%Y:%H:%M:%S.%f')
messy_df

Unnamed: 0,user_id_intel,user name in MUSC,user name in Fox Insight application,experiment,cohort,Unnamed: 5,Unnamed: 6,reported timestamp start,reported timestamp end,task name,Unnamed: 10,measurement name,measurement's related body part,participant’s state,Wearable location,Wearable Type,Value
0,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:49:44,2017-07-03 16:49:48,Shaking,100,overall,,ON,left hand,Apple Watch,
1,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:51:25,2017-07-03 16:51:55,standing,100,overall,,ON,left hand,Apple Watch,1.0
2,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:51:25,2017-07-03 16:51:55,standing,101,dyskinesia,left upper limb,ON,left hand,Apple Watch,0.0
3,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:51:25,2017-07-03 16:51:55,standing,101,dyskinesia,right upper limb,ON,left hand,Apple Watch,0.0
4,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:51:25,2017-07-03 16:51:55,standing,102,tremor,left upper limb,ON,left hand,Apple Watch,0.0
5,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:51:25,2017-07-03 16:51:55,standing,102,tremor,right upper limb,ON,left hand,Apple Watch,0.0
6,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:52:20,2017-07-03 16:52:50,walking,100,overall,,ON,left hand,Apple Watch,1.0
7,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:52:20,2017-07-03 16:52:50,walking,103,bradykinesia,left upper limb,ON,left hand,Apple Watch,1.0
8,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:52:20,2017-07-03 16:52:50,walking,103,bradykinesia,right upper limb,ON,left hand,Apple Watch,0.0
9,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 16:52:20,2017-07-03 16:52:50,walking,101,dyskinesia,left upper limb,ON,left hand,Apple Watch,0.0


In [8]:
messy_df.sort_values(by='reported timestamp start', inplace=True)
messy_df

Unnamed: 0,user_id_intel,user name in MUSC,user name in Fox Insight application,experiment,cohort,Unnamed: 5,Unnamed: 6,reported timestamp start,reported timestamp end,task name,Unnamed: 10,measurement name,measurement's related body part,participant’s state,Wearable location,Wearable Type,Value
190,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:35:18,2017-07-03 15:35:45,Shaking,100,overall,,OFF,left hand,Apple Watch,
191,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:36:50,2017-07-03 15:37:20,standing,100,overall,,OFF,left hand,Apple Watch,2.0
192,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:36:50,2017-07-03 15:37:20,standing,101,dyskinesia,left upper limb,OFF,left hand,Apple Watch,0.0
193,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:36:50,2017-07-03 15:37:20,standing,101,dyskinesia,right upper limb,OFF,left hand,Apple Watch,0.0
194,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:36:50,2017-07-03 15:37:20,standing,102,tremor,left upper limb,OFF,left hand,Apple Watch,1.0
195,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:36:50,2017-07-03 15:37:20,standing,102,tremor,right upper limb,OFF,left hand,Apple Watch,0.0
196,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:37:55,2017-07-03 15:38:25,walking,100,overall,,OFF,left hand,Apple Watch,3.0
202,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:37:55,2017-07-03 15:38:25,walking,102,tremor,right upper limb,OFF,left hand,Apple Watch,0.0
201,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:37:55,2017-07-03 15:38:25,walking,102,tremor,left upper limb,OFF,left hand,Apple Watch,0.0
197,142560,1003,cisuabd4,Clinicians,alabama,509 Motor Tasks,Caregiver,2017-07-03 15:37:55,2017-07-03 15:38:25,walking,103,bradykinesia,left upper limb,OFF,left hand,Apple Watch,3.0


In [9]:
list(messy_df.columns.values)

['user_id_intel',
 'user name in MUSC',
 'user name in Fox Insight application',
 'experiment',
 'cohort',
 'Unnamed: 5',
 'Unnamed: 6',
 'reported timestamp start',
 'reported timestamp end',
 'task name',
 'Unnamed: 10',
 'measurement name',
 "measurement's related body part",
 'participant’s state',
 'Wearable location',
 'Wearable Type',
 'Value']

In [10]:
type(messy_df['cohort'].values)

numpy.ndarray

In [11]:
messy_df.cohort.unique()

array(['alabama'], dtype=object)

In [12]:
messy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 665 entries, 190 to 664
Data columns (total 17 columns):
user_id_intel                           665 non-null int64
user name in MUSC                       665 non-null int64
user name in Fox Insight application    665 non-null object
experiment                              665 non-null object
cohort                                  665 non-null object
Unnamed: 5                              665 non-null object
Unnamed: 6                              665 non-null object
reported timestamp start                665 non-null datetime64[ns]
reported timestamp end                  665 non-null datetime64[ns]
task name                               665 non-null object
Unnamed: 10                             665 non-null int64
measurement name                        665 non-null object
measurement's related body part         560 non-null object
participant’s state                     665 non-null object
Wearable location                       