In [1]:
import os
import numpy as np
import pandas as pd
import time
import datetime
from multiprocessing.dummy import Pool as ThreadPool
from PreprocessFcns import *
%matplotlib inline

In [2]:
mot_scores = pd.read_excel('//FS2.smpp.local/RTO/CIS-PD Study/Scores/MotorTasks.xls')
user_id_pairings = pd.read_csv("//FS2.smpp.local/RTO\\CIS-PD MUSC\\decoded_forms\\videoID.csv")
user_id_pairings.head()

Unnamed: 0.1,Unnamed: 0,SubjectCode,FoxInsightID,Subj ID Athena,User Name,Site
0,0,1000.0,cisuaba1,142557.0,cisuaba1,alabama
1,1,1001.0,cisuabb2,142558.0,cisuabb2,alabama
2,2,1002.0,cisuabc3,142559.0,cisuabc3,alabama
3,3,1003.0,cisuabd4,142560.0,cisuabd4,alabama
4,4,1004.0,cisnwa1,142579.0,cisnwa1,northwestern


## Compute Features on clinical visits

In [3]:
# All Subjects:
subjDict = {1003: 'left', 1004:'right', 1005: 'left', 1007: 'left', 1009: 'right',
           1016:'left', 1018:'left',1019:'left',1020:'right', 1023: 'right', 
           1024:'left', 1029:'left', 1030:'left',1032:'left', 1038:'left',
           1039: 'left', 1043: 'left', 1044:'right',1046:'right', 1047: 'left',
           1048: 'right',1049:'left', 1050: 'left', 1051:'left', 1052: 'right', 
           1053: 'left', 1054: 'left', 1055: 'right', 1056: 'left'}

# # Northwestern Subjects Only:
# subjDict = {1004:'right',1016:'left',1018:'left',1019:'left',1020:'right',1024:'left',1029:'left',1030:'left',1032:'left',
#            1038:'left',1044:'right',1046:'right',1049:'left',1051:'left'}

# Test with 1 Subject Only:
#subjDict = {1004:'right'}


taskList = ['Standing', 'Walking', 'Walking while counting', 'Finger to nose--right hand', 
            'Finger to nose--left hand', 'Alternating right hand movements', 'Alternating left hand movements', 
            'Sit to stand', 'Drawing on a paper', 'Typing on a computer keyboard', 'Assembling nuts and bolts', 
            'Taking a glass of water and drinking', 'Organizing sheets in a folder', 'Folding towels', 'Sitting']

taskScores = ['standing','walking','walking while counting','finger to nose right','finger to nose left',
                   'alternating right','alternating left','sit to stand','drawing on a paper',
                   'typing on a computer keyboard','assembling nuts and bolts','taking a glass of water and drinking',
                   'organizing sheets in a folder','folding towels','sitting']

taskList_Abb = ['Stndg', 'Wlkg', 'WlkgCnt', 'FtnR', 'FtnL', 'RamR', 'RamL', 'SitStand', 'Drwg', 'Typg', 'NtsBts',
                'Drnkg', 'Sheets', 'Fldg', 'Sitng']

sessionList = ['2 Weeks: Time 0', '2 Weeks: Time 30', '2 Weeks: Time 60', '2 Weeks: Time 90', '2 Weeks: Time 120', 
               '2 Weeks: Time 150', '4 Weeks']

In [4]:
# get clinic data features
def ClinicDataAggregator(subjDict,taskList_Abb,taskScores,sessionList):
    
    # compute features for each subject-task pairing
    Data = pd.DataFrame()
    numSamples = pd.DataFrame()
    s = 0
    for subj in list(subjDict.keys()):
        s += 1
        print('Subject %d (%d of %d)'%(subj,s,len(subjDict)))
        for t in range(len(taskList_Abb)):
            task = taskList_Abb[t]
            task_score = taskScores[t]
            for trial in range(len(sessionList)):
                visit = sessionList[trial]
                
                # get task ACC data
                try:
                    data = pd.read_csv('//FS2.smpp.local//RTO//CIS-PD Study//MJFF Curation//TaskAcc//' + str(subj) + '_' + str(trial) + 
                                      '_' + task + '.csv',parse_dates=['timestamp'])[['timestamp','x','y','z']]
                except:
                    print('No data found for %s trial %d'%(task,trial))
                    continue
                    
                # get Tremor and Bradykinesia Scores
                side = subjDict[subj]
                subj_score = mot_scores.loc[mot_scores['subject']==subj,['subject','visit',
                                            task_score+ ' ' + 'bradykinesia ' + side + ' upper limb',
                                            task_score+ ' ' + 'tremor ' + side + ' upper limb']]
                subj_score = subj_score.rename(index=str,columns={subj_score.columns[2]:'Bradykinesia',subj_score.columns[3]:'Tremor'})
                subj_score.index = range(len(subj_score))
                    
                # reset time starting at first data point
                data['timestamp'] = (data.timestamp.values - data.timestamp.values[0]).astype('timedelta64[ms]').astype(int)
                data = data.set_index('timestamp')
                
                # filter data
                data = filterdata(data,'highpass')
                data = filterdata(data,'lowpass',cutoff=3)
                
                # get data clips
                clip_data = gen_clips(data)
                
                # get the acc features
                for c in range(len(clip_data['data'])):
                    N = pd.DataFrame()
                    N['Subject'] = subj
                    N['Task'] = task
                    N['Visit'] = sessionList[trial]
                    N['Samples'] = len(clip_data['data'][c])
                    numSamples = pd.concat([numSamples,N])
                feature_extraction_reduced(clip_data)
                if 'features' in clip_data.keys():
                    D = clip_data['features']
                    featcols = list(D.columns)
                    D['Bradykinesia'] = subj_score['Bradykinesia'][trial]
                    D['Tremor'] = subj_score['Tremor'][trial]
                    D['Visit'] = visit
                    D['Task'] = task
                    D['Subject'] = subj
                    Data = pd.concat([Data,D])
    # save acc features      
    cols = ['Subject','Visit','Task','Bradykinesia','Tremor'] + featcols
    Data = Data[cols]
    Data.to_csv('//FS2.smpp.local/RTO/CIS-PD Study\\Clinic WACC features\\REPLACE FILE NAME.csv')
    return Data

In [None]:
# All Files:
files = os.listdir("//FS2.smpp.local\\RTO\\CIS-PD Study\\MJFF Curation\\ClinicVisitACC")

for file in files:
    if os.path.isfile('X:\CIS-PD Study\Home WACC features\\features ' + file):
        continue
    Data = HomeDataAggregator(file)

## Compute Features on home data

In [10]:
# get the 4 digit user_id from the corresponding 6 digit user_id
def user_id_6_to_4(user_id_6):
    for index, row in user_id_pairings.iterrows():
        if (not np.isnan(row['Subj ID Athena'])):
            if (int(row['Subj ID Athena'])==int(user_id_6)):
                return int(row['SubjectCode'])
    return None

In [11]:
# get all home data features - less efficient version
def HomeDataAggregator(file):
    t1 = time.time()
    Data = pd.DataFrame()
    print(file)
    
    features_list = ['RMSX','RMSY','RMSZ','rangeX','rangeY','rangeZ','meanX','meanY','meanZ','varX','varY','varZ',
                    'skewX','skewY','skewZ','kurtX','kurtY','kurtZ','xcor_peakXY','xcorr_peakXZ','xcorr_peakYZ',
                    'xcorr_lagXY','xcorr_lagXZ','xcorr_lagYZ','Dom_freq','Pdom_rel','PSD_mean','PSD_std','PSD_skew',
                    'PSD_kur','jerk_mean','jerk_std','jerk_skew','jerk_kur']
        
    # get acc data
    try:
        # change if we want to use Activity Level and Tremor Score
        data = pd.read_pickle('X:\\CIS-PD Study\\MJFF Curation\\combined pre-visit data\\' + 
                               file)[['user_id','timestamp','Gait','x','y','z']]  
    except:
        print('No data found for %s trial %d'%(task,trial))
        return
        
    # organize data and make 5 second clips
    data = data.sort_values(by = 'timestamp', axis = 0)
    data['timestamp2'] = [(tm - datetime.timedelta(minutes=0,
                                                   seconds=tm.second % 5,
                                                   microseconds=tm.microsecond)) 
                          for tm in data.timestamp]
    
    data['timestamp'] = (data.timestamp.values - data.timestamp.values[0]).astype('timedelta64[ms]').astype(int)
    data = data.set_index('timestamp')
    data.loc[:,['x', 'y', 'z']] = filterdata(data[['x', 'y', 'z']])
                
    # "clip" the data into 5 second chunks    
    five_sec_intervals = data.timestamp2.unique()
        
    # calculate features
    F=[]
    num_empty = 0
    times = []
    for t in five_sec_intervals:
        clip = data.loc[(data.timestamp2 == t)] # & (data.Gait == 1)] # take only walking data
        if (clip.empty or (len(clip.timestamp2) < 200)):
            num_empty += 1
        else:
            F.append(reduced_feature_extraction_from_1_clip(clip[['x', 'y', 'z']]))
            times.append(t)

    success_info = [file, len(five_sec_intervals), (len(five_sec_intervals)-num_empty)]
    df = pd.DataFrame(data = [success_info], columns = ['file', 'num expected clips', 'num actual clips'])
    if (os.path.isfile("X:\\CIS-PD Study\\Home WACC features\\success info.csv")):
            dfo = pd.read_csv("X:\\CIS-PD Study\\Home WACC features\\success info.csv", index_col = 0)
            df = pd.concat([dfo, df])
    df.to_csv("X:\\CIS-PD Study\\Home WACC features\\success info.csv")
            
    # create features dataframe
    D = pd.DataFrame(data=F,columns=features_list,dtype='float32')    
    featcols = list(D.columns)
    D['Subject'] = data.loc[0, 'user_id']
    D['timestamp'] = times #['timestamp2']
    Data = pd.concat([Data,D])   
    cols = ['Subject','timestamp'] + featcols
    Data = Data[cols]
    elapsed_time = ((time.time() - t1)/60.0).__str__()
    print(elapsed_time + " mins\n")
    Data.to_pickle('X:\CIS-PD Study\Home WACC features\\features ' + file)

In [12]:
# get features using threading - helper function
def mapping_func(t, data):
    clip = data.loc[(data.timestamp2 == t)] # & (data.Gait == 1)] # take only walking data
    if not((clip.empty or (len(clip.index) < 200))):
        return list(reduced_feature_extraction_from_1_clip(clip[['x', 'y', 'z']])).append(t)


In [13]:
# get home features using threading
def HomeDataAggregatorThreading(file):
    t1 = time.time()
    Data = pd.DataFrame()
    print(file)
    
    features_list = ['RMSX','RMSY','RMSZ','rangeX','rangeY','rangeZ','meanX','meanY','meanZ','varX','varY','varZ',
                    'skewX','skewY','skewZ','kurtX','kurtY','kurtZ','xcor_peakXY','xcorr_peakXZ','xcorr_peakYZ',
                    'xcorr_lagXY','xcorr_lagXZ','xcorr_lagYZ','Dom_freq','Pdom_rel','PSD_mean','PSD_std','PSD_skew',
                    'PSD_kur','jerk_mean','jerk_std','jerk_skew','jerk_kur', 'timestamp']
        
    # get acc data
    try:
        # change if we want to use Activity Level and Tremor Score
        data = pd.read_pickle('X:\\CIS-PD Study\\MJFF Curation\\combined pre-visit data\\' + 
                               file)[['user_id','timestamp','Gait','x','y','z']]  
    except:
        print('No data found for %s trial %d'%(task,trial))
        return
        
    # organize data and make 5 second clips
    data = data.sort_values(by = 'timestamp', axis = 0)
    
    data['timestamp2'] = [(tm - datetime.timedelta(minutes=0,
                                                   seconds=tm.second % 5,
                                                   microseconds=tm.microsecond)) 
                          for tm in data.timestamp]
    
    data['timestamp'] = (data.timestamp.values - data.timestamp.values[0]).astype('timedelta64[ms]').astype(int)
    data = data.set_index('timestamp')
    data.loc[:,['x', 'y', 'z']] = filterdata(data[['x', 'y', 'z']])
                
    # "clip" the data into 5 second chunks    
    five_sec_intervals = data.timestamp2.unique()
        
    # calculate features
    pool = ThreadPool(4)
    results = pool.map(lambda t : mapping_func(t, data), five_sec_intervals)
    pool.close()
    pool.join()
    print("\n\n")
    print(results)
    results = list(results)
    print('\n')
    print (results)
    results = list(filter(lambda r: r is not None, results))
    success_info = [file, len(five_sec_intervals), len(results)]
    df = pd.DataFrame(data = [success_info], columns = ['file', 'num expected clips', 'num actual clips'])
    if (os.path.isfile("X:\\CIS-PD Study\\Home WACC features\\success info.csv")):
            dfo = pd.read_csv("X:\\CIS-PD Study\\Home WACC features\\success info.csv", index_col = 0)
            df = pd.concat([dfo, df])
    df.to_csv("X:\\CIS-PD Study\\Home WACC features\\success info.csv")
            
    # create features dataframe
    D = pd.DataFrame(data=results,columns=features_list,dtype='float32')    
    featcols = list(D.columns)
    D['Subject'] = data.loc[0, 'user_id']
    Data = pd.concat([Data,D])   
    cols = ['Subject'] + featcols
    Data = Data[cols]
    elapsed_time = ((time.time() - t1)/60.0).__str__()
    print(elapsed_time + " mins\n")
    Data.to_pickle('X:\CIS-PD Study\Home WACC features\\features ' + file)

In [15]:
# All Files:
#files = os.listdir("X:\\CIS-PD Study\\MJFF Curation\\combined pre-visit data")

# Luca's Files:
# files = ['142605 2017-08-16.pkl', '142557 2017-09-21.pkl', '142559 2017-07-06.pkl', '142617 2017-07-17.pkl', '142582 2017-07-27.pkl', '142577 2017-11-30.pkl', '142604 2017-10-02.pkl', '142604 2017-07-21.pkl', '142568 2017-09-18.pkl', '142615 2017-07-21.pkl', '142570 2017-07-13.pkl', '142623 2017-10-12.pkl', '142559 2017-07-20.pkl', '142602 2017-12-12.pkl', '142592 2017-10-03.pkl', '142561 2017-07-03.pkl', '142584 2018-02-21.pkl', '142562 2017-07-07.pkl', '142622 2017-08-03.pkl', '142606 2017-10-13.pkl', '142604 2017-08-14.pkl', '142568 2018-02-28.pkl', '142583 2017-10-13.pkl', '142618 2017-07-31.pkl', '142583 2017-08-25.pkl', '142603 2017-08-07.pkl', '142615 2017-10-05.pkl', '142595 2017-10-12.pkl', '142593 2018-01-29.pkl', '142623 2017-08-17.pkl', '142601 2017-07-13.pkl', '142570 2017-08-03.pkl', '142568 2017-09-06.pkl', '142602 2017-08-03.pkl', '142618 2017-10-30.pkl', '142616 2018-02-05.pkl', '142584 2017-11-29.pkl', '142598 2017-07-24.pkl', '142608 2017-09-18.pkl']

# Nick's Files:
# files = ['142598 2017-07-10.pkl', '142617 2017-08-01.pkl', '142603 2017-07-31.pkl', '142605 2017-08-02.pkl', '142618 2018-01-30.pkl', '142592 2017-07-13.pkl', '142558 2017-06-29.pkl', '142620 2017-07-21.pkl', '142593 2017-07-20.pkl', '142617 2017-10-03.pkl', '142594 2017-08-10.pkl', '142592 2017-07-27.pkl', '142578 2018-02-19.pkl', '142621 2017-07-21.pkl', '142612 2017-09-06.pkl', '142609 2017-08-11.pkl', '142612 2017-08-15.pkl', '142619 2017-08-10.pkl', '142608 2017-09-11.pkl', '142558 2017-07-14.pkl', '142585 2017-10-04.pkl', '142595 2018-01-10.pkl', '142557 2017-06-29.pkl', '142622 2017-10-12.pkl', '142601 2017-07-26.pkl', '142575 2017-08-29.pkl', '142581 2017-07-26.pkl', '142566 2017-07-26.pkl', '142608 2017-11-09.pkl', '142602 2017-09-20.pkl', '142606 2017-12-29.pkl', '142567 2017-07-31.pkl', '142620 2017-08-10.pkl', '142622 2017-08-17.pkl', '142583 2017-08-09.pkl', '142602 2017-07-13.pkl', '142566 2017-08-09.pkl', '142580 2017-08-01.pkl']

# Julianne's Files:
files = ['142618 2017-08-14.pkl', '142557 2017-07-13.pkl', '142585 2018-02-02.pkl', '142581 2017-08-14.pkl', '142577 2017-08-03.pkl', '142619 2018-01-22.pkl', '142584 2017-11-01.pkl', '142600 2017-07-11.pkl', '142620 2017-10-30.pkl', '142593 2017-08-03.pkl', '142621 2017-10-23.pkl', '142609 2017-09-01.pkl', '142578 2017-09-05.pkl', '142616 2017-07-19.pkl', '142593 2017-11-06.pkl', '142600 2017-07-25.pkl', '142606 2017-08-16.pkl', '142616 2017-08-01.pkl', '142615 2017-07-07.pkl', '142595 2017-08-22.pkl', '142615 2018-01-22.pkl', '142598 2017-09-13.pkl', '142557 2017-12-11.pkl', '142601 2017-09-20.pkl', '142582 2017-08-09.pkl', '142594 2017-07-19.pkl', '142577 2017-10-11.pkl', '142579 2017-07-18.pkl', '142619 2017-07-21.pkl', '142578 2017-09-18.pkl', '142605 2017-10-11.pkl', '142575 2017-08-16.pkl', '142580 2017-07-14.pkl', '142623 2017-08-03.pkl', '142563 2017-07-18.pkl', '142604 2017-11-27.pkl', '142584 2017-09-13.pkl', '142616 2017-10-03.pkl', '142560 2017-07-03.pkl']

for file in files:
    if os.path.isfile('X:\CIS-PD Study\Home WACC features\\features ' + file):
        continue
    Data = HomeDataAggregator(file)

142609 2017-09-01.pkl
22.153518044948576 mins

142578 2017-09-05.pkl
309.9703974843025 mins

142616 2017-07-19.pkl
587.6584043860436 mins

142593 2017-11-06.pkl
6.090699501832327 mins

142600 2017-07-25.pkl
14.770508551597596 mins

142606 2017-08-16.pkl
2.7018452882766724 mins

142616 2017-08-01.pkl
4.589166724681855 mins

142615 2017-07-07.pkl
19.4826144973437 mins

142595 2017-08-22.pkl


KeyboardInterrupt: 

In [None]:
#pool = ThreadPool(2)
#pool.map(HomeDataAggregator, files)
#pool.close()
#pool.join()