In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import xlsxwriter as xw
import pathlib
import pytz

%matplotlib inline

In [2]:
def process_annotations(path):
    """Load a subject's ``annotations.csv`` and return one row per recorded activity.

    Parameters
    ----------
    path : str or path-like
        Subject folder that contains ``annotations.csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``EventType``; 'Testing Day' marker rows are excluded and the
        ``Timestamp (ms)``, ``AnnotationId``, ``AuthorId`` and ``Value`` columns
        are removed. A ``Cycle`` column is added: the 1-based rank of the start
        timestamp within each event type, with two adjustments -- cycle 2 of
        events whose name contains 'MDS-UPDRS' is relabelled 3, and cycles 1 and
        2 of events whose name contains 'Heart' become NaN (so ``Cycle`` is a
        float column holding a real missing value, not the string 'NaN').
    """
    annotations = pd.read_csv(os.path.join(path, 'annotations.csv'))
    annotations = annotations.drop(['Timestamp (ms)', 'AnnotationId', 'AuthorId'], axis=1)

    is_testing_day = annotations['EventType'] == 'Testing Day'

    # Calendar date (US/Central) of each testing day. Parse as naive UTC and
    # localize explicitly: calling tz_localize on an already tz-aware series
    # (what to_datetime(..., utc=True) yields on current pandas) raises.
    # test_info is not returned yet; it is kept for the planned Day 1 / Day 2
    # separation of the activity table.
    day_rows = annotations[is_testing_day].dropna(how='any', axis=0)
    local_start = (
        pd.to_datetime(day_rows['Start Timestamp (ms)'], unit='ms')
        .dt.tz_localize('UTC')
        .dt.tz_convert('US/Central')
    )
    test_info = pd.DataFrame(
        {'Day': day_rows['Value'], 'Date': local_start.dt.date}
    ).reset_index(drop=True)

    events = annotations[~is_testing_day].drop('Value', axis=1)

    # Cycle = order of occurrence of each activity (ties share the average rank,
    # truncated to an integer).
    cycle = (
        events.groupby('EventType')['Start Timestamp (ms)']
        .rank(ascending=True)
        .astype(int)
    )
    is_updrs = events['EventType'].str.contains('MDS-UPDRS')
    is_heart = events['EventType'].str.contains('Heart')

    # Relabel first, then blank out heart-rate cycles, matching the original order.
    cycle = cycle.mask(is_updrs & (cycle == 2), 3)
    cycle = cycle.astype(float).mask(is_heart & cycle.isin([1, 2]))
    events['Cycle'] = cycle

    return events.reset_index(drop=True).set_index('EventType')

In [3]:
# Every activity expected in a complete recording session.
complete = ['Heart Rate Variability', 'MDS-UPDRS #1: Finger Tapping',
            'MDS-UPDRS #2: Hand Movements', 'MDS-UPDRS #3: Pronation-Supination',
            'MDS-UPDRS #4: Toe Tapping', 'MDS-UPDRS #5: Leg Agility',
            'MDS-UPDRS #6: Arising from Chair', 'MDS-UPDRS #7: Gait',
            'MDS-UPDRS #8: Postural Stability', 'MDS-UPDRS #9: Postural Hand Tremor',
            'MDS-UPDRS #10: Kinetic Hand Tremor', 'MDS-UPDRS #11: Rest Tremor',
            'Motor #1: Standing', 'Motor #2: Walking', 'Motor #3: Walking while Counting',
            'Motor #4: Finger to Nose', 'Motor #5: Alternating Hand Movements',
            'Motor #6: Sit to Stand', 'Motor #7: Drawing on Paper',
            'Motor #8: Typing on a Computer', 'Motor #9: Nuts and Bolts',
            'Motor #10: Drinking Water', 'Motor #11: Organizing Folder',
            'Motor #12: Folding Towels', 'Motor #13: Sitting']

# Subject folder to load (alternative machine locations kept for reference).
#path = r'C:\Users\adai\Documents\PD Study Data\RawData\1004'
#path = r'C:\Users\Andrew\Documents\PD Study Data\RawData\1004'
path = r'E:\PD Study Data\RawData\1004'
#path = r'X:\CIS-PD Study\Subjects\1004'

timestamps = process_annotations(path)

# Each immediate sub-folder of the subject folder is one sensor location.
locations = [locs for locs in os.listdir(path) if os.path.isdir(os.path.join(path, locs))]

# Gather the per-file frames in lists and concatenate once per location:
# DataFrame.append was removed in pandas 2.0 and re-copied all rows on every call.
sensor_parts = {kind: {locs: [] for locs in locations} for kind in ('accel', 'gyro', 'elec')}

for root, dirs, files in os.walk(path, topdown=True):
    for filenames in files:
        for kind, parts in sensor_parts.items():
            if filenames.endswith(kind + '.csv'):
                p = pathlib.Path(root, filenames)
                # First path component below the subject folder names the sensor
                # location; .parts is separator-agnostic, unlike split("\\").
                location = p.relative_to(path).parts[0]
                parts[location].append(pd.read_csv(p).set_index('Timestamp (ms)'))
                break

# location -> all readings of that sensor type (empty DataFrame when none were found).
sensor_tables = {
    kind: {locs: (pd.concat(frames) if frames else pd.DataFrame())
           for locs, frames in parts.items()}
    for kind, parts in sensor_parts.items()
}
accel = sensor_tables['accel']
gyro = sensor_tables['gyro']
elec = sensor_tables['elec']

timestamps

Unnamed: 0_level_0,Start Timestamp (ms),Stop Timestamp (ms),Cycle
EventType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Heart Rate Variability,1499433333880,1499433693890,
MDS-UPDRS #1: Finger Tapping,1499434364865,1499434376490,1
MDS-UPDRS #2: Hand Movements,1499434381669,1499434391035,1
MDS-UPDRS #3: Pronation-Supination,1499434395837,1499434403196,1
MDS-UPDRS #4: Toe Tapping,1499434408186,1499434419320,1
MDS-UPDRS #5: Leg Agility,1499434423006,1499434430233,1
MDS-UPDRS #6: Arising from Chair,1499434433974,1499434440149,1
MDS-UPDRS #7: Gait,1499434445293,1499434458738,1
MDS-UPDRS #8: Postural Stability,1499434469293,1499434477034,1
MDS-UPDRS #9: Postural Hand Tremor,1499434505427,1499434515713,1


In [None]:
# Nested lookup of raw sensor data:
#   act_dict[activity][trial][location][sensor] -> DataFrame of readings
# where sensor is 'accel', 'gyro' or 'elec' and trial counts from 0 in the
# order the activity's rows appear in `timestamps`.
sensor_sources = {'accel': accel, 'gyro': gyro, 'elec': elec}
act_dict = {}

for activities in complete:

    # An activity that was never annotated has no trials.
    if activities not in timestamps.index:
        act_dict[activities] = {}
        continue

    # Select with a list so a single matching row still yields a Series;
    # .loc[label, column] returns a bare scalar in that case and .values fails.
    startTimestamp = timestamps.loc[[activities], 'Start Timestamp (ms)'].values
    endTimestamp = timestamps.loc[[activities], 'Stop Timestamp (ms)'].values

    trial_dict = {}
    for trials, (startTime, endTime) in enumerate(zip(startTimestamp, endTimestamp)):

        sensor_dict = {}
        for location in locations:

            data = {}
            for kind, source in sensor_sources.items():
                readings = source[location]
                if readings.empty:
                    data[kind] = pd.DataFrame()
                else:
                    # Keep readings inside the annotated window (both ends inclusive).
                    in_window = (readings.index >= startTime) & (readings.index <= endTime)
                    data[kind] = readings[in_window]

            sensor_dict[location] = data

        trial_dict[trials] = sensor_dict

    act_dict[activities] = trial_dict

In [None]:
# First trial of the typing task as a sensor-type x location table.
dat = pd.DataFrame(act_dict['Motor #8: Typing on a Computer'][0])

# Plot the first 500 accelerometer rows from the right dorsal hand sensor.
right_hand_accel = dat.loc['accel', 'dorsal_hand_right']
right_hand_accel[0:500].plot()

In [None]:
# Root folder that holds one entry per study subject.
#path2 = r'C:\Users\Andrew\Documents\PD Study Data\RawData'
path2 = r'E:\PD Study Data\RawData'

subjects = os.listdir(path2)

# Print the full path of every entry found under the root folder.
for subject_name in subjects:
    subject_dir = os.path.join(path2, subject_name)
    print(subject_dir)

In [None]:
def fix_errors(participant, timestamps=None, workbook_dir=r'C:\Users\Alex\OneDrive\SRALAB\PD'):
    """Apply the logged annotation errors of one participant and save the results.

    Reads ``PD_errorWorkbook.xlsx`` and ``PD_fixedErrors.xlsx`` from
    ``workbook_dir``, runs the matching ``fix_*`` function for every workbook
    row of ``participant``, then writes ``timestamps_corrected.xlsx`` and both
    updated workbooks back to the same folder.

    Parameters
    ----------
    participant : int
        4 digit participant ID, compared with the workbook's 'Participant' column.
    timestamps : pandas.DataFrame, optional
        Activity table to correct. When omitted, the notebook-level
        ``timestamps`` variable is used, as before.
    workbook_dir : str, optional
        Folder holding the workbooks; defaults to the previously hard-coded one.

    Returns
    -------
    timestamps : pandas.DataFrame
        The activity table with the handled errors corrected.
    errordf : pandas.DataFrame
        The error workbook without the rows that were just handled.
    fixdf : pandas.DataFrame
        The fixed-errors workbook with the handled rows appended.

    Notes
    -----
    Each fixer is called as ``fixer(timestamps, record, activity, cycle)`` where
    ``record`` is the single workbook row describing the error, passed as a
    one-row DataFrame in the ``errordf`` slot so the fixer can read that row's
    other columns. Rows whose 'Error' value has no fixer are reported and left
    in the error workbook rather than being filed as fixed.
    """
    if timestamps is None:
        # Backward compatible default: the frame created earlier in the notebook.
        timestamps = globals()['timestamps']

    error_path = os.path.join(workbook_dir, 'PD_errorWorkbook.xlsx')
    fixed_path = os.path.join(workbook_dir, 'PD_fixedErrors.xlsx')
    corrected_path = os.path.join(workbook_dir, 'timestamps_corrected.xlsx')

    errordf = pd.read_excel(error_path)
    errPar = errordf[errordf['Participant'] == participant]
    fixdf = pd.read_excel(fixed_path)

    print('There are',str(len(errPar)),'errors to be fixed:\n')
    print(errPar[['Error','Activity']])

    # Dispatch on the error kind ('Error' column); the activity name only says
    # which rows of `timestamps` are affected.
    fixers = {
        'Merge': fix_merge,
        'Late': fix_late,
        'Early': fix_early,
        'Duplicate': fix_duplicate,
        'Split': fix_split,
    }

    handled = []
    for position in range(len(errPar)):
        record = errPar.iloc[[position]]
        errType = record['Error'].iloc[0]
        errAct = record['Activity'].iloc[0]
        errCycle = record['Cycle'].iloc[0]

        fixer = fixers.get(errType)
        if fixer is None:
            print('Skipping unsupported error type:', errType)
            continue

        # Some fixers return a new frame (rows added or removed), so always
        # continue with what they hand back.
        corrected = fixer(timestamps, record, errAct, errCycle)
        if corrected is not None:
            timestamps = corrected
        handled.append(record.index[0])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    fixdf = pd.concat([fixdf, errordf.loc[handled]])
    errordf = errordf.drop(handled)

    timestamps.to_excel(corrected_path)
    # index=False: otherwise each save/load round trip adds an 'Unnamed: 0' column.
    errordf.to_excel(error_path, index=False)
    fixdf.to_excel(fixed_path, index=False)
    return timestamps,errordf,fixdf


In [None]:
def _shift_boundary(timestamps, errordf, errAct, errCycle, error_kind, direction, errTime=None):
    """Move the start or stop time of the (errAct, errCycle) rows by the logged offset.

    The workbook row in ``errordf`` whose 'Activity' and 'Cycle' match (and whose
    'Error' equals ``error_kind`` when that column exists) supplies the offset in
    'Time Adjusted (sec)' and, through 'Type', which boundary is wrong: 'End'
    moves 'Stop Timestamp (ms)', anything else moves 'Start Timestamp (ms)'.
    ``direction`` is -1 or +1. ``timestamps`` is modified in place and returned.
    """
    # Compare on the 'Cycle' label rather than a fixed column position, and use
    # a positional mask because activity labels repeat in the index.
    matches = (timestamps.index == errAct) & (timestamps['Cycle'] == errCycle).values
    if not matches.any():
        return timestamps

    described = errordf[(errordf['Activity'] == errAct) & (errordf['Cycle'] == errCycle)]
    if 'Error' in described.columns:
        described = described[described['Error'] == error_kind]
    record = described.iloc[0] if len(described) else None

    if errTime is None:
        if record is None:
            raise ValueError('no %s entry for %r cycle %r in the error workbook'
                             % (error_kind, errAct, errCycle))
        errTime = record['Time Adjusted (sec)']
    if pd.isna(errTime):
        raise ValueError('missing time adjustment for %r cycle %r' % (errAct, errCycle))

    if record is not None and record.get('Type') == 'End':
        boundary = 'Stop Timestamp (ms)'
    else:
        boundary = 'Start Timestamp (ms)'

    # Seconds -> whole milliseconds, kept integral so the timestamp column
    # does not change dtype.
    shift_ms = direction * int(round(float(errTime) * 1000))
    # .values avoids index alignment, which fails on duplicated labels.
    timestamps.loc[matches, boundary] = timestamps.loc[matches, boundary].values + shift_ms
    return timestamps


def fix_late(timestamps,errordf,errAct,errCycle,errTime=None):
    """Subtract the logged time from the erroneous boundary of an activity.

    Parameters
    ----------
    timestamps : pandas.DataFrame
        Activity table indexed by event type with 'Start Timestamp (ms)',
        'Stop Timestamp (ms)' and 'Cycle' columns; modified in place.
    errordf : pandas.DataFrame
        Error workbook rows ('Activity', 'Cycle', 'Type', 'Time Adjusted (sec)').
    errAct, errCycle
        Activity name and cycle identifying the rows to correct.
    errTime : float, optional
        Adjustment in seconds; looked up in ``errordf`` when omitted.

    Returns
    -------
    pandas.DataFrame
        ``timestamps`` (the same object).

    Raises
    ------
    ValueError
        If a row must be corrected but no time adjustment can be determined.
    """
    return _shift_boundary(timestamps, errordf, errAct, errCycle, 'Late', -1, errTime)


def fix_early(timestamps,errordf,errAct,errCycle,errTime=None):
    """Add the logged time to the erroneous boundary of an activity.

    Takes the same parameters, returns the same object and raises the same
    error as ``fix_late``; only the direction of the shift differs.
    """
    return _shift_boundary(timestamps, errordf, errAct, errCycle, 'Early', +1, errTime)


In [None]:
def fix_merge(timestamps,errordf,errAct,errCycle):
    """Merge repeated rows of one activity cycle into a single row.

    All rows whose index equals ``errAct`` and whose 'Cycle' equals ``errCycle``
    are collapsed into the first of them: it keeps its start time and takes the
    'Stop Timestamp (ms)' of the last matching row; the other matching rows are
    removed.

    Parameters
    ----------
    timestamps : pandas.DataFrame
        Activity table indexed by event type with 'Stop Timestamp (ms)' and
        'Cycle' columns. It is not modified.
    errordf
        Unused; kept so every fixer shares the same call signature.
    errAct, errCycle
        Activity name and cycle identifying the rows to merge.

    Returns
    -------
    pandas.DataFrame
        A new frame with the merge applied, or ``timestamps`` itself when fewer
        than two rows match.
    """
    # Work with row positions: index labels repeat, so label-based drop/set
    # would hit every row of the activity.
    matches = (timestamps.index == errAct) & (timestamps['Cycle'] == errCycle).values
    positions = np.flatnonzero(matches)
    if len(positions) < 2:
        return timestamps

    merged = timestamps.copy()
    stop_col = merged.columns.get_loc('Stop Timestamp (ms)')
    merged.iloc[positions[0], stop_col] = timestamps.iloc[positions[-1], stop_col]

    keep = np.ones(len(merged), dtype=bool)
    keep[positions[1:]] = False
    return merged.iloc[keep]
                
def fix_split(timestamps,errordf,errAct,errCycle,errTime=None,errAct2=None):
    """Split an activity row in two at a logged offset from its start.

    Every row whose index equals ``errAct`` and whose 'Cycle' equals
    ``errCycle`` is replaced by two consecutive rows: the first keeps the start
    time and ends ``errTime`` seconds later; the second starts at that instant
    and keeps the original stop time.

    Parameters
    ----------
    timestamps : pandas.DataFrame
        Activity table indexed by event type with 'Start Timestamp (ms)',
        'Stop Timestamp (ms)' and 'Cycle' columns. It is not modified.
    errordf : pandas.DataFrame
        Error workbook rows; only read when ``errTime`` is omitted, in which
        case 'Time Adjusted (sec)' of the row matching 'Activity' / 'Cycle'
        (and 'Error' == 'Split' when that column exists) is used.
    errAct, errCycle
        Activity name and cycle identifying the row to split.
    errTime : float, optional
        Seconds from the row's start at which to split.
    errAct2 : str, optional
        Index label for the second half; defaults to ``errAct``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the split applied, or ``timestamps`` itself when no
        row matches.

    Raises
    ------
    ValueError
        If a row must be split but no time offset can be determined.

    Notes
    -----
    NOTE(review): the second half copies every other column (including
    'Cycle') from the original row, and the error workbook column naming the
    second activity is not known here -- confirm both against the workbook.
    """
    matches = (timestamps.index == errAct) & (timestamps['Cycle'] == errCycle).values
    positions = np.flatnonzero(matches)
    if len(positions) == 0:
        return timestamps

    if errTime is None:
        described = errordf[(errordf['Activity'] == errAct) & (errordf['Cycle'] == errCycle)]
        if 'Error' in described.columns:
            described = described[described['Error'] == 'Split']
        if described.empty:
            raise ValueError('no Split entry for %r cycle %r in the error workbook'
                             % (errAct, errCycle))
        errTime = described['Time Adjusted (sec)'].iloc[0]
    if pd.isna(errTime):
        raise ValueError('missing time adjustment for %r cycle %r' % (errAct, errCycle))

    # The workbook stores seconds, the table milliseconds.
    offset_ms = int(round(float(errTime) * 1000))
    second_label = errAct if errAct2 is None else errAct2
    start_col = timestamps.columns.get_loc('Start Timestamp (ms)')
    stop_col = timestamps.columns.get_loc('Stop Timestamp (ms)')

    pieces = []
    cursor = 0
    for pos in positions:
        first = timestamps.iloc[[pos]].copy()
        second = timestamps.iloc[[pos]].copy()
        split_at = first.iloc[0, start_col] + offset_ms
        first.iloc[0, stop_col] = split_at
        second.iloc[0, start_col] = split_at
        second.index = pd.Index([second_label], name=timestamps.index.name)
        # Untouched rows before this one, then both halves in chronological order.
        pieces.extend([timestamps.iloc[cursor:pos], first, second])
        cursor = pos + 1
    pieces.append(timestamps.iloc[cursor:])
    return pd.concat(pieces)
            
def fix_duplicate(timestamps,errordf,errAct,errCycle):
    """Remove repeated rows of one activity cycle, keeping the first occurrence.

    Parameters
    ----------
    timestamps : pandas.DataFrame
        Activity table indexed by event type with a 'Cycle' column. It is not
        modified.
    errordf
        Unused; kept so every fixer shares the same call signature.
    errAct, errCycle
        Activity name and cycle identifying the duplicated rows.

    Returns
    -------
    pandas.DataFrame
        A new frame in which only the first row matching ``errAct`` /
        ``errCycle`` remains; all other rows are kept in their original order.
    """
    matches = (timestamps.index == errAct) & (timestamps['Cycle'] == errCycle).values
    # Every match after the first is a duplicate. Filter by position because
    # dropping by label would remove all rows of the activity.
    repeats = matches & (np.cumsum(matches) > 1)
    return timestamps[~repeats]
            
        

In [None]:
# Apply the logged annotation fixes for participant 1016 and write the updated workbooks.
# NOTE(review): fix_errors corrects the notebook-level `timestamps` frame, which was
# loaded above from the subject folder ending in 1004 -- confirm that this is the
# intended table for participant 1016.
fix_errors(1016)