# Importing required libraries, and initializing workspace variables

In [1]:
# Add libraries when required if adapting base script
import os
import numpy as np
import pandas as pd
import dask
import dask.delayed as ddel
import dask.dataframe as dd
from dask.distributed import Client
import datetime as dt
import scipy.io as sio
import collections
import time
import copy
import datetime
# from datetime import datetime
from pytz import timezone
from collections import Counter


### -----------------------------------------------------------------------------------------------------------------------------------------------------------

# Defining functions

### Read annotations and cut the raw datafile into chunks according to each activity

In [2]:
def readAnnotationsFile(path, activitySet):

    '''
    Slice one raw sensor .csv file into per-activity, per-trial chunks.

    The raw file at `path` is read, the subject's 'annotations.csv' is read
    from the module-level variable `currentPath` (it is NOT a parameter, so
    it must be set before this function runs), and for every activity in
    `activitySet` the raw samples between each annotated start and stop
    timestamp are cut out, resampled and renamed for Matlab export.

    Parameters
    ----------
    path : str
        Path of the raw sensor file. Its basename selects the resampling
        period: names starting with 'accel' or 'gyro' use 32 ms, names
        starting with 'elec' use 1 ms.
    activitySet : list of str
        Activity names, matched against the 'EventType' column of the
        annotations file.

    Returns
    -------
    dict
        Maps every activity name to a dict of trial key -> result of
        `DataFrame.to_dict('split')`. An activity without annotations maps
        to an empty dict.
    '''
    # Debug section: uncomment the lines below to run the body interactively with hard-coded inputs
#     path = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\MC10 Study\Data\biostamp_data\cva\CVA08\bicep_left\d5la7xyp\2018-05-31T20-22-51-653Z\accel.csv'
#     activitySet = [
#             'Clinical - 10MWT SSV', 'Clinical - 10MWT FV', 'Activity Recognition',
#             'Resting ECG', 'Physical Therapy', 'Clinical - TUG',
#             'Clinical - MAS', 'Clinical - MMT', 'Clinical - 6MWT',
#             'Clinical - BBS'
#                   ]
#     activitySet = ['Clinical - 10MWT SSV']
#     currentPath = r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\MC10 Study\Data\biostamp_data\cva\CVA08'
    '''
    End debug section
    '''

    # Read the raw sensor .csv file (path) handed over by 'extractPatientData'
    dfRaw = pd.read_csv(path)
    # ... and drop rows whose first column (presumably the timestamp - confirm) repeats, keeping the first occurrence
    dfRaw = dfRaw.drop_duplicates(subset=dfRaw.columns[0], keep='first').reset_index(drop=True)

    # Read the annotation file of the current subject directory (module-level variable 'currentPath')
    dfAnnotations = pd.read_csv(os.path.join(currentPath, 'annotations.csv'))

    # Delete unwanted columns. NOTE: the positional column accesses further down (columns[0], columns[1:2],
    # iloc[..., 3]) are relative to the columns that remain after these deletions
    del dfAnnotations['Timestamp (ms)']
    del dfAnnotations['AnnotationId']
    del dfAnnotations['AuthorId']

    # Iterate over all the activities present in the activitySet
    for entries in range(len(activitySet)):
        # On the first entry of the activitySet, create the function output variable
        if entries == 0:
            activityData = {}

        # Work on a copy of the annotations so that the frame read from disk stays untouched between activities
        df = dfAnnotations.copy()

        # Extract the current activityName
        activityName = activitySet[entries]

        # An activity dataframe is constructed separately for each activity. Since activities differ in the way they are
        # written in the annotations file, we 1) select the rows whose event type matches and 2) look up the associated value.
        # NOTE: str.match treats activityName as a regular expression anchored at the start of the string
        activityAnnotationSet = df[df['EventType'].str.match(activityName)]
        # Keep one row per distinct value of the second remaining column (presumably the start timestamp - confirm)
        activityAnnotationSet = activityAnnotationSet.drop_duplicates(subset=activityAnnotationSet.columns[1:2], keep='first')

        # If no annotation row matched (the activity wasn't present in the annotations file), store an empty result
        # and continue to the next activity in the activitySet
        if activityAnnotationSet.empty:
            activityData[activityName] = {}
            continue

        # Activities whose label is stored in one companion annotation row:
        if (
            activityName == 'Activity Recognition' or 
            activityName == 'Resting ECG' or
            activityName == 'Physical Therapy' or
            activityName == 'Clinical - BBS'
           ):

            # Collects the companion rows that carry the label of each annotated trial
            eventTypeSetComp = pd.DataFrame()

            # Iterate over all annotated trials of this activity:
            for ran in range(0,len(activityAnnotationSet)):
                # All annotation rows that share the stop timestamp of the current trial, ordered by start timestamp
                tt = df[df['Stop Timestamp (ms)'] == activityAnnotationSet['Stop Timestamp (ms)'][activityAnnotationSet.index[ran]]]
                tt = tt.sort_values(by=['Start Timestamp (ms)'])
                # Pick the companion row type that belongs to this activity
                if activityName == 'Activity Recognition':
                    eventTypeSet = tt[tt['EventType'].str.match('Activity type')]
                elif activityName == 'Resting ECG':
                    eventTypeSet = tt[tt['EventType'].str.match('When was rest performed?')]; 
                elif activityName == 'Physical Therapy':
                    eventTypeSet = tt[tt['EventType'].str.match('Type of Physical Therapy')]; 
                elif activityName == 'Clinical - BBS':
                    eventTypeSet = tt[tt['EventType'].str.match('Type of Assessment')]; 
                # NOTE(review): DataFrame.append was removed in pandas 2.0; this line needs pd.concat on newer pandas
                eventTypeSetComp = eventTypeSetComp.append(eventTypeSet)
                eventTypeSetComp = eventTypeSetComp.drop_duplicates(subset=eventTypeSetComp.columns[1:2], keep='first')
            # Overwrite the trial values with the collected labels (requires one collected row per trial)
            activityAnnotationSet.Value = eventTypeSetComp.Value.values

        # Activities whose label is built from two companion rows (side + movement/activity type):
        elif (
            activityName == 'Clinical - MAS' or
            activityName == 'Clinical - MMT'
           ): 

            # Collect the type rows and the side rows separately
            eventTypeSetComp = pd.DataFrame()
            sideSetComp = pd.DataFrame()

            # Iterate over all annotated trials of this activity:
            for ran in range(0,len(activityAnnotationSet)):
                # All annotation rows that share the stop timestamp of the current trial
                tt = df[df['Stop Timestamp (ms)'] == activityAnnotationSet['Stop Timestamp (ms)'][activityAnnotationSet.index[ran]]]
                # Keep one row per distinct value of the first remaining column, then order by start timestamp
                tt = tt.drop_duplicates(subset=tt.columns[0], keep='first')
                tt = tt.sort_values(by=['Start Timestamp (ms)'])
                if activityName == 'Clinical - MAS':
                    eventTypeSet = tt[tt['EventType'].str.match('Movement Type')] 
                elif activityName == 'Clinical - MMT':
                    eventTypeSet = tt[tt['EventType'].str.match('Type of Activity')]
                sideSet = tt[tt['EventType'].str.match('Side')]
                # NOTE(review): DataFrame.append was removed in pandas 2.0 (applies to both append calls below)
                eventTypeSetComp = eventTypeSetComp.append(eventTypeSet)
                eventTypeSetComp = eventTypeSetComp.drop_duplicates(subset=eventTypeSetComp.columns[1:2], keep='first')
                sideSetComp = sideSetComp.append(sideSet)
            # Trial label becomes '<side>_<type>' (requires equally many side rows, type rows and trials)
            activityAnnotationSet.Value = sideSetComp.Value.values + '_' + eventTypeSetComp.Value.values

        # Number the trials per event type by ascending start timestamp and use that number ('Session') as the index
        activityAnnotationSet['Session'] = activityAnnotationSet.groupby('EventType')['Start Timestamp (ms)'].rank(ascending=True).astype(int)
        activityAnnotationSet = activityAnnotationSet.reset_index(drop=True).set_index('Session')
        # Add a human-readable calendar date as first column and drop the event type (debugging aid)
        activityAnnotationSet.insert(0,'date',pd.to_datetime(activityAnnotationSet['Start Timestamp (ms)'], unit='ms'))
        activityAnnotationSet.date = activityAnnotationSet.date.dt.date
        del activityAnnotationSet['EventType']    

        # Create easily accessible series of start and end times. The .loc lookups are label based on the
        # 'Session' index, so 1:startSize selects sessions 1 up to and including startSize
        startSize = len(activityAnnotationSet.index)
        if startSize == 1:
            startTimestamp = pd.Series(activityAnnotationSet.loc[1, 'Start Timestamp (ms)'])
            endTimestamp = pd.Series(activityAnnotationSet.loc[1, 'Stop Timestamp (ms)'])
        else:
            startTimestamp = pd.Series(activityAnnotationSet.loc[1:startSize, 'Start Timestamp (ms)'].values)
            endTimestamp = pd.Series(activityAnnotationSet.loc[1:startSize, 'Stop Timestamp (ms)'].values)

        # Prefix trial labels that start with a digit with 'N' (Matlab doesn't accept variable names starting
        # with a number). Column position 3 is presumably 'Value' - confirm against the annotations layout.
        # The whole step is skipped when the label of the first row is missing.
        if pd.isnull(activityAnnotationSet.iloc[0,3]):
            pass
        else:
            for nTrials in range(0, len(activityAnnotationSet['Value'])):
                # NOTE(review): str.replace substitutes EVERY occurrence of the leading digit(s) in the label,
                # not only the leading one
                if activityAnnotationSet.iloc[nTrials,3][0:2].isdigit():
                    activityAnnotationSet.iloc[nTrials,3] = activityAnnotationSet.iloc[nTrials,3].replace(activityAnnotationSet.iloc[nTrials,3][0:2], 'N' + str(activityAnnotationSet.iloc[nTrials,3][0:2]))
                elif activityAnnotationSet.iloc[nTrials,3][0:1].isdigit():
                    activityAnnotationSet.iloc[nTrials,3] = activityAnnotationSet.iloc[nTrials,3].replace(activityAnnotationSet.iloc[nTrials,3][0:1], 'N' + str(activityAnnotationSet.iloc[nTrials,3][0:1]))

        # Count how often every trial label occurs; the copy is counted down below to number repeated labels
        occOfTrials = Counter(activityAnnotationSet['Value'])
        occOfTrialsTBD = dict(occOfTrials.copy())

        # For every trial of this activity (for example; 2 trials of 10MWT etc.), slice the raw data and store it
        for trials in range(0, np.size(startTimestamp)):

            # On the first trial of the given activity, create the per-activity result and the trial counter
            if trials == 0:
                fild_df = {}
                triCount = 1

            # Select the raw samples between the annotated start and stop timestamps (both bounds inclusive)
            act1_df = dfRaw[(dfRaw['Timestamp (ms)'] >= startTimestamp[trials]) & (dfRaw['Timestamp (ms)'] <= endTimestamp[trials])]

            # Resample selected data onto a regular time grid to correct faulty timestamp series.
            # Trials without any raw sample are skipped (and do not consume a trial number)
            if act1_df.empty:
                continue
            else:
                tt = act1_df.copy()
                tt['Timestamp (ms)'] = pd.to_datetime(tt['Timestamp (ms)'],unit='ms')

                # The resampling period depends on the sensor file type; samples falling in the same bin are summed.
                # NOTE(review): a file name with none of these prefixes is not resampled, and the
                # t.value access below then presumably fails on the plain integer index - confirm
                if os.path.basename(path).startswith('accel') or os.path.basename(path).startswith('gyro'):
                    tt = tt.resample('32ms', on='Timestamp (ms)').sum()
                elif os.path.basename(path).startswith('elec'):
                    tt = tt.resample('1ms', on='Timestamp (ms)').sum()

                # Convert the datetime index (nanoseconds) back to integer milliseconds and restore it as first column
                ts =  [t.value // 10 ** 6 for t in tt.index]
                tt.insert(0, 'Timestamp (ms)', ts)       
                act1_df = tt.reset_index(drop=True)

            # ... and rename the columns to names that Matlab can import (chr(176) is the degree sign)
            act1_df = act1_df.rename(columns={'Timestamp (ms)': 'TimestampMS',
                                              'Accel X (g)': 'AccelX', 
                                              'Accel Y (g)': 'AccelY', 
                                              'Accel Z (g)': 'AccelZ', 
                                              'Sample (V)': 'SampleVolts',
                                              'Gyro X (' + chr(176) +'/s)': 'GyroX',
                                              'Gyro Y (' + chr(176) +'/s)': 'GyroY',
                                              'Gyro Z (' + chr(176) +'/s)': 'GyroZ'})

            # Transfer the data (act1_df in dataframe form) to a dictionary to allow for nested exportation to .mat file.
            # Unlabelled trials are keyed 'S<n>', n counting the non-empty trials of this activity
            trialNumStr = 'S' + str(triCount)
            triCount += 1
            if pd.isnull(activityAnnotationSet.iloc[trials,3]):
                fild_df[trialNumStr] = act1_df.to_dict('split')
            else:
                # Labelled trials: keep at most 25 characters of the label and append '_Tr' when it was truncated
                info = activityAnnotationSet.iloc[trials,3][:25] + (activityAnnotationSet.iloc[trials,3][25:] and '_Tr')
                # Append '_S<k>', k being the running occurrence number of this label (total - remaining + 1)
                newActNum = info + '_' + 'S' + str(occOfTrials[activityAnnotationSet.iloc[trials,3]] - occOfTrialsTBD[activityAnnotationSet.iloc[trials,3]] + 1)
                occOfTrialsTBD[activityAnnotationSet.iloc[trials,3]] = occOfTrialsTBD[activityAnnotationSet.iloc[trials,3]]-1

                # Gather all the data in the filled dictionary under its trial key
                fild_df[newActNum] = act1_df.to_dict('split')

        # Store all trials of this activity under the activity name
        activityData[activityName] = fild_df
    
    return activityData 

### Extract all datafiles (accel/gyro/elec) in the subfolders of the current subject directory

In [3]:
def extractPatientData(path, activitySet):
    """Recursively mirror the folder tree under `path` as a nested dictionary.

    Directories become dictionaries keyed by entry name. Entry names that
    start with '2017' or '2018' are cut to their first 16 characters, and
    entries that end up with the same key are merged with `dict.update`.

    Every file other than 'annotations.csv' becomes a delayed (dask)
    `readAnnotationsFile(path, activitySet)` call. Nothing is read here; the
    delayed calls are computed later, in parallel, by `subjectDataCompute`.

    Parameters
    ----------
    path : str
        Directory or file to process.
    activitySet : list of str
        Activity names, passed through to `readAnnotationsFile`.

    Returns
    -------
    dict, dask Delayed or str
        A dict for a directory, a Delayed object for a data file, and the
        empty string '' for 'annotations.csv' or a path that is neither a
        directory nor a file.
    """
    # Directory: recurse into every entry and collect the results by (shortened) name
    if os.path.isdir(path):
        activityData = {}
        for name in os.listdir(path):
            # Timestamp folders are shortened so that recordings sharing the same prefix are grouped
            nameR = name[:16] if name.startswith(('2017', '2018')) else name
            tbd = extractPatientData(os.path.join(path, name), activitySet)
            if nameR in activityData:
                activityData[nameR].update(tbd)
            else:
                activityData[nameR] = tbd
        return activityData

    # Data file (anything but the annotations file): schedule the slicing, do not run it yet
    if os.path.isfile(path) and os.path.basename(path) != 'annotations.csv':
        return ddel(readAnnotationsFile)(path, activitySet)

    # Annotations file or non-existing path: placeholder without data
    return ''


### Compute all delayed processes that are nested in the subject data variable

In [4]:
def subjectDataCompute(subjectDataDelayed):
    """Compute every delayed dataset stored four levels deep in `subjectDataDelayed`.

    Parameters
    ----------
    subjectDataDelayed : dict
        Nested dictionary with exactly four dictionary levels whose innermost
        values are dask Delayed objects (as built by `extractPatientData`).

    Returns
    -------
    tuple
        The computed results, in the order in which the nested dictionary is
        traversed (outer keys first). `combineSubjectData` relies on this order.
    """
    # Flatten the four dictionary levels into one list, keeping the traversal order
    delayedDataMat = [
        delayedFile
        for level2 in subjectDataDelayed.values()
        for level3 in level2.values()
        for level4 in level3.values()
        for delayedFile in level4.values()
    ]

    # One compute call for the whole list lets dask run the delayed tasks in parallel
    subjectDataComputed = dask.compute(*delayedDataMat)
    return subjectDataComputed


### Combine computed data with a dictionary to create an orderly data structure similar to the patient folders

In [5]:
def combineSubjectData(subjectDataDelayed, computedData, currentSubject):
    """Attach the computed datasets to a dictionary keyed subject -> date -> folder -> file.

    `subjectDataDelayed` is walked in the same order as in `subjectDataCompute`,
    so the n-th innermost entry receives `computedData[n]`. The keys of the
    second dictionary level are dropped, and the third-level key is moved to
    the front after being cut at its first 'T' (a key without 'T' is kept
    unchanged).

    Parameters
    ----------
    subjectDataDelayed : dict
        Four-level nested dictionary; only its keys and order are used.
    computedData : sequence
        Computed results, one per innermost entry of `subjectDataDelayed`.
    currentSubject : str
        Name used as the single top-level key of the result.

    Returns
    -------
    dict
        {currentSubject: {dateKey: {level-1 key: {file key: computed data}}}}
    """
    subjectTree = {}
    combinedSubjectData = {currentSubject: subjectTree}

    # Running numbers per file type. They are shared across all folders of the subject, so every
    # accel/elec/gyro file gets a number that is unique within the subject
    fileCounters = {'accel': 1, 'elec': 1, 'gyro': 1}
    count = 0

    for location, level2 in subjectDataDelayed.items():
        for recordings in level2.values():
            for stamp, files in recordings.items():
                # Keep only the part of the key in front of the first 'T'
                dateKey = stamp.partition('T')[0]

                # Restructured order: date key first, then the level-1 key
                locationTree = subjectTree.setdefault(dateKey, {}).setdefault(location, {})

                for fileName in files:
                    # Sometimes (errors etc.) multiple datafiles are present in the sensor folders,
                    # so known file types get their running number appended
                    prefix = next((p for p in fileCounters if fileName.startswith(p)), None)
                    if prefix is None:
                        fileKey = fileName
                    else:
                        fileKey = fileName + str(fileCounters[prefix])
                        fileCounters[prefix] += 1

                    # Pair the entry with the computed result at the same traversal position
                    locationTree[fileKey] = computedData[count]
                    count += 1

    return combinedSubjectData
    

### Replace string indexes that are not supported by Matlab variable syntax

In [6]:
def pizza(d, oldK, newK):
    """Return a copy of `d` in which `oldK` is replaced by `newK` in every key.

    Nested dictionaries are processed recursively; all other values are
    carried over unchanged. The input dictionary is not modified.
    """
    return {
        k.replace(oldK, newK): pizza(v, oldK, newK) if isinstance(v, dict) else v
        for k, v in d.items()
    }


### Change output order for readability and logic

In [7]:
def alterDictionaryStructure(data):
    """Reorder the nested subject dictionary and make its keys Matlab-compatible.

    The input is addressed as data[k1][k2][k3][k4][k5][k6]; the output holds
    the same leaf values at out[k1][k2][k5][k3][k6][k4]. With the structures
    built elsewhere in this file that turns
    subject / date / sensor folder / file / activity / trial into
    subject / date / activity / sensor folder / trial / file.

    Afterwards the keys at every level are rewritten with `pizza` so that the
    result can be saved to a .mat file.

    Parameters
    ----------
    data : dict
        Dictionary with (at least) six nested dictionary levels.

    Returns
    -------
    dict
        The reordered dictionary with rewritten keys.
    """
    eatYourData = {}

    for key1, level2 in data.items():
        out1 = eatYourData.setdefault(key1, {})
        for key2, level3 in level2.items():
            out2 = out1.setdefault(key2, {})
            for key3, level4 in level3.items():
                for key4, level5 in level4.items():
                    for key5, level6 in level5.items():
                        # The fifth-level key moves up, directly above the third-level key
                        out3 = out2.setdefault(key5, {}).setdefault(key3, {})
                        for key6, leaf in level6.items():
                            # The fourth-level key becomes the innermost key
                            out3.setdefault(key6, {})[key4] = leaf

    # Alter data key names to enable saving to .mat file. The order matters: '-' and ' ' both
    # become '_' first, and the resulting '___' runs are collapsed afterwards
    keyReplacements = (
        ('.', ''),
        ('-', '_'),
        ('2017', 'ymd2017'),
        ('2018', 'ymd2018'),
        (' ', '_'),
        ('___', '_'),
        ('/', '_'),
    )
    finalizedData = eatYourData
    for oldK, newK in keyReplacements:
        finalizedData = pizza(finalizedData, oldK, newK)

    return finalizedData


### -----------------------------------------------------------------------------------------------------------------------------------------------------------

# Run all

In [8]:
def alles(currentSubject, activitySet, currentPath):
    """Run the complete extraction pipeline for one subject.

    NOTE: `readAnnotationsFile` locates 'annotations.csv' through the
    module-level variable `currentPath`, not through this parameter, so the
    module-level variable has to point at the same subject folder when this
    function is called.

    Parameters
    ----------
    currentSubject : str
        Subject name, used as top-level key of the result.
    activitySet : list of str
        Activity names to extract.
    currentPath : str
        Folder of the subject whose data is extracted.

    Returns
    -------
    dict
        The reordered, Matlab-compatible data dictionary produced by
        `alterDictionaryStructure`.
    """
    # Build the dictionary of delayed datasets from the subject folder that was passed in
    # (the delayed datasets are only computed further down)
    subjectDataDelayed = extractPatientData(currentPath, activitySet)

    # The annotations file in the subject folder holds no usable subject data, so its key is removed.
    # The keys are collected first because the dictionary is modified while removing
    for k in [k for k in subjectDataDelayed if k.startswith('anno')]:
        subjectDataDelayed.pop(k)

    # Compute all delayed data using parallel computing
    subjectDataComputed = subjectDataCompute(subjectDataDelayed)

    # Combine the computed subject data with a dictionary structure resembling the patient folder structure
    combinedSubjectData = combineSubjectData(subjectDataDelayed, subjectDataComputed, currentSubject)

    # Reorder the nested keys and rewrite them so the dictionary can be saved to a .mat file
    finalizedData = alterDictionaryStructure(combinedSubjectData)

    return finalizedData

### Read in main subject folder to select feasible subjects

In [9]:
def select_subjects(mainPatientPath):
    """Return the full paths of all entries in `mainPatientPath` whose name starts with 'CVA'.

    The entries are returned in the order in which `os.listdir` reports them.
    """
    # Detect feasible subject folders by their name prefix
    candidates = [
        os.path.join(mainPatientPath, entry)
        for entry in os.listdir(mainPatientPath)
        if entry.startswith('CVA')
    ]

    # Narrow this slice to limit the run to one or to a specific number of subjects
    return candidates[:]
    

In [10]:
# Driver script: for every subject folder, extract the annotated activity data and save it as one .mat file.
# Select main subject directory to detect feasible patient folders
subjectDirectoryList = select_subjects(r'//FS2.smpp.local\RTO\Inpatient Sensors -Stroke\MC10 Study\Data\biostamp_data\cva')

# Start of the stopwatch; the time printed after every subject is measured from here
s = time.time()
# Allocating the activities that are to be extracted. Entries can be added to or removed from this list
activitySet = [
        'Resting ECG'
              ]

# Iterate over all subjects in the subjectDirectoryList and extract their data
for currentSubjectDirectory in subjectDirectoryList:
    
    # Allocating the current subject name (name of the subject folder)
    currentSubject = os.path.basename(currentSubjectDirectory)

#     activitySet = ['Clinical - 10MWT SSV']
    # Module-level variable that readAnnotationsFile reads to locate 'annotations.csv' of the current subject.
    # (At module level the 'global' statement itself has no effect; the assignment creates the variable.)
    global currentPath
    currentPath = currentSubjectDirectory

    # Extract all data for the current subject, using the allocated activitySet resulting in dictionary of delayed datasets
    '''
    !!! Note that the delayed datasets will be computed during the following function calls
    '''
    subjectDataDelayed = extractPatientData(currentSubjectDirectory, activitySet)
    
    # Since an annotations file is present in the current subject folder, we pop this key from the subjectData dictionary as it
    # does not contain any usable subject data
    list_keys = list(subjectDataDelayed.keys())
    for k in list_keys:
        if k.startswith('anno'):
            subjectDataDelayed.pop(k)
    
    # Compute all delayed data using parallel computing
    subjectDataComputed = subjectDataCompute(subjectDataDelayed)
    
    # Combine the computed subject data with a dictionary structure resembling the patient folder structure
    combinedSubjectData = combineSubjectData(subjectDataDelayed, subjectDataComputed, currentSubject)

    # Reorder the nested keys and rewrite them so the dictionary can be saved to a .mat file
    finalizedData = alterDictionaryStructure(combinedSubjectData)
     
    
    
#     # Alternative: run the same steps through the wrapper function
#     finalizedData = alles(currentSubject, activitySet, currentPath)

    # Save one .mat file per subject; the file name is the subject name, so it is written relative to the
    # current working directory
    sio.savemat(currentSubject, finalizedData, oned_as = 'column')

    # Drop the references to the large per-subject structures before the next subject is processed
    subjectDataDelayed = None
    subjectDataComputed = None
    combinedSubjectData = None
    finalizedData = None

    # Print the cumulative runtime in seconds since the start of the loop
    e = time.time()
    print(e-s)

301.4773745536804
832.9558396339417
1566.6116697788239
2427.9288618564606
3277.4456951618195
4225.244484901428
5127.347537755966
6691.046915531158
7163.828056335449
8114.142024517059
9015.752526044846
9826.661342382431
10853.004443407059
11571.08915734291
12485.365584135056
13179.00894498825
13459.963686227798
14259.696521520615
15132.057679891586
15661.951844930649
16928.71837568283
18028.078919410706
18607.29796743393
19357.56440114975
20421.238462924957
21309.599083185196
22725.11762857437
23657.305287599564
24393.852690935135
25548.644151210785
26590.840001821518
27434.340468406677
28352.76505923271
29050.01438856125
29833.47002840042
30548.427880048752
31406.38359761238
31965.463836669922
32586.05991244316
33270.90717935562
33872.75172305107
34771.50609755516
35724.73108148575
36278.5560092926
37083.41713500023
37724.32567238808
38454.20196557045
39140.09416246414
39908.74111032486
40541.23784446716
41628.25576996803
42357.74273991585
43101.46178007126
43735.85276436806
44235.6194