# Assemble data for CIS-PD Curation

1. Create table/file (task_timestamp.csv) with subjid, visit, task, start timestamp, stop timestamp
2. Create table/file for sensor info
3. Create table/file for sensor serial number info for each subject

Completed 10-4-18

Project status:
- Complete: Table 1, 2, 3
- In progress: Create summary table

# Import packages

In [3]:
# import packages
import os
import pandas as pd
import numpy as np
import re
import pathlib

# Table 1 - task and timestamp
- output: //FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation\task_timestamp.csv

In [None]:
# Build Table 1: one row per task recording, holding the subject id, visit,
# task name and the first/last timestamp found in the recording's csv file.
path = r'//FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation\TaskAcc'

# get all filenames
# Each entry is the first path component (relative to `path`) of a .csv file.
# The files are read back below as os.path.join(path, <entry>), so the csv
# files are expected to sit directly inside `path`.
filenames = []
for root, dirs, files in os.walk(path, topdown=True):
    for fname in files:
        if fname.endswith('.csv'):
            relative = pathlib.Path(os.path.join(root, fname)).relative_to(path)
            # .parts is separator-agnostic, unlike splitting the string on "\\"
            filenames.append(relative.parts[0])
df = pd.DataFrame({'filename': filenames})

# split filename into separate columns
# Names are expected to look like <subjid>_<visit code>_<task code>.csv
id_list = []
time_list = []
task_list = []
for fname in filenames:
    # Plain str.replace always treats '.csv' literally; Series.str.replace
    # depends on the pandas version's default for `regex`.
    fields = fname.replace('.csv', '').split('_')
    if len(fields) != 3:
        raise ValueError(
            "expected '<subjid>_<visit>_<task>.csv', got %r" % fname)
    subj_code, visit_code, task_code = fields
    id_list.append(subj_code)
    time_list.append(visit_code)
    task_list.append(task_code)
df['subjid'] = id_list
df['time'] = time_list
df['task'] = task_list

# extract start/stop timestamps
# first and last value of the `timestamp` column in each file (one file per task)
start = []
stop = []
for fname in filenames:
    # only the timestamp column is needed, so skip parsing the others
    timestamps = pd.read_csv(os.path.join(path, fname),
                             usecols=['timestamp'])['timestamp']
    start.append(timestamps.iloc[0])
    stop.append(timestamps.iloc[-1])
# add columns to df
df['start timestamp'] = np.asarray(start)
df['stop timestamp'] = np.asarray(stop)

# change time to full visit name
# visit codes that are not listed here are left unchanged
oldname = ['0', '1', '2', '3', '4', '5', '6']
newname = ['2 Weeks: Time 0',
           '2 Weeks: Time 30',
           '2 Weeks: Time 60',
           '2 Weeks: Time 90',
           '2 Weeks: Time 120',
           '2 Weeks: Time 150',
           '1 Month']
namechange = dict(zip(oldname, newname))
df['time'] = df['time'].replace(namechange)

# convert task short name to full name
# task codes that are not listed here become NaN
key = ['Shaking', 'Stndg', 'Wlkg', 'WlkgCnt', 'FtnR', 'FtnL', 'RamR', 'RamL', 'SitStand',
       'Drwg', 'Typg', 'NtsBts', 'Drnkg', 'Sheets', 'Fldg', 'Sitng']
value = ['Shaking', 'Standing', 'Walking', 'Walking while counting',
         'Finger to nose--right hand', 'Finger to nose--left hand',
         'Alternating right hand movements', 'Alternating left hand movements',
         'Sit to stand', 'Drawing on a paper',
         'Typing on a computer keyboard', 'Assembling nuts and bolts',
         'Taking a glass of water and drinking', 'Organizing sheets in a folder',
         'Folding towels', 'Sitting']
name_dict = dict(zip(key, value))
df['task'] = df['task'].map(name_dict)

# delete column
del df['filename']

# change column names
df.columns = ['SubjID', 'Visit', 'Task', 'Start Timestamp (UTC)', 'Stop Timestamp (UTC)']

# save file
# DataFrame.to_csv opens, writes and closes the target path itself, so no
# separate file handle is opened around it.
savepath = r'//FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation'
finalname = os.path.join(savepath,'task_timestamp.csv')
df.to_csv(finalname, sep=',')

## Skip: Functions (unfinished - complete if time permits)

In [41]:
def extract_data(path):
    """Return a DataFrame naming the first path component of each csv under *path*.

    Walks *path* recursively. For every file whose name ends in ``.csv`` the
    first component of its path relative to *path* is recorded: the file name
    itself for files directly inside *path*, otherwise the name of the
    top-level sub-folder that contains the file.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to search.

    Returns
    -------
    pandas.DataFrame
        A single column ``subjid`` with one row per csv file found (empty when
        there are none).

    Notes
    -----
    Splitting the file name into metadata and reading the first/last
    timestamps is not implemented here yet.
    """
    names = []

    for root, dirs, files in os.walk(path, topdown=True):
        for fname in files:
            if fname.endswith('.csv'):
                relative = pathlib.Path(os.path.join(root, fname)).relative_to(path)
                # .parts is separator-agnostic, unlike splitting on "\\"
                names.append(relative.parts[0])

    return pd.DataFrame(names, columns=['subjid'])


# Question for Table 2
- ranges?

# Table 2 - sensor info
- Refer to PD Study Sensor Guide for info
    - Path: CIS-PD Study/PD Sensor Guides
- output: //FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation\sensor_info.csv

In [58]:
# Build Table 2 (wide format): one row per sensor location with its sampling
# rates, placement landmark and axis orientation. Every list below has ten
# entries in the same order as `sensorlocs`.

# sensor name
sensorlocs = ['medial_chest',
              'flexor_digitorum_right',
              'flexor_digitorum_left',
              'anterior_thigh_right',
              'anterior_thigh_left',
              'distal_lateral_shank_right',
              'distal_lateral_shank_left',
              'sacrum',
              'dorsal_hand_right',
              'dorsal_hand_left']

# sensor name type
sensorname = ['ECG/EMG (KHz)', 'Accel (Hz)', 'Gyro (Hz)']

# sensor frequency
# None marks a location with no value for that signal (shown as NaN in the
# table) - presumably the signal is not recorded there; confirm against the
# PD Study Sensor Guide.
freq_e = [1]*3 + [None]*7
freq_a = [31.25]*3 + [62.5]*7
freq_g = [None]*3 + [62.5]*7

# landmarks
landmark = [
    # medial_chest
    'Halfway between base of throat and bottom of sternum (xiphoid process)',
    # flexor_digitorum_right (Wrist Flexors)
    'On top of Wrist Flexors',
    # flexor_digitorum_left (Wrist Flexors)
    'On top of Wrist Flexors',
    # anterior_thigh_right (proximal of Epicondyles)
    'Proximal of Femur Epicondyles',
    # anterior_thigh_left (proximal of Epicondyles)
    'Proximal of Femur Epicondyles',
    # distal_lateral_shank_right (along fibula, proximal of Lateral Malleolus)
    'Proximal of Fibular Lateral Malleolus',
    # distal_lateral_shank_left (along fibula, proximal of Lateral Malleolus)
    'Proximal of Fibular Lateral Malleolus',
    # sacrum (slightly superior of Posterior Superior Iliac Spine (PSIS))
    'Superior of Posterior Superior Iliac Spine (PSIS)',
    # dorsal_hand_right - thick end away from thumb
    'Pointing away from thumb, Parallel to wrist joint',
    # dorsal_hand_left - thick end away from thumb
    'Pointing away from thumb, Parallel to wrist joint',
]

# axis direction
axisname = ['X-axis orientation', 'Y-axis orientation', 'Z-axis orientation']
axis_x = ['Left', 'Inferior', 'Inferior', 'Inferior', 'Inferior',
          'Inferior', 'Inferior', 'Right', 'Lateral', 'Lateral']
axis_y = ['Superior', 'Medial', 'Lateral', 'Medial', 'Lateral',
          'Anterior', 'Posterior', 'Superior', 'Superior', 'Inferior']
axis_z = ['Anterior', 'Anterior', 'Anterior', 'Anterior', 'Anterior',
          'Lateral', 'Lateral', 'Posterior', 'Posterior', 'Posterior']

# name of columns
colnames = ['Sensor Location']+sensorname+['Description of Landmarks']+axisname

# assemble dataframe
# `columns=colnames` pins the column order explicitly instead of relying on
# the insertion order of the dict.
sensordf = pd.DataFrame({'Sensor Location': sensorlocs,
                         'ECG/EMG (KHz)': freq_e,
                         'Accel (Hz)': freq_a,
                         'Gyro (Hz)': freq_g,
                         'Description of Landmarks': landmark,
                         'X-axis orientation': axis_x,
                         'Y-axis orientation': axis_y,
                         'Z-axis orientation': axis_z},
                        columns=colnames)

# save file
# DataFrame.to_csv opens, writes and closes the target path itself, so no
# separate file handle is opened around it.
savepath = r'//FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation'
finalname = os.path.join(savepath,'sensor_info.csv')
sensordf.to_csv(finalname, sep=',')

In [311]:
# Tidy version - sensor type and frequency separated, without axis orientation info
# Long format: every sensor location contributes two consecutive rows, one
# per sensor type listed for it.

# (sensor location, landmark description) for each sensor, in table order
site_landmarks = [
    ('medial_chest',
     'Halfway between base of throat and bottom of sternum (xiphoid process)'),
    ('flexor_digitorum_right', 'On top of Wrist Flexors'),
    ('flexor_digitorum_left', 'On top of Wrist Flexors'),
    ('anterior_thigh_right', 'Proximal of Femur Epicondyles'),
    ('anterior_thigh_left', 'Proximal of Femur Epicondyles'),
    ('distal_lateral_shank_right', 'Proximal of Fibular Lateral Malleolus'),
    ('distal_lateral_shank_left', 'Proximal of Fibular Lateral Malleolus'),
    ('sacrum', 'Superior of Posterior Superior Iliac Spine (PSIS)'),
    ('dorsal_hand_right', 'Pointing away from thumb, Parallel to wrist joint'),
    ('dorsal_hand_left', 'Pointing away from thumb, Parallel to wrist joint'),
]

# sensor name and landmarks: each entry repeated twice, once per sensor type
sensorlocs = []
landmark = []
for site, description in site_landmarks:
    sensorlocs += [site, site]
    landmark += [description, description]

# sensor type
# first three locations: ECG or EMG plus accelerometer
type1 = ['ECG', 'Accel', 'EMG', 'Accel', 'EMG', 'Accel']
# remaining seven locations: gyroscope plus accelerometer
type2 = [kind for _ in range(7) for kind in ('Gyro', 'Accel')]
sensortype = type1 + type2

# sensor frequency, in the same row order as sensortype
freq1 = [1000, 31.25, 1000, 31.25, 1000, 31.25]
freq2 = [62.5] * 14
freq = freq1 + freq2

# name of columns
colnames = ['Sensor Location','Sensor Type','Frequency (Hz)','Description of Landmarks']

# assemble dataframe, pairing each column name with its list of values
sensordf = pd.DataFrame(dict(zip(colnames, (sensorlocs, sensortype, freq, landmark))))

# Table 3 - sensor location, sensor serial number
- output: //FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation\sensor_serialnum.csv
- optional: use pathlib to clean up code

In [297]:
# Part 1: Create dataframe (tidy version) with sensor location and serial number

# walk path down and grab subjid, sensor location, and serial #
table3path = r'//FS2.smpp.local\RTO\CIS-PD Study\Subjects'

# extract paths with subjid, sensor location, serial number, and timestamp info
# Only folders exactly four levels below table3path are kept; their path
# components are, in order, SubjID / Sensor Location / Serial Number / Visit.
templist = []
for root, dirs, files in os.walk(table3path, topdown=True):
    for d in dirs:
        # path relative to table3path, split into components independent of
        # the OS path separator
        parts = pathlib.Path(os.path.join(root, d)).relative_to(table3path).parts
        if len(parts) == 4:
            templist.append(parts)

# assemble dataframe
colnames = ['SubjID','Sensor Location','Serial Number','Visit']
df = pd.DataFrame(templist, columns=colnames)

# clean up sensor location values
df['Sensor Location'] = df['Sensor Location'].str.replace('_', ' ')

# Keep date info only in Visit (first 10 characters of the folder name)
# Whole-column assignment instead of df.Visit[i] = ..., which is chained
# assignment and is not guaranteed to write back to df.
df['Visit'] = df['Visit'].str[:10]

# drop duplicate Visit and renumber rows 0..n-1
df = df.drop_duplicates(keep='first').reset_index(drop=True)

# NOTE(review): the original cell called
#     df.sort_values(by=['SubjID', 'Sensor Location', 'Visit'])
# here without assigning the result, so the rows were never actually sorted
# and stay in directory-walk order. The positional relabelling and the
# hard-coded row numbers below were presumably worked out against that
# order, so the sort is deliberately NOT applied. Confirm the row order
# before relying on this table.

# rows 0-299: alternate '2 weeks' (even rows) and '4 weeks' (odd rows)
# rows 300 and up (Fluctuator visits): always '2 weeks'
row_number = np.arange(len(df))
df['Visit'] = np.where((row_number < 300) & (row_number % 2 == 1),
                       '4 weeks', '2 weeks')

# remove Fluctuator subject 1056 values - refer to Paper copy for correct serial numbers
# (raises KeyError if any of these row labels is missing)
removerow = [341,343,344,346,348,350,355]
df = df.drop(removerow).reset_index(drop=True)

In [298]:
# Part 2: Rearrange sensor locations as columns with serial numbers as values
# Output: multilevel dataframe with sensor locations as column names and serial numbers as values

# use hierarchical index and unstack to pivot sensor location to column and serial number as value
df.set_index(['SubjID','Visit','Sensor Location'], inplace=True)
df.sort_index(inplace=True)
unstackdf = df.unstack('Sensor Location')

# NOTE(review): the original cell called unstackdf.reset_index() here without
# assigning the result, which had no effect. The saved file therefore keeps
# SubjID and Visit as index columns; that output is preserved here.

# save multilevel dataframe as csv
# DataFrame.to_csv opens, writes and closes the target path itself, so no
# separate file handle is opened around it.
savepath = r'//FS2.smpp.local\RTO\CIS-PD Study\MJFF Curation'
finalname = os.path.join(savepath,'sensor_serialnum.csv')
unstackdf.to_csv(finalname, sep=',')