# Create Summary Tables and Other Related Info for CIS-PD curation paper

# Next action
- clean up and consolidate code

In [2]:
# Importing the Libraries
import os
import platform
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import pathlib
import pickle
import time
import re
import copy
%matplotlib inline

# Load and check codelist and data dictionary
- codelist doesn't have form 508 or 238, only datadict does

In [24]:
def _bytes_to_str(value):
    """Decode a bytes cell to str; pass every other value through unchanged."""
    return value.decode() if isinstance(value, bytes) else value


# Codelist: read without an encoding argument, then decode any bytes cells.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\codelist.sas7bdat'
codelist = pd.read_sas(filename).applymap(_bytes_to_str)
# NOTE(review): this removes every 'z' in a column name, not only a leading one - confirm intent.
codelist.columns = codelist.columns.str.replace('z', '')

# Data dictionary: read with an explicit encoding, so no separate decode step.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\datadictionary.sas7bdat'
datadict = pd.read_sas(filename, format='sas7bdat', encoding='iso-8859-1')
datadict.columns = datadict.columns.str.replace('z', '')

# Create Table 1 - Demographics table

In [101]:
# Subject enrollment form; items of interest: gender, race, ethnicity, age.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\subjenrollment.sas7bdat'
subjenroll = (
    pd.read_sas(filename, format='sas7bdat', encoding='iso-8859-1')
    # Drop every 'z' character from the column names.
    .rename(columns=lambda name: name.replace('z', ''))
)

In [102]:
# Table 1 (demographics): one row per variable, with a description of its allowed values.
tabledata = {
'Question': ['Age', 'Gender', 'Race', 'Ethnicity', 'Hoehn & Yahr stage 1-3'],
'Variable Name': ['Age', 'Gender', 'Race', 'Ethnicity', 'Hoehn & Yahr stage 1-3'],
'Variable details': ['Integer',
                     'One of: (\'Male\',\'Female\')',
                     'One of: (\'American Indian or Alaska Native\',\'Asian\',\'Black or African American\',\'Native Hawaiian or Other Pacific Islander\',\'White\',\'Unknown\')',
                     'One of: (\'Hispanic or Latino\',\'Not Hispanic or Latino\',\'Unknown\')',
                     'One of: (\'no\',\'yes\')']
}
df = pd.DataFrame(tabledata)

# save table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_demo.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df.to_csv(filename, sep=',', index=False)

In [13]:
# Display the current summary table `df` (the recorded output is the demographics table).
df

Unnamed: 0,Question,Variable Name,Variable details
0,Age,Age,Integer
1,Gender,Gender,"One of: ('Male','Female')"
2,Race,Race,"One of: ('American Indian or Alaska Native','A..."
3,Ethnicity,Ethnicity,"One of: ('Hispanic or Latino','Not Hispanic or..."
4,Hoehn & Yahr stage 1-3,Hoehn & Yahr stage 1-3,"One of: ('no','yes')"


# Create Table 2 - MDS-UPDRS questionnaire parts 1, 2, and 4

In [None]:
# Read the decoded form data from the updrs_124 HDF5 file.
path = r'Y:\CIS-PD MUSC\decoded_forms'
updrs_124_file = os.path.join(path, 'updrs_124.h5')
data124 = pd.read_hdf(updrs_124_file)

In [100]:
# Field names of form 238, in data-dictionary order.
# (Previously stored in a variable called `list`, which shadowed the builtin
# for the rest of the kernel session.)
fieldnames = datadict.loc[datadict.FormID == 238.0, 'FieldNm'].tolist()
# Keep the questions for parts 1, 2 and 4: per the original note, the slice
# 30..69 is part 3 and is skipped; the final field is dropped as well.
qlist = fieldnames[:30] + fieldnames[70:-1]

In [100]:
# One-element lists so each description can be repeated with `*` below.
name1 = ["One of: ('Patient', 'Caregiver', 'Patient and Caregiver in Equal Proportion')"]
name2 = ["One of: ('Normal',' Slight',' Mild',' Moderate',' Severe') mapping to (0, 1, 2, 3, 4)"]

# One entry per row of the parts 1/2/4 summary table:
# 4 identifier fields, info source, 6 rated items, info source, 26 rated items, yes/no flag.
variabledetails = [
    'Subject ID', 'SiteID', 'Visit', 'FormDate',
    *name1,
    *(name2 * 6),
    *name1,
    *(name2 * 26),
    "One of: ('no','yes')",
]

# Table 2 (MDS-UPDRS parts 1, 2 and 4): question text, column name and value description.
# All three lists must have the same length; pd.DataFrame raises otherwise.
tabledata124 = {
'Question': ['Subject ID', 'SiteID', 'Visit',
             'Date of assessment',
             'Primary source of information',
             '1.1 Cognitive Impairment',
             '1.2 Hallucinations and Psychosis',
             '1.3 Depressed Mood',
             '1.4 Anxious Mood',
             '1.5 Apathy',
             '1.6 Features of Dopamine Dysregulation Syndrome',
             'Who is filling out this questionnaire',
             '1.7 Sleep Problems',
             '1.8 Daytime Sleepiness',
             '1.9 Pain and Other Sensations',
             '1.10 Urinary Problems',
             '1.11 Constipation Problems',
             '1.12 Light Headedness on Standing',
             '1.13 Fatigue',
             '2.1 Speech',
             '2.2 Saliva & Drooling',
             '2.3 Chewing and Swallowing',
             '2.4 Eating Tasks',
             '2.5 Dressing',
             '2.6 Hygiene',
             '2.7 Handwriting',
             '2.8 Doing Hobbies and Other Activities',
             '2.9 Turning in Bed',
             '2.10 Tremor',
             '2.11 Getting Out of Bed, a Car, or a Deep Chair',
             '2.12 Walking and Balance',
             '2.13 Freezing',
             '4.1 Time Spent With Dyskinesias',
             '4.2 Functional Impact of Dyskinesias',
             '4.3 Time Spent in the Off State',
             '4.4 Functional Impact of Fluctuations',
             '4.5 Complexity of Motor Fluctuations',
             '4.6 Painful Off-State Dystonia',
             'Data Collected'],
# Column names are taken from the loaded data rather than typed by hand.
'Variable Name': data124.columns.values.tolist(),
'Variable details': variabledetails
}
df124 = pd.DataFrame(tabledata124)

# save table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_part124.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df124.to_csv(filename, sep=',', index=False)

# Create Table 3 - MDS-UPDRS questionnaire  part 3
Notes (open questions):
- Why does data3 have both ParticipantState and item 3b? They appear to hold the same information.
- What's the difference between UTC and FormTime?

In [None]:
# Read both decoded UPDRS HDF5 files (updrs_124 and updrs_part3) from the same folder.
path = r'Y:\CIS-PD MUSC\decoded_forms'
data124, data3 = (
    pd.read_hdf(os.path.join(path, h5_name))
    for h5_name in ('updrs_124.h5', 'updrs_part3.h5')
)

In [98]:
# Reusable one-element lists so each description can be repeated with `*` below.
onoff = ['One of: (\'on\',\'off\')']
noyes = ['One of: (\'no\',\'yes\')']
severity = ['One of: (\'Normal\',\' Slight\',\' Mild\',\' Moderate\',\' Severe\') mapping to (0, 1, 2, 3, 4)']

# Table 3 (MDS-UPDRS part 3): question text, column name and value description.
# All three lists must have the same length; pd.DataFrame raises otherwise.
tabledata = {
'Question': ['Subject ID', 'SiteID', 'Visit', 'Date of assessment',
             'Time of assessment', 'FormTime',
             # repeated items with 3b - 'ParticipantState', 'Hours since last PD medication dose'???
             '3b Mark the patient\'s clinical state',
             'Hours since last PD medication dose',
             # 3a, 3b, 3c, 3C1
             '3a Is the patient on medication for treating the symptoms of Parkinson\'s Disease?',
             '3b Mark the patient\'s clinical state', 
             '3c Is the patient on Levodopa?',
             '3.C1 Minutes since last levodopa dose',
             '3.1 Speech',
             '3.2 Facial Expression',
             '3.3 Rigidity - Neck',
             '3.3 Rigidity - Right Upper Extremity',
             '3.3 Rigidity - Left Upper Extremity',
             '3.3 Rigidity - Right Lower Extremity',
             '3.3 Rigidity - Left Lower Extremity',
             '3.4 Finger Tapping - Right Hand',
             '3.4 Finger Tapping - Left Hand',
             '3.5 Hand Movements - Right Hand',
             '3.5 Hand Movements - Left Hand',
             '3.6 Pronation-Supination Movements of Hands - Right Hand',
             '3.6 Pronation-Supination Movements of Hands - Left Hand',
             '3.7 Toe Tapping - Right Foot',
             '3.7 Toe Tapping - Left Foot',
             '3.8 Leg Agility - Right Leg',
             '3.8 Leg Agility - Left Leg',
             '3.9 Arising From Chair',
             '3.10 Gait',
             '3.11 Freezing of Gait',
             '3.12 Postural Stability',
             '3.13 Posture',
             '3.14 Global Spontaneity of Movement Body Bradykinesia',
             '3.15 Postural Tremor of the Hands - Right Hand',
             '3.15 Postural Tremor of the Hands - Left Hand',
             '3.16 Kinetic Tremor of the Hands - Right Hand',
             '3.16 Kinetic Tremor of the Hands - Left Hand',
             '3.17 Rest Tremor Amplitude - Right Upper Extremity',
             '3.17 Rest Tremor Amplitude - Left Upper Extremity',
             '3.17 Rest Tremor Amplitude - Right Lower Extremity',
             '3.17 Rest Tremor Amplitude - Left Lower Extremity',
             '3.17 Rest Tremor Amplitude - Lip/Jaw',
             '3.18 Constancy of Rest Tremor',
             '3.19 A Were dyskinesias (chorea or dystonia) present during examination?',
             '3.19 B Did these movements interfere with your ratings?',
             # NOTE(review): 'Hoen' looks like a misspelling of 'Hoehn'; left unchanged
             # because the text is written to the published CSV - confirm before fixing.
             '3.20 Hoen and Yahr Stage',
             'Data collected'],
# Column names are taken from the loaded data rather than typed by hand.
'Variable Name': data3.columns.values.tolist(),
# 6 identifier/time fields, the 6 medication-state items, 33 severity-rated items (3.1-3.18),
# the two 3.19 yes/no items, the 3.20 stage and the data-collected flag.
'Variable details': ['Subject ID', 'SiteID', 'Visit', 'Date of assessment',
       'Datetime', 'FormTime']+onoff+['Integer']+noyes+onoff+noyes+['Integer']+severity*33+noyes*2+['One of: (0, 1, 2, 3, 4, 5)']+noyes
}
df = pd.DataFrame(tabledata)

# save summary table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_part3.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df.to_csv(filename, sep=',', index=False)

# Create Table 4 - Task and Timestamp

In [43]:
# Read the task/timestamp CSV that Table 4 summarises.
filepath = r'Y:\CIS-PD Study\MJFF Curation'
task_csv = os.path.join(filepath, 'task_timestamp.csv')
taskdf = pd.read_csv(task_csv)

In [75]:
# Table 4 (task and timestamp): one row per column of taskdf.
# The five hand-written 'Variable details' entries assume taskdf has exactly five columns.
tabledata = {
# The first column is labelled 'Subject ID'; the remaining questions reuse the column names.
'Question': ['Subject ID'] + taskdf.columns.tolist()[1:],
'Variable Name': taskdf.columns.tolist(),
'Variable details': ['Integer',
                    # Allowed values are enumerated from the data; str.join requires them to be strings.
                    'One of: (\'' + '\', \''.join(taskdf.Visit.unique().tolist()) + '\')',
                    'One of: (\'' + '\', \''.join(taskdf.Task.unique().tolist()) + '\')',
                    'Date Time',
                    'Date Time']
}
df = pd.DataFrame(tabledata)

# save table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_task_timestamp.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df.to_csv(filename, sep=',', index=False)

In [76]:
# Display the current summary table `df` (the recorded output is the task/timestamp table).
df

Unnamed: 0,Question,Variable Name,Variable details
0,Subject ID,SubjID,Integer
1,Visit,Visit,"One of: ('2 Weeks: Time 0', '2 Weeks: Time 30'..."
2,Task,Task,One of: ('Taking a glass of water and drinking...
3,Start Timestamp (UTC),Start Timestamp (UTC),Date Time
4,Stop Timestamp (UTC),Stop Timestamp (UTC),Date Time


# Create Table 5 - Apple watch and phone specs
- form101
    - Q11 iphone model
    - Q12 iOS version

In [96]:
# Get questions on forms for Apple watch and phone info
# form101 Q11 and 12 have phone model and iOS version
# Read the two decoded forms (form101 and form126) from the same folder.
path = r'Y:\CIS-PD MUSC\decoded_forms'
data101, data126 = (
    pd.read_hdf(os.path.join(path, h5_name))
    for h5_name in ('form101.h5', 'form126.h5')
)

In [97]:
# Table 5 (phone specs): the two free-text form101 items, phone model and iOS version.
tabledata = {
'Question': ['iPhone model to be used by subject', 'iOS version to be used'],
'Variable Name': ['iphone', 'ios'],
'Variable details': ['Text',
                     'Text']
}
df = pd.DataFrame(tabledata)

# save table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_watchandphone_specs.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df.to_csv(filename, sep=',', index=False)

# Create Table 6 - Sensor Info

In [None]:
# for reference - OMIT
# Reference copy of the Table 1 'Variable details' strings; nothing below uses it.
# It is bound to a name because a bare `'Variable details': [...]` line is a
# SyntaxError (a string literal cannot be an annotation target), which would
# break Restart & Run All.
variable_details_reference = [
    'Integer',
    'One of: (\'Male\',\'Female\')',
    'One of: (\'American Indian or Alaska Native\',\'Asian\',\'Black or African American\',\'Native Hawaiian or Other Pacific Islander\',\'White\',\'Unknown\')',
    'One of: (\'Hispanic or Latino\',\'Not Hispanic or Latino\',\'Unknown\')',
    'One of: (\'no\',\'yes\')',
]

In [77]:
# Read the sensor description CSV that Table 6 summarises.
filepath = r'Y:\CIS-PD Study\MJFF Curation'
sensor_csv = os.path.join(filepath, 'sensor_info.csv')
sensordf = pd.read_csv(sensor_csv)

In [80]:
# List the column names of the sensor info table.
sensordf.columns.tolist()

['Sensor Location',
 'ECG/EMG (KHz)',
 'Accel (Hz)',
 'Gyro (Hz)',
 'Description of Landmarks',
 'X-axis orientation',
 'Y-axis orientation',
 'Z-axis orientation']

In [81]:
# Preview the first three sensor rows.
sensordf.head(3)

Unnamed: 0,Sensor Location,ECG/EMG (KHz),Accel (Hz),Gyro (Hz),Description of Landmarks,X-axis orientation,Y-axis orientation,Z-axis orientation
0,medial_chest,1.0,31.25,,Halfway between base of throat and bottom of s...,Left,Superior,Anterior
1,flexor_digitorum_right,1.0,31.25,,On top of Wrist Flexors,Inferior,Medial,Anterior
2,flexor_digitorum_left,1.0,31.25,,On top of Wrist Flexors,Inferior,Lateral,Anterior


In [82]:
# Number of columns, i.e. the number of rows the Table 6 summary needs.
len(sensordf.columns)

8

In [90]:
# Preview the "One of: (...)" description built from the unique values of the first column.
"One of: ('" + "', '".join(sensordf.iloc[:, 0].unique().tolist()) + "')"

"One of: ('medial_chest', 'flexor_digitorum_right', 'flexor_digitorum_left', 'anterior_thigh_right', 'anterior_thigh_left', 'distal_lateral_shank_right', 'distal_lateral_shank_left', 'sacrum', 'dorsal_hand_right', 'dorsal_hand_left')"

In [97]:
# Unique values of column 1 ('ECG/EMG (KHz)' in the column listing).
sensordf.iloc[:,1].unique().tolist()

[1.0, nan]

In [98]:
# Unique values of column 2 ('Accel (Hz)' in the column listing).
sensordf.iloc[:,2].unique().tolist()

[31.25, 62.5]

In [99]:
# Unique values of column 3 ('Gyro (Hz)' in the column listing).
sensordf.iloc[:,3].unique().tolist()

[nan, 62.5]

In [100]:
# Unique values of column 4 ('Description of Landmarks' in the column listing).
sensordf.iloc[:,4].unique().tolist()

['Halfway between base of throat and bottom of sternum (xiphoid process)',
 'On top of Wrist Flexors',
 'Proximal of Femur Epicondyles',
 'Proximal of Fibular Lateral Malleolus',
 'Superior of Posterior Superior Iliac Spine (PSIS)',
 'Pointing away from thumb, Parallel to wrist joint']

In [101]:
# Unique values of column 5 ('X-axis orientation' in the column listing).
sensordf.iloc[:,5].unique().tolist()

['Left', 'Inferior', 'Right', 'Lateral']

In [102]:
# Unique values of column 6 ('Y-axis orientation' in the column listing).
sensordf.iloc[:,6].unique().tolist()

['Superior', 'Medial', 'Lateral', 'Anterior', 'Posterior', 'Inferior']

In [103]:
# Unique values of column 7 ('Z-axis orientation' in the column listing).
sensordf.iloc[:,7].unique().tolist()

['Anterior', 'Lateral', 'Posterior']

In [105]:
# Table 6 (sensor info): one row per column of sensordf; question and variable name
# are both the column name.
tabledata = {
'Question': sensordf.columns.tolist(),
'Variable Name': sensordf.columns.tolist(),
# Text columns (0 and 4-7) are enumerated from the data. The three numeric columns
# (1-3) are hard-coded from the exploratory outputs, with '' standing in for
# missing values, and must be updated by hand if sensor_info.csv changes.
'Variable details': ['One of: (\'' + '\', \''.join(sensordf.iloc[:,0].unique().tolist()) + '\')',
                     'One of: (\'1.0\',\'\')',
                     'One of: (\'31.25\',\'62.5\')',
                     'One of: (\'62.5\',\'\')',
                     'One of: (\'' + '\', \''.join(sensordf.iloc[:,4].unique().tolist()) + '\')',
                     'One of: (\'' + '\', \''.join(sensordf.iloc[:,5].unique().tolist()) + '\')',
                     'One of: (\'' + '\', \''.join(sensordf.iloc[:,6].unique().tolist()) + '\')',
                     'One of: (\'' + '\', \''.join(sensordf.iloc[:,7].unique().tolist()) + '\')']
}
df = pd.DataFrame(tabledata)

# save table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_sensorinfo.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df.to_csv(filename, sep=',', index=False)

In [106]:
# Display the current summary table `df` (the recorded output is the sensor info table).
df

Unnamed: 0,Question,Variable Name,Variable details
0,Sensor Location,Sensor Location,"One of: ('medial_chest', 'flexor_digitorum_rig..."
1,ECG/EMG (KHz),ECG/EMG (KHz),"One of: ('1.0','')"
2,Accel (Hz),Accel (Hz),"One of: ('31.25','62.5')"
3,Gyro (Hz),Gyro (Hz),"One of: ('62.5','')"
4,Description of Landmarks,Description of Landmarks,One of: ('Halfway between base of throat and b...
5,X-axis orientation,X-axis orientation,"One of: ('Left', 'Inferior', 'Right', 'Lateral')"
6,Y-axis orientation,Y-axis orientation,"One of: ('Superior', 'Medial', 'Lateral', 'Ant..."
7,Z-axis orientation,Z-axis orientation,"One of: ('Anterior', 'Lateral', 'Posterior')"


# Create Table 7 - Sensor Location and Serial Number

In [4]:
# Read the sensor location / serial number CSV that Table 7 summarises.
filepath = r'Y:\CIS-PD Study\MJFF Curation'
serial_csv = os.path.join(filepath, 'sensor_serialnum.csv')
senserialdf = pd.read_csv(serial_csv)

In [55]:
# Table 7 (sensor location and serial number).
# NOTE(review): the labels are read from fixed cells of senserialdf - row 0 / col 0 for the
# location label, row 1 / cols 0-1 for the ID and visit names, row 0 / cols 2+ for the
# location values - so the CSV presumably carries header-like rows; confirm if its layout changes.
# Single `.iloc[row, col]` lookups replace chained `.iloc[row][col]`, which relied on
# positional integer `[]` indexing of a string-labelled Series (deprecated in recent pandas).
tabledata = {
'Question': ['Subject ID', 'Visit'] + [senserialdf.iloc[0, 0]] + [senserialdf.columns[2]],
'Variable Name': senserialdf.iloc[1, 0:2].values.tolist() + [senserialdf.iloc[0, 0]] + [senserialdf.columns[2]],
'Variable details': ['Integer',
                     'One of: (\'2 weeks\',\'4 weeks\')',
                     'One of: (\'' + '\', \''.join(senserialdf.iloc[0,2:].values.tolist()) + '\')',
                     'Text']
}
df = pd.DataFrame(tabledata)

# save table as csv file
tablepath = r'Y:\CIS-PD Study\MJFF Curation\tables'
filename = os.path.join(tablepath, 'summarytable_sensor_serialnum.csv')
# Hand the path straight to pandas, which opens and closes the file itself.
# The former `with open(filename, 'wb') as f:` wrapper only created a second,
# unused binary handle on the same file.
df.to_csv(filename, sep=',', index=False)

In [54]:
# Display the current summary table `df` (the recorded output is the sensor serial number table).
df

Unnamed: 0,Question,Variable Name,Variable details
0,Subject ID,SubjID,Integer
1,Visit,Visit,"One of: ('2 weeks','4 weeks')"
2,Sensor Location,Sensor Location,"One of: ('anterior thigh left', 'anterior thig..."
3,Serial Number,Serial Number,Text


# Extract data for Methods section
- number of subjects
- subjects per site
- removed subjects

In [27]:
# Preview the first three form101 rows.
# NOTE(review): the output shows per-subject fields (ConsentDate, Age); consider clearing it before sharing.
data101.head(3)

Unnamed: 0,SubjectCode,SiteID,VisitNm,Q11,Q12,ConsentDate,Age,Q03,Q04,Q05,Q06,Q07,Q08,Q09,Q10
0,1000.0,1313.0,Baseline,A1784,10.3.2,2017-06-15,63.0,Yes,Yes,Yes,Yes,Yes,Yes,No,No
1,1001.0,1313.0,Baseline,A1778,10.3.2,2017-06-15,64.0,No,Yes,Yes,Yes,Yes,Yes,No,No
2,1002.0,1313.0,Baseline,A1586,10.3.2,2017-06-16,51.0,No,Yes,Yes,Yes,Yes,Yes,No,No


In [None]:
# data101.groupby(pd.cut(Pxx.index, bins=binedges)).mean().fillna(0)

In [28]:
# Builds a groupby object keyed on SubjectCode; no aggregation is applied, so the
# cell only shows the object's repr.
data101.groupby(by=data101.SubjectCode)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000020272935E80>

In [34]:
# Distinct subject codes present in form101.
pd.unique(data101.SubjectCode.values)

array([1000., 1001., 1002., 1003., 1004., 1005., 1006., 1007., 1008.,
       1009., 1010., 1011., 1013., 1014., 1015., 1016., 1017., 1018.,
       1019., 1020., 1021., 1022., 1023., 1024., 1025., 1026., 1027.,
       1028., 1029., 1030., 1031., 1032., 1033., 1034., 1035., 1036.,
       1037., 1038., 1039., 1040., 1041., 1042., 1043., 1044., 1045.,
       1046., 1047., 1048., 1049., 1050., 1051., 1052., 1053., 1054.,
       1055., 1056.])

### total number of patients - DONE

In [32]:
# Number of distinct subjects in form101.
len(pd.unique(data101.SubjectCode.values))

56

In [33]:
# Number of form101 rows; if this equals the distinct-subject count, there is one row per subject.
len(data101.SubjectCode.values)

56

### check 4 sites - PASS

In [37]:
# checks out
# Distinct site IDs in form101; four are expected for this study.
sites = pd.unique(data101.SiteID.values)
print(sites)

[1313. 1332. 1396. 1018.]


### check n for each site - DONE

In [None]:
# All form101 rows for site 1313 (displays the whole filtered frame).
data101.loc[data101.SiteID == 1313]

In [40]:
# n for site 1313 (number of form101 rows with this SiteID)
len(data101.loc[data101.SiteID == 1313])

13

In [None]:
# All form101 rows for site 1332 (displays the whole filtered frame).
data101.loc[data101.SiteID == 1332]

In [41]:
# n for site 1332 (number of form101 rows with this SiteID)
len(data101.loc[data101.SiteID == 1332])

21

In [None]:
# All form101 rows for site 1396 (displays the whole filtered frame).
data101.loc[data101.SiteID == 1396]

In [42]:
# n for site 1396 (number of form101 rows with this SiteID)
len(data101.loc[data101.SiteID == 1396])

12

In [None]:
# All form101 rows for site 1018 (displays the whole filtered frame).
data101.loc[data101.SiteID == 1018]

In [43]:
# n for site 1018 (number of form101 rows with this SiteID)
len(data101.loc[data101.SiteID == 1018])

10

### Decode site code - DONE

Sites were determined from subjenrollment.sas7bdat by pairing each SiteID with its FoxInsightID, which contains the initials of the institution.
- Site 1313 - University of Alabama
- Site 1332 - Northwestern University
- Site 1396 - University of Rochester
- Site 1018 - University of Cincinnati

# Patient onboarding illustration?
Pt assessed for eligibility, n=
Screen failed, n=
Consented, n=		FORM 126
Completed testing, n= FORM 126
Non-completion, n=
Why not completed - death, etc. look on the form
Data used/shared, n= 
Unusable data, n=