# Data curation: MDS part III scores

Form 238v1: MDS-UPDRS Scoring Summary
Form 508v1: MDS-UPDRS Part 3 Scoring Summary

In [3]:
# import packages
import os
import platform # don't need
import pandas as pd
import re
import h5py #save hdf5 but pandas has a function...
import numpy as np

  from ._conv import register_converters as _register_converters


In [101]:
# Code list: read the SAS file, remove every 'z' from the column names, and
# turn any bytes cells into str.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\codelist.sas7bdat'
codelist = pd.read_sas(filename)
codelist.columns = codelist.columns.str.replace('z', '')
codelist = codelist.applymap(
    lambda cell: cell if not isinstance(cell, bytes) else cell.decode()
)

# Data dictionary: text is decoded by read_sas itself (iso-8859-1), so only
# the column names need the same 'z' removal.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\datadictionary.sas7bdat'
datadict = pd.read_sas(filename, encoding='iso-8859-1', format='sas7bdat')
datadict.columns = datadict.columns.str.replace('z', '')

In [None]:
# Preview the first rows of the code list.
codelist.head()

In [None]:
# Preview the first rows of the data dictionary.
datadict.head()

In [None]:
# Data-dictionary rows whose FormID is 238.
datadict.loc[datadict.FormID == 238.0]

# Combine Forms 508 and 238 for MDS-UPDRS Part 3 scores

In [20]:
# Load the decoded forms. Each one gets the same treatment: SubjectCode and
# VisitNm are renamed, and index=str converts the row labels to strings.

# Form 238
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\form238.h5'
form238 = pd.read_hdf(filename).rename(
    index=str,
    columns={'SubjectCode': 'Subject ID', 'VisitNm': 'Visit'}
)

# Form 508
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\form508.h5'
form508 = pd.read_hdf(filename).rename(
    index=str,
    columns={'SubjectCode': 'Subject ID', 'VisitNm': 'Visit'}
)

# Clean form 508

In [30]:
# List form 508's column names in their stored order (used to plan the reordering below).
form508.columns.values

array(['Subject ID', 'SiteID', 'Visit', 'FormDate', 'Q33', 'Q34', 'Q35',
       'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44',
       'Q45', 'Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q51', 'Q52', 'Q53',
       'Q54', 'Q55', 'Q56', 'Q57', 'Q58', 'Q59', 'Q60', 'Q61', 'Q62',
       'Q63', 'Q64', 'Q65', 'Q02', 'Q01', 'Q66', 'Q67', 'Q68',
       'DataCollected', 'Qb2_UTC', 'FormTime'], dtype=object)

In [31]:
# reorder form 508 columns: identifiers and timing first, then Q01/Q02,
# the Q33-Q68 items in numeric order, and DataCollected last
leading_508 = ['Subject ID', 'SiteID', 'Visit', 'FormDate',
               'Qb2_UTC', 'FormTime', 'Q01', 'Q02']
items_508 = ['Q{:02d}'.format(num) for num in range(33, 69)]
form508 = form508[leading_508 + items_508 + ['DataCollected']]

In [33]:
# Give the form-508-specific columns descriptive names.
# Note: Q01 and Q02 on form 508 are not the same questions as Q01 and Q02 on form 238.
renames_508 = {
    'Q01': 'ParticipantState',
    'Q02': 'Hours since last PD medication dose',
    'Qb2_UTC': 'UTC',
}
form508 = form508.rename(columns=renames_508, index=str)

In [34]:
# Confirm the new column order and names of form 508.
form508.columns

Index(['Subject ID', 'SiteID', 'Visit', 'FormDate', 'UTC', 'FormTime',
       'ParticipantState', 'Hours since last PD medication dose', 'Q33', 'Q34',
       'Q35', 'Q36', 'Q37', 'Q38', 'Q39', 'Q40', 'Q41', 'Q42', 'Q43', 'Q44',
       'Q45', 'Q46', 'Q47', 'Q48', 'Q49', 'Q50', 'Q51', 'Q52', 'Q53', 'Q54',
       'Q55', 'Q56', 'Q57', 'Q58', 'Q59', 'Q60', 'Q61', 'Q62', 'Q63', 'Q64',
       'Q65', 'Q66', 'Q67', 'Q68', 'DataCollected'],
      dtype='object')

# Clean form 238

In [22]:
# clean form 238

# drop form238 columns Q01-Q28 and Q69-Q74
dropped_238 = (['Q{:02d}'.format(num) for num in range(1, 29)]
               + ['Q{:02d}'.format(num) for num in range(69, 75)])
form238 = form238.drop(columns=dropped_238)

# reorder columns: identifiers, Q29-Q68 in numeric order, DataCollected last
kept_238 = ['Q{:02d}'.format(num) for num in range(29, 69)]
form238 = form238[['Subject ID', 'SiteID', 'Visit', 'FormDate']
                  + kept_238
                  + ['DataCollected']]

# Q29-Q32 are kept; Form 508 has no such columns, so its rows will hold NaN there once the forms are combined

In [None]:
# Preview the first three rows of the cleaned form 238.
form238.head(3)

# Combine forms 508 and 238

In [39]:
# concatenate forms 508 and 238 dataframes into one frame with a fresh 0..n-1 index.
# The two forms do not have identical columns, so a column missing from one
# form is filled with NaN for that form's rows.
# sort=False states the column ordering explicitly rather than relying on the
# implicit sort that pandas warns about; the columns are put in their final
# order by the explicit selection that follows.
combo = pd.concat([form238, form508], ignore_index=True, sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [42]:
# reorder columns: identifiers, timing and participant state first,
# then Q29-Q68 in numeric order, with DataCollected last
combo_front = ['Subject ID', 'SiteID', 'Visit', 'FormDate', 'UTC', 'FormTime',
               'ParticipantState', 'Hours since last PD medication dose']
combo_items = ['Q{:02d}'.format(num) for num in range(29, 69)]
combo = combo[combo_front + combo_items + ['DataCollected']]

# Change column names to short description

In [44]:
# Current column names of the combined frame, in order:
# 8 descriptive columns, Q29-Q68, then DataCollected.
oldname = (
    ['Subject ID', 'SiteID', 'Visit', 'FormDate', 'UTC', 'FormTime',
     'ParticipantState', 'Hours since last PD medication dose']
    + ['Q{:02d}'.format(num) for num in range(29, 69)]
    + ['DataCollected']
)

In [45]:
# Short item labels for Q29-Q68 (e.g. Q54 -> '3.13'), listed in the same order
# as oldname so the two lists can be paired position by position.
newname = ['Subject ID', 'SiteID', 'Visit', 'FormDate', 'UTC', 'FormTime',
           'ParticipantState', 'Hours since last PD medication dose',
# questions 29-32
'3a',
'3b',
'3c',
'3C1',
# questions 33-34
'3.1',
'3.2',
# questions 35-39
'3.3 Neck',
'3.3 Right Upper Extremity',
'3.3 Left Upper Extremity',
'3.3 Right Lower Extremity',
'3.3 Left Lower Extremity',
# questions 40-41
'3.4 Right Hand',
'3.4 Left Hand',
# questions 42-43
'3.5 Right Hand',
'3.5 Left Hand',
# questions 44-45
'3.6 Right Hand',
'3.6 Left Hand',
# questions 46-47
'3.7 Right Foot',
'3.7 Left Foot',
# questions 48-49
'3.8 Right Leg',
'3.8 Left Leg',
# questions 50-55
'3.9',
'3.10',
'3.11',
'3.12',
'3.13',
'3.14',
# questions 56-57
'3.15 Right Hand',
'3.15 Left Hand',
# questions 58-59
'3.16 Right Hand',
'3.16 Left Hand',
# questions 60-64
'3.17 Right Upper Extremity',
'3.17 Left Upper Extremity',
'3.17 Right Lower Extremity',
'3.17 Left Lower Extremity',
'3.17 Lip-Jaw',
# question 65
'3.18',
# questions 66-67
'3.19A',
'3.19B',
# question 68
'3.20',
'DataCollected']

In [46]:
# Pair each current column name with its short label, position by position.
# zip() stops at the shorter list without complaint, which would silently
# leave trailing columns unrenamed, so the lengths are checked first.
assert len(oldname) == len(newname), 'oldname and newname must have the same length'
col_dict = dict(zip(oldname, newname))
print(col_dict)

{'Subject ID': 'Subject ID', 'SiteID': 'SiteID', 'Visit': 'Visit', 'FormDate': 'FormDate', 'UTC': 'UTC', 'FormTime': 'FormTime', 'ParticipantState': 'ParticipantState', 'Hours since last PD medication dose': 'Hours since last PD medication dose', 'Q29': '3a', 'Q30': '3b', 'Q31': '3c', 'Q32': '3C1', 'Q33': '3.1', 'Q34': '3.2', 'Q35': '3.3 Neck', 'Q36': '3.3 Right Upper Extremity', 'Q37': '3.3 Left Upper Extremity', 'Q38': '3.3 Right Lower Extremity', 'Q39': '3.3 Left Lower Extremity', 'Q40': '3.4 Right Hand', 'Q41': '3.4 Left Hand', 'Q42': '3.5 Right Hand', 'Q43': '3.5 Left Hand', 'Q44': '3.6 Right Hand', 'Q45': '3.6 Left Hand', 'Q46': '3.7 Right Foot', 'Q47': '3.7 Left Foot', 'Q48': '3.8 Right Leg', 'Q49': '3.8 Left Leg', 'Q50': '3.9', 'Q51': '3.10', 'Q52': '3.11', 'Q53': '3.12', 'Q54': '3.13', 'Q55': '3.14', 'Q56': '3.15 Right Hand', 'Q57': '3.15 Left Hand', 'Q58': '3.16 Right Hand', 'Q59': '3.16 Left Hand', 'Q60': '3.17 Right Upper Extremity', 'Q61': '3.17 Left Upper Extremity', 'Q62

In [47]:
# Apply the old-name -> short-label mapping; index=str also converts the row labels to strings.
combo = combo.rename(columns=col_dict, index=str)

In [None]:
# Preview the first three rows after renaming.
combo.head(3)

In [49]:
# get rid of decimals
columns = ['Subject ID', 'SiteID']
for col in columns:
    # Cast every non-missing value to int; missing values pass through unchanged.
    combo[col] = combo[col].apply(lambda x: x if pd.isnull(x) else int(x))

# keeps columns float but removes decimals: this only changes how floats are
# rendered (0 decimal places), and it applies to every frame displayed afterwards.
# The fully qualified option name is used because the bare 'precision' pattern
# can match more than one option in newer pandas and then raises OptionError.
pd.set_option('display.precision', 0)

In [None]:
# Preview two rows to check how the ID columns now display.
combo.head(2)

In [53]:
# save file as updrs_part3.h5
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\updrs_part3.h5'
# to_hdf opens and closes the file itself (mode='w' replaces any existing file).
# Wrapping it in open(filename, 'wb') truncated the same path and kept a second
# handle open while the HDF5 library was writing to it, so that wrapper is gone.
combo.to_hdf(filename, key='combo', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['Visit', 'FormTime', 'ParticipantState', '3a', '3b', '3c', '3.19A', '3.19B', '3.20', 'DataCollected']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [None]:
# open file: read the saved table back to check the write.
# The key is named explicitly, matching the key used when the file was written.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\updrs_part3.h5'
pd.read_hdf(filename, key='combo')

# Create updrs_124
- Use Form 238 only
- Remove part 3
- Keep parts 1, 2, 4

Columns kept:
- Parts 1-2: Qa, Qb, Q01 - Q28
- Part 4: Q69 - Q74, Qc

In [4]:
# Load a fresh copy of decoded form 238; rename the subject/visit columns and
# convert the row labels to strings.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\form238.h5'
f238 = pd.read_hdf(filename).rename(
    index=str,
    columns={'SubjectCode': 'Subject ID', 'VisitNm': 'Visit'}
)

In [5]:
# Column names of form 238 in their stored order.
f238.columns

Index(['Subject ID', 'SiteID', 'Visit', 'FormDate', 'Q02', 'Q03', 'Q04', 'Q05',
       'Q06', 'Q07', 'Q09', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16',
       'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
       'Q27', 'Q28', 'Q32', 'Q33', 'Q34', 'Q35', 'Q36', 'Q37', 'Q38', 'Q39',
       'Q40', 'Q41', 'Q42', 'Q43', 'Q44', 'Q45', 'Q46', 'Q47', 'Q48', 'Q49',
       'Q50', 'Q51', 'Q52', 'Q53', 'Q54', 'Q55', 'Q56', 'Q57', 'Q58', 'Q59',
       'Q60', 'Q61', 'Q62', 'Q63', 'Q64', 'Q65', 'Q69', 'Q70', 'Q71', 'Q72',
       'Q73', 'Q74', 'DataCollected', 'Q01', 'Q08', 'Q29', 'Q30', 'Q31', 'Q66',
       'Q67', 'Q68'],
      dtype='object')

In [6]:
# remove columns Q29-Q68.
# The columns are dropped by label rather than by position (columns[30:64]), so
# the result does not depend on the stored column order, and nothing is mutated
# in place. Running the cell a second time now fails with a KeyError instead of
# silently removing whichever columns have moved into those positions.
part3_cols = ['Q{:02d}'.format(num) for num in range(29, 69)]
f238 = f238.drop(columns=part3_cols)

In [None]:
# Preview the first five rows after dropping Q29-Q68.
f238.head(5)

In [8]:
# reorder columns: identifiers, Q01-Q28, Q69-Q74, then DataCollected
f238_items = (['Q{:02d}'.format(num) for num in range(1, 29)]
              + ['Q{:02d}'.format(num) for num in range(69, 75)])
f238 = f238[['Subject ID', 'SiteID', 'Visit', 'FormDate']
            + f238_items
            + ['DataCollected']]

In [9]:
# Confirm the reordered columns.
f238.columns

Index(['Subject ID', 'SiteID', 'Visit', 'FormDate', 'Q01', 'Q02', 'Q03', 'Q04',
       'Q05', 'Q06', 'Q07', 'Q08', 'Q09', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14',
       'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24',
       'Q25', 'Q26', 'Q27', 'Q28', 'Q69', 'Q70', 'Q71', 'Q72', 'Q73', 'Q74',
       'DataCollected'],
      dtype='object')

In [143]:
# Read the data dictionary (text decoded as iso-8859-1) and remove every 'z'
# from its column names.
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\datadictionary.sas7bdat'
datadict = pd.read_sas(filename, encoding='iso-8859-1', format='sas7bdat')
datadict.columns = datadict.columns.str.replace('z', '')

In [279]:
# get question names: field number and field name of every Form 238 entry,
# re-indexed from 0
questions = (
    datadict
    .loc[datadict.FormID == 238.0, ['FieldNb', 'FieldNm']]
    .reset_index(drop=True)
)

In [280]:
# Drop the rows at positions 30-69 and renumber the index from 0.
# NOTE(review): presumably these are the Part 3 fields - confirm against the data dictionary.
# This is positional, so the cell must only be run once per fresh `questions`.
questions = questions.drop(questions.index[30:70]).reset_index(drop=True)
# remove z in FieldNb column
questions['FieldNb'] = questions['FieldNb'].str.replace('z', '')
# change first 2 question names: use the field number as the name for rows 0 and 1.
# .loc writes into the frame itself; the chained form questions.FieldNm[0] = ...
# assigns through an intermediate Series, which triggers SettingWithCopyWarning
# and is not guaranteed to update the frame.
questions.loc[0, 'FieldNm'] = questions.loc[0, 'FieldNb']
questions.loc[1, 'FieldNm'] = questions.loc[1, 'FieldNb']

In [251]:
def keepleftstring(string, sep=' '):
    """Return the part of ``string`` that comes before the first ``sep``.

    The separator defaults to a single space. If ``sep`` does not occur in
    ``string``, the whole string is returned unchanged.
    """
    pieces = string.split(sep, maxsplit=1)
    return pieces[0]

In [281]:
# change question descriptions: keep only the text before the first space
# skip indices: 0, 1, 2, 9, 36 (these rows keep their full text)
skip_rows = {0, 1, 2, 9, 36}
# Iterate over a snapshot of the names so the frame can be updated inside the loop.
# The position i is used as the row label, which relies on the 0..n-1 index
# produced by the earlier reset_index(drop=True).
for i, k in enumerate(questions['FieldNm'].tolist()):
    if i not in skip_rows:
        # .loc writes into the frame itself; the chained questions.FieldNm[i] = ...
        # form is not guaranteed to update it.
        questions.loc[i, 'FieldNm'] = keepleftstring(k)

In [282]:
# remove FieldNb rows: DataCollected, FormDate, and Notes
# (the first two rows, then the last row of what remains)
first_two = questions.index[0:2]
questions = questions.drop(first_two).reset_index(drop=True)
last_row = questions.index[-1]
questions = questions.drop(last_row).reset_index(drop=True)

### Rename f238 column names then save

In [300]:
# Column names of f238 before they are replaced with descriptive labels.
f238.columns

Index(['Subject ID', 'SiteID', 'Visit', 'FormDate', 'Q01', 'Q02', 'Q03', 'Q04',
       'Q05', 'Q06', 'Q07', 'Q08', 'Q09', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14',
       'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24',
       'Q25', 'Q26', 'Q27', 'Q28', 'Q69', 'Q70', 'Q71', 'Q72', 'Q73', 'Q74',
       'DataCollected'],
      dtype='object')

In [303]:
# New f238 column labels, matched to the existing columns by position:
# items 1.1-1.13 (with the two descriptive questions interleaved), 2.1-2.13
# and 4.1-4.6.
part1_first = ['1.{}'.format(num) for num in range(1, 7)]
part1_rest = ['1.{}'.format(num) for num in range(7, 14)]
part2 = ['2.{}'.format(num) for num in range(1, 14)]
part4 = ['4.{}'.format(num) for num in range(1, 7)]
newcolnames = (
    ['Subject ID', 'SiteID', 'Visit', 'FormDate', 'Primary source of information']
    + part1_first
    + ['Who is filling out this questionnaire']
    + part1_rest
    + part2
    + part4
    + ['DataCollected']
)
f238.columns = newcolnames

In [305]:
# Assigns the same labels a second time; if the labels are already in place this changes nothing.
f238.columns = newcolnames

In [306]:
# Confirm the descriptive column labels.
f238.columns

Index(['Subject ID', 'SiteID', 'Visit', 'FormDate',
       'Primary source of information', '1.1', '1.2', '1.3', '1.4', '1.5',
       '1.6', 'Who is filling out this questionnaire', '1.7', '1.8', '1.9',
       '1.10', '1.11', '1.12', '1.13', '2.1', '2.2', '2.3', '2.4', '2.5',
       '2.6', '2.7', '2.8', '2.9', '2.10', '2.11', '2.12', '2.13', '4.1',
       '4.2', '4.3', '4.4', '4.5', '4.6', 'DataCollected'],
      dtype='object')

In [297]:
# Question names derived from the data dictionary, for comparison with the f238 column labels.
questions.FieldNm.values

array(['Primary source of information', '1.1', '1.2', '1.3', '1.4', '1.5',
       '1.6', 'Who is filling out this questionnaire', '1.7', '1.8',
       '1.9', '1.10', '1.11', '1.12', '1.13', '2.1', '2.2', '2.3', '2.4',
       '2.5', '2.6', '2.7', '2.8', '2.9', '2.10', '2.11', '2.12', '2.13',
       '4.1', '4.2', '4.3', '4.4', '4.5', '4.6'], dtype=object)

In [None]:
# Show only the first rows instead of rendering the whole participant-level table.
f238.head()

In [308]:
# save file as updrs_124.h5
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\updrs_124.h5'
# to_hdf opens and closes the file itself (mode='w' replaces any existing file).
# Wrapping it in open(filename, 'wb') truncated the same path and kept a second
# handle open while the HDF5 library was writing to it, so that wrapper is gone.
f238.to_hdf(filename, key='f238', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['Visit', 'Primary source of information', 'Who is filling out this questionnaire', 'DataCollected']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [None]:
# updrs parts 1, 2, and 4: read the saved table back from its 'f238' key
filename = r'//FS2.smpp.local\RTO\CIS-PD MUSC\decoded_forms\updrs_124.h5'
pd.read_hdf(filename, key='f238')