In [55]:
import pandas
import numpy

# Read the three input tables; every path is relative to the notebook's working directory.
df_sigfall = pandas.read_csv('../source_data/sigfall_main.csv')
df_patnos = pandas.read_csv('../source_data/non_hc_patnos.csv')
df_updrs_ii = pandas.read_csv('../source_data/MDS_UPDRS_Part_II__Patient_Questionnaire.csv')

# Flatten every cell of the patient-number table into one plain Python list.
# NOTE(review): "non_hc" presumably means non-healthy-control participants — confirm.
non_hc_patnos = [patno for patno in df_patnos.values.flat]

# Last expression of the cell: preview the first rows of the fall table.
df_sigfall.head()

Unnamed: 0,PATNO,EVENT_ID,INFODT,AGE_AT_EVENT,PREV_SIGFALL,SIGFALL
0,3001,V15,03/2019,73.2,0,0.0
1,3001,V17,09/2021,75.7,0,1.0
2,3002,V15,03/2019,75.6,0,0.0
3,3002,V17,09/2021,78.1,0,0.0
4,3003,V15,03/2019,64.7,0,0.0


In [56]:
# --- Create dataframe that combines UPDRSii data with SIGFALL & PREV_SIGFALL ---

def lookup_value(dframe,row,value):
    """Fetch column ``value`` from the ``dframe`` row that shares ``row``'s PATNO and INFODT.

    Parameters:
        dframe: table with ``PATNO`` and ``INFODT`` columns plus the column named by ``value``.
        row: mapping-like object providing ``row['PATNO']`` and ``row['INFODT']``.
        value: label of the column to read from ``dframe``.

    Returns:
        The first matching entry of ``dframe[value]``, or ``numpy.nan`` when no
        row of ``dframe`` has the same patient number and date.
    """
    wanted_patno = row['PATNO']
    wanted_date = row['INFODT']

    same_date = dframe.INFODT == wanted_date
    same_patient = dframe.PATNO == wanted_patno
    hits = dframe[same_date & same_patient][value]

    # Guard clause: nothing matched, so there is no value to report.
    if hits.empty:
        return numpy.nan
    return hits.values[0]
    
# Keep only the patients listed in non_hc_patnos and discard bookkeeping columns.
df_updrs_ii_sigfall_base = (
    df_updrs_ii[df_updrs_ii['PATNO'].isin(non_hc_patnos)]
    .drop(['REC_ID','PAG_NAME','NUPSOURC','ORIG_ENTRY','LAST_UPDATE'], axis=1)
)

# Attach the two fall columns by looking each row up in df_sigfall on (PATNO, INFODT);
# rows without a match receive NaN from lookup_value.
for fall_column in ('SIGFALL', 'PREV_SIGFALL'):
    df_updrs_ii_sigfall_base[fall_column] = df_updrs_ii_sigfall_base.apply(
        lambda row: lookup_value(df_sigfall,row,fall_column), axis=1)

# Drop every row that still has a missing value in any column (this includes unmatched rows).
df_updrs_ii_sigfall_base = df_updrs_ii_sigfall_base.dropna()
df_updrs_ii_sigfall_base.head()

Unnamed: 0,PATNO,EVENT_ID,INFODT,NP2SPCH,NP2SALV,NP2SWAL,NP2EAT,NP2DRES,NP2HYGN,NP2HWRT,NP2HOBB,NP2TURN,NP2TRMR,NP2RISE,NP2WALK,NP2FREZ,NP2PTOT,PREV_SIGFALL,SIGFALL
24,3001,V15,03/2019,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,0.0,0.0,13.0,0.0,0.0
25,3001,V17,09/2021,1.0,0.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,0.0,16.0,0.0,1.0
42,3002,V15,03/2019,2.0,3.0,1.0,2.0,2.0,1.0,3.0,2.0,2.0,1.0,2.0,3.0,0.0,24.0,0.0,0.0
43,3002,V17,09/2021,2.0,4.0,3.0,2.0,1.0,2.0,1.0,2.0,3.0,2.0,4.0,4.0,0.0,30.0,0.0,0.0
59,3003,V15,03/2019,1.0,2.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,12.0,0.0,0.0


In [37]:
# Show the column labels of the combined frame.
# NOTE(review): the recorded output of this cell already lists the D_* delta columns,
# which the notebook only creates in a later cell — the cell was apparently executed
# out of order; confirm with Restart Kernel -> Run All.
df_updrs_ii_sigfall_base.columns

Index(['PATNO', 'EVENT_ID', 'INFODT', 'NP2SPCH', 'NP2SALV', 'NP2SWAL',
       'NP2EAT', 'NP2DRES', 'NP2HYGN', 'NP2HWRT', 'NP2HOBB', 'NP2TURN',
       'NP2TRMR', 'NP2RISE', 'NP2WALK', 'NP2FREZ', 'NP2PTOT', 'PREV_SIGFALL',
       'SIGFALL', 'D_NP2SPCH', 'D_NP2SALV', 'D_NP2SWAL', 'D_NP2EAT',
       'D_NP2DRES', 'D_NP2HYGN', 'D_NP2HWRT', 'D_NP2HOBB', 'D_NP2TURN',
       'D_NP2TRMR', 'D_NP2RISE', 'D_NP2WALK', 'D_NP2FREZ', 'D_NP2PTOT'],
      dtype='object')

In [57]:
# Compute delta values: each score minus the same patient's score at the previous sample event

def get_infodts(dframe,patno):
    """Return, as a list, the ``INFODT`` values of every ``dframe`` row whose PATNO equals ``patno``.

    The values keep the order in which the rows appear in ``dframe``; an
    unknown patient number gives an empty list.
    """
    patient_rows = dframe[dframe.PATNO == patno]
    return [infodt for infodt in patient_rows['INFODT'].values]

def date_to_tuple(date):
    """Split a date string into ``(month, year, date)``.

    ``month`` is the first two characters, ``year`` the last four, and the
    untouched input string is carried along as the third element.
    """
    month, year = date[:2], date[-4:]
    return (month,year,date)

def date_sort(dates):
    """Convert each date string with ``date_to_tuple`` and return the tuples in chronological order.

    Ordering is by the year string first and the month string second, so it
    relies on both being fixed-width text (as ``date_to_tuple`` slices them).
    """
    keyed = [date_to_tuple(entry) for entry in dates]
    keyed.sort(key = lambda parts: (parts[1],parts[0]))
    return keyed

def get_index(date,dates):
    """Locate ``date`` among the third elements of the tuples in ``dates``.

    Returns ``(True, position)`` using the position of the LAST matching
    entry, or the sentinel ``(False, 999)`` when nothing matches.
    """
    positions = [pos for pos, entry in enumerate(dates) if entry[2] == date]
    if positions:
        # Several entries may carry the same date; the final one wins.
        return (True,positions[-1])
    return (False,999)

def prev_value(dframe,row,q):
    """Return column ``q`` at the patient's event immediately preceding ``row``'s date.

    The patient's ``INFODT`` values in ``dframe`` are ordered chronologically
    with ``date_sort``; the entry just before ``row['INFODT']`` identifies the
    previous event, and the first ``dframe`` row for that patient and date
    supplies the value.

    Parameters:
        dframe: table with ``PATNO``, ``INFODT`` and ``q`` columns.
        row: mapping-like object providing ``row['PATNO']`` and ``row['INFODT']``.
        q: label of the column to read.

    Returns:
        The previous event's value, or ``numpy.nan`` when ``row``'s date is not
        found for the patient, when it is the patient's earliest date, or when
        the lookup of the previous value fails.
    """
    patno = row['PATNO']
    infodt = row['INFODT']
    sorted_dates = date_sort(get_infodts(dframe,patno))
    found, position = get_index(infodt,sorted_dates)

    # No predecessor: the date is unknown for this patient, or it is already the earliest one.
    if not found or position < 1:
        return numpy.nan

    prev_date = sorted_dates[position - 1][2]
    try:
        return dframe[(dframe.INFODT == prev_date ) & (dframe.PATNO == patno)][q].values[0]
    except (IndexError, KeyError):
        # Best-effort fallback kept from the original, but limited to the lookup
        # failures (no matching row / missing column) instead of a bare ``except``
        # that would also hide KeyboardInterrupt and genuine programming errors.
        return numpy.nan

def q_delta_actual(dframe,row,q):
    """Return the change in column ``q`` between ``row``'s event and the patient's previous event.

    The current value is read from the first ``dframe`` row that matches
    ``row``'s PATNO and INFODT; the earlier value comes from ``prev_value``.

    Parameters:
        dframe: table with ``PATNO``, ``INFODT`` and ``q`` columns.
        row: mapping-like object providing ``row['PATNO']`` and ``row['INFODT']``.
        q: label of the column whose delta is wanted.

    Returns:
        ``current - previous``. When there is no previous event ``prev_value``
        yields NaN, so the difference is NaN as well; values that cannot be
        subtracted also give ``numpy.nan``.

    Raises:
        IndexError: if ``dframe`` has no row for ``row``'s PATNO and INFODT.
        KeyError: if ``q`` is not a column of ``dframe``.
    """
    patno = row['PATNO']
    infodt = row['INFODT']

    current = dframe[(dframe.INFODT == infodt ) & (dframe.PATNO == patno)][q].values[0]
    prev = prev_value(dframe,row,q)

    try:
        return current - prev
    except TypeError:
        # Non-numeric entries cannot be subtracted; report "no delta" rather than
        # swallowing every possible exception with a bare ``except``.
        return numpy.nan

# Score columns for which a delta is computed (thirteen NP2* items plus the NP2PTOT total).
updrs_qs = [
    'NP2SPCH','NP2SALV','NP2SWAL','NP2EAT','NP2DRES','NP2HYGN','NP2HWRT',
    'NP2HOBB','NP2TURN','NP2TRMR','NP2RISE','NP2WALK','NP2FREZ','NP2PTOT',
]
# Pair every source column with the name of its delta column, e.g. ('NP2WALK', 'D_NP2WALK').
dq_pairs = [(question, 'D_' + question) for question in updrs_qs]

# Add one D_* column per question; each row's delta is computed against the same frame.
for source_col, delta_col in dq_pairs:
    df_updrs_ii_sigfall_base[delta_col] = df_updrs_ii_sigfall_base.apply(
        lambda row: q_delta_actual(df_updrs_ii_sigfall_base,row,source_col), axis=1)




In [67]:
# Extracting subsets
# short form = minimum viable number of columns (total-score delta plus the two fall columns)

# Rows lacking a delta (e.g. no previous event for the patient) are dropped before export.
df_short_form = df_updrs_ii_sigfall_base[['D_NP2PTOT','PREV_SIGFALL','SIGFALL']].dropna()
df_short_form.to_csv('../working_data/updrsii_short_form.csv', index=False)

# quick analysis: summary statistics for SIGFALL == 1.0 first, then SIGFALL == 0.0
for sigfall_filter in ('SIGFALL == 1.0', 'SIGFALL == 0.0'):
    print(df_short_form.query(sigfall_filter).describe())
# -> Those who fall do have, on average, larger changes in TOTAL score
#    (descriptive comparison only; no significance test is run here)

df_short_form.head()

       D_NP2PTOT  PREV_SIGFALL  SIGFALL
count  54.000000     54.000000     54.0
mean    1.648148      0.333333      1.0
std     3.577484      0.475831      0.0
min    -8.000000      0.000000      1.0
25%    -1.000000      0.000000      1.0
50%     1.500000      0.000000      1.0
75%     4.750000      1.000000      1.0
max    10.000000      1.000000      1.0
        D_NP2PTOT  PREV_SIGFALL  SIGFALL
count  405.000000    405.000000    405.0
mean     0.740741      0.076543      0.0
std      3.868436      0.266194      0.0
min    -13.000000      0.000000      0.0
25%     -1.000000      0.000000      0.0
50%      0.000000      0.000000      0.0
75%      2.000000      0.000000      0.0
max     20.000000      1.000000      0.0


Unnamed: 0,D_NP2PTOT,PREV_SIGFALL,SIGFALL
25,3.0,0.0,1.0
43,6.0,0.0,0.0
110,4.0,0.0,0.0
163,9.0,0.0,1.0
191,-13.0,1.0,0.0


In [44]:
# Show the column labels again: the raw NP2* scores, the two fall columns, and the
# D_* delta columns added by the delta-computation loop should all be present.
df_updrs_ii_sigfall_base.columns

Index(['PATNO', 'EVENT_ID', 'INFODT', 'NP2SPCH', 'NP2SALV', 'NP2SWAL',
       'NP2EAT', 'NP2DRES', 'NP2HYGN', 'NP2HWRT', 'NP2HOBB', 'NP2TURN',
       'NP2TRMR', 'NP2RISE', 'NP2WALK', 'NP2FREZ', 'NP2PTOT', 'PREV_SIGFALL',
       'SIGFALL', 'D_NP2SPCH', 'D_NP2SALV', 'D_NP2SWAL', 'D_NP2EAT',
       'D_NP2DRES', 'D_NP2HYGN', 'D_NP2HWRT', 'D_NP2HOBB', 'D_NP2TURN',
       'D_NP2TRMR', 'D_NP2RISE', 'D_NP2WALK', 'D_NP2FREZ', 'D_NP2PTOT'],
      dtype='object')

In [66]:
# All thirteen per-question deltas (the D_NP2PTOT total is left out) plus the two fall columns;
# rows with any missing value are dropped before the table is exported.
df_qs = df_updrs_ii_sigfall_base[[
    'D_NP2SPCH', 'D_NP2SALV', 'D_NP2SWAL', 'D_NP2EAT',
    'D_NP2DRES', 'D_NP2HYGN', 'D_NP2HWRT', 'D_NP2HOBB', 'D_NP2TURN',
    'D_NP2TRMR', 'D_NP2RISE', 'D_NP2WALK', 'D_NP2FREZ','PREV_SIGFALL','SIGFALL',
]].dropna()
df_qs.to_csv('../working_data/updrsii_all_qs.csv', index=False)

# Summary statistics for SIGFALL == 1.0 first, then SIGFALL == 0.0.
for sigfall_filter in ('SIGFALL == 1.0', 'SIGFALL == 0.0'):
    print(df_qs.query(sigfall_filter).describe())

       D_NP2SPCH  D_NP2SALV  D_NP2SWAL   D_NP2EAT  D_NP2DRES  D_NP2HYGN  \
count  54.000000  54.000000  54.000000  54.000000  54.000000  54.000000   
mean    0.055556  -0.092593   0.129630   0.037037   0.185185   0.074074   
std     0.762733   1.050939   0.584268   0.699406   0.754210   0.609728   
min    -2.000000  -3.000000  -1.000000  -2.000000  -2.000000  -2.000000   
25%     0.000000  -0.750000   0.000000   0.000000   0.000000   0.000000   
50%     0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
75%     0.000000   0.000000   0.000000   0.000000   1.000000   0.000000   
max     2.000000   3.000000   2.000000   2.000000   2.000000   2.000000   

       D_NP2HWRT  D_NP2HOBB  D_NP2TURN  D_NP2TRMR  D_NP2RISE  D_NP2WALK  \
count  54.000000  54.000000  54.000000  54.000000  54.000000  54.000000   
mean    0.185185   0.240741  -0.074074   0.037037   0.351852   0.240741   
std     0.802686   0.775453   0.773423   0.725882   0.850248   0.750728   
min    -1.000000  -1.000

In [65]:
# A selective subset of the UPDRS II questions: the rise, walk and freezing deltas,
# kept together with the two fall columns.
df_q_targeted = df_updrs_ii_sigfall_base[
    ['D_NP2RISE', 'D_NP2WALK', 'D_NP2FREZ','PREV_SIGFALL','SIGFALL']
].dropna()
df_q_targeted.to_csv('../working_data/updrsii_q_targeted.csv', index=False)

# Summary statistics for SIGFALL == 1.0 first, then SIGFALL == 0.0.
for sigfall_filter in ('SIGFALL == 1.0', 'SIGFALL == 0.0'):
    print(df_q_targeted.query(sigfall_filter).describe())

df_q_targeted.shape

       D_NP2RISE  D_NP2WALK  D_NP2FREZ  PREV_SIGFALL  SIGFALL
count  54.000000  54.000000  54.000000     54.000000     54.0
mean    0.351852   0.240741   0.277778      0.333333      1.0
std     0.850248   0.750728   0.656367      0.475831      0.0
min    -1.000000  -1.000000  -1.000000      0.000000      1.0
25%     0.000000   0.000000   0.000000      0.000000      1.0
50%     0.000000   0.000000   0.000000      0.000000      1.0
75%     1.000000   1.000000   0.750000      1.000000      1.0
max     3.000000   2.000000   3.000000      1.000000      1.0
        D_NP2RISE   D_NP2WALK   D_NP2FREZ  PREV_SIGFALL  SIGFALL
count  405.000000  405.000000  405.000000    405.000000    405.0
mean     0.096296    0.074074    0.066667      0.076543      0.0
std      0.716227    0.644431    0.591189      0.266194      0.0
min     -3.000000   -3.000000   -3.000000      0.000000      0.0
25%      0.000000    0.000000    0.000000      0.000000      0.0
50%      0.000000    0.000000    0.000000      0.000

(459, 5)

In [64]:
# Sanity check: print (rows, columns) of each exported subset, one per line.
for subset in (df_q_targeted, df_short_form, df_qs):
    print(subset.shape)

(459, 5)
(459, 3)
(459, 15)
