# Data Preparation

In [1]:
import os
import ppmi_downloader

data_dir = 'data'

if not os.path.exists(data_dir):
    os.makedirs(data_dir)
required_files = ['PPMI_Original_Cohort_BL_to_Year_5_Dataset_Apr2020.csv', 'Age_at_visit.csv',
                  'Magnetic_Resonance_Imaging__MRI_.csv', 'MDS_UPDRS_Part_III.csv', 'Demographics.csv']
missing_files = [x for x in required_files if not os.path.exists(os.path.join(data_dir, x))]

if len(missing_files) > 0:
    ppmi = ppmi_downloader.PPMIDownloader('tristan.glatard@concordia.ca', 'w-jEV(#-A6N-5ICZBy')
    ppmi.download_metadata(missing_files, destination_dir=data_dir, headless=False, timeout=600)


In [2]:
import pandas as pd

keep_cols = ['EVENT_ID', 'PATNO', 'gen', 'hy', 'APPRDX']
df_hy = pd.read_csv(os.path.join(data_dir, 'PPMI_Original_Cohort_BL_to_Year_5_Dataset_Apr2020.csv'))
df_hy = df_hy.drop([x for x in df_hy.columns if x not in keep_cols], axis=1)
# drop non-PD patients
df_hy = df_hy[df_hy['APPRDX']==1]
df_hy = df_hy.drop(['APPRDX'], axis=1)
df_hy

Unnamed: 0,PATNO,EVENT_ID,gen,hy
6,3001,BL,1,1.0
7,3001,V04,1,2.0
8,3001,V06,1,2.0
9,3001,V08,1,
10,3001,V10,1,2.0
...,...,...,...,...
3437,4135,V06,1,
3438,4135,V08,1,
3439,4136,BL,1,2.0
3440,4136,V04,1,2.0


In [3]:
# keep_cols = ['EVENT_ID', 'PATNO', 'NHY']
# df_hy_1 = pd.read_csv(os.path.join(data_dir, 'MDS_UPDRS_Part_III.csv'))
# df_hy_1 = df_hy_1[df_hy_1['PDSTATE'] == 'OFF']
# df_hy_1 = df_hy_1.drop([x for x in df_hy_1.columns if x not in keep_cols], axis=1)


In [4]:
df_age = pd.read_csv(os.path.join(data_dir, 'Age_at_visit.csv'))
df_age

Unnamed: 0,PATNO,EVENT_ID,AGE_AT_VISIT
0,3000,BL,69.1
1,3000,R17,79.9
2,3000,SC,69.1
3,3000,V01,69.4
4,3000,V02,69.6
...,...,...,...
18616,151111,BL,37.7
18617,151111,SC,37.6
18618,152369,SC,70.3
18619,153089,SC,54.4


In [5]:
mri_df = pd.read_csv(os.path.join(data_dir, 'Magnetic_Resonance_Imaging__MRI_.csv'))
keep_cols = ['EVENT_ID', 'PATNO', 'MRICMPLT']
mri_df = mri_df.drop([x for x in mri_df.columns if x not in keep_cols], axis=1)
mri_df

Unnamed: 0,PATNO,EVENT_ID,MRICMPLT
0,3000,BL,1.0
1,3000,V12,0.0
2,3001,BL,1.0
3,3002,BL,1.0
4,3003,BL,1.0
...,...,...,...
2870,146573,BL,1.0
2871,146935,BL,0.0
2872,147077,BL,1.0
2873,150818,BL,1.0


In [6]:
keys = ['PATNO', 'EVENT_ID']
df = df_hy.merge(df_age, on=keys).merge(mri_df, on=keys)

In [7]:
visits = {'BL': 'V08', 
          'V04': 'V10',
          'V06': 'V12',
          'V08': 'V13',
          'V10': 'V14'}
def next_visit(x):
    return visits[x]

In [8]:
visits_df = df[(df['MRICMPLT']==1.0) & (df['EVENT_ID'].isin(visits))]
visits_df['next_visit'] = visits_df['EVENT_ID'].apply(next_visit)


visits_df = visits_df.merge(df_hy, left_on=['PATNO', 'next_visit'], right_on=['PATNO', 'EVENT_ID']).dropna()\
                     .drop(['MRICMPLT', 'next_visit', 'gen_y'], axis=1)
visits_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visits_df['next_visit'] = visits_df['EVENT_ID'].apply(next_visit)


Unnamed: 0,PATNO,EVENT_ID_x,gen_x,hy_x,AGE_AT_VISIT,EVENT_ID_y,hy_y
1,3002,BL,2,2.0,67.6,V08,2.0
2,3003,BL,2,2.0,56.7,V08,2.0
5,3018,BL,2,2.0,60.5,V08,2.0
6,3020,BL,2,2.0,74.0,V08,3.0
7,3021,BL,2,2.0,64.1,V08,3.0
...,...,...,...,...,...,...,...
589,4117,BL,2,1.0,59.9,V08,1.0
590,4121,BL,2,2.0,65.3,V08,3.0
591,4123,BL,2,2.0,60.3,V08,2.0
592,4124,BL,1,2.0,71.1,V08,2.0


In [9]:
visits_df['stable'] = visits_df['hy_x'] == visits_df['hy_y']

In [10]:
visits_df.groupby('stable').count()

Unnamed: 0_level_0,PATNO,EVENT_ID_x,gen_x,hy_x,AGE_AT_VISIT,EVENT_ID_y,hy_y
stable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,150,150,150,150,150,150,150
True,225,225,225,225,225,225,225


In [11]:
visits_df.groupby(['stable', 'hy_x']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PATNO,EVENT_ID_x,gen_x,AGE_AT_VISIT,EVENT_ID_y,hy_y
stable,hy_x,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,1.0,113,113,113,113,113,113
False,2.0,34,34,34,34,34,34
False,3.0,3,3,3,3,3,3
True,1.0,47,47,47,47,47,47
True,2.0,175,175,175,175,175,175
True,3.0,3,3,3,3,3,3


There are exactly 47 stable patients with HY=1, as in Shu et al's cohort. Coincidence?

In [12]:
len(pd.unique(visits_df[(visits_df['stable'] == True) & (visits_df['hy_x'] == 1)]['PATNO']))

39

There are only 39 unique stable patients with HY=1 while Shu et al has 47. How come?

The only way to get 47 data records in this group is to take multiple times the same patient.

In [13]:
len(pd.unique(visits_df[(visits_df['stable'] == False) & (visits_df['hy_x'] == 2)]['PATNO']))

32

There are 32 unique progressive patients with HY=2, Shu et al has 25.

# Matching

In [14]:
stable_1_patnos = pd.unique(visits_df[(visits_df['stable'] == True) & (visits_df['hy_x'] == 1)]['PATNO'])
stable_1_patnos

array([3061, 3130, 3173, 3175, 3203, 3205, 3227, 3307, 3308, 3322, 3327,
       3328, 3419, 3420, 3421, 3429, 3430, 3431, 3454, 3469, 3470, 3507,
       3661, 3700, 3702, 3708, 3711, 3808, 3815, 3819, 3824, 3832, 3834,
       3838, 3914, 3960, 3961, 4096, 4117])

In [15]:
stable_1_df = None
progr_1_df = None

progr_df = visits_df[(visits_df['stable'] == False) & (visits_df['hy_x'] == 1)]

for p in stable_1_patnos:
    # Take a random visit pair
    s = visits_df[(visits_df['stable'] == True) & (visits_df['hy_x'] == 1) & (visits_df['PATNO'] == p)].sample(1)
    t = nn(s, progr_df)
    if stable_1_df is None:
        stable_1_df = s
    else:
        stable_1_df = stable_1_df.append(s)
    if progr_1_df is None:
        progr_1_df = t
    else:
        progr_1_df = progr_1_df.append(t)

NameError: name 'nn' is not defined

In [None]:
progr_1_df

In [None]:
# find index of nearest neighbor of x in df
def nn(x, df):
#     df_hy_match = df[df['hy_x'] == x['hy_x']] # get as many subjects with matching HY as possible
#     if len(df_hy_match) == 0:
#         df_hy_match = df
    df_hy_match = df
    df_hy_match['dist'] = sum((df_hy_match[f'{var}_norm']-x[f'{var}_norm'])**2 for var in all_vars)
    df_hy_match.sort_values('dist', inplace=True)
    return df_hy_match.head(1)  ## there's probably a better way to do it but it should work


# Matching loop

def match(n_samples, df_stable, df_progr):
    ids_stable_matched = []
    ids_progr_matched = []

    df_stable_ = df_stable.copy()
    df_progr_ = df_progr.copy()

    for i in range(n_samples):
        # get a random stable subject without replacement
#         if len(df_stable_[df_stable_['initialHY']==1]) > 0: # add as many stable patients with low H&Y as possible
#             stable = df_stable_[df_stable_['initialHY']==1].sample()
#         else:
        stable = df_stable_.sample()
        stable_index = stable.index[0]
        ids_stable_matched.append(stable_index)
        df_stable_.drop(index=stable_index, inplace=True)

        # get nn in progr set for this subject without replacement
        progr_index = nn(stable.iloc[0], df_progr_)
        ids_progr_matched.append(progr_index)
        df_progr_.drop(index=progr_index, inplace=True)
    
    df_stable_matched = df[df.index.isin(ids_stable_matched)]
    df_progr_matched = df[df.index.isin(ids_progr_matched)]
    return df_stable_matched, df_progr_matched

In [None]:
stable_1_df

In [None]:
# normalize columns

df = visits_df

all_vars = ['AGE_AT_VISIT', 'gen_x', 'hy_x']
to_normalize = ['AGE_AT_VISIT'] 
for var in all_vars:
    if var in to_normalize:
        df[f'{var}_norm'] = (df[var] - df[var].mean())/df[var].std()
    else:
        df[f'{var}_norm'] = df[var]

df