# New PPMI patients

We will use this notebook to get more patients from the PPMI dataset. The goal is to reach patients per group. We currently have 48 per group. **The goal is to add as many new patients as we can to match 72 patients per group.**

In [31]:
import os
import pandas as pd
import numpy as np

In [32]:
def print_variable_stats(df):
    print(f"Progressive group - M/F gender: {df[(df['gen']==1) & (df['group']==1)].shape[0]}/{df[(df['gen']==2) & (df['group']==1)].shape[0]}")
    print(f"Stable group - M/F gender: {df[(df['gen']==1) & (df['group']==0)].shape[0]}/{df[(df['gen']==2) & (df['group']==0)].shape[0]}")

    age_prog_mean = df[df["group"]==1]["age"].mean()
    age_prog_std = df[df["group"]==1]["age"].std()
    age_stable_mean = df[df["group"]==0]["age"].mean()
    age_stable_std = df[df["group"]==0]["age"].std()
    print(f"Progressive group - Age: {age_prog_mean} +- {age_prog_std}")
    print(f"Stable group - Age: {age_stable_mean} +- {age_stable_std}")

    hy_stable_stage1_bl = len(df[df["initialHY"]==1])
    hy_stable_stage2_bl = len(df[df["initialHY"]==2])
    hy_stable_stage1_3y = len(df[df["followUpHY"]==1])
    hy_stable_stage2_3y = len(df[df["followUpHY"]==2])
    
    print(f"H&Y baseline stage 1: {hy_stable_stage1_bl}")
    print(f"H&Y baseline stage 2: {hy_stable_stage2_bl}")
    print(f"H&Y 3Y follow-up stage 1: {hy_stable_stage1_3y}")
    print(f"H&Y 3Y follow-up stage 2: {hy_stable_stage2_3y}")
    
    print(f"Stable dataset size: {len(df[df['group'] == 0])}")
    print(f"Progressive dataset size: {len(df[df['group'] == 1])}")

The following dataframe is a data file from PPMI containing data on all scans. We will narrow it down to the following filters:

- BL & V08 H&Y stage available
- Verio scanner used

In [86]:
baselineData = "../data/ppmi-data/HY_Baseline_Stage.csv"
df = pd.read_csv(baselineData)
df = df[["PATNO", "EVENT_ID", "hy_on"]]

# Remove null and missing data
df = df.dropna(subset=['hy_on'])
df = df[df['hy_on'].apply(lambda x: isinstance(x, float))]

# Keep BL and V08 patients
df = df[(df["EVENT_ID"] == "BL") | (df["EVENT_ID"] == "V08")]

# # Remove current patients
CURRENT_PATIENT_ID = [4037, 3168, 3131, 4024, 4001, 3373, 4081, 3127, 3380, 3107, 3834,
       3134, 3825, 4035, 3307, 3567, 3124, 3181, 3577, 3778, 3775, 3818,
       4029, 3559, 4020, 3387, 3771, 4012, 3111, 3154, 3835, 3366, 3831,
       3591, 3826, 4027, 4135, 4026, 3556, 3371, 3323, 3752, 3175, 3308,
       3374, 3166, 3829, 3787, 3309, 3770, 3332, 3182, 3762, 4033, 3174,
       4083, 3173, 3176, 3322, 4034, 3113, 3125, 3377, 3119, 4005, 3102,
       3587, 4022, 3552, 4082, 3824, 3802, 3150, 3118, 3757, 3585, 3185,
       3777, 3120, 3190, 3814, 3305, 3808, 3311, 3838, 3364, 3378, 3832,
       3588, 3383, 3866, 3386, 3116, 3780, 3828, 3130, 3815, 3819, 3781,
       4038, 3178, 3108, 3557, 3128, 3365, 3800, 3593, 3564, 4021, 3328,
       3830, 3367, 3575, 3126, 3758, 3823, 3760, 3325, 3132, 3385, 3822,
       3372, 3589, 3586, 3321]

df = df.loc[~df['PATNO'].isin(CURRENT_PATIENT_ID)]

# Remove rows with missing info
listOfSubjectID = df["PATNO"].unique()
for subId in listOfSubjectID:
    if len(df[df["PATNO"] == subId]) != 2:
        indexNames = df[ df['PATNO'] == subId ].index
        df.drop(indexNames , inplace=True)
    
#     if 0 in df[df["PATNO"] == subId]["hy_on"].values:
#         indexNames = df[ df['PATNO'] == subId ].index
#         df.drop(indexNames , inplace=True)
      
df = df.rename(columns={"PATNO":"subjectId"})

print(df)

print(f"There are {len(df['subjectId'].unique())} patients available.")

      subjectId EVENT_ID  hy_on
0          3000       BL    0.0
3          3000      V08    0.0
6          3001       BL    1.0
9          3001      V08    2.0
12         3002       BL    2.0
...         ...      ...    ...
3419       4123      V08    2.0
3421       4124       BL    2.0
3424       4124      V08    2.0
3427       4126       BL    2.0
3430       4126      V08    2.0

[772 rows x 3 columns]
There are 386 patients available.


**Fetch all progressive and stable patients from the subject IDs we have available.**

In [91]:
# Init H&Y list
baselineStageList = []
followUpStageList = []

tempDf = df.copy()

for subId in tempDf["subjectId"].values:
    baselineStage = tempDf.loc[(tempDf['subjectId'] == subId) & (tempDf['EVENT_ID'] == "BL")]["hy_on"].values[0]
    followUpStage = tempDf.loc[(tempDf['subjectId'] == subId) & (tempDf['EVENT_ID'] == "V08")]["hy_on"].values[0]
    baselineStageList.append(baselineStage)
    followUpStageList.append(followUpStage)
    
tempDf["initialHY"] = baselineStageList
tempDf["followUpHY"] = followUpStageList
tempDf["group"] = (tempDf["initialHY"] < tempDf["followUpHY"]).astype(int)
progressiveIds = tempDf[tempDf["group"] == 1]["subjectId"].unique()
stableIds = tempDf[tempDf["group"] == 0]["subjectId"].unique()
print(stableIds)
tempDf

[3000 3002 3003 3004 3008 3009 3010 3012 3013 3016 3018 3021 3023 3027
 3028 3051 3052 3053 3054 3055 3057 3060 3061 3062 3064 3068 3069 3071
 3072 3073 3074 3075 3078 3083 3085 3087 3088 3089 3100 3105 3106 3109
 3112 3114 3115 3151 3156 3157 3160 3161 3162 3165 3169 3171 3172 3180
 3188 3191 3200 3201 3203 3205 3206 3209 3212 3214 3215 3216 3217 3218
 3219 3221 3222 3224 3225 3227 3228 3237 3251 3252 3260 3264 3268 3270
 3274 3279 3280 3282 3284 3285 3300 3310 3312 3316 3318 3320 3350 3351
 3352 3353 3355 3357 3358 3359 3360 3361 3362 3368 3369 3389 3401 3404
 3409 3410 3411 3414 3415 3417 3418 3419 3420 3421 3423 3424 3428 3429
 3430 3432 3433 3434 3436 3443 3446 3448 3452 3453 3454 3455 3457 3458
 3460 3461 3462 3464 3466 3467 3468 3469 3470 3471 3472 3476 3479 3480
 3481 3500 3503 3507 3515 3516 3517 3519 3521 3523 3525 3527 3528 3530
 3532 3542 3543 3544 3551 3554 3563 3565 3569 3570 3571 3572 3600 3607
 3609 3612 3613 3614 3619 3622 3624 3625 3627 3629 3632 3634 3635 3636
 3637 

Unnamed: 0,subjectId,EVENT_ID,hy_on,initialHY,followUpHY,group
0,3000,BL,0.0,0.0,0.0,0
3,3000,V08,0.0,0.0,0.0,0
6,3001,BL,1.0,1.0,2.0,1
9,3001,V08,2.0,1.0,2.0,1
12,3002,BL,2.0,2.0,2.0,0
...,...,...,...,...,...,...
3419,4123,V08,2.0,2.0,2.0,0
3421,4124,BL,2.0,2.0,2.0,0
3424,4124,V08,2.0,2.0,2.0,0
3427,4126,BL,2.0,2.0,2.0,0


The next step is to extract the list ID and pass it to PPMI. The following dataframe represents the potential candidates. **There are 75 patients in total that can be added with SIEMENS scanners. However, only 13 patients were scanned using a Verio scanner.**

In [94]:
progressivePatients = "../data/ppmi-data/newProgressivePatientsFound.csv"
stablePatients = "../data/ppmi-data/newStablePatientsFound.csv"

progDf = pd.read_csv(progressivePatients)
progDf = progDf.rename(columns={"Subject ID":"subjectId", "Sex": "gen", "Age": "age"})
progDf = progDf[["subjectId", "gen", "age"]]
progDf['gen'] = progDf['gen'].map(dict(zip(['M','F'],[1,2])))

stableDf = pd.read_csv(stablePatients)
stableDf = stableDf.rename(columns={"Subject ID":"subjectId", "Sex": "gen", "Age": "age"})
stableDf = stableDf[["subjectId", "gen", "age"]]
stableDf['gen'] = stableDf['gen'].map(dict(zip(['M','F'],[1,2])))

**Let's add the initial and follow up H&Y stage for each patient as well as the group.**

In [95]:
# Init H&Y list
baselineStageList = []
followUpStageList = []

for subId in progDf["subjectId"].values:
    baselineStage = df.loc[(df['subjectId'] == subId) & (df['EVENT_ID'] == "BL")]["hy_on"].values[0]
    followUpStage = df.loc[(df['subjectId'] == subId) & (df['EVENT_ID'] == "V08")]["hy_on"].values[0]
    baselineStageList.append(baselineStage)
    followUpStageList.append(followUpStage)
    
progDf["initialHY"] = baselineStageList
progDf["followUpHY"] = followUpStageList
progDf["group"] = (progDf["initialHY"] < progDf["followUpHY"]).astype(int)
print_variable_stats(progDf)
progDf

Progressive group - M/F gender: 31/28
Stable group - M/F gender: 0/0
Progressive group - Age: 60.18305084745762 +- 11.743980887398312
Stable group - Age: nan +- nan
H&Y baseline stage 1: 51
H&Y baseline stage 2: 7
H&Y 3Y follow-up stage 1: 1
H&Y 3Y follow-up stage 2: 48
Stable dataset size: 0
Progressive dataset size: 59


Unnamed: 0,subjectId,gen,age,initialHY,followUpHY,group
0,3001,1,65.1,1.0,2.0,1
1,3020,2,74.0,2.0,3.0,1
2,3024,1,52.7,1.0,2.0,1
3,3050,2,51.5,1.0,2.0,1
4,3056,1,55.7,1.0,2.0,1
5,3059,1,83.0,2.0,3.0,1
6,3066,2,63.8,1.0,2.0,1
7,3067,1,73.9,1.0,2.0,1
8,3076,2,75.6,2.0,3.0,1
9,3077,1,62.5,2.0,3.0,1


In [96]:
# Init H&Y list
baselineStageList = []
followUpStageList = []

for subId in stableDf["subjectId"].values:
    baselineStage = df.loc[(df['subjectId'] == subId) & (df['EVENT_ID'] == "BL")]["hy_on"].values[0]
    followUpStage = df.loc[(df['subjectId'] == subId) & (df['EVENT_ID'] == "V08")]["hy_on"].values[0]
    baselineStageList.append(baselineStage)
    followUpStageList.append(followUpStage)
    
stableDf["initialHY"] = baselineStageList
stableDf["followUpHY"] = followUpStageList
stableDf["group"] = (stableDf["initialHY"] < stableDf["followUpHY"]).astype(int)
print_variable_stats(stableDf)
stableDf

Progressive group - M/F gender: 0/0
Stable group - M/F gender: 153/90
Progressive group - Age: nan +- nan
Stable group - Age: 61.13662551440329 +- 10.640136089534248
H&Y baseline stage 1: 26
H&Y baseline stage 2: 78
H&Y 3Y follow-up stage 1: 35
H&Y 3Y follow-up stage 2: 70
Stable dataset size: 243
Progressive dataset size: 0


Unnamed: 0,subjectId,gen,age,initialHY,followUpHY,group
0,3000,2,69.1,0.0,0.0,0
1,3002,2,67.6,2.0,2.0,0
2,3003,2,56.7,2.0,2.0,0
3,3004,1,59.4,0.0,0.0,0
4,3008,2,81.9,0.0,0.0,0
...,...,...,...,...,...,...
238,4116,1,64.5,0.0,0.0,0
239,4117,2,59.9,1.0,1.0,0
240,4118,2,68.1,0.0,0.0,0
241,4123,2,60.3,2.0,2.0,0


Fetch the current data we have

In [98]:
currentData = "../data/volume-data/preMatchVolumes.csv"
currentDf = pd.read_csv(currentData)
currentDf = currentDf.drop("Unnamed: 0", 1)
currentDf = currentDf[["subjectId", "gen", "age", "initialHY", "followUpHY", "group"]]
print_variable_stats(currentDf)
# currentDf

Progressive group - M/F gender: 33/15
Stable group - M/F gender: 49/28
Progressive group - Age: 60.54356644710416 +- 10.246004545004064
Stable group - Age: 61.396429567870136 +- 9.26828414251294
H&Y baseline stage 1: 59
H&Y baseline stage 2: 66
H&Y 3Y follow-up stage 1: 23
H&Y 3Y follow-up stage 2: 94
Stable dataset size: 77
Progressive dataset size: 48


  This is separate from the ipykernel package so we can avoid doing imports until


In [101]:
final_df = pd.concat([currentDf, progDf, stableDf])
final_df.to_csv("../data/volume-data/preMatchVolumes.csv")
print_variable_stats(final_df)

Progressive group - M/F gender: 64/43
Stable group - M/F gender: 202/118
Progressive group - Age: 60.34477747159813 +- 11.047475453752032
Stable group - Age: 61.199140864768744 +- 10.313252527954853
H&Y baseline stage 1: 136
H&Y baseline stage 2: 151
H&Y 3Y follow-up stage 1: 59
H&Y 3Y follow-up stage 2: 212
Stable dataset size: 320
Progressive dataset size: 107


### Let's call our R script and perform cohort matching with our experimental larger dataset.

In [19]:
os.system("Rscript match-data.R")

sh: Rscript: command not found


32512

In [10]:
matchedVolumesDf = pd.read_csv("../data/volume-data/matchedVolumes.csv")
matchedVolumesDf = matchedVolumesDf.drop(["Unnamed: 0", "X", 'distance', 'weights', 'subclass'], 1)
print_variable_stats(matchedVolumesDf)

Progressive group - M/F gender: 39/17
Stable group - M/F gender: 37/19
Progressive group - Age: 59.383771240375005 +- 10.793706586411549
Stable group - Age: 59.70187340260718 +- 9.258640636077349
H&Y baseline stage 1: 59
H&Y baseline stage 2: 35
H&Y 3Y follow-up stage 1: 11
H&Y 3Y follow-up stage 2: 74
Stable dataset size: 56
Progressive dataset size: 56


  
