# New PPMI patients

We will use this notebook to get more patients from the PPMI dataset. We currently have 48 patients per group. **The goal is to add as many new patients as we can, aiming for 72 patients per group.**

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def print_variable_stats(df):
    """Print summary statistics for the stable and progressive groups.

    For each group this reports the number and percentage of rows with
    ``gen == 1`` (labelled "male" in the output), the mean and standard
    deviation of ``age``, the number of rows at ``initialHY`` stage 1 and
    stage 2, and the group size.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group`` (0 = stable, 1 = progressive),
        ``gen``, ``age`` and ``initialHY``.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    An empty group is reported with a percentage of ``nan`` (matching its
    ``nan`` age mean/std) instead of raising ``ZeroDivisionError``.
    """
    # Split once instead of re-filtering the full frame for every statistic.
    stable = df[df["group"] == 0]
    progressive = df[df["group"] == 1]

    def _male_count_and_percent(group_df):
        # Count of gen == 1 rows and their share of the group, in percent.
        n_male = group_df[group_df["gen"] == 1].shape[0]
        if len(group_df) == 0:
            return n_male, float("nan")
        return n_male, n_male / len(group_df) * 100

    stable_male, stable_pct = _male_count_and_percent(stable)
    progressive_male, progressive_pct = _male_count_and_percent(progressive)

    print(f"Stable group - Gender [male (n, %)]: {stable_male} ({stable_pct})")
    print(f"Progressive group - Gender [male (n, %)]: {progressive_male} ({progressive_pct})\n")

    age_prog_mean = progressive["age"].mean()
    age_prog_std = progressive["age"].std()
    age_stable_mean = stable["age"].mean()
    age_stable_std = stable["age"].std()
    print(f"Progressive group - Age: {age_prog_mean} +- {age_prog_std}")
    print(f"Stable group - Age: {age_stable_mean} +- {age_stable_std}\n")

    stage1_stable = len(stable[stable["initialHY"] == 1])
    stage1_prog = len(progressive[progressive["initialHY"] == 1])
    stage2_stable = len(stable[stable["initialHY"] == 2])
    stage2_prog = len(progressive[progressive["initialHY"] == 2])

    print(f"H&Y stage 1 stable/progressive: {stage1_stable}/{stage1_prog}")
    print(f"H&Y stage 2 stable/progressive: {stage2_stable}/{stage2_prog}\n")

    print(f"Stable dataset size: {len(stable)}")
    print(f"Progressive dataset size: {len(progressive)}")

### Using entire PPMI data to filter our needs

The following dataframe is a data file from PPMI containing data on all scans. We will narrow it down to the following filters:

- H&Y stage available at both the BL (baseline) and V08 (follow-up) visits
- H&Y stage 1 or higher at both visits (patients with a stage 0 score are removed)

In [3]:
baselineData = "../data/ppmi-data/HY_Baseline_Stage.csv"
df = pd.read_csv(baselineData)
df = df[["PATNO", "EVENT_ID", "hy_on"]]

# Remove null and missing data: keep only rows whose H&Y score is a float
df = df.dropna(subset=['hy_on'])
df = df[df['hy_on'].apply(lambda x: isinstance(x, float))]

# Keep BL and V08 visits only
df = df[df["EVENT_ID"].isin(["BL", "V08"])]

# Keep patients that have exactly one BL row and exactly one V08 row.
# Checking each visit separately (rather than "two rows in total") also
# rejects patients with e.g. two BL rows and no V08 row, which would
# otherwise break the per-visit lookups in the following cells.
blCounts = df.loc[df["EVENT_ID"] == "BL", "PATNO"].value_counts()
v08Counts = df.loc[df["EVENT_ID"] == "V08", "PATNO"].value_counts()
completePatients = blCounts.index[blCounts == 1].intersection(v08Counts.index[v08Counts == 1])

# Remove every patient that has a stage 0 score at either visit
stage0Patients = df.loc[df["hy_on"] == 0, "PATNO"].unique()

df = df[df["PATNO"].isin(completePatients) & ~df["PATNO"].isin(stage0Patients)]

df = df.rename(columns={"PATNO":"subjectId"})

print(df['subjectId'].unique())

print(f"There are {len(df['subjectId'].unique())} patients available.")

[3001 3002 3003 3010 3012 3018 3020 3021 3023 3024 3027 3028 3050 3051
 3052 3054 3056 3058 3059 3060 3061 3062 3066 3067 3068 3076 3077 3078
 3080 3083 3088 3089 3102 3105 3107 3108 3110 3111 3113 3116 3118 3119
 3120 3122 3123 3124 3125 3126 3127 3128 3130 3131 3132 3134 3150 3154
 3162 3166 3168 3173 3174 3175 3176 3178 3179 3180 3181 3182 3185 3186
 3190 3203 3205 3207 3209 3211 3212 3214 3218 3220 3223 3224 3225 3226
 3227 3228 3229 3230 3231 3234 3251 3252 3268 3269 3279 3280 3282 3284
 3285 3305 3307 3308 3309 3311 3312 3313 3321 3322 3323 3325 3328 3332
 3333 3352 3359 3360 3364 3365 3366 3367 3371 3372 3373 3374 3377 3378
 3380 3383 3385 3386 3387 3400 3403 3406 3407 3409 3415 3417 3418 3419
 3420 3421 3422 3423 3429 3430 3432 3433 3434 3435 3436 3439 3442 3443
 3444 3445 3446 3448 3451 3454 3455 3459 3461 3462 3467 3469 3470 3471
 3472 3473 3475 3476 3482 3500 3507 3516 3522 3528 3530 3532 3542 3552
 3556 3557 3559 3564 3567 3575 3577 3585 3586 3587 3588 3589 3591 3593
 3603 

**Fetch all progressive and stable patients from the subject IDs we have available.**

In [4]:
# Label every available patient as progressive (H&Y stage increased between
# BL and V08) or stable (it did not).
tempDf = df.copy()

# One H&Y value per patient and visit. Keeping the first matching row mirrors
# a row-wise `.values[0]` lookup, but scans the frame once per visit instead
# of twice per row.
blStageBySubject = (
    tempDf.loc[tempDf["EVENT_ID"] == "BL"]
    .drop_duplicates(subset="subjectId", keep="first")
    .set_index("subjectId")["hy_on"]
)
v08StageBySubject = (
    tempDf.loc[tempDf["EVENT_ID"] == "V08"]
    .drop_duplicates(subset="subjectId", keep="first")
    .set_index("subjectId")["hy_on"]
)

# Fail loudly if a patient lacks one of the two visits; a silent NaN would
# compare as "not progressed" and mislabel the patient as stable.
subjectsWithBothVisits = blStageBySubject.index.intersection(v08StageBySubject.index)
incompleteIds = tempDf.loc[~tempDf["subjectId"].isin(subjectsWithBothVisits), "subjectId"].unique().tolist()
if incompleteIds:
    raise ValueError(f"Patients missing a BL or V08 H&Y stage: {incompleteIds}")

tempDf["initialHY"] = tempDf["subjectId"].map(blStageBySubject).astype(float)
tempDf["followUpHY"] = tempDf["subjectId"].map(v08StageBySubject).astype(float)

# Kept as plain lists so the names this cell has always defined still exist
baselineStageList = tempDf["initialHY"].tolist()
followUpStageList = tempDf["followUpHY"].tolist()

tempDf["group"] = (tempDf["initialHY"] < tempDf["followUpHY"]).astype(int)
progressiveIds = tempDf[tempDf["group"] == 1]["subjectId"].unique()
stableIds = tempDf[tempDf["group"] == 0]["subjectId"].unique()
print(progressiveIds)
print(len(progressiveIds))

[3001 3020 3024 3050 3056 3058 3059 3066 3067 3076 3077 3080 3102 3108
 3110 3111 3113 3122 3123 3124 3127 3134 3168 3174 3175 3176 3178 3179
 3185 3186 3190 3207 3211 3220 3223 3226 3229 3230 3231 3234 3269 3305
 3308 3311 3313 3325 3332 3333 3364 3372 3373 3400 3403 3406 3407 3422
 3435 3439 3442 3444 3445 3451 3459 3473 3475 3482 3522 3586 3587 3588
 3591 3593 3603 3604 3621 3631 3654 3660 3664 3665 3752 3776 3778 3781
 3785 3787 3792 3802 3815 3818 3822 3825 3826 3830 3831 3835 3866 3869
 3870 4005 4011 4019 4027 4029 4034 4037 4051 4054 4055 4056 4057 4059
 4083 4092 4093 4094 4096 4103 4109 4113 4121 4135]
122


### Progressive and Stable patient DataFrames

In [5]:
progressivePatients = "../data/ppmi-data/newProgressivePatientsFound.csv"
stablePatients = "../data/ppmi-data/newStablePatientsFound.csv"


def load_patient_demographics(csvPath):
    """Load a patient CSV and return its id, sex and age columns.

    The CSV must contain the columns ``Subject ID``, ``Sex`` and ``Age``.
    They are renamed to ``subjectId``, ``gen`` and ``age``; ``gen`` is
    recoded from ``'M'``/``'F'`` to ``1``/``2`` (any other value becomes
    NaN, as ``Series.map`` does for unmapped keys).

    Parameters
    ----------
    csvPath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``subjectId``, ``gen`` and ``age``.
    """
    return (
        pd.read_csv(csvPath)
        .rename(columns={"Subject ID": "subjectId", "Sex": "gen", "Age": "age"})
        .loc[:, ["subjectId", "gen", "age"]]
        # assign() builds a new frame, so no column is written into a slice
        .assign(gen=lambda frame: frame["gen"].map({"M": 1, "F": 2}))
    )


progDf = load_patient_demographics(progressivePatients)
stableDf = load_patient_demographics(stablePatients)

**Let's add the initial and follow-up H&Y stages to the progressive dataframe**

In [6]:
# Attach the BL (initial) and V08 (follow-up) H&Y stage to every progressive
# patient, then derive the group label (1 = stage increased, 0 = it did not).

# One H&Y value per patient and visit; the first matching row is used.
blStageBySubject = (
    df.loc[df["EVENT_ID"] == "BL"]
    .drop_duplicates(subset="subjectId", keep="first")
    .set_index("subjectId")["hy_on"]
)
v08StageBySubject = (
    df.loc[df["EVENT_ID"] == "V08"]
    .drop_duplicates(subset="subjectId", keep="first")
    .set_index("subjectId")["hy_on"]
)

# Fail with the offending ids rather than an opaque IndexError, and never let
# a missing stage turn into a NaN that would be labelled "stable".
subjectsWithBothVisits = blStageBySubject.index.intersection(v08StageBySubject.index)
unknownIds = progDf.loc[~progDf["subjectId"].isin(subjectsWithBothVisits), "subjectId"].tolist()
if unknownIds:
    raise ValueError(f"Progressive patients missing a BL or V08 H&Y stage: {unknownIds}")

progDf["initialHY"] = progDf["subjectId"].map(blStageBySubject).astype(float)
progDf["followUpHY"] = progDf["subjectId"].map(v08StageBySubject).astype(float)

# Kept as plain lists so the names this cell has always defined still exist
baselineStageList = progDf["initialHY"].tolist()
followUpStageList = progDf["followUpHY"].tolist()

progDf["group"] = (progDf["initialHY"] < progDf["followUpHY"]).astype(int)
print(len(progDf))

80


**Let's add the initial and follow-up H&Y stages to the stable dataframe**

In [7]:
# Attach the BL (initial) and V08 (follow-up) H&Y stage to every stable
# patient, then derive the group label (1 = stage increased, 0 = it did not).

# One H&Y value per patient and visit; the first matching row is used.
blStageBySubject = (
    df.loc[df["EVENT_ID"] == "BL"]
    .drop_duplicates(subset="subjectId", keep="first")
    .set_index("subjectId")["hy_on"]
)
v08StageBySubject = (
    df.loc[df["EVENT_ID"] == "V08"]
    .drop_duplicates(subset="subjectId", keep="first")
    .set_index("subjectId")["hy_on"]
)

# Fail with the offending ids rather than an opaque IndexError, and never let
# a missing stage turn into a NaN that would be labelled "stable".
subjectsWithBothVisits = blStageBySubject.index.intersection(v08StageBySubject.index)
unknownIds = stableDf.loc[~stableDf["subjectId"].isin(subjectsWithBothVisits), "subjectId"].tolist()
if unknownIds:
    raise ValueError(f"Stable patients missing a BL or V08 H&Y stage: {unknownIds}")

stableDf["initialHY"] = stableDf["subjectId"].map(blStageBySubject).astype(float)
stableDf["followUpHY"] = stableDf["subjectId"].map(v08StageBySubject).astype(float)

# Kept as plain lists so the names this cell has always defined still exist
baselineStageList = stableDf["initialHY"].tolist()
followUpStageList = stableDf["followUpHY"].tolist()

stableDf["group"] = (stableDf["initialHY"] < stableDf["followUpHY"]).astype(int)
print(len(stableDf))

118


### Putting it all together

In [8]:
# Stack both cohorts (progressive rows first, original row labels kept),
# report the group statistics, and save the frame for the matching step.
cohortFrames = [progDf, stableDf]
final_df = pd.concat(cohortFrames)
print_variable_stats(final_df)
final_df.to_csv("../data/volume-data/preMatchVolumes.csv")


Progressive group - Age: 60.10625 +- 10.777864607904903
Stable group - Age: 62.53898305084746 +- 8.904651947572269

H&Y stage 1 stable/progressive: 25/74
H&Y stage 2 stable/progressive: 91/6

Stable dataset size: 118
Progressive dataset size: 80


**Randomly sample the progressive group down to 77 patients (the stable group is reduced to 77 later, by the cohort matching step)**

In [9]:
# Keep every stable patient; draw a fixed-seed random sample of 77 patients
# from the progressive group so the draw is reproducible.
stable_df = final_df.loc[final_df["group"] == 0]
progressive_pool = final_df.loc[final_df["group"] == 1]
progressive_df = progressive_pool.sample(n=77, random_state=42)

# Stable rows first, then the sampled progressive rows; overwrite the
# pre-matching CSV with the reduced cohort.
final_df = pd.concat([stable_df, progressive_df])
final_df.to_csv("../data/volume-data/preMatchVolumes.csv")

print_variable_stats(final_df)
final_df.head()


Progressive group - Age: 60.449350649350656 +- 10.809834987988136
Stable group - Age: 62.53898305084746 +- 8.904651947572269

H&Y stage 1 stable/progressive: 25/72
H&Y stage 2 stable/progressive: 91/5

Stable dataset size: 118
Progressive dataset size: 77


Unnamed: 0,subjectId,gen,age,initialHY,followUpHY,group
0,3105,1,68.5,2.0,2.0,0
1,3107,1,69.6,2.0,1.0,0
2,3116,1,65.0,2.0,2.0,0
3,3118,1,60.3,2.0,2.0,0
4,3119,1,64.4,2.0,2.0,0


### Let's call our R script and perform cohort matching with our experimental larger dataset.

In [10]:
# Run the R cohort-matching script.
# NOTE(review): presumably it reads preMatchVolumes.csv and writes
# matchedVolumes.csv - confirm in match-data.R.
exitStatus = os.system("Rscript match-data.R")

# Stop here on failure so the next cell cannot silently read a stale
# matchedVolumes.csv left over from an earlier run.
if exitStatus != 0:
    raise RuntimeError(f"'Rscript match-data.R' failed with exit status {exitStatus}")

exitStatus


Call:
matchit(formula = group ~ (age + gen), data = data, method = "nearest", 
    distance = "glm", replacement = F)

Summary of Balance for All Data:
         Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
distance        0.4039        0.3890          0.2167     1.8905    0.0652
age            60.4494       62.5390         -0.1933     1.4737    0.0722
gen             1.4156        1.3475          0.1373     1.0761    0.0341
         eCDF Max
distance   0.1665
age        0.1539
gen        0.0681


Summary of Balance for Matched Data:
         Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
distance        0.4039        0.3969          0.1022     1.5142    0.0252
age            60.4494       61.0260         -0.0533     1.2808    0.0244
gen             1.4156        1.3506          0.1309     1.0667    0.0325
         eCDF Max Std. Pair Dist.
distance   0.1299          0.1380
age        0.0649          0.4383
gen        0.0649          0.6545

Percent

0

In [11]:
# Load the matched cohort and drop the bookkeeping columns before reporting
# the group statistics.
matchedVolumesDf = pd.read_csv("../data/volume-data/matchedVolumes.csv")

# `columns=` replaces the positional `axis=1` argument, which pandas
# deprecated in 1.3 and rejects with a TypeError from 2.0 on.
matchedVolumesDf = matchedVolumesDf.drop(columns=["Unnamed: 0", "X", 'distance', 'weights', 'subclass'])
print_variable_stats(matchedVolumesDf)


Progressive group - Age: 60.449350649350656 +- 10.809834987988136
Stable group - Age: 61.02597402597405 +- 9.551762066669365

H&Y stage 1 stable/progressive: 20/72
H&Y stage 2 stable/progressive: 56/5

Stable dataset size: 77
Progressive dataset size: 77


  


In [52]:
# Subject ids of every patient kept in the matched cohort
matchedVolumesDf["subjectId"].values

array([3116, 3119, 3120, 3125, 3126, 3128, 3130, 3131, 3132, 3150, 3154,
       3173, 3181, 3182, 3309, 3323, 3328, 3352, 3359, 3360, 3365, 3366,
       3367, 3374, 3377, 3380, 3385, 3387, 3556, 3557, 3559, 3564, 3575,
       3577, 3589, 3607, 3622, 3629, 3634, 3661, 3757, 3758, 3770, 3771,
       3775, 3777, 3780, 3780, 3800, 3808, 3814, 3819, 3824, 3828, 3832,
       3838, 4001, 4012, 4021, 4022, 4024, 4025, 4026, 4033, 4058, 4081,
       4091, 4099, 4101, 4106, 4107, 4108, 4110, 4111, 4112, 4115, 4117,
       3586, 3102, 3332, 3587, 3305, 3439, 3174, 4059, 3122, 3176, 3815,
       3591, 4055, 3603, 4056, 3778, 4093, 4019, 3831, 3660, 3168, 4034,
       3123, 3787, 3593, 4027, 3665, 3830, 3186, 3654, 3835, 4135, 3127,
       3818, 3826, 3308, 4054, 3373, 3776, 3178, 4103, 3113, 3190, 3631,
       3134, 4037, 3124, 3604, 4092, 3869, 3781, 4113, 3185, 3403, 3664,
       3400, 3802, 3372, 3752, 4109, 3866, 3175, 3588, 4096, 3870, 4029,
       4057, 3621, 3442, 3108, 3825, 3325, 3111, 33