# Cohort matching

In this notebook, we add the Hoehn and Yahr (HY) stage at baseline and at the 3-year (3Y) follow-up to our dataset. We then perform cohort matching on age, gender and baseline HY stage using the R script `match-data.R` (which calls `matchit`).

### Imports

In [20]:
import os
import pandas as pd
import numpy as np

### Add HY (baseline, 3Y)

In [21]:
# Import data
baselineData = "../data/ppmi-data/HY_Baseline_Stage.csv"
volumesDataFile = "../data/volume-data/sanitizedVolumes.csv"
baselineDf = pd.read_csv(baselineData)
volumesDf = pd.read_csv(volumesDataFile)
baselineDf = baselineDf[["PATNO", "EVENT_ID", "NHY_ON"]]


def first_stage_per_patient(stagesDf, eventId):
    """Return a Series mapping PATNO to the first NHY_ON value recorded for `eventId`.

    Parameters
    ----------
    stagesDf : pd.DataFrame
        Frame with the columns "PATNO", "EVENT_ID" and "NHY_ON".
    eventId : str
        Value of "EVENT_ID" to select (e.g. "BL" or "V08").

    Returns
    -------
    pd.Series
        "NHY_ON" values indexed by "PATNO". Patients without a row for
        `eventId` are absent, so a `Series.map` lookup yields NaN for them.
    """
    eventRows = stagesDf[stagesDf["EVENT_ID"] == eventId]
    # When a patient has several rows for the same event, keep the first one
    # in file order so the lookup index is unique.
    firstRows = eventRows.drop_duplicates(subset="PATNO", keep="first")
    return firstRows.set_index("PATNO")["NHY_ON"]


# Fetch stages per patient: "BL" is the baseline visit, "V08" the follow-up visit.
# Subjects with no matching row get NaN (no separate sentinel value is needed).
volumesDf["initialHY"] = volumesDf["subjectId"].map(first_stage_per_patient(baselineDf, "BL"))
volumesDf["followUpHY"] = volumesDf["subjectId"].map(first_stage_per_patient(baselineDf, "V08"))

print(f"Shape of data before removing missing data: {volumesDf.shape}")

# Remove missing data: subjects lacking either visit, or whose stage is NaN
volumesDf = volumesDf.dropna(subset=["initialHY", "followUpHY"])
# Keyword form: passing the axis positionally is deprecated/removed in recent pandas
volumesDf = volumesDf.drop(columns="Unnamed: 0")

print(f"Shape of data after removing missing data: {volumesDf.shape}")

# Create label group (0: stable | 1: progressive)
# A subject is "progressive" when the follow-up stage is strictly higher than baseline.
volumesDf["group"] = (volumesDf["initialHY"] < volumesDf["followUpHY"]).astype(int)
# Shorten the region names and make them valid identifiers (no dashes / leading digits)
volumesDf = volumesDf.rename(
    columns={
        "Thalamus-Proper": "Thalamus",
        "Cerebellum-Cortex": "CerebellumCortex",
        "Cerebellum-White-Matter": "CerebellumWM",
        "3rd-Ventricle": "V3",
        "4th-Ventricle": "V4",
    }
)
volumesDf

Shape of data before removing missing data: (151, 16)
Shape of data after removing missing data: (125, 15)




Unnamed: 0,subjectId,Pallidum,Putamen,Caudate,Thalamus,CerebellumCortex,CerebellumWM,V3,V4,Pons,SCP,Midbrain,Insula,initialHY,followUpHY,group
0,4037,4261.3,10223.0,7586.8,17351.7,116856.4,38230.0,934.2,1330.4,19552.781704,307.724746,6989.344230,6476,1.0,2.0,1
1,3168,3776.7,8200.9,5738.2,13200.4,91395.5,31986.1,1089.4,1339.5,14452.019213,281.394581,5453.716726,7346,2.0,4.0,1
2,3131,4523.6,9383.2,8577.0,16020.2,118487.3,34742.2,1719.7,2169.2,21000.357958,348.730585,8004.224865,8146,2.0,2.0,0
3,4024,3444.1,8405.3,5940.0,12945.9,93723.6,22075.2,1587.0,1690.8,13148.503931,297.837617,5969.685193,6497,2.0,2.0,0
4,4001,4174.6,11058.9,7890.2,15731.7,126094.9,29284.0,1650.6,2085.8,16901.777054,262.838901,7330.256842,7613,2.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,3822,3916.3,8508.5,6534.8,13407.2,120662.0,34627.3,1605.5,1856.3,15993.876755,316.770996,6418.796616,6877,1.0,2.0,1
147,3372,4797.0,10114.8,9268.2,15093.5,112521.2,36136.9,3301.4,3167.2,17399.059361,318.438874,6918.687371,7682,1.0,2.0,1
148,3589,3067.4,7619.4,6386.1,13104.2,92495.3,29665.6,1319.4,1262.7,14052.641567,283.262188,5467.899590,5476,2.0,2.0,0
149,3586,3709.8,8082.5,5973.9,13831.9,109396.4,29020.2,1464.9,1306.0,16415.510277,342.644554,6638.502535,6423,1.0,2.0,1


### Add gender, age

In [22]:
# Reload the baseline file and keep one row per distinct (patient, age, gender)
# combination, keyed by the same column name the volumes frame uses.
baselineDf = (
    pd.read_csv(baselineData)
    .loc[:, ["PATNO", "age", "gen"]]
    .drop_duplicates()
    .rename(columns={"PATNO": "subjectId"})
)

# Right join: every row of the volumes frame is kept and gains age / gen
volumesDf = baselineDf.merge(volumesDf, how="right", on="subjectId")
volumesDf

Unnamed: 0,subjectId,age,gen,Pallidum,Putamen,Caudate,Thalamus,CerebellumCortex,CerebellumWM,V3,V4,Pons,SCP,Midbrain,Insula,initialHY,followUpHY,group
0,4037,52.831492,1,4261.3,10223.0,7586.8,17351.7,116856.4,38230.0,934.2,1330.4,19552.781704,307.724746,6989.344230,6476,1.0,2.0,1
1,3168,63.094798,2,3776.7,8200.9,5738.2,13200.4,91395.5,31986.1,1089.4,1339.5,14452.019213,281.394581,5453.716726,7346,2.0,4.0,1
2,3131,71.205479,1,4523.6,9383.2,8577.0,16020.2,118487.3,34742.2,1719.7,2169.2,21000.357958,348.730585,8004.224865,8146,2.0,2.0,0
3,4024,72.292350,1,3444.1,8405.3,5940.0,12945.9,93723.6,22075.2,1587.0,1690.8,13148.503931,297.837617,5969.685193,6497,2.0,2.0,0
4,4001,49.893151,1,4174.6,11058.9,7890.2,15731.7,126094.9,29284.0,1650.6,2085.8,16901.777054,262.838901,7330.256842,7613,2.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,3822,55.994536,1,3916.3,8508.5,6534.8,13407.2,120662.0,34627.3,1605.5,1856.3,15993.876755,316.770996,6418.796616,6877,1.0,2.0,1
121,3372,71.390710,1,4797.0,10114.8,9268.2,15093.5,112521.2,36136.9,3301.4,3167.2,17399.059361,318.438874,6918.687371,7682,1.0,2.0,1
122,3589,74.909589,2,3067.4,7619.4,6386.1,13104.2,92495.3,29665.6,1319.4,1262.7,14052.641567,283.262188,5467.899590,5476,2.0,2.0,0
123,3586,62.927210,1,3709.8,8082.5,5973.9,13831.9,109396.4,29020.2,1464.9,1306.0,16415.510277,342.644554,6638.502535,6423,1.0,2.0,1


In [23]:
# Persist the pre-matching table. The row index is written as well (pandas default),
# which produces an unnamed leading column in the CSV.
volumesDf.to_csv("../data/volume-data/preMatchVolumes.csv", index=True)

### Analyze stats before matching

In [24]:
def print_variable_stats(df):
    """Print gender counts plus age and baseline HY mean/std for each outcome group.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns "gen" (1 is reported as M, 2 as F),
        "group" (1: progressive, 0: stable), "age" and "initialHY".

    Returns
    -------
    None
        Six lines are written to standard output.
    """
    in_progressive = df["group"] == 1
    in_stable = df["group"] == 0
    is_male = df["gen"] == 1
    is_female = df["gen"] == 2

    def mean_and_std(members, column):
        # Mean and sample standard deviation of `column` over the selected rows
        values = df.loc[members, column]
        return values.mean(), values.std()

    # Gender split per group
    prog_males = (is_male & in_progressive).sum()
    prog_females = (is_female & in_progressive).sum()
    stable_males = (is_male & in_stable).sum()
    stable_females = (is_female & in_stable).sum()
    print(f"Progressive group - M/F gender: {prog_males}/{prog_females}")
    print(f"Stable group - M/F gender: {stable_males}/{stable_females}")

    # Age per group
    prog_age_mean, prog_age_std = mean_and_std(in_progressive, "age")
    stable_age_mean, stable_age_std = mean_and_std(in_stable, "age")
    print(f"Progressive group - Age: {prog_age_mean} +- {prog_age_std}")
    print(f"Stable group - Age: {stable_age_mean} +- {stable_age_std}")

    # Baseline HY stage per group
    prog_hy_mean, prog_hy_std = mean_and_std(in_progressive, "initialHY")
    stable_hy_mean, stable_hy_std = mean_and_std(in_stable, "initialHY")

    print(f"Progressive group - HY: {prog_hy_mean} +- {prog_hy_std}")
    print(f"Stable group - HY: {stable_hy_mean} +- {stable_hy_std}")
    
# Report gender, age and baseline HY balance of the two groups before matching
print_variable_stats(volumesDf)

Progressive group - M/F gender: 33/15
Stable group - M/F gender: 49/28
Progressive group - Age: 60.54356644710416 +- 10.246004545004064
Stable group - Age: 61.396429567870136 +- 9.26828414251294
Progressive group - HY: 1.1041666666666667 +- 0.3087092781885838
Stable group - HY: 1.7922077922077921 +- 0.40838779099431866


## Perform cohort matching

Now that we have our `volumesDf` ready, let's perform cohort matching on stable (0) and progressing (1) groups based on age, gender and initialHY. This will be done using the R script `match-data.R`

In [25]:
# Run the R matching script and stop here if it fails, so that later cells
# never read a missing or stale output file from a previous run.
exitStatus = os.system("Rscript match-data.R")
if exitStatus != 0:
    raise RuntimeError(f"Rscript match-data.R failed with status {exitStatus}")
exitStatus


Call:
matchit(formula = group ~ age + gen + initialHY, data = data, 
    method = "nearest", distance = "glm", replacement = F)

Summary of Balance for All Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
distance         0.6642        0.2094          2.1828     0.6270    0.3596
age             60.5436       61.3964         -0.0832     1.2221    0.0533
gen              1.3125        1.3636         -0.1092     0.9359    0.0256
initialHY        1.1042        1.7922         -2.2288     0.5714    0.3440
          eCDF Max
distance    0.6880
age         0.1285
gen         0.0511
initialHY   0.6880


Summary of Balance for Matched Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
distance         0.6642        0.2960          1.7672     0.4740    0.2287
age             60.5436       63.2762         -0.2667     1.8296    0.1103
gen              1.3125        1.2083          0.2224     1.3026    0.0521
initialHY        1.1042     

0

In [33]:
# Import matched data
matchedVolumesDf = pd.read_csv("../data/volume-data/matchedVolumes.csv")
# Drop the columns that are not part of the analysis table: saved-index leftovers
# ("Unnamed: 0", "X") and, presumably, bookkeeping columns written by match-data.R
# ("distance", "weights", "subclass") - verify against the script.
# `columns=` is used because passing the axis positionally is deprecated/removed in recent pandas.
matchedVolumesDf = matchedVolumesDf.drop(columns=["Unnamed: 0", "X", "distance", "weights", "subclass"])
print_variable_stats(matchedVolumesDf)
matchedVolumesDf

Progressive group - M/F gender: 33/15
Stable group - M/F gender: 38/10
Progressive group - Age: 60.54356644710416 +- 10.246004545004064
Stable group - Age: 63.27622888437501 +- 7.574883442297089
Progressive group - HY: 1.1041666666666667 +- 0.3087092781885838
Stable group - HY: 1.6666666666666667 +- 0.4763930673403308


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,subjectId,age,gen,Pallidum,Putamen,Caudate,Thalamus,CerebellumCortex,CerebellumWM,V3,V4,Pons,SCP,Midbrain,Insula,initialHY,followUpHY,group
0,4037,52.831492,1,4261.3,10223.0,7586.8,17351.7,116856.4,38230.0,934.2,1330.4,19552.781704,307.724746,6989.344230,6476,1,2,1
1,3168,63.094798,2,3776.7,8200.9,5738.2,13200.4,91395.5,31986.1,1089.4,1339.5,14452.019213,281.394581,5453.716726,7346,2,4,1
2,3131,71.205479,1,4523.6,9383.2,8577.0,16020.2,118487.3,34742.2,1719.7,2169.2,21000.357958,348.730585,8004.224865,8146,2,2,0
3,4024,72.292350,1,3444.1,8405.3,5940.0,12945.9,93723.6,22075.2,1587.0,1690.8,13148.503931,297.837617,5969.685193,6497,2,2,0
4,3373,60.815630,1,4085.5,10269.8,6978.1,13021.3,113109.0,28933.6,1714.2,2325.2,16216.102413,255.410208,6855.129215,7782,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,3760,68.517808,1,4123.7,10424.5,8110.8,14402.7,111459.4,26840.2,1216.8,2024.0,14397.696801,254.078771,6409.484955,7869,2,2,0
92,3325,66.565222,2,3939.1,9136.5,7239.2,14178.1,118630.7,33138.9,1698.1,2095.9,18443.846279,293.460180,7169.696370,7027,1,2,1
93,3822,55.994536,1,3916.3,8508.5,6534.8,13407.2,120662.0,34627.3,1605.5,1856.3,15993.876755,316.770996,6418.796616,6877,1,2,1
94,3372,71.390710,1,4797.0,10114.8,9268.2,15093.5,112521.2,36136.9,3301.4,3167.2,17399.059361,318.438874,6918.687371,7682,1,2,1
