## Building cohort

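In this notebook, we use PPMI's `MDS_UPDRS_Part_III` file to build two cohorts: *progressive* patients, whose Hoehn and Yahr stage (`NHY`) increases between two visits three years apart, and *stable* patients, whose stage does not increase over such a gap.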

In [1]:
import os
import pandas as pd
import numpy as np

#### Build a dataframe of patients rated at both visits of a pair (BL and V08 by default)

In [7]:
def get_df(visitFrom="BL", visitTo="V08", dataFile="../data/ppmi-data/HY_Baseline_Stage.csv"):
    """
    Load the NHY ratings of subjects that were rated at both visits of a pair.

    Parameters
    ----------
    visitFrom : str
        EVENT_ID of the earlier visit.
    visitTo : str
        EVENT_ID of the later visit.
    dataFile : str
        Path to a CSV file providing at least the PATNO, EVENT_ID and NHY columns.

    Returns
    -------
    pandas.DataFrame
        Columns subjectId (renamed from PATNO), EVENT_ID and NHY, with the row
        labels of the CSV preserved. A subject is kept only when, once rows with
        a missing NHY are discarded, it has exactly two rows for the visit pair,
        those rows cover both visitFrom and visitTo, and none of them has an NHY
        of 0 (numeric or the string "0").
    """
    df = pd.read_csv(dataFile)
    df = df[["PATNO", "EVENT_ID", "NHY"]]

    # Remove null and missing data
    df = df.dropna(subset=["NHY"])

    # Keep only the rows of the two requested visits
    df = df[df["EVENT_ID"].isin([visitFrom, visitTo])]

    # Every criterion is computed once as a collection of subject ids and then
    # applied with a boolean mask. Building a new frame (instead of dropping
    # rows in place from a filtered slice) avoids SettingWithCopyWarning and the
    # repeated full-frame scans of a per-subject loop.
    subjects = df["PATNO"]
    withFromVisit = df.loc[df["EVENT_ID"] == visitFrom, "PATNO"]
    withToVisit = df.loc[df["EVENT_ID"] == visitTo, "PATNO"]
    rowsPerSubject = subjects.value_counts()
    withTwoRows = rowsPerSubject.index[rowsPerSubject == 2]
    # NHY may be parsed as numbers or as strings depending on the file content.
    withZeroStage = df.loc[df["NHY"].eq(0) | df["NHY"].eq("0"), "PATNO"]

    # Rows with a missing PATNO cannot be attributed to a subject and never qualify.
    keep = (
        subjects.isin(withFromVisit)
        & subjects.isin(withToVisit)
        & subjects.isin(withTwoRows)
        & ~subjects.isin(withZeroStage)
    )

    return df.loc[keep].rename(columns={"PATNO": "subjectId"})

def build_df_cohort(df, cohort, visitFrom="BL", visitTo="V08"):
    """
    Return the ids of the progressive or the stable subjects of a visit pair.

    A subject is progressive when its NHY at visitFrom is strictly lower than
    its NHY at visitTo, and stable otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with subjectId, EVENT_ID and NHY columns (the layout returned by
        get_df). It is not modified.
    cohort : str
        Either "progressive" or "stable".
    visitFrom : str
        EVENT_ID of the earlier visit.
    visitTo : str
        EVENT_ID of the later visit.

    Returns
    -------
    numpy.ndarray
        Unique subject ids of the requested cohort, in order of first appearance
        in df.

    Raises
    ------
    ValueError
        If cohort is not one of the two supported names, or if a subject has no
        row for visitFrom or for visitTo.
    """
    if cohort not in ("progressive", "stable"):
        # Fail loudly instead of silently returning None.
        raise ValueError(f"cohort must be 'progressive' or 'stable', got {cohort!r}")

    subjectIds = df["subjectId"].unique()

    # One NHY per subject and visit; when a visit is repeated the first row wins.
    baselineBySubject = (
        df.loc[df["EVENT_ID"] == visitFrom]
        .drop_duplicates(subset="subjectId")
        .set_index("subjectId")["NHY"]
    )
    followUpBySubject = (
        df.loc[df["EVENT_ID"] == visitTo]
        .drop_duplicates(subset="subjectId")
        .set_index("subjectId")["NHY"]
    )

    withBothVisits = baselineBySubject.index.intersection(followUpBySubject.index)
    incomplete = pd.Index(subjectIds).difference(withBothVisits)
    if len(incomplete) > 0:
        raise ValueError(
            f"subjects without both {visitFrom} and {visitTo} rows: {list(incomplete)}"
        )

    # Align both stages on the subject order, then compare them in one pass
    # instead of scanning the whole frame twice for every row.
    initialHY = baselineBySubject.reindex(subjectIds).values
    followUpHY = followUpBySubject.reindex(subjectIds).values
    progressed = initialHY < followUpHY

    if cohort == "progressive":
        return subjectIds[progressed]
    return subjectIds[~progressed]

def build_cohort_per_3y_gap(dataFile="../data/ppmi-data/HY_Baseline_Stage.csv", cohort="progressive", progressiveIds=None):
    '''
    Build cohort using all patients that have 3 year difference in visits available.

    Every visit pair listed below is processed in order with get_df and
    build_df_cohort, and the subjects of the requested cohort are accumulated.

    Parameters
    ----------
    dataFile : str
        Path to the CSV file handed to get_df.
    cohort : str
        "progressive" or "stable"; forwarded to build_df_cohort.
    progressiveIds : iterable or None
        Subject ids to exclude when cohort is "stable". None means that nothing
        is excluded. Ignored for any other cohort.

    Returns
    -------
    (set, dict)
        The selected subject ids, and a mapping from each selected id to the
        "<visitFrom>-<visitTo>" label of the first visit pair that selected it.
    '''
    visitFrom = ["BL", "V02", "V04", "V05", "V06", "V08", "V10", "V12", "V13", "V15", "V16", "V17"]
    visitTo = ["V08", "V09", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V18", "V19", "V20"]

    # The default None used to crash on the membership test below; treat it as
    # an empty exclusion list. A set also keeps each lookup constant-time.
    excludedIds = set(progressiveIds) if progressiveIds is not None else set()

    subjectDict = {}
    subjectIds = set()
    for vFrom, vTo in zip(visitFrom, visitTo):
        print(f"Getting subjects from {vFrom} to {vTo}...")
        df = get_df(vFrom, vTo, dataFile)
        cohortIds = build_df_cohort(df, cohort, visitFrom=vFrom, visitTo=vTo)

        # Once we fetched the progressive patients, we select stable patients and ensure that they
        # are not already in the progressive cohort.
        if cohort == "stable":
            cohortIds = [subjectId for subjectId in cohortIds if subjectId not in excludedIds]

        # Save a dictionary to know each patient is taken from which visits;
        # setdefault keeps the label of the first pair that selected the subject.
        for subjectId in cohortIds:
            subjectDict.setdefault(subjectId, f'{vFrom}-{vTo}')
        subjectIds.update(cohortIds)

    return subjectIds, subjectDict

In [8]:
# Locations of the PPMI tables, relative to the notebook.
HY_BASELINE_FILE = "../data/ppmi-data/HY_Baseline_Stage.csv"
UPDRS_FILE = "../data/ppmi-data/MDS_UPDRS_Part_III.csv"

# The progressive cohort is built first because its ids are then excluded
# from the stable cohort.
print("==============Fetching all progressive patients with 3y gap visits==============")
progressiveSubjectsIds, progressiveDict = build_cohort_per_3y_gap(UPDRS_FILE, "progressive")

print("==============Fetching all stable patients with 3y gap visits==============")
stableSubjectsIds, stableDict = build_cohort_per_3y_gap(
    UPDRS_FILE,
    "stable",
    progressiveSubjectsIds,
)

Getting subjects from BL to V08...




Getting subjects from V02 to V09...
Getting subjects from V04 to V10...
Getting subjects from V05 to V11...
Getting subjects from V06 to V12...
Getting subjects from V08 to V13...
Getting subjects from V10 to V14...
Getting subjects from V12 to V15...
Getting subjects from V13 to V16...
Getting subjects from V15 to V18...
Getting subjects from V16 to V19...
Getting subjects from V17 to V20...
Getting subjects from BL to V08...
Getting subjects from V02 to V09...
Getting subjects from V04 to V10...
Getting subjects from V05 to V11...
Getting subjects from V06 to V12...
Getting subjects from V08 to V13...
Getting subjects from V10 to V14...
Getting subjects from V12 to V15...
Getting subjects from V13 to V16...
Getting subjects from V15 to V18...
Getting subjects from V16 to V19...
Getting subjects from V17 to V20...


In [9]:
# Report the size of each cohort followed by its comma-separated subject ids.
print(f"Progressive subjects IDs: - {len(progressiveSubjectsIds)} patients\n{','.join(str(subjectId) for subjectId in progressiveSubjectsIds)}\n")
print(f"Stable subjects IDs: - {len(stableSubjectsIds)} patients \n{','.join(str(subjectId) for subjectId in stableSubjectsIds)}")

Progressive subjects IDs: - 252 patients
3588,3076,3078,3591,4103,4109,3086,4111,4112,4113,41486,3603,3604,4117,4115,3609,3102,3616,3105,3621,3622,3111,4135,3625,3116,3631,41521,3123,3638,50746,3130,53308,3650,3653,3654,3660,3150,3664,3665,3666,72784,40538,53339,3166,41568,60004,3173,3175,40553,3179,3180,3184,3185,3186,3700,3190,3711,60033,40578,60035,3203,3207,18567,42121,40585,3211,3212,40592,3218,3220,3223,3226,3227,3229,3230,3231,3233,60065,3752,50860,3757,3758,3760,3763,3251,3252,3766,3770,3773,3776,3778,3268,3269,3781,3780,3785,3787,3789,3791,3792,3793,3795,3802,3815,3305,3818,3308,3309,3822,3311,3312,3313,3823,3825,3826,40691,40693,3829,3830,3831,3832,3835,3323,3834,40694,40703,40704,3328,85242,3332,40709,16644,40713,40714,3856,3866,40730,3354,3869,3870,40733,3364,3365,3371,3372,3373,3377,40755,3385,3386,3387,3392,41281,40769,41282,42308,41285,41280,41287,41289,40777,3914,40778,41293,92490,3407,41295,55124,90456,41305,3417,3419,3420,3421,40800,3429,3431,42346,3435,3439,3951,3953

#### Build progressive and stable cohort dataframes

In [10]:
progressivePatients = "../data/ppmi-data/prog.csv"
stablePatients = "../data/ppmi-data/stable.csv"


def _load_cohort_demographics(csvPath):
    """
    Read a cohort CSV and return its subject id, gender code and age.

    The "Subject ID", "Sex" and "Age" columns are renamed to subjectId, gen and
    age, and every other column is discarded. Gender is recoded as M -> 1 and
    F -> 2.
    """
    cohort = pd.read_csv(csvPath)
    cohort = cohort.rename(columns={"Subject ID": "subjectId", "Sex": "gen", "Age": "age"})
    # Explicit copy: assigning a column on a column-subset slice would otherwise
    # raise SettingWithCopyWarning here and in later cells that add columns.
    cohort = cohort[["subjectId", "gen", "age"]].copy()
    # Any value other than "M" or "F" is mapped to NaN.
    cohort["gen"] = cohort["gen"].map({"M": 1, "F": 2})
    return cohort


# Both cohorts share the same layout, so they go through the same loader.
progDf = _load_cohort_demographics(progressivePatients)
stableDf = _load_cohort_demographics(stablePatients)

#### Get a dataframe for each 3-year gap

In [22]:
def get_df_dict(dataFile):
    """
    Build the filtered dataframe of every visit pair.

    get_df is called once per (start, end) visit pair listed below, in order,
    and a progress line is printed before each call.

    Parameters
    ----------
    dataFile : str
        Path to the CSV file handed to get_df.

    Returns
    -------
    dict
        Maps "<start>-<end>" (for example "BL-V08") to the dataframe returned by
        get_df for that pair.
    """
    startVisits = ["BL", "V02", "V04", "V05", "V06", "V08", "V10", "V12", "V13", "V15", "V16", "V17"]
    endVisits = ["V08", "V09", "V10", "V11", "V12", "V13", "V14", "V15", "V16", "V18", "V19", "V20"]

    framesByRange = {}
    for vFrom, vTo in zip(startVisits, endVisits):
        print(f"Building dataframe for {vFrom}-{vTo}...")
        framesByRange[f"{vFrom}-{vTo}"] = get_df(vFrom, vTo, dataFile)

    return framesByRange

#### Add initial and final NHY per cohort

In [31]:
def add_nhy(cohortDf, cohortDict, dataFile):
    """
    Attach the initial and follow-up NHY of every subject to a cohort dataframe.

    Parameters
    ----------
    cohortDf : pandas.DataFrame
        Cohort frame with a subjectId column. It is modified in place: the
        initialHY and followUpHY columns are added (or overwritten).
    cohortDict : dict
        Maps each subject id to the "<visitFrom>-<visitTo>" label of the visit
        pair to read its stages from.
    dataFile : str
        Path to the CSV file handed to get_df_dict.

    Returns
    -------
    pandas.DataFrame
        The same cohortDf object, with the two extra columns.
    """
    framesByRange = get_df_dict(dataFile)

    def nhy_at(frame, subjectId, visit):
        # First NHY value recorded for this subject at this visit.
        rows = frame.loc[(frame['subjectId'] == subjectId) & (frame['EVENT_ID'] == visit)]
        return rows["NHY"].values[0]

    stagePairs = []
    for subjectId in cohortDf["subjectId"].values:
        vFrom, vTo = cohortDict[subjectId].split("-")
        frame = framesByRange[f'{vFrom}-{vTo}']
        stagePairs.append((nhy_at(frame, subjectId, vFrom), nhy_at(frame, subjectId, vTo)))

    cohortDf["initialHY"] = [initial for initial, _ in stagePairs]
    cohortDf["followUpHY"] = [followUp for _, followUp in stagePairs]
    return cohortDf

In [32]:
# Look up the initial and follow-up stage of each subject, in the visit pair
# recorded for that subject when the cohorts were built.
print("=================Adding initial and final HY to progressive dataframe=================")
progDf = add_nhy(progDf, progressiveDict, UPDRS_FILE)

print("=================Adding initial and final HY to stable dataframe=================")
stableDf = add_nhy(stableDf, stableDict, UPDRS_FILE)

Building dataframe for BL-V08...


  if __name__ == '__main__':


Building dataframe for V02-V09...
Building dataframe for V04-V10...
Building dataframe for V05-V11...
Building dataframe for V06-V12...
Building dataframe for V08-V13...
Building dataframe for V10-V14...
Building dataframe for V12-V15...
Building dataframe for V13-V16...
Building dataframe for V15-V18...
Building dataframe for V16-V19...
Building dataframe for V17-V20...
Building dataframe for BL-V08...
Building dataframe for V02-V09...
Building dataframe for V04-V10...
Building dataframe for V05-V11...
Building dataframe for V06-V12...
Building dataframe for V08-V13...
Building dataframe for V10-V14...
Building dataframe for V12-V15...
Building dataframe for V13-V16...
Building dataframe for V15-V18...
Building dataframe for V16-V19...
Building dataframe for V17-V20...


#### Save dataframes

In [33]:
# Stack both cohorts and label each row: 1 when the stage increased between the
# two visits, 0 otherwise. Row labels of the two input frames are kept as-is.
# NOTE(review): the saved output of this cell lists subject 53060 twice, so the
# cohort CSVs may contain repeated subjects - confirm whether that is intended.
final_df = pd.concat([progDf, stableDf]).assign(
    group=lambda cohort: (cohort["initialHY"] < cohort["followUpHY"]).astype(int)
)
final_df.to_csv("../data/volume-data/cohortBeforeMatching.csv")
final_df

Unnamed: 0,subjectId,gen,age,initialHY,followUpHY,group
0,3001,1,65.1,1,2,1
1,3003,2,56.7,2,3,1
2,3020,2,74.0,2,3,1
3,3024,1,52.7,1,2,1
4,3059,1,83.0,2,3,1
...,...,...,...,...,...,...
195,53060,1,68.1,2,2,0
196,53060,1,68.1,2,2,0
197,54265,1,75.2,2,2,0
198,55875,1,59.0,2,2,0
