In [2]:
import numpy as np
import collections
import csv
import pandas as pd 
import textwrap
from dateutil.parser import parse
from datetime import datetime

In [10]:
# Binary AD label for each DXCURREN code: 1 = NL, 2 = MCI, 3 = AD.
# Only code 3 maps to 1; an empty code maps to an empty string.
DX_CURREN = {"1": 0, "2": 0, "3": 1, "": ""}

# Binary AD label for each DXCHANGE code:
#   1 Stable: NL to NL    2 Stable: MCI to MCI   3 Stable: AD to AD
#   4 Conv: NL to MCI     5 Conv: MCI to AD      6 Conv: NL to AD
#   7 Rev: MCI to NL      8 Rev: AD to MCI       9 Rev: AD to NL
# Only the codes whose end state is AD (3, 5, 6) map to 1.
DX_CHANGE = {"1": 0, "2": 0, "3": 1, "4": 0, "5": 1, "6": 1, "7": 0, "8": 0, "9": 0, "": ""}

# RIDs of the patients handled specially by the `reversions` argument of get_labels.
reverted_patients = [167, 429, 555, 1226, 2210, 2367, 4005, 4114, 4426, 4434, 4641, 4706, 4746, 4899]


def get_labels(reversions = 'exclude',
               diagnosis_path = '../Assessments/DXSUM_PDXCONV_ADNIALL.csv',
               merge_path = '../Data___Database/ADNIMERGE.csv'):
    """Produces the response variable

    Reads every exam from the diagnosis file, keeps the most recent diagnosis
    per patient, and appends a binary AD value as the last entry of each row
    of the merged data file.

    Parameters:
    ----------
    reversions: how to deal with patients who revert from AD; 'exclude'
        (drop their rows) or 'label0' (assume they actually reverted and
        therefore don't have AD)
    diagnosis_path: path of the diagnosis CSV; columns 2, 8, 9, 10 and 47 are
        read as RID, EXAMDATE (mm/dd/YYYY), DXCHANGE, DXCURREN and DXOTHDEM
    merge_path: path of the merged data CSV; column 0 is read as RID and the
        row whose first cell is "RID" is taken as the header

    Returns:
    -------
    Data as a numpy array (one row per kept row of the merged file, with the
    AD value appended), and the column names as read from the header row
    (the appended AD column is not included in them).
    NOTE(review): rows mix strings with the integer label, so numpy is
    expected to coerce the label to a string -- confirm before doing numeric
    work on the result.

    Raises:
    ------
    ValueError: if `reversions` is not a supported option, or if a diagnosis
        row does not have exactly one of DXCURREN / DXCHANGE filled in
    KeyError: if a patient in the merged file has no row in the diagnosis file
    """
    if reversions not in ('exclude', 'label0'):
        raise ValueError(
            "reversions must be 'exclude' or 'label0', got {!r}".format(reversions))

    exams_by_patient = {}
    patients_nonADdementia = set()

    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with open(diagnosis_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None)  # skip header (tolerates an empty file)
        for row in reader:
            if not row:  # blank line in the file
                continue

            rid = int(row[2])
            exam_date = datetime.strptime(row[8], '%m/%d/%Y')
            dx_change = row[9]
            dx_curren = row[10]
            dx_othdem = row[47]

            # Use DXCURREN or DXCHANGE, depending on which one is present;
            # exactly one of them must be filled in.
            if dx_curren != "" and dx_change == "":
                diagnosis = DX_CURREN[dx_curren]
            elif dx_change != "" and dx_curren == "":
                diagnosis = DX_CHANGE[dx_change]
            else:
                raise ValueError(
                    "{}, line {}: expected exactly one of DXCURREN/DXCHANGE for RID {}, "
                    "got DXCURREN={!r}, DXCHANGE={!r}".format(
                        diagnosis_path, reader.line_num, rid, dx_curren, dx_change))

            exams_by_patient.setdefault(rid, []).append([exam_date, diagnosis])

            # Non-AD dementia: the exam codes an AD diagnosis but DXOTHDEM is set.
            if dx_othdem == "1" and (dx_change in ('3', '5', '6') or dx_curren == '3'):
                patients_nonADdementia.add(rid)

    # Most recent exam per patient. Entries are [date, diagnosis], so max()
    # picks the latest date (ties fall back to the larger diagnosis value,
    # exactly as taking the last element of the sorted list would).
    latest_exam = {rid: max(exams) for rid, exams in exams_by_patient.items()}

    reverted = set(reverted_patients)
    patient_rows = []
    labels = []

    with open(merge_path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:  # blank line in the file
                continue
            if row[0] == "RID":
                labels = row
                continue

            rid = int(row[0])
            if rid not in latest_exam:
                raise KeyError(
                    "RID {} from {} has no entry in {}".format(rid, merge_path, diagnosis_path))

            diagnosis = latest_exam[rid][1]

            # AD only if the most recent diagnosis is AD and the patient was
            # never flagged as a non-AD dementia case.
            is_ad = 1 if (diagnosis == 1 and rid not in patients_nonADdementia) else 0

            if rid in reverted:
                if reversions == 'exclude':
                    continue  # drop reverted patients entirely
                is_ad = 0  # 'label0': keep the row but force the label to 0

            row.append(is_ad)
            patient_rows.append(row)

    return np.array(patient_rows), labels

In [3]:
def choose_study(df, labels, study):
    """Filters the data to only consider the specific study of interest

    Names the columns (the given labels plus a trailing "AD" column), removes
    the 'DX' column and, unless study is 'all', keeps only the rows whose
    ORIGPROT equals the requested study.

    Parameters:
    -----------
    df: data to wrap in a dataframe; it must have one more column than there
        are entries in `labels` (the extra, last column is named "AD")
    labels: column names for every column except the last; must include
        'DX', and 'ORIGPROT' when a specific study is requested. The list is
        not modified.
    study: 'ADNI1', 'ADNI2', 'ADNIGO' or 'all'

    Returns:
    --------
    Dataframe without the 'DX' column, restricted to the chosen study
    """
    frame = pd.DataFrame(df)

    # Build a new list rather than appending to `labels`: mutating the
    # caller's list would add a second "AD" entry on the next call.
    frame.columns = list(labels) + ["AD"]
    frame = frame.drop(columns=["DX"])

    if study != 'all':
        frame = frame[frame["ORIGPROT"] == study]

    return frame


def drop_imaging_columns(df, imaging_to_drop):
    """Remove imaging-derived columns from the dataframe

    Parameters:
    -----------
    -df: dataframe holding the imaging columns that are to be removed
    -imaging_to_drop: 'MRI', 'PET' or 'all'; any other value (such as 'none')
     leaves the dataframe as it is

    Returns:
    --------
    Dataframe without the selected columns (the dataframe that was passed in
    is returned when nothing is dropped)

    """
    # Each measurement has a current column and a baseline ("_bl") twin.
    mri_measures = ['FLDSTRENG', 'FSVERSION', 'Ventricles', 'Hippocampus', 'WholeBrain',
                    'Entorhinal', 'Fusiform', 'MidTemp', 'ICV']
    pet_measures = ['FDG', 'PIB', 'AV45']
    mri_columns = mri_measures + [name + '_bl' for name in mri_measures]
    pet_columns = pet_measures + [name + '_bl' for name in pet_measures]

    # Collect the groups to remove; for 'all' the PET group goes first, then MRI.
    groups = []
    if imaging_to_drop in ('PET', 'all'):
        groups.append(pet_columns)
    if imaging_to_drop in ('MRI', 'all'):
        groups.append(mri_columns)

    result = df
    for column_group in groups:
        result = result.drop(columns=column_group)

    return result

In [4]:
def data_preprocess(study = 'all', imaging_to_drop = 'none', reversions = 'exclude'):
    """ Driver function for pre-processing

    Chains the three pre-processing stages and writes the result, with the
    binary output variable as the last column, to a csv file.

    Parameters:
    -----------
        -study: which study to keep; 'ADNI1', 'ADNI2', 'ADNIGO' or 'all'
        -imaging_to_drop: which imaging columns to remove; 'MRI', 'PET', 'all' or 'none'
        -reversions: how reverted patients are treated; 'exclude' or 'label0'

    Output:
    -------
    Writes Merged_data.csv (without the index column) and returns the same
    dataframe that was written.

    """
    # Stage 1: rows of the merged file with the response variable appended.
    labelled_rows, column_names = get_labels(reversions)

    # Stage 2: name the columns and restrict to the requested study.
    study_frame = choose_study(labelled_rows, column_names, study)

    # Stage 3: remove the unwanted imaging columns.
    merged = drop_imaging_columns(study_frame, imaging_to_drop)

    # Keep a copy on disk for inspection.
    merged.to_csv("Merged_data.csv", index = False)

    return merged
                

In [1]:
# Example run (writes Merged_data.csv to the working directory):
# data_preprocess(study = 'all', imaging_to_drop = 'none')