In [1]:
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import collections
import csv
import pandas as pd 
from scipy.stats import mode
from dateutil.parser import parse


# Merge in Diagnosis Data

Input: ADNIMERGE.csv and the diagnosis summary file DXSUM_PDXCONV_ADNIALL.csv

Output: Merged_data.csv

The driver function for the merge is `data_preprocess`, defined after the helper functions in the following code cells.

In [2]:
def specific_diagnosis(blank_DX = 'delete', dementia_noSpecifics = 'delete', dxothdem = 'standard',
                       adnimerge_path = '../Data___Database/ADNIMERGE.csv',
                       diagnosis_path = '../Assessments/DXSUM_PDXCONV_ADNIALL.csv'):
    """Integrates the specific diagnosis information into ADNIMERGE

    Reads every visit row from the ADNIMERGE csv and appends one extra value (the AD label) to it:
      - DX ends in 'Dementia' and a matching RID/EXAMDATE entry exists in the diagnosis file:
        1 if the DXOTHDEM field marks the visit as AD (see `dxothdem`), otherwise 0
      - DX ends in 'Dementia' but no matching entry can be found (or the visit's exam date
        cannot be parsed): 'Non-specific Dementia' if dementia_noSpecifics == 'keep',
        otherwise the row is dropped
      - DX is any other non-empty value: 0
      - DX is empty: 'No Diagnosis' if blank_DX == 'keep', otherwise the row is dropped.
        Keeping these rows is meant for future work imputing the diagnoses.

    Parameters:
    ----------
    blank_DX: specifies how to handle blank DX entries in ADNIMERGE, either 'delete' or 'keep'
    dementia_noSpecifics: specifies how to handle Dementia DX with no corresponding specific diagnosis (AD vs. non-AD), either 'delete' or 'keep'
    dxothdem: specifies what to consider AD in Diagnosis file, 'standard' means only consider empty string as AD, 'include-4' means consider empty string and -4's as AD
    adnimerge_path: path of the ADNIMERGE csv file (defaults to the location used by this notebook)
    diagnosis_path: path of the diagnosis summary csv file (defaults to the location used by this notebook)

    Returns:
    -------
    Data as a numpy array (one row per kept visit, AD label as the last value), and the
    column names read from the ADNIMERGE header row (without an entry for the AD label).
    NOTE: the labels are appended as ints/strs to rows of strings; the downstream filter
    compares against the strings '0'/'1', i.e. it relies on np.array coercing them to text.

    Raises:
    -------
    ValueError if dxothdem is not one of the two supported options (previously an unknown
    value silently labelled every dementia visit as non-AD).
    """
    if dxothdem not in ('standard', 'include-4'):
        raise ValueError("dxothdem must be 'standard' or 'include-4', got %r" % (dxothdem,))

    # Column positions, hard-coded for the file layouts this notebook was written against.
    # NOTE(review): verify these whenever the ADNI exports change.
    MERGE_EXAMDATE_COL = 6    # EXAMDATE in ADNIMERGE
    MERGE_DX_COL = 51         # DX in ADNIMERGE
    DIAG_RID_COL = 2          # RID in the diagnosis file
    DIAG_EXAMDATE_COL = 8     # EXAMDATE in the diagnosis file
    DIAG_DXOTHDEM_COL = 47    # DXOTHDEM in the diagnosis file

    #make patient dictionary of RID --> rows corresponding to patient
    patient_dict = {}
    labels = []
    with open(adnimerge_path) as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row: #blank line in the file
                continue
            if row[0] == "RID":
                labels = row
            else:
                patient_dict.setdefault(row[0], []).append(row)

    #make dictionary of dictionaries RID --> EXAMDATE --> specific diagnosis
    patient_diagnosis_dict = {}
    with open(diagnosis_path) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader, None) #skip header (tolerate an empty file)
        for row in reader:
            if not row: #blank line in the file
                continue
            exam_date = parse(row[DIAG_EXAMDATE_COL])
            diagnoses_for_patient = patient_diagnosis_dict.setdefault(row[DIAG_RID_COL], {})
            diagnoses_for_patient[exam_date] = row[DIAG_DXOTHDEM_COL]

    ################
    #Iterate through each visit for each patient and decide on the AD label.
    #A label of None means "drop this visit".
    ################
    patient_matrix = []
    for patient, visits in patient_dict.items():
        for visit in visits:
            DX = visit[MERGE_DX_COL]
            #covers both a plain 'Dementia' and conversions such as '... to Dementia'
            if DX.split(' ')[-1] == 'Dementia':
                try:
                    #attempt to find the RID, EXAMDATE combo in the Diagnosis file
                    DX_AD = patient_diagnosis_dict[patient][parse(visit[MERGE_EXAMDATE_COL])]
                except (KeyError, ValueError, OverflowError):
                    #KeyError: RID or exam date missing from the Diagnosis file
                    #ValueError/OverflowError: the visit's exam date could not be parsed
                    ad_label = 'Non-specific Dementia' if dementia_noSpecifics == 'keep' else None
                else:
                    #empty DXOTHDEM means AD; with 'include-4', '-4' counts as AD as well
                    is_ad = (not DX_AD) or (dxothdem == 'include-4' and DX_AD == '-4')
                    ad_label = 1 if is_ad else 0
            elif DX: #if there is other diagnosis besides Dementia
                ad_label = 0
            else: #if missing diagnosis information
                ad_label = 'No Diagnosis' if blank_DX == 'keep' else None

            if ad_label is not None:
                visit.append(ad_label)
                patient_matrix.append(visit)

    return np.array(patient_matrix), labels


def convert_to_binary_predictor(intermediate_matrix, labels, method = "delete"):
    """ This method converts the final column into binary variable

    The last column of `intermediate_matrix` (the AD label added by specific_diagnosis)
    is named 'AD', the 'DX' column is removed, and AD is converted to a numeric 0/1 column.

    This would also be where any future work would be done in terms of imputing
    a diagnosis based on previous/future diagnoses. For now, rows with incomplete diagnosis
    are deleted.

    Parameters:
    -----------
    -intermediate_matrix: 2-D array-like of visit rows whose last value is the AD label
    -labels: column names for every column except the trailing AD label; must contain 'DX'.
             The list is NOT modified.
    -method: method through which to deal with non-numeric entries added in the specific_diagnosis function.
             method = 'delete' says to delete all rows where AD != 0 or 1

    Returns:
    --------
    Dataframe without the 'DX' column, holding only the rows whose AD label is 0 or 1,
    with 'AD' as a numeric column.

    Raises:
    -------
    ValueError if `method` is anything other than 'delete' (the only strategy implemented;
    previously such a call silently returned None).
    """
    if method != "delete":
        raise ValueError("Unsupported method %r; only 'delete' is implemented" % (method,))

    # Build the column list on a copy so the caller's `labels` is left untouched.
    columns = list(labels) + ["AD"]

    if len(intermediate_matrix) == 0:
        # No visits survived the earlier filtering: return an empty frame with the right columns.
        df = pd.DataFrame(columns=columns)
    else:
        df = pd.DataFrame(intermediate_matrix, columns=columns)
    df = df.drop(['DX'], axis=1)

    # Compare as text so both '0'/'1' (string matrix) and 0/1 (object matrix) labels are kept.
    is_binary = df['AD'].astype(str).isin(['0', '1']) #drop rows where AD is not 0 or 1
    # .copy() so the assignment below writes to an independent frame, not a slice of `df`.
    df_binary = df.loc[is_binary].copy()
    df_binary['AD'] = pd.to_numeric(df_binary['AD']) #convert to numeric
    return df_binary

def choose_study(df, study):
    """Restrict the data to the visits of a single study

    Parameters:
    -----------
    df: dataframe with a 'COLPROT' column
    study: 'ADNI1', 'ADNI2', 'ADNIGO' or 'all'

    Returns:
    --------
    `df` itself when study is 'all'; otherwise only the rows whose 'COLPROT' equals `study`
    """
    if study == 'all':
        return df

    in_study = df['COLPROT'].isin([study])
    return df.loc[in_study]


def drop_imaging_columns(df, imaging_to_drop):
    """Remove the columns belonging to one or both imaging modalities

    Parameters:
    -----------
    -df: dataframe
    -imaging_to_drop: 'MRI', 'PET' or 'all' drop the corresponding columns;
                      any other value (e.g. 'none') leaves the dataframe as is

    Returns:
    --------
    Dataframe without the selected imaging columns (`df` itself when nothing is dropped)

    """
    mri_measures = ['FLDSTRENG', 'FSVERSION', 'Ventricles', 'Hippocampus', 'WholeBrain',
                    'Entorhinal', 'Fusiform', 'MidTemp', 'ICV']
    pet_measures = ['FDG', 'PIB', 'AV45']

    # Every measure also has a baseline twin carrying the '_bl' suffix.
    mri_columns = mri_measures + [name + '_bl' for name in mri_measures]
    pet_columns = pet_measures + [name + '_bl' for name in pet_measures]

    columns_by_option = {
        'MRI': mri_columns,
        'PET': pet_columns,
        'all': pet_columns + mri_columns,
    }

    doomed = columns_by_option.get(imaging_to_drop)
    if doomed is None:
        return df
    return df.drop(doomed, axis=1)
    

In [1]:
def data_preprocess(study = 'all', blank_DX = 'delete', dementia_noSpecifics = 'delete', dxothdem = 'standard', method = 'delete', imaging_to_drop = 'none'):
    """ Driver function for pre-processing
    Runs the three pre-processing stages in order and writes the result to a csv file
    whose last column is the binary output variable.

    Parameters:
    -----------
    Stage 1 parameters (passed to specific_diagnosis):
        -blank_DX: specify whether to 'keep' or 'delete' rows with blank DX during the first stage of preprocessing
        -dementia_noSpecifics: specify whether to 'keep' or 'delete' rows with Dementia DX but no specific diagnosis info in Diagnosis file
        -dxothdem: specifies whether only " " ('standard') are considered AD patients or "-4" is also considered AD ('include-4')
    Stage 2 parameters (passed to convert_to_binary_predictor and choose_study):
        -method: Specifies how to convert AD column to a binary predictor; 'delete' deletes all rows with non-numeric AD column value
        -study: specify which study to consider; 'ADNI1', 'ADNI2', 'ADNIGO' or 'all'
    Stage 3 parameters (passed to drop_imaging_columns):
        -imaging_to_drop: specify which imaging type to ignore; 'MRI', 'PET', 'all' or 'none'

    Output:
    -------
    Saves file as a csv named Merged_data.csv (overwritten on every call) and
    returns the same data as a dataframe

    """
    # Stage 1: attach the AD label to every visit row.
    labelled_rows, column_names = specific_diagnosis(
        blank_DX = blank_DX,
        dementia_noSpecifics = dementia_noSpecifics,
        dxothdem = dxothdem,
    )

    # Stage 2: make the label binary, then restrict to the requested study.
    binary_frame = convert_to_binary_predictor(labelled_rows, column_names, method = method)
    study_frame = choose_study(binary_frame, study = study)

    # Stage 3: discard the imaging columns that are not wanted.
    result = drop_imaging_columns(study_frame, imaging_to_drop = imaging_to_drop)

    # Persist a copy for manual inspection.
    result.to_csv("Merged_data.csv", index = False)

    return result
                

### Function to see number of AD patients left in each study

In [4]:
def LookAtPatientNumbers():
    """Print, for each study, how many patients have at least one AD-labelled visit.

    For ADNI1, ADNI2 and ADNIGO the data is pre-processed with the default 'delete'
    options (each study with its own imaging_to_drop setting), the AD column is summed
    per RID, and the number of patients with a non-zero sum is printed, followed by the
    shape of the pre-processed data. For ADNI2 the number of distinct patients is printed too.

    Side effect: every data_preprocess call rewrites Merged_data.csv, so after this
    function returns the file holds the ADNIGO data.

    Each print call takes a single pre-formatted argument so the output is the same
    on Python 2 and Python 3.
    """
    # (study, imaging_to_drop, also print the number of distinct patients?)
    study_settings = [
        ('ADNI1', 'all', False),
        ('ADNI2', 'MRI', True),
        ('ADNIGO', 'PET', False),
    ]

    for study, imaging_to_drop, show_patient_total in study_settings:
        data = data_preprocess(study = study, blank_DX = 'delete', dementia_noSpecifics = 'delete', dxothdem = 'standard', method = 'delete', imaging_to_drop = imaging_to_drop)

        # Sum of the 0/1 AD column per patient; non-zero means at least one AD visit.
        ad_visits_per_patient = data.groupby(['RID'])[["AD"]].sum()
        n_ad_patients = int((ad_visits_per_patient["AD"] != 0).sum())

        print("%s has  %d  patients with AD" % (study, n_ad_patients))
        if show_patient_total:
            print(len(data['RID'].unique()))
        print(data.shape)
