In [1]:
import numpy as np
import collections
import csv
import pandas as pd 
from datetime import datetime

In [2]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """find a notebook, given its fully qualified name and an optional path

    This turns "foo.bar" into "foo/bar.ipynb"
    and tries turning "Foo_Bar" into "Foo Bar" if Foo_Bar
    does not exist.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    for d in path:
        nb_path = os.path.join(d, name + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
        # let import Notebook_Name find "Notebook Name.ipynb"
        nb_path = nb_path.replace("_", " ")
        if os.path.isfile(nb_path):
            return nb_path


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""
    def __init__(self, path=None):
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """import a notebook as a module"""
        path = find_notebook(fullname, self.path)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = current.read(f, 'json')


        # create the module and add it to sys.modules
        # if name in sys.modules:
        #    return sys.modules[name]
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.worksheets[0].cells:
                if cell.cell_type == 'code' and cell.language == 'python':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.input)
                    # run the code in themodule
                    exec(code, mod.__dict__)
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Module finder that locates Jupyter Notebooks"""
    def __init__(self):
        self.loaders = {}

    def find_module(self, fullname, path=None):
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return

        key = path
        if path:
            # lists aren't hashable
            key = os.path.sep.join(path)

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

sys.meta_path.append(NotebookFinder())




- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


# Sanity Check

Run Random Forest model with just mean mode imputation, no categorical encoding, no feature selection for all available data

In [4]:
#Get labels

#DX_CURREN = {'1':'NL', "2": 'MCI', "3": 'AD', "":""}
DX_CURREN = {"1": 0, "2": 0, "3": 1, "":""}
#DX_CHANGE = {'1':"Stable:NL to NL",'2':"Stable: MCI to MCI",'3':"Stable: AD to AD",'4':"Conv:NL to MCI",'5':"Conv:MCI to AD",'6':"Conv:NL to AD", '7':"Rev:MCI to NL",'8':"Rev:AD to MCI",'9':"Conv:AD to NL","":""}
DX_CHANGE = {"1":0,"2":0,"3":1,"4":0,"5":1,"6":1,"7":0,"8":0,"9":0,"":""}

reverted_patients = [167, 429, 555, 1226, 2210, 2367, 4005, 4114, 4426, 4434, 4641, 4706, 4746, 4899]

patient_diagnosis_dict = {}
patients_nonADdementia = set()

with open('../Assessments/DXSUM_PDXCONV_ADNIALL.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader) #skip header
    for row in reader:
        RID = int(row[2])
        EXAMDATE = datetime.strptime(row[8], '%m/%d/%Y')
        Dx_curren = row[10]
        Dx_change = row[9]
        DXOTHDEM = row[47]

        if RID not in patient_diagnosis_dict:
            patient_diagnosis_dict[RID] = []

        #use the DXCURREN or DXCHANGE, depending on which is present
        if Dx_curren != "" and Dx_change == "":
            patient_diagnosis_dict[RID].append([EXAMDATE, DX_CURREN[Dx_curren]])
        elif Dx_change != "" and Dx_curren == "":
            patient_diagnosis_dict[RID].append([EXAMDATE, DX_CHANGE[Dx_change]])
        else:
            assert 1 == 0

        #Check for the Non-AD dementia cases
        ##################
        #if Dx_change indicates AD but non-AD dementia by DXOTHDEM
        if DXOTHDEM == "1":
            if Dx_change in ['3','5','6'] or Dx_curren == '3':
                patients_nonADdementia.add(RID)

#take the most recent diagnosis information 
for patient in patient_diagnosis_dict:
    exams = sorted(patient_diagnosis_dict[patient])[-1]
    patient_diagnosis_dict[patient] = exams

patient_dict = []
labels = []
response_variable = []

with open('../Data___Database/ADNIMERGE.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        if row[0] == "RID":
            labels = row
        else:
            assert patient_diagnosis_dict[int(row[0])] #checks to make sure this patient is in diagnosis file

            diag_info = patient_diagnosis_dict[int(row[0])]
            diag_date = diag_info[0]
            diag = diag_info[1]

            if int(row[0]) in patients_nonADdementia: #manually correct patients with non-AD dementia 
                response_variable.append(0)

            #AD patient if the most recent Diagnosis is AD
            elif diag == 1:
                response_variable.append(1)
            else:
                response_variable.append(0)


            #parameter setting to label reversions as AD = 0
            if int(row[0]) not in reverted_patients:
                patient_dict.append(row)
            else:
                response_variable = response_variable[:-1]

#write labels out to a separate file
with open('FinalLabels.csv','wb') as csv_file:
    writer = csv.writer(csv_file)
    for i in range(len(patient_dict)):
        writer.writerow([patient_dict[i][0],response_variable[i]])


In [5]:
#convert to DataFrame and delete DX column
df = pd.DataFrame(np.array(patient_dict))
df.columns = labels
del df['DX']

## Convert Categorical Variables and Delete Variables We Don't Want

In [6]:
###Converts the two ExamDate columns into one EXAMDATE_bl_minus_EXAMDATE column
date_cols = ['EXAMDATE','EXAMDATE_bl']
df['EXAMDATE_bl_minus_EXAMDATE'] = (pd.to_datetime(df[date_cols[0]])-pd.to_datetime(df[date_cols[1]]))
   
delta = []
for i in df['EXAMDATE_bl_minus_EXAMDATE']:
    delta.append(i.days) #append the number of days since the basline visit
df['EXAMDATE_bl_minus_EXAMDATE'] = delta



###Delete these given columns now 
cols_to_delete = ['update_stamp','EXAMDATE','EXAMDATE_bl','PTID','SITE','FLDSTRENG','FSVERSION','FLDSTRENG_bl','FSVERSION_bl','DX_bl']
for i in cols_to_delete:
    del df[i]

    
    
####one-hot encode the following columns 
one_hot_cols = ['VISCODE','COLPROT','ORIGPROT','PTGENDER','PTETHCAT','PTRACCAT','PTMARRY']
df = pd.get_dummies(df, columns = one_hot_cols)

df.to_csv("Merged_data.csv", index = False)

# Longitudinal Code (adapted slightly)

In [7]:
import LongitudinalDataAnalysis_WithOutLabels

importing Jupyter notebook from LongitudinalDataAnalysis_WithOutLabels.ipynb


<IPython.core.display.Javascript object>

In [8]:

# -------------------------------
# Longitudinal Data Analysis
# -------------------------------

# Input file name for Longitudinal Data Analysis
InputToLongitudinal='Merged_data.csv'

with open(InputToLongitudinal) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    labels = next(reader)

# Output file name from this script
OutputFromLongitudinal='LongitudinalDataAnalysis.csv'

# Patient RID Features
Patient_FEATURES=['RID'];

# Demographic Features
Demo_FEATURES_type=['AGE','PTEDUCAT','PTGENDER','PTETHCAT','PTRACCAT','PTMARRY']
Demo_FEATURES = []
for i in labels:
    if i in Demo_FEATURES_type:
        Demo_FEATURES.append(i)
    elif i.find("_") != -1 and i[:i.find("_")] in Demo_FEATURES_type:
        Demo_FEATURES.append(i)
    
# Baseline OneTime Features
BaselineOneTime_FEATURES_type = ['APOE4','Years_bl','DX_bl']
BaselineOneTime_FEATURES = []
for i in labels:
    if i in BaselineOneTime_FEATURES_type:
        BaselineOneTime_FEATURES.append(i)
    elif i.rfind("_") != -1 and i[:i.rfind("_")] in BaselineOneTime_FEATURES_type:
        BaselineOneTime_FEATURES.append(i)
        
# Time Headers
Time_FEATURES_type=['SITE','Month','update_stamp_minus_EXAMDATE_bl','update_stamp_minus_EXAMDATE','EXAMDATE_bl_minus_EXAMDATE',
               'COLPROT','ORIGPROT','M','Month_bl']
Time_FEATURES = []
for i in labels:
    if i in Time_FEATURES_type:
        Time_FEATURES.append(i)
    elif i.find("_") != -1 and i[:i.find("_")] in Time_FEATURES_type:
        Time_FEATURES.append(i)

# Baseline Evaluation Features
BaselineEvaluation_FEATURES=['CDRSB_bl','ADAS11_bl','ADAS13_bl','MMSE_bl','RAVLT_learning_bl','RAVLT_forgetting_bl',
                             'RAVLT_perc_forgetting_bl','FAQ_bl','MOCA_bl','EcogPtLang_bl','EcogPtVisspat_bl',
                             'EcogPtPlan_bl','EcogPtOrgan_bl','EcogPtDivatt_bl','EcogPtTotal_bl','EcogSPLang_bl',
                             'EcogSPVisspat_bl','EcogSPPlan_bl','EcogSPOrgan_bl','EcogSPDivatt_bl','EcogSPTotal_bl'];


   
# Current Medical Evaluation
CurrentEvaluation_FEATURES=['CDRSB','ADAS11','ADAS13','MMSE','RAVLT_learning','RAVLT_forgetting','RAVLT_perc_forgetting',
                            'FAQ','MOCA','EcogPtLang','EcogPtVisspat','EcogPtPlan','EcogPtOrgan','EcogPtDivatt','EcogPtTotal',
                            'EcogSPLang','EcogSPVisspat','EcogSPPlan','EcogSPOrgan','EcogSPDivatt','EcogSPTotal'];


# Longitudinal Method
LongitudinalMethod=2;
MetricList=['MaxTime','Delta','Mean','Std'];

# Run Longitudinal Data Anaysis
LongitudinalDataAnalysis_WithOutLabels.runLongitudinal(InputToLongitudinal,OutputFromLongitudinal,Patient_FEATURES,Demo_FEATURES,\
                                         BaselineOneTime_FEATURES,Time_FEATURES,BaselineEvaluation_FEATURES,\
                                         CurrentEvaluation_FEATURES,LongitudinalMethod,MetricList)





Unnamed: 0,RID,AGE,PTEDUCAT,APOE4,FDG,PIB,AV45,CDRSB,ADAS11,ADAS13,...,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,PTMARRY_Divorced,PTMARRY_Married,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed
0,2,74.3,16,0.0,1.36926,,,0.0,10.67,18.67,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,3,81.3,18,1.0,1.09079,,,4.5,22.0,31.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,3,81.3,18,1.0,1.0636,,,6.0,19.0,30.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


Input Matrix Size:
-----------------------
(12618, 122)
 
Identified columns of interest in input file: 
------------------------------------------------------------------------------------------------------------------------
Patient RID column: [0]
Demo columns: [  1   2 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
 121]
BaselineOneTime columns: [ 3 72]
BaselineEvaluation columns: [38 39 40 41 43 44 45 46 54 56 57 58 59 60 61 63 64 65 66 67 68]
Time columns: [ 73  74  75  76  99 100 101 102 103 104]
CurrentEvaluation columns: [ 7  8  9 10 12 13 14 15 16 18 19 20 21 22 23 25 26 27 28 29 30]
------
Method 2 for Longitudinal Data Analysis
------
 
New Input Matrix Size:
-----------------------
(1723, 106)
 
New Output Matrix Size:
-----------------------
(0,)


## Split Data Set

In [9]:
#Deal with X 
df = pd.read_csv('LongitudinalDataAnalysis.csv')
l = len(df)
data_train = df.iloc[:int(2*l/3.0),:]
data_test = df.iloc[int(2*l/3.0)+1:,:]

#Read in Y from the saved file earlier
patient_labels = {}
with open('FinalLabels.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        patient_labels[int(row[0])] = int(row[1])

Y_train = [patient_labels[i] for i in list(data_train.iloc[:,0])]
Y_test = [patient_labels[i] for i in list(data_test.iloc[:,0])]

data_train.to_csv("LongitudinalDataAnalysis_train.csv", index = False)
data_test.to_csv("LongitudinalDataAnalysis_test.csv", index = False)

## Mean-mode Imputation

In [10]:
data_train = pd.read_csv("LongitudinalDataAnalysis_train.csv")
data_test = pd.read_csv("LongitudinalDataAnalysis_test.csv")

for column in data_train:
    #convert to numeric
    data_train[[column]] = data_train[[column]].apply(pd.to_numeric)
    data_test[[column]] = data_test[[column]].apply(pd.to_numeric)
    
    #if the column empty, delete it 
    if pd.isnull(data_train[column]).all():
        del data_train[column]    
        del data_test[column] 

    #if this is a categorical column
    elif np.array_equal(sorted(data_train[column].unique()),[0,1]) or np.array_equal(sorted(data_train[column].unique()),[0]) or np.array_equal(sorted(data_train[column].unique()),[1]):
        data_train[column] = data_train[column].replace(np.nan, data_train[column].value_counts()[0])
        data_test[column] = data_test[column].replace(np.nan, data_train[column].value_counts()[0])

    else: #if numerical column
        data_train[column] = data_train[column].replace(np.nan, data_train[column].mean()) 
        data_test[column] = data_test[column].replace(np.nan, data_train[column].mean()) 

data_train.to_csv("Features_train.csv", index = False)
data_test.to_csv("Features_test.csv", index = False)

## Supervised Learning

In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

###Read in data and delete RID column before supervised learning
data_train = pd.read_csv('Features_train.csv', delimiter=",")
data_test = pd.read_csv('Features_test.csv', delimiter=",")

del data_train['# RID']
del data_test['# RID']
##########


## MLP Model
model = MLPClassifier()
model.fit(data_train, Y_train)

y_pred = model.predict(data_test)

prec, rec, f1, sup = precision_recall_fscore_support(Y_test, y_pred, average= 'binary')
acc = accuracy_score(Y_test, y_pred)

print "Accuracy:",acc, "Precision:",prec, "Recall:",rec, "F1",f1

## Random Forest Model
model = RandomForestClassifier()
model.fit(data_train, Y_train)
featimp = model.feature_importances_

importances = sorted(zip(featimp,list(data_train)),reverse = True)

y_pred = model.predict(data_test)

prec, rec, f1, sup = precision_recall_fscore_support(Y_test, y_pred, average= 'binary')
acc = accuracy_score(Y_test, y_pred)

print "Accuracy:",acc, "Precision:",prec, "Recall:",rec, "F1",f1

Accuracy: 0.939024390244 Precision: 0.914438502674 Recall: 0.9 F1 0.907161803714
Accuracy: 0.925087108014 Precision: 0.940119760479 Recall: 0.826315789474 F1 0.879551820728


In [12]:
for i in importances:
    print i

(0.13886068807876101, 'FAQ_Mean')
(0.098756345708504084, 'RAVLT_perc_forgetting_Mean')
(0.094684954472393654, 'MMSE_Mean')
(0.090463608097144349, 'ADAS11_Mean')
(0.086292068433668781, 'ADAS13_Mean')
(0.065302518267442058, 'EcogSPTotal_Mean')
(0.04738300247302956, 'CDRSB_Mean')
(0.043484439230889335, 'CDRSB_Delta')
(0.041917960667983992, 'CDRSB_Std')
(0.034498462215677012, 'EcogSPVisspat_Mean')
(0.022891836611709439, 'FAQ_Std')
(0.015445954125760396, 'RAVLT_learning_Mean')
(0.012573139481013962, 'ADAS11_Std')
(0.011647826970743109, 'ADAS11_Delta')
(0.011271858508419857, 'EcogSPOrgan_Mean')
(0.010913428401122948, 'ADAS13_Std')
(0.010537301556634061, 'ADAS13_Delta')
(0.0087227894491112533, 'AGE')
(0.0073421192395510928, 'RAVLT_forgetting_Delta')
(0.0072199176404592752, 'MMSE_Std')
(0.0065026539084536827, 'RAVLT_perc_forgetting_Std')
(0.0063494007083218559, 'FAQ_Delta')
(0.0062596082176310745, 'MMSE_Delta')
(0.0062289284537240374, 'EcogSPPlan_Mean')
(0.0061778583186433127, 'EcogSPDivatt_Me