### Script to merge file AIBL_data.csv with Merged_data.csv (which is the existing ADNI file)


In [1]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """Find a notebook, given its fully qualified name and an optional path.

    Only the last dotted component of ``fullname`` is used as the file name,
    so "foo.bar" is looked up as "bar.ipynb" inside each directory of
    ``path`` (the current directory when ``path`` is empty or None).
    If that file does not exist, "Foo_Bar" is retried as "Foo Bar.ipynb".

    The underscore-to-space fallback is applied to the notebook file name
    only, never to the directory, so search directories whose own names
    contain underscores are still handled correctly.

    Returns the path of the first match, or None when nothing is found.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # Candidate file names in priority order: exact name first, then the
    # variant that lets "import Notebook_Name" find "Notebook Name.ipynb"
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""
    def __init__(self, path=None):
        # Shell whose input transformer is used to turn cell source
        # (including magics) into plain Python
        self.shell = InteractiveShell.instance()
        # Directories handed to find_notebook when locating the notebook
        self.path = path

    def load_module(self, fullname):
        """Import a notebook as a module.

        Locates the notebook for ``fullname``, creates a module object,
        registers it in ``sys.modules`` and executes every Python code cell
        of the first worksheet in the module's namespace.

        Returns the populated module.
        Raises ImportError when no notebook can be found. Any exception
        raised by a cell propagates, and the partially initialised module is
        removed from ``sys.modules`` first so a later import starts clean.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Without this guard the failure would surface as an obscure
            # error from io.open(None)
            raise ImportError("No notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = current.read(f, 'json')

        # create the module and add it to sys.modules
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.worksheets[0].cells:
                if cell.cell_type == 'code' and cell.language == 'python':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.input)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # A loader must not leave a half-executed module registered
            sys.modules.pop(fullname, None)
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Meta-path finder that hands notebook imports to a NotebookLoader"""
    def __init__(self):
        # Cache of loaders, one per distinct search path
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` if a matching notebook exists.

        Returns None (so the import machinery moves on) when find_notebook
        locates nothing. Loaders are created lazily and reused for every
        later lookup with the same search path.
        """
        if not find_notebook(fullname, path):
            return

        # A list cannot be used as a dict key, so a non-empty path is
        # collapsed into a single string; an empty/None path is used as-is
        cache_key = os.path.sep.join(path) if path else path

        try:
            return self.loaders[cache_key]
        except KeyError:
            loader = self.loaders[cache_key] = NotebookLoader(path)
            return loader

# Register the finder on the import system's meta path so that a plain
# "import <name>" can be resolved to a .ipynb file by NotebookFinder.
# Note: each execution of this cell appends one more finder instance.
sys.meta_path.append(NotebookFinder())


- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


In [2]:
# ------------------------------------------------------------------------------------------------------------------------------
#                                              Import libraries
# ------------------------------------------------------------------------------------------------------------------------------
#%reset
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import collections
import matplotlib as mpl
from collections import OrderedDict
import time
from datetime import datetime
from sys import stdout
import collections
import csv
import pandas as pd 
from scipy.stats import mode
from scipy import stats
from dateutil.parser import parse





In [3]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [4]:
import scipy
import numpy
import pandas
import csv
import MergeDiagnosis_AdjustedLabels
import Categorical_Updated
import LongitudinalDataAnalysis
import Imputation
import FeatureReduction
import SupervisedLearning
import ModelPerformance
import TrainTestSplit

importing Jupyter notebook from MergeDiagnosis_AdjustedLabels.ipynb
importing Jupyter notebook from Categorical_Updated.ipynb
importing Jupyter notebook from LongitudinalDataAnalysis.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from Imputation.ipynb
importing Jupyter notebook from FeatureReduction.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from SupervisedLearning.ipynb
importing Jupyter notebook from ModelPerformance.ipynb
importing Jupyter notebook from TrainTestSplit.ipynb


In [5]:
# -------------------------------
# Merge Data
# -------------------------------
from shutil import copyfile

# Run the preprocessing notebook with the settings used for this experiment
merged_data = MergeDiagnosis_AdjustedLabels.data_preprocess(
    study="all",
    imaging_to_drop="all",
    reversions="label0")

# Keep an untouched copy of Merged_data.csv under a separate name; a later
# cell overwrites Merged_data.csv with the stacked ADNI + AIBL data
copyfile("Merged_data.csv", "Merged_data_original.csv")


In [6]:
# -----------------------------------------
# Load ADNI and AIBL data sets
# -----------------------------------------
ADNI_Data = pd.read_csv("Merged_data_original.csv")
AIBL_Data = pd.read_csv("AIBL_data.csv")

# Every ADNI column must be present in AIBL; fail with an explicit message
# instead of a bare KeyError from inside the conversion loop below
missing_cols = [col for col in ADNI_Data.columns if col not in AIBL_Data.columns]
if missing_cols:
    raise KeyError("AIBL_data.csv is missing ADNI columns: %s" % missing_cols)

# Change the types of AIBL to match ADNI
for col in ADNI_Data.columns:
    AIBL_Data[col] = AIBL_Data[col].astype(ADNI_Data[col].dtype)

# NOTE: shifting AIBL's RIDs by 100000 so they cannot overlap with ADNI's
# RIDs is currently disabled:
#AIBL_Data['RID']=AIBL_Data['RID']+100000

# The timestamp columns given in the AIBL file do not have the right format;
# we overwrite them for now with the first ADNI value (it does not matter too
# much as they are not really used in our analysis)
AIBL_Data['update_stamp'] = ADNI_Data['update_stamp'].loc[0]

# Stack ADNI and AIBL data sets into one big matrix
# (pd.concat is the supported replacement for DataFrame.append)
ADNI_AIBL_data = pd.concat([ADNI_Data, AIBL_Data])

# Print matrix size after stacking ADNI and AIBL data sets
print(" ")
print("Data size after stacking ADNI and AIBL data sets:")
print("------------------------------------------------- ")
print(ADNI_AIBL_data.shape)

# Save to .csv file. `columns=` selects the columns by name in AIBL's order;
# the previous `header=` only relabelled columns positionally and would
# mislabel the data whenever the stacked column order differed from AIBL's.
ADNI_AIBL_data.to_csv('Merged_data.csv', columns=list(AIBL_Data), index=False)

 
Data size after stacking ADNI and AIBL data sets:
------------------------------------------------- 
(14223, 70)


In [7]:
# -------------------------------
# Categorical to Numerical
# -------------------------------
# Both lists are handed unchanged to the conversion routine below
date_cols = [
    'update_stamp',
    'EXAMDATE',
    'EXAMDATE_bl',
]
cols_to_ignore = ['PTID']

Categorical_Updated.categorical_conversion(date_cols, cols_to_ignore)

Columns that are one-hot encoded
-------------------------------------
['VISCODE', 'COLPROT', 'ORIGPROT', 'DX_bl', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']


In [8]:
# -------------------------------
# Longitudinal Data Analysis
# -------------------------------

def _match_features(all_labels, wanted, split_at_last=False):
    """Pick, in file order, the labels that belong to the ``wanted`` types.

    A label matches when it is listed in ``wanted`` itself, or when the part
    before an underscore is listed there (e.g. 'PTRACCAT_White' matches
    'PTRACCAT'). The prefix ends at the first underscore, or at the last one
    when ``split_at_last`` is true.
    """
    matched = []
    for label in all_labels:
        cut = label.rfind("_") if split_at_last else label.find("_")
        if label in wanted or (cut != -1 and label[:cut] in wanted):
            matched.append(label)
    return matched


# Input file name for Longitudinal Data Analysis
InputToLongitudinal = 'CategoricalToNumerical.csv'

# Only the header row is needed here: it lists the available column labels
with open(InputToLongitudinal) as csvfile:
    labels = next(csv.reader(csvfile, delimiter=','))

# Output file name from this script
OutputFromLongitudinal = 'LongitudinalDataAnalysis.csv'

# Patient RID Features
Patient_FEATURES = ['RID']

# Demographic Features
Demo_FEATURES_type = ['AGE', 'PTEDUCAT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
Demo_FEATURES = _match_features(labels, Demo_FEATURES_type)

# Baseline OneTime Features (prefix taken up to the LAST underscore)
BaselineOneTime_FEATURES_type = ['APOE4', 'Years_bl', 'ORIGPROT']
BaselineOneTime_FEATURES = _match_features(labels, BaselineOneTime_FEATURES_type,
                                           split_at_last=True)

# Time Headers
Time_FEATURES_type = ['SITE', 'Month', 'update_stamp_minus_EXAMDATE_bl',
                      'update_stamp_minus_EXAMDATE', 'EXAMDATE_minus_EXAMDATE_bl',
                      'COLPROT', 'M', 'Month_bl']
Time_FEATURES = _match_features(labels, Time_FEATURES_type)
# Month_bl must be last feature in this list
Time_FEATURES.remove('Month_bl')
Time_FEATURES.append('Month_bl')

# Baseline Evaluation Features
BaselineEvaluation_FEATURES = [
    'CDRSB_bl', 'ADAS11_bl', 'ADAS13_bl', 'MMSE_bl',
    'RAVLT_learning_bl', 'RAVLT_forgetting_bl', 'RAVLT_perc_forgetting_bl', 'RAVLT_immediate_bl',
    'FAQ_bl', 'MOCA_bl',
    'EcogPtLang_bl', 'EcogPtVisspat_bl', 'EcogPtPlan_bl', 'EcogPtOrgan_bl',
    'EcogPtDivatt_bl', 'EcogPtMem_bl', 'EcogPtTotal_bl',
    'EcogSPLang_bl', 'EcogSPVisspat_bl', 'EcogSPPlan_bl', 'EcogSPOrgan_bl',
    'EcogSPDivatt_bl', 'EcogSPMem_bl', 'EcogSPTotal_bl',
]

# Current Medical Evaluation
CurrentEvaluation_FEATURES = [
    'CDRSB', 'ADAS11', 'ADAS13', 'MMSE',
    'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'RAVLT_immediate',
    'FAQ', 'MOCA',
    'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan',
    'EcogPtDivatt', 'EcogPtMem', 'EcogPtTotal',
    'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan',
    'EcogSPDivatt', 'EcogSPMem', 'EcogSPTotal',
]

# Current Diagnosis
CurrentDiagnosis_FEATURES = ['AD']

# Longitudinal Method
LongitudinalMethod = 2
MetricList = ['MaxTime', 'Delta', 'Mean', 'Std']

# Run Longitudinal Data Analysis
LongitudinalDataAnalysis.runLongitudinal(
    InputToLongitudinal, OutputFromLongitudinal,
    Patient_FEATURES, Demo_FEATURES,
    BaselineOneTime_FEATURES, Time_FEATURES,
    BaselineEvaluation_FEATURES, CurrentEvaluation_FEATURES,
    CurrentDiagnosis_FEATURES, LongitudinalMethod, MetricList)





Unnamed: 0.1,Unnamed: 0,RID,SITE,AGE,PTEDUCAT,APOE4,CDRSB,ADAS11,ADAS13,MMSE,...,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,PTMARRY_Married,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed,AD
0,0,2,11,74.3,16,0.0,0.0,10.67,18.67,28.0,...,0,0,0,0,1,1,0,0,0,0
1,1,3,11,81.3,18,1.0,4.5,22.0,31.0,20.0,...,0,0,0,0,1,1,0,0,0,1
2,2,3,11,81.3,18,1.0,6.0,19.0,30.0,24.0,...,0,0,0,0,1,1,0,0,0,1


Input Matrix Size:
-----------------------
(14223L, 106L)
 
Identified columns of interest in input file: 
------------------------------------------------------------------------------------------------------------------------
Patient RID column: [1]
Demo columns: [  3   4  92  93  94  95  96  97  98  99 100 101 102 103 104]
BaselineOneTime columns: [ 5 54 85 86 87]
BaselineEvaluation columns: [30 31 32 33 35 36 37 34 38 39 41 42 43 44 45 40 46 48 49 50 51 52 47 53]
Time columns: [ 2 56 57 58 59 60 82 83 84 55]
CurrentEvaluation columns: [ 6  7  8  9 11 12 13 10 14 15 17 18 19 20 21 16 22 24 25 26 27 28 23 29]
CurrentDiagnosis columns: [105]
------
Method 2 for Longitudinal Data Analysis
------
 
New Input Matrix Size:
-----------------------
(2598L, 116L)
 
New Output Matrix Size:
-----------------------
(2598L,)


In [9]:
# Imports are done once, not on every iteration of the loop below
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# ------------------
# Experiment settings
# ------------------
N_RUNS = 100          # number of independent train/test splits to evaluate
TEST_FRACTION = 0.33  # value passed to TrainTestSplit.traintest_split

# Keep only a few columns that are common to both ADNI and AIBL
keep_col = ['# AGE','PTGENDER_Male','APOE4','ORIGPROT_ADNI2','ORIGPROT_ADNIGO','ORIGPROT_AIBL',
            'MMSE_MaxTime','MMSE_Delta','MMSE_Mean','MMSE_Std','Diagnostics']

# Input and Output files from Feature Reduction for train and test sets
InputToFeatureReduction_train     ='ImputedMatrix_train.csv'
OutputFromFeatureReduction_train  ='Features_train.csv'
InputToFeatureReduction_test      ='ImputedMatrix_test.csv'
OutputFromFeatureReduction_test   ='Features_test.csv'

# Normalization method ('MinMax' is used; 'MeanStd' is the alternative)
NormalizationMethod='MinMax'

# Feature Reduction Method and Settings
# ('none' is used; 'SVD' is the alternative that ExplainedVariance applies to)
FeatureReductionMethod='none'
ExplainedVariance=0.99  # For method 'SVD'
APpreference=-50        # Hyperparameter for method 'AffinityPropagation'

# Supervised Learning - Model 3 settings
best_params = {'n_estimators': 1000}

# One entry of performance metrics per run
res = []

for i in range(N_RUNS):

    # ------------------
    # Train Test Split
    # ------------------
    TrainTestSplit.traintest_split(TEST_FRACTION)

    # --------------------------------------------------------------
    # Restrict both split files, in place, to the common columns
    # --------------------------------------------------------------
    for split_file in ("LongitudinalDataAnalysis_test.csv",
                       "LongitudinalDataAnalysis_train.csv"):
        f = pd.read_csv(split_file)
        f[keep_col].to_csv(split_file, index=False)

    # ------------------
    # Imputation
    # ------------------
    # Other options tried: 'knn', 'nuclearnorm', 'softimpute'
    Imputation.imputation('meanmode')

    # ----------------------------------
    # Feature Reduction
    # ----------------------------------
    FeatureReduction.RunFeatureReduction(InputToFeatureReduction_train,OutputFromFeatureReduction_train,
                                         InputToFeatureReduction_test,OutputFromFeatureReduction_test,
                                         NormalizationMethod,FeatureReductionMethod,ExplainedVariance,APpreference)

    # ------------------
    # Supervised Learning- Model 3
    # ------------------
    data_train = np.loadtxt('Features_train.csv', delimiter=",", skiprows = 1)
    data_test = np.loadtxt('Features_test.csv', delimiter=",", skiprows = 1)
    # The last column holds the target, all preceding columns are features
    training_set_X, training_set_Y = data_train[:,:-1], data_train[:,-1]
    test_set_X, test_set_Y = data_test[:,:-1], data_test[:,-1]
    model = RandomForestClassifier()
    SupervisedLearning.test_model(training_set_X, test_set_X, training_set_Y, test_set_Y, model, best_params)

    # ------------------
    # Model Evaluation
    # ------------------
    # Evaluate once per run and record the result exactly once; previously
    # each run's metrics were appended to `res` twice.
    # NOTE(review): assumes model_performance('all') gives the same result on
    # repeated calls for the same run - confirm in ModelPerformance.
    metrics = ModelPerformance.model_performance('all')
    print(metrics)
    res.append(metrics)


[0.93379790940766549, 0.79047619047619044, 0.70338983050847459, 0.74439461883408065]
[0.93379790940766549, 0.80000000000000004, 0.69999999999999996, 0.74666666666666659]
[0.93147502903600465, 0.80000000000000004, 0.68852459016393441, 0.74008810572687223]
[0.93379790940766549, 0.80952380952380953, 0.69672131147540983, 0.74889867841409685]
[0.93147502903600465, 0.80952380952380953, 0.68548387096774188, 0.74235807860262004]
[0.93495934959349591, 0.80952380952380953, 0.7024793388429752, 0.75221238938053114]
[0.93031358885017423, 0.80000000000000004, 0.68292682926829273, 0.73684210526315796]
[0.93031358885017423, 0.79047619047619044, 0.68595041322314054, 0.73451327433628311]
[0.93263646922183507, 0.80000000000000004, 0.69421487603305787, 0.74336283185840701]
[0.93031358885017423, 0.80952380952380953, 0.68000000000000005, 0.73913043478260887]
[0.92915214866434381, 0.79047619047619044, 0.68032786885245899, 0.7312775330396476]
[0.93263646922183507, 0.79047619047619044, 0.69747899159663862, 0.7

In [10]:
# Stack the per-run metric lists into a 2-D array (one row per recorded
# result) and average each metric column over all rows
res2 = np.asarray(res)
res2.mean(axis=0)

array([ 0.93184669,  0.80333333,  0.68933623,  0.74194203])