In [1]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file that backs a module name.

    Only the last dotted component of ``fullname`` is used as the file stem
    (``"foo.bar"`` is looked up as ``bar.ipynb``). Each directory in ``path``
    is searched in order; when ``path`` is empty or None the current
    directory is searched.

    For every directory the exact name is tried first, then the name with
    underscores turned into spaces, so ``import Foo_Bar`` can find
    ``"Foo Bar.ipynb"``.

    Returns the path of the first matching file, or None when nothing matches.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # The "_" -> " " fallback is applied to the file name only. Rewriting the
    # joined path would also alter underscores in the directory name and make
    # the lookup probe a different (usually non-existent) directory.
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks

    Executes the Python code cells of a notebook inside a fresh module
    object so the notebook can be used through a normal ``import``.
    """
    def __init__(self, path=None):
        # Shared IPython shell; used to translate cell input (magics etc.)
        # into plain Python before it is executed.
        self.shell = InteractiveShell.instance()
        # Directories handed to find_notebook() when resolving a module name.
        self.path = path

    def load_module(self, fullname):
        """import a notebook as a module

        Resolves ``fullname`` with ``find_notebook``, reads the notebook with
        ``nbformat.current`` and runs every code cell of the first worksheet
        whose language is ``python`` in the new module's namespace.

        The module is registered in ``sys.modules`` before execution. If a
        cell raises, the registration is removed again and the exception
        propagates, so a later ``import`` retries instead of silently
        returning a half-initialised module.

        Returns the populated module object.
        """
        path = find_notebook(fullname, self.path)

        print("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = current.read(f, 'json')

        # create the module and add it to sys.modules
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.worksheets[0].cells:
                if cell.cell_type == 'code' and cell.language == 'python':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.input)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a partially executed module importable.
            if sys.modules.get(fullname) is mod:
                del sys.modules[fullname]
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Meta-path finder that hands notebook-backed modules to NotebookLoader.

    One loader is created per distinct search path and then reused.
    """
    def __init__(self):
        # cache: search-path key -> NotebookLoader
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname`` if a matching notebook exists, else None."""
        if not find_notebook(fullname, path):
            return None

        # lists aren't hashable, so a non-empty path is flattened into a string key
        key = os.path.sep.join(path) if path else path

        try:
            return self.loaders[key]
        except KeyError:
            loader = self.loaders[key] = NotebookLoader(path)
            return loader

# Register a NotebookFinder at the end of sys.meta_path so `import <name>` can
# resolve to <name>.ipynb. Every execution of this line appends one more
# finder instance; nothing here checks whether one is already registered.
sys.meta_path.append(NotebookFinder())


- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


In [2]:
import scipy
import numpy
import pandas
import csv
import MergeDiagnosis_AdjustedLabels
import LongitudinalDataAnalysis
import Categorical_Updated
import Imputation
import FeatureReduction
import SupervisedLearning
import ModelPerformance
import TrainTestSplit
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

importing Jupyter notebook from MergeDiagnosis_AdjustedLabels.ipynb
importing Jupyter notebook from LongitudinalDataAnalysis.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from Categorical_Updated.ipynb
importing Jupyter notebook from Imputation.ipynb
importing Jupyter notebook from FeatureReduction.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from SupervisedLearning.ipynb
importing Jupyter notebook from ModelPerformance.ipynb
importing Jupyter notebook from TrainTestSplit.ipynb


In [3]:
# -------------------------------
# Merge Data
# -------------------------------
# Runs the preprocessing step of the MergeDiagnosis_AdjustedLabels notebook
# and keeps whatever it returns as `merged_data`.
merged_data = MergeDiagnosis_AdjustedLabels.data_preprocess(
    study="all",
    imaging_to_drop="all",
    reversions="label0",
)

In [4]:
# -------------------------------
# Categorical to Numerical
# -------------------------------
# Columns passed to the conversion step as "ignore" and "date" columns.
cols_to_ignore = ['PTID']
date_cols = ['update_stamp', 'EXAMDATE', 'EXAMDATE_bl']

Categorical_Updated.categorical_conversion(date_cols, cols_to_ignore)

Columsn that are one-hot encoded
-------------------------------------
['VISCODE', 'COLPROT', 'ORIGPROT', 'DX_bl', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']


In [5]:
# -------------------------------
# Longitudinal Data Analysis
# -------------------------------

# Input file read by, and output file written by, the longitudinal step
InputToLongitudinal = 'CategoricalToNumerical.csv'
OutputFromLongitudinal = 'LongitudinalDataAnalysis.csv'

# Only the header row of the input file is needed to pick columns
with open(InputToLongitudinal) as csvfile:
    labels = next(csv.reader(csvfile, delimiter=','))


def _matching_columns(header, wanted, split_on_last_underscore=False):
    """Select the names in ``header`` that belong to one of the ``wanted`` types.

    A name is kept when it is itself in ``wanted``, or when the part before
    its first underscore (last underscore if ``split_on_last_underscore``)
    is in ``wanted``. Header order is preserved.
    """
    picked = []
    for column in header:
        if split_on_last_underscore:
            cut = column.rfind("_")
        else:
            cut = column.find("_")
        if column in wanted or (cut != -1 and column[:cut] in wanted):
            picked.append(column)
    return picked


# Patient RID Features
Patient_FEATURES = ['RID']

# Demographic Features (prefix = text before the first underscore)
Demo_FEATURES_type = ['AGE', 'PTEDUCAT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
Demo_FEATURES = _matching_columns(labels, Demo_FEATURES_type)

# Baseline OneTime Features (prefix = text before the LAST underscore)
BaselineOneTime_FEATURES_type = ['APOE4', 'Years_bl', 'ORIGPROT']
BaselineOneTime_FEATURES = _matching_columns(
    labels, BaselineOneTime_FEATURES_type, split_on_last_underscore=True)

# Time Headers (prefix = text before the first underscore)
Time_FEATURES_type = [
    'SITE', 'Month',
    'update_stamp_minus_EXAMDATE_bl', 'update_stamp_minus_EXAMDATE', 'EXAMDATE_minus_EXAMDATE_bl',
    'COLPROT', 'M', 'Month_bl',
]
Time_FEATURES = _matching_columns(labels, Time_FEATURES_type)
# Month_bl must be last feature in this list
Time_FEATURES.remove('Month_bl')
Time_FEATURES.append('Month_bl')

# Current Medical Evaluation
CurrentEvaluation_FEATURES = [
    'CDRSB', 'ADAS11', 'ADAS13', 'MMSE',
    'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'RAVLT_immediate',
    'FAQ', 'MOCA',
    'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan',
    'EcogPtDivatt', 'EcogPtMem', 'EcogPtTotal',
    'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan',
    'EcogSPDivatt', 'EcogSPMem', 'EcogSPTotal',
]

# Baseline Evaluation Features: the same measures, in the same order,
# each carrying the "_bl" suffix
BaselineEvaluation_FEATURES = [measure + '_bl' for measure in CurrentEvaluation_FEATURES]

# Current Diagnosis
CurrentDiagnosis_FEATURES = ['AD']

# Longitudinal Method
LongitudinalMethod = 2
MetricList = ['MaxTime', 'Delta', 'Mean', 'Std']

# Run Longitudinal Data Analysis
LongitudinalDataAnalysis.runLongitudinal(
    InputToLongitudinal, OutputFromLongitudinal,
    Patient_FEATURES, Demo_FEATURES, BaselineOneTime_FEATURES, Time_FEATURES,
    BaselineEvaluation_FEATURES, CurrentEvaluation_FEATURES, CurrentDiagnosis_FEATURES,
    LongitudinalMethod, MetricList,
)





Unnamed: 0.1,Unnamed: 0,RID,SITE,AGE,PTEDUCAT,APOE4,CDRSB,ADAS11,ADAS13,MMSE,...,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,PTMARRY_Married,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed,AD
0,0,2,11,74.3,16,0.0,0.0,10.67,18.67,28.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
1,1,3,11,81.3,18,1.0,4.5,22.0,31.0,20.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
2,2,3,11,81.3,18,1.0,6.0,19.0,30.0,24.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1


Input Matrix Size:
-----------------------
(12736, 104)
 
Identified columns of interest in input file: 
------------------------------------------------------------------------------------------------------------------------
Patient RID column: [1]
Demo columns: [  3   4  90  91  92  93  94  95  96  97  98  99 100 101 102]
BaselineOneTime columns: [ 5 54 84 85]
BaselineEvaluation columns: [30 31 32 33 35 36 37 34 38 39 41 42 43 44 45 40 46 48 49 50 51 52 47 53]
Time columns: [ 2 56 57 58 59 60 82 83 55]
CurrentEvaluation columns: [ 6  7  8  9 11 12 13 10 14 15 17 18 19 20 21 16 22 24 25 26 27 28 23 29]
CurrentDiagnosis columns: [103]
------
Method 2 for Longitudinal Data Analysis
------
 
New Input Matrix Size:
-----------------------
(1737, 115)
 
New Output Matrix Size:
-----------------------
(1737,)


In [6]:
# Split first, then impute; both steps are driven by the imported notebooks.
TrainTestSplit.traintest_split(0.33)
Imputation.imputation('meanmode')

# ----------------------------------
# Feature Reduction
# ----------------------------------

# Input and output files of the feature-reduction step, train and test sets
InputToFeatureReduction_train = 'ImputedMatrix_train.csv'
OutputFromFeatureReduction_train = 'Features_train.csv'
InputToFeatureReduction_test = 'ImputedMatrix_test.csv'
OutputFromFeatureReduction_test = 'Features_test.csv'

# Normalization method (alternative previously tried here: 'MeanStd')
NormalizationMethod = 'MinMax'

# Feature reduction method and its settings
# (alternative previously tried here: 'SVD')
FeatureReductionMethod = 'none'
ExplainedVariance = 0.99  # For method 'SVD'
APpreference = -50  # Hyperparameter for method 'AffinityPropagation'

# Run Feature Reduction
FeatureReduction.RunFeatureReduction(
    InputToFeatureReduction_train, OutputFromFeatureReduction_train,
    InputToFeatureReduction_test, OutputFromFeatureReduction_test,
    NormalizationMethod, FeatureReductionMethod, ExplainedVariance, APpreference,
)

Unnamed: 0,# AGE,PTEDUCAT,PTGENDER_Male,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,...,EcogSPDivatt_Std,EcogSPMem_MaxTime,EcogSPMem_Delta,EcogSPMem_Mean,EcogSPMem_Std,EcogSPTotal_MaxTime,EcogSPTotal_Delta,EcogSPTotal_Mean,EcogSPTotal_Std,Diagnostics
0,77.8,11.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.296113,47.80054,0.159046,2.197026,0.263977,47.914268,0.208483,1.880656,0.203772,1.0
1,88.5,9.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.108253,24.0328,0.125,3.96875,0.054127,24.0328,0.69871,3.559482,0.397436,1.0
2,83.1,14.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.239357,119.574,0.159046,1.479167,0.133398,119.574,0.208483,1.418803,0.143276,0.0


Unnamed: 0,# AGE,PTEDUCAT,PTGENDER_Male,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,...,EcogSPDivatt_Std,EcogSPMem_MaxTime,EcogSPMem_Delta,EcogSPMem_Mean,EcogSPMem_Std,EcogSPTotal_MaxTime,EcogSPTotal_Delta,EcogSPTotal_Mean,EcogSPTotal_Std,Diagnostics
0,69.3,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.176777,23.7049,0.25,1.625,0.176777,23.7049,0.46154,1.442308,0.235265,0.0
1,69.6,16.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.644205,36.1311,1.375,3.5,0.480885,36.1311,1.97436,3.142672,0.720218,1.0
2,84.6,16.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.2623,0.125,2.125,0.75519,6.2623,0.0,1.535795,0.453005,1.0


In [23]:
import pandas as pd

# Reduced feature tables written by the feature-reduction step
d = pd.read_csv('Features_train.csv')
d_test = pd.read_csv('Features_test.csv')

# Strip quotes, spaces, '#' and square brackets from the training header
names = []
for column in d.columns:
    for junk in ("'", " ", "#", "[", "]"):
        column = column.replace(junk, "")
    names.append(column)

# The cleaned TRAIN header is applied to both tables.
# NOTE(review): this assumes the test file has the same columns in the same
# order as the train file -- confirm.
d.columns = names
d_test.columns = names

List of features we are using:  
    1.Demographics (AGE, PTEDUCAT, PTGENDER, PTETHCAT, PTRACCAT, PTMARRY)  
    2.APOE4  
    3.Years_bl  
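    4.DX_bl (not used at present: its entry in `feature_to_column` is commented out, so dictionary keys 4-12 correspond to items 5-13 of this list)  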
    5.CDRSB   
    6.ADAS11  
    7.ADAS13  
    8.MMSE    
    9.RAVLT    
    10.FAQ  
    11.MOCA  
    12.EcogPtTotal  
    13.EcogSPTotal 
    

In [27]:
from itertools import combinations

# Suffixes of the per-measure summary columns, in the order used below
_METRIC_SUFFIXES = ('MaxTime', 'Delta', 'Mean', 'Std')


def _metric_columns(*measures):
    """Return the '<measure>_<suffix>' column names for every given measure.

    Names are grouped by measure, and within each measure follow the order
    of ``_METRIC_SUFFIXES``.
    """
    return [measure + '_' + suffix
            for measure in measures
            for suffix in _METRIC_SUFFIXES]


# Feature-group number -> columns of the feature tables that make up the group
feature_to_column = {
    1: ['AGE', 'PTEDUCAT', 'PTGENDER_Male',
        'PTETHCAT_NotHisp/Latino', 'PTETHCAT_Unknown',
        'PTRACCAT_Asian', 'PTRACCAT_Black', 'PTRACCAT_Hawaiian/OtherPI',
        'PTRACCAT_Morethanone', 'PTRACCAT_Unknown', 'PTRACCAT_White',
        'PTMARRY_Married', 'PTMARRY_Nevermarried', 'PTMARRY_Unknown', 'PTMARRY_Widowed'],
    2: ['APOE4'],
    3: ['Years_bl'],
    # Disabled group: ['DX_bl_CN', 'DX_bl_EMCI', 'DX_bl_LMCI', 'DX_bl_SMC']
    4: _metric_columns('CDRSB'),
    5: _metric_columns('ADAS11'),
    6: _metric_columns('ADAS13'),
    7: _metric_columns('MMSE'),
    8: _metric_columns('RAVLT_learning', 'RAVLT_forgetting',
                       'RAVLT_perc_forgetting', 'RAVLT_immediate'),
    9: _metric_columns('FAQ'),
    10: _metric_columns('MOCA'),
    11: _metric_columns('EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan',
                        'EcogPtDivatt', 'EcogPtMem', 'EcogPtTotal'),
    12: _metric_columns('EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan',
                        'EcogSPDivatt', 'EcogSPMem', 'EcogSPTotal'),
}

# Largest number of feature groups combined into a single model
MAX_GROUPS_PER_MODEL = 5

n = range(1, len(feature_to_column) + 1)

# Every combination of 1..MAX_GROUPS_PER_MODEL group numbers, as lists,
# smallest combinations first. Built with a flat comprehension so it works
# on Python 3 (where map() is lazy) and avoids repeated list concatenation.
output = [list(combo)
          for size in range(1, MAX_GROUPS_PER_MODEL + 1)
          for combo in combinations(n, size)]

In [28]:
import csv

# One row per model: a "Model <k>" label followed by the feature-group
# numbers that model uses.
with open('Models_MetaClassifier_NEW.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(
        ["Model " + str(model_index)] + feature_groups
        for model_index, feature_groups in enumerate(output)
    )
    

In [29]:
# Train one random forest per feature-group combination in `output` and
# collect each model's test-set predictions as a column 'Model<k>'.
# NOTE(review): the forests are not seeded, so predictions differ between
# runs; pass random_state if reproducibility is wanted.
# NOTE(review): the output file is spelled "MetaClassifer", while the
# evaluation cell further down reads 'MetaClassifier_Dataset.csv' -- confirm
# which file is intended.
final_dataFrame = pd.DataFrame()
model_num = 0
for model_features in output:
    # Columns of every selected feature group, with the label column last
    c = [column
         for group in model_features
         for column in feature_to_column[group]]
    c = c + ['Diagnostics']

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has
    data_train = d[c].values
    data_test = d_test[c].values

    # Last column is the label, everything before it the features
    training_set_X, training_set_Y = data_train[:, :-1], data_train[:, -1]
    test_set_X, test_set_Y = data_test[:, :-1], data_test[:, -1]

    model = RandomForestClassifier(n_estimators = 1000)
    model.fit(training_set_X, training_set_Y)
    y_pred = model.predict(test_set_X)

    final_dataFrame['Model'+str(model_num)] = y_pred
    model_num += 1
    if model_num % 100 == 0:
        # progress marker every 100 models
        print(model_num)
    if model_num % 1000 == 0:
        # checkpoint of the predictions so far (label column not yet added)
        final_dataFrame.to_csv("MetaClassifer_Dataset_NEW.csv", index = False)


# Every iteration slices the same test rows, so the labels left over from the
# last iteration apply to all prediction columns.
final_dataFrame['Diagnostics'] = test_set_Y
final_dataFrame.to_csv("MetaClassifer_Dataset_NEW.csv", index = False)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500


In [10]:
## Look at Results using RandomForestClassifier

from sklearn.model_selection import train_test_split
import pandas as pd

# NOTE(review): this reads 'MetaClassifier_Dataset.csv', whereas the cell that
# builds the prediction table writes "MetaClassifer_Dataset_NEW.csv" --
# confirm which file is intended.
d_meta = pd.read_csv('MetaClassifier_Dataset.csv')
labels = list(d_meta)

# .values replaces DataFrame.as_matrix(), which newer pandas no longer has
data = d_meta.values

# Last column is the label, all preceding columns are the inputs
Y = data[:, -1]
X = data[:, 0:-1]

# NOTE(review): the split is not seeded, so the scores vary between runs.
training_set_X, test_set_X, training_set_Y, test_set_Y = train_test_split(
    X, Y, test_size=0.33)

# Features and label recombined into one array per split (label last).
# The earlier np.matrix -> DataFrame -> array round trip produced the same
# values and has been dropped.
data_train = np.concatenate((training_set_X, training_set_Y.reshape(-1, 1)), axis=1)
data_test = np.concatenate((test_set_X, test_set_Y.reshape(-1, 1)), axis=1)

model = RandomForestClassifier(n_estimators = 2000)

model.fit(training_set_X, training_set_Y)

y_pred = model.predict(test_set_X)

# Accuracy: share of test rows whose prediction equals the label
n_correct = sum(1 for predicted, actual in zip(y_pred, test_set_Y)
                if predicted == actual)

# Recall on label 1: share of rows labelled 1 that were also predicted 1
positive_predictions = [predicted for predicted, actual in zip(y_pred, test_set_Y)
                        if actual == 1]
n_true_positive = sum(1 for predicted in positive_predictions if predicted == 1)

print(n_correct / float(len(y_pred)))
# Recall is undefined without any positive label; report nan instead of
# failing with ZeroDivisionError.
if positive_predictions:
    print(n_true_positive / float(len(positive_predictions)))
else:
    print(float('nan'))

0.947368421053
0.95652173913
