# Code for importing IPython notebooks

In [1]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """Find the ``.ipynb`` file for a module name.

    Only the last dotted component of ``fullname`` is used as the file
    name, so "foo.bar" is looked up as "bar.ipynb" in every directory of
    ``path``.  If that file does not exist, the underscores of the *file
    name* are replaced by spaces, so that ``import Foo_Bar`` can find
    "Foo Bar.ipynb".  Directory names are never altered.

    Parameters
    ----------
    fullname : str
        Fully qualified (dotted) module name.
    path : sequence of str, optional
        Directories to search.  A falsy value means "search the current
        working directory" (a single empty directory component).

    Returns
    -------
    str or None
        Path of the first matching notebook, or None if there is none.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # Candidate file names, in priority order: exact name first, then the
    # variant with spaces instead of underscores.
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    for d in path:
        for filename in candidates:
            # Join after the replacement so underscores in ``d`` survive.
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""
    def __init__(self, path=None):
        # Shared IPython shell; used to translate cell input (magics etc.)
        # into plain Python before it is executed.
        self.shell = InteractiveShell.instance()
        # Directories handed to find_notebook when a module is loaded.
        self.path = path

    def load_module(self, fullname):
        """Import a notebook as a module.

        The notebook is located with ``find_notebook``, read as JSON, and
        every Python code cell of its first worksheet is executed in the
        namespace of a freshly created module, which is also registered in
        ``sys.modules`` under ``fullname``.

        Raises
        ------
        ImportError
            If no notebook file can be found for ``fullname``.

        Any exception raised by a notebook cell propagates to the caller;
        in that case the partially initialised module is removed from
        ``sys.modules`` again so a later import retries from scratch.
        """
        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail the way the import system expects instead of letting
            # io.open(None) raise an unrelated TypeError.
            raise ImportError("No notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = current.read(f, 'json')

        # create the module and add it to sys.modules
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.worksheets[0].cells:
                if cell.cell_type == 'code' and cell.language == 'python':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.input)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-executed module importable.
            sys.modules.pop(fullname, None)
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Meta-path finder that hands notebook imports to a NotebookLoader.

    One loader is created per distinct search path and reused afterwards.
    """
    def __init__(self):
        # search-path key -> NotebookLoader built for that search path
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname``, or None if no notebook matches."""
        if not find_notebook(fullname, path):
            return None

        # Lists are not hashable, so a non-empty path is flattened into a
        # single string; a falsy path is used as the key unchanged.
        cache_key = os.path.sep.join(path) if path else path

        loader = self.loaders.get(cache_key)
        if loader is None:
            loader = self.loaders[cache_key] = NotebookLoader(path)
        return loader

# Register the finder exactly once.  Re-running this cell would otherwise
# stack a new finder behind the stale ones, which are consulted first and
# would keep using the old class definitions.
sys.meta_path[:] = [finder for finder in sys.meta_path
                    if type(finder).__name__ != 'NotebookFinder']
sys.meta_path.append(NotebookFinder())



- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


# Import and Run Pipeline

In [2]:
import scipy
import numpy
import pandas
import csv
import MergeDiagnosis_AdjustedLabels
import Categorical_Updated
import LongitudinalDataAnalysis
import Imputation
import FeatureReduction
import SupervisedLearning
import ModelPerformance
import TrainTestSplit

importing Jupyter notebook from MergeDiagnosis_AdjustedLabels.ipynb
importing Jupyter notebook from Categorical_Updated.ipynb
importing Jupyter notebook from LongitudinalDataAnalysis.ipynb





<IPython.core.display.Javascript object>

importing Jupyter notebook from Imputation.ipynb
importing Jupyter notebook from FeatureReduction.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from SupervisedLearning.ipynb
importing Jupyter notebook from ModelPerformance.ipynb
importing Jupyter notebook from TrainTestSplit.ipynb


In [3]:
# -------------------------------
# Merge Data
# -------------------------------
# Result is kept in `merged_data`.
merged_data = MergeDiagnosis_AdjustedLabels.data_preprocess(
    study="all",
    imaging_to_drop='all',
    reversions='label0',
)

In [4]:
# -------------------------------
# Categorical to Numerical
# -------------------------------
# Columns passed as dates to the conversion step
date_cols = [
    'update_stamp',
    'EXAMDATE',
    'EXAMDATE_bl',
]
# Columns passed as "to ignore" to the conversion step
cols_to_ignore = ['PTID']

Categorical_Updated.categorical_conversion(date_cols, cols_to_ignore)


Columns that are one-hot encoded
-------------------------------------
['VISCODE', 'COLPROT', 'ORIGPROT', 'DX_bl', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']


In [5]:
# -------------------------------
# Longitudinal Data Analysis
# -------------------------------

def _select_features(labels, feature_types, split_on_last_underscore=False):
    """Return the labels belonging to one of ``feature_types``, in label order.

    A label is kept when it equals a feature type exactly, or when the text
    before its first underscore (its last underscore when
    ``split_on_last_underscore`` is true) equals a feature type.  The prefix
    rule is what picks up expanded columns such as 'PTGENDER_Male' for the
    type 'PTGENDER'.
    """
    selected = []
    for label in labels:
        cut = label.rfind("_") if split_on_last_underscore else label.find("_")
        if label in feature_types or (cut != -1 and label[:cut] in feature_types):
            selected.append(label)
    return selected


# Input file name for Longitudinal Data Analysis
InputToLongitudinal = 'CategoricalToNumerical.csv'

# Only the header row is needed here: it lists the column labels.
with open(InputToLongitudinal) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    labels = next(reader)

# Output file name from this script
OutputFromLongitudinal = 'LongitudinalDataAnalysis.csv'

# Patient RID Features
Patient_FEATURES = ['RID']

# Demographic Features
Demo_FEATURES_type = ['AGE', 'PTEDUCAT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
Demo_FEATURES = _select_features(labels, Demo_FEATURES_type)

# Baseline OneTime Features
# 'Years_bl' itself contains an underscore, so the prefix is taken up to the
# LAST underscore for this group.
BaselineOneTime_FEATURES_type = ['APOE4', 'Years_bl', 'ORIGPROT']
BaselineOneTime_FEATURES = _select_features(labels, BaselineOneTime_FEATURES_type,
                                            split_on_last_underscore=True)

# Time Headers
Time_FEATURES_type = ['SITE', 'Month', 'update_stamp_minus_EXAMDATE_bl',
                      'update_stamp_minus_EXAMDATE', 'EXAMDATE_minus_EXAMDATE_bl',
                      'COLPROT', 'M', 'Month_bl']
Time_FEATURES = _select_features(labels, Time_FEATURES_type)
# Month_bl must be last feature in this list (ValueError if the column is absent)
Time_FEATURES.remove('Month_bl')
Time_FEATURES.append('Month_bl')

# Current Medical Evaluation
CurrentEvaluation_FEATURES = ['CDRSB', 'ADAS11', 'ADAS13', 'MMSE', 'RAVLT_learning',
                              'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'RAVLT_immediate',
                              'FAQ', 'MOCA', 'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan',
                              'EcogPtOrgan', 'EcogPtDivatt', 'EcogPtMem', 'EcogPtTotal',
                              'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan',
                              'EcogSPDivatt', 'EcogSPMem', 'EcogSPTotal']

# Baseline Evaluation Features: the same measures, in the same order, with
# the '_bl' suffix.  Derived so the two lists cannot drift apart.
BaselineEvaluation_FEATURES = [name + '_bl' for name in CurrentEvaluation_FEATURES]

# Current Diagnosis
CurrentDiagnosis_FEATURES = ['AD']

# Longitudinal Method
LongitudinalMethod = 2
MetricList = ['MaxTime', 'Delta', 'Mean', 'Std']

# Run Longitudinal Data Analysis
LongitudinalDataAnalysis.runLongitudinal(
    InputToLongitudinal, OutputFromLongitudinal, Patient_FEATURES, Demo_FEATURES,
    BaselineOneTime_FEATURES, Time_FEATURES, BaselineEvaluation_FEATURES,
    CurrentEvaluation_FEATURES, CurrentDiagnosis_FEATURES, LongitudinalMethod, MetricList)




Unnamed: 0.1,Unnamed: 0,RID,SITE,AGE,PTEDUCAT,APOE4,CDRSB,ADAS11,ADAS13,MMSE,...,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,PTMARRY_Married,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed,AD
0,0,2,11,74.3,16,0.0,0.0,10.67,18.67,28.0,...,0,0,0,0,1,1,0,0,0,0
1,1,3,11,81.3,18,1.0,4.5,22.0,31.0,20.0,...,0,0,0,0,1,1,0,0,0,1
2,2,3,11,81.3,18,1.0,6.0,19.0,30.0,24.0,...,0,0,0,0,1,1,0,0,0,1


Input Matrix Size:
-----------------------
(12736L, 104L)
 
Identified columns of interest in input file: 
------------------------------------------------------------------------------------------------------------------------
Patient RID column: [1]
Demo columns: [  3   4  90  91  92  93  94  95  96  97  98  99 100 101 102]
BaselineOneTime columns: [ 5 54 84 85]
BaselineEvaluation columns: [30 31 32 33 35 36 37 34 38 39 41 42 43 44 45 40 46 48 49 50 51 52 47 53]
Time columns: [ 2 56 57 58 59 60 82 83 55]
CurrentEvaluation columns: [ 6  7  8  9 11 12 13 10 14 15 17 18 19 20 21 16 22 24 25 26 27 28 23 29]
CurrentDiagnosis columns: [103]
------
Method 2 for Longitudinal Data Analysis
------
2.0
3.0
4.0
5.0
6.0
7.0
8.0
10.0
14.0
15.0
16.0
19.0
21.0
22.0
23.0
29.0
30.0
31.0
33.0
35.0
38.0
40.0
41.0
42.0
43.0
44.0
45.0
47.0
48.0
50.0
51.0
53.0
54.0
55.0
56.0
57.0
58.0
59.0
60.0
61.0
66.0
67.0
68.0
69.0
70.0
72.0
74.0
76.0
77.0
78.0
80.0
81.0
83.0
84.0
86.0
87.0
88.0
89.0
90.0
91.0
93.0
94.

In [6]:
# ------------------
# Train Test Split
# ------------------
# NOTE(review): presumably the share of rows held out for testing -- confirm
# against TrainTestSplit.traintest_split.
split_ratio = 0.33
TrainTestSplit.traintest_split(split_ratio)

In [7]:
# ------------------
# Imputation
# ------------------
# Method names noted for this step: 'knn', 'meanmode', 'nuclearnorm', 'softimpute'
imputation_method = 'meanmode'
Imputation.imputation(imputation_method)

In [8]:
# ----------------------------------
# Feature Reduction
# ----------------------------------

# Input and output files of the feature-reduction step (train and test sets)
InputToFeatureReduction_train = 'ImputedMatrix_train.csv'
OutputFromFeatureReduction_train = 'Features_train.csv'
InputToFeatureReduction_test = 'ImputedMatrix_test.csv'
OutputFromFeatureReduction_test = 'Features_test.csv'

# Normalization method ('MeanStd' is the noted alternative)
NormalizationMethod = 'MinMax'

# Feature reduction method ('SVD' is the noted alternative) and its settings
FeatureReductionMethod = 'AffinityPropagation'
ExplainedVariance = 0.99  # For method 'SVD'
APpreference = -50        # Hyperparameter for method 'AffinityPropagation'

# Run Feature Reduction
FeatureReduction.RunFeatureReduction(
    InputToFeatureReduction_train, OutputFromFeatureReduction_train,
    InputToFeatureReduction_test, OutputFromFeatureReduction_test,
    NormalizationMethod, FeatureReductionMethod, ExplainedVariance, APpreference)



Unnamed: 0,# AGE,PTEDUCAT,PTGENDER_Male,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,...,EcogSPDivatt_Std,EcogSPMem_MaxTime,EcogSPMem_Delta,EcogSPMem_Mean,EcogSPMem_Std,EcogSPTotal_MaxTime,EcogSPTotal_Delta,EcogSPTotal_Mean,EcogSPTotal_Std,Diagnostics
0,86.8,17.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,60.2951,0.176537,3.375,0.0,60.2951,0.207631,3.61905,0.0,1.0
1,67.7,14.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.280263,47.583163,0.176537,2.172718,0.259859,47.645247,0.207631,1.85438,0.195883,1.0
2,81.8,16.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.280263,47.583163,0.176537,2.172718,0.259859,47.645247,0.207631,1.85438,0.195883,1.0


Unnamed: 0,# AGE,PTEDUCAT,PTGENDER_Male,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,...,EcogSPDivatt_Std,EcogSPMem_MaxTime,EcogSPMem_Delta,EcogSPMem_Mean,EcogSPMem_Std,EcogSPTotal_MaxTime,EcogSPTotal_Delta,EcogSPTotal_Mean,EcogSPTotal_Std,Diagnostics
0,69.1,17.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.381881,47.9016,0.0,1.111112,0.248453,47.9016,0.0,1.05298,0.084002,0.0
1,68.1,20.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.262445,60.3607,0.25,1.25,0.094491,60.3607,0.20513,1.164836,0.076135,0.0
2,73.3,14.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.280263,47.583163,0.176537,2.172718,0.259859,47.645247,0.207631,1.85438,0.195883,1.0


 Feature reduction implementing using Affinity Propagation 
----------------------------------------------------------------------------
- Original train matrix has size (1163L, 113L)
- Reduced train matrix has size (1163L, 23L)
- Original test matrix has size (574L, 113L)
- Reduced test matrix has size (574L, 23L)
- Number of features selected 23


In [9]:
# ------------------
# Supervised Learning
# ------------------
# Alternative (model, parameters) settings noted for this step; to try one,
# substitute it into the two assignments below:
#   "LogisticRegression": {'C': [0.001,0.01,0.1,0.5,1], 'penalty': ['l1','l2']}
#   "RandomForest":       {'n_estimators': [10,100,200]}
#                         (also noted: 'criterion' : ['gini', 'entropy'],
#                          'max_features' : ['sqrt', 'log2'],
#                          'max_depth' : np.arange(5, 25, 2))
#   'knn':                {'n_neighbors': [1,3,5,7,11]}
#   "SVM":                {'C': [0.001,0.01,0.1,0.5,1],
#                          'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#                          'probability': [True]}
#   "AdaBoost":           {'n_estimators' : range(10, 60, 10)}
#   "GradientBoosting":   {'n_estimators' : range(10, 60, 10)}
#   "LinearSVM":          {'C' : np.logspace(-5, 3, 5),
#                          'loss' : ['hinge', 'squared_hinge']}
#   "DecisionTree":       {'criterion' : ['gini', 'entropy'],
#                          'max_features' : ['sqrt', 'log2', None],
#                          'max_depth' : np.arange(10, 20)}

model = "MLP"
parameters = {'alpha': [0.001, 0.01, 0.1, 1]}

# NOTE(review): 5 and 'recall' look like a fold count and a scoring metric --
# confirm against SupervisedLearning.TuneAndReport.
result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')


In [10]:
# ------------------
# Model Evaluation
# ------------------
# print(...) with a single argument gives the same output under the
# Python 2 print statement and the Python 3 print function.
print(ModelPerformance.model_performance('confusion_matrix'))
print(ModelPerformance.model_performance('all'))


#print(ModelPerformance.model_performance('roc'))

[[308  31]
 [ 46 189]]
[0.86585365853658536, 0.80425531914893622, 0.85909090909090913, 0.83076923076923082]


# Test all Models

In [11]:
import numpy as np

# (model name, hyper-parameter grid) pairs, evaluated in the order listed.
MODEL_GRIDS = [
    ("LogisticRegression", {'C': [0.001, 0.01, 0.1, 0.5, 1], 'penalty': ['l1', 'l2']}),
    # Further RandomForest options that were noted but left disabled:
    #   'criterion' : ['gini', 'entropy'],
    #   'max_features' : ['sqrt', 'log2'],
    #   'max_depth' : np.arange(5, 25, 2)
    ("RandomForest", {'n_estimators': [10, 100, 200, 500, 1000]}),
    ('knn', {'n_neighbors': [1, 3, 5, 7, 11]}),
    ("MLP", {'alpha': [0.001, 0.01, 0.1, 1]}),
    ("SVM", {'C': [0.001, 0.01, 0.1, 0.5, 1],
             'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
             'probability': [True]}),
    ("AdaBoost", {'n_estimators': range(10, 60, 10)}),
    ("GradientBoosting", {'n_estimators': range(10, 60, 10)}),
    ("LinearSVM", {'C': np.logspace(-5, 3, 5),
                   'loss': ['hinge', 'squared_hinge']}),
    ("DecisionTree", {'criterion': ['gini', 'entropy'],
                      'max_features': ['sqrt', 'log2', None],
                      'max_depth': np.arange(10, 20)}),
]

# Tune each model, then print one line of performance figures for it.
# `model`, `parameters` and `result` are deliberately the loop names so that,
# after the cell has run, they hold the values of the last model as before.
# NOTE(review): 5 and 'recall' look like a fold count and a scoring metric --
# confirm against SupervisedLearning.TuneAndReport.
for model, parameters in MODEL_GRIDS:
    result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')
    print(ModelPerformance.model_performance('all'))



[0.86411149825783973, 0.80425531914893622, 0.85520361990950222, 0.82894736842105265]
[0.8850174216027874, 0.82127659574468082, 0.88940092165898621, 0.85398230088495575]
[0.84843205574912894, 0.78723404255319152, 0.83333333333333337, 0.80962800875273522]
[0.87630662020905925, 0.83829787234042552, 0.85652173913043483, 0.84731182795698923]
[0.86933797909407662, 0.82553191489361699, 0.85087719298245612, 0.83801295896328298]
[0.86585365853658536, 0.76595744680851063, 0.8910891089108911, 0.82379862700228823]
[0.8850174216027874, 0.84680851063829787, 0.86899563318777295, 0.85775862068965503]
[0.86411149825783973, 0.82127659574468082, 0.84279475982532748, 0.8318965517241379]
[0.79442508710801396, 0.65106382978723409, 0.80952380952380953, 0.72169811320754729]


  'precision', 'predicted', average, warn_for)
