In [1]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file that backs a module name.

    Only the last dotted component of ``fullname`` is used as the file name,
    so ``"foo.bar"`` is looked up as ``bar.ipynb`` inside each directory of
    ``path``. If ``Foo_Bar.ipynb`` does not exist in a directory,
    ``Foo Bar.ipynb`` is tried there as well, so that ``import Foo_Bar`` can
    load a notebook whose file name contains spaces.

    Parameters
    ----------
    fullname : str
        Fully qualified module name.
    path : list of str, optional
        Directories to search. When empty or None, the current working
        directory is searched.

    Returns
    -------
    str or None
        Path of the first matching notebook, or None when there is no match.
    """
    name = fullname.rsplit('.', 1)[-1]
    # Candidate file names in priority order. The underscore -> space
    # substitution is applied to the file name only: applying it to the
    # joined path would also rewrite underscores in the directory part and
    # point at a directory that does not exist.
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    if not path:
        path = ['']
    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module loader that runs a Jupyter notebook's code cells as a module.

    The loader keeps a reference to the running InteractiveShell so that
    cell source (including IPython syntax such as magics) can be translated
    to plain Python before it is executed.
    """
    def __init__(self, path=None):
        # path: list of directories handed to find_notebook(), or None
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """Import the notebook matching ``fullname`` and return the module.

        The notebook is read through the public ``nbformat.read`` API and
        converted to notebook format 4, then every code cell is executed in
        order inside a fresh module namespace.

        Raises
        ------
        ImportError
            If no notebook file can be found for ``fullname``.
        Exception
            Any exception raised by a notebook cell is propagated; the
            partially initialised module is removed from ``sys.modules``
            first so that a later import retries from scratch.
        """
        # Imported here so this class does not depend on the deprecated
        # ``nbformat.current`` shim for reading notebooks.
        from nbformat import read as read_notebook

        path = find_notebook(fullname, self.path)
        if path is None:
            # Without this guard io.open(None) fails with an unrelated TypeError.
            raise ImportError("No notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object, upgraded to format 4 if it is older
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read_notebook(f, as_version=4)

        # create the module and add it to sys.modules
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # A loader must not leave a half-executed module behind,
            # otherwise the next "import" silently returns the broken one.
            sys.modules.pop(fullname, None)
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Meta-path finder that hands notebook imports to a NotebookLoader.

    One loader is created per distinct search path and reused afterwards.
    """
    def __init__(self):
        # search path (joined into one string, or None) -> NotebookLoader
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname``, or None if no notebook matches."""
        if not find_notebook(fullname, path):
            return None

        # A list cannot be used as a dict key, so a non-empty path is
        # collapsed into a single string; a falsy path is used as-is.
        cache_key = os.path.sep.join(path) if path else path

        try:
            return self.loaders[cache_key]
        except KeyError:
            loader = self.loaders[cache_key] = NotebookLoader(path)
            return loader

# Register the finder so that "import <name>" can resolve to <name>.ipynb.
# Re-running this cell redefines NotebookFinder, so a finder left behind by an
# earlier run is an instance of the *old* class object; it is therefore matched
# by class name and dropped first. Without this, every re-run of the cell
# stacks one more finder on sys.meta_path.
sys.meta_path[:] = [
    finder for finder in sys.meta_path
    if type(finder).__name__ != 'NotebookFinder'
]
sys.meta_path.append(NotebookFinder())


- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


In [2]:
import scipy
import numpy
import pandas
import csv
import MergeDiagnosis_AdjustedLabels
import LongitudinalDataAnalysis
import Categorical_Updated
import Imputation
import FeatureReduction
import SupervisedLearning
import ModelPerformance
import TrainTestSplit

importing Jupyter notebook from MergeDiagnosis_AdjustedLabels.ipynb
importing Jupyter notebook from LongitudinalDataAnalysis.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from Categorical_Updated.ipynb
importing Jupyter notebook from Imputation.ipynb
importing Jupyter notebook from FeatureReduction.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from SupervisedLearning.ipynb
importing Jupyter notebook from ModelPerformance.ipynb
importing Jupyter notebook from TrainTestSplit.ipynb


In [3]:
# -------------------------------
# Merge Data
# -------------------------------
# The result of data_preprocess() is kept in merged_data.
# NOTE(review): the meaning of the three options is defined inside the
# MergeDiagnosis_AdjustedLabels notebook, not here.
merged_data = MergeDiagnosis_AdjustedLabels.data_preprocess(
    study="all",
    imaging_to_drop='all',
    reversions='label0',
)

In [4]:
# -------------------------------
# Categorical to Numerical
# -------------------------------
# Column names passed to the conversion as "date" columns.
date_cols = [
    'update_stamp',
    'EXAMDATE',
    'EXAMDATE_bl',
]
# Column names passed as the columns to ignore
# (presumably left untouched by the conversion; confirm in Categorical_Updated).
cols_to_ignore = ['PTID']

Categorical_Updated.categorical_conversion(date_cols, cols_to_ignore)



Columsn that are one-hot encoded
-------------------------------------
['VISCODE', 'COLPROT', 'ORIGPROT', 'DX_bl', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']


In [5]:
# -------------------------------
# Longitudinal Data Analysis
# -------------------------------

# Input file name for Longitudinal Data Analysis
InputToLongitudinal = 'CategoricalToNumerical.csv'

# Only the header row is read here: it supplies the column labels that the
# feature groups below are selected from.
with open(InputToLongitudinal) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    labels = next(reader)

# Output file name from this script
OutputFromLongitudinal = 'LongitudinalDataAnalysis.csv'

# Patient RID Features
Patient_FEATURES = ['RID']

# Demographic Features
# A label is selected when it equals a base name, or when the text before its
# FIRST underscore equals a base name (e.g. "PTGENDER_Male" -> "PTGENDER").
Demo_FEATURES_type = ['AGE', 'PTEDUCAT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
Demo_FEATURES = [
    label for label in labels
    if label in Demo_FEATURES_type
    or ("_" in label and label.partition("_")[0] in Demo_FEATURES_type)
]

# Baseline OneTime Features
# Same rule, but the prefix is cut at the LAST underscore because base names
# such as "DX_bl" contain an underscore themselves.
BaselineOneTime_FEATURES_type = ['APOE4', 'Years_bl', 'DX_bl', 'ORIGPROT']
BaselineOneTime_FEATURES = [
    label for label in labels
    if label in BaselineOneTime_FEATURES_type
    or ("_" in label and label.rpartition("_")[0] in BaselineOneTime_FEATURES_type)
]

# Time Headers (prefix cut at the first underscore, as for the demographics)
Time_FEATURES_type = [
    'SITE',
    'Month',
    'update_stamp_minus_EXAMDATE_bl',
    'update_stamp_minus_EXAMDATE',
    'EXAMDATE_minus_EXAMDATE_bl',
    'COLPROT',
    'M',
    'Month_bl',
]
Time_FEATURES = [
    label for label in labels
    if label in Time_FEATURES_type
    or ("_" in label and label.partition("_")[0] in Time_FEATURES_type)
]
# Month_bl must be last feature in this list
Time_FEATURES.append(Time_FEATURES.pop(Time_FEATURES.index('Month_bl')))

# Baseline Evaluation Features
BaselineEvaluation_FEATURES = [
    'CDRSB_bl', 'ADAS11_bl', 'ADAS13_bl', 'MMSE_bl',
    'RAVLT_learning_bl', 'RAVLT_forgetting_bl',
    'RAVLT_perc_forgetting_bl', 'RAVLT_immediate_bl',
    'FAQ_bl', 'MOCA_bl',
    'EcogPtLang_bl', 'EcogPtVisspat_bl', 'EcogPtPlan_bl', 'EcogPtOrgan_bl',
    'EcogPtDivatt_bl', 'EcogPtMem_bl', 'EcogPtTotal_bl',
    'EcogSPLang_bl', 'EcogSPVisspat_bl', 'EcogSPPlan_bl', 'EcogSPOrgan_bl',
    'EcogSPDivatt_bl', 'EcogSPMem_bl', 'EcogSPTotal_bl',
]

# Current Medical Evaluation
CurrentEvaluation_FEATURES = [
    'CDRSB', 'ADAS11', 'ADAS13', 'MMSE',
    'RAVLT_learning', 'RAVLT_forgetting',
    'RAVLT_perc_forgetting', 'RAVLT_immediate',
    'FAQ', 'MOCA',
    'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan',
    'EcogPtDivatt', 'EcogPtMem', 'EcogPtTotal',
    'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan',
    'EcogSPDivatt', 'EcogSPMem', 'EcogSPTotal',
]

# Current Diagnosis
CurrentDiagnosis_FEATURES = ['AD']

# Longitudinal Method
LongitudinalMethod = 2
MetricList = ['MaxTime', 'Delta', 'Mean', 'Std']

# Run Longitudinal Data Analysis
LongitudinalDataAnalysis.runLongitudinal(
    InputToLongitudinal,
    OutputFromLongitudinal,
    Patient_FEATURES,
    Demo_FEATURES,
    BaselineOneTime_FEATURES,
    Time_FEATURES,
    BaselineEvaluation_FEATURES,
    CurrentEvaluation_FEATURES,
    CurrentDiagnosis_FEATURES,
    LongitudinalMethod,
    MetricList,
)





Unnamed: 0.1,Unnamed: 0,RID,SITE,AGE,PTEDUCAT,APOE4,CDRSB,ADAS11,ADAS13,MMSE,...,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,PTMARRY_Married,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed,AD
0,0,2,11,74.3,16,0.0,0.0,10.67,18.67,28.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
1,1,3,11,81.3,18,1.0,4.5,22.0,31.0,20.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
2,2,3,11,81.3,18,1.0,6.0,19.0,30.0,24.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1


Input Matrix Size:
-----------------------
(12736, 104)
 
Identified columns of interest in input file: 
------------------------------------------------------------------------------------------------------------------------
Patient RID column: [1]
Demo columns: [  3   4  90  91  92  93  94  95  96  97  98  99 100 101 102]
BaselineOneTime columns: [ 5 54 84 85 86 87 88 89]
BaselineEvaluation columns: [30 31 32 33 35 36 37 34 38 39 41 42 43 44 45 40 46 48 49 50 51 52 47 53]
Time columns: [ 2 55 56 57 58 59 60 82 83]
CurrentEvaluation columns: [ 6  7  8  9 11 12 13 10 14 15 17 18 19 20 21 16 22 24 25 26 27 28 23 29]
CurrentDiagnosis columns: [103]
------
Method 2 for Longitudinal Data Analysis
------
 
New Input Matrix Size:
-----------------------
(1737, 120)
 
New Output Matrix Size:
-----------------------
(1737,)


In [None]:
# ----------------------------------
# Repeated evaluation
# ----------------------------------
# For every (imputation method, feature-reduction method) pair the pipeline
# split -> impute -> feature reduction -> tune -> score is repeated N_REPEATS
# times, and the collected results are written to "<imp>_<feat>.csv".

N_REPEATS = 50

# Settings that never change between runs are defined once, outside the loops.

# Input and Output files from Feature Reduction for train and test sets
InputToFeatureReduction_train     = 'ImputedMatrix_train.csv'
OutputFromFeatureReduction_train  = 'Features_train.csv'
InputToFeatureReduction_test      = 'ImputedMatrix_test.csv'
OutputFromFeatureReduction_test   = 'Features_test.csv'

# Normalization method
NormalizationMethod = 'MinMax'
#NormalizationMethod='MeanStd'

# Feature Reduction settings
ExplainedVariance = 0.99  # For method 'SVD'
APpreference = -50        # Hyperparameter for method 'AffinityPropagation'

for imp in ['meanmode', 'knn']:
    for feat in ['none', 'SVD', 'AffinityPropagation']:
        # Per-model result lists; the "_p" lists hold what TuneAndReport
        # returns, the others what model_performance returns. Only the MLP
        # lists are filled while the other models below stay disabled.
        log = []
        rf = []
        mlp = []
        grad = []
        svm = []
        log_p = []
        rf_p = []
        mlp_p = []
        grad_p = []
        svm_p = []

        # Feature Reduction Method for this combination
        FeatureReductionMethod = feat

        for i in range(N_REPEATS):
            TrainTestSplit.traintest_split(0.33)

            Imputation.imputation(imp)

            # Run Feature Reduction
            FeatureReduction.RunFeatureReduction(
                InputToFeatureReduction_train, OutputFromFeatureReduction_train,
                InputToFeatureReduction_test, OutputFromFeatureReduction_test,
                NormalizationMethod, FeatureReductionMethod, ExplainedVariance, APpreference)

            ## Models
            # Disabled alternatives are kept as comments; uncomment a block
            # (and the matching writerow below) to evaluate that model too.

            #parameters = {'C': [0.001,0.01,0.1,0.5,1, 5], 'penalty': ['l1','l2']}
            #model = "LogisticRegression"
            #result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')
            #log_p.append(result)
            #log.append(ModelPerformance.model_performance('all'))

            #parameters = {'n_estimators': [10,100,200,500,1000]}
            #model = "RandomForest"
            #result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')
            #rf_p.append(result)
            #rf.append(ModelPerformance.model_performance('all'))

            # The grid is rebuilt on every run so that each call receives a
            # fresh dict, exactly as before.
            parameters = {'alpha': [0.001, 0.01, 0.1, 1, 5], 'hidden_layer_sizes': [(100,), (100,100)] }
            model = "MLP"
            result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')
            mlp_p.append(result)
            mlp.append(ModelPerformance.model_performance('all'))
            # Progress indicator. Call form works on Python 2 and 3 alike
            # (the statement form "print i" is a SyntaxError on Python 3).
            print(i)

            #parameters = {'C': [0.001,0.01,0.1,0.5,1, 5], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'probability': [True]}
            #model = "SVM"
            #result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')
            #svm_p.append(result)
            #svm.append(ModelPerformance.model_performance('all'))

            #parameters = {'n_estimators' : range(10, 800, 50)}
            #model = "GradientBoosting"
            #result = SupervisedLearning.TuneAndReport(model, parameters, 5, 'recall')
            #grad_p.append(result)
            #grad.append(ModelPerformance.model_performance('all'))

        # One row per enabled model: label, tuning results, performance results
        with open(str(imp)+"_"+str(feat)+".csv", 'w') as f:
            writer = csv.writer(f)
            #writer.writerow(['log',log_p,log])
            #writer.writerow(['rf',rf_p,rf])
            writer.writerow(['mlp',mlp_p,mlp])
            #writer.writerow(['svm',svm_p,svm])
            #writer.writerow(['grad',grad_p,grad])
            



Unnamed: 0,# RID,AGE,PTEDUCAT,PTGENDER_Male,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,...,EcogSPDivatt_Std,EcogSPMem_MaxTime,EcogSPMem_Delta,EcogSPMem_Mean,EcogSPMem_Std,EcogSPTotal_MaxTime,EcogSPTotal_Delta,EcogSPTotal_Mean,EcogSPTotal_Std,Diagnostics
0,419.0,70.2,18.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.132455,1.34375,0.16238,0.0,0.199948,1.264845,0.065282,0.0
1,619.0,77.5,12.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.28318,0.034314,0.132455,2.186072,0.254667,0.03321,0.199948,1.881981,0.195019,1.0
2,814.0,71.0,16.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.28318,0.034314,0.132455,2.186072,0.254667,0.03321,0.199948,1.881981,0.195019,1.0


Unnamed: 0,# RID,AGE,PTEDUCAT,PTGENDER_Male,PTETHCAT_Not Hisp/Latino,PTETHCAT_Unknown,PTRACCAT_Asian,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,...,EcogSPDivatt_Std,EcogSPMem_MaxTime,EcogSPMem_Delta,EcogSPMem_Mean,EcogSPMem_Std,EcogSPTotal_MaxTime,EcogSPTotal_Delta,EcogSPTotal_Mean,EcogSPTotal_Std,Diagnostics
0,899.0,80.1,16.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.28318,0.034314,0.132455,2.186072,0.254667,0.03321,0.199948,1.881981,0.195019,0.0
1,4194.0,62.0,16.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.272431,0.0,-0.125,1.6875,0.548435,0.0,0.01178,1.274957,0.139251,0.0
2,579.0,65.4,18.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.28318,0.034314,0.132455,2.186072,0.254667,0.03321,0.199948,1.881981,0.195019,0.0
