### Script to merge AIBL_data.csv with the existing ADNI file (Merged_data_original.csv) and save the combined table as Merged_data.csv


In [1]:
# ------------------------------------------------------------------------------------------------------------------------------
#                                              Import libraries
# ------------------------------------------------------------------------------------------------------------------------------
#%reset
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import collections
import matplotlib as mpl
from collections import OrderedDict
import time
from datetime import datetime
from sys import stdout
import collections
import csv
import pandas as pd 
from scipy.stats import mode
from scipy import stats
from dateutil.parser import parse


In [2]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

In [3]:
def load_numerical_table(fileName):
    """Load a .csv file that has numerical-only data and report some basic info about its content.

    The first three rows are shown with ``display`` and the shape of the resulting matrix is printed.

    Parameters:
    -----------
    -fileName: path of the .csv file we want to load; put file name only if file is located in current path

    Returns:
    --------
    InputMatrix  = numpy array with all values, equivalent to the values in the input .csv file (but no headers)
    InputHeaders = list with headers in input .csv file

    """

    # Load .csv file into pandas dataframe and show head
    # ----------------------------------------------------
    transformed_data = pd.read_csv(fileName)
    display(transformed_data.head(3))

    # Put headers into list
    # ----------------------
    InputHeaders = list(transformed_data)

    # Turn data frame into matrix
    # ---------------------------------
    # ``.values`` is used instead of ``DataFrame.as_matrix()``, which is deprecated
    # and no longer available in recent pandas releases.
    InputMatrix = np.array(transformed_data.values)
    # Single-argument print calls give the same output on Python 2 and Python 3.
    print("Input Matrix Size:")
    print("-----------------------")
    print(InputMatrix.shape)

    # NOTE: values of -1000 are returned untouched; they are not turned into NaNs here.

    return InputMatrix, InputHeaders

In [4]:
# -----------------------------------------
# Load ADNI and AIBL data sets
# -----------------------------------------
#
ADNI_Data = pd.read_csv("Merged_data_original.csv")
AIBL_Data = pd.read_csv("AIBL_data.csv")

# Change the types of AIBL to match ADNI
# (every ADNI column is looked up in the AIBL table, so it has to exist there)
for column_name in list(ADNI_Data):
    AIBL_Data[column_name] = AIBL_Data[column_name].astype(ADNI_Data[column_name].dtype)

# Add 100000 to AIBL's RIDs to make sure they do not overlap with ADNI's RIDs
AIBL_Data['RID'] = AIBL_Data['RID'] + 100000

# The timestamp columns given in the AIBL file do not have the right format; we overwrite them for now with the
# first ADNI timestamp (it does not matter too much as they are not really used in our analysis)
AIBL_Data['update_stamp'] = [ADNI_Data['update_stamp'].loc[0]] * len(AIBL_Data)

# Stack ADNI and AIBL data sets into one big matrix.
# pd.concat is used because DataFrame.append is deprecated and removed from recent pandas releases.
ADNI_AIBL_data = pd.concat([ADNI_Data, AIBL_Data])

# Print matrix size after stacking ADNI and AIBL data sets
print(" ")
print("Data size after stacking ADNI and AIBL data sets:")
print("------------------------------------------------- ")
print(ADNI_AIBL_data.shape)

# Save to .csv file.
# The stacked frame's own column names are written: relabelling with list(AIBL_Data) would attach
# the wrong header to a column whenever the AIBL file orders its columns differently from ADNI.
ADNI_AIBL_data.to_csv('Merged_data.csv', index=False)

 
Data size after stacking ADNI and AIBL data sets:
------------------------------------------------- 
(26959, 70)


In [5]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """find a notebook, given its fully qualified name and an optional path

    Only the last dotted component of ``fullname`` is used as file name, so
    "foo.bar" is searched as "bar.ipynb" inside each directory of ``path``.
    If "Foo_Bar.ipynb" does not exist in a directory, "Foo Bar.ipynb" is tried
    in that same directory.

    Parameters:
    -----------
    -fullname: dotted module name of the notebook
    -path: optional list of directories to search; the current directory is used when it is empty or None

    Returns:
    --------
    Path of the first matching .ipynb file, or None when no notebook is found
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # Exact name first, then the spelling that lets "import Notebook_Name" find "Notebook Name.ipynb".
    # Only the file name is rewritten: underscores in the directory part must stay untouched,
    # otherwise notebooks stored in a directory such as "my_dir" can never be found.
    candidates = [name + ".ipynb", name.replace("_", " ") + ".ipynb"]
    for d in path:
        for candidate in candidates:
            nb_path = os.path.join(d, candidate)
            if os.path.isfile(nb_path):
                return nb_path
    return None


class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""
    def __init__(self, path=None):
        # Shell used to translate IPython syntax (magics, "!" commands) into plain Python
        self.shell = InteractiveShell.instance()
        # Optional list of directories in which the notebook is searched
        self.path = path

    def load_module(self, fullname):
        """import a notebook as a module

        Every code cell of the notebook is executed, in order, inside the
        namespace of a new module that is registered in ``sys.modules``.

        Parameters:
        -----------
        -fullname: dotted module name of the notebook to import

        Returns:
        --------
        The module object holding everything the notebook defined

        Raises:
        -------
        ImportError if no notebook file can be found for ``fullname``;
        any exception raised by a notebook cell is propagated unchanged
        """
        # Imported locally: the file-level import only binds the deprecated
        # ``nbformat.current`` shim, not the public ``nbformat`` API used here.
        import nbformat

        path = find_notebook(fullname, self.path)
        if path is None:
            # Fail with the exception type the import system expects instead of
            # letting io.open(None) raise an unrelated error.
            raise ImportError("No Jupyter notebook found for module %r" % fullname)

        print("importing Jupyter notebook from %s" % path)

        # load the notebook object, converted to notebook format version 4
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = nbformat.read(f, as_version=4)

        # create the module and add it to sys.modules
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except Exception:
            # Do not leave a half-initialised module behind: a later import of the
            # same name would otherwise silently return the incomplete module.
            sys.modules.pop(fullname, None)
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Module finder that locates Jupyter Notebooks

    One NotebookLoader is created per distinct search path and reused afterwards.
    """
    def __init__(self):
        # search-path key -> NotebookLoader created for that search path
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return the loader for notebook ``fullname``, or None when no notebook file matches."""
        if not find_notebook(fullname, path):
            return None

        # lists aren't hashable, so a non-empty path list is collapsed into one string key;
        # an empty or missing path is used as key unchanged
        cache_key = os.path.sep.join(path) if path else path

        loader = self.loaders.get(cache_key)
        if loader is None:
            loader = NotebookLoader(path)
            self.loaders[cache_key] = loader
        return loader

# Register a NotebookFinder on sys.meta_path so that import statements can also locate .ipynb files.
# Note: each execution of this line appends one more finder instance to sys.meta_path.
sys.meta_path.append(NotebookFinder())




- use nbformat for read/write/validate public API
- use nbformat.vX directly to composing notebooks of a particular version

  """)


In [6]:
import LongitudinalDataAnalysis
import Imputation
import FeatureReduction
import SupervisedLearning
import ModelPerformance
import TrainTestSplit
import Categorical_Updated
import csv

importing Jupyter notebook from LongitudinalDataAnalysis.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from Imputation.ipynb
importing Jupyter notebook from FeatureReduction.ipynb


<IPython.core.display.Javascript object>

importing Jupyter notebook from SupervisedLearning.ipynb
importing Jupyter notebook from ModelPerformance.ipynb
importing Jupyter notebook from TrainTestSplit.ipynb
importing Jupyter notebook from Categorical_Updated.ipynb


In [7]:
# -------------------------------
# Categorical to Numerical
# -------------------------------
# Column names handed to the conversion as date columns
date_cols = [
    'update_stamp',
    'EXAMDATE',
    'EXAMDATE_bl',
]
# Column names handed to the conversion as columns to ignore
cols_to_ignore = ['PTID']

Categorical_Updated.categorical_conversion(date_cols, cols_to_ignore)

Columns that are one-hot encoded
-------------------------------------
['VISCODE', 'COLPROT', 'ORIGPROT', 'DX_bl', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']


In [8]:
# -------------------------------
# Longitudinal Data Analysis
# -------------------------------

# Input and output file names for the Longitudinal Data Analysis
InputToLongitudinal = 'CategoricalToNumerical.csv'
OutputFromLongitudinal = 'LongitudinalDataAnalysis.csv'

# Only the header row of the input file is needed to pick the feature columns
with open(InputToLongitudinal) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    labels = next(reader)

# Patient RID Features
Patient_FEATURES = ['RID']

# Demographic Features: a header is kept when it is one of the listed names,
# or when the text before its FIRST underscore is one of them
Demo_FEATURES_type = ['AGE', 'PTEDUCAT', 'PTGENDER', 'PTETHCAT', 'PTRACCAT', 'PTMARRY']
Demo_FEATURES = [
    label for label in labels
    if label in Demo_FEATURES_type or label.split("_", 1)[0] in Demo_FEATURES_type
]

# Baseline OneTime Features: same rule, but the prefix ends at the LAST underscore
BaselineOneTime_FEATURES_type = ['APOE4', 'Years_bl', 'ORIGPROT']
BaselineOneTime_FEATURES = [
    label for label in labels
    if label in BaselineOneTime_FEATURES_type or label.rsplit("_", 1)[0] in BaselineOneTime_FEATURES_type
]

# Time Headers: prefix taken up to the FIRST underscore
Time_FEATURES_type = [
    'SITE', 'Month', 'update_stamp_minus_EXAMDATE_bl', 'update_stamp_minus_EXAMDATE',
    'EXAMDATE_minus_EXAMDATE_bl', 'COLPROT', 'M', 'Month_bl',
]
Time_FEATURES = [
    label for label in labels
    if label in Time_FEATURES_type or label.split("_", 1)[0] in Time_FEATURES_type
]
# Month_bl must be last feature in this list
Time_FEATURES.append(Time_FEATURES.pop(Time_FEATURES.index('Month_bl')))

# Baseline Evaluation Features
BaselineEvaluation_FEATURES = [
    'CDRSB_bl', 'ADAS11_bl', 'ADAS13_bl', 'MMSE_bl',
    'RAVLT_learning_bl', 'RAVLT_forgetting_bl', 'RAVLT_perc_forgetting_bl', 'RAVLT_immediate_bl',
    'FAQ_bl', 'MOCA_bl',
    'EcogPtLang_bl', 'EcogPtVisspat_bl', 'EcogPtPlan_bl', 'EcogPtOrgan_bl',
    'EcogPtDivatt_bl', 'EcogPtMem_bl', 'EcogPtTotal_bl',
    'EcogSPLang_bl', 'EcogSPVisspat_bl', 'EcogSPPlan_bl', 'EcogSPOrgan_bl',
    'EcogSPDivatt_bl', 'EcogSPMem_bl', 'EcogSPTotal_bl',
]

# Current Medical Evaluation
CurrentEvaluation_FEATURES = [
    'CDRSB', 'ADAS11', 'ADAS13', 'MMSE',
    'RAVLT_learning', 'RAVLT_forgetting', 'RAVLT_perc_forgetting', 'RAVLT_immediate',
    'FAQ', 'MOCA',
    'EcogPtLang', 'EcogPtVisspat', 'EcogPtPlan', 'EcogPtOrgan',
    'EcogPtDivatt', 'EcogPtMem', 'EcogPtTotal',
    'EcogSPLang', 'EcogSPVisspat', 'EcogSPPlan', 'EcogSPOrgan',
    'EcogSPDivatt', 'EcogSPMem', 'EcogSPTotal',
]

# Current Diagnosis
CurrentDiagnosis_FEATURES = ['AD']

# Longitudinal Method
LongitudinalMethod = 2
MetricList = ['MaxTime', 'Delta', 'Mean', 'Std']

# Run Longitudinal Data Analysis
LongitudinalDataAnalysis.runLongitudinal(
    InputToLongitudinal,
    OutputFromLongitudinal,
    Patient_FEATURES,
    Demo_FEATURES,
    BaselineOneTime_FEATURES,
    Time_FEATURES,
    BaselineEvaluation_FEATURES,
    CurrentEvaluation_FEATURES,
    CurrentDiagnosis_FEATURES,
    LongitudinalMethod,
    MetricList,
)





Unnamed: 0.1,Unnamed: 0,RID,SITE,AGE,PTEDUCAT,APOE4,CDRSB,ADAS11,ADAS13,MMSE,...,PTRACCAT_Black,PTRACCAT_Hawaiian/Other PI,PTRACCAT_More than one,PTRACCAT_Unknown,PTRACCAT_White,PTMARRY_Married,PTMARRY_Never married,PTMARRY_Unknown,PTMARRY_Widowed,AD
0,0,2,11,74.3,16,0.0,0.0,10.67,18.67,28.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
1,1,3,11,81.3,18,1.0,4.5,22.0,31.0,20.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
2,2,3,11,81.3,18,1.0,6.0,19.0,30.0,24.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1


Input Matrix Size:
-----------------------
(26959, 106)
 
Identified columns of interest in input file: 
------------------------------------------------------------------------------------------------------------------------
Patient RID column: [1]
Demo columns: [  3   4  92  93  94  95  96  97  98  99 100 101 102 103 104]
BaselineOneTime columns: [ 5 54 85 86 87]
BaselineEvaluation columns: [30 31 32 33 35 36 37 34 38 39 41 42 43 44 45 40 46 48 49 50 51 52 47 53]
Time columns: [ 2 56 57 58 59 60 82 83 84 55]
CurrentEvaluation columns: [ 6  7  8  9 11 12 13 10 14 15 17 18 19 20 21 16 22 24 25 26 27 28 23 29]
CurrentDiagnosis columns: [105]
------
Method 2 for Longitudinal Data Analysis
------
 
New Input Matrix Size:
-----------------------
(4335, 116)
 
New Output Matrix Size:
-----------------------
(4335,)
