In [None]:
#  =============================================================================================================================
#                                         SCRIPT FOR LONGITUDINAL DATA ANALYSIS
#  LAST UPDATE: 03/10/2017
#  =============================================================================================================================

In [None]:
# ------------------------------------------------------------------------------------------------------------------------------
#                                              Import libraries
# ------------------------------------------------------------------------------------------------------------------------------
#%reset
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import collections
import matplotlib as mpl
from collections import OrderedDict
import time
from datetime import datetime
from sys import stdout
import collections
import csv
import pandas as pd 
from scipy.stats import mode
from scipy import stats
from dateutil.parser import parse



In [None]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999.
// NOTE(review): presumably this keeps long cell outputs fully expanded instead of
// collapsing them into a scroll box; it relies on the `IPython` JS global — confirm it exists in the target front end.
IPython.OutputArea.auto_scroll_threshold = 9999

In [None]:
def load_numerical_table(fileName):
    """Load a numerical-only .csv file and report basic information about its content.

    The first three rows of the table are displayed in the notebook and the
    shape of the resulting matrix is printed.

    Parameters:
    -----------
    - fileName: path of the .csv file we want to load; the file name alone is enough
                if the file is located in the current path

    Returns:
    --------
    InputMatrix  = numpy array with all values of the input .csv file (headers excluded)
    InputHeaders = list with the column headers of the input .csv file
    """

    # Load .csv file into a pandas dataframe and show its head
    # ----------------------------------------------------------
    transformed_data = pd.read_csv(fileName)
    display(transformed_data.head(3))

    # Put headers into list (iterating a DataFrame yields its column labels)
    # -----------------------------------------------------------------------
    InputHeaders = list(transformed_data)

    # Turn data frame into matrix
    # ---------------------------------
    # DataFrame.as_matrix() was removed in pandas 1.0; `.values` returns the same
    # array on old and new pandas versions. np.array() keeps the original copy semantics.
    InputMatrix = np.array(transformed_data.values)

    # Single-argument print(...) produces identical output on Python 2 and Python 3
    print("Input Matrix Size:")
    print("-----------------------")
    print(InputMatrix.shape)

    # NOTE: placeholder values such as -1000 are intentionally left untouched here;
    # no missing-value conversion is applied by this function.

    return InputMatrix, InputHeaders

def ismember(x, y):
    """Locate every element of list "x" inside list "y", keeping the order of "x".

    Parameters:
    -----------
    - x: list of elements to look for
    - y: list in which the elements are searched

    Returns:
    --------
    List with one entry per element of "x": the index of its first occurrence
    in "y", or None when the element does not appear in "y".
    """

    # Remember only the first position at which each value of y shows up
    first_position = {}
    for position, value in enumerate(y):
        first_position.setdefault(value, position)

    # dict.get returns None for values that never appeared in y
    return [first_position.get(wanted) for wanted in x]


def find_column_features(InputHeaders,Patient_FEATURES,Demo_FEATURES,BaselineOneTime_FEATURES,Time_FEATURES,BaselineEvaluation_FEATURES,
                        CurrentEvaluation_FEATURES,CurrentDiagnosis_FEATURES):
    """Find the column numbers of the features of interest, grouped by feature group.

    Parameters:
    -----------
    - InputHeaders: List of headers
    - Patient_FEATURES: Names of the features that identify the patient (e.g. patient ID).
    - Demo_FEATURES: Names of the features associated with demographic information (Age, Sex, Ethnicity...)
    - BaselineOneTime_FEATURES: Names of the features that hold information taken exclusively during baseline
    - Time_FEATURES: Names of the features that provide information about the time of any evaluation
    - BaselineEvaluation_FEATURES: Names of the features associated with measurements during baseline
    - CurrentEvaluation_FEATURES: Names of the features associated with measurements after baseline
    - CurrentDiagnosis_FEATURES: Name of the feature that contains the diagnosis (binary variable)

    * Note: The whole idea of this grouping is that BaselineEvaluation_FEATURES and CurrentEvaluation_FEATURES
            have exactly the same number of features (so they can be easily handled during longitudinal data analysis)

    Returns:
    --------
    Tuple of numpy arrays (c_Patient, c_Demo, c_BaselineOneTime, c_Time, c_BaselineEvaluation,
    c_CurrentEvaluation, c_CurrentDiagnosis), each holding the column numbers in InputHeaders of the
    corresponding feature group. A feature name that is not present in InputHeaders is reported as
    None by ismember, so the corresponding array then contains None.
    """

    # Find columns for each feature type and convert them to numpy arrays
    c_Patient             = np.asarray(ismember(Patient_FEATURES, InputHeaders))
    c_Demo                = np.asarray(ismember(Demo_FEATURES, InputHeaders))
    c_BaselineOneTime     = np.asarray(ismember(BaselineOneTime_FEATURES, InputHeaders))
    c_BaselineEvaluation  = np.asarray(ismember(BaselineEvaluation_FEATURES, InputHeaders))
    c_Time                = np.asarray(ismember(Time_FEATURES, InputHeaders))
    c_CurrentEvaluation   = np.asarray(ismember(CurrentEvaluation_FEATURES, InputHeaders))
    c_CurrentDiagnosis    = np.asarray(ismember(CurrentDiagnosis_FEATURES, InputHeaders))

    # Report, in the same order and with the same labels as before
    report = (
        ('Patient RID column:', c_Patient),
        ('Demo columns:', c_Demo),
        ('BaselineOneTime columns:', c_BaselineOneTime),
        ('BaselineEvaluation columns:', c_BaselineEvaluation),
        ('Time columns:', c_Time),
        ('CurrentEvaluation columns:', c_CurrentEvaluation),
        ('CurrentDiagnosis columns:', c_CurrentDiagnosis),
    )

    # Each line is formatted up front and printed with a single-argument print(...),
    # which yields the same text on Python 2 and Python 3 ("<label> <columns>").
    print(' ')
    print('Identified columns of interest in input file: ')
    print('------------------------------------------------------------------------------------------------------------------------')
    for label, columns in report:
        print('%s %s' % (label, columns))

    return c_Patient,c_Demo,c_BaselineOneTime,c_Time,c_BaselineEvaluation,c_CurrentEvaluation,c_CurrentDiagnosis

def computeLDSmetric(A,B,CurrentEvaluation_FEATURES,patientTime_p,MetricList):
    """Summarise every longitudinal feature of one patient into scalar metrics.

    Parameters:
    -----------
    - A: 2-D array (visits x features); only its first row A[0, c] is read, as the
         reference value for the 'Delta' metric, and its column count drives the loop.
    - B: 2-D array (visits x features) with the values to summarise; NaN entries are ignored.
         It must have at least as many columns as A.
    - CurrentEvaluation_FEATURES: feature names, one per column, used to build the metric headers.
    - patientTime_p: 1-D array with the time of each row of B (same row order as B).
    - MetricList: metrics to compute for every column, in order. Supported names:
        'MaxTime' -> time of the last row in which the column is not NaN
        'Delta'   -> last non-NaN value of the column in B minus A[0, c]
        'Mean'    -> np.mean of the non-NaN values
        'Std'     -> np.std of the non-NaN values (numpy default, ddof=0)
      Any other name is silently skipped. A column without any non-NaN value yields NaN
      for every metric.

    Returns:
    --------
    - LongitudinalMetric: 1-D numpy array with the metric values, ordered column by column and,
      within a column, in MetricList order.
    - LongitudinalFeatures: 1-D numpy array with the matching headers ("<feature>_<metric>").
    Both are always 1-D, even when a single value is produced (a bare scalar / plain string
    would break np.concatenate and ",".join in the callers). Both are empty arrays when
    nothing is computed.
    """

    supported_metrics = ('MaxTime', 'Delta', 'Mean', 'Std')

    metric_values = []
    metric_headers = []

    # Loop over each column
    for c in range(A.shape[1]):

        # Rows in which this column holds a real measurement
        valid_rows = ~np.isnan(B[:, c])
        C = B[valid_rows, c]
        has_data = C.size != 0

        # Loop over each metric for this column
        for m in MetricList:

            if m not in supported_metrics:
                continue

            if not has_data:
                metricValue = np.nan
            elif m == 'MaxTime':
                # Time of the last visit with a valid measurement
                metricValue = patientTime_p[np.flatnonzero(valid_rows)[-1]]
            elif m == 'Delta':
                metricValue = C[-1] - A[0, c]
            elif m == 'Mean':
                metricValue = np.mean(C)
            else:  # 'Std'
                metricValue = np.std(C)

            metric_values.append(metricValue)
            metric_headers.append(CurrentEvaluation_FEATURES[c] + '_' + m)

    if not metric_values:
        return np.array([]), np.array([])

    # Build the outputs once instead of growing arrays with repeated hstack calls
    LongitudinalMetric = np.hstack(metric_values)
    LongitudinalFeatures = np.array(metric_headers)

    return LongitudinalMetric,LongitudinalFeatures


def _slice_feature_groups(InputMatrix, rRID, column_groups):
    """Return one sub-matrix per feature group, restricted to the rows listed in rRID."""
    return [InputMatrix[np.ix_(rRID, columns)] for columns in column_groups]


def _join_headers(*name_groups):
    """Join the names of all given groups into one comma-separated header string."""
    return ",".join(name for group in name_groups for name in group)


def _next_visit_records(patient_groups, month_bl_col, m_col):
    """Method 1: pair every visit but the last with the diagnosis recorded at the following visit."""
    patientTime = patient_groups[5]

    # Concatenate all data for this patient into one matrix
    patientData = np.concatenate(patient_groups, axis=1)

    # Time until next visit, measured with both time features. The time columns are read
    # from the time group directly, so the result does not depend on how many diagnosis
    # columns follow them in patientData.
    patientDeltaT = np.column_stack((np.diff(patientTime[:, month_bl_col]),
                                     np.diff(patientTime[:, m_col])))

    # All rows except the last one (no "next visit" output is available for it)
    patientInputAll = np.concatenate((patientData[:-1, :], patientDeltaT), axis=1)

    # Output: diagnosis (last column) at the next visit
    patientOutputAll = patientData[1:, -1]

    return patientInputAll, patientOutputAll


def _summarise_patient(patient_groups, CurrentEvaluation_FEATURES, MetricList, visit_limit=None):
    """Methods 2 and 3: collapse all visits of one patient into a single feature vector."""
    (_patientRID, patientDemo, patientBaselineOneTime, patientBaselineEvaluation,
     patientCurrentEvaluation, patientTime, patientCurrentDiagnosis) = patient_groups

    # Use last feature in the Time list as a time ID
    patientTime_p = patientTime[:, -1]

    # Sort (make sure time is in ascending order); the sorted time vector is the one
    # handed to computeLDSmetric so that it stays aligned with the sorted rows.
    tsortindx = patientTime_p.argsort(axis=0)
    patientTime_p = patientTime_p[tsortindx]
    patientDemo = patientDemo[tsortindx, :]
    patientBaselineOneTime = patientBaselineOneTime[tsortindx, :]
    patientBaselineEvaluation = patientBaselineEvaluation[tsortindx, :]
    patientCurrentEvaluation = patientCurrentEvaluation[tsortindx, :]
    patientCurrentDiagnosis = patientCurrentDiagnosis[tsortindx, :]

    # Optionally restrict the evaluations to the earliest visits (test patients in method 3)
    if visit_limit is not None:
        patientBaselineEvaluation = patientBaselineEvaluation[:visit_limit, :]
        patientCurrentEvaluation = patientCurrentEvaluation[:visit_limit, :]

    # Compute longitudinal metrics
    patientLongitudinalMetric, patientLongitudinalFeatures = computeLDSmetric(
        patientBaselineEvaluation, patientCurrentEvaluation,
        CurrentEvaluation_FEATURES, patientTime_p, MetricList)

    # Concatenate all features for this patient into one vector. atleast_1d protects the
    # concatenation against a metric routine that hands back a bare scalar.
    patientInputAll = np.concatenate((patientDemo[0, :], patientBaselineOneTime[0, :],
                                      np.atleast_1d(patientLongitudinalMetric)), axis=0)

    # Output: diagnosis at the last (most recent) visit
    patientOutputAll = patientCurrentDiagnosis[-1]

    return patientInputAll, patientOutputAll, patientLongitudinalFeatures


def handle_longitudinal_data(InputMatrix,c_Patient,c_Demo,c_BaselineOneTime,c_Time,c_BaselineEvaluation,c_CurrentEvaluation,\
                             c_CurrentDiagnosis,Patient_FEATURES,Demo_FEATURES,BaselineOneTime_FEATURES,Time_FEATURES,\
                         BaselineEvaluation_FEATURES,CurrentEvaluation_FEATURES,CurrentDiagnosis_FEATURES,LongitudinalMethod,MetricList,
                             maxNVisits=1):

    """Build Input and Output matrices that can be used for supervised learning according to a
    pre-defined approach to longitudinal data analysis.

    Parameters:
    -----------
    - InputMatrix: Input matrix with numerical values only

    - c_Patient: column numbers for features containing basic patient ID information
    - c_Demo: column numbers for features containing patient demographic information (Age, Sex, Ethnicity, ...)
    - c_BaselineOneTime: column numbers for features containing measurements taken at baseline only.
    - c_Time: column numbers for features informing about time
    - c_BaselineEvaluation: column numbers for features containing baseline measurements.
    - c_CurrentEvaluation: column numbers for features containing current time measurements.
    - c_CurrentDiagnosis: column number containing diagnostics information

    - Patient_FEATURES: Names of the features that identify the patient (e.g. patient ID).
    - Demo_FEATURES: Names of the features associated with demographic information (Age, Sex, Ethnicity...)
    - BaselineOneTime_FEATURES: Names of the features that hold information taken exclusively during baseline
    - Time_FEATURES: Names of the features that provide information about the time of any evaluation
    - BaselineEvaluation_FEATURES: Names of the features associated with measurements during baseline
    - CurrentEvaluation_FEATURES: Names of the features associated with measurements after baseline
    - CurrentDiagnosis_FEATURES: Name of the feature that contains the diagnosis (binary variable)

    - LongitudinalMethod: number indicating the method for longitudinal data analysis:
        1 -> One record per patient visit (except the last visit of each patient). Inputs are all
             feature groups of that visit plus the time until the next visit ('DeltaTime1' from
             'Month_bl' and 'DeltaTime2' from 'M', both of which must be in Time_FEATURES).
             Output is the diagnosis at the next visit. Rows are used in the order in which they
             appear in InputMatrix (no sorting by time is applied).
        2 -> One record per patient. Visits are sorted by the last time feature, longitudinal data
             is summarised with computeLDSmetric, and the output is the diagnosis at the last visit.
        3 -> Same as 2, but for test patients only the first maxNVisits visits are summarised.
             Test patients are read from 'test_indices.csv' (indexes into the sorted unique patient
             IDs). Side effects: 'test_indices.csv' is overwritten with the indexes of the test
             patients that have at least maxNVisits visits, and the records of those patients are
             written to 'LongitudinalDataAnalysis_test.csv'.
    - MetricList: list of metric names forwarded to computeLDSmetric (methods 2 and 3 only)
    - maxNVisits: number of earliest visits kept for test patients in method 3 (default 1,
                  the value that used to be hard-coded)

    Returns:
    --------
    - NewInputMatrix: Input matrix ready for supervised learning (2-D, one record per row)
    - NewOutputMatrix: Output vector ready for supervised learning (1-D)
    - NewInputHeader: Comma-separated column headers for NewInputMatrix
    - NewOutputHeader: Column header for NewOutputMatrix
    - uRID: sorted unique patient IDs found in InputMatrix

    Raises:
    -------
    ValueError if LongitudinalMethod is not 1, 2 or 3, or if method 1 is requested and
    'Month_bl' / 'M' are missing from Time_FEATURES.
    """

    if LongitudinalMethod not in (1, 2, 3):
        raise ValueError("LongitudinalMethod must be 1, 2 or 3, got %r" % (LongitudinalMethod,))

    # Feature groups, in the column order used for the concatenated patient matrix
    column_groups = (c_Patient, c_Demo, c_BaselineOneTime, c_BaselineEvaluation,
                     c_CurrentEvaluation, c_Time, c_CurrentDiagnosis)

    # All patient RID (one entry per row of InputMatrix)
    aRID = InputMatrix[:, c_Patient].flatten()

    # Unique patient RID (sorted)
    uRID = np.unique(InputMatrix[:, c_Patient])

    # Per-patient results are collected in lists and stacked once at the end
    input_rows = []
    output_rows = []
    marked_test_indices = []  # method 3: test patients that have enough visits

    # -------------------------------------------------------------------------------------------------------------------------
    #                                 Option 1 for Longitudinal Data
    # -------------------------------------------------------------------------------------------------------------------------
    # Goal:     - To predict next visit's diagnostics for a given patient at a given time
    # INPUTS:   - Each record represents a given patient at a given time, plus delta time until next visit
    # OUTPUTS:  - Diagnosis at next visit
    # -------------------------------------------------------------------------------------------------------------------------
    if LongitudinalMethod == 1:

        time_names = list(Time_FEATURES)
        for required in ('Month_bl', 'M'):
            if required not in time_names:
                raise ValueError("Time_FEATURES must contain %r for LongitudinalMethod 1" % (required,))
        month_bl_col = time_names.index('Month_bl')
        m_col = time_names.index('M')

        for i in uRID:
            # Identify rows for this patient
            rRID = np.flatnonzero(aRID == i)
            patient_groups = _slice_feature_groups(InputMatrix, rRID, column_groups)
            patientInputAll, patientOutputAll = _next_visit_records(patient_groups, month_bl_col, m_col)
            input_rows.append(patientInputAll)
            output_rows.append(patientOutputAll)

        # Headers do not depend on any particular patient, so they are built once here
        NewInputHeader = _join_headers(Patient_FEATURES, Demo_FEATURES, BaselineOneTime_FEATURES,
                                       BaselineEvaluation_FEATURES, CurrentEvaluation_FEATURES,
                                       Time_FEATURES, CurrentDiagnosis_FEATURES,
                                       ['DeltaTime1', 'DeltaTime2'])
        NewOutputHeader = 'NextDiagnostics'

    # -------------------------------------------------------------------------------------------------------------------------
    #                                 Options 2 and 3 for Longitudinal Data
    # -------------------------------------------------------------------------------------------------------------------------
    # Goal:     - To predict final diagnostics
    # INPUTS:   - Each record represents a single patient; longitudinal data is summarised into metrics
    # OUTPUTS:  - Diagnosis at the last visit of the patient
    #           - Option 3 only: test patients are limited in how many visits can be used
    # -------------------------------------------------------------------------------------------------------------------------
    else:

        # Print Method
        print('------')
        print('Method %d for Longitudinal Data Analysis' % LongitudinalMethod)
        print('------')

        if LongitudinalMethod == 3:
            # Read indexes for test set (DataFrame.as_matrix() was removed in pandas 1.0)
            IndexTestSet = np.array(pd.read_csv('test_indices.csv', header=None).values)
            print(" Total number of test patients: " + str(len(IndexTestSet)))

            # Identify test patients uRID
            IndexTestSetaux = IndexTestSet.flatten().astype(int)
            test_uRID = uRID[IndexTestSetaux]

        patientLongitudinalFeatures = []

        for i in uRID:

            if LongitudinalMethod == 2:
                print(i)

            # Identify rows for this patient
            rRID = np.flatnonzero(aRID == i)

            # Method 3: test patients with enough visits are truncated and marked
            visit_limit = None
            if LongitudinalMethod == 3 and rRID.size >= maxNVisits:
                matches = np.flatnonzero(test_uRID == i)
                if matches.size:
                    visit_limit = maxNVisits
                    marked_test_indices.append(IndexTestSetaux[matches[0]])

            patient_groups = _slice_feature_groups(InputMatrix, rRID, column_groups)
            patientInputAll, patientOutputAll, patientLongitudinalFeatures = _summarise_patient(
                patient_groups, CurrentEvaluation_FEATURES, MetricList, visit_limit)

            input_rows.append(patientInputAll)
            output_rows.append(patientOutputAll)

        # The metric headers are identical for every patient; atleast_1d keeps a single
        # header intact instead of letting join() split a plain string into characters.
        NewInputHeader = _join_headers(Demo_FEATURES, BaselineOneTime_FEATURES,
                                       np.atleast_1d(patientLongitudinalFeatures))
        NewOutputHeader = 'Diagnostics'

    # Stack the per-patient results into the final matrices
    NewInputMatrix = np.vstack(input_rows) if input_rows else np.array([])
    NewOutputMatrix = np.concatenate(output_rows) if output_rows else np.array([])

    if LongitudinalMethod == 3:
        # Export again test_indices.csv and LongitudinalDataAnalysis_test.csv.
        # An integer dtype keeps the array usable as an index even when it is empty.
        IndexTestSetMarked = np.array(marked_test_indices, dtype=int)
        np.savetxt('test_indices.csv', IndexTestSetMarked, delimiter=',')
        myMatrix_temp = np.append(NewInputMatrix[IndexTestSetMarked, :],
                                  NewOutputMatrix[IndexTestSetMarked][:, None], 1)
        myHeader_temp = ",".join((NewInputHeader, NewOutputHeader))
        np.savetxt('LongitudinalDataAnalysis_test.csv', myMatrix_temp, header=myHeader_temp, delimiter=',')

    return NewInputMatrix,NewOutputMatrix,NewInputHeader,NewOutputHeader,uRID

    
def runLongitudinal(InputToLongitudinal,OutputFromLongitudinal,Patient_FEATURES,Demo_FEATURES,BaselineOneTime_FEATURES,\
                    Time_FEATURES,BaselineEvaluation_FEATURES,CurrentEvaluation_FEATURES,CurrentDiagnosis_FEATURES,LongitudinalMethod,MetricList):
    """Main function to run Longitudinal Data Analysis

    Loads the input table, locates the columns of every feature group, builds the
    supervised-learning matrices with handle_longitudinal_data and exports the result.

    Parameters:
    -----------
    - InputToLongitudinal: Input file (.csv)
    - OutputFromLongitudinal: Output file (.csv)
    - Patient_FEATURES: Names of the features that identify the patient (e.g. patient ID).
    - Demo_FEATURES: Names of the features associated with demographic information (Age, Sex, Ethnicity...)
    - BaselineOneTime_FEATURES: Names of the features that hold information taken exclusively during baseline
    - Time_FEATURES: Names of the features that provide information about the time of any evaluation
    - BaselineEvaluation_FEATURES: Names of the features associated with measurements during baseline
    - CurrentEvaluation_FEATURES: Names of the features associated with measurements after baseline
    - CurrentDiagnosis_FEATURES: Name of the feature that contains the diagnosis (binary variable)
    - LongitudinalMethod: number selecting the longitudinal method, forwarded to handle_longitudinal_data
    - MetricList: list of metric names, forwarded to handle_longitudinal_data

    Returns:
    --------
    Nothing. Two files are written:
    - <OutputFromLongitudinal>: the new input matrix with the output appended as last column
    - 'Longitudinal_RID.csv': the unique patient IDs
    Both are written with np.savetxt, which by default prefixes the header line with "# ".
    """

    # -----------------------------------------
    # Load Numerical-only .csv file
    # -----------------------------------------
    InputMatrix, InputHeaders = load_numerical_table(InputToLongitudinal)

    # -----------------------------------------------------------------------
    # Identify exact column number for each feature (using feature groups)
    # -----------------------------------------------------------------------
    (c_Patient, c_Demo, c_BaselineOneTime, c_Time,
     c_BaselineEvaluation, c_CurrentEvaluation, c_CurrentDiagnosis) = find_column_features(
        InputHeaders, Patient_FEATURES, Demo_FEATURES, BaselineOneTime_FEATURES, Time_FEATURES,
        BaselineEvaluation_FEATURES, CurrentEvaluation_FEATURES, CurrentDiagnosis_FEATURES)

    # -----------------------------------------------------------------------------------------------------------------------------
    #  Handle Longitudinal Data
    # -----------------------------------------------------------------------------------------------------------------------------
    NewInputMatrix, NewOutputMatrix, NewInputHeader, NewOutputHeader, uRID = handle_longitudinal_data(
        InputMatrix, c_Patient, c_Demo, c_BaselineOneTime, c_Time, c_BaselineEvaluation, c_CurrentEvaluation,
        c_CurrentDiagnosis, Patient_FEATURES, Demo_FEATURES, BaselineOneTime_FEATURES, Time_FEATURES,
        BaselineEvaluation_FEATURES, CurrentEvaluation_FEATURES, CurrentDiagnosis_FEATURES, LongitudinalMethod, MetricList)

    # -----------------------------------------------------------------------------------------------------------------------------
    # Report and Export .csv file
    # -----------------------------------------------------------------------------------------------------------------------------
    # Single-argument print(...) produces identical output on Python 2 and Python 3
    print(" ")
    print("New Input Matrix Size:")
    print("-----------------------")
    print(NewInputMatrix.shape)

    print(" ")
    print("New Output Matrix Size:")
    print("-----------------------")
    print(NewOutputMatrix.shape)

    # Save matrix to human readable data (output appended as the last column)
    myMatrix = np.append(NewInputMatrix, NewOutputMatrix[:, None], 1)
    myHeader = ",".join((NewInputHeader, NewOutputHeader))
    np.savetxt(OutputFromLongitudinal, myMatrix, header=myHeader, delimiter=',')

    # Save RIDs to human readable data
    np.savetxt('Longitudinal_RID.csv', uRID, header='RID', delimiter=',')
