# In [3]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# In [1]:
# Study-indicator columns; they are used to select rows and are removed from
# the saved train/test files.
_PROTOCOL_COLUMNS = ['ORIGPROT_ADNI2', 'ORIGPROT_ADNIGO']


def _select_study(data, study):
    """Return the rows of ``data`` that belong to the named study."""
    if study == 'ADNI1':
        # ADNI1 rows are the ones flagged as neither ADNI2 nor ADNIGO.
        mask = (data['ORIGPROT_ADNI2'] == 0) & (data['ORIGPROT_ADNIGO'] == 0)
    elif study == 'ADNI2':
        mask = data['ORIGPROT_ADNI2'] == 1
    elif study == 'ADNIGO':
        mask = data['ORIGPROT_ADNIGO'] == 1
    else:
        raise ValueError(
            "unknown study %r; expected 'ADNI1', 'ADNI2' or 'ADNIGO'" % (study,)
        )
    return data[mask]


def traintest_split(test_percentage, train_study=None, test_study=None):
    """Function that splits data into training and test set

    Reads ``LongitudinalDataAnalysis.csv`` from the current working
    directory. With no studies given, the rows are split at random; with
    both studies given, the training set is every row of ``train_study``
    and the test set is every row of ``test_study``.

    Parameters:
    ----------
    test_percentage: percentage of dataset to use as test set (passed to
        ``train_test_split`` as ``test_size``); only used for the random split
    train_study: 'ADNI1', 'ADNI2' or 'ADNIGO'; study used as training set
    test_study: 'ADNI1', 'ADNI2' or 'ADNIGO'; study used as test set

    Saves files:
    -----------
    LongitudinalDataAnalysis_train.csv
    LongitudinalDataAnalysis_test.csv
    train_indices.csv: indices of original data set used for training set
        (random split only)
    test_indices.csv: indices of original data set used for test set
        (random split only)

    Raises:
    ------
    ValueError: if only one of ``train_study``/``test_study`` is given, or
        if a study name is not recognised
    """
    # Reject half-specified requests up front so no output file is written
    # for an invalid call.
    if (train_study is None) != (test_study is None):
        raise ValueError(
            "train_study and test_study must be given together (or both omitted)"
        )

    data = pd.read_csv("LongitudinalDataAnalysis.csv")

    if train_study is None:
        # Split row positions rather than the values themselves: the frames
        # keep their column dtypes and the positions can be saved directly.
        indices = np.arange(len(data))
        indices_train, indices_test = train_test_split(
            indices, test_size=test_percentage)
        data_train = data.iloc[indices_train]
        data_test = data.iloc[indices_test]
    else:
        indices_train = indices_test = None
        # Both selections are resolved before anything is written, so an
        # unknown study name cannot leave a partial set of output files.
        data_train = _select_study(data, train_study)
        data_test = _select_study(data, test_study)

    # errors='ignore' drops whichever indicator columns are present instead
    # of giving up on the first missing one.
    data_train = data_train.drop(columns=_PROTOCOL_COLUMNS, errors='ignore')
    data_test = data_test.drop(columns=_PROTOCOL_COLUMNS, errors='ignore')

    ##### Save files #####
    data_train.to_csv("LongitudinalDataAnalysis_train.csv", index=False, na_rep="nan")
    data_test.to_csv("LongitudinalDataAnalysis_test.csv", index=False, na_rep="nan")

    if indices_train is not None:
        np.savetxt('train_indices.csv', indices_train, delimiter=',')
        np.savetxt('test_indices.csv', indices_test, delimiter=',')
    #########################
        


# Run the default random split, passing 0.33 as the test-set share.
traintest_split(test_percentage=0.33)