# Script to train ML algorithms (all except MARS) after feature selection and feature extraction, for STUDY1 and STUDY2

- MARS is trained separately in R using caret; see the file MARS_GA.R for reference.

In [None]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import pandas as pd
# import seaborn as ns
from sklearn.decomposition import PCA,FastICA,KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.ensemble import RandomForestRegressor
from genetic_selection import GeneticSelectionCV
import matplotlib.pyplot as plt
import math
import time
from RegscorePy.aic import aic # for calculating Akaike’s Information Criterion
from RegscorePy.bic import bic # for calculating Bayesian Information Criterion

In [None]:
def readDataFromCsv(file):
    """Load a CSV into a DataFrame, printing the source location first.

    Parameters
    ----------
    file : anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``.
    """
    # Local import keeps this helper self-contained.
    import pandas as pd

    print("Reading the file from: ", file)
    return pd.read_csv(file)

def loadDataset(data='study1',path='../datasets/files_generated/UX/study1_features_data.csv',target='PQ',app='Spell'):
    """Read a feature CSV and split it into predictors and one target column.

    Parameters
    ----------
    data : {'startData', 'study1', 'study2'}
        Selects which identifier/rating columns are removed from the
        predictors.
    path : str
        Location of the CSV file, passed to ``readDataFromCsv``.
    target : str
        Name of the column returned as the target.
    app : str
        Only rows whose ``'App'`` column equals this value are kept.

    Returns
    -------
    dict
        ``{'data': X, 'target': y}`` where ``X`` is the DataFrame of
        predictors and ``y`` is the ``target`` column.

    Raises
    ------
    ValueError
        If ``data`` is not one of the supported dataset names.
    """
    # Columns that are identifiers or ratings and must not be used as predictors.
    drop_columns_by_dataset = {
        'startData': ['PQ', 'ATT', 'HQI', 'HQS', 'HQ'],
        'study1': ['user_id','App','Cond','sessionNr','SEA', 'PQ', 'ATT', 'HQI', 'HQS', 'HQ'],
        'study2': ['sessionNr','App','user_id','Size','UserId', 'Session',
                   'PQ', 'ATT', 'HQI', 'HQS', 'HQ', 'IconSize'],
    }
    # Fail fast: previously an unknown name only surfaced as an
    # UnboundLocalError after the file had already been read.
    if data not in drop_columns_by_dataset:
        raise ValueError(
            "Unknown dataset %r; expected one of %s"
            % (data, sorted(drop_columns_by_dataset))
        )

    df = readDataFromCsv(path)
    df = df[df['App'] == app]
    print('The shape of the data  currently: ',df.shape)

    # Only study1 rows with missing values are discarded (the original author
    # notes that such rows should not have been present in the file).
    if data == 'study1' and df.isnull().values.any():
        df = df.dropna()
        print('The shape of the data after dropping null values: ',df.shape)

    X = df.drop(drop_columns_by_dataset[data], axis=1)
    y = df[target]
    if data == 'study2':
        print('inside study2 if')

    return {'data': X, 'target': y}


def optimalModelSelection(model,param_grid,X,y,method='grid'):
    """Tune hyperparameters with 10-fold cross-validation and report the best.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    param_grid : dict
        Parameter grid (``method='grid'``) or parameter distributions
        (``method='random'``).
    X, y : array-like
        Training data and target.
    method : {'grid', 'random'}
        ``'grid'`` runs ``GridSearchCV``; ``'random'`` runs
        ``RandomizedSearchCV`` with ``n_iter=100``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_, cv_results_,
        best_index_)`` taken from the fitted search object. The search is
        refit on R2, so ``best_score_`` is the mean validation R2.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'grid'`` nor ``'random'``.
    """
    # Validate before the heavy imports/fit; previously an unknown method
    # left ``search`` undefined and failed later with a NameError.
    if method not in ('grid', 'random'):
        raise ValueError("method must be 'grid' or 'random', got %r" % (method,))

    from sklearn.model_selection import KFold,GridSearchCV,RandomizedSearchCV

    K = 10
    kf = KFold(n_splits=K, shuffle=True,random_state=32)

    # MSE and MAE scorers are negated by scikit-learn, hence the sign flips below.
    scoring={'r2':'r2','mse':'neg_mean_squared_error','mae':'neg_mean_absolute_error'}
    if method == 'grid':
        search = GridSearchCV(model, param_grid, cv=kf,n_jobs=-1,scoring=scoring,
                              return_train_score=True,refit='r2')
    else:
        search = RandomizedSearchCV(estimator = model, param_distributions = param_grid,
                                    n_iter = 100, cv = kf, verbose=1,
                                    random_state=32, n_jobs = -1,scoring=scoring,
                                    return_train_score=True,refit='r2')
    search.fit(X,y)

    best = search.best_index_
    cv = search.cv_results_
    print('Best params: {}'.format(search.best_params_))
    print('RMSE: %0.2f'%(np.sqrt(-cv['mean_test_mse'][best])))
    print("R2(Validation): %0.2f (+/- %0.2f)" % (search.best_score_,cv['std_test_r2'][best]))
    print("R2(Train): %0.2f (+/- %0.2f)" % (cv['mean_train_r2'][best],
                                                 cv['std_train_r2'][best]))
    print("MAE(Validation): %0.2f (+/- %0.2f)" % (-cv['mean_test_mae'][best],
                                                  cv['std_test_mae'][best]))
    print("MAE(Train): %0.2f (+/- %0.2f)" % (-cv['mean_train_mae'][best],
                                                 cv['std_train_mae'][best]))

    return search.best_estimator_,search.best_params_, search.best_score_,cv,best

In [None]:
def genetic_selection(estimator,X,y):
    """Select features with a genetic algorithm and return the chosen columns.

    Parameters
    ----------
    estimator : estimator
        Model passed to ``GeneticSelectionCV``; candidate subsets are scored
        by 5-fold cross-validated R2.
    X : pandas.DataFrame or numpy.ndarray
        Candidate features.
    y : array-like
        Target values.

    Returns
    -------
    pandas.Index or tuple of numpy.ndarray
        Selected column names when ``X`` is a DataFrame; otherwise the tuple
        returned by ``np.where`` on the support mask (element 0 holds the
        selected column indices).
    """
    np.random.seed(10)

    # pyearth is only needed to recognise a MARS estimator. Make it optional so
    # the other models can run without it installed; if it cannot be imported
    # the estimator cannot be an Earth instance anyway.
    try:
        from pyearth import Earth
    except ImportError:
        Earth = None
    is_mars = Earth is not None and isinstance(estimator, Earth)

    if is_mars:
        population_size = 100
        generations = 40 # this may not lead to optimal solution and may suffer from premature convergence
    else:
        population_size = 10
        generations = 20

    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  scoring="r2",
                                  n_population=population_size,
                                  crossover_proba=0.5,
                                  mutation_proba=0.01,
                                  n_generations=generations,
                                  tournament_size=3,
                                  caching=True,
                                  n_jobs=-1)
    start = time.time()
    selector = selector.fit(X, y)
    print("---Finished in %s seconds ---" % (np.round(time.time() - start,3)))

    support = np.asarray(selector.support_, dtype=bool)
    print("Number of columns selected:", int(support.sum()))

    if isinstance(X, pd.DataFrame):
        return X.columns[support]  # column names
    return np.where(support)  # tuple holding the selected column indices

In [None]:
def perform_evaluation(path, estimator,param_grid,method='grid',transformation=False, data='study1'):
    """Run GA feature selection, tuning and evaluation for 'PQ' and 'ATT'.

    For each target the function:

    1. loads the data through ``loadDataset``;
    2. drops zero-variance columns and, when ``transformation`` is true,
       keeps only the columns flagged as normal in the normality-test CSV;
    3. drops float64 columns whose absolute correlation with an earlier
       column is >= 0.80;
    4. makes a 70/30 train/test split (``random_state=42``);
    5. selects features with ``genetic_selection`` on the training split;
    6. standardises the selected features (scaler fitted on training data
       only) and tunes ``estimator`` with ``optimalModelSelection``;
    7. computes train/validation/test metrics.

    Parameters
    ----------
    path : str
        CSV file handed to ``loadDataset``.
    estimator : estimator
        Model to select features for and tune.
    param_grid : dict
        Search space for ``optimalModelSelection``.
    method : {'grid', 'random'}
        Search strategy for ``optimalModelSelection``.
    transformation : bool
        Restrict predictors to the columns flagged as normally distributed.
    data : str
        Dataset name for ``loadDataset``; also selects the normality-test
        file (study1 file for ``'study1'``, study2 file otherwise).

    Returns
    -------
    (dict, dict)
        ``results[target]`` maps metric names to values;
        ``predictions[target]`` holds the ``'Original'``, ``'Predicted'`` and
        ``'Residuals'`` arrays for the training split.
    """
    normal_columns = []
    if transformation:
        # Rating columns must never be used as predictors.
        not_columns=['SEA','PQ','ATT', 'HQI', 'HQ','HQS']
        if data=='study1':
            normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
        else:
            normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
        normality = pd.read_csv(normality_test_features_path)
        flagged_normal = normality[normality['Normality']==True]['Features'].values
        normal_columns = [col for col in flagged_normal if col not in not_columns]
        # These are the columns that are kept (the old message claimed the opposite).
        print("Normally distributed columns that will be kept:",normal_columns)

    targets=['PQ', 'ATT']
    results={}
    predictions={}
    for target in targets:
        print('Performing prediction for target:',target)

        dataset=loadDataset(data=data,path=path,target=target)
        X=dataset.get('data')
        y=dataset.get('target')

        zero_variance = X.var() == 0.0
        print("columns thrown away because they have 0 variance:",X.loc[:, zero_variance].columns)
        X = X.loc[:, ~zero_variance]

        if transformation:
            # A flagged column may already be gone (dropped by loadDataset or
            # as zero-variance); indexing with it would raise a KeyError.
            missing = [col for col in normal_columns if col not in X.columns]
            if missing:
                print("Normal columns not present in the data (ignored):",missing)
            X = X[[col for col in normal_columns if col in X.columns]]

        # Absolute pairwise correlations between the float64 features
        corr_matrix = X.select_dtypes(['float64']).corr().abs()

        # Keep only the strict upper triangle so each pair is examined once.
        # (np.bool was removed in NumPy 1.24; the builtin bool is equivalent.)
        upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Columns correlated at 0.80 or more with an earlier column
        to_drop_cols = [column for column in upper_triangle.columns
                        if (upper_triangle[column] >= 0.80).any()]
        X = X.drop(columns=to_drop_cols)

        print("Shape of the data after removing 0 variance highly correlated data:",X.shape)

        # split the data into train test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
        print("Shape of training data:",X_train.shape)
        print("Shape of test data:",X_test.shape)

        # genetic algorithm on the training split only
        selected_features= genetic_selection(estimator,X_train,y_train)

        # Scale with statistics learned from the training split only; the test
        # split must be transformed, not re-fitted, to avoid leakage.
        scaler=StandardScaler()
        X_train=scaler.fit_transform(X_train[selected_features])
        X_test=scaler.transform(X_test[selected_features])

        # tune hyperparameters on the optimal subset
        best_estimator_,best_params_, best_score_,cv_results_,best_index_= optimalModelSelection(
            estimator,param_grid,X_train,y_train,method=method)

        # The "(Validation)" AIC/BIC/MAPE values are computed from the model
        # refitted on the whole training split, not from the CV folds.
        n_features = X_train.shape[1]
        y_pred_train  = best_estimator_.fit(X_train,y_train).predict(X_train)
        aic_score_val = aic(y_train,y_pred_train,n_features)
        bic_score_val = bic(y_train,y_pred_train,n_features)
        mape_score_val = np.mean(np.abs((y_train - y_pred_train) / y_train)) * 100

        # predict on unseen data
        y_pred=best_estimator_.predict(X_test)
        score=r2_score(y_test,y_pred)
        rmse = np.sqrt(mean_squared_error(y_test,y_pred))
        mae = mean_absolute_error(y_test,y_pred)
        aic_score_test = aic(y_test,y_pred,X_test.shape[1])
        bic_score_test = bic(y_test,y_pred,X_test.shape[1])
        mape_score_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

        print('Peforming predictions on unseen data')
        print('Performance(R2):%0.2f | RMSE:%0.2f | MAE:%0.2f '%(score,rmse,mae))

        # TODO: Store the residuals in the table for the model
        # NOTE(review): this stores the training-split predictions, whereas
        # perform_evaluation_with_pca stores the test-split ones - confirm
        # which is intended.
        residuals = np.array(y_train)- y_pred_train
        predictions[target]={'Original':np.array(y_train),'Predicted':y_pred_train,'Residuals':residuals}

        # Key order is preserved because it determines the CSV column order.
        n_train = len(y_train)
        results[target]={
            'R2(Validation)': best_score_,
            'Adjusted R2(Validation)': 1-(1-best_score_)*(n_train-1)/(n_train-n_features-1),
            'StandardError(Validation)': cv_results_['std_test_r2'][best_index_],
            'RMSE(Validation)': np.sqrt(np.abs(cv_results_['mean_test_mse'][best_index_])),
            'R2(Train)': cv_results_['mean_train_r2'][best_index_],
            'RMSE(Train)': np.sqrt(np.abs(cv_results_['mean_train_mse'][best_index_])),
            'R2(Test)': score,
            'RMSE(Test)': rmse,
            '#Features': len(selected_features),
            'Features': selected_features.values,
            'MAE(Validation)': -cv_results_['mean_test_mae'][best_index_],
            'MAE(Train)': -cv_results_['mean_train_mae'][best_index_],
            'MAE(Test)': mae,
            'AIC(Validation)': aic_score_val,
            'AIC(Test)': aic_score_test,
            'BIC(Validation)': bic_score_val,
            'BIC(Test)': bic_score_test,
            'MAPE(Validation)': mape_score_val,
            'MAPE(Test)': mape_score_test,
        }

    return results,predictions

In [None]:
def perform_evaluation_with_pca(path, estimator,param_grid,n_components,method='grid',transformation=False, data='study1'):
    """Run PCA, GA component selection, tuning and evaluation for 'PQ' and 'ATT'.

    For each target the function:

    1. loads the data through ``loadDataset``;
    2. drops zero-variance columns and, when ``transformation`` is true,
       keeps only the columns flagged as normal in the normality-test CSV;
    3. drops float64 columns whose absolute correlation with an earlier
       column is >= 0.80;
    4. makes a 70/30 train/test split (``random_state=42``);
    5. standardises the features and projects them with PCA, both fitted on
       the training split only;
    6. selects principal components with ``genetic_selection`` and tunes
       ``estimator`` on them with ``optimalModelSelection``;
    7. computes train/validation/test metrics.

    Parameters
    ----------
    path : str
        CSV file handed to ``loadDataset``.
    estimator : estimator
        Model to select components for and tune.
    param_grid : dict
        Search space for ``optimalModelSelection``.
    n_components : int or float
        Passed unchanged to ``PCA(n_components=...)``.
    method : {'grid', 'random'}
        Search strategy for ``optimalModelSelection``.
    transformation : bool
        Restrict predictors to the columns flagged as normally distributed.
    data : str
        Dataset name for ``loadDataset``; also selects the normality-test
        file (study1 file for ``'study1'``, study2 file otherwise).

    Returns
    -------
    (dict, dict)
        ``results[target]`` maps metric names to values;
        ``predictions[target]`` holds the ``'Original'``, ``'Predicted'`` and
        ``'Residuals'`` arrays for the test split.
    """
    normal_columns = []
    if transformation:
        # Rating columns must never be used as predictors.
        not_columns=['SEA','PQ','ATT', 'HQI', 'HQ','HQS']
        if data=='study1':
            normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
        else:
            normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
        normality = pd.read_csv(normality_test_features_path)
        flagged_normal = normality[normality['Normality']==True]['Features'].values
        normal_columns = [col for col in flagged_normal if col not in not_columns]
        # These are the columns that are kept (the old message claimed the opposite).
        print("Normally distributed columns that will be kept:",normal_columns)

    targets=['PQ', 'ATT']
    results={}
    predictions={}
    for target in targets:
        print('Performing prediction for target:',target)

        dataset=loadDataset(data=data,path=path,target=target)
        X=dataset.get('data')
        y=dataset.get('target')

        zero_variance = X.var() == 0.0
        print("columns thrown away because they have 0 variance:",X.loc[:, zero_variance].columns)
        X = X.loc[:, ~zero_variance]

        if transformation:
            # A flagged column may already be gone (dropped by loadDataset or
            # as zero-variance); indexing with it would raise a KeyError.
            missing = [col for col in normal_columns if col not in X.columns]
            if missing:
                print("Normal columns not present in the data (ignored):",missing)
            X = X[[col for col in normal_columns if col in X.columns]]

        # Absolute pairwise correlations between the float64 features
        corr_matrix = X.select_dtypes(['float64']).corr().abs()

        # Keep only the strict upper triangle so each pair is examined once.
        # (np.bool was removed in NumPy 1.24; the builtin bool is equivalent.)
        upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Columns correlated at 0.80 or more with an earlier column
        to_drop_cols = [column for column in upper_triangle.columns
                        if (upper_triangle[column] >= 0.80).any()]
        X = X.drop(columns=to_drop_cols)

        print("Shape of the data after removing 0 variance highly correlated data:",X.shape)

        # split the data into train test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
        print("Shape of training data:",X_train.shape)
        print("Shape of test data:",X_test.shape)
        print("Shape of training target:",y_train.shape)

        # Scale with statistics learned from the training split only; the test
        # split must be transformed, not re-fitted, to avoid leakage.
        scaler=StandardScaler()
        X_train=scaler.fit_transform(X_train)
        X_test=scaler.transform(X_test)

        # perform PCA (fitted on the training split, applied to both splits)
        print('inside ga with pca function')
        print(type(n_components))
        print(n_components)
        pca= PCA(n_components=n_components)
        pca.fit(X_train)
        X_train = pca.transform(X_train)
        X_test = pca.transform(X_test)
        print('number of principal components:',pca.n_components_)

        # genetic_selection returns np.where(...) for arrays; element 0 holds
        # the indices of the selected principal components.
        selected_features= genetic_selection(estimator,X_train,y_train)
        selected_idx = selected_features[0]
        X_train_sel = X_train[:,selected_idx]
        X_test_sel = X_test[:,selected_idx]
        n_selected = X_train_sel.shape[1]

        # tune hyperparameters on the optimal subset
        best_estimator_,best_params_, best_score_,cv_results_,best_index_= optimalModelSelection(
            estimator,param_grid,X_train_sel,y_train,method=method)

        # The "(Validation)" AIC/BIC/MAPE values are computed from the model
        # refitted on the whole training split, not from the CV folds.
        y_pred_train  = best_estimator_.fit(X_train_sel,y_train).predict(X_train_sel)
        aic_score_val = aic(y_train,y_pred_train,n_selected)
        bic_score_val = bic(y_train,y_pred_train,n_selected)
        mape_score_val = np.mean(np.abs((y_train - y_pred_train) / y_train)) * 100

        # predict on unseen data
        y_pred=best_estimator_.predict(X_test_sel)
        score=r2_score(y_test,y_pred)
        rmse = np.sqrt(mean_squared_error(y_test,y_pred))
        mae = mean_absolute_error(y_test,y_pred)

        aic_score_test = aic(y_test,y_pred,X_test_sel.shape[1])
        bic_score_test = bic(y_test,y_pred,X_test_sel.shape[1])
        mape_score_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

        print('Peforming predictions on unseen data')
        print('Performance(R2):%0.2f | RMSE:%0.2f | MAE:%0.2f | MAPE:%0.2f'%(score,rmse, mae,mape_score_test))

        # TODO: Store the residuals in the table
        residuals = np.array(y_test)- y_pred
        print(len(residuals))
        predictions[target]={'Original':np.array(y_test),'Predicted':y_pred,'Residuals':residuals}

        # Key order is preserved because it determines the CSV column order.
        n_train = len(y_train)
        results[target]={
            'R2(Validation)': best_score_,
            'Adjusted R2(Validation)': 1-(1-best_score_)*(n_train-1)/(n_train-n_selected-1),
            'StandardError(Validation)': cv_results_['std_test_r2'][best_index_],
            'RMSE(Validation)': np.sqrt(np.abs(cv_results_['mean_test_mse'][best_index_])),
            'R2(Train)': cv_results_['mean_train_r2'][best_index_],
            'RMSE(Train)': np.sqrt(np.abs(cv_results_['mean_train_mse'][best_index_])),
            'R2(Test)': score,
            'RMSE(Test)': rmse,
            '#Features': len(selected_idx),
            'Features': selected_idx,
            'MAE(Validation)': -cv_results_['mean_test_mae'][best_index_],
            'MAE(Train)': -cv_results_['mean_train_mae'][best_index_],
            'MAE(Test)': mae,
            'AIC(Validation)': aic_score_val,
            'AIC(Test)': aic_score_test,
            'BIC(Validation)': bic_score_val,
            'BIC(Test)': bic_score_test,
            'MAPE(Validation)': mape_score_val,
            'MAPE(Test)': mape_score_test,
        }

    return results, predictions

In [None]:
#create models
def runAllModels(path, filename,n_components=None,transformation=False,data='study1',perform_pca=False):
    """Train and evaluate every model, then write the result tables.

    Random Forest, SVR, SGD linear regression, Lasso and Elastic Net are each
    run through ``perform_evaluation`` (or ``perform_evaluation_with_pca``
    when ``perform_pca`` is true). MARS is not trained here.

    Parameters
    ----------
    path : str
        CSV file with the features, forwarded to the evaluation function.
    filename : str
        Destination of the metrics CSV. Its parent directory is created if
        it does not exist.
    n_components : int or float, optional
        PCA setting; only used when ``perform_pca`` is true.
    transformation : bool
        Forwarded to the evaluation function.
    data : str
        Dataset name, forwarded to the evaluation function and used as the
        prefix of the predictions workbook name.
    perform_pca : bool
        Selects the PCA variant of the pipeline.

    Side effects
    ------------
    Writes the metrics CSV to ``filename`` and a predictions workbook
    ``<data>_feature_selection_alltargets_mahalanobis[...]_predictions.xlsx``
    (one sheet per algorithm) to the current working directory.
    """
    import os
    from sklearn.svm import SVR
    from sklearn.linear_model import SGDRegressor, Lasso, ElasticNet

    def evaluate(estimator, param_grid, method='grid'):
        # Dispatch to the PCA or plain pipeline with the shared settings.
        if perform_pca:
            return perform_evaluation_with_pca(path,estimator,param_grid,n_components,
                                               method=method,transformation=transformation,data=data)
        return perform_evaluation(path,estimator,param_grid,
                                  method=method,transformation=transformation,data=data)

    def resultsTable(results, algorithm):
        # One row per target, labelled with the target and algorithm names.
        table = pd.DataFrame(results).T
        table['Target'] = table.index
        table = table.reset_index(drop=True)
        table['Algorithm'] = algorithm
        return table

    def createPredictionsTable(predictions):
        # Side-by-side Original/Prediction/Residuals columns for PQ and ATT.
        # The evaluation functions store the key 'Predicted'; the rename used
        # to look for 'Prediction', so both targets ended up sharing one
        # column name.
        frames = []
        for target in ('PQ', 'ATT'):
            frame = pd.DataFrame(predictions.get(target))
            frame = frame.rename(index=str, columns={
                'Original': 'Original_' + target,
                'Predicted': 'Prediction_' + target,
                'Residuals': 'Residuals_' + target,
            })
            frames.append(frame)
        return pd.concat(frames, axis=1)

    # random forest
    print('********Applying Random forest****************')
    rf = RandomForestRegressor(random_state=101)
    param_grid={'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
                'max_depth': [int(x) for x in np.linspace(1, 5, num = 5)],
                'min_samples_split': [int(x) for x in np.linspace(10, 100, num = 10)],
                'min_samples_leaf': [int(x) for x in np.linspace(10, 60, num = 20)],
                # 1.0 (all features) is what 'auto' meant for regressors;
                # 'auto' itself was removed in scikit-learn 1.3.
                'max_features': [1.0,'sqrt']
               }
    results_rf,predictions_rf = evaluate(rf,param_grid,method='random')

    np.random.seed(15)
    print('********Applying Support vector machine****************')
    param_grid={'C':np.logspace(-1,1,10),
                'epsilon':np.logspace(-1,0,10),
                'gamma':np.logspace(-3, -2, 10)}
    svr = SVR(kernel = 'rbf')
    results_svm,predictions_svm = evaluate(svr,param_grid,method='random')

    print('********Applying Linear regression with stochastic gradient descent****************')
    param_grid={
                'max_iter':[50,100],
                'penalty':[None],
                'eta0':[0.01,0.1,0.5]
               }
    sgd_reg = SGDRegressor(random_state=32)
    results_sgd,predictions_sgd = evaluate(sgd_reg,param_grid)

    ## lasso regression
    print('********Applying Lasso Regression****************')
    param_grid={'alpha':np.logspace(0, 1, 100)}
    lasso = Lasso(random_state=32)
    results_lasso,predictions_lasso = evaluate(lasso,param_grid)

    ## elastic net
    print('********Applying Elastic Net Regression****************')
    param_grid={'alpha':np.logspace(0, 2 , 50)}
    enet = ElasticNet(random_state=32)
    results_enet,predictions_enet = evaluate(enet,param_grid)

    # Retained from the removed MARS section so the global NumPy RNG state on
    # return is the same as before.
    np.random.seed(32)

    # Metrics table: one row per (algorithm, target).
    metrics = pd.concat([
        resultsTable(results_rf,'Random Forest'),
        resultsTable(results_svm,'SVM'),
        resultsTable(results_sgd,'Linear regression'),
        resultsTable(results_lasso,'Lasso Regression'),
        resultsTable(results_enet,'Elastic Net'),
    ])
    # Create the output directory first so a long run is not lost at the end.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    metrics.to_csv(filename,index=False)
    print("File saved")

    df_sgd=createPredictionsTable(predictions_sgd)
    df_lasso=createPredictionsTable(predictions_lasso)
    df_enet=createPredictionsTable(predictions_enet)
    df_svm=createPredictionsTable(predictions_svm)
    df_rf=createPredictionsTable(predictions_rf)

    # Workbook name encodes the dataset, transformation flag and PCA setting.
    suffix = '_feature_selection_alltargets_mahalanobis'
    if transformation:
        suffix += '_transformed'
    if perform_pca:
        suffix += '_PCA_'+str(n_components)
    predictions_filename = str(data)+suffix+'_predictions.xlsx'

    with pd.ExcelWriter(predictions_filename) as writer:
        df_sgd.to_excel(writer, sheet_name='Linear Regression')
        df_lasso.to_excel(writer, sheet_name='Lasso Regression')
        df_enet.to_excel(writer, sheet_name='Elastic Net')
        df_svm.to_excel(writer, sheet_name='SVM')
        df_rf.to_excel(writer, sheet_name='Random Forest')

    print('file saved sucessfully')

Study1 original
---

In [None]:
# Study 1, original feature distributions, no PCA.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_alltargets.csv'
    runAllModels(path=path, filename=filename)

In [None]:
# Study 1, original feature distributions, PCA keeping 95% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_alltargets_PCA_0.95PC.csv'
    n_components = 0.95
    runAllModels(path=path, filename=filename, n_components=n_components, perform_pca=True)

In [None]:
# Study 1, original feature distributions, PCA keeping 80% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_alltargets_PCA_0.80PC.csv'
    n_components = 0.80
    runAllModels(path=path, filename=filename, n_components=n_components, perform_pca=True)

In [None]:
# Study 1, original feature distributions, PCA with 3 principal components.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_alltargets_PCA_3PC.csv'
    n_components = 3
    runAllModels(path=path, filename=filename, n_components=n_components, perform_pca=True)

Study1 Transformation
---

In [None]:
# Study 1, transformed feature distributions, no PCA.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_transformed_alltargets.csv'
    runAllModels(path=path, filename=filename, transformation=True)

In [None]:
# Study 1, transformed feature distributions, PCA keeping 95% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_transformed_alltargets_PCA_0.95PC.csv'
    n_components = 0.95
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=True, perform_pca=True)

In [None]:
# Study 1, transformed feature distributions, PCA keeping 80% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_transformed_alltargets_PCA_0.80PC.csv'
    n_components = 0.80
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=True, perform_pca=True)

In [None]:
# Study 1, transformed feature distributions, PCA with 3 principal components.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study1_feature_selection_mahalanobis_transformed_alltargets_PCA_3PC.csv'
    n_components = 3
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=True, perform_pca=True)

Study2 original
---

In [None]:
# Study 2, original feature distributions, no PCA.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_alltargets.csv'
    runAllModels(path=path, filename=filename, transformation=False, data='study2')

In [None]:
# Study 2, original feature distributions, PCA keeping 95% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_alltargets_PCA_0.95PC.csv'
    n_components = 0.95
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=False, data='study2', perform_pca=True)

In [None]:
# Study 2, original feature distributions, PCA keeping 80% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_alltargets_PCA_0.80PC.csv'
    n_components = 0.80
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=False, data='study2', perform_pca=True)

In [None]:
# Study 2, original feature distributions, PCA with 3 principal components.
# NOTE(review): this cell reads from a relative '../datasets' path while the
# other study2 cells use '/mnt/vdb1/datasets' - confirm this is intentional.
if __name__ == '__main__':
    path = '../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_alltargets_PCA_3PC.csv'
    n_components = 3
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=False, data='study2', perform_pca=True)

Study2 Transformed
---

In [None]:
# Study 2, transformed feature distributions, no PCA.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_transformed_alltargets.csv'
    runAllModels(path=path, filename=filename, transformation=True, data='study2')

In [None]:
# Study 2, transformed feature distributions, PCA keeping 95% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_transformed_alltargets_PCA_0.95PC.csv'
    n_components = 0.95
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=True, data='study2', perform_pca=True)

In [None]:
# Study 2, transformed feature distributions, PCA keeping 80% explained variance.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_transformed_alltargets_PCA_0.80PC.csv'
    n_components = 0.80
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=True, data='study2', perform_pca=True)

In [None]:
# Study 2, transformed feature distributions, PCA with 3 principal components.
if __name__ == '__main__':
    path = '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv'
    filename = 'Tables/study2_feature_selection_mahalanobis_transformed_alltargets_PCA_3PC.csv'
    n_components = 3
    runAllModels(path=path, filename=filename, n_components=n_components,
                 transformation=True, data='study2', perform_pca=True)