# Script to train ML algorithms after feature selection and feature extraction  except MARS for STUDY1 and then testing on STUDY2

- MARS is performed using R caret. Check the file MARS_GA.R for reference

In [1]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import pandas as pd
# import seaborn as ns
from sklearn.decomposition import PCA,FastICA,KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.ensemble import RandomForestRegressor
from genetic_selection import GeneticSelectionCV
import matplotlib.pyplot as plt
import math
import time
from RegscorePy.aic import aic # for calculating Akaike’s Information Criterion
from RegscorePy.bic import bic # for calculating Bayesian Information Criterion



In [2]:
def readDataFromCsv(file):
    import pandas as pd
#     logging.info("Reading the file from:{}".format(file))
    print ("Reading the file from: ",file)
    df = pd.read_csv(file)
    return df

def loadDataset(data='study1',path='../datasets/files_generated/UX/study1_features_data.csv',target='PQ',app='Spell'):    
#     if data== 'study1':
    df = readDataFromCsv(path)
#     if data=='study2':
#         df = readDataFromCsv(path)
    df=df[df['App']==app]   
    print('The shape of the data  currently: ',df.shape)
    
    ## This should not have been there
    if(df.isnull().values.any()==True and data=='study1'):
        df = df.dropna()
        print('The shape of the data after dropping null values: ',df.shape)
    if data == 'startData':
#         df_join= pd.merge(df_stat_summ_withoutna, df_ux, on=['user_id','App','Cond','sessionNr'])
        X,y= df.drop(['PQ', 'ATT', 'HQI', 'HQS', 'HQ'],axis=1),df[target]
    elif data=='study1':
#         df_join= pd.merge(df_stat_summ_withoutna, df_ux, on=['user_id','App','Cond','sessionNr'])
        X,y= df.drop(['user_id','App','Cond','sessionNr','SEA', 'PQ', 'ATT', 'HQI', 'HQS', 'HQ'],axis=1),df[target]
    elif data=='study2':
#         df_join= pd.merge(df_stat_summ_withoutna, df_ux, how='inner',left_on=['user_id','Cond','sessionNr'],
#                            right_on=['UserId','IconSize','Session'])
        X,y=df.drop(['sessionNr','App','user_id','Size','UserId', 'Session', 
                     'PQ', 'ATT', 'HQI', 'HQS', 'HQ', 'IconSize'],axis=1),df[target]
        print('inside study2 if')
#         print(X.shape)
        
#     print('shape after join: ',df_join.shape)
    df_result={'data':X,'target':y}
    return df_result


def optimalModelSelection(model,param_grid,X,y,method='grid'):
    '''Tune the hyperparameters to find the best score personality data'''
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler,RobustScaler
    from sklearn.pipeline import make_pipeline,Pipeline
    from sklearn.model_selection import KFold,GridSearchCV,RandomizedSearchCV
    
    K = 10
    kf = KFold(n_splits=K, shuffle=True,random_state=32)
    
    scoring={'r2':'r2','mse':'neg_mean_squared_error','mae':'neg_mean_absolute_error'}
#     pca = PCA(random_state=32,svd_solver='full',n_components=0.95)
#     pipe = Pipeline([('scl', StandardScaler()),#('pca',pca),
#                     ('clf', model)])
    if(method=='grid'):
        search = GridSearchCV(model, param_grid, cv=kf,n_jobs=-1,scoring=scoring,return_train_score=True,refit='r2')
        search.fit(X,y)
    if(method=='random'):
        search=RandomizedSearchCV(estimator = model, param_distributions = param_grid, 
                               n_iter = 100, cv = kf, verbose=1, 
                               random_state=32, n_jobs = -1,scoring=scoring,return_train_score=True,refit='r2')
        search.fit(X,y)
    
#     adjusted_R2=1 - (1-search.best_score_)*(len(y)-1)/(len(y)-X.shape[1]-1)
#     print('number of principal components',search.best_estimator.)
    print('Best params: {}'.format(search.best_params_))
#     print('Best params: {}'.format(search.best_params_))
#     print('Best score after fitting the estimator with best params:{}'.format(search.best_score_))
#     print('Best score after fitting the estimator with best params:{}'.format(search.best_score_))
    print('RMSE: %0.2f'%(np.sqrt(-search.cv_results_['mean_test_mse'][search.best_index_])))
    print("R2(Validation): %0.2f (+/- %0.2f)" % (search.best_score_,search.cv_results_['std_test_r2'][search.best_index_]))
    print("R2(Train): %0.2f (+/- %0.2f)" % (search.cv_results_['mean_train_r2'][search.best_index_],
                                                 search.cv_results_['std_train_r2'][search.best_index_]))
    print("MAE(Validation): %0.2f (+/- %0.2f)" % (-search.cv_results_['mean_test_mae'][search.best_index_],
                                                  search.cv_results_['std_test_mae'][search.best_index_]))
    print("MAE(Train): %0.2f (+/- %0.2f)" % (-search.cv_results_['mean_train_mae'][search.best_index_],
                                                 search.cv_results_['std_train_mae'][search.best_index_]))
    
    #logging.info('RMSE: %0.2f'%(np.sqrt(-search.cv_results_['mean_test_mse'][search.best_index_])))
    #logging.info("R2: %0.2f (+/- %0.2f)" % (search.best_score_,search.cv_results_['std_test_r2'][search.best_index_]))
    
    return search.best_estimator_,search.best_params_, search.best_score_,search.cv_results_,search.best_index_

In [3]:
def genetic_selection(estimator,X,y):
    '''
    Returns the selected columns after GA
    '''
    np.random.seed(100)
    from pyearth import Earth
    # calculate the optimal population size
    if (isinstance(estimator,Earth)==False):
#         population_size=math.ceil((267.43*np.log(X.shape[0]))-293.21) # reference from paper
        population_size=10
        generations=20
    else:
        population_size=100 
        generations=40 # this may not lead to optimal solution and may suffer from premature convergence
    selector = GeneticSelectionCV(estimator,
                                          cv=5,
    #                                       verbose=1,
                                          scoring="r2",
#                                           max_features=20,
                                          n_population=population_size,
    #                                       crossover_proba=0.5,
    #                                       mutation_proba=0.2,
                                          crossover_proba=0.5,
                                          mutation_proba=0.01,
                                          n_generations=generations,
#                                           crossover_independent_proba=0.5,
#                                           mutation_independent_proba=0.05,
                                          tournament_size=3,
                                          caching=True,
                                          n_jobs=-1)
    start = time.time()
    selector = selector.fit(X, y)
    print("---Finished in %s seconds ---" % (np.round(time.time() - start,3)))
    
#     print("Number of columns selected:",len(X.columns[selector.support_]))
    print("Number of columns selected:",len(np.where(selector.support_==True)[0]))
    if isinstance(X, pd.DataFrame):
        columns=X.columns[selector.support_] # returns the column names
    else:
        columns=np.where(selector.support_==True) # return the indices of the numpy array
        
    return columns

In [26]:
def perform_evaluation(path, estimator,param_grid,method='grid',transformation=False, data='study1'):
    if transformation==True:
        # consider only the normally distributed columns
        not_columns=['SEA','PQ','ATT', 'HQI', 'HQ','HQS']
#         mahalanobis = pd.read_csv('Tables/Study1/study1_univariate_normality_test_features_mahalanobis_transformed.csv')
        if data=='study1':
#              normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
             normality_test_features_path ='../../Tables/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
        else:
#             normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
            normality_test_features_path ='../../Tables/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
        mahalanobis = pd.read_csv(normality_test_features_path)
        mahalanobis = list(mahalanobis[mahalanobis['Normality']==True]['Features'].values)
        for col in not_columns:
            if(col in mahalanobis):
                mahalanobis.remove(col)
        print("Columns that should not be selected are:",mahalanobis)

    targets=['PQ', 'ATT']
    results={}
    predictions={}
    for target in targets:
        target_result={}
        print('Performing prediction for target:',target)
        
        personality=loadDataset(data=data,path=path[0],target=target)
        X=personality.get('data')
        zero_var_columns = X.loc[:, X.var() == 0.0].columns
        print("columns thrown away because they have 0 variance:",zero_var_columns)
        X = X.loc[:, X.var() != 0.0]
#         print("Shape of the data after removing 0 variance columns:",X.shape)
        y=personality.get('target')
        
        if transformation==True:
            # apply only the normal columns
            X=X[mahalanobis]
#             print("Shape of the data after selected transformed columns:",X.shape)
            
        # Create correlation matrix
        corr_matrix = X.select_dtypes(['float64']).corr().abs()

        # Select upper triangle of correlation matrix
        upper_traingle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        # Find index of feature columns with correlation greater than 0.95
        to_drop_cols = [column for column in upper_traingle.columns if any(upper_traingle[column] >= 0.80)]
            
        # Drop features 
        X = X.drop(X[to_drop_cols], axis=1)
            
        print("Shape of the data after removing 0 variance highly correlated data:",X.shape)
        
        # split the data into train test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
        print("Shape of training data:",X_train.shape)
#         print("Shape of test data:",X_test.shape)
        
        test = loadDataset(data='study2',path=path[1],target=target)
        X_test = test.get('data')
        y_test = test.get('target')
        
        # Drop features to make both the datasets schema equal
#         X_test = X_test.drop(X_test[zero_var_columns], axis=1)
#         if transformation==True:
#             X_test=X_test[mahalanobis]
#         X_test = X_test.drop(X_test[to_drop_cols], axis=1)
        X_test=X_test[X_train.columns]
        
        print("Shape of the data after removing 0 variance highly correlated data:",X_test.shape)
        # perform genetic algorithm
        #if(isinstance(estimator,Earth)==False):
        
        # features selected
        selected_features= genetic_selection(estimator,X_train,y_train)
        #else:
            #selected_features_index= genetic_selection(estimator,np.array(X_train),np.array(y_train))
            #selected_features= X.columns[selected_features_index]
        
        
        # scale the data
        scaler=StandardScaler()
        X_train=scaler.fit_transform(X_train[selected_features])
        X_test= scaler.fit_transform(X_test[selected_features])
        
            
        # tune hyperparameters on the optimal subset
        best_estimator_,best_params_, best_score_,cv_results_,best_index_= optimalModelSelection(estimator,param_grid,X_train,y_train,method=method)
        
        # calculate the AIC
        y_pred_train  = best_estimator_.fit(X_train,y_train).predict(X_train)
        aic_score_val = aic(y_train,y_pred_train,X_train.shape[1])
        # calculate Bayesian Information Criterion
        bic_score_val = bic(y_train,y_pred_train,X_train.shape[1])
        # calculate MAPE
        mape_score_val = np.mean(np.abs((y_train - y_pred_train) / y_train)) * 100
        
        # predict on unseen data
        y_pred=best_estimator_.predict(X_test)
        score=r2_score(y_test,y_pred)
        rmse = np.sqrt(mean_squared_error(y_test,y_pred))
        mae = mean_absolute_error(y_test,y_pred)
        aic_score_test = aic(y_test,y_pred,X_test.shape[1])
        bic_score_test = bic(y_test,y_pred,X_test.shape[1])
        mape_score_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        print('Peforming predictions on unseen data')
        print('Performance(R2):%0.2f | RMSE:%0.2f | MAE:%0.2f '%(score,rmse,mae))
        
        '''TODO: Store the residuals in the table for the model'''
        residuals_test = np.array(y_test)- y_pred
#         print(len(residuals))
        
        #store it in a seperate table
        residuals = np.array(y_train)- y_pred_train
#         prediction= {'Original':np.array(y_train),'Predicted':y_pred_train,'Residuals':residuals}
#         predictions[target]=prediction
        
        prediction= {'Original':np.array(y_test),'Predicted':y_pred,'Residuals':residuals_test}
        predictions[target]=prediction
            
        # append the results
        target_result['R2(Validation)']=best_score_
        target_result['Adjusted R2(Validation)']=1-(1-best_score_)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
        target_result['StandardError(Validation)']=cv_results_['std_test_r2'][best_index_]
        target_result['RMSE(Validation)']=np.sqrt(np.abs(cv_results_['mean_test_mse'][best_index_]))
        target_result['R2(Train)']=cv_results_['mean_train_r2'][best_index_]
        target_result['RMSE(Train)']=np.sqrt(np.abs(cv_results_['mean_train_mse'][best_index_]))
        target_result['R2(Test)']=score
        target_result['RMSE(Test)']=rmse
        target_result['#Features']=len(selected_features)
        target_result['Features']=selected_features.values
        target_result['MAE(Validation)']= -cv_results_['mean_test_mae'][best_index_]
        target_result['MAE(Train)']= -cv_results_['mean_train_mae'][best_index_]
        target_result['MAE(Test)']=mae
        target_result['AIC(Validation)']=aic_score_val
        target_result['AIC(Test)']=aic_score_test
        target_result['BIC(Validation)']=bic_score_val
        target_result['BIC(Test)']=bic_score_test
        target_result['MAPE(Validation)']=mape_score_val
        target_result['MAPE(Test)']=mape_score_test
        
            
        # store the result with respect to target
        results[target]=target_result
    
    return results,predictions

In [25]:
def perform_evaluation_with_pca(path, estimator,param_grid,n_components,method='grid',transformation=False, data='study1'):
    if transformation==True:
        # consider only the normally distributed columns
        not_columns=['SEA','PQ','ATT', 'HQI', 'HQ','HQS']
#         mahalanobis = pd.read_csv('Tables/Study1/study1_univariate_normality_test_features_mahalanobis_transformed.csv')
        if data=='study1':
#              normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
            normality_test_features_path ='../../Tables/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
        else:
#             normality_test_features_path ='/mnt/vdb1/UX-Ratings/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
            normality_test_features_path ='../../Tables/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
            
        mahalanobis = pd.read_csv(normality_test_features_path)
        mahalanobis = list(mahalanobis[mahalanobis['Normality']==True]['Features'].values)
        for col in not_columns:
            if(col in mahalanobis):
                mahalanobis.remove(col)
        print("Columns that should not be selected are:",mahalanobis)

    targets=['PQ', 'ATT']
    results={}
    predictions={}
    for target in targets:
        target_result={}
        print('Performing prediction for target:',target)
        
        personality=loadDataset(data=data,path=path[0],target=target)
        X=personality.get('data')
        zero_var_columns = X.loc[:, X.var() == 0.0].columns
        print("columns thrown away because they have 0 variance:",zero_var_columns)
        X = X.loc[:, X.var() != 0.0]
#         print("Shape of the data after removing 0 variance columns:",X.shape)
        y=personality.get('target')
        
        if transformation==True:
            # apply only the normal columns
            X=X[mahalanobis]
#             print("Shape of the data after selected transformed columns:",X.shape)
        
        # Create correlation matrix
        corr_matrix = X.select_dtypes(['float64']).corr().abs()

        # Select upper triangle of correlation matrix
        upper_traingle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        # Find index of feature columns with correlation greater than 0.95
        to_drop_cols = [column for column in upper_traingle.columns if any(upper_traingle[column] >= 0.80)]
            
        # Drop features 
        X = X.drop(X[to_drop_cols], axis=1)
            
        print("Shape of the data after removing 0 variance highly correlated data:",X.shape)
        
        # split the data into train test set
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
        print("Shape of training data:",X_train.shape)
        print("Shape of test data:",X_test.shape)
        print("Shape of test data:",y_train.shape)
        
        test = loadDataset(data='study2',path=path[1],target=target)
        X_test = test.get('data')
        y_test = test.get('target')
        
#         Drop features to make both the datasets schema equal
#         X_test = X_test.drop(X_test[zero_var_columns], axis=1)
#         if transformation==True:
#             X_test=X_test[mahalanobis]
#         X_test = X_test.drop(X_test[to_drop_cols], axis=1)
        X_test = X_test[X_train.columns]
        
        print("Shape of the data after removing 0 variance highly correlated data:",X_test.shape)
        
        # scale the data
        scaler=StandardScaler()
        X_train=scaler.fit_transform(X_train)
        X_test= scaler.fit_transform(X_test)
        
        # perform PCA
        print('inside ga with pca function')
        print(type(n_components))
        print(n_components)
        pca= PCA(n_components=n_components)
        pca.fit(X_train)
        X_train = pca.transform(X_train)
        X_test = pca.transform(X_test)
        print('number of principal components:',pca.n_components_)
        
        # apply genetic algorithm to select the best PC
        selected_features= genetic_selection(estimator,X_train,y_train)
        
            
        # tune hyperparameters on the optimal subset
        best_estimator_,best_params_, best_score_,cv_results_,best_index_= optimalModelSelection(estimator,
                                                                                                 param_grid,
                                                                                                 X_train[:,selected_features[0]],y_train,method=method)
        
        
        y_pred_train  = best_estimator_.fit(X_train[:,selected_features[0]],y_train).predict(X_train[:,selected_features[0]])
        
        # calculate the AIC
        aic_score_val = aic(y_train,y_pred_train,X_train[:,selected_features[0]].shape[1])
        # calculate Bayesian Information Criterion
        bic_score_val = bic(y_train,y_pred_train,X_train[:,selected_features[0]].shape[1])
        # calculate MAPE
        mape_score_val = np.mean(np.abs((y_train - y_pred_train) / y_train)) * 100
        
        # predict on unseen data
        y_pred=best_estimator_.predict(X_test[:,selected_features[0]])
        score=r2_score(y_test,y_pred)
        rmse = np.sqrt(mean_squared_error(y_test,y_pred))
        mae = mean_absolute_error(y_test,y_pred)
        
        aic_score_test = aic(y_test,y_pred,X_test[:,selected_features[0]].shape[1])
        bic_score_test = bic(y_test,y_pred,X_test[:,selected_features[0]].shape[1])
        mape_score_test = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        print('Peforming predictions on unseen data')
        print('Performance(R2):%0.2f | RMSE:%0.2f | MAE:%0.2f | MAPE:%0.2f'%(score,rmse, mae,mape_score_test))
        
        ''''TODO: Store the residuals in the table'''
        residuals = np.array(y_test)- y_pred
#         print(len(residuals))
        #store it in a seperate table
        prediction= {'Original':np.array(y_test),'Predicted':y_pred,'Residuals':residuals}
#         predictions[target]=prediction
#         prediction= {'Original':np.array(y_test),'Predicted':y_pred,'Residuals':residuals_test}
        predictions[target]=prediction
            
        # append the results
        target_result['R2(Validation)']=best_score_
        target_result['Adjusted R2(Validation)']=1-(1-best_score_)*(len(y_train)-1)/(len(y_train)-X_train[:,selected_features[0]].shape[1]-1)
        target_result['StandardError(Validation)']=cv_results_['std_test_r2'][best_index_]
        target_result['RMSE(Validation)']=np.sqrt(np.abs(cv_results_['mean_test_mse'][best_index_]))
        target_result['R2(Train)']=cv_results_['mean_train_r2'][best_index_]
        target_result['RMSE(Train)']=np.sqrt(np.abs(cv_results_['mean_train_mse'][best_index_]))
        target_result['R2(Test)']=score
        target_result['RMSE(Test)']=rmse
        target_result['#Features']=len(selected_features[0])
        target_result['Features']=selected_features[0]
        target_result['MAE(Validation)']= -cv_results_['mean_test_mae'][best_index_]
        target_result['MAE(Train)']= -cv_results_['mean_train_mae'][best_index_]
        target_result['MAE(Test)']=mae
        target_result['AIC(Validation)']=aic_score_val
        target_result['AIC(Test)']=aic_score_test
        target_result['BIC(Validation)']=bic_score_val
        target_result['BIC(Test)']=bic_score_test
        target_result['MAPE(Validation)']=mape_score_val
        target_result['MAPE(Test)']=mape_score_test
        
        # store the result with respect to target
        results[target]=target_result
    
    return results, predictions

In [6]:
#create models
def runAllModels(path, filename,n_components=None,transformation=False,data='study1',perform_pca=False):
    
    # random forest
    print('********Applying Random forest****************')
    rf = RandomForestRegressor(random_state=101)
#     n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
#     # n_estimators=[20,50,70,100,200]
#     max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
#     min_samples_split = [2, 5, 10,15]
#     min_samples_leaf = [1, 2, 4,10]
    n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
    max_depth = [int(x) for x in np.linspace(1, 5, num = 5)]
    min_samples_split = [int(x) for x in np.linspace(10, 100, num = 10)]
    min_samples_leaf = [int(x) for x in np.linspace(10, 60, num = 20)]
    bootstrap = [True, False]
    max_features=['auto','sqrt']
    param_grid={'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
#                 'bootstrap': bootstrap,
                'max_features':max_features
               }
    if perform_pca==True:
        results_rf,predictions_rf= perform_evaluation_with_pca(path,rf,param_grid,n_components,method='random',transformation=transformation,data=data)
    else:
        results_rf,predictions_rf= perform_evaluation(path,rf,param_grid,method='random',transformation=transformation,data=data)            
    
    np.random.seed(15)
    print('********Applying Support vector machine****************')
    from sklearn.svm import SVR
#     C_space=np.logspace(-1,0,10)
    C_space=np.logspace(-1,1,10)
#     epsilon_space= np.logspace(-1,0,10)
    epsilon_space= np.logspace(-1,0,10)
#     gamma_space = np.logspace(-3, 3, 10)
    gamma_space = np.logspace(-3, -2, 10)
    param_grid={'C':C_space,'epsilon':epsilon_space,'gamma':gamma_space}
    svr = SVR(kernel = 'rbf')
    if perform_pca==True:
        results_svm,predictions_svm = perform_evaluation_with_pca(path,svr,param_grid,n_components,method='random',transformation=transformation,data=data)
    else:
        results_svm,predictions_svm = perform_evaluation(path,svr,param_grid,method='random',transformation=transformation,data=data)

    print('********Applying Linear regression with stochastic gradient descent****************')
    from sklearn.linear_model import SGDRegressor
    param_grid={#'max_iter':[100,500,1000],
                'max_iter':[50,100],
                'penalty':[None],
                'eta0':[0.01,0.1,0.5]
               }
    sgd_reg = SGDRegressor(random_state=32)
    if perform_pca==True:
        print(n_components)
        print(type(n_components))
        results_sgd,predictions_sgd = perform_evaluation_with_pca(path,sgd_reg,param_grid,n_components,transformation=transformation,data=data)
    else:
        results_sgd, predictions_sgd = perform_evaluation(path,sgd_reg,param_grid,transformation=transformation,data=data)
    


    ## lasso regression
    print('********Applying Lasso Regression****************')
    from sklearn.linear_model import Lasso
#     alpha_space = np.logspace(-4, 0, 50)
    alpha_space = np.logspace(0, 1, 100)
    param_grid={'alpha':alpha_space}
    lasso = Lasso(random_state=32)
    if perform_pca==True:
        results_lasso, predictions_lasso = perform_evaluation_with_pca(path,lasso,param_grid,n_components,transformation=transformation,data=data)
    else:
        results_lasso, predictions_lasso = perform_evaluation(path,lasso,param_grid,transformation=transformation,data=data)
    

    ## elastic net 
    print('********Applying Elastic Net Regression****************')
    from sklearn.linear_model import ElasticNet
#     alpha_space = np.logspace(-4, 0, 50)
    alpha_space = np.logspace(0, 2 , 50)
    param_grid={'alpha':alpha_space}
    enet = ElasticNet(random_state=32)
    if perform_pca==True:
        results_enet,predictions_enet = perform_evaluation_with_pca(path,enet,param_grid,n_components,transformation=transformation,data=data)
    else:
        results_enet, predictions_enet = perform_evaluation(path,enet,param_grid,transformation=transformation,data=data)
    
    
#     np.random.seed(32)
    #MARS
#     print('********Applying MARS****************')
#     from pyearth import Earth
#     max_degree_space=[1]
# #     penalty_space=[3.0,6.0]
# #     minspan_alpha = np.linspace(0, 0.2, num = 5)
# #     max_terms=[10,20]
# #     endspan_alpha = [0.05]
    
#     penalty_space=np.logspace(-1,1,20)
#     minspan_alpha=np.logspace(-3,1,20)
#     max_terms=[10,20,25]
#     # endspan_alpha= np.linspace(0, 1.0, num = 10)
#     # endspan=[5]
#     param_grid={'max_degree':max_degree_space,
#         'penalty':penalty_space,
#                #'minspan_alpha':minspan_alpha,
#         #'endspan_alpha':endspan_alpha,
#                 'use_fast':[True],
#         'max_terms':max_terms
#                }
    #mars= Earth()
#     if perform_pca==True:
#         results_mars, predictions_mars = perform_evaluation_with_pca(path,mars,param_grid,n_components,transformation=transformation,data=data)
#     else:
#         results_mars, predictions_mars = perform_evaluation(path,mars,param_grid,transformation=transformation,data=data)
    

    df_rf=pd.DataFrame(results_rf).T
    df_rf['Target']=df_rf.index
    df_rf=df_rf.reset_index(drop=True)
    df_rf['Algorithm']='Random Forest'
    df_rf.set_index(['Algorithm'])

    df_svm=pd.DataFrame(results_svm).T
    df_svm['Target']=df_svm.index
    df_svm=df_svm.reset_index(drop=True)
    df_svm['Algorithm']='SVM'
    df_svm.set_index(['Algorithm'])

    df_sgd=pd.DataFrame(results_sgd).T
    df_sgd['Target']=df_sgd.index
    df_sgd=df_sgd.reset_index(drop=True)
    df_sgd['Algorithm']='Linear regression'
    df_sgd.set_index(['Algorithm'])

    df_lasso=pd.DataFrame(results_lasso).T
    df_lasso['Target']=df_lasso.index
    df_lasso=df_lasso.reset_index(drop=True)
    df_lasso['Algorithm']='Lasso Regression'
    df_lasso.set_index(['Algorithm'])

    df_enet=pd.DataFrame(results_enet).T
    df_enet['Target']=df_enet.index
    df_enet=df_enet.reset_index(drop=True)
    df_enet['Algorithm']='Elastic Net'
    df_enet.set_index(['Algorithm'])

#     df_mars=pd.DataFrame(results_mars).T
#     df_mars['Target']=df_mars.index
#     df_mars=df_mars.reset_index(drop=True)
#     df_mars['Algorithm']='MARS'
#     df_mars.set_index(['Algorithm'])


#     concat the df
    pd.concat([
        df_rf,df_svm,df_sgd,
        df_lasso,
        df_enet,
#         df_mars
    ]).to_csv(filename,index=False)
    print("File saved")
    del df_rf,df_svm,df_sgd,df_lasso,df_enet,
#     del df_mars
    
    def createPredictionsTable(predictions):
        pq= pd.DataFrame(predictions.get('PQ'))
        pq.rename(index=str, columns={"Original": "Original_PQ", "Prediction": "Prediction_PQ",'Residuals':'Residuals_PQ'}, inplace=True)
        att=pd.DataFrame(predictions.get('ATT'))
        att.rename(index=str, columns={"Original": "Original_ATT", "Prediction": "Prediction_ATT",'Residuals':'Residuals_ATT'},inplace=True)
        df = pd.concat([pq,att],axis=1)
        return df
    
    df_sgd=createPredictionsTable(predictions_sgd)
    df_lasso=createPredictionsTable(predictions_lasso)
    df_enet=createPredictionsTable(predictions_enet)
    df_svm=createPredictionsTable(predictions_svm)
    df_rf=createPredictionsTable(predictions_rf)
#     df_mars=createPredictionsTable(predictions_mars)
#     print(df_lasso)
    if transformation==False and perform_pca==True:
        filename=str(data)+'_feature_selection_alltargets_mahalanobis_PCA_'+str(n_components)+'_predictions_unseen.xlsx'
    elif transformation==False and perform_pca==False:
        filename=str(data)+'_feature_selection_alltargets_mahalanobis'+'_predictions_unseen.xlsx'
    elif transformation==True and perform_pca==True:
        filename=str(data)+'_feature_selection_alltargets_mahalanobis_transformed_PCA_'+str(n_components)+'_predictions_unseen.xlsx'
    else:
        filename=str(data)+'_feature_selection_alltargets_mahalanobis_transformed'+'_predictions_unseen.xlsx'
        
    with pd.ExcelWriter(filename) as writer:  # doctest: +SKIP
        df_sgd.to_excel(writer, sheet_name='Linear Regression')
        df_lasso.to_excel(writer, sheet_name='Lasso Regression')
        df_enet.to_excel(writer, sheet_name='Elastic Net')
        df_svm.to_excel(writer, sheet_name='SVM')
        df_rf.to_excel(writer, sheet_name='Random Forest')
#         df_mars.to_excel(writer, sheet_name='MARS')
    
    print('file saved sucessfully')

Study1 original and test on Study2 original
---

In [7]:
# evaluate model on study1 data on original distribution features
if __name__=='__main__':
    path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
         '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    
#     path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
#          '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    filename='Tables/feature_selection_mahalanobis_alltargets.csv'
    #runAllModels(path,filename)

In [8]:
# evaluate model on study1 data on original distribution after PCA with 95% explained variance PCs
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    filename='Tables/feature_selection_mahalanobis_alltargets_PCA_0.95PC.csv'
    n_components=0.95
    runAllModels(path,filename,n_components,perform_pca=True)

********Applying Random forest****************
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 89)
inside ga with pca function
<class 'float'>
0.95
number of principal components: 34


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.144 seconds ---
Number of columns selected: 17
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 765 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   28.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   29.1s finished


Best params: {'n_estimators': 30, 'min_samples_split': 40, 'min_samples_leaf': 17, 'max_features': 'auto', 'max_depth': 2}
RMSE: 1.11
R2(Validation): 0.34 (+/- 0.20)
R2(Train): 0.50 (+/- 0.02)
MAE(Validation): 0.90 (+/- 0.12)
MAE(Train): 0.82 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.51 | RMSE:1.07 | MAE:0.85 | MAPE:28.28
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 9.789 seconds ---
Number of columns selected: 17
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 676 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   23.7s finished


Best params: {'n_estimators': 60, 'min_samples_split': 60, 'min_samples_leaf': 25, 'max_features': 'auto', 'max_depth': 3}
RMSE: 1.24
R2(Validation): 0.18 (+/- 0.25)
R2(Train): 0.30 (+/- 0.02)
MAE(Validation): 0.98 (+/- 0.25)
MAE(Train): 0.94 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.34 | RMSE:1.12 | MAE:0.89 | MAPE:28.02
********Applying Support vector machine****************
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.693 seconds ---
Number of columns selected: 17
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.9s finished


Best params: {'gamma': 0.01, 'epsilon': 0.21544346900318834, 'C': 2.1544346900318834}
RMSE: 1.09
R2(Validation): 0.39 (+/- 0.15)
R2(Train): 0.75 (+/- 0.02)
MAE(Validation): 0.81 (+/- 0.10)
MAE(Train): 0.51 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.31 | RMSE:1.27 | MAE:1.02 | MAPE:35.20
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance hi

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.652 seconds ---
Number of columns selected: 21
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.0s finished


Best params: {'gamma': 0.007742636826811269, 'epsilon': 0.16681005372000587, 'C': 1.291549665014884}
RMSE: 1.22
R2(Validation): 0.24 (+/- 0.23)
R2(Train): 0.52 (+/- 0.02)
MAE(Validation): 0.96 (+/- 0.28)
MAE(Train): 0.72 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.27 | RMSE:1.18 | MAE:0.96 | MAPE:30.98
********Applying Linear regression with stochastic gradient descent****************
0.95
<class 'float'>
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahala

In [10]:
# evaluate model on study1 data on original distribution after PCA with 80% explained variance PCs
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    filename='Tables/feature_selection_mahalanobis_alltargets_PCA_0.80PC.csv'
    n_components=0.80
    runAllModels(path,filename,n_components,perform_pca=True)

********Applying Random forest****************
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 89)
inside ga with pca function
<class 'float'>
0.8
number of principal components: 16


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.909 seconds ---
Number of columns selected: 9
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 688 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   25.2s finished


Best params: {'n_estimators': 50, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_depth': 4}
RMSE: 1.12
R2(Validation): 0.34 (+/- 0.12)
R2(Train): 0.57 (+/- 0.02)
MAE(Validation): 0.89 (+/- 0.07)
MAE(Train): 0.75 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.43 | RMSE:1.15 | MAE:0.94 | MAPE:31.61
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.621 seconds ---
Number of columns selected: 8
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   18.5s finished


Best params: {'n_estimators': 20, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_features': 'auto', 'max_depth': 3}
RMSE: 1.22
R2(Validation): 0.22 (+/- 0.21)
R2(Train): 0.41 (+/- 0.02)
MAE(Validation): 0.97 (+/- 0.26)
MAE(Train): 0.87 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.34 | RMSE:1.13 | MAE:0.90 | MAPE:28.58
********Applying Support vector machine****************
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.435 seconds ---
Number of columns selected: 7
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.7s finished


Best params: {'gamma': 0.005994842503189409, 'epsilon': 0.5994842503189409, 'C': 10.0}
RMSE: 1.04
R2(Validation): 0.41 (+/- 0.19)
R2(Train): 0.63 (+/- 0.02)
MAE(Validation): 0.82 (+/- 0.07)
MAE(Train): 0.69 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.39 | RMSE:1.19 | MAE:0.94 | MAPE:32.50
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance h

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.482 seconds ---
Number of columns selected: 7
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.7s finished


Best params: {'gamma': 0.004641588833612777, 'epsilon': 0.16681005372000587, 'C': 3.593813663804626}
RMSE: 1.20
R2(Validation): 0.24 (+/- 0.25)
R2(Train): 0.42 (+/- 0.03)
MAE(Validation): 0.96 (+/- 0.24)
MAE(Train): 0.83 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.27 | RMSE:1.18 | MAE:0.95 | MAPE:30.61
********Applying Linear regression with stochastic gradient descent****************
0.8
<class 'float'>
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalan

In [11]:
# evaluate model on study1 data on original distribution after PCA with 3 PCs
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    filename='Tables/feature_selection_mahalanobis_alltargets_PCA_3PC.csv'
    n_components=3
    runAllModels(path,filename,n_components,perform_pca=True)

********Applying Random forest****************
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 89)
inside ga with pca function
<class 'int'>
3
number of principal components: 3


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.869 seconds ---
Number of columns selected: 3
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   25.0s finished


Best params: {'n_estimators': 20, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_features': 'auto', 'max_depth': 3}
RMSE: 1.10
R2(Validation): 0.37 (+/- 0.17)
R2(Train): 0.51 (+/- 0.02)
MAE(Validation): 0.90 (+/- 0.11)
MAE(Train): 0.82 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.48 | RMSE:1.10 | MAE:0.88 | MAPE:30.03
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.448 seconds ---
Number of columns selected: 2
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 210 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 710 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   20.8s finished


Best params: {'n_estimators': 70, 'min_samples_split': 40, 'min_samples_leaf': 23, 'max_features': 'auto', 'max_depth': 5}
RMSE: 1.21
R2(Validation): 0.23 (+/- 0.23)
R2(Train): 0.35 (+/- 0.02)
MAE(Validation): 0.96 (+/- 0.25)
MAE(Train): 0.91 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.37 | RMSE:1.10 | MAE:0.87 | MAPE:28.33
********Applying Support vector machine****************
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.489 seconds ---
Number of columns selected: 1
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.8s finished


Best params: {'gamma': 0.005994842503189409, 'epsilon': 1.0, 'C': 2.1544346900318834}
RMSE: 1.11
R2(Validation): 0.35 (+/- 0.18)
R2(Train): 0.43 (+/- 0.02)
MAE(Validation): 0.88 (+/- 0.13)
MAE(Train): 0.87 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.41 | RMSE:1.17 | MAE:0.95 | MAPE:32.31
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance hi

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.269 seconds ---
Number of columns selected: 2
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.6s finished


Best params: {'gamma': 0.005994842503189409, 'epsilon': 0.35938136638046275, 'C': 1.291549665014884}
RMSE: 1.23
R2(Validation): 0.22 (+/- 0.21)
R2(Train): 0.30 (+/- 0.03)
MAE(Validation): 0.99 (+/- 0.27)
MAE(Train): 0.94 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.34 | RMSE:1.13 | MAE:0.91 | MAPE:30.99
********Applying Linear regression with stochastic gradient descent****************
3
<class 'int'>
Performing prediction for target: PQ
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 89)
Shape of training data: (130, 89)
Shape of test data: (56, 89)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis

Study1 Transformation and test on Study2 Transformation
---

In [27]:
# evaluate model on study1 data on transformed distribution features
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv']
    filename='Tables/feature_selection_mahalanobis_transformed_alltargets.csv'
    runAllModels(path,filename,transformation=True)

********Applying Random forest****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_length_median', 'swipe_length.x_median', 'swipe_length.y_median', 'x_location.release_median', 'y_location.release_median', 'difference.touch_buttonCenter_x_median', 'difference.touch_buttonCenter_y_median', 'touchAccuracy_median', 'touchAccuracy_x_median', 'touchAccuracy_y_median', 'button_touch_x_location_skew', 'button_touch_y_location_skew', 'diff

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 10.396 seconds ---
Number of columns selected: 42
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 187 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 687 tasks      | elapsed:   19.6s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   25.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   25.3s finished


Best params: {'n_estimators': 90, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 3}
RMSE: 1.07
R2(Validation): 0.40 (+/- 0.16)
R2(Train): 0.68 (+/- 0.01)
MAE(Validation): 0.85 (+/- 0.06)
MAE(Train): 0.66 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.05 | RMSE:1.48 | MAE:1.21 
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 79)
Shape of training data: (130, 79)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 79)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 10.6 seconds ---
Number of columns selected: 34
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 682 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   18.7s finished


Best params: {'n_estimators': 90, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 3}
RMSE: 1.16
R2(Validation): 0.31 (+/- 0.17)
R2(Train): 0.60 (+/- 0.01)
MAE(Validation): 0.93 (+/- 0.23)
MAE(Train): 0.73 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.32 | RMSE:1.14 | MAE:0.91 
********Applying Support vector machine****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_length_m

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.65 seconds ---
Number of columns selected: 31
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.1s finished


Best params: {'gamma': 0.005994842503189409, 'epsilon': 0.5994842503189409, 'C': 10.0}
RMSE: 0.98
R2(Validation): 0.51 (+/- 0.11)
R2(Train): 0.78 (+/- 0.01)
MAE(Validation): 0.80 (+/- 0.08)
MAE(Train): 0.58 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.40 | RMSE:1.18 | MAE:0.96 
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 79)
Shape of training data: (130, 79)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 79)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.965 seconds ---
Number of columns selected: 40
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.3s finished


Best params: {'gamma': 0.005994842503189409, 'epsilon': 0.46415888336127786, 'C': 0.774263682681127}
RMSE: 1.25
R2(Validation): 0.21 (+/- 0.14)
R2(Train): 0.40 (+/- 0.02)
MAE(Validation): 0.99 (+/- 0.26)
MAE(Train): 0.87 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.31 | RMSE:1.15 | MAE:0.93 
********Applying Linear regression with stochastic gradient descent****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_le

The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 79)
---Finished in 6.914 seconds ---
Number of columns selected: 40
Best params: {'alpha': 1.0}
RMSE: 1.47
R2(Validation): -0.09 (+/- 0.09)
R2(Train): 0.00 (+/- 0.00)
MAE(Validation): 1.21 (+/- 0.09)
MAE(Train): 1.20 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):-0.03 | RMSE:1.55 | MAE:1.31 
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 79)
Shape of training data: (130, 79)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv
The shape of the data  curre

In [28]:
# evaluate model on study1 data on transformed distribution after PCA with 95% explained variance PCs
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    filename='Tables/feature_selection_mahalanobis_transformed_alltargets_PCA_0.95PC.csv'
    n_components=0.95
    runAllModels(path,filename,n_components,transformation=True,perform_pca=True)

********Applying Random forest****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_length_median', 'swipe_length.x_median', 'swipe_length.y_median', 'x_location.release_median', 'y_location.release_median', 'difference.touch_buttonCenter_x_median', 'difference.touch_buttonCenter_y_median', 'touchAccuracy_median', 'touchAccuracy_x_median', 'touchAccuracy_y_median', 'button_touch_x_location_skew', 'button_touch_y_location_skew', 'diff

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 9.45 seconds ---
Number of columns selected: 19
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   24.8s finished


Best params: {'n_estimators': 90, 'min_samples_split': 20, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 3}
RMSE: 1.13
R2(Validation): 0.32 (+/- 0.23)
R2(Train): 0.60 (+/- 0.02)
MAE(Validation): 0.90 (+/- 0.10)
MAE(Train): 0.73 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.51 | RMSE:1.06 | MAE:0.85 | MAPE:27.02
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data: (56, 65)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 9.014 seconds ---
Number of columns selected: 16
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 263 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 763 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   16.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   16.6s finished


Best params: {'n_estimators': 50, 'min_samples_split': 70, 'min_samples_leaf': 17, 'max_features': 'auto', 'max_depth': 4}
RMSE: 1.22
R2(Validation): 0.22 (+/- 0.17)
R2(Train): 0.31 (+/- 0.02)
MAE(Validation): 0.97 (+/- 0.22)
MAE(Train): 0.94 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.36 | RMSE:1.11 | MAE:0.89 | MAPE:26.66
********Applying Support vector machine****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'sw

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.405 seconds ---
Number of columns selected: 18
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.7s finished


Best params: {'gamma': 0.01, 'epsilon': 0.774263682681127, 'C': 5.994842503189409}
RMSE: 1.15
R2(Validation): 0.30 (+/- 0.24)
R2(Train): 0.72 (+/- 0.02)
MAE(Validation): 0.91 (+/- 0.16)
MAE(Train): 0.64 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.15 | RMSE:1.41 | MAE:1.13 | MAPE:38.45
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data: (56, 65)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (587, 193)
inside study2 if

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.507 seconds ---
Number of columns selected: 18
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.7s finished


Best params: {'gamma': 0.01, 'epsilon': 0.2782559402207124, 'C': 2.1544346900318834}
RMSE: 1.18
R2(Validation): 0.29 (+/- 0.19)
R2(Train): 0.60 (+/- 0.02)
MAE(Validation): 0.95 (+/- 0.25)
MAE(Train): 0.68 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.19 | RMSE:1.25 | MAE:1.00 | MAPE:32.30
********Applying Linear regression with stochastic gradient descent****************
0.95
<class 'float'>
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_m

The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 65)
inside ga with pca function
<class 'float'>
0.95
number of principal components: 34
---Finished in 7.797 seconds ---
Number of columns selected: 10
Best params: {'alpha': 1.0}
RMSE: 1.25
R2(Validation): 0.19 (+/- 0.15)
R2(Train): 0.29 (+/- 0.03)
MAE(Validation): 1.01 (+/- 0.13)
MAE(Train): 1.00 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.29 | RMSE:1.29 | MAE:1.06 | MAPE:36.38
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of te

Best params: {'alpha': 1.0}
RMSE: 1.27
R2(Validation): 0.16 (+/- 0.15)
R2(Train): 0.24 (+/- 0.02)
MAE(Validation): 1.01 (+/- 0.24)
MAE(Train): 0.99 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.28 | RMSE:1.17 | MAE:0.96 | MAPE:32.43
File saved
file saved sucessfully


In [29]:
# evaluate model on study1 data on transformed distribution after PCA with 80% explained variance PCs
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    filename='Tables/feature_selection_mahalanobis_transformed_alltargets_PCA_0.80PC.csv'
    n_components=0.80
    runAllModels(path,filename,n_components,transformation=True,perform_pca=True)

********Applying Random forest****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_length_median', 'swipe_length.x_median', 'swipe_length.y_median', 'x_location.release_median', 'y_location.release_median', 'difference.touch_buttonCenter_x_median', 'difference.touch_buttonCenter_y_median', 'touchAccuracy_median', 'touchAccuracy_x_median', 'touchAccuracy_y_median', 'button_touch_x_location_skew', 'button_touch_y_location_skew', 'diff

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.856 seconds ---
Number of columns selected: 11
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 780 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   16.3s finished


Best params: {'n_estimators': 60, 'min_samples_split': 30, 'min_samples_leaf': 17, 'max_features': 'auto', 'max_depth': 2}
RMSE: 1.13
R2(Validation): 0.32 (+/- 0.21)
R2(Train): 0.47 (+/- 0.02)
MAE(Validation): 0.90 (+/- 0.12)
MAE(Train): 0.84 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.50 | RMSE:1.07 | MAE:0.87 | MAPE:27.45
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data: (56, 65)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.633 seconds ---
Number of columns selected: 11
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 773 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   15.8s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   16.1s finished


Best params: {'n_estimators': 50, 'min_samples_split': 70, 'min_samples_leaf': 17, 'max_features': 'auto', 'max_depth': 4}
RMSE: 1.22
R2(Validation): 0.22 (+/- 0.16)
R2(Train): 0.31 (+/- 0.02)
MAE(Validation): 0.97 (+/- 0.22)
MAE(Train): 0.94 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.36 | RMSE:1.11 | MAE:0.88 | MAPE:26.65
********Applying Support vector machine****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'sw

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.473 seconds ---
Number of columns selected: 7
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.6s finished


Best params: {'gamma': 0.007742636826811269, 'epsilon': 1.0, 'C': 5.994842503189409}
RMSE: 1.10
R2(Validation): 0.38 (+/- 0.21)
R2(Train): 0.59 (+/- 0.02)
MAE(Validation): 0.87 (+/- 0.17)
MAE(Train): 0.76 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.34 | RMSE:1.24 | MAE:0.99 | MAPE:33.75
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data: (56, 65)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (587, 193)
inside study2 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.384 seconds ---
Number of columns selected: 7
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.5s finished


Best params: {'gamma': 0.005994842503189409, 'epsilon': 0.16681005372000587, 'C': 10.0}
RMSE: 1.19
R2(Validation): 0.29 (+/- 0.23)
R2(Train): 0.48 (+/- 0.03)
MAE(Validation): 0.94 (+/- 0.28)
MAE(Train): 0.76 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.32 | RMSE:1.14 | MAE:0.94 | MAPE:31.31
********Applying Linear regression with stochastic gradient descent****************
0.8
<class 'float'>
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down

The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 65)
inside ga with pca function
<class 'float'>
0.8
number of principal components: 18
---Finished in 6.713 seconds ---
Number of columns selected: 6
Best params: {'alpha': 1.0}
RMSE: 1.25
R2(Validation): 0.19 (+/- 0.15)
R2(Train): 0.29 (+/- 0.03)
MAE(Validation): 1.01 (+/- 0.13)
MAE(Train): 1.00 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.29 | RMSE:1.29 | MAE:1.06 | MAPE:36.38
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test

Best params: {'alpha': 1.0}
RMSE: 1.27
R2(Validation): 0.16 (+/- 0.15)
R2(Train): 0.24 (+/- 0.02)
MAE(Validation): 1.01 (+/- 0.24)
MAE(Train): 0.99 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.28 | RMSE:1.17 | MAE:0.96 | MAPE:32.43
File saved
file saved sucessfully


In [30]:
# evaluate model on study1 data on transformed distribution after PCA with 3 PCs
if __name__=='__main__':
#     path=['/mnt/vdb1/datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
#          '/mnt/vdb1/datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    path=['../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv',
         '../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv']
    filename='Tables/feature_selection_mahalanobis_transformed_alltargets_PCA_3PC.csv'
    n_components=3
    runAllModels(path,filename,n_components,transformation=True,perform_pca=True)

********Applying Random forest****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_length_median', 'swipe_length.x_median', 'swipe_length.y_median', 'x_location.release_median', 'y_location.release_median', 'difference.touch_buttonCenter_x_median', 'difference.touch_buttonCenter_y_median', 'touchAccuracy_median', 'touchAccuracy_x_median', 'touchAccuracy_y_median', 'button_touch_x_location_skew', 'button_touch_y_location_skew', 'diff

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.452 seconds ---
Number of columns selected: 3
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 778 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   15.7s finished


Best params: {'n_estimators': 70, 'min_samples_split': 40, 'min_samples_leaf': 23, 'max_features': 'auto', 'max_depth': 5}
RMSE: 1.16
R2(Validation): 0.29 (+/- 0.21)
R2(Train): 0.42 (+/- 0.02)
MAE(Validation): 0.94 (+/- 0.13)
MAE(Train): 0.90 (+/- 0.01)
Peforming predictions on unseen data
Performance(R2):0.50 | RMSE:1.08 | MAE:0.87 | MAPE:27.49
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data: (56, 65)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 8.363 seconds ---
Number of columns selected: 3
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  35 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 335 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 835 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 985 out of 1000 | elapsed:   15.6s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   15.9s finished


Best params: {'n_estimators': 50, 'min_samples_split': 70, 'min_samples_leaf': 17, 'max_features': 'auto', 'max_depth': 4}
RMSE: 1.21
R2(Validation): 0.23 (+/- 0.17)
R2(Train): 0.31 (+/- 0.02)
MAE(Validation): 0.96 (+/- 0.22)
MAE(Train): 0.95 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.36 | RMSE:1.11 | MAE:0.88 | MAPE:26.55
********Applying Support vector machine****************
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'sw

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


---Finished in 6.505 seconds ---
Number of columns selected: 1
Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.6s finished


Best params: {'gamma': 0.01, 'epsilon': 1.0, 'C': 1.291549665014884}
RMSE: 1.20
R2(Validation): 0.24 (+/- 0.22)
R2(Train): 0.35 (+/- 0.03)
MAE(Validation): 0.96 (+/- 0.16)
MAE(Train): 0.94 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.31 | RMSE:1.27 | MAE:1.03 | MAPE:35.91
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data: (56, 65)
Shape of test data: (130,)
Reading the file from:  ../../../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the 

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    1.7s finished


Best params: {'gamma': 0.01, 'epsilon': 1.0, 'C': 1.291549665014884}
RMSE: 1.24
R2(Validation): 0.18 (+/- 0.21)
R2(Train): 0.27 (+/- 0.02)
MAE(Validation): 0.98 (+/- 0.25)
MAE(Train): 0.97 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.34 | RMSE:1.13 | MAE:0.92 | MAPE:30.40
********Applying Linear regression with stochastic gradient descent****************
3
<class 'int'>
Columns that should not be selected are: ['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean', 'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean', 'button_touch_x_location_mean', 'button_touch_y_location_mean', 'target_touch_x_location_mean', 'target_touch_y_location_mean', 'time_between_touches_mean', 'x_location.release_mean', 'y_location.release_mean', 'difference.touch_buttonCenter_x_mean', 'difference.touch_buttonCenter_y_mean', 'touchAccuracy_mean', 'touchAccuracy_x_mean', 'touchAccuracy_y_mean', 'x_location.down_median', 'y_location.down_median', 'swipe_length

The shape of the data  currently:  (587, 193)
inside study2 if
Shape of the data after removing 0 variance highly correlated data: (587, 65)
inside ga with pca function
<class 'int'>
3
number of principal components: 3
---Finished in 6.864 seconds ---
Number of columns selected: 2
Best params: {'alpha': 1.0}
RMSE: 1.25
R2(Validation): 0.19 (+/- 0.15)
R2(Train): 0.29 (+/- 0.03)
MAE(Validation): 1.01 (+/- 0.13)
MAE(Train): 1.00 (+/- 0.02)
Peforming predictions on unseen data
Performance(R2):0.29 | RMSE:1.29 | MAE:1.06 | MAPE:36.38
Performing prediction for target: ATT
Reading the file from:  ../../../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv
The shape of the data  currently:  (186, 191)
columns thrown away because they have 0 variance: Index(['touch.duration_min', 'time_between_touches_min'], dtype='object')
Shape of the data after removing 0 variance highly correlated data: (186, 65)
Shape of training data: (130, 65)
Shape of test data

Best params: {'alpha': 1.0}
RMSE: 1.27
R2(Validation): 0.16 (+/- 0.15)
R2(Train): 0.24 (+/- 0.02)
MAE(Validation): 1.01 (+/- 0.24)
MAE(Train): 0.99 (+/- 0.03)
Peforming predictions on unseen data
Performance(R2):0.28 | RMSE:1.17 | MAE:0.96 | MAPE:32.43
File saved
file saved sucessfully
