# Script to run the ML algorithms on the reduced dimensions generated by PCA

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
import logging
LOG_FILENAME = 'Personality_Ratings_PCA.log'
logging.basicConfig(filename=LOG_FILENAME,level=logging.INFO)
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import KFold,GridSearchCV,RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [3]:
# %load MLOperationsUtilities.py
def readDataFromCsv(file):
    """Load a CSV file into a DataFrame, announcing the source on stdout.

    Parameters
    ----------
    file : path or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with ``read_csv``'s default options.
    """
    # Imported locally, as in the original helper, so the function can be
    # pasted into another notebook without relying on a global ``pd``.
    from pandas import read_csv

    print("Reading the file from: ", file)
    return read_csv(file)

def loadDataset(paths=None, target='Neuroticism'):
    """Load one or more feature CSVs and split them into predictors and target.

    Each path is handled according to its name:

    * contains ``'study1'``: read via ``readDataFromCsv``, keep int64/float64
      columns, drop ``VP``, ``age`` and ``user_id``;
    * contains ``'study2'``: read via ``readDataFromCsv``, keep int64/float64
      columns, drop ``user_id``, ``UserId``, ``VP``, ``Age`` and
      ``Handedness_Score``;
    * anything else: read with ``pd.read_csv(path, index_col=0)`` and drop
      ``user_id``.

    In every case the five personality trait columns are removed from the
    predictors and ``df[target]`` becomes the target.

    Parameters
    ----------
    paths : sequence of str, optional
        Files to load. Defaults to the study1 and study2 feature files.
    target : str
        Name of the trait column to predict.

    Returns
    -------
    dict
        ``{'data': X, 'target': y}``. When several files are given they are
        row-concatenated with study1 first, then study2, then any other file
        (the order the original implementation used for the two studies).

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    AssertionError
        If any of the loaded frames contains a null value.
    """
    if paths is None:
        # Built per call so the default is never a shared mutable object.
        paths = ['../datasets/files_generated/Personality/study1_features_data.csv',
                 '../datasets/files_generated/Personality/study2_features_data.csv']

    traits = ['Neuroticism', 'Extraversion',
              'Openness', 'Agreeableness', 'Conscientiousness']
    numeric_dtypes = ['int64', 'float64']

    parts = []          # (concat rank, X, y) for every file read
    has_nulls = False   # accumulated over *all* frames, not only the last one
    for path in paths:
        path_name = str(path)
        if 'study1' in path_name:
            df = readDataFromCsv(path)
            df = df.select_dtypes(numeric_dtypes).drop(['VP', 'age', 'user_id'], axis=1)
            print('The shape of the data  currently in study1: ', df.shape)
            rank, features = 0, df.drop(traits, axis=1)
        elif 'study2' in path_name:
            df = readDataFromCsv(path)
            df = df.select_dtypes(numeric_dtypes).drop(
                ['user_id', 'UserId', 'VP', 'Age', 'Handedness_Score'], axis=1)
            print('The shape of the data  currently in study2: ', df.shape)
            rank, features = 1, df.drop(traits, axis=1)
        else:
            df = pd.read_csv(path, index_col=0)
            rank, features = 2, df.drop(traits + ['user_id'], axis=1)
        has_nulls = has_nulls or bool(df.isnull().values.any())
        parts.append((rank, features, df[target]))

    if not parts:
        raise ValueError('loadDataset needs at least one path')

    if len(parts) > 1:
        # Stable sort: study1 rows precede study2 rows regardless of the order
        # in which the paths were listed; ties keep their listing order.
        parts.sort(key=lambda part: part[0])
        X = pd.concat([part[1] for part in parts])
        y = pd.concat([part[2] for part in parts])
    else:
        _, X, y = parts[0]

    print('The shape of the data after concating both the studies {}'.format(X.shape))
    print('The shape of the target after concating both the studies {}'.format(y.shape))
    if has_nulls:
        # Explicit raise keeps the historical exception type while surviving
        # ``python -O``, which strips ``assert`` statements.
        raise AssertionError('Please check for null values')
    return {'data': X, 'target': y}

In [None]:
# Exploratory cell: load the outlier-filtered study1+study2 features and apply
# the same zero-variance and correlation filters used by the evaluation code.
#
# Alternative inputs (uncomment one to switch dataset):
# paths=['../datasets/files_generated/Personality/combined_features_data_out_mahalanobis_transformedDistributions.csv']
# paths=['../datasets/files_generated/Personality/combined_features_data_out_mahalanobis.csv']
paths=['../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
                      '../datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv']

target = 'Neuroticism'
filename='Tables/PCA_alltargets_mahalanobis_3PC.csv'
data = loadDataset(paths,target)
X = data.get('data')
y = data.get('target')

# Remove zero-variance features; `columns` keeps the names that were discarded.
feature_variance = X.var()
col_index = np.where(feature_variance != 0)
columns = X.loc[:, feature_variance == 0.0].columns.values
X = X.loc[:, feature_variance != 0.0]

# Absolute pairwise correlation between the remaining features.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle so each pair is inspected once.
# Built-in `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
upper_traingle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Columns whose correlation with an earlier column is greater than or equal to 0.80.
to_drop_cols = [column for column in upper_traingle.columns if any(upper_traingle[column] >= 0.80)]

# Drop those features by name.
X = X.drop(to_drop_cols, axis=1)
print("current shape:",X.shape)

print(X['touchAccuracy_median'].var())

In [None]:
def optimalModelSelection(model,param_grid,X,y,method='grid'):
    """Tune hyperparameters with 10-fold cross-validation and report the winner.

    The search scores every candidate on R2, negative MSE and negative MAE and
    refits the estimator with the best mean validation R2. A summary of the
    best candidate is printed, and its parameters, RMSE and R2 are also logged.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    param_grid : dict
        Parameter names mapped to candidate values (used as the grid for
        ``'grid'`` and as the sampling distributions for ``'random'``).
    X, y : array-like
        Predictors and target passed to ``search.fit``.
    method : {'grid', 'random'}
        ``'grid'`` runs ``GridSearchCV``; ``'random'`` runs
        ``RandomizedSearchCV`` with ``n_iter=100`` and ``random_state=32``.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_, best_score_, cv_results_, best_index_)``
        taken from the fitted search object.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'grid'`` nor ``'random'``.
    """
    # Validate first: previously an unknown method skipped both branches and
    # failed later with an unbound ``search`` variable.
    if method not in ('grid', 'random'):
        raise ValueError("method must be 'grid' or 'random', got {!r}".format(method))

    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

    scoring={'r2':'r2','mse':'neg_mean_squared_error','mae':'neg_mean_absolute_error'}
    if method == 'grid':
        search = GridSearchCV(model, param_grid, cv=10, n_jobs=-1, scoring=scoring,
                              return_train_score=True, refit='r2')
    else:
        search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                                    n_iter=100, cv=10, verbose=1,
                                    random_state=32, n_jobs=-1, scoring=scoring,
                                    return_train_score=True, refit='r2')
    search.fit(X, y)

    cv_results = search.cv_results_
    best = search.best_index_
    # The MSE/MAE scorers are negated by scikit-learn, hence the sign flips.
    rmse_validation = np.sqrt(-cv_results['mean_test_mse'][best])
    r2_validation_std = cv_results['std_test_r2'][best]

    print('Best params: {}'.format(search.best_params_))
    logging.info('Best params: {}'.format(search.best_params_))
    print('RMSE: %0.2f'%(rmse_validation))
    print("R2(Validation): %0.2f (+/- %0.2f)" % (search.best_score_, r2_validation_std))
    print("R2(Train): %0.2f (+/- %0.2f)" % (cv_results['mean_train_r2'][best],
                                            cv_results['std_train_r2'][best]))
    print("MAE(Validation): %0.2f (+/- %0.2f)" % (-cv_results['mean_test_mae'][best],
                                                  cv_results['std_test_mae'][best]))
    print("MAE(Train): %0.2f (+/- %0.2f)" % (-cv_results['mean_train_mae'][best],
                                             cv_results['std_train_mae'][best]))

    logging.info('RMSE: %0.2f'%(rmse_validation))
    logging.info("R2: %0.2f (+/- %0.2f)" % (search.best_score_, r2_validation_std))
    return search.best_estimator_,search.best_params_, search.best_score_,search.cv_results_,search.best_index_

In [None]:
def _drop_highly_correlated(frame, threshold=0.80):
    """Return *frame* without columns correlated (|r| >= threshold) with an earlier column."""
    corr_matrix = frame.corr().abs()
    # Strict upper triangle: every pair is inspected once and the diagonal is
    # ignored. Built-in ``bool`` replaces ``np.bool``, removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_triangle = corr_matrix.where(upper_mask)
    to_drop_cols = [column for column in upper_triangle.columns
                    if (upper_triangle[column] >= threshold).any()]
    return frame.drop(to_drop_cols, axis=1)


def perform_evaluation(pathsArr,model,param_grid,method='grid',transformation=False,n_components=0.95):
    """Evaluate one model on PCA-reduced features for each personality trait.

    For every trait the data is reloaded with ``loadDataset``, zero-variance
    and highly correlated (|r| >= 0.80) features are removed, the remainder is
    standardised and projected with PCA, and the model is tuned with
    ``optimalModelSelection``.

    NOTE(review): the scaler and the PCA are fitted on all rows before the
    cross-validated search runs, so the validation folds take part in fitting
    the projection.

    Parameters
    ----------
    pathsArr : list of str
        Paths forwarded to ``loadDataset``.
    model : estimator
        Unfitted regressor to tune.
    param_grid : dict
        Search space forwarded to ``optimalModelSelection``.
    method : str
        Search strategy forwarded to ``optimalModelSelection``.
    transformation : bool
        When true, restrict the predictors to the features flagged as normal in
        the normality-test CSV (trait columns excluded).
    n_components : int or float
        Passed unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    (dict, dict)
        ``results`` maps metric name -> {trait: value}; ``predictions`` maps
        trait -> {'Original', 'Predicted', 'Residuals'} arrays obtained from the
        best estimator refitted on, and predicting, the full data.
    """
    targets = ['Neuroticism', 'Extraversion',
               'Openness', 'Agreeableness', 'Conscientiousness']

    if transformation:
        # Features that passed the univariate normality test after transformation.
        normality_test_features_path='/mnt/vdb1/Personality-Ratings/NormalityCheck/combined_univariate_normality_test_features_mahalanobis_transformed.csv'
        normality = pd.read_csv(normality_test_features_path)
        normal_features = list(normality[normality['Normality'] == True]['Features'].values)  # noqa: E712
        mahalanobis = [feature for feature in normal_features if feature not in targets]

    # metric name -> {trait: value}; key order defines the downstream table layout
    results = {metric: {} for metric in (
        'r2_train', 'r2_validation',
        'rmse_train', 'rmse_validation',
        'std_validation', 'adjusted_r2_val',
        'mae_train', 'mae_validation',
        'mape_validation',
    )}
    predictions = {}

    for target in targets:
        logging.info('Prediction for {}'.format(target))
        print('Prediction for {}'.format(target))
        personality = loadDataset(paths=pathsArr, target=target)
        X = personality.get('data')
        y = personality.get('target')

        feature_variance = X.var()
        columns = X.loc[:, feature_variance == 0.0].columns
        print("columns thrown away because they have 0 variance:",columns)
        X = X.loc[:, feature_variance != 0.0]

        print("Shape of the data cleaning data :",X.shape)
        if transformation:
            X = X[mahalanobis]
            print("Shape of the data after selected transformed columns:",X.shape)

        X = _drop_highly_correlated(X, threshold=0.80)
        print("Current Shape of data:",X.shape)

        # Standardise, then project onto the principal components.
        X = StandardScaler().fit(X).transform(X)
        print(np.where(np.isnan(X)==True))
        print('number of principal components:',n_components)
        pca = PCA(n_components=n_components)
        pca.fit(X)
        print('number of principal components:',pca.n_components_)

        logging.info('number of principal components: {}'.format(pca.n_components_))
        predictors = pca.n_components_
        X = pca.transform(X)

        estimator, best_params_, best_score_, cv_results_, best_index_ = optimalModelSelection(
            model, param_grid, X, y, method)

        # In-sample predictions of the best estimator refitted on all rows.
        y_pred_train = estimator.fit(X, y).predict(X)

        # MAPE of those in-sample predictions; it is stored under the
        # 'mape_validation' key for backward compatibility with the tables.
        mape_in_sample = np.mean(np.abs((y - y_pred_train) / y)) * 100

        # Originals, predictions and residuals are kept for a separate table.
        residuals = np.array(y) - y_pred_train
        predictions[target] = {'Original': np.array(y),
                               'Predicted': y_pred_train,
                               'Residuals': residuals}

        # R squared
        results['r2_validation'][target] = best_score_
        results['r2_train'][target] = cv_results_['mean_train_r2'][best_index_]

        # RMSE (scores are negated MSE, hence the abs before the root)
        results['rmse_train'][target] = np.sqrt(np.abs(cv_results_['mean_train_mse'][best_index_]))
        results['rmse_validation'][target] = np.sqrt(np.abs(cv_results_['mean_test_mse'][best_index_]))

        results['std_validation'][target] = cv_results_['std_test_r2'][best_index_]

        # Adjusted R squared, using the number of principal components as predictors
        results['adjusted_r2_val'][target] = 1 - (1-best_score_)*(len(y)-1)/(len(y)-predictors-1)

        # MAE (scores are negated)
        results['mae_train'][target] = -cv_results_['mean_train_mae'][best_index_]
        results['mae_validation'][target] = -cv_results_['mean_test_mae'][best_index_]

        # MAPE
        results['mape_validation'][target] = mape_in_sample

        print('*'*100)
        logging.info('*'*100)

    print('*'*100)
    logging.info('*'*100)
    return results, predictions

In [None]:
def runAllModels(pathsArr, filename,transformation=False,n_components=0.95):
    """Tune and evaluate every regression model and save the results.

    Runs, in order: linear regression, lasso, elastic net, SVR (RBF kernel),
    random forest, SGD linear regression and MARS, each through
    ``perform_evaluation``. Two files are written:

    * ``filename`` - CSV with one row per (algorithm, trait) and the metric
      columns;
    * ``PCA_alltargets_mahalanobis[_transformed]_PCA_<n_components>_predictions.xlsx``
      in the working directory - one sheet per algorithm with the original
      values, predictions and residuals for every trait.

    Parameters
    ----------
    pathsArr : list of str
        Dataset paths forwarded to ``perform_evaluation``.
    filename : str
        Destination of the metrics CSV.
    transformation : bool
        Forwarded to ``perform_evaluation``; also selects the workbook name.
    n_components : int or float
        Forwarded to ``perform_evaluation``; also part of the workbook name.
    """
    # Import every estimator up front so a missing package (e.g. pyearth) is
    # reported before any of the searches has been run.
    from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, SGDRegressor
    from sklearn.svm import SVR
    from sklearn.ensemble import RandomForestRegressor
    from pyearth import Earth

    def evaluate(model, param_grid, method='grid'):
        """Run perform_evaluation with this call's shared settings."""
        return perform_evaluation(pathsArr, model, param_grid, method=method,
                                  transformation=transformation, n_components=n_components)

    np.random.seed(32)

    # linear regression
    logging.info('********Applying Linear Regression****************')
    results_regr, predictions_regr = evaluate(
        LinearRegression(), {'fit_intercept': [True]}, method='grid')

    # lasso regression
    logging.info('********Applying Lasso Regression****************')
    results_lasso, predictions_lasso = evaluate(
        Lasso(random_state=32), {'alpha': np.logspace(-2, 2, 100)}, method='random')

    # elastic net
    logging.info('********Applying Elastic Net Regression****************')
    results_enet, predictions_enet = evaluate(
        ElasticNet(random_state=32), {'alpha': np.logspace(-2, 2 , 50)}, method='random')

    np.random.seed(19)

    # support vector machines
    logging.info('********Applying Support vector machine****************')
    svr_grid = {'C': np.logspace(-1, 1, 10),
                'epsilon': np.logspace(-3, 1, 10),
                'gamma': np.logspace(-3, -2, 10)}
    results_svm, predictions_svm = evaluate(SVR(kernel='rbf'), svr_grid, method='random')

    # random forest
    logging.info('********Applying Random Forest****************')
    rf_grid = {
        'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)],
        'max_depth': [int(x) for x in np.linspace(1, 5, num = 5)],
        'min_samples_split': [int(x) for x in np.linspace(10, 100, num = 10)],
        'min_samples_leaf': [int(x) for x in np.linspace(10, 60, num = 20)],
        # 1.0 (= all features) is what 'auto' meant for regressors; the string
        # itself is rejected by scikit-learn >= 1.3.
        'max_features': [1.0, 'sqrt'],
    }
    results_rf, predictions_rf = evaluate(
        RandomForestRegressor(random_state=32), rf_grid, method='random')

    # linear regression using stochastic gradient descent (default grid search)
    logging.info('********Applying Linear regression with stochastic gradient descent****************')
    sgd_grid = {'max_iter': [50, 100],
                'penalty': [None],
                'eta0': [0.01, 0.1, 0.5]}
    results_sgd, predictions_sgd = evaluate(SGDRegressor(random_state=32), sgd_grid)

    # MARS
    np.random.seed(20)
    logging.info('********Applying MARS****************')
    mars_grid = {'max_degree': [1],
                 'penalty': np.logspace(-1, 1, 20),
                 'use_fast': [True],
                 'max_terms': [10, 20, 25]}
    results_mars, predictions_mars = evaluate(Earth(), mars_grid, method='grid')

    # (key in the results dict, column header in the CSV), in output order
    metric_columns = [
        ('r2_train', 'R2(Train)'),
        ('r2_validation', 'R2(Validation)'),
        ('rmse_train', 'RMSE(Train)'),
        ('rmse_validation', 'RMSE(Validation)'),
        ('std_validation', 'StandardError(Validation)'),
        ('adjusted_r2_val', 'Adjusted R2(Validation)'),
        ('mae_train', 'MAE(Train)'),
        ('mae_validation', 'MAE(Validation)'),
        ('mape_validation', 'MAPE(Validation)'),
    ]

    def createTable(results,name):
        """Build the per-trait metrics table for one algorithm."""
        df = pd.concat(
            [pd.DataFrame.from_dict(results.get(key), orient='index')
             for key, _ in metric_columns],
            axis=1)
        df.columns = [header for _, header in metric_columns]
        df['Target'] = df.index
        df['Algorithm'] = name
        return df.reset_index(drop=True)

    df_sgd=createTable(results_sgd,name='Linear Regression SGD')
    df_lasso=createTable(results_lasso,name='Lasso Regression')
    df_enet=createTable(results_enet,name='Elastic Net')
    df_svm=createTable(results_svm,name='SVM')
    df_rf=createTable(results_rf,name='Random Forest')
    df_mars=createTable(results_mars,name='MARS')
    df_lr=createTable(results_regr,name='Linear Regression')

    # result file saved here
    pd.concat([
        df_rf, df_svm, df_sgd,
        df_lr,
        df_lasso,
        df_enet, df_mars
    ]).to_csv(filename)

    def createPredictionsTable(predictions):
        """Lay the per-trait originals, predictions and residuals side by side."""
        targets = ['Neuroticism','Extraversion', 'Openness', 'Agreeableness','Conscientiousness']
        frames = []
        for target in targets:
            pq = pd.DataFrame(predictions.get(target))
            # The stored key is 'Predicted'; renaming the non-existent
            # 'Prediction' left five identically named columns in the sheet.
            pq = pq.rename(index=str, columns={"Original": "Original_"+target,
                                               "Predicted": "Prediction_"+target,
                                               'Residuals': 'Residuals_'+target})
            frames.append(pq)
        return pd.concat(frames, axis=1)

    df_sgd=createPredictionsTable(predictions_sgd)
    df_lasso=createPredictionsTable(predictions_lasso)
    df_enet=createPredictionsTable(predictions_enet)
    df_svm=createPredictionsTable(predictions_svm)
    df_rf=createPredictionsTable(predictions_rf)
    df_mars=createPredictionsTable(predictions_mars)
    df_lr=createPredictionsTable(predictions_regr)

    if transformation==False:
        predictions_filename='PCA_alltargets_mahalanobis_PCA_'+str(n_components)+'_predictions.xlsx'
    else:
        predictions_filename='PCA_alltargets_mahalanobis_transformed_PCA_'+str(n_components)+'_predictions.xlsx'

    # residuals are saved here, one sheet per algorithm
    with pd.ExcelWriter(predictions_filename) as writer:
        df_sgd.to_excel(writer, sheet_name='Linear Regression SGD')
        df_lasso.to_excel(writer, sheet_name='Lasso Regression')
        df_enet.to_excel(writer, sheet_name='Elastic Net')
        df_svm.to_excel(writer, sheet_name='SVM')
        df_rf.to_excel(writer, sheet_name='Random Forest')
        df_mars.to_excel(writer, sheet_name='MARS')
        df_lr.to_excel(writer, sheet_name='LR')

    print('File saved successfully')
    

Study1+Study2 original
---

In [None]:
# Untransformed study1 + study2 features, n_components=0.95 passed to PCA.
if __name__ == '__main__':
    paths = [
        '/mnt/vdb1/datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
        '/mnt/vdb1/datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv',
    ]
    filename = 'Tables/PCA_alltargets_mahalanobis_0.95PC.csv'
    runAllModels(
        pathsArr=paths,
        filename=filename,
        transformation=False,
        n_components=0.95,
    )

In [None]:
# evaluate model on original distribution with 80% variance explained PC
if __name__=='__main__':
    paths=['/mnt/vdb1/datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
                      '/mnt/vdb1/datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv']
    filename='Tables/PCA_alltargets_mahalanobis_0.80PC.csv'
    runAllModels(paths,filename,transformation=False,n_components=0.80)

In [None]:
# evaluate model on original distribution with 3 PC
if __name__=='__main__':
    paths=['/mnt/vdb1/datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
                      '/mnt/vdb1/datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv']
    filename='Tables/PCA_alltargets_mahalanobis_3PC.csv'
    runAllModels(paths,filename,transformation=False,n_components=3)

Study1+Study2 transformation
---

In [None]:
# evaluate model on transformed distribution with 95% variance explained PC
if __name__=='__main__':
    paths=['/mnt/vdb1/datasets/files_generated/Personality/combined_features_data_out_mahalanobis_transformedDistributions.csv']
    filename='Tables/PCA_alltargets_mahalanobis_transformed_0.95PC.csv'
    runAllModels(paths,filename,transformation=True,n_components=0.95)

In [None]:
# evaluate model on transformed distribution with 80% variance explained PC
if __name__=='__main__':
    paths=['/mnt/vdb1/datasets/files_generated/Personality/combined_features_data_out_mahalanobis_transformedDistributions.csv']
    filename='Tables/PCA_alltargets_mahalanobis_transformed_0.80PC.csv'
    runAllModels(paths,filename,transformation=True,n_components=0.80)

In [None]:
# evaluate model on transformed distribution with 3 PC
if __name__=='__main__':
    paths=['/mnt/vdb1/datasets/files_generated/Personality/combined_features_data_out_mahalanobis_transformedDistributions.csv']
    filename='Tables/PCA_alltargets_mahalanobis_transformed_3PC.csv'
    runAllModels(paths,filename,transformation=True,n_components=3)