# Computes the factor loadings for the data after PCA

In [3]:
import pandas as pd
import numpy as np
import collections
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# %load MLOperationsUtilities.py
def readDataFromCsv(file):
    """ Load a csv file into a dataframe

    Parameters
    ----------
    file: str
        Path of the csv file to load

    Returns
    ----------
    pandas.DataFrame
        The parsed contents of the file
    """
    # function-scope import: the helper does not rely on a module-level
    # pandas import being present
    import pandas
    print ("Reading the file from: ",file)
    return pandas.read_csv(file)

def loadDataset(paths=None, target='Neuroticism'):
    """ Load one or more feature files and split them into features and target

    A path containing 'study1' or 'study2' is read with readDataFromCsv,
    reduced to its int64/float64 columns and stripped of that study's
    identifier/demographic columns. Any other path is read with its first
    column as the index and additionally loses 'user_id'. The five
    personality labels are removed from the features in every case.

    When several files are given they are concatenated row-wise: study1
    files first, then study2 files, then all remaining files.

    Parameters
    ----------
    paths: array, optional
        Filenames to be read. Defaults to the study1 and study2 feature
        files under '../datasets/files_generated/Personality/'
    target: str
        personality label to be used as the target column

    Returns
    ----------
    dict
        'data' maps to the pandas.DataFrame of features and 'target' to the
        pandas.Series holding the requested personality label

    Raises
    ----------
    ValueError
        If paths is empty
    AssertionError
        If any loaded file, or the combined features/target, contains nulls
    """
    if paths is None:
        # resolved per call instead of sharing one mutable default list
        paths = ['../datasets/files_generated/Personality/study1_features_data.csv',
                 '../datasets/files_generated/Personality/study2_features_data.csv']

    personality_labels = ['Neuroticism', 'Extraversion',
                          'Openness', 'Agreeableness','Conscientiousness']
    loaded = {'study1': [], 'study2': [], 'other': []}
    found_null = False

    for path in paths:
        if 'study1' in path:
            group = 'study1'
            df = readDataFromCsv(path)
            df = df.select_dtypes(['int64','float64']).drop(['VP','age','user_id'],axis=1)
            print('The shape of the data  currently in study1: ',df.shape)
            features = df.drop(personality_labels,axis=1)
        elif 'study2' in path:
            group = 'study2'
            df = readDataFromCsv(path)
            df = df.select_dtypes(['int64','float64']).drop(['user_id','UserId','VP','Age','Handedness_Score'],axis=1)
            print('The shape of the data  currently in study2: ',df.shape)
            features = df.drop(personality_labels,axis=1)
        else:
            group = 'other'
            df = pd.read_csv(path,index_col=0)
            features = df.drop(personality_labels + ['user_id'],axis=1)
        # check every file, not only the last one read
        found_null = found_null or bool(df.isnull().values.any())
        loaded[group].append((features, df[target]))

    parts = loaded['study1'] + loaded['study2'] + loaded['other']
    if not parts:
        raise ValueError('paths must contain at least one file')

    # concat the studies; a single file is returned as loaded
    if len(parts) > 1:
        X = pd.concat([features for features, _ in parts])
        y = pd.concat([labels for _, labels in parts])
    else:
        X, y = parts[0]

    print('The shape of the data after concating both the studies {}'.format(X.shape))
    print('The shape of the target after concating both the studies {}'.format(y.shape))

    # concatenating files with different columns introduces NaNs, so the
    # combined result is checked as well
    found_null = found_null or bool(X.isnull().values.any()) or bool(y.isnull().values.any())
    if found_null:
        # raised explicitly so the check survives `python -O`
        raise AssertionError('Please check for null values')

    df_result={'data':X,'target':y}
    return df_result

In [18]:
def transformedColumns(normality_test_features_path):
    """ Return the features flagged as normal in a normality-test table

    Parameters
    ----------
    normality_test_features_path: str
        csv file with a 'Features' column and a 'Normality' column

    Returns
    ----------
    list
        Values of 'Features' for the rows whose 'Normality' equals True,
        with one occurrence of each personality label removed if present
    """
    print("reading the normal features from path: ",normality_test_features_path)
    table = pd.read_csv(normality_test_features_path)
    is_normal = table['Normality']==True
    selected = list(table.loc[is_normal, 'Features'].values)
    for label in ('Neuroticism', 'Extraversion',
                  'Openness', 'Agreeableness','Conscientiousness'):
        try:
            selected.remove(label)
        except ValueError:
            # this personality label was not among the selected features
            pass
    return selected

def getFactorLoadings(X, y,filename_to_save, corr_threshold=0.80):
    """ Fit a PCA on the training split and save its factor loadings

    Zero-variance columns are removed, then one column of every pair of
    float64 columns whose absolute correlation reaches corr_threshold is
    dropped. The remaining data is split 70/30 (random_state=42), the
    training part is standardised and a full PCA is fitted on it. The
    component matrix is written to csv with one row per feature and one
    column per principal component.

    Parameters
    ----------
    X: pandas.DataFrame
        Feature matrix
    y: pandas.Series
        Target values; only used to perform the train/test split
    filename_to_save: str
        Destination csv file for the loadings
    corr_threshold: float, optional
        Absolute correlation at or above which a feature is dropped
        (default 0.80)

    Returns
    ----------
    None
    """
    # remove 0 variance features
    X = X.loc[:, X.var() != 0.0]

    # Create correlation matrix
    corr_matrix = X.select_dtypes(['float64']).corr().abs()

    # Select upper triangle of correlation matrix so each pair is seen once.
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_triangle = corr_matrix.where(upper_mask)

    # Find feature columns correlated at or above the threshold with an earlier column
    to_drop_cols = [column for column in upper_triangle.columns
                    if (upper_triangle[column] >= corr_threshold).any()]

    # Drop features
    X = X.drop(columns=to_drop_cols)

    print("Shape of the data after removing 0 variance highly correlated data:",X.shape)

    # split the data into train test set; only the training features are used
    X_train = train_test_split(X, y, test_size=0.30, random_state=42,shuffle=True)[0]

    columns = X_train.columns

    # standardise on the training data only
    X_train_sc = StandardScaler().fit_transform(X_train)

    pca= PCA()
    pca.fit(X_train_sc)
    print('number of principal components:',pca.n_components_)

    # transpose so that rows are features and columns are components
    pd.DataFrame(pca.components_,columns=columns).T.to_csv(filename_to_save)
    print("file saved in {}".format(filename_to_save))

In [19]:
# study1+study2 original distributions
if __name__=='__main__':
    target='Neuroticism'
    filename_to_save = 'Tables/PCA-Factor-Loadings/PCA_mahalanobis.csv'
    paths=[
        '../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
        '../datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv',
    ]

    # load both studies, then compute and store the PCA loadings
    result = loadDataset(paths,target=target)
    X, y = result.get('data'), result.get('target')
    getFactorLoadings(X,y,filename_to_save)

Reading the file from:  ../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv
The shape of the data  currently in study1:  (31, 186)
Reading the file from:  ../datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv
The shape of the data  currently in study2:  (45, 186)
The shape of the data after concating both the studies (76, 181)
The shape of the target after concating both the studies (76,)
Shape of the data after removing 0 variance highly correlated data: (76, 78)
number of principal components: 78
file saved in Tables/PCA-Factor-Loadings/PCA_mahalanobis.csv


In [21]:
# study1+study2 transformed distributions
if __name__=='__main__':
    # NOTE(review): `path` is never passed to loadDataset below; it is kept
    # only in case other cells read it. Confirm whether this transformed file
    # was meant to be loaded instead of `paths`.
    path='../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv'
    # Defined here so this cell no longer depends on a `paths` variable left
    # over from an earlier cell; these are the files the recorded run of this
    # cell reports reading.
    paths=['../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
           '../datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv']
    target='Neuroticism'

    normality_test_features_path='Tables/NormalityCheck/combined_univariate_normality_test_features_mahalanobis_transformed.csv'
    mahalanobis = transformedColumns(normality_test_features_path)
    result = loadDataset(paths,target=target)
    X= result.get('data')
    y=result.get('target')

    # use only the columns listed as normal in the normality-test table
    X = X[mahalanobis]
    print(X.columns)

    filename_to_save = 'Tables/PCA-Factor-Loadings/PCA_mahalanobis_transformedDistributions.csv'
    getFactorLoadings(X,y,filename_to_save)

reading the normal features from path:  Tables/NormalityCheck/combined_univariate_normality_test_features_mahalanobis_transformed.csv
Reading the file from:  ../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv
The shape of the data  currently in study1:  (31, 186)
Reading the file from:  ../datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv
The shape of the data  currently in study2:  (45, 186)
The shape of the data after concating both the studies (76, 181)
The shape of the target after concating both the studies (76,)
Index(['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean',
       'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean',
       'button_touch_x_location_mean', 'button_touch_y_location_mean',
       'target_touch_x_location_mean', 'target_touch_y_location_mean',
       ...
       'target_touch_y_location_count', 'time_between_touches_count',
       'x_location.release_count', 'y_locat