# Computes the PCA factor loadings for STUDY1, STUDY2 and the combined dataset

In [1]:
import pandas as pd
import numpy as np
import collections
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
def transformedColumns(normality_test_features_path):
    """Return the feature names flagged as normal in a normality-test table.

    The CSV at ``normality_test_features_path`` is expected to provide a
    ``Features`` column and a ``Normality`` column. Rows whose ``Normality``
    value equals ``True`` are kept, and the names 'SEA', 'PQ', 'ATT', 'HQI',
    'HQ' and 'HQS' are then taken out of the result (first occurrence each).

    Parameters
    ----------
    normality_test_features_path: str
        File path for the normality test for the dataset

    Returns
    -------
    list
        Feature names that satisfy normality, in file order
    """
    print("reading the normal features from path: ",normality_test_features_path)
    normality_table = pd.read_csv(normality_test_features_path)

    # Boolean row mask of the features that passed the normality check.
    passed = normality_table['Normality'] == True
    normal_features = list(normality_table.loc[passed, 'Features'].values)

    # These names must never be returned as features; a name that is not
    # present is simply skipped.
    for excluded in ('SEA', 'PQ', 'ATT', 'HQI', 'HQ', 'HQS'):
        try:
            normal_features.remove(excluded)
        except ValueError:
            pass
    return normal_features

def getFactorLoadings(X, y,filename_to_save):
    """Fit a PCA on the training split of a dataset and save its loadings.

    Steps, in order:

    1. drop columns whose variance is exactly zero;
    2. among the ``float64`` columns, drop every column whose absolute
       correlation with an earlier column is >= 0.80;
    3. split the data 70/30 (shuffled, ``random_state=42``);
    4. standardise the training part and fit a full PCA on it;
    5. write ``pca.components_`` to CSV, transposed so that each row is a
       feature and each column a principal component.

    Parameters
    ----------
    X: pandas.DataFrame
        Dataset containing the feature instances
    y: pandas.Series
        Series containing the labels; it is only passed through the
        train/test split and is not used to fit the PCA
    filename_to_save: str
        Filename where the loadings are to be saved

    Returns
    -------
    None
        The loadings are written to ``filename_to_save``.
    """
    # Remove zero-variance columns.
    X = X.loc[:, X.var() != 0.0]

    # Absolute pairwise correlations; only float64 columns take part, so
    # columns of any other dtype are never dropped for being correlated.
    corr_matrix = X.select_dtypes(['float64']).corr().abs()

    # Keep the strictly upper triangle so that every pair is looked at once.
    # The builtin ``bool`` is used because the ``np.bool`` alias no longer
    # exists in current NumPy releases.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_triangle = corr_matrix.where(upper_mask)

    # Columns correlated at 0.80 or more with any column that precedes them.
    to_drop_cols = [
        column for column in upper_triangle.columns
        if (upper_triangle[column] >= 0.80).any()
    ]

    # Drop those features by label.
    X = X.drop(columns=to_drop_cols)

    print("Shape of the data after removing 0 variance highly correlated data:",X.shape)

    # Only the training part is needed here; y is still passed so the split
    # is performed exactly as for the (X, y) pair.
    X_train, _, _, _ = train_test_split(X, y, test_size=0.30, random_state=42,shuffle=True)

    columns = X_train.columns

    # Standardise with statistics estimated on the training part only.
    X_train_sc = StandardScaler().fit_transform(X_train)

    # No n_components given: PCA keeps all components.
    pca = PCA()
    pca.fit(X_train_sc)
    print('number of principal components:',pca.n_components_)

    # components_ has one row per component; transpose to one row per feature.
    loadings = pd.DataFrame(pca.components_, columns=columns).T
    loadings.to_csv(filename_to_save)
    print("file saved in {}".format(filename_to_save))

In [3]:
# Study 1, original distributions: PCA loadings for the 'Spell' app rows
if __name__ == '__main__':
    target = 'PQ'
    path = '../datasets/files_generated/UX/study1_features_data_out_mahalanobis.csv'
    filename_to_save = 'Tables/PCA-Factor-Loadings/study1_PCA_mahalanobis.csv'

    df = pd.read_csv(path)
    # Keep only the rows recorded for the 'Spell' app.
    df = df.loc[df['App'] == 'Spell']

    # X: everything except the listed non-feature columns; y: the target column.
    X = df.drop(
        columns=['user_id', 'App', 'Cond', 'sessionNr',
                 'SEA', 'PQ', 'ATT', 'HQI', 'HQS', 'HQ'],
    )
    y = df[target]
    print(X.columns)

    getFactorLoadings(X, y, filename_to_save)

Index(['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean',
       'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean',
       'button_touch_x_location_mean', 'button_touch_y_location_mean',
       'target_touch_x_location_mean', 'target_touch_y_location_mean',
       ...
       'target_touch_y_location_count', 'time_between_touches_count',
       'x_location.release_count', 'y_location.release_count',
       'difference.touch_buttonCenter_x_count',
       'difference.touch_buttonCenter_y_count', 'touchAccuracy_count',
       'touchAccuracy_x_count', 'touchAccuracy_y_count', 'hit_rate'],
      dtype='object', length=181)
Shape of the data after removing 0 variance highly correlated data: (186, 89)
number of principal components: 89
file saved in Tables/PCA-Factor-Loadings/study1_PCA_mahalanobis.csv


In [4]:
# Study 1, transformed distributions: PCA loadings for the 'Spell' app rows
if __name__ == '__main__':
    target = 'PQ'
    path = '../datasets/files_generated/UX/study1_features_data_out_mahalanobis_transformedDistributions.csv'
    normality_test_features_path = 'Tables/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv'
    filename_to_save = 'Tables/PCA-Factor-Loadings/study1_PCA_mahalanobis_transformedDistributions.csv'

    df = pd.read_csv(path)
    # Keep only the rows recorded for the 'Spell' app.
    df = df.loc[df['App'] == 'Spell']

    # Names of the features that passed the normality test.
    mahalanobis = transformedColumns(normality_test_features_path)

    # X: everything except the listed non-feature columns; y: the target column.
    X = df.drop(
        columns=['user_id', 'App', 'Cond', 'sessionNr',
                 'SEA', 'PQ', 'ATT', 'HQI', 'HQS', 'HQ'],
    )
    y = df[target]

    # Use only the columns that passed the normality test.
    X = X[mahalanobis]
    print(X.columns)

    getFactorLoadings(X, y, filename_to_save)

reading the normal features from path:  Tables/NormalityCheck/study1_univariate_normality_test_features_mahalanobis_transformed.csv
Index(['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean',
       'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean',
       'button_touch_x_location_mean', 'button_touch_y_location_mean',
       'target_touch_x_location_mean', 'target_touch_y_location_mean',
       ...
       'target_touch_y_location_count', 'time_between_touches_count',
       'x_location.release_count', 'y_location.release_count',
       'difference.touch_buttonCenter_x_count',
       'difference.touch_buttonCenter_y_count', 'touchAccuracy_count',
       'touchAccuracy_x_count', 'touchAccuracy_y_count', 'hit_rate'],
      dtype='object', length=157)
Shape of the data after removing 0 variance highly correlated data: (186, 65)
number of principal components: 65
file saved in Tables/PCA-Factor-Loadings/study1_PCA_mahalanobis_transformedDistributions.csv


In [5]:
# Study 2, original distributions: PCA loadings for the 'Spell' app rows
if __name__ == '__main__':
    target = 'PQ'
    path = '../datasets/files_generated/UX/study2_features_data_out_mahalanobis.csv'
    filename_to_save = 'Tables/PCA-Factor-Loadings/study2_PCA_mahalanobis.csv'

    df = pd.read_csv(path)
    # Keep only the rows recorded for the 'Spell' app.
    df = df.loc[df['App'] == 'Spell']

    # X: everything except the listed non-feature columns; y: the target column.
    X = df.drop(
        columns=['sessionNr', 'App', 'user_id', 'Size', 'UserId', 'Session',
                 'PQ', 'ATT', 'HQI', 'HQS', 'HQ', 'IconSize'],
    )
    y = df[target]
    print(X.columns)

    getFactorLoadings(X, y, filename_to_save)

Index(['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean',
       'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean',
       'button_touch_x_location_mean', 'button_touch_y_location_mean',
       'target_touch_x_location_mean', 'target_touch_y_location_mean',
       ...
       'target_touch_y_location_count', 'time_between_touches_count',
       'x_location.release_count', 'y_location.release_count',
       'difference.touch_buttonCenter_x_count',
       'difference.touch_buttonCenter_y_count', 'touchAccuracy_count',
       'touchAccuracy_x_count', 'touchAccuracy_y_count', 'hit_rate'],
      dtype='object', length=181)
Shape of the data after removing 0 variance highly correlated data: (587, 99)
number of principal components: 99
file saved in Tables/PCA-Factor-Loadings/study2_PCA_mahalanobis.csv


In [6]:
# Study 2, transformed distributions: PCA loadings for the 'Spell' app rows
if __name__ == '__main__':
    target = 'PQ'
    path = '../datasets/files_generated/UX/study2_features_data_out_mahalanobis_transformedDistributions.csv'
    normality_test_features_path = 'Tables/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv'
    filename_to_save = 'Tables/PCA-Factor-Loadings/study2_PCA_mahalanobis_transformedDistributions.csv'

    df = pd.read_csv(path)
    # Keep only the rows recorded for the 'Spell' app.
    df = df.loc[df['App'] == 'Spell']

    # Names of the features that passed the normality test.
    mahalanobis = transformedColumns(normality_test_features_path)

    # X: everything except the listed non-feature columns; y: the target column.
    X = df.drop(
        columns=['sessionNr', 'App', 'user_id', 'Size', 'UserId', 'Session',
                 'PQ', 'ATT', 'HQI', 'HQS', 'HQ', 'IconSize'],
    )
    y = df[target]

    # Use only the columns that passed the normality test.
    X = X[mahalanobis]
    print(X.columns)

    getFactorLoadings(X, y, filename_to_save)

reading the normal features from path:  Tables/NormalityCheck/study2_univariate_normality_test_features_mahalanobis_transformed.csv
Index(['x_location.down_mean', 'y_location.down_mean', 'touch.duration_mean',
       'swipe_length_mean', 'swipe_length.x_mean', 'swipe_length.y_mean',
       'button_touch_x_location_mean', 'button_touch_y_location_mean',
       'target_touch_x_location_mean', 'target_touch_y_location_mean',
       ...
       'target_touch_y_location_count', 'time_between_touches_count',
       'x_location.release_count', 'y_location.release_count',
       'difference.touch_buttonCenter_x_count',
       'difference.touch_buttonCenter_y_count', 'touchAccuracy_count',
       'touchAccuracy_x_count', 'touchAccuracy_y_count', 'hit_rate'],
      dtype='object', length=155)
Shape of the data after removing 0 variance highly correlated data: (587, 65)
number of principal components: 65
file saved in Tables/PCA-Factor-Loadings/study2_PCA_mahalanobis_transformedDistributions.csv


In [13]:
# Combined dataset, original distributions: PCA loadings for the 'Spell' app rows
if __name__ == '__main__':
    target = 'PQ'
    path = '../datasets/files_generated/UX/combined_features_data_out_mahalanobis.csv'
    filename_to_save = 'Combined/Tables/PCA-Factor-Loadings/combined_PCA_mahalanobis.csv'

    # The first CSV column is read as the row index.
    df = pd.read_csv(path, index_col=0)
    # Keep only the rows recorded for the 'Spell' app.
    df = df.loc[df['App'] == 'Spell']

    # X: everything except the listed non-feature columns; y: the target column.
    X = df.drop(
        columns=['sessionNr', 'App', 'user_id', 'Cond',
                 'PQ', 'ATT', 'HQI', 'HQS', 'HQ'],
    )
    y = df[target]
    print(X.columns)

    getFactorLoadings(X, y, filename_to_save)

Index(['button_touch_x_location_count', 'button_touch_x_location_kurt',
       'button_touch_x_location_mad', 'button_touch_x_location_max',
       'button_touch_x_location_mean', 'button_touch_x_location_median',
       'button_touch_x_location_min', 'button_touch_x_location_quantile',
       'button_touch_x_location_skew', 'button_touch_x_location_std',
       ...
       'y_location.release_count', 'y_location.release_kurt',
       'y_location.release_mad', 'y_location.release_max',
       'y_location.release_mean', 'y_location.release_median',
       'y_location.release_min', 'y_location.release_quantile',
       'y_location.release_skew', 'y_location.release_std'],
      dtype='object', length=181)
Shape of the data after removing 0 variance highly correlated data: (773, 101)
number of principal components: 101
file saved in Combined/Tables/PCA-Factor-Loadings/combined_PCA_mahalanobis.csv


In [14]:
# Combined dataset, transformed distributions: PCA loadings for the 'Spell' app rows
if __name__ == '__main__':
    target = 'PQ'
    path = '../datasets/files_generated/UX/combined_features_data_out_mahalanobis_transformedDistributions.csv'
    filename_to_save = 'Combined/Tables/PCA-Factor-Loadings/combined_PCA_mahalanobis_transformedDistributions.csv'

    # NOTE(review): this file is read without index_col=0 and X is not
    # restricted to normality-tested columns - confirm both are intended.
    df = pd.read_csv(path)
    # Keep only the rows recorded for the 'Spell' app.
    df = df.loc[df['App'] == 'Spell']

    # X: everything except the listed non-feature columns; y: the target column.
    X = df.drop(
        columns=['sessionNr', 'App', 'user_id', 'Cond',
                 'PQ', 'ATT', 'HQI', 'HQS', 'HQ'],
    )
    y = df[target]
    print(X.columns)

    getFactorLoadings(X, y, filename_to_save)

Index(['button_touch_x_location_count', 'button_touch_x_location_kurt',
       'button_touch_x_location_mad', 'button_touch_x_location_max',
       'button_touch_x_location_mean', 'button_touch_x_location_median',
       'button_touch_x_location_min', 'button_touch_x_location_quantile',
       'button_touch_x_location_skew', 'button_touch_x_location_std',
       ...
       'y_location.release_count', 'y_location.release_kurt',
       'y_location.release_mad', 'y_location.release_max',
       'y_location.release_mean', 'y_location.release_median',
       'y_location.release_min', 'y_location.release_quantile',
       'y_location.release_skew', 'y_location.release_std'],
      dtype='object', length=181)
Shape of the data after removing 0 variance highly correlated data: (773, 83)
number of principal components: 83
file saved in Combined/Tables/PCA-Factor-Loadings/combined_PCA_mahalanobis_transformedDistributions.csv
