# Computes the statistical derviatives from the base variables

In [None]:
# %load ../Utilities/UtilityFunctions.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings; warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def readDataFromCsv(file):
    """ Read csv from files
    
    Parameters
    ----------
    file: str
        Filename to be read
    
    Returns
    ----------
    pandas.DataFrame
        Returns the dataframe containing the dataset
    """
    print ("Reading the file from: ",file)
    df = pd.read_csv(file,index_col=False)
    return df

def joinDataset(df1, df2, left_keys, right_keys):
    """Read csv from files
    
    Parameters
    ----------
    df1: pandas.DataFrame
        1st dataframe
    df2: pandas.DataFrame
        2nd dataframe
    left_keys: list
        keys from left table on which joining will be performed
    right_keys: list
        keys from right table on which joining will be performed
    
    
    Returns
    ----------
    pandas.DataFrame
        Returns the dataframe
    """
    import pandas as pd
    df_result= pd.merge(df1, df2, left_on=left_keys, right_on=right_keys)
    print("Join operation successful !")
    return df_result
    


In [None]:
# %load ../Utilities/StatisticalOperations.py
def computeStats(df, groupbyCols=['sessionNr','App','Cond','user_id']):
    """Calculate the summarization of the data
    
    Parameters
    ----------
    df: pandas.DataFrame
        dataframe
    groupbyCols: list
        columns on which group by operation to be performed
        
    Returns
    ----------
    pandas.DataFrame
        Returns the dataframe
    
    """
    import pandas as pd
    df_temp =df.copy()
    df1= df_temp.groupby(by=groupbyCols).mean()
    print("Null values present in the data after computing mean:%s" %df1.isnull().values.any())
    compute(df1,"mean")
    df2= df_temp.groupby(by=groupbyCols).median()
    print("Null values present in the data after computing median:%s"%df2.isnull().values.any())
    compute(df2,"median")
    df3= df_temp.groupby(by=groupbyCols).skew()
    print("Null values present in the data after computing skew:%s"%df3.isnull().values.any())
    compute(df3,"skew")
    df4= df_temp.groupby(by=groupbyCols).apply(pd.DataFrame.kurt)
    print("Null values present in the data after computing kurtosis:%s"%df4.isnull().values.any())
    compute(df4,"kurt")
    df5= df_temp.groupby(by=groupbyCols).quantile()
    print("Null values present in the data after computing interquartile range:%s"% df5.isnull().values.any())
    compute(df5,"quantile")
    df6= df_temp.groupby(by=groupbyCols).std()
    print("Null values present in the data after computing standard deviation:%s"% df6.isnull().values.any())
    compute(df6,"std")
    df7= df_temp.groupby(by=groupbyCols).mad()
    print("Null values present in the data after computing mean absolute deviation:%s"% df7.isnull().values.any())
    compute(df7,"mad")
    df8= df_temp.groupby(by=groupbyCols).max()
    print("Null values present in the data after computing max:%s"% df8.isnull().values.any())
    compute(df8,"max")
    df9= df_temp.groupby(by=groupbyCols).min()
    print("Null values present in the data after computing min:%s"% df9.isnull().values.any())
    compute(df9,"min")
    df10= df_temp.groupby(by=groupbyCols).count()
    print("Null values present in the data after computing count:%s"% df10.isnull().values.any())
    compute(df10,"count")
    
    result = pd.concat([df1,df2,df3,
                        df4,
                        df5,df6,df7,df8,df9,df10], axis=1, sort=False)
    
    result.reset_index(inplace=True)
    result['user_id']=pd.to_numeric(result['user_id'])
    result['sessionNr']=pd.to_numeric(result['sessionNr'])
    print('Is the stats summ null after concating: ',result.isnull().values.any())
    print('%d number of rows and %d columns present in the data'%(result.shape[0],result.shape[1]))
    return result


def compute(df,name):
    """calculate the hit and miss rate
    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe on which the process will be performed
    name: str
        Name of the dataframe
    """
    import pandas as pd
    for col in df.columns:
        df.rename(index=str,columns={col:col+"_"+name},inplace=True)
    if 'user_id'+'_'+name in df.columns:
        df.drop('user_id'+'_'+name,axis=1,inplace=True)
    if 'sessionNr'+'_'+name in df.columns:
         df.drop('sessionNr'+'_'+name,axis=1,inplace=True)
    if 'App'+'_'+name in df.columns:
         df.drop('App'+'_'+name,axis=1,inplace=True)
    if 'Cond'+'_'+name in df.columns :
        df.drop('Cond'+'_'+name,axis=1,inplace=True)
        
def calcHitRate(df,name='Study1'):
    """calculate the hit and miss rate
    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe on which the process will be performed
    name: str
        Name of the dataframe
        
    Returns
    ----------
    pandas.DataFrame
        Returns the dataframe
    """
    sLength = len(df)
    df['hit_rate']=pd.Series(np.zeros(sLength), index=df.index)
    ids = df[(df['button_pressed']==True)&(df['correct_answer']==True)]['id']
    #create views to work on
    df_temp1 = df[df['id'].isin(ids)]
    df_temp2 = df[~df['id'].isin(ids)]
    df_temp1['hit_rate'] = df_temp1['hit_rate'].replace(0,1)
    df = pd.concat([df_temp1,df_temp2],sort=False)
    
    view = df[['sessionNr','hit_rate']]
    view.groupby(by='sessionNr').mean()
    #create dataframe for this
    df_11 = view.groupby(by='sessionNr').mean()
    df_11.reset_index(inplace=True)
    df_11['sessionNr']=pd.to_numeric(df_11['sessionNr'])
    return df_11


In [4]:
def main(path_to_df='../datasets/files_generated/study1_cleandata.csv',
         path_to_ratings="../datasets/rawData.firstTry/UserTraits_study1.csv", 
         path_to_save='../datasets/files_generated/Personality/study1_features_data.csv',name='Study1'):
    '''Calculate the summarization of the data'''
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    import warnings
    warnings.filterwarnings('ignore')
    df = readDataFromCsv(path_to_df)
    print('Columns present after reading the file:',df.columns)
    
    df_temp=df.copy()
    
    df_11=calcHitRate(df_temp)

    df_temp_1=df.select_dtypes(['int64','float64'])
    df_temp_1 = df_temp_1.drop(['appCode','User','sessionNr','id'],axis=1)
    
    print('Columns present after dropping few:',df.columns)
 
    if name =='Study2':
        df = computeStats(df_temp_1,groupbyCols=['user_id'])
    else:
        df = computeStats(df_temp_1,groupbyCols=['user_id'])
    

    df =pd.merge(df, df_11, left_on='user_id', right_on='user_id')
    print('After summarization, %d Number of rows and %d number of columns present in data'%(df.shape[0],df.shape[1]))
    
    df_userTraits = readDataFromCsv(path_to_ratings)
    if name=='Study1':
        df_result = joinDataset(df,df_userTraits.iloc[:,1:],left_keys=['user_id'],
                            right_keys=['user_id'])
    else:
        df_result = joinDataset(df,df_userTraits.iloc[:,1:],left_keys=['user_id'],
                            right_keys=['UserId'])
        
    print('%d Number of rows and %d number of columns present in data'%(df_result.shape[0],df_result.shape[1]))
   
    #save stats summ to the file
    df_result.to_csv(path_to_save,index=False)
    print('File saved in path: %s'%path_to_save)

In [5]:
'''creating features for personality rating using study1'''
if __name__=='__main__':
    path_to_df=['../datasets/files_generated/study1_cleandata.csv',
                '../datasets/files_generated/study1_cleandata_out_Zscore.csv',
                '../datasets/files_generated/study1_cleandata_out_modifiedZscore.csv',
                '../datasets/files_generated/study1_cleandata_out_mahalanobis.csv',
               '../datasets/files_generated/study1_cleandata_out_manhattan.csv']
    path_to_save=['../datasets/files_generated/Personality/study1_features_data.csv',
                  '../datasets/files_generated/Personality/study1_features_data_out_Zscore.csv',
                 '../datasets/files_generated/Personality/study1_features_data_out_modifiedZscore.csv',
                 '../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv',
                 '../datasets/files_generated/Personality/study1_features_data_out_manhattan.csv']
    for descriptors, features in zip(path_to_df,path_to_save):
        main(path_to_df=descriptors,path_to_save=features)
        print('*'*100)
        print('*'*100)
    

Reading the file from:  ../datasets/files_generated/study1_cleandata.csv
Columns present after reading the file: Index(['id', 'timestamp', 'event', 'user_id', 'button_pressed', 'correct_answer', 'appCode', 'App', 'User', 'Cond', 'sessionNr', 'x_location.down', 'y_location.down', 'touch.duration', 'swipe_length', 'swipe_length.x', 'swipe_length.y', 'button_touch_x_location', 'button_touch_y_location', 'target_touch_x_location', 'target_touch_y_location', 'time_between_touches', 'x_location.release', 'y_location.release', 'difference.touch_buttonCenter_x', 'difference.touch_buttonCenter_y', 'touchAccuracy', 'touchAccuracy_x', 'touchAccuracy_y'], dtype='object')
Columns present after dropping few: Index(['id', 'timestamp', 'event', 'user_id', 'button_pressed', 'correct_answer', 'appCode', 'App', 'User', 'Cond', 'sessionNr', 'x_location.down', 'y_location.down', 'touch.duration', 'swipe_length', 'swipe_length.x', 'swipe_length.y', 'button_touch_x_location', 'button_touch_y_location', 'targ

Null values present in the data after computing skew:False
Null values present in the data after computing kurtosis:False
Null values present in the data after computing inter-quantile:False
Null values present in the data after computing standard deviation:False
Null values present in the data after computing mean absolute deviation:False
Null values present in the data after computing max:False
Null values present in the data after computing min:False
Null values present in the data after computing count:False
Is the stats summ null after concating:  False
31 number of rows and 181 columns present in the data
After summarization, 31 Number of rows and 182 number of columns present in data
Reading the file from:  ../datasets/rawData.firstTry/UserTraits_study1.csv
Join operation successful !
31 Number of rows and 198 number of columns present in data
File saved in path: ../datasets/files_generated/Personality/study1_features_data_out_mahalanobis.csv
************************************

In [6]:
'''creating features for personality rating using study2'''
if __name__=='__main__':
    path_to_df=['../datasets/files_generated/study2_cleandata.csv',
                '../datasets/files_generated/study2_cleandata_out_Zscore.csv',
                '../datasets/files_generated/study2_cleandata_out_modifiedZscore.csv',
                '../datasets/files_generated/study2_cleandata_out_mahalanobis.csv',
               '../datasets/files_generated/study2_cleandata_out_manhattan.csv']
    path_to_save=['../datasets/files_generated/Personality/study2_features_data.csv',
                  '../datasets/files_generated/Personality/study2_features_data_out_Zscore.csv',
                 '../datasets/files_generated/Personality/study2_features_data_out_modifiedZscore.csv',
                 '../datasets/files_generated/Personality/study2_features_data_out_mahalanobis.csv',
                 '../datasets/files_generated/Personality/study2_features_data_out_manhattan.csv']
    for descriptors, features in zip(path_to_df,path_to_save):
        main(path_to_df=descriptors,
             path_to_ratings="../datasets/rawData.firstTry/UserTraits_study2.csv",
             path_to_save=features,name='Study2')
        print('*'*100)
        print('*'*100)

Reading the file from:  ../datasets/files_generated/study2_cleandata.csv
Columns present after reading the file: Index(['id', 'timestamp', 'event', 'user_id', 'button_pressed', 'correct_answer', 'appCode', 'App', 'User', 'Size', 'sessionNr', 'x_location.down', 'y_location.down', 'touch.duration', 'swipe_length', 'swipe_length.x', 'swipe_length.y', 'button_touch_x_location', 'button_touch_y_location', 'target_touch_x_location', 'target_touch_y_location', 'time_between_touches', 'x_location.release', 'y_location.release', 'difference.touch_buttonCenter_x', 'difference.touch_buttonCenter_y', 'touchAccuracy', 'touchAccuracy_x', 'touchAccuracy_y'], dtype='object')
Columns present after dropping few: Index(['id', 'timestamp', 'event', 'user_id', 'button_pressed', 'correct_answer', 'appCode', 'App', 'User', 'Size', 'sessionNr', 'x_location.down', 'y_location.down', 'touch.duration', 'swipe_length', 'swipe_length.x', 'swipe_length.y', 'button_touch_x_location', 'button_touch_y_location', 'targ

Null values present in the data after computing median:False
Null values present in the data after computing skew:False
Null values present in the data after computing kurtosis:False
Null values present in the data after computing inter-quantile:False
Null values present in the data after computing standard deviation:False
Null values present in the data after computing mean absolute deviation:False
Null values present in the data after computing max:False
Null values present in the data after computing min:False
Null values present in the data after computing count:False
Is the stats summ null after concating:  False
50 number of rows and 181 columns present in the data
After summarization, 50 Number of rows and 182 number of columns present in data
Reading the file from:  ../datasets/rawData.firstTry/UserTraits_study2.csv
Join operation successful !
45 Number of rows and 212 number of columns present in data
File saved in path: ../datasets/files_generated/Personality/study2_features_