In [1]:
# import pandas as pd
# df = pd.read_csv('datasets/rawData.firstTry/04_DataExperiment_generalTouchFeatures_study2.csv', index_col=0)
# df['sessionNr_revised']=0

In [2]:
# df['sessionNr_revised']=(df[['user_id','sessionNr']]!= df[['user_id','sessionNr']].shift(1)).any(axis=1).cumsum()
# df.to_csv('datasets/rawData.firstTry/04_DataExperiment_generalTouchFeatures_study2_modified.csv')

In [3]:
def readDataFromCsv(file):
    """Load a CSV file into a DataFrame.

    The path is echoed to stdout before the file is read. ``index_col=False``
    is forwarded to ``pandas.read_csv`` so that no column of the file is used
    as the row index.

    Parameters
    ----------
    file: str
        Path of the CSV file to load

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``
    """
    from pandas import read_csv

    print("Reading the file from: ", file)
    return read_csv(file, index_col=False)

def cleanData(df,name='Study1'):
    """Apply the cleaning rules to a frame of touch events.

    The rules are applied in this order (progress is printed along the way):

    1. Drop rows in which ``touch.duration``, ``x_location.release`` or
       ``y_location.release`` is missing (tracking errors).
    2. Drop rows with ``button_pressed`` and ``correct_answer`` both True but
       ``target_height`` missing (experimental errors).
    3. For rows with ``button_pressed`` False, ``correct_answer`` False and a
       missing ``button_touch_y_location``, set the four button/target touch
       location columns to 0.
    4. For rows with ``button_pressed`` True and a missing
       ``target_touch_x_location``, set both target touch location columns to 0.
    5. Drop rows with ``button_pressed`` False and ``touchAccuracy_x`` missing.
    6. Drop rows with ``time_between_touches`` missing.
    7. Keep only the feature columns listed in ``columns`` below, report any
       remaining nulls, and replace the duration/length/difference columns by
       their absolute values.

    Rows are selected for dropping/filling through their ``id`` value, so every
    row sharing an ``id`` with a matching row is affected as well.

    The input frame is never modified; all writes happen on private copies, so
    no SettingWithCopyWarning is produced and warnings are not silenced here.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to be cleaned
    name: str
        Name of the data. Currently unused; kept so existing callers that pass
        it keep working.

    Returns
    -------
    pandas.DataFrame
        Returns cleaned data frame (original index labels are preserved)

    Raises
    ------
    KeyError
        If a column required by the rules or by the output selection is absent.
    """
    ## check for tracking error and skip those
    # 1. rows lacking touch.duration or the release coordinates are unusable
    n_rows = df.shape[0]
    print('Number of data',n_rows)
    print('Checking point 1')
    complete = (df['touch.duration'].notnull() & df['x_location.release'].notnull()
                & df['y_location.release'].notnull())
    # Report the number of rows actually removed by the filter below (the
    # filter drops a row as soon as any one of the three values is missing).
    print('%d Number of rows skipped'%(n_rows - int(complete.sum())))
    df = df[complete].copy()

    # 2. If the button_pressed=True and Correct_answer=True and target_height is null, skip the rows
    ## as it is purely experimental error
    ids = df.loc[(df['button_pressed']==True) & (df['correct_answer']==True)
                 & df['target_height'].isnull(), 'id']
    # .copy() keeps the later .loc assignments from writing into a slice
    df = df[~df['id'].isin(ids)].copy()

    # 4. Impute the missing values where button_pressed = False and target/button_touch_x/y_location is null
    print('Checking point 4')
    ids = df.loc[(df['button_pressed']==False) & (df['correct_answer']==False)
                 & df['button_touch_y_location'].isnull(), 'id']
    print('%d number of rows filled with 0 '%len(ids))
    df.loc[df['id'].isin(ids),
           ['button_touch_y_location','button_touch_x_location',
            'target_touch_x_location','target_touch_y_location']] = 0

    # Impute the missing values for target_touch_x/y_location where button_pressed=True
    print('checking condition button_pressed = true and target_touch_x_location is null')
    ids = df.loc[(df['button_pressed']==True) & df['target_touch_x_location'].isnull(), 'id']
    print('%d number of rows filled with 0 '%len(ids))
    df.loc[df['id'].isin(ids),
           ['target_touch_x_location','target_touch_y_location']] = 0

    # skip those rows where touchAccuracy_x is missing while no button was pressed,
    # because information of button_width or button_center_x/y is not available
    ids = df.loc[(df['button_pressed']==False) & df['touchAccuracy_x'].isnull(), 'id']
    df = df[~df['id'].isin(ids)]

    # skip the row with NA values for time_between_touches as it is the start of the experiment
    ids = df.loc[df['time_between_touches'].isnull(), 'id']
    df = df[~df['id'].isin(ids)]
    print('%d number of rows skipped'%len(ids))

    # Feature columns that are kept; swipe_speed(.x/.y) and button_center_x/y
    # are deliberately left out.
    columns=['id','timestamp','event','user_id', 'button_pressed', 'correct_answer','appCode','App','User', 'Size', 'sessionNr',
             'x_location.down','y_location.down','touch.duration','swipe_length','swipe_length.x','swipe_length.y',
             'button_touch_x_location','button_touch_y_location','target_touch_x_location','target_touch_y_location',
             'time_between_touches',
             'x_location.release',  'y_location.release',
             'difference.touch_buttonCenter_x','difference.touch_buttonCenter_y', 'touchAccuracy','touchAccuracy_x', 'touchAccuracy_y'
             ]
    df = df[columns].copy()

    print('---Remaining null values in the data---')
    null_counts = df.isnull().sum()
    remaining = null_counts[null_counts > 0]
    for col, n_null in remaining.items():
        print('%s %d'%(col,n_null))
    if remaining.empty:
        print('0')

    # convert the duration/length/difference values to their absolute value (magnitude only)
    magnitude_cols = ['touch.duration','swipe_length','swipe_length.x','swipe_length.y','time_between_touches',
                      'difference.touch_buttonCenter_x','difference.touch_buttonCenter_y']
    df[magnitude_cols] = df[magnitude_cols].abs()

    return df

def reject_outliers(df,threshold=3.5, method='Zscore'):
    """Detect outlier rows with the chosen method and return the frame without them.

    Only the ``float64`` columns of ``df`` take part in the detection; the
    returned frame contains all columns of the rows that were kept.

    Supported methods:

    * ``'Zscore'`` - a row is kept when the absolute z-score
      (``scipy.stats.zscore``, i.e. mean / standard deviation) of every float
      column is below ``threshold``.
    * ``'modifiedZscore'`` - a row is kept when ``|x - median| / MAD`` of every
      float column is below ``threshold``. ``MAD`` here is the *mean* absolute
      deviation around the mean, which is what ``DataFrame.mad()`` (used by the
      previous implementation and removed in pandas 2.0) computed; it is now
      calculated explicitly so results stay the same on every pandas version.
    * ``'mahalanobis'`` - rows labelled ``-1`` by
      ``sklearn.covariance.EllipticEnvelope(random_state=14)`` are dropped.
    * ``'manhattan'`` - rows labelled ``-1`` by
      ``sklearn.neighbors.LocalOutlierFactor(metric='manhattan')`` are dropped.

    For the two score-based methods a NaN score (e.g. from a constant column)
    never compares below the threshold, so such rows are rejected.

    Parameters
    -----------
    df: pandas.DataFrame
        Dataframe on which the outliers to be detected and removed
    threshold: float,optional
        Cut-off used by the 'Zscore' and 'modifiedZscore' methods
    method: str
        Specifies the method to be used for outlier detection
    Returns
    --------
    pd.DataFrame
        Returns the outlier treated data frame
    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    import numpy as np

    df_temp = df.select_dtypes('float64')

    if method=='Zscore':
        # z-score from mean and standard deviation
        from scipy import stats
        zscore = np.abs(np.asarray(stats.zscore(df_temp)))
        keep = (zscore < threshold).all(axis=1)
    elif method == 'modifiedZscore':
        # distance from the column median, scaled by the mean absolute deviation
        values = np.asarray(df_temp)
        median = np.asarray(df_temp.median())
        # explicit replacement for the removed DataFrame.mad()
        mean_abs_dev = np.asarray((df_temp - df_temp.mean()).abs().mean())
        modified_z_scores = np.abs(np.divide(values - median, mean_abs_dev))
        keep = (modified_z_scores < threshold).all(axis=1)
    elif method == 'mahalanobis':
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(random_state=14)
        clf.fit(df_temp)
        # outliers are labeled as -1 and inliers as 1
        keep = np.asarray(clf.predict(df_temp)) != -1
    elif method=='manhattan':
        # uses knn to compute the neighbours based on manhattan distance
        from sklearn.neighbors import LocalOutlierFactor
        lof = LocalOutlierFactor(metric='manhattan',n_jobs=-1)
        keep = np.asarray(lof.fit_predict(df_temp)) != -1
    else:
        raise ValueError(
            "Unknown outlier detection method %r; expected one of "
            "'Zscore', 'modifiedZscore', 'mahalanobis', 'manhattan'" % (method,))

    # Positional boolean mask: works without an 'id' column and never removes
    # an inlier that merely shares an id with an outlier.
    return df[keep]

In [4]:
def main():
    """Run the study-2 pipeline: load, clean, save, then reject outliers.

    Reads the raw study-2 CSV, keeps only the ``touch.down`` events, cleans
    them with ``cleanData`` and writes the result to
    ``datasets/files_generated/study2_cleandata.csv``. Afterwards every
    outlier-rejection method is applied to the cleaned frame and each result
    is written to ``datasets/files_generated/study2_cleandata_out_<method>.csv``
    together with a short report on stdout.

    The output directory is created when it does not exist yet.
    """
    import os
    # Silences every warning for the rest of the process (kept from the
    # original behaviour of this entry point).
    import warnings; warnings.filterwarnings('ignore')

    output_dir = "datasets/files_generated"
    os.makedirs(output_dir, exist_ok=True)

    df_study2 = readDataFromCsv("datasets/rawData.firstTry/04_DataExperiment_generalTouchFeatures_study2.csv")
    df_study2_touch_down= df_study2[df_study2['event']=="touch.down"]
    print(df_study2_touch_down.shape)
    df = cleanData(df_study2_touch_down,name='Study2')
    print(df.shape)
    # cleanData keeps the original index labels; renumber before saving/using
    df = df.reset_index(drop=True)
    df.to_csv(output_dir + '/study2_cleandata.csv',index=False)
    print('Data is cleaned and file is saved')
    methods=['mahalanobis','Zscore','modifiedZscore','manhattan']
    for method in methods:
        # every method starts from the same cleaned frame
        df_out = reject_outliers(df,method=method)
        path = output_dir + "/study2_cleandata_out_"+method+".csv"
        df_out.to_csv(path,index=False)
        print('Outliers rejected and file is saved')
        print("****************Report**************************************")
        print('***********************************************************')
        print('**************************************************************')
        print('%d number of rows rejected due to outliers'%(df.shape[0]-df_out.shape[0]))
        print('%d Total number of rows present in the data'%df_out.shape[0])

In [5]:
# Entry point: run the whole cleaning / outlier-rejection pipeline when this
# code is executed as the main program.
if __name__=='__main__':
    main()

Reading the file from:  datasets/rawData.firstTry/04_DataExperiment_generalTouchFeatures_study2.csv
(25270, 93)
Number of data 25270
Checking point 1
49 Number of rows skipped
Checking point 4
9044 number of rows filled with 0 
checking condition button_pressed = true and target_touch_x_location is null
44 number of rows filled with 0 
1 number of rows skipped
---Remaining null values in the data---
0
(23395, 29)
Data is cleaned and file is saved
Outliers rejected and file is saved
****************Report**************************************
***********************************************************
**************************************************************
2340 number of rows rejected due to outliers
21055 Total number of rows present in the data
Outliers rejected and file is saved
****************Report**************************************
***********************************************************
**************************************************************
2187 number of ro

In [6]:
# df = pd.read_csv('datasets/files_generated/study2_cleandata.csv',index_col=0)
# df[df['target_touch_x_location'].isnull()]

In [16]:
def check(df ,user_col='user_id',col1='sessionNr', col2='sessionNr_revised', sessions_per_user=12):
    """Summarise which sessions each user has in ``df``.

    For every distinct value of ``user_col`` the sorted list of distinct
    ``col1`` values and its length are collected. When ``df`` also has a
    column named ``col2`` the same is done for it; otherwise ``col2`` is
    ignored and the revised total is reported as 0.

    Two lines are printed: the expected number of sessions
    (``number of users * sessions_per_user``) and the totals of distinct
    sessions summed over all users for ``col1`` and ``col2``.

    Side effect: sets the pandas option ``display.max_colwidth`` to 500 so the
    session lists are not truncated when the result is displayed.

    Parameters
    ----------
    df: pandas.DataFrame
        Data containing at least ``user_col`` and ``col1``
    user_col: str
        Column identifying the user
    col1: str
        Column holding the original session number
    col2: str
        Column holding the revised session number; only used when present
    sessions_per_user: int, optional
        Number of sessions each user is expected to have (default 12)

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns ``session`` (list rendered as a
        string) and ``count``; plus ``session_revised`` and ``count_revised``
        when ``col2`` is a column of ``df``.
    """
    import pandas as pd
    pd.set_option('display.max_colwidth',500)

    has_col2 = col2 in df.columns
    users = df[user_col].unique()
    result={}
    sum_col1=0
    sum_col2=0
    for user in users:
        # filter once per user and reuse the slice for both session columns
        user_rows = df[df[user_col]==user]
        sessions_col1 = sorted(user_rows[col1].unique().tolist())
        user_details = {'session': str(sessions_col1)}
        if has_col2:
            sessions_col2 = sorted(user_rows[col2].unique().tolist())
            user_details['session_revised'] = str(sessions_col2)
            sum_col2 = sum_col2+len(sessions_col2)
        user_details['count'] = len(sessions_col1)
        if has_col2:
            user_details['count_revised'] = len(sessions_col2)
        sum_col1 = sum_col1+len(sessions_col1)
        result[user]=user_details

    # expected total if every user completed all sessions
    print(len(users)*sessions_per_user)
    print('original session per user: {} | revised session per user: {}'.format(sum_col1,sum_col2))
    return pd.DataFrame(result).T

In [19]:
# pandas is only imported inside the functions above, so this cell needs its
# own import to run on a fresh kernel.
import pandas as pd

# Session overview for the cleaned study-2 data.
# NOTE(review): main() writes this file with index=False and 'id' as the first
# column, so index_col=0 turns 'id' into the row index here - confirm intended.
df = pd.read_csv('datasets/files_generated/study2_cleandata.csv',index_col=0)
check(df,user_col='user_id',col1='sessionNr')

600
original session per user: 599 | revised session per user: 0


Unnamed: 0,count,session
190,12,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
191,12,"[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]"
192,12,"[25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]"
193,12,"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]"
194,12,"[49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]"
195,12,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72]"
196,12,"[73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]"
197,12,"[85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96]"
198,12,"[97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108]"
199,12,"[109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]"


In [26]:
# pandas is only imported inside the functions above, so this cell needs its
# own import to run on a fresh kernel.
import pandas as pd

# Session overview for the generated UX feature data: number of distinct
# users first, then the per-user session table.
df= pd.read_csv('datasets/files_generated/UX/study2_features_data.csv',index_col=False)
print(len(df.user_id.unique()))
check(df,user_col='user_id',col1='sessionNr')

49
588
original session per user: 587 | revised session per user: 0


Unnamed: 0,count,session
190,12,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
191,12,"[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]"
192,12,"[25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36]"
193,12,"[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]"
194,12,"[49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]"
195,12,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72]"
196,12,"[73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84]"
197,12,"[85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96]"
198,12,"[97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108]"
199,12,"[109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]"


In [9]:
# print(df[(df['button_pressed']==True)& (df['correct_answer']==True) & df['target_height'].isnull()].shape)
# ids= df[(df['button_pressed']==True)& (df['correct_answer']==True) & df['target_touch_y_location'].isnull()]['id']
# df =df[~df['id'].isin(ids)]

#  # 2. If the button_pressed=True and Correct_answer=True and target_height,target_origin is null, skip the rows.
# error_spell = df[(df['button_pressed']==True)& (df['correct_answer']==True)& (df['App']=='Spell') & (df['target_height'].isnull())]['id']
# # error_quiz = df[(df['button_pressed']==True)& (df['correct_answer']==True)& (df['App']=='Quiz') & (df['target_height'].isnull())]['id']
# print('Checking point 2')
# if(len(error_spell)>1):
#     df =df[~df['id'].isin(error_spell)]
#     print('%d number of rows skipped'%len(error_spell))
# # if(len(error_quiz)>1):
# #     df =df[~df['id'].isin(error_quiz)]
# #     print('%d number of rows skipped'%len(error_quiz))




In [10]:
# ids=df[(df['button_pressed']==False) &(df['correct_answer']==False)&(df['button_origin_x'].isnull())]['id']
# df.loc[df['id'].isin(ids),'button_touch_y_location']=0
# df.loc[df['id'].isin(ids),'button_touch_x_location']=0
# df.loc[df['id'].isin(ids),'target_touch_x_location']=0
# df.loc[df['id'].isin(ids),'target_touch_y_location']=0
# # df[(df['button_pressed']==False) &(df['correct_answer']==False)&(df['button_touch_x_location'].isnull())]['id']
# df[(df['button_pressed']==False) &(df['correct_answer']==False)&(df['button_origin_x'].isnull())]
# # df[(df['button_pressed']==False) &(df['correct_answer']==False)&(df['button_origin_x'].isnull())]['button_touch_y_location'].fillna(0,inplace=True)
# # df[(df['button_pressed']==False) &(df['correct_answer']==False)&(df['button_origin_x'].isnull())]['target_touch_y_location'].fillna(0,inplace=True)
# # df[(df['button_pressed']==False) &(df['correct_answer']==False)&(df['button_origin_x'].isnull())]['target_touch_y_location'].fillna(0,inplace=True)

In [11]:
# print('Checking point 3')
# ids=df[(df['button_touch_x_location'].notnull()) & (df['target_touch_x_location'].isnull())]['id']
# df.loc[df['id'].isin(ids),'target_touch_x_location']=0
# df.loc[df['id'].isin(ids),'target_touch_y_location']=0

# df.info()

In [12]:
# df[(df['button_touch_x_location'].notnull()) & (df['target_touch_x_location'].isnull())]
# df.info()

In [13]:
# print(df[(df['button_pressed']==False) & (df['touchAccuracy_x'].isnull())].shape)
# print(df[(df['button_pressed']==False) & (df['touchAccuracy_y'].isnull())].shape)
# df[(df['button_pressed']==False) & (df['difference.touch_buttonCenter_x'].isnull())].shape
# df[(df['button_pressed']==False) & (df['difference.touch_buttonCenter_y'].isnull())].shape
# ids=df[(df['button_pressed']==False) & (df['touchAccuracy_x'].isnull())]['id']
# df =df[~df['id'].isin(ids)]

In [14]:
# df[df['button_touch_x_location']==0]

# df.info()