In [1]:
def readDataFromCsv(file):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file: str
        Path of the CSV file to load

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file. The file is read with
        ``index_col=False``, so no data column is promoted to the index.
    """
    import pandas as pd

    # Announce the path before reading so a failing read can be traced to
    # the file that caused it.
    print ("Reading the file from: ", file)
    loaded = pd.read_csv(file, index_col=False)
    return loaded

In [2]:
def cleanData(df,name='Study1'):
    """Apply the cleaning rules to the raw touch data.

    The rules are applied in this order:

    1. Drop rows where ``touch.duration``, ``x_location.release`` or
       ``y_location.release`` is missing (tracking errors).
    2. Drop rows where ``button_pressed`` and ``correct_answer`` are both
       True but ``target_height`` is missing (experimental errors).
    4. Zero-fill the button/target touch locations where both
       ``button_pressed`` and ``correct_answer`` are False and
       ``button_touch_y_location`` is missing.
    5. Zero-fill the target touch locations where ``button_pressed`` is
       True and ``target_touch_x_location`` is missing.
    6. Drop rows where ``button_pressed`` is False and ``touchAccuracy_x``
       is missing.
    7. Drop rows where ``time_between_touches`` is missing (start of the
       experiment).
    8. Keep only the exported columns, verify that no null is left, and
       replace the duration/length/difference columns by their magnitude.

    Rules 2, 6 and 7 drop by ``id``: every row sharing an ``id`` with a
    matching row is removed. The input frame is never modified.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe to be cleaned
    name: str
        Name of the data. Not used by the cleaning rules; kept so existing
        callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        Returns cleaned data frame

    Raises
    ------
    AssertionError
        If a null value is still present in the exported columns.
    """
    print('Number of data',df.shape[0])

    # 1. Tracking errors: a row is kept only when all three fields are present.
    print('Checking point 1')
    keep = (df['touch.duration'].notnull()
            & df['x_location.release'].notnull()
            & df['y_location.release'].notnull())
    # Report the rows actually removed (any of the three fields missing),
    # not only the rows where all three are missing at once.
    print('%d Number of rows skipped'%int((~keep).sum()))
    df = df[keep]

    # 2. Pressed + correct but no target_height: purely experimental error.
    ids = df.loc[(df['button_pressed']==True) & (df['correct_answer']==True)
                 & df['target_height'].isnull(), 'id']
    # Explicit copy: the zero-fills below must write into a frame we own,
    # never into a slice of the caller's data.
    df = df[~df['id'].isin(ids)].copy()
    # len(ids) is the number of matching rows; formatting the Series itself
    # with %d raises TypeError unless it happens to hold exactly one id.
    print('%d Number of rows skipped'%len(ids))

    # 4. Impute the missing values where button_pressed = False and
    #    target/button_touch_x/y_location is null
    print('Checking point 4')
    ids = df.loc[(df['button_pressed']==False) & (df['correct_answer']==False)
                 & df['button_touch_y_location'].isnull(), 'id']
    print('%d number of rows filled with 0 '%len(ids))
    df.loc[df['id'].isin(ids),
           ['button_touch_y_location','button_touch_x_location',
            'target_touch_x_location','target_touch_y_location']] = 0

    # Impute the missing values for target_touch_x/y_location where
    # button_pressed=True and target_touch_x_location is null
    print('checking condition button_pressed = true and target_touch_x_location is null')
    ids = df.loc[(df['button_pressed']==True)
                 & df['target_touch_x_location'].isnull(), 'id']
    print('%d number of rows filled with 0 '%len(ids))
    df.loc[df['id'].isin(ids),
           ['target_touch_x_location','target_touch_y_location']] = 0

    # Skip rows where touchAccuracy_x/y and difference.touch_buttonCenter_x/y
    # are missing, because button_width / button_center_x/y is not available.
    ids = df.loc[(df['button_pressed']==False) & df['touchAccuracy_x'].isnull(), 'id']
    df = df[~df['id'].isin(ids)]

    # Skip rows with NA time_between_touches: they mark the start of the
    # experiment.
    ids = df.loc[df['time_between_touches'].isnull(), 'id']
    df = df[~df['id'].isin(ids)]
    print('%d number of rows skipped'%len(ids))

    # Exported columns. swipe_speed, swipe_speed.x, swipe_speed.y and
    # button_center_x/y were commented out of this list in the original and
    # remain excluded.
    columns=['id','timestamp','event','user_id', 'button_pressed', 'correct_answer','appCode','App','User', 'Cond', 'sessionNr',
             'x_location.down','y_location.down','touch.duration','swipe_length','swipe_length.x','swipe_length.y',
             'button_touch_x_location','button_touch_y_location','target_touch_x_location','target_touch_y_location',
             'time_between_touches',
             'x_location.release',  'y_location.release',
             'difference.touch_buttonCenter_x','difference.touch_buttonCenter_y', 'touchAccuracy','touchAccuracy_x', 'touchAccuracy_y'
             ]
    # Copy again so the magnitude conversion below writes into an
    # independent frame.
    df = df[columns].copy()

    print('---Remaining null values in the data---')
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    for col, n_null in null_counts.items():
        print('%s %d'%(col,n_null))
    if null_counts.empty:
        print('0')

    assert not df.isnull().values.any(), 'Please check for null values'

    # Replace signed durations/lengths/differences by their magnitude
    # (absolute value); values are not rounded or cast to integers.
    magnitude_columns = ['touch.duration','swipe_length','swipe_length.x','swipe_length.y','time_between_touches',
                         'difference.touch_buttonCenter_x','difference.touch_buttonCenter_y']
    df[magnitude_columns] = df[magnitude_columns].abs()
    return df

In [3]:
def reject_outliers(df,threshold=3.5, method='Zscore'):
    """Detect outliers with the specified method and return the data without them.

    Only the ``float64`` columns of ``df`` are used to score the rows.

    Supported methods:

    * ``'Zscore'`` - absolute z-score (mean / standard deviation) per
      column; a row is kept only if every column scores below ``threshold``.
    * ``'modifiedZscore'`` - absolute deviation from the column median,
      divided by the column's mean absolute deviation; a row is kept only
      if every column scores below ``threshold``.
    * ``'mahalanobis'`` - scikit-learn ``EllipticEnvelope``
      (``random_state=14``); rows predicted as -1 are removed.
    * ``'manhattan'`` - scikit-learn ``LocalOutlierFactor`` with the
      manhattan metric; rows predicted as -1 are removed.

    The two scikit-learn methods remove rows by ``id``, so ``df`` must have
    an ``id`` column for them.

    NOTE: for the two z-score methods a zero-variance column produces NaN
    scores, and ``NaN < threshold`` is False, so such a column causes every
    row to be rejected.

    Parameters
    -----------
    df: pandas.DataFrame
        Dataframe on which the outliers to be detected and removed
    threshold: float,optional
        Cut-off used by the 'Zscore' and 'modifiedZscore' methods; ignored
        by the other methods
    method: str
        Specifies the method to be used for outlier detection

    Returns
    --------
    pd.DataFrame
        Returns the outlier treated data frame

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    import numpy as np

    supported = ('Zscore', 'modifiedZscore', 'mahalanobis', 'manhattan')
    if method not in supported:
        # Fail with a clear message instead of an UnboundLocalError at the
        # final return.
        raise ValueError(
            "Unknown outlier method %r; expected one of %s" % (method, ', '.join(supported)))

    features = df.select_dtypes('float64')

    if method == 'Zscore':
        from scipy import stats
        # z-score from the column mean and standard deviation
        scores = np.abs(np.asarray(stats.zscore(features)))
        return df[(scores < threshold).all(axis=1)]

    if method == 'modifiedZscore':
        values = features.to_numpy()
        center = features.median().to_numpy()
        # DataFrame.mad() was removed in pandas 2.0. It computed the MEAN
        # absolute deviation around the mean, reproduced here so results
        # are unchanged.
        # NOTE(review): the original inline note described this as the
        # "Median absolute deviation"; if the median-based statistic was
        # intended, this scale (and the outputs) would need to change.
        spread = (features - features.mean()).abs().mean().to_numpy()
        scores = np.abs(np.divide(values - center, spread))
        return df[(scores < threshold).all(axis=1)]

    if method == 'mahalanobis':
        from sklearn.covariance import EllipticEnvelope
        detector = EllipticEnvelope(random_state=14)
        detector.fit(features)
        predictions = np.asarray(detector.predict(features))
    else:
        # 'manhattan': uses knn to compute the neighbours based on
        # manhattan distance
        from sklearn.neighbors import LocalOutlierFactor
        detector = LocalOutlierFactor(metric='manhattan',n_jobs=-1)
        predictions = np.asarray(detector.fit_predict(features))

    # Outliers are labeled as -1 and inliers as 1; drop every row whose id
    # belongs to an outlier.
    outlier_ids = df.loc[predictions == -1, 'id']
    return df[~df['id'].isin(outlier_ids)]

In [5]:
def main():
    """Run the study-1 pipeline: load, clean, save, then reject outliers.

    Reads the raw study-1 CSV, keeps the ``touch.down`` events, cleans them
    with ``cleanData`` and writes the result to
    ``datasets/files_generated/study1_cleandata.csv``. The cleaned data is
    then passed through ``reject_outliers`` once per method, and each result
    is written to ``study1_cleandata_out_<method>.csv`` in the same
    directory, followed by a short report of how many rows were rejected.
    """
    import os
    import warnings
    # Silences all warnings for the rest of the process (kept from the
    # original behaviour).
    warnings.filterwarnings('ignore')

    # Make sure the output directory exists so the to_csv calls below do
    # not fail on a fresh checkout.
    os.makedirs('datasets/files_generated', exist_ok=True)

    raw_study1 = readDataFromCsv("datasets/rawData.firstTry/04_DataExperiment_generalTouchFeatures_study1.csv")
    touch_down_events = raw_study1[raw_study1['event']=="touch.down"]
    print(touch_down_events.shape)

    df = cleanData(touch_down_events,name='Study1')
    print(df.shape)
    df = df.reset_index(drop=True)
    df.to_csv('datasets/files_generated/study1_cleandata.csv',index=False)
    print('Data is cleaned and file is saved')

    methods=['mahalanobis','Zscore','modifiedZscore','manhattan']
    for method in methods:
        df_out = reject_outliers(df,method=method)
        path = "datasets/files_generated/study1_cleandata_out_"+method+".csv"
        df_out.to_csv(path,index=False)
        print('Outliers rejected and file is saved in path',path)
        print("****************Report**************************************")
        print('***********************************************************')
        print('**************************************************************')
        print('%d number of rows rejected due to outliers'%(df.shape[0]-df_out.shape[0]))
        print('%d Total number of rows present in the data'%df_out.shape[0])

In [6]:
# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__=='__main__':
    main()

Reading the file from:  datasets/rawData.firstTry/04_DataExperiment_generalTouchFeatures_study1.csv
(20597, 89)
Number of data 20597
Checking point 1
32 Number of rows skipped
Checking point 4
7329 number of rows filled with 0 
checking condition button_pressed = true and target_touch_x_location is null
0 number of rows filled with 0 
1 number of rows skipped
---Remaining null values in the data---
0
(20428, 29)
Data is cleaned and file is saved
Outliers rejected and file is saved in path datasets/files_generated/study1_cleandata_out_mahalanobis.csv
****************Report**************************************
***********************************************************
**************************************************************
2043 number of rows rejected due to outliers
18385 Total number of rows present in the data
Outliers rejected and file is saved in path datasets/files_generated/study1_cleandata_out_Zscore.csv
****************Report**************************************
******