In [14]:
import numpy as np
import pandas as pd

from scipy.signal import savgol_filter
from warnings import filterwarnings

# Data preparation

In [15]:
# Raw mouse logs, one semicolon-separated file per recorded activity
data_1 = pd.read_csv('../data/mouseLogs_minigame.csv', sep=';')
data_2 = pd.read_csv('../data/mouseLogs_pointsActivity.csv', sep=';')
data_3 = pd.read_csv('../data/mouseLogs_prototype.csv', sep=';')

# remove unnecessary columns (the same set is dropped from every log)
unused_columns = [
    'id', 'id.1', 'id.2',
    'experience', 'pc_usage', 'employment',
    'creation', 'creation.1', 'creation.2',
    'prototype',
]
for raw_log in (data_1, data_2, data_3):
    raw_log.drop(columns=unused_columns, inplace=True)

In [16]:
# Preview the first rows of the minigame log after the column drop
data_1.head()

Unnamed: 0,sex,age,session_id,position_x,position_y,timestamp,event,next_target_number,is_target,game_number,respondent_id,note
0,man,29,6,880,155,1,mouseup,2,f,1,2,res1_micr1000_sys10
1,man,29,6,876,159,109,mousemove,2,f,1,2,res1_micr1000_sys10
2,man,29,6,867,166,126,mousemove,2,f,1,2,res1_micr1000_sys10
3,man,29,6,860,172,142,mousemove,2,f,1,2,res1_micr1000_sys10
4,man,29,6,847,177,159,mousemove,2,f,1,2,res1_micr1000_sys10


In [17]:
# (rows, columns) of the minigame log
data_1.shape

(286869, 12)

In [18]:
# Column dtypes and non-null counts of the minigame log
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286869 entries, 0 to 286868
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   sex                 286869 non-null  object
 1   age                 286869 non-null  int64 
 2   session_id          286869 non-null  int64 
 3   position_x          286869 non-null  int64 
 4   position_y          286869 non-null  int64 
 5   timestamp           286869 non-null  int64 
 6   event               286869 non-null  object
 7   next_target_number  286869 non-null  int64 
 8   is_target           286869 non-null  object
 9   game_number         286869 non-null  int64 
 10  respondent_id       286869 non-null  int64 
 11  note                286869 non-null  object
dtypes: int64(8), object(4)
memory usage: 26.3+ MB


In [19]:
# (rows, columns) of the points-activity log
data_2.shape

(389234, 12)

In [20]:
# Column dtypes and non-null counts of the points-activity log
data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 389234 entries, 0 to 389233
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   sex                 389234 non-null  object
 1   age                 389234 non-null  int64 
 2   session_id          389234 non-null  int64 
 3   position_x          389234 non-null  int64 
 4   position_y          389234 non-null  int64 
 5   timestamp           389234 non-null  int64 
 6   event               389234 non-null  object
 7   task_id             389234 non-null  object
 8   next_target_number  389234 non-null  int64 
 9   is_target           389234 non-null  object
 10  respondent_id       389234 non-null  int64 
 11  note                389234 non-null  object
dtypes: int64(7), object(5)
memory usage: 35.6+ MB


In [21]:
# (rows, columns) of the prototype log
data_3.shape

(107580, 12)

In [22]:
# Column dtypes and non-null counts of the prototype log
data_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107580 entries, 0 to 107579
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   sex                 107580 non-null  object
 1   age                 107580 non-null  int64 
 2   session_id          107580 non-null  int64 
 3   position_x          107580 non-null  int64 
 4   position_y          107580 non-null  int64 
 5   timestamp           107580 non-null  int64 
 6   event               107580 non-null  object
 7   next_target_number  107580 non-null  int64 
 8   is_target           107580 non-null  object
 9   prototype_id        107580 non-null  object
 10  respondent_id       107580 non-null  int64 
 11  note                107580 non-null  object
dtypes: int64(7), object(5)
memory usage: 9.8+ MB


## Removing duplicate data

In [23]:
# Two rows are duplicates when they share session, timestamp and activity id.
# Pass 1: `duplicated` with its default keep='first' flags every repeat after the
# first occurrence; only the flagged rows that are 'mousemove' events are dropped.
duplicates_1 = data_1[data_1.duplicated(['session_id', 'timestamp', 'game_number'])]
duplicates_2 = data_2[data_2.duplicated(['session_id', 'timestamp', 'task_id'])]
duplicates_3 = data_3[data_3.duplicated(['session_id', 'timestamp', 'prototype_id'])]

data_1.drop(duplicates_1[duplicates_1.event == 'mousemove'].index, inplace=True)
data_2.drop(duplicates_2[duplicates_2.event == 'mousemove'].index, inplace=True)
data_3.drop(duplicates_3[duplicates_3.event == 'mousemove'].index, inplace=True)

# Pass 2: keep=False flags *every* member of a duplicate set that still exists.
# Dropping the 'mousemove' members removes a move that was the first occurrence
# but shares its timestamp with another event; non-move events are never dropped.
duplicates_1 = data_1[data_1.duplicated(['session_id', 'timestamp', 'game_number'], keep=False)]
duplicates_2 = data_2[data_2.duplicated(['session_id', 'timestamp', 'task_id'], keep=False)]
duplicates_3 = data_3[data_3.duplicated(['session_id', 'timestamp', 'prototype_id'], keep=False)]

data_1.drop(duplicates_1[duplicates_1.event == 'mousemove'].index, inplace=True)
data_2.drop(duplicates_2[duplicates_2.event == 'mousemove'].index, inplace=True)
data_3.drop(duplicates_3[duplicates_3.event == 'mousemove'].index, inplace=True)

## Sorting the data

In [24]:
# Order every log by respondent, session, activity and finally time, so that the
# per-group row differences computed later run in chronological order.
data_1, data_2, data_3 = (
    log.sort_values(by=['respondent_id', 'session_id', activity_column, 'timestamp'])
    for log, activity_column in (
        (data_1, 'game_number'),
        (data_2, 'task_id'),
        (data_3, 'prototype_id'),
    )
)

## Calculating basic metrics

In [25]:
# Choose whether to aggregate by task or by move ('task' or 'move').
# 'move' additionally groups by 'next_target_number' in the cells below.
aggregate_by = 'task'

In [26]:
# Fail early with a clear message instead of a "NoneType is not subscriptable"
# error inside the loop when the configuration cell holds a typo.
if aggregate_by not in ('task', 'move'):
    raise ValueError(f"aggregate_by must be 'task' or 'move', got {aggregate_by!r}")

# In 'move' mode every activity is additionally split by the next target number.
move_keys = ['next_target_number'] if aggregate_by == 'move' else []

data_1_grouped = data_1.groupby(['respondent_id', 'session_id', 'game_number'] + move_keys)
data_2_grouped = data_2.groupby(['respondent_id', 'session_id', 'task_id'] + move_keys)
data_3_grouped = data_3.groupby(['respondent_id', 'session_id', 'prototype_id'] + move_keys)

for dataset, grouped_dataset in [(data_1, data_1_grouped), (data_2, data_2_grouped), (data_3, data_3_grouped)]:

    # Coordinate change between consecutive rows of the same group
    # (0 for the first row of every group). Computed once and reused below.
    delta_x = grouped_dataset['position_x'].diff().fillna(0)
    delta_y = grouped_dataset['position_y'].diff().fillna(0)

    #distance metrics
    dataset['pointsDistance'] = np.sqrt(delta_x ** 2 + delta_y ** 2)
    dataset['pointsDistanceX'] = np.abs(delta_x)
    dataset['pointsDistanceY'] = np.abs(delta_y)

    #time metric - the first row of a group falls back to its own timestamp
    dataset['timeDifference'] = grouped_dataset['timestamp'].diff().fillna(dataset['timestamp'])

    #velocity metrics
    # NOTE: `grouped_dataset[...]` is used on columns that were added to `dataset`
    # after the groupby object was created; this relies on the groupby object
    # referencing the same frame (as the original cell did).
    dataset['velocity'] = (dataset['pointsDistance'] / dataset['timeDifference']).fillna(0)
    dataset['velocityX'] = (dataset['pointsDistanceX'] / dataset['timeDifference']).fillna(0)
    dataset['velocityY'] = (dataset['pointsDistanceY'] / dataset['timeDifference']).fillna(0)
    dataset['velocityDifference'] = grouped_dataset['velocity'].diff().fillna(dataset['velocity'])
    dataset['velocityDifferenceX'] = grouped_dataset['velocityX'].diff().fillna(dataset['velocityX'])
    dataset['velocityDifferenceY'] = grouped_dataset['velocityY'].diff().fillna(dataset['velocityY'])

    #other metrics - angle metrics, curvature and acceleration, corners and smooth metrics
    dataset['angle'] = np.arctan2(delta_y, delta_x)
    dataset['angleDifference'] = grouped_dataset['angle'].diff().fillna(0)

    # pace and curvature are defined as 0 for rows where the cursor did not move
    moved = dataset['pointsDistance'] != 0
    dataset['pace'] = (dataset['timeDifference'] / dataset['pointsDistance']).where(moved, 0)
    dataset['angularVelocity'] = (dataset['angleDifference'] / dataset['timeDifference']).fillna(0)
    dataset['curvature'] = (dataset['angleDifference'] / dataset['pointsDistance']).where(moved, 0)

    dataset['acceleration'] = (dataset['velocityDifference'] / dataset['timeDifference']).fillna(0)
    dataset['accelerationX'] = (dataset['velocityDifferenceX'] / dataset['timeDifference']).fillna(0)
    dataset['accelerationY'] = (dataset['velocityDifferenceY'] / dataset['timeDifference']).fillna(0)

    # a corner is a heading change larger than a right angle (positive direction only)
    dataset['corner'] = (dataset['angleDifference'] > np.pi/2).astype('int64')

    # Savitzky-Golay smoothing: window of 9 samples, polynomial order 2.
    # NOTE(review): the filter runs over the whole column, so windows next to a
    # group boundary mix points of neighbouring groups - confirm this is intended.
    dataset['smooth_x'] = savgol_filter(dataset['position_x'].to_numpy(), 9, 2)
    dataset['smooth_y'] = savgol_filter(dataset['position_y'].to_numpy(), 9, 2)
    dataset['smoothDistance'] = np.sqrt((grouped_dataset['smooth_x'].diff().fillna(0) ** 2) + (grouped_dataset['smooth_y'].diff().fillna(0) ** 2))
    dataset['smoothVelocity'] = (dataset['smoothDistance'] / dataset['timeDifference']).fillna(0)

    dataset['changeX'] = delta_x
    dataset['changeY'] = delta_y
    # 1 = moving in the positive direction or standing still, 0 = negative direction
    dataset['directionX'] = (dataset['changeX'] >= 0).astype('int64')
    dataset['directionY'] = (dataset['changeY'] >= 0).astype('int64')
    dataset['directionX_change'] = np.abs(grouped_dataset['directionX'].diff().fillna(0))
    dataset['directionY_change'] = np.abs(grouped_dataset['directionY'].diff().fillna(0))

    # Perpendicular distance of every point from the straight line joining the
    # first and the last point of its group.
    # The values are computed index-aligned. The previous implementation appended
    # them to a list in group-iteration order and assigned the list by position,
    # which misplaces values whenever that order differs from the row order
    # (possible in 'move' mode). It also relied on np.cross for 2-D vectors,
    # which NumPy 2.0 deprecates.
    # 'first'/'last' skip missing values; the position columns are expected to be complete.
    start_x = grouped_dataset['position_x'].transform('first')
    start_y = grouped_dataset['position_y'].transform('first')
    line_x = grouped_dataset['position_x'].transform('last') - start_x
    line_y = grouped_dataset['position_y'].transform('last') - start_y
    line_length = np.sqrt(line_x ** 2 + line_y ** 2)
    # z-component of the cross product line x (point - start)
    cross_z = line_x * (dataset['position_y'] - start_y) - line_y * (dataset['position_x'] - start_x)
    # groups that start and end on the same point have no line: distance is 0
    dataset['perpendicularDistance'] = (np.abs(cross_z) / line_length).where(line_length != 0, 0.0)

In [27]:
# Spot-check the per-row metric columns added to the points-activity log
data_2.head()

Unnamed: 0,sex,age,session_id,position_x,position_y,timestamp,event,task_id,next_target_number,is_target,...,smooth_y,smoothDistance,smoothVelocity,changeX,changeY,directionX,directionY,directionX_change,directionY_change,perpendicularDistance
2143,man,29,6,499,327,398,mousemove,F1,1,f,...,327.442424,0.0,0.0,0.0,0.0,1,1,0.0,0.0,0.0
2144,man,29,6,497,327,417,mousemove,F1,1,f,...,326.293939,2.104621,0.11077,-2.0,0.0,0,1,1.0,0.0,0.666432
2145,man,29,6,496,325,431,mousemove,F1,1,f,...,325.078355,2.038116,0.14558,-1.0,-2.0,0,0,0.0,1.0,0.886052
2146,man,29,6,495,324,442,mousemove,F1,1,f,...,323.795671,1.979904,0.179991,-1.0,-1.0,0,0,0.0,0.0,1.495686
2147,man,29,6,492,322,459,mousemove,F1,1,f,...,322.445887,1.930738,0.113573,-3.0,-2.0,0,0,0.0,0.0,2.381739


## Calculating aggregated metrics

Helper functions

In [28]:
def countIdealDistance(df, activity_identificator):
    """Sum the straight-line distances of all movements contained in ``df``.

    Rows are grouped by respondent, session, activity and ``next_target_number``.
    For every group the Euclidean distance between its first and its last
    recorded position is added to the total.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'respondent_id', 'session_id', 'next_target_number',
        'position_x', 'position_y' and the column named by
        ``activity_identificator``.
    activity_identificator : str
        Name of the column that identifies the activity.

    Returns
    -------
    The accumulated distance; the integer 0 when ``df`` yields no groups.
    """
    segment_keys = ['respondent_id', 'session_id', activity_identificator, 'next_target_number']

    idealDistance = 0
    for _, segment in df.groupby(segment_keys):
        xs = segment['position_x']
        ys = segment['position_y']
        # first row minus last row of the segment, per axis
        delta_x = xs.iloc[0] - xs.iloc[-1]
        delta_y = ys.iloc[0] - ys.iloc[-1]
        idealDistance += np.sqrt(delta_x ** 2 + delta_y ** 2)

    return idealDistance

def countWeightedMean(dataframe, column, weight_column):
    """Return the mean of ``column`` weighted by ``weight_column``.

    The sums are computed with vectorized pandas operations instead of a
    row-by-row ``iterrows`` loop, which is far slower on large groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding both columns.
    column : str
        Name of the column with the values to average.
    weight_column : str
        Name of the column with the weights.

    Returns
    -------
    ``sum(value * weight) / sum(weight)``, or 0 when the weights sum to zero
    (which includes an empty frame). Missing values are not skipped: a NaN in
    either column makes the result NaN, as in the row-wise implementation.
    """
    weights = dataframe[weight_column]
    # skipna=False keeps NaN propagation identical to plain accumulation
    total_weight = weights.sum(skipna=False)

    if total_weight == 0:
        return 0

    weighted_sum = (dataframe[column] * weights).sum(skipna=False)
    return weighted_sum / total_weight

def addDefinedMinMaxMetrics(aggregated_dataset, actual_row, group, column, metrics, colSuffix = ''):
    """Write extreme-value statistics of ``group[column]`` into ``aggregated_dataset``.

    Every entry of ``metrics`` is stored in the column
    ``column + colSuffix + metric`` of the rows selected by ``actual_row``.
    Recognised metrics are 'Max', 'Min', 'Q95' (0.95 quantile) and 'Q5'
    (0.05 quantile); any other name stores 0. The frame is modified in place
    and nothing is returned.
    """
    # evaluated lazily so that only the requested statistics are computed
    reducers = {
        'Max': lambda: group[column].max(),
        'Q95': lambda: group[column].quantile(0.95),
        'Q5': lambda: group[column].quantile(0.05),
        'Min': lambda: group[column].min(),
    }

    for metric in metrics:
        reducer = reducers.get(metric)
        value = 0 if reducer is None else reducer()
        target_column = column + colSuffix + metric
        aggregated_dataset.loc[actual_row, target_column] = value
        
def addDefinedMeanMedianStdMetrics(aggregated_dataset, actual_row, group, column, metrics, colSuffix = ''):
    """Write central-tendency statistics of ``group[column]`` into ``aggregated_dataset``.

    Every entry of ``metrics`` is stored in the column
    ``column + colSuffix + metric`` of the rows selected by ``actual_row``.
    Recognised metrics are 'Mean', 'Median', 'Std' and 'WeightedMean' (mean
    weighted by the 'timeDifference' column via ``countWeightedMean``); any
    other name stores 0. The frame is modified in place and nothing is returned.
    """
    # evaluated lazily so that only the requested statistics are computed
    reducers = {
        'Mean': lambda: group[column].mean(),
        'Median': lambda: group[column].median(),
        'Std': lambda: group[column].std(),
        'WeightedMean': lambda: countWeightedMean(group, column, 'timeDifference'),
    }

    for metric in metrics:
        reducer = reducers.get(metric)
        value = 0 if reducer is None else reducer()
        target_column = column + colSuffix + metric
        aggregated_dataset.loc[actual_row, target_column] = value

def addSettingString(df):
    """Build the setting string of a row from its 'note' value.

    Everything up to and including the first underscore of ``df['note']`` is
    discarded and the remaining underscores are turned into spaces, e.g.
    'res1_micr1000_sys10' becomes 'micr1000 sys10'. A note without any
    underscore gives an empty string.
    """
    _, _, remainder = df['note'].partition('_')
    return remainder.replace('_', ' ')

In [29]:
# Silence warnings for the aggregation; they are switched back on
# (filterwarnings('default')) at the end of the DPI / sensitivity cell.
filterwarnings('ignore')

# Fail early with a clear message when the configuration cell holds a typo.
if aggregate_by not in ('task', 'move'):
    raise ValueError(f"aggregate_by must be 'task' or 'move', got {aggregate_by!r}")

# In 'move' mode every activity is additionally split by the next target number.
move_keys = ['next_target_number'] if aggregate_by == 'move' else []

data_1_grouped = data_1.groupby(['respondent_id', 'session_id', 'game_number'] + move_keys)
data_2_grouped = data_2.groupby(['respondent_id', 'session_id', 'task_id'] + move_keys)
data_3_grouped = data_3.groupby(['respondent_id', 'session_id', 'prototype_id'] + move_keys)

# One output row per group: the group keys followed by the respondent attributes.
data_1_aggregated = pd.DataFrame(columns=['respondent_id', 'session_id', 'game_number'] + move_keys + ['age', 'note', 'sex'])
data_2_aggregated = pd.DataFrame(columns=['respondent_id', 'session_id', 'task_id'] + move_keys + ['age', 'note', 'sex'])
data_3_aggregated = pd.DataFrame(columns=['respondent_id', 'session_id', 'prototype_id'] + move_keys + ['age', 'note', 'sex'])

# Statistic sets understood by addDefinedMinMaxMetrics / addDefinedMeanMedianStdMetrics.
# The order inside every list is the order in which the output columns are created.
EXTREMES = ['Max', 'Min', 'Q95', 'Q5']
CENTRAL = ['Mean', 'Median', 'Std']
CENTRAL_WEIGHTED = ['WeightedMean', 'Mean', 'Median', 'Std']

# (source column, sign filter, column suffix, extreme statistics, central statistics)
# A sign filter of 'positive' / 'negative' restricts the group to rows whose
# source column is > 0 / < 0; None uses the whole group.
metric_plan = [
    ('velocity', None, '', EXTREMES, CENTRAL_WEIGHTED),
    ('velocityX', None, '', EXTREMES, CENTRAL_WEIGHTED),
    ('velocityY', None, '', EXTREMES, CENTRAL_WEIGHTED),
    ('pointsDistance', None, '', EXTREMES, CENTRAL),
    ('pace', None, '', EXTREMES, CENTRAL_WEIGHTED),
    ('acceleration', None, '', EXTREMES, CENTRAL_WEIGHTED),
    ('accelerationX', None, '', EXTREMES, CENTRAL_WEIGHTED),
    ('accelerationY', None, '', EXTREMES, CENTRAL_WEIGHTED),
    # positive and negative acceleration counting
    ('acceleration', 'positive', 'Pos', EXTREMES, CENTRAL_WEIGHTED),
    ('acceleration', 'negative', 'Neg', EXTREMES, CENTRAL_WEIGHTED),
    ('angle', None, '', EXTREMES, CENTRAL),
    ('angularVelocity', None, '', EXTREMES, ['WeightedMean', 'Mean', 'Std', 'Median']),
    ('curvature', None, '', EXTREMES, ['Mean', 'Std', 'Median']),
    ('perpendicularDistance', None, '', ['Max', 'Q95', 'Q5'], CENTRAL),
    ('smoothVelocity', None, '', EXTREMES, CENTRAL_WEIGHTED),
]

for grouped_dataset, aggregated_dataset, activity_identificator in [(data_1_grouped, data_1_aggregated, 'game_number'), (data_2_grouped, data_2_aggregated, 'task_id'), (data_3_grouped, data_3_aggregated, 'prototype_id')]:

    # First pass: create one row per group. This has to finish before any metric
    # column is added, because appending a row needs exactly one value per column.
    for name, group in grouped_dataset:
        row = list(name) + [group['age'].max(), group['note'].max(), group['sex'].max()]
        aggregated_dataset.loc[len(aggregated_dataset)] = row

    # Second pass: fill in the metrics of every group.
    for name, group in grouped_dataset:

        # boolean mask selecting the output row that belongs to this group
        actual_row = (aggregated_dataset['respondent_id'] == name[0]) & (aggregated_dataset['session_id'] == name[1]) & (aggregated_dataset[activity_identificator] == name[2])
        if aggregate_by == 'move':
            actual_row = actual_row & (aggregated_dataset['next_target_number'] == name[3])

        idealDistance = countIdealDistance(group, activity_identificator)

        aggregated_dataset.loc[actual_row, 'directionX_changes'] = group['directionX_change'].sum()
        aggregated_dataset.loc[actual_row, 'directionY_changes'] = group['directionY_change'].sum()

        for column, sign, suffix, extreme_metrics, central_metrics in metric_plan:
            if sign == 'positive':
                subset = group[group[column] > 0]
            elif sign == 'negative':
                subset = group[group[column] < 0]
            else:
                subset = group
            addDefinedMinMaxMetrics(aggregated_dataset, actual_row, subset, column, extreme_metrics, suffix)
            addDefinedMeanMedianStdMetrics(aggregated_dataset, actual_row, subset, column, central_metrics, suffix)

        totalDistance = group['pointsDistance'].sum()
        aggregated_dataset.loc[actual_row, 'totalDist'] = totalDistance
        # last minus first timestamp of the group; .iloc replaces the deprecated
        # int() conversion of a one-element Series
        timestamps = group['timestamp']
        aggregated_dataset.loc[actual_row, 'totalDuration'] = timestamps.iloc[-1] - timestamps.iloc[0]

        # NOTE(review): a group with zero travelled distance gives NaN/inf here
        # (the warning is suppressed above) - presumably the source of the NaNs
        # counted after this cell; verify.
        aggregated_dataset.loc[actual_row, 'distanceRatio'] = idealDistance / totalDistance
        aggregated_dataset.loc[actual_row, 'corners'] = group['corner'].sum()


    aggregated_dataset['setting_string'] = aggregated_dataset.apply(addSettingString, axis=1)
    aggregated_dataset.drop(['note'], axis=1, inplace=True)

    if(activity_identificator == 'task_id' and aggregate_by == 'move'):
        # Remove the moves towards target number 9 in place. Re-assigning the loop
        # variable to a filtered copy (as done before) only rebinds the local name
        # and leaves data_2_aggregated unchanged.
        target_9_rows = aggregated_dataset.index[aggregated_dataset['next_target_number'] == 9]
        aggregated_dataset.drop(target_9_rows, inplace=True)

In [30]:
# Inspect the first aggregated rows of the points-activity data
data_2_aggregated.head()

Unnamed: 0,respondent_id,session_id,task_id,age,sex,directionX_changes,directionY_changes,velocityMax,velocityMin,velocityQ95,...,smoothVelocityQ5,smoothVelocityWeightedMean,smoothVelocityMean,smoothVelocityMedian,smoothVelocityStd,totalDist,totalDuration,distanceRatio,corners,setting_string
0,2,6,F1,29,man,28.0,54.0,7.408029,0.0,2.529521,...,0.033551,0.651954,0.856339,0.544648,0.991569,5124.309547,7676.0,0.687722,14.0,micr1000 sys10
1,2,6,F2,29,man,30.0,28.0,6.26124,0.0,2.585552,...,0.024309,0.52984,0.771897,0.513012,0.892241,3919.92245,7005.0,0.871911,12.0,micr1000 sys10
2,2,6,F3,29,man,40.0,30.0,5.50701,0.0,2.796963,...,0.033555,0.613715,0.775755,0.471992,0.836972,5288.164786,8588.0,0.716713,9.0,micr1000 sys10
3,2,6,R1,29,man,44.0,34.0,6.825811,0.0,2.549859,...,0.023501,0.598664,0.809593,0.482914,0.980869,4299.803825,7182.0,0.800165,12.0,micr1000 sys10
4,2,6,R2,29,man,24.0,32.0,4.979195,0.0,2.431768,...,0.016764,0.573588,0.803601,0.514136,0.815327,4736.55707,7844.0,0.700936,8.0,micr1000 sys10


In [31]:
# Total number of missing cells in each aggregated frame, printed one per line
for aggregated_frame in (data_1_aggregated, data_2_aggregated, data_3_aggregated):
    missing_cells = aggregated_frame.isna().sum().sum()
    print(missing_cells)

0
2
0


In [32]:
# Keep only the non-'warmup' tasks of the points activity and discard every
# row that still contains a missing value.
data_2_aggregated = data_2_aggregated.loc[data_2_aggregated['task_id'] != 'warmup'].dropna()

## DPI and sensitivity cols

In [33]:
# Lookup tables keyed by the setting string derived from the 'note' column
dpis = {'micr1000 sys10': 1000, 'dell2400 sys10': 2400, 'dell2400 sys18': 2400, 'trust4800 sys10': 4800, 'dell2400 sys5': 2400 }
sensitivities = {'micr1000 sys10': 10, 'dell2400 sys10': 10, 'dell2400 sys18': 18, 'trust4800 sys10': 10, 'dell2400 sys5': 5 }

# The columns are created with a vectorized lookup. Assigning `frame.dpi = 0`
# does not create a column: pandas stores a plain Python attribute that later
# shadows attribute access to the real 'dpi' column, so that is not done here.
for aggregated_frame in (data_2_aggregated, data_3_aggregated, data_1_aggregated):
    settings = aggregated_frame['setting_string']

    # Keep failing loudly on a setting missing from the tables; Series.map
    # alone would silently produce NaN instead of raising.
    unknown_settings = set(settings) - (dpis.keys() & sensitivities.keys())
    if unknown_settings:
        raise KeyError(f"no dpi/sensitivity defined for settings: {sorted(unknown_settings)}")

    # float64 keeps the values written to CSV unchanged (e.g. 1000.0)
    aggregated_frame['dpi'] = settings.map(dpis).astype('float64')
    aggregated_frame['sens'] = settings.map(sensitivities).astype('float64')

# Re-enable warnings that were silenced at the start of the aggregation cell.
# Note: this installs the 'default' action; it does not restore an earlier filter state.
filterwarnings('default')

In [34]:
# 'setting_string' was only needed to look up dpi and sensitivity
for aggregated_frame in (data_1_aggregated, data_2_aggregated, data_3_aggregated):
    aggregated_frame.drop(columns=['setting_string'], inplace=True)

## Column renaming

In [42]:
# The final column names follow the pattern '<feature>_<statistic>'. The rename
# map is generated from two small tables and produces exactly the same 121
# old -> new pairs as a hand-written listing.

# statistic suffix used by the aggregation -> suffix of the final column name
statistic_names = {
    'Max': 'max',
    'Min': 'min',
    'Q95': 'q95',
    'Q5': 'q5',
    'WeightedMean': 'mean_weighted',
    'Mean': 'mean',
    'Median': 'median',
    'Std': 'std',
}
all_statistics = tuple(statistic_names)
unweighted_statistics = tuple(s for s in all_statistics if s != 'WeightedMean')
unweighted_without_min = tuple(s for s in unweighted_statistics if s != 'Min')

# (old column prefix, new column prefix, statistics that exist for this feature)
feature_families = [
    ('velocity', 'velocity', all_statistics),
    ('velocityX', 'velocity_x', all_statistics),
    ('velocityY', 'velocity_y', all_statistics),
    ('pointsDistance', 'distance', unweighted_statistics),
    ('pace', 'pace', all_statistics),
    ('acceleration', 'acceleration', all_statistics),
    ('accelerationX', 'acceleration_x', all_statistics),
    ('accelerationY', 'acceleration_y', all_statistics),
    ('accelerationPos', 'acceleration_positive', all_statistics),
    ('accelerationNeg', 'acceleration_negative', all_statistics),
    ('angle', 'angle', unweighted_statistics),
    ('angularVelocity', 'velocity_angular', all_statistics),
    ('curvature', 'curvature', unweighted_statistics),
    ('perpendicularDistance', 'deviation', unweighted_without_min),
    ('smoothVelocity', 'velocity_smooth', all_statistics),
]

# columns that do not follow the '<feature><Statistic>' pattern
renamed_columns = {
    'directionX_changes': 'flip_x_count',
    'directionY_changes': 'flip_y_count',
    'totalDist': 'distance_sum',
    'totalDuration': 'duration_sum',
    'distanceRatio': 'distance_ratio',
    'corners': 'corner_count',
}
for old_prefix, new_prefix, statistics in feature_families:
    for statistic in statistics:
        renamed_columns[old_prefix + statistic] = new_prefix + '_' + statistic_names[statistic]

for aggregated_frame in (data_1_aggregated, data_2_aggregated, data_3_aggregated):
    aggregated_frame.rename(columns=renamed_columns, inplace=True)

In [43]:
# Check the renamed columns on the points-activity aggregate
data_2_aggregated.head()

Unnamed: 0,respondent_id,session_id,task_id,age,sex,flip_x_count,flip_y_count,velocity_max,velocity_min,velocity_q95,...,velocity_smooth_mean_weighted,velocity_smooth_mean,velocity_smooth_median,velocity_smooth_std,distance_sum,duration_sum,distance_ratio,corner_count,dpi,sens
0,2,6,F1,29,man,28.0,54.0,7.408029,0.0,2.529521,...,0.651954,0.856339,0.544648,0.991569,5124.309547,7676.0,0.687722,14.0,1000.0,10.0
1,2,6,F2,29,man,30.0,28.0,6.26124,0.0,2.585552,...,0.52984,0.771897,0.513012,0.892241,3919.92245,7005.0,0.871911,12.0,1000.0,10.0
2,2,6,F3,29,man,40.0,30.0,5.50701,0.0,2.796963,...,0.613715,0.775755,0.471992,0.836972,5288.164786,8588.0,0.716713,9.0,1000.0,10.0
3,2,6,R1,29,man,44.0,34.0,6.825811,0.0,2.549859,...,0.598664,0.809593,0.482914,0.980869,4299.803825,7182.0,0.800165,12.0,1000.0,10.0
4,2,6,R2,29,man,24.0,32.0,4.979195,0.0,2.431768,...,0.573588,0.803601,0.514136,0.815327,4736.55707,7844.0,0.700936,8.0,1000.0,10.0


In [44]:
# Write every aggregated frame to its own semicolon-separated file,
# with a header row and without the index column.
export_targets = (
    (data_1_aggregated, '../data/processed/minigame_aggregated_data.csv'),
    (data_2_aggregated, '../data/processed/points_aggregated_data.csv'),
    (data_3_aggregated, '../data/processed/prototype_aggregated_data.csv'),
)
for aggregated_frame, target_path in export_targets:
    aggregated_frame.to_csv(target_path, sep=';', header=True, index=False)