## Cleaning the datasets and combining all the JSON files

In this file we will combine all the datasets together in a single dictionary

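In this simulation there are six different datasets: 
'hlisa_traces', 'gremlins', 'za_proxy', 'survey_desktop', 'random_mouse_bot', 'random_mouse_with_sleep_bot'
(the `directories` list in the loading cell currently reads only the first five; add 'random_mouse_with_sleep_bot' there to include it).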

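By the end we will have a single dictionary, "pos_dict", covering all the datasets. For each dataset it contains 'x', 'y', 't' (the timestamp), 'time' (the time elapsed since the dataset's first sample), 'event' and 'dir', which encodes the direction of the mouse. 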

Please set `PATH` in the first code cell to the folder that contains the dataset directories. 

In [2]:
import os
import json
import pandas as pd
import datetime

#cleaning datasets and combine all the JSON files together

# Root folder that contains one sub-folder per dataset; set it to your own data location.
PATH = 'samples'
# Sub-folders to load, one per dataset.
directories = [
    'hlisa_traces',
    'gremlins',
    'za_proxy',
    'survey_desktop',
    'random_mouse_bot',
]

# Event names to look for in the data files: document-level names first,
# window-level names second. A name's position in `events` is its integer code.
_docEvents = (
    'mousedown mouseup mousemove mouseover mouseout mousewheel wheel'
    ' touchstart touchend touchmove deviceorientation keydown keyup keypress'
    ' click dblclick scroll change select submit reset contextmenu cut copy paste'
)
_winEvents = (
    'load unload beforeunload blur focus resize error abort online offline'
    ' storage popstate hashchange pagehide pageshow message beforeprint afterprint'
)

events = [*_docEvents.split(), *_winEvents.split()]
num_events = len(events)

# The DataFrames built later are keyed by the same names as the directories.
df_names = directories
# Raw JSON documents per dataset: an independent empty list for every directory.
data = {dataset: [] for dataset in directories}

# Parse an integer written with comma separators (used for timestamps)
def str2int(s):
    """Return the integer value of the string *s* after removing every ','."""
    without_commas = "".join(s.split(','))
    return int(without_commas)

# Load every .json file of every dataset directory into `data`
for directory in directories:
    print(directory)

    # Build the folder path with os.path.join so PATH works with or without a
    # trailing separator (plain concatenation turned 'samples' + 'gremlins'
    # into 'samplesgremlins').
    dataset_dir = os.path.join(PATH, directory)

    # Iterate over each file in the dataset folder
    for file in os.listdir(dataset_dir):
        # Make sure we only read .json files
        if not file.endswith('.json'):
            continue

        filepath = os.path.join(dataset_dir, file)
        with open(filepath, 'r') as f:
            # Keep the parsed document unchanged: one list entry per file
            data[directory].append(json.load(f))

# Create a dictionary for quick event name to index conversion
events_index = {event: index for index, event in enumerate(events)}

# One pandas DataFrame per dataset, keyed by directory name
df_dict_init = {}

for name in df_names:
    all_matches = []  # One (userId, timestamp, x, y, eventName) tuple per recorded event

    # Each loaded file is one recording; its position in the list is used as the user id
    for user_id, record in enumerate(data[name]):
        for entry in record['trace']:
            # str2int strips the comma separators; dividing by 1000 converts the
            # value (presumably milliseconds) to the seconds fromtimestamp expects.
            # Note that fromtimestamp returns a naive datetime in local time.
            timestamp_in_seconds = str2int(entry['timestamp']) / 1000
            dt_object = datetime.datetime.fromtimestamp(timestamp_in_seconds)

            # Event names that are not listed in `events` are encoded as -1
            index = events_index.get(entry['event_name'], -1)
            position = entry['position']

            # 'userId' holds the recording index. The earlier version stored the
            # event's position inside its trace here, which did not identify the user.
            all_matches.append((user_id, dt_object, position['x'], position['y'], index))

    # Convert all rows of this dataset into a DataFrame in one go
    df_dict_init[name] = pd.DataFrame(all_matches, columns=['userId', 'timestamp', 'x', 'y', 'eventName'])


hlisa_traces
gremlins
za_proxy
survey_desktop
random_mouse_bot


In [3]:
import pandas as pd
import numpy as np

# Second name for the same dictionary (no copy is made): anything stored in
# df_dict is also visible through df_dict_init.
df_dict=df_dict_init

def replace_zero_with_next_non_zero(arr):
    """Overwrite each zero in *arr* with the nearest non-zero value after it.

    Both the number 0 and the string '0' count as zero. The sequence is
    modified in place and also returned. Zeros that have no non-zero value
    anywhere after them are left unchanged.
    """
    carry = None
    # Walk from the last index down to 0 so the value to copy is already known
    for pos in range(len(arr) - 1, -1, -1):
        current = arr[pos]
        if current != 0 and current != '0':
            carry = current
            continue
        if carry is not None:
            arr[pos] = carry
    return arr
    
# Number of quantile bins requested when the time gaps are discretised
num_bins = 3

# Clean each dataset: order events in time, fill zero coordinates, add time gaps
for df_name in df_names:
    print(df_name)

    # 1. Sort the dataframe by timestamp
    df = df_dict[df_name].sort_values(by='timestamp')

    # Take private, writable copies of the coordinates. to_numpy() may return a
    # view, and with pandas Copy-on-Write that view is read-only, which would
    # make the in-place replacement below fail.
    df_x_temp = df['x'].to_numpy(copy=True)
    df_y_temp = df['y'].to_numpy(copy=True)

    # A zero coordinate (0 or '0') takes the value of the next non-zero
    # coordinate in time order; the result is written back to the DataFrame
    df['x'] = replace_zero_with_next_non_zero(df_x_temp)
    df['y'] = replace_zero_with_next_non_zero(df_y_temp)

    # 2. Compute differences between successive timestamps, in milliseconds
    #    (the first row has no predecessor, so its value is NaN)
    df['time_diff'] = df['timestamp'].diff().dt.total_seconds() * 1000

    # Save the modified DataFrame back into the dictionary
    df_dict[df_name] = df

# Compute the quantile-based bins
# The edges are derived from the 'survey_desktop' time gaps only; the same
# edges are then used for every dataset.
df=df_dict['survey_desktop']
# qcut assigns each gap its quantile interval; unique() + sort_values() keeps
# the distinct intervals in ascending order.
thresholds = pd.qcut(df['time_diff'], q=num_bins, duplicates='drop').unique().sort_values()
# Keep the right edge of every entry except the last one.
# NOTE(review): time_diff contains a NaN (first row of the diff), so the unique
# values presumably include a NaN entry that sorts last. If so, [:-1] drops that
# NaN rather than the top interval, and the top quantile's right edge stays in
# bin_edges (one more bin than num_bins) - confirm this is intended.
# NOTE(review): interval endpoints produced by qcut are rounded to its default
# precision - verify that this is acceptable for the edges.
bin_edges = [interval.right for interval in thresholds[:-1]]
# Add outer edges: -1 below (so a gap of 0 falls inside the first bin) and a
# very large upper bound above.
bin_edges=[-1]+bin_edges+[5000000000]
# One integer label per bin: 0 .. len(bin_edges) - 2
labels = list(range(len(bin_edges) - 1))

# Per-dataset output lists. Every kept row is written (bin label + 1) times.
X_dict = {}
Y_dict = {}
T_dict = {}
Event_dict = {}

# Second pass over the datasets: bin the time gaps and replicate the rows
for name in df_names:
    print(name)
    xs = X_dict[name] = []
    ys = Y_dict[name] = []
    ts = T_dict[name] = []
    evs = Event_dict[name] = []

    df = df_dict[name]

    # Label every time gap with the index of the bin it falls into
    df['time_diff_bins'] = pd.cut(df['time_diff'], bins=bin_edges, labels=labels)

    for _, row in df.iterrows():
        bin_label = row['time_diff_bins']

        # Rows without a bin (NaN label) contribute nothing
        if np.isnan(bin_label):
            continue

        # Bin 0 -> one copy, bin 1 -> two copies, and so on
        copies = int(bin_label) + 1

        # Coordinates stored as strings are parsed to integers first
        x = row['x']
        if type(x) is str:
            x = str2int(x)
        y = row['y']
        if type(y) is str:
            y = str2int(y)

        xs.extend([x] * copies)
        ys.extend([y] * copies)
        ts.extend([row['timestamp']] * copies)
        evs.extend([row['eventName']] * copies)

    print(len(xs))

# Combine the dictionaries into a single dictionary
pos_dict = {'x': X_dict, 'y': Y_dict, 't': T_dict, 'event': Event_dict}


hlisa_traces
gremlins
za_proxy
survey_desktop
random_mouse_bot
hlisa_traces
921721
gremlins
386281
za_proxy
90992
survey_desktop
1589067
random_mouse_bot
150507


In [4]:
# Add two derived series to pos_dict for every dataset:
#   'time' - each sample's timestamp minus the dataset's first timestamp
#   'dir'  - one number per sample combining the event code with the binned
#            x and y displacement since the previous sample
pos_dict['dir']={}
pos_dict['time']={}
import numpy as np
lim = 40   # displacement covered by one direction bin
bins = 2   # bin indices are clipped to [-bins, bins], then shifted to [0, 2*bins]
num_events=len(events)

for name in df_names:
    print(name)

    xs = pos_dict['x'][name]
    ys = pos_dict['y'][name]
    ts = pos_dict['t'][name]

    # An empty dataset has no first sample to measure against; store empty
    # results instead of failing on xs[0] / ts[0].
    if len(xs) == 0:
        pos_dict['dir'][name] = np.zeros(0)
        pos_dict['time'][name] = []
        continue

    # Time of every sample relative to the first sample of this dataset
    t0 = ts[0]
    pos_dict['time'][name] = [t - t0 for t in ts]

    # Step between consecutive samples; prepending the first value makes the
    # first step 0 and keeps the length equal to the number of samples
    x_diff = np.diff(xs, prepend=xs[0])
    y_diff = np.diff(ys, prepend=ys[0])

    # Bin x and y directions
    x_dir_binned = np.clip(np.floor(x_diff / lim), -bins, bins) + bins
    y_dir_binned = np.clip(np.floor(y_diff / lim), -bins, bins) + bins

    # Pack the event code and both direction bins into one number, using
    # num_events as the radix
    pos_dict['dir'][name] = (
        np.asarray(pos_dict['event'][name])
        + x_dir_binned * num_events
        + y_dir_binned * num_events**2
    )


hlisa_traces
gremlins
za_proxy
survey_desktop
random_mouse_bot


In [None]:
# Save pos_dict using pickle to be used in classifiers
import os
import pickle

# Create the output folder first: open() does not create missing directories
os.makedirs('data', exist_ok=True)
with open('data/pos_dict.pickle', 'wb') as handle:
    pickle.dump(pos_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)