In [1]:
import numpy as np
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns


np.random.seed(42)

In [2]:
datasetText = open('../input/browserfingerprint/v3.csv', 'r').read()
datasetText = datasetText.replace('id,data\n', '')

# Average Speed (avg_speed)
The typical speed at which the mouse is moving.

Compute the distance between each pair of consecutive points using the Euclidean distance formula:
    
    distance = √[(x₂ – x₁)² + (y₂ – y₁)²]

Divide each distance by the fixed time interval (0.15 seconds) to obtain a speed for that segment:
    
    speed = distance / 0.15

**Answer** = mean of all these speeds.

# Acceleration (acceleration)
How quickly the speed changes over time. It indicates whether the mouse is speeding up or slowing down.

1. Take the sequence of speeds computed previously.
2. For each consecutive pair of speeds, find the change in speed and divide by the time interval (0.15 seconds):
   
    acceleration for a segment = (speed₂ – speed₁) / 0.15

**Answer** = mean of these acceleration values.

# Jerk (jerk)
Rate of acceleration changes. It describes how “smooth” or “jerky” the movement is.

1. Calculate acceleration as described above.
2. For each consecutive pair of acceleration values, compute the difference and divide by the time interval:
   
    jerk for a segment = (acceleration₂ – acceleration₁) / 0.15

**Answer** = mean of these jerk values.

# Curvature (curvature)
How sharply the mouse’s path is turning. A higher curvature means the path is bending more sharply.

1. For each pair of consecutive segments (which gives three consecutive points), compute the direction of movement (angle) using the arctan2 function on the differences in y and x coordinates.
   - Unwrap the angles to avoid jumps (e.g., going from 359° to 0°).
   - Compute the difference between consecutive angles to see how much the direction changes as you move from one segment to the next.
   - For each change, divide the absolute difference in angle by the average of the two segment distances (i.e., (d₁ + d₂)⁄2). This gives a curvature value per turn.
   - Finally, take the mean of all these curvature values to get an overall average curvature.

# Straightness (straightness)
How directly the mouse moves from the start to the end of its trajectory.

1. Compute the total path length by summing all the distances between consecutive points.
2. Calculate the straight-line (Euclidean) distance between the starting point and the ending point.
3. Divide the total path length by the straight-line distance.
4. A value close to 1 indicates that the movement is nearly a straight line; larger values indicate more winding or indirect paths.

# Jitter (jitter)
Variability in movement. It captures how inconsistent or “noisy” the distances between consecutive points are.

1. First, compute all the distances between consecutive points as described for average speed.
2. Then, calculate the standard deviation of these distances, which shows how much they vary from their average value.

# Direction Changes (direction_changes)
Counts the number of times the mouse makes a significant change in its moving direction.

1. Calculate the movement angle for each segment (using arctan2 on the differences in y and x).
2. Determine the difference between consecutive angles (after “unwrapping” to avoid sudden jumps).
3. Count the number of instances where the absolute change in angle is greater than 45° (which is π/4 radians).

In [3]:
datasetTextRows = datasetText.split('\n')
print(len(datasetTextRows))
datasetList = []

for line in datasetTextRows:
    if(line != ""):
        newFormatLine = '{"' + line.split(',{"')[1]
        
        parsed = json.loads(newFormatLine)
        datasetList.append(parsed)



df = pd.DataFrame(datasetList)

for index, row in df.iterrows():
    dataSplit = row['sensorData'].split(';')[1:len(row['sensorData'].split(';')) - 1]
    df.at[index, 'sensorData'] = dataSplit

df = df[df['sensorData'].map(lambda x: len(x) > 4)]
df.reset_index(drop=True, inplace=True)


for index, row in df.iterrows():
    sensorData = row['sensorData']
    totalMousePoints = 0
    
    x_coords = []
    y_coords = []
    
    for data in sensorData:
        if 'scroll' in data:
            pass
        elif 'click' in data:
            pass
        elif 'key' in data:
            pass
        else:
            try:
                mouseAtX, mouseAtY = data.split('-')
                x_coords.append(int(mouseAtX))
                y_coords.append(int(mouseAtY))
                totalMousePoints += 1
            except ValueError:
                continue

    df.at[index, 'totalMousePoints'] = totalMousePoints
    df.at[index, 'x_coords'] = json.dumps(x_coords)
    df.at[index, 'y_coords'] = json.dumps(y_coords)



TIME_INTERVAL = 0.15  # seconds

def compute_curvature(x_coords, y_coords, time_interval=0.15):
    x_coords = np.array(x_coords, dtype=float)
    y_coords = np.array(y_coords, dtype=float)
        
    dx = np.diff(x_coords)
    dy = np.diff(y_coords)
        
    distances = np.sqrt(dx ** 2 + dy ** 2)
    angles = np.arctan2(dy, dx)
    angles = np.unwrap(angles)
    delta_angles = np.diff(angles)

    if len(distances) < 2:
        return 0
        
    avg_lengths = (distances[:-1] + distances[1:]) / 2.0 + 1e-6
        
    curvatures = np.abs(delta_angles) / avg_lengths
        
    mean_curvature = np.mean(curvatures)
        
    return mean_curvature

def calculate_features(df):
    new_features = {
        "path_length": [],
        "avg_speed": [],
        "acceleration": [],
        "jerk": [],
        "curvature": [],
        "straightness": [],
        "idle_time_count": [],
        "jitter": [],
        "direction_changes": []
    }

    for index, row in df.iterrows():
        x_coords = np.array(json.loads(row['x_coords']))
        y_coords = np.array(json.loads(row['y_coords']))

        if len(x_coords) < 2:
            for key in new_features:
                new_features[key].append(0)
            continue

        distances = np.sqrt(np.diff(x_coords) ** 2 + np.diff(y_coords) ** 2)
        total_distance = np.sum(distances)

        speeds = distances / TIME_INTERVAL  # speed in pixels per second
        avg_speed = np.mean(speeds) if len(speeds) > 0 else 0

        accelerations = np.diff(speeds) / TIME_INTERVAL  # acceleration in pixels per second^2
        avg_acceleration = np.mean(accelerations) if len(accelerations) > 0 else 0

        jerks = np.diff(accelerations) / TIME_INTERVAL  # jerk in pixels per second^3
        avg_jerk = np.mean(jerks) if len(jerks) > 0 else 0

        curvature = compute_curvature(x_coords, y_coords, TIME_INTERVAL)
            

        straight_line_distance = np.linalg.norm([x_coords[-1] - x_coords[0], y_coords[-1] - y_coords[0]])
        straightness = total_distance / (straight_line_distance + 1e-6)  # Avoid division by zero

        idle_time_count = np.sum((x_coords[:-1] == x_coords[1:]) & (y_coords[:-1] == y_coords[1:]))

        jitter = np.std(distances) if len(distances) > 1 else 0

        angles = np.arctan2(np.diff(y_coords), np.diff(x_coords))
        angles = np.unwrap(angles)
        direction_changes = np.sum(np.abs(np.diff(angles)) > np.pi / 4)  # Count large direction changes

        # Store values
        new_features["path_length"].append(total_distance)
        new_features["avg_speed"].append(avg_speed)
        new_features["acceleration"].append(avg_acceleration)
        new_features["jerk"].append(avg_jerk)
        new_features["curvature"].append(curvature)
        new_features["straightness"].append(straightness)
        new_features["idle_time_count"].append(idle_time_count)
        new_features["jitter"].append(jitter)
        new_features["direction_changes"].append(direction_changes)

    # Add new columns to DataFrame
    for key, values in new_features.items():
        df[key] = values

    return df

500


In [4]:
df.drop(columns=['userAgent', 'language', 'platform', 
                    'deviceMemory', 'doNotTrack', 
                   'screenResolution', 'colorDepth', 'plugins', 
                 'mimeTypes', 'timezoneOffset',
                'hardwareConcurrency', 'touchSupport', 'webdriver', 'viewportWidth', 'viewportHeight', 'sensorData'], inplace=True)

df = df[df['totalMousePoints'] >= 2]

df = df.reset_index(drop=True)

df = calculate_features(df)

# df = df[df['curvature'] >= 0.08]

df.drop(columns=['idle_time_count', 'x_coords', 'y_coords', 'path_length'], inplace=True)

df = df.drop(columns=["totalMousePoints"])

df = df.reset_index(drop=True)

In [5]:
df

Unnamed: 0,avg_speed,acceleration,jerk,curvature,straightness,jitter,direction_changes
0,145.224903,-1.082483e+01,-52.902900,0.071230,1.206260,25.856432,1
1,50.486546,-4.454495e-01,-27.966524,0.129407,1.185958,10.032445,4
2,57.789384,1.846896e+00,12.598981,0.055746,1.017386,7.588942,0
3,111.853120,-1.780130e+01,-26.337167,0.015053,1.027440,10.035863,1
4,102.069400,1.011985e-14,9.259259,0.018427,1.038153,15.576288,0
...,...,...,...,...,...,...,...
495,34.363565,2.057237e-01,3.471327,0.196513,1.686794,4.645925,3
496,34.648574,-4.753566e+00,18.938798,0.131589,2.611069,3.874536,4
497,46.619860,-5.465560e+00,21.879860,0.031274,1.144504,6.211892,0
498,24.646180,-9.456265e-01,-1.520567,0.056970,1.527381,2.730885,0


In [6]:
data = df.values

In [7]:
# Normalize data (Min-Max Scaling)
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)

# Split into train/test sets
X_train, X_test = train_test_split(data_scaled, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

train_loader = DataLoader(TensorDataset(X_train_tensor), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor), batch_size=32, shuffle=False)

In [8]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4)
        )
        self.decoder = nn.Sequential(
            nn.Linear(4, 8),
            nn.ReLU(),
            nn.Linear(8, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim)
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [9]:
input_dim = X_train.shape[1]
model = Autoencoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
best_loss = float('inf')
patience = 5
patience_counter = 0

num_epochs = 1000
train_losses = []

In [11]:
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        data_batch = batch[0]
        optimizer.zero_grad()
        reconstructed = model(data_batch)
        loss = criterion(reconstructed, data_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    epoch_loss /= len(train_loader)
    train_losses.append(epoch_loss)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.6f}")

    # Early Stopping
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pth")  # Save best model
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

Epoch 1, Loss: 0.151010
Epoch 2, Loss: 0.108592
Epoch 3, Loss: 0.071235
Epoch 4, Loss: 0.043332
Epoch 5, Loss: 0.028037
Epoch 6, Loss: 0.023709
Epoch 7, Loss: 0.021087
Epoch 8, Loss: 0.020945
Epoch 9, Loss: 0.020564
Epoch 10, Loss: 0.020800
Epoch 11, Loss: 0.020539
Epoch 12, Loss: 0.020164
Epoch 13, Loss: 0.020062
Epoch 14, Loss: 0.020247
Epoch 15, Loss: 0.019798
Epoch 16, Loss: 0.019623
Epoch 17, Loss: 0.018978
Epoch 18, Loss: 0.018422
Epoch 19, Loss: 0.017971
Epoch 20, Loss: 0.016864
Epoch 21, Loss: 0.016145
Epoch 22, Loss: 0.014916
Epoch 23, Loss: 0.014278
Epoch 24, Loss: 0.012993
Epoch 25, Loss: 0.012231
Epoch 26, Loss: 0.011789
Epoch 27, Loss: 0.011609
Epoch 28, Loss: 0.011269
Epoch 29, Loss: 0.011236
Epoch 30, Loss: 0.010763
Epoch 31, Loss: 0.010590
Epoch 32, Loss: 0.010545
Epoch 33, Loss: 0.010402
Epoch 34, Loss: 0.010364
Epoch 35, Loss: 0.009981
Epoch 36, Loss: 0.009877
Epoch 37, Loss: 0.009740
Epoch 38, Loss: 0.009548
Epoch 39, Loss: 0.009399
Epoch 40, Loss: 0.009128
Epoch 41,

In [12]:
# Load the best model
model.load_state_dict(torch.load("best_model.pth"))
model.eval()

  model.load_state_dict(torch.load("best_model.pth"))


Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=7, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=4, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=4, out_features=8, bias=True)
    (1): ReLU()
    (2): Linear(in_features=8, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=7, bias=True)
  )
)

In [13]:
# Compute reconstruction error on test set
reconstruction_errors = []
with torch.no_grad():
    train_reconstruction_errors = []
    for batch in train_loader:
        data_batch = batch[0]
        reconstructed = model(data_batch)
        loss = torch.mean((data_batch - reconstructed) ** 2, dim=1)
        train_reconstruction_errors.extend(loss.numpy())
        reconstruction_errors.extend(loss.numpy())

min_error = np.min(train_reconstruction_errors)

threshold = np.percentile(reconstruction_errors, 95)

In [14]:
# --- Function to Detect Anomalies with Confidence Score ---
def detect_anomaly(new_data):
    new_data_scaled = scaler.transform(new_data.reshape(1, -1))
    new_data_tensor = torch.tensor(new_data_scaled, dtype=torch.float32)
    
    with torch.no_grad():
        reconstructed = model(new_data_tensor)
        loss = torch.mean((new_data_tensor - reconstructed) ** 2).item()
    
    # Normalize error into a 0-100% confidence score
    confidence = 100 * (1 - (loss - min_error) / (threshold - min_error))
    confidence = np.clip(confidence, 0, 100)  # Ensure it's between 0-100%

    return confidence

    # print(f"Reconstruction Error: {loss:.6f}")
    # print(f"Confidence Score: {confidence:.2f}%")

    # if confidence >= 50:
    #     print("✅ Likely a Human Movement.")
    # else:
    #     print("🚨 Likely a Bot!")

# Example: Test a custom sample
sample_data = df.iloc[0].values  # Take a random row from the dataset
detect_anomaly(sample_data)

24.46052942912814

In [15]:
sample_data

array([ 1.45224903e+02, -1.08248304e+01, -5.29029003e+01,  7.12304374e-02,
        1.20626020e+00,  2.58564320e+01,  1.00000000e+00])

In [16]:
threshold, min_error

(0.01301986980251962, 9.669427e-05)

In [17]:
avgAccuracy = 0.0
allScores = []
numReal = 0
for i in range(len(df)):
    sData = df.iloc[i].values
    c = detect_anomaly(sData)
    avgAccuracy = avgAccuracy + c
    allScores.append(c)
    if c > 0.5:
        numReal = numReal + 1

avgAccuracy = avgAccuracy / len(df)

In [18]:
numReal

474

In [19]:
len(df)

500

In [20]:
f"{(numReal / len(df)) * 100}% Acc"

'94.8% Acc'

In [21]:
import joblib
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']