# References

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
import constants
import os
from dotenv import load_dotenv
import matplotlib.cm as cm
import scipy.io as sp
import pprint as pp
from scipy.signal import welch
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from datetime import datetime

In [2]:
import sys
# Print NumPy arrays in full (no "..." summarization), on wide lines, in fixed-point notation.
np.set_printoptions(threshold=sys.maxsize, linewidth=300, suppress=True)
# Let pandas show up to 500 characters of a column value before truncating it.
pd.set_option('display.max_colwidth', 500)

# Constants

In [None]:
# some calculations
# for the dataset, 10 seconds = 1280 frames. 1 second = 128 frames. We can use this to find the number of seconds that were taken to record the signature.

recording_samp_rate = 128 # frames per second
per_phase_frames = 1280 # frames in one 10-second phase (10 s * 128 frames/s)

# Fetching raw data (Sign + EEG)

In [3]:
# Read variables from a local .env file into the process environment.
load_dotenv()
# Root directory of the dataset; this is None when DATASET_PATH is not set.
dataset_path = os.getenv('DATASET_PATH')

def get_dataset_files_and_user_ids(data_category = constants.GENUINE, data_type = constants.TRAIN):
    """Walk `dataset_path` and collect the `.mat` files of one category.

    Args:
        data_category: `constants.GENUINE` or `constants.FORGED`; only folders
            whose name equals this value contribute files.
        data_type: currently unused (the train/test split lookup is disabled).

    Returns:
        A tuple `(files_mat, user_ids, labels)`:
        - files_mat: full paths of the `.mat` files found in matching folders,
        - user_ids: names of every visited folder that is neither a category
          folder nor 'SignEEGv1.0',
        - labels: `data_category` repeated once per entry of `files_mat`.
    """
    category_dirs = (constants.GENUINE, constants.FORGED)
    non_user_dirs = category_dirs + ('SignEEGv1.0',)

    files_mat = []
    labels = []
    user_ids = []

    for root, _dirs, files in os.walk(dataset_path):
        folder = os.path.basename(root)

        # A category folder only counts when it is the requested category.
        if folder == data_category and folder in category_dirs:
            mat_paths = [os.path.join(root, name) for name in files if name.endswith('.mat')]
            files_mat += mat_paths
            labels += [data_category] * len(mat_paths)

        # Every other folder name (except the dataset folder) is recorded as a user id.
        if folder not in non_user_dirs:
            user_ids.append(folder)

    return files_mat, user_ids, labels

In [4]:
# # All files

# Collect the genuine and the forged recordings separately (paths, folder names, labels).
files_mat_genuine, user_ids_genuine, genuine_labels = get_dataset_files_and_user_ids(data_category=constants.GENUINE)
files_mat_forged, user_ids_forged, forged_labels = get_dataset_files_and_user_ids(data_category=constants.FORGED)

# Merge both categories. NOTE: `extend` mutates the genuine lists in place, so
# `files_mat_appended` / `labels_appended` are the very same list objects as
# `files_mat_genuine` / `genuine_labels` after this cell.
files_mat_genuine.extend(files_mat_forged)
files_mat_appended = files_mat_genuine
genuine_labels.extend(forged_labels)
labels_appended = genuine_labels

# sign_data_dict, eeg_data_dict, labels = get_sig_eeg_data_dicts()

# # print(labels_appended)
# # # print(len(labels_appended))

In [None]:

# Shuffle files and labels with ONE shared permutation so every path keeps its label.
# A fixed seed makes the resulting order reproducible across kernel restarts.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

files_all = np.array(files_mat_appended)
labels_all = np.array(labels_appended)

indices = shuffle_rng.permutation(len(files_all))

files_mat_appended = files_all[indices]
labels_appended = labels_all[indices]

In [None]:
# print(len(files_mat_appended))
# Load the first file in `files_mat_appended` to look at its structure.
to_check = sp.loadmat(files_mat_appended[0])
print("User ID: ", to_check['subject']['SubjectID'])
print("Sign Data: ", to_check['subject']['SignWacom'][0][0].shape)
print("EEG Data: ", to_check['subject']['ICA_EEG'][0][0].shape)
# Keep the two arrays around for the exploratory cells that follow.
sign_data_test = to_check['subject']['SignWacom'][0][0]
eeg_data_test = to_check['subject']['ICA_EEG'][0][0]

# np.delete(sign_data_test, 0, axis = 1)


In [None]:
print(sign_data_test.shape)
# Transpose so the frame count can be read from axis 0
# (assumes ICA_EEG is stored as (channels, frames) -- TODO confirm).
eeg_data_test = eeg_data_test.T
total_frames_eeg = eeg_data_test.shape[0]
# Negative start index = number of trailing frames left after whole 1280-frame phases.
# NOTE(review): when the remainder is 0 this is -0 == 0 and the slice below returns every frame.
roi_frames_start = -(total_frames_eeg % per_phase_frames)
print(eeg_data_test[roi_frames_start:].shape)
# print(eeg_data_test[(total_frames_eeg % per_phase_frames) - 1 :, :].T.shape)

In [None]:
# small note: np.delete(axis = 1) will delete a column, axis = 0 will delete a row. be careful

In [None]:
# Drop the first column of the signature array.
# NOTE: despite the variable name, no normalization is performed here.
normalized_sign_data = np.delete(sign_data_test, 0, axis = 1)
print(normalized_sign_data)

## Getting user based dictionaries with sign and EEG data

In [5]:
def get_sig_eeg_data_dicts(mat_files, labels, max_sign_len=3000):
    """Load signature and EEG recordings from `.mat` files and group them by user.

    Args:
        mat_files: iterable of paths to `.mat` files. Each file must contain a
            `subject` struct with `SubjectID`, `SignWacom` and `ICA_EEG` fields.
        labels: iterable aligned with `mat_files`. A value equal to
            `constants.GENUINE` is encoded as 0.0, anything else as 1.0.
        max_sign_len: signatures with more rows than this are reported on
            stdout and skipped as outliers (default 3000).

    Returns:
        A tuple `(sign_data_dict, eeg_data_dict, user_labels)`, each keyed by the
        user id string:
        - sign_data_dict: list of float32 tensors, the signature array with its
          first column removed,
        - eeg_data_dict: list of float32 tensors, the trailing part of the
          transposed EEG array (see the ROI note below),
        - user_labels: 1-D float32 tensor with one label per kept recording.
    """
    sign_data_dict = {}
    eeg_data_dict = {}
    label_lists = {}

    for mat_file, label in zip(mat_files, labels):
        subject = sp.loadmat(mat_file)['subject']
        user_id = str(subject['SubjectID'][0][0][0])
        sig_data = subject['SignWacom'][0][0]

        # Reject outliers before doing any conversion work. Dropping a column
        # below does not change the row count, so checking here is equivalent.
        if sig_data.shape[0] > max_sign_len:
            print("Caught you!!!")
            print("User ID: ", user_id)
            print("File: ", mat_file)
            continue # Skip these files because it's too long, outlier

        # removing unwanted columns from sign data
        sig_data = torch.from_numpy(np.delete(sig_data, 0, axis=1)).to(dtype=torch.float32)

        # getting part of eeg data during which signature was recorded (ROI):
        # the frames left over after whole `per_phase_frames`-sized phases.
        # NOTE(review): a remainder of 0 yields start index 0, i.e. the whole recording.
        eeg_ica_data = subject['ICA_EEG'][0][0].T
        roi_frames_start = -(eeg_ica_data.shape[0] % per_phase_frames) if per_phase_frames > 0 else 0
        eeg_ica_data = torch.from_numpy(eeg_ica_data[roi_frames_start:]).to(dtype=torch.float32)

        sign_data_dict.setdefault(user_id, []).append(sig_data)
        eeg_data_dict.setdefault(user_id, []).append(eeg_ica_data)
        label_lists.setdefault(user_id, []).append(0. if label == constants.GENUINE else 1.)

    # Build each label tensor once instead of re-concatenating on every file.
    user_labels = {
        user_id: torch.tensor(values, dtype=torch.float32)
        for user_id, values in label_lists.items()
    }

    return sign_data_dict, eeg_data_dict, user_labels


In [None]:
# def normalize_sign_data(data): # for 2D array only
#     mean = np.mean(data[:, 2:], axis = 0) # not normalizing the stroke and btn columns
#     std = np.std(data[:, 2:], axis = 0)
#     std = np.where(std == 0, 1, std)
#     data = (data[:, 2:] - mean) / std
#     print(data)

# Sign Data Classification Process

## Sign data preprocessing

In [7]:
def normalize_sign_data_dict(sign_data_dict):
    """Z-score every signature column from index 2 onward, per signature.

    The first two columns are copied through unchanged; every remaining column
    is centred on its own mean and divided by its own standard deviation.

    Args:
        sign_data_dict: dict mapping user id -> list of 2-D float tensors
            shaped (frames, columns).

    Returns:
        dict with the same keys, each holding a list of float32 tensors of the
        same shape as the inputs.
    """
    normalized_sign_data_dict = {}
    for user_id, sign_list in sign_data_dict.items():
        normalized_list = []
        for sign_data in sign_list:
            passthrough = sign_data[:, 0:2]
            features = sign_data[:, 2:]

            mean = torch.mean(features, dim=0)
            # The sample standard deviation is undefined (NaN) for fewer than
            # two rows; treat that case like a constant column.
            if features.shape[0] > 1:
                std = torch.std(features, dim=0)
            else:
                std = torch.ones_like(mean)
            # Constant columns would divide by zero: use 1 so they map to 0.
            # `ones_like` keeps the dtype and device of the data.
            std = torch.where((std == 0) | torch.isnan(std), torch.ones_like(std), std)

            normalized = (features - mean) / std
            normalized_list.append(torch.cat([passthrough, normalized], dim=1).to(dtype=torch.float32))
        normalized_sign_data_dict[user_id] = normalized_list
    return normalized_sign_data_dict

In [None]:
# the data dict for each user will have two lists: 'cls_tokens' and 'data'

## Sign Data Feature Extraction

In [8]:
def get_sign_data_features(sign_data_raw_dict):
    """Build per-frame and per-signature features for every signature.

    The raw signatures are normalized with `normalize_sign_data_dict`; callers
    must therefore pass the RAW dict, not an already normalized one.

    Column layout used by this function (after the caller dropped the first
    `.mat` column): 0 = stroke id, 1 = pen-down flag, 2 = x, 3 = y,
    4 = pressure, 5 = azimuth, 6 = altitude.

    Args:
        sign_data_raw_dict: dict mapping user id -> list of 2-D float tensors.

    Returns:
        dict mapping user id -> {'cls_tokens': [...], 'data': [...]} where
        - each cls token is a float32 tensor of 14 summary values, in order:
          num_strokes, height, width, height/width, centroid x, centroid y,
          mean pressure, mean azimuth, mean altitude, mean vx, mean vy,
          mean ax, mean ay, max pressure;
        - each data tensor is the normalized signature with three extra
          columns appended: velocity, acceleration, log curvature radius.
        Statistics over pen-down frames are 0 when there are no such frames.
    """
    sign_data_dict = normalize_sign_data_dict(sign_data_raw_dict)
    eps = 1e-10

    def _zero():
        # Fresh float32 scalar used as the neutral value for empty selections.
        return torch.tensor(0.0, dtype=torch.float32)

    def _mean_or_zero(values):
        # Mean of a 1-D tensor, or 0 (instead of NaN) when it is empty.
        return torch.mean(values) if values.numel() > 0 else _zero()

    def _extent(values):
        # max - min of a 1-D tensor, or 0 when it is empty.
        return torch.max(values) - torch.min(values) if values.numel() > 0 else _zero()

    sign_data_with_features = {}
    for user_id, normalized_list in sign_data_dict.items():
        cls_tokens = []
        augmented = []
        for sign_data, normalized_sign_data in zip(sign_data_raw_dict[user_id], normalized_list):
            # Already a tensor: convert the dtype instead of re-wrapping it
            # with torch.tensor(), which copies and emits a UserWarning.
            normalized_sign_data = normalized_sign_data.to(dtype=torch.float32)

            # --- kinematics on the normalized trajectory (unit time step) ---
            norm_x = normalized_sign_data[:, 2]
            norm_y = normalized_sign_data[:, 3]
            vx = torch.gradient(norm_x)[0]
            vy = torch.gradient(norm_y)[0]
            velocity = torch.sqrt(vx ** 2 + vy ** 2)
            ax = torch.gradient(vx)[0]
            ay = torch.gradient(vy)[0]
            acceleration = torch.sqrt(ax ** 2 + ay ** 2)

            # log curvature radius: log(|speed / d(theta)|). Zeros are replaced
            # by a tiny epsilon so neither the division nor the log blows up.
            speed = torch.where(velocity == 0, torch.full_like(velocity, eps), velocity)
            theta = torch.atan2(vy, vx)
            dtheta = torch.gradient(theta)[0]
            dtheta = torch.where(dtheta == 0, torch.full_like(dtheta, eps), dtheta)
            log_curv_radius = torch.log(torch.abs(speed / dtheta) + eps)

            # --- static features over the pen-down frames ---
            pendown_frames = normalized_sign_data[:, 1] == 1
            norm_down = normalized_sign_data[pendown_frames]
            raw_down = sign_data[pendown_frames]

            num_strokes = torch.unique(norm_down[:, 0]).shape[0]
            x_down = norm_down[:, 2]
            y_down = norm_down[:, 3]
            sign_height = _extent(y_down)
            sign_width = _extent(x_down)
            height_width_ratio = sign_height / sign_width if sign_width != 0 else _zero()

            # Pressure / azimuth / altitude are summarized from the RAW values.
            pressure = raw_down[:, 4]
            azimuth = raw_down[:, 5]
            altitude = raw_down[:, 6]
            max_pressure = torch.max(pressure) if pressure.numel() > 0 else _zero()

            summary = (
                num_strokes, sign_height, sign_width, height_width_ratio,
                _mean_or_zero(x_down), _mean_or_zero(y_down),
                _mean_or_zero(pressure), _mean_or_zero(azimuth), _mean_or_zero(altitude),
                torch.mean(vx), torch.mean(vy), torch.mean(ax), torch.mean(ay),
                max_pressure,
            )
            cls_token = torch.stack([torch.as_tensor(value, dtype=torch.float32) for value in summary])

            sign_data_aug = torch.cat(
                [normalized_sign_data, velocity.unsqueeze(1), acceleration.unsqueeze(1), log_curv_radius.unsqueeze(1)],
                dim=1,
            )

            cls_tokens.append(cls_token)
            augmented.append(sign_data_aug)

        sign_data_with_features[user_id] = {'cls_tokens': cls_tokens, 'data': augmented}

    return sign_data_with_features

In [9]:
def get_sign_max_seq_len(sign_data):
    """Return the largest first-dimension length among all users' 'data' entries.

    Args:
        sign_data: dict mapping user -> dict with a 'data' list of array-like
            objects exposing `.shape`.

    Returns:
        The maximum `shape[0]` found, or 0 when there are no entries at all.
    """
    lengths = (
        sample.shape[0]
        for user_entry in sign_data.values()
        for sample in user_entry['data']
    )
    return max(lengths, default=0)

In [None]:
def get_time_taken_for_each_sign(normalized_eeg_data_dict, sign_features_dict=None, samp_rate=None):
    """Compute how long every signature took, from the length of its EEG segment.

    Args:
        normalized_eeg_data_dict: dict mapping user -> list of array-like EEG
            segments whose first dimension is the frame count.
        sign_features_dict: unused; kept (now optional) for backward
            compatibility with existing two-argument calls.
        samp_rate: frames per second. Defaults to the notebook-level
            `recording_samp_rate` when omitted.

    Returns:
        dict mapping user -> list of durations in seconds, one per EEG segment
        of that user (an empty list when the user has no segments).
    """
    if samp_rate is None:
        samp_rate = recording_samp_rate

    # One duration per recording -- not just the first recording of each user.
    return {
        user: [segment.shape[0] / samp_rate for segment in segments]
        for user, segments in normalized_eeg_data_dict.items()
    }

In [None]:
# NOTE(review): `normalized_eeg_data_dict` is only assigned in a later cell, so this
# cell fails on a fresh top-to-bottom run until that cell has been executed.
# The helper does not use its second argument; None is passed to satisfy the signature.
sign_times_for_users = get_time_taken_for_each_sign(normalized_eeg_data_dict, None)

## Prepare sign dataset

In [10]:
def sign_attach_attention_tokens_and_labels(sign_data_with_features, labels):
    """Pad every signature to a common length and attach masks and labels.

    Args:
        sign_data_with_features: dict mapping user -> {'cls_tokens': [...],
            'data': [...]} where each data entry is a 2-D tensor
            (frames, features).
        labels: dict mapping user -> labels of that user's signatures (tensor
            or sequence); converted to a `torch.long` tensor.

    Returns:
        dict mapping user -> {
            'cls_tokens': the input list, unchanged,
            'data': tensors zero-padded along dim 0 to the global max length,
            'attention_masks': float32 vectors of length max_len + 1 holding 1
                for the cls position plus every real frame and 0 for padding,
            'labels': torch.long tensor,
        }
    """
    final_sign_data = {}
    max_len = get_sign_max_seq_len(sign_data_with_features)

    for user, user_features in sign_data_with_features.items():
        padded_samples = []
        attention_masks = []
        for _cls_token, data in zip(user_features['cls_tokens'], user_features['data']):
            seq_len = data.shape[0]

            # F.pad pads from the last dimension backwards: (0, 0) leaves the
            # feature axis alone, (0, n) appends n zero rows at the end.
            padded_samples.append(
                torch.nn.functional.pad(data, (0, 0, 0, max_len - seq_len), mode='constant', value=0)
            )

            attention_mask = torch.zeros(max_len + 1, dtype=torch.float32)
            attention_mask[:seq_len + 1] = 1  # +1 for cls_token
            attention_masks.append(attention_mask)

        final_sign_data[user] = {
            'cls_tokens': user_features['cls_tokens'],
            'data': padded_samples,
            'attention_masks': attention_masks,
            # as_tensor + clone avoids the torch.tensor(tensor) copy-construct
            # warning while still returning an independent tensor.
            'labels': torch.as_tensor(labels[user]).clone().detach().to(dtype=torch.long),
        }
    return final_sign_data


In [11]:
def prepare_sign_dataset_with_all_parts(final_sign_data):
    """Flatten the per-user dataset into four stacked tensors.

    Args:
        final_sign_data: dict mapping user -> dict with the entries
            'cls_tokens', 'data', 'attention_masks' and 'labels', each an
            iterable of equally shaped tensors.

    Returns:
        dict with the same four keys, each holding one tensor obtained by
        stacking the entries of all users in iteration order.
    """
    parts = ('cls_tokens', 'data', 'attention_masks', 'labels')

    # Gather the samples of every user, part by part.
    collected = {part: [] for part in parts}
    for user_data in final_sign_data.values():
        for part in parts:
            collected[part].extend(user_data[part])

    # Stack along a new leading (sample) dimension.
    return {part: torch.stack(items) for part, items in collected.items()}

In [None]:
# For single data

# Load one recording together with its real label (instead of a hard-coded 1).
sign_dict_sample, eeg_dict_sample, label = get_sig_eeg_data_dicts([files_mat_appended[0]], [labels_appended[0]])
# print(sign_dict_sample['000000000200894'][0].shape)
# Kept for inspection only -- the feature extractor normalizes internally.
normalized_sign_data_sample = normalize_sign_data_dict(sign_dict_sample)
# Pass the RAW signatures: get_sign_data_features normalizes them itself and reads
# pressure/azimuth/altitude from the unnormalized data, so feeding it normalized
# data would normalize twice and distort those statistics.
sign_with_features = get_sign_data_features(sign_dict_sample)
# print()
final_sign_data = sign_attach_attention_tokens_and_labels(sign_with_features, label)
final_sign_dataset = prepare_sign_dataset_with_all_parts(final_sign_data)
# print("Max len: ", get_sign_max_seq_len(sign_with_features))
# print(final_sign_data)

## Sign Transformer

In [None]:
# num_classes will be 2 (0 = genuine, 1 = forged; this matches the label encoding in get_sig_eeg_data_dicts)

In [None]:
''' current data format:

Just for reference, so that I don't forget later

original data: 

{
    user_id: string,
    data: [ [ [], [], [] ... ], [ [], [], [] ... ] ...  ], size: (30 * 1200 * 7) = (num_samples_per_user * time_series_len * num_features)
}

extracted data:

{
    cls_tokens: tensor([ , , , , ... ], [ , , , ,  ....], [ , , , , ...] ... ), size: (total_num_samples * num_features),
    data: tensor([ [], [], [] ... ], [ [], [], [] ... ] ...), size: (total_num_samples * time_series_len * num_features),
    attention_masks: tensor([1, 1, 1, ...], [1, 1,1, ...] ... ), size: (total_num_samples * (time_series_len + 1)), the extra leading position belongs to the cls token,
    labels: tensor([1, 0, 1, 0 ...]), size: (total_num_samples, )
}
'''

class SignatureDataset(Dataset):
    """Torch dataset exposing signature samples as dictionaries.

    Args:
        input_data: mapping with the entries 'data', 'cls_tokens', 'labels'
            and 'attention_masks', each indexable by sample.
        num_classes: number of target classes (stored, not otherwise used here).
    """

    def __init__(self, input_data, num_classes):
        first_sample = input_data['data'][0]
        self.seq_len = first_sample.shape[0]
        self.ts_dim = first_sample.shape[1]
        self.num_classes = num_classes

        self.x_ts = input_data['data']
        self.cls_token = input_data['cls_tokens']
        self.labels = input_data['labels']
        self.attention_mask = input_data['attention_masks']

    def __len__(self):
        # The number of samples is defined by the label container.
        return len(self.labels)

    def __getitem__(self, idx):
        # Output key -> per-sample source; order matches the returned dict.
        sources = (
            ('x_ts', self.x_ts),
            ('cls_token', self.cls_token),
            ('labels', self.labels),
            ('attention_mask', self.attention_mask),
        )
        return {key: source[idx] for key, source in sources}


In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    The table covers `max_len + 1` positions (one extra for the cls token that
    is prepended to every sequence). Even feature indices hold sines, odd
    feature indices hold cosines.

    Args:
        d_model: feature dimension of the encoding (odd values are supported).
        max_len: longest sequence length, excluding the cls token.
        dropout: accepted for interface compatibility; not used.
    """

    def __init__(self, d_model, max_len, dropout = 0.1):
        super().__init__()
        num_positions = max_len + 1
        position = torch.arange(0, num_positions, dtype = torch.float).unsqueeze(1)
        divterm = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # Credits to hkproj@github for this as https://github.com/hkproj/pytorch-transformer/blob/main/model.py
        angles = position * divterm

        pe = torch.zeros(num_positions, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so only the first d_model // 2 frequencies are used here.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Buffer: moves with the module across devices, is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return the encodings for the first `x.shape[1]` positions, shaped (1, seq, d_model)."""
        return self.pe[:, :x.shape[1]]

In [None]:
class SignatureTransformer(nn.Module):
    """Transformer encoder that classifies a signature from its cls position.

    The per-signature summary vector is projected to `d_model` and prepended
    to the projected time series as the first token; the encoder output at
    that first position is fed to a small MLP classifier.

    Args:
        input_dim: number of features per time step of the time series.
        cls_dim: length of the per-signature summary (cls) vector.
        d_model: model / embedding dimension.
        num_classes: number of output logits.
        num_heads: attention heads per encoder layer.
        num_layers: number of encoder layers.
        max_seq_len: longest time-series length (without the cls token).
        dropout: dropout probability used inside the encoder layers.
    """

    def __init__(self, input_dim, cls_dim, d_model, num_classes, num_heads, num_layers, max_seq_len, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
        self.cls_proj = nn.Linear(cls_dim, d_model)
        self.input_projection = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dropout = dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)
        self.classifier = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU(), nn.Linear(d_model, num_classes))

    def forward(self, x_ts, cls_token, attn_mask = None):
        """Compute class logits.

        Args:
            x_ts: time series, shaped (batch, seq, input_dim).
            cls_token: summary vectors, shaped (batch, cls_dim).
            attn_mask: optional (batch, seq + 1) mask that already includes the
                cls position; non-zero = real token, 0 = padding.

        Returns:
            Logits shaped (batch, num_classes).
        """
        # Replace NaN / inf produced upstream so they cannot poison attention.
        x_ts = torch.nan_to_num(x_ts, nan=0.0, posinf=0.0, neginf=0.0)
        cls_token = torch.nan_to_num(cls_token, nan=0.0, posinf=0.0, neginf=0.0)

        # Sequence fed to the encoder: [cls, frame_1, ..., frame_T].
        tokens = torch.cat(
            [self.cls_proj(cls_token).unsqueeze(1), self.input_projection(x_ts)],
            dim=1,
        )
        tokens = tokens + self.positional_encoding(tokens)

        # src_key_padding_mask uses the opposite convention of the dataset
        # mask: True = ignore the position, False = attend to it.
        key_padding_mask = (attn_mask == 0) if attn_mask is not None else None

        encoded = self.transformer(tokens, src_key_padding_mask=key_padding_mask)
        return self.classifier(encoded[:, 0, :])

In [None]:
def sign_training_loop(model, dataloader, optimizer, loss_fn, device, num_epochs = 10):
    """Train `model` and save its state dict under the MODEL_PATH directory.

    Args:
        model: module called as `model(x_ts, cls_token, attention_mask)` and
            returning logits shaped (batch, num_classes).
        dataloader: iterable of batches, each a dict with the entries 'x_ts',
            'cls_token', 'labels' and 'attention_mask'.
        optimizer: optimizer over `model.parameters()`.
        loss_fn: callable `loss_fn(logits, labels)` returning a scalar loss.
        device: device the model and every batch are moved to.
        num_epochs: number of passes over `dataloader`.

    Raises:
        ValueError: if the MODEL_PATH environment variable is not set. This is
            checked before training so no work is lost at save time.

    Side effects:
        Prints loss / accuracy once per epoch and writes
        `model<YYYY-mm-dd-HH-MM-SS>.pth` into MODEL_PATH (created if missing).
    """
    model_path = os.getenv("MODEL_PATH")
    if model_path is None:
        raise ValueError("MODEL_PATH environment variable is not set; cannot save the model.")
    os.makedirs(model_path, exist_ok=True)

    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0
        for batch in dataloader:
            x_ts = batch['x_ts'].to(device)
            cls_token = batch['cls_token'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()
            logits = model(x_ts, cls_token, attention_mask)
            loss = loss_fn(logits, labels)

            # A NaN loss would corrupt the weights on the next step: report it
            # and abandon the rest of this epoch.
            if torch.isnan(loss):
                print("NaN loss detected!")
                print("Labels:", labels)
                print("Logits:", logits)
                break

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # added to check for exploding gradients
            optimizer.step()

            # Weight the batch loss by batch size so the epoch average is per sample.
            batch_len = x_ts.size(0)
            total_loss += loss.item() * batch_len
            total_correct += (logits.argmax(dim=1) == labels).sum().item()
            total_samples += batch_len

        # Guard both ratios: an empty loader or an immediate NaN leaves 0 samples.
        avg_loss = total_loss / total_samples if total_samples > 0 else 0
        accuracy = total_correct / total_samples if total_samples > 0 else 0
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f} - Acc: {accuracy:.4f}")

    # Build the timestamp outside the f-string: nesting the same quote type
    # inside an f-string is a SyntaxError before Python 3.12.
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    torch.save(model.state_dict(), os.path.join(model_path, f"model{timestamp}.pth"))

In [None]:
# For all data

# get_dataset_files_and_user_ids returns THREE values: (files, user_ids, labels).
files_mat_genuine, user_ids_genuine, genuine_labels = get_dataset_files_and_user_ids(data_category=constants.GENUINE)
files_mat_forged, user_ids_forged, forged_labels = get_dataset_files_and_user_ids(data_category=constants.FORGED)

# Build new lists instead of extending in place, so re-running this cell is idempotent.
files_mat_appended = list(files_mat_genuine) + list(files_mat_forged)
labels_appended = list(genuine_labels) + list(forged_labels)

sign_data_dict, eeg_data_dict, user_labels = get_sig_eeg_data_dicts(files_mat_appended, labels_appended)
normalized_sign_data_dict = normalize_sign_data_dict(sign_data_dict)
sign_data_with_features = get_sign_data_features(sign_data_dict) # changing the get features function by shifting normalization logic into it
# The training cell below reads `final_sign_dataset_for_all_users`, so it has to be built here.
final_signature_data = sign_attach_attention_tokens_and_labels(sign_data_with_features, user_labels)
final_sign_dataset_for_all_users = prepare_sign_dataset_with_all_parts(final_signature_data)

## Sign Data Feed to Transformer

In [None]:
# Dimensions are read from the prepared dataset rather than hard-coded.
input_data = final_sign_dataset_for_all_users
ts_dim = input_data['data'][0].size(1)  # features per time step
cls_dim = input_data['cls_tokens'][0].size(0)  # length of the summary (cls) vector
d_model = 64
num_classes = 2
# Longest signature; the positional-encoding table is sized from it.
seq_len = get_sign_max_seq_len(sign_data_with_features)
batch_size = 8

dataset = SignatureDataset(input_data, num_classes)
# dataset.__getitem__(0)['x_ts'].shape
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = SignatureTransformer(input_dim=ts_dim, cls_dim=cls_dim, d_model=d_model, num_classes=num_classes, num_heads=4, num_layers=4, max_seq_len=seq_len)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

sign_training_loop(model, dataloader, optimizer, loss_fn, device, num_epochs=10)

## Sign - Misc

In [None]:
# final_signature_data = sign_attach_attention_tokens_and_labels(sign_data_with_features, user_labels)
# final_sign_dataset_for_all_users = prepare_sign_dataset_with_all_parts(final_signature_data)

In [None]:
# print(len(final_signature_data['002108410100044']['cls_tokens']))
# print(len(final_signature_data['002108410100044']['data']))
# print(len(final_signature_data['002108410100044']['attention_masks']))

# print(final_signature_data['002108410100044']['cls_tokens'][0].shape)
# print(final_signature_data['002108410100044']['data'][0].shape)
# print(final_signature_data['002108410100044']['attention_masks'][0].shape)


# Print the length of each part of the combined dataset.
print(len(final_sign_dataset_for_all_users['cls_tokens']))
print(len(final_sign_dataset_for_all_users['data']))
print(len(final_sign_dataset_for_all_users['attention_masks']))
print(len(final_sign_dataset_for_all_users['labels']))

In [None]:


# print(len(final_sign_dataset_for_all_users['cls_tokens']))
# print(len(final_sign_dataset_for_all_users['data']))
# print(len(final_sign_dataset_for_all_users['attention_masks']))

# print(final_sign_dataset_for_all_users['cls_tokens'][0].shape)
# print(final_sign_dataset_for_all_users['data'][0].shape)
# print(final_sign_dataset_for_all_users['attention_masks'][0].shape)

In [None]:
# print(normalized_sign_data_dict['000000000200894'][0].shape)

In [None]:
# normalized_sign_data_dict = normalize_sign_data_dict(sign_data_dict)
# # print(sign_data_dict['000000000200894'])

In [None]:
# print(len(sign_times_for_users))


# EEG Data Classification Process

In [6]:
def normalize_eeg_data_dict(eeg_data_dict):
    """Z-score every column of every EEG recording, per recording.

    Args:
        eeg_data_dict: dict mapping user id -> list of 2-D recordings shaped
            (frames, channels). NumPy arrays and CPU torch tensors are both
            accepted; each recording is converted with `np.asarray` first.

    Returns:
        dict with the same keys, each holding a list of NumPy arrays in which
        every column has zero mean and unit (population) standard deviation.
        Constant columns are divided by 1 and therefore become all zeros.
    """
    normalized_eeg_data_dict = {}
    for user_id, eeg_list in eeg_data_dict.items():
        normalized_list = []
        for eeg_data in eeg_list:
            # The recordings produced upstream are torch tensors; NumPy
            # reductions are only reliable on real arrays, so convert first.
            eeg_array = np.asarray(eeg_data)
            mean = eeg_array.mean(axis=0)
            std = eeg_array.std(axis=0)
            std = np.where(std == 0, 1, std)  # avoid division by zero
            normalized_list.append((eeg_array - mean) / std)
        normalized_eeg_data_dict[user_id] = normalized_list
    return normalized_eeg_data_dict

In [None]:
def get_eeg_data_features(eeg_data_dict, fs=128):
    """Append one frequency-weighted power column per channel to each recording.

    For every channel the Welch power spectral density is computed and reduced
    to `sum(f * Pxx) / sum(Pxx)` (0 when the total power is not positive).
    That single value per channel is repeated on every frame.

    Args:
        eeg_data_dict: dict mapping user id -> list of 2-D recordings shaped
            (frames, channels).
        fs: sampling frequency passed to `scipy.signal.welch`.

    Returns:
        dict with the same keys, each holding a list of arrays shaped
        (frames, 2 * channels): the original columns followed by the repeated
        per-channel values.
    """
    def _frequency_weighted_power(signal):
        # Power-weighted mean frequency of one channel.
        freqs, psd = welch(signal, fs=fs)
        return np.sum(freqs * psd) / np.sum(psd) if np.sum(psd) > 0 else 0

    eeg_with_power = {}
    for user_id, recordings in eeg_data_dict.items():
        augmented = []
        for recording in recordings:
            n_frames, n_channels = recording.shape[0], recording.shape[1]
            per_channel = [_frequency_weighted_power(recording[:, idx]) for idx in range(n_channels)]
            # Broadcast the per-channel values to one row per frame.
            repeated = np.tile(per_channel, (n_frames, 1))
            augmented.append(np.concatenate([recording, repeated], axis=1))
        eeg_with_power[user_id] = augmented
    return eeg_with_power

In [None]:
# print(normalized_sign_data_dict['000000000200894'][0])
# print(normalized_eeg_data_dict['000000000200894'][0])
# print(user_labels['000000000200894'][0])
# NOTE: this augments the un-normalized `eeg_data_dict`.
eeg_data = get_eeg_data_features(eeg_data_dict)


In [None]:
# Single data

normalized_eeg_data_dict = normalize_eeg_data_dict(eeg_data_dict)
# Pass the normalized DATA dict (not the `normalize_eeg_data_dict` function), and keep the
# timings in their own variable so `sign_data_with_features` is not overwritten.
sign_times_for_users = get_time_taken_for_each_sign(normalized_eeg_data_dict, sign_data_with_features)

eeg_data_with_features = get_eeg_data_features(normalized_eeg_data_dict)

# Misc - Trials

In [None]:
# According to the paper: The instances corresponding to the last EEG activity for each subject were interpolated to match the length of the longest genuine instance for that subject.

In [None]:
# import torch
# import numpy as np
# from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split
# import torch.nn as nn
# import torch.optim as optim

# class SignatureEEGDataset(Dataset):
#     def __init__(self, signature_data, eeg_data, labels):

#         self.samples = []
        
#         # Combine the data
#         for user_id in signature_data:
#             # Ensure we have matching signature, EEG, and label entries
#             n_samples = len(signature_data[user_id])
#             for i in range(n_samples):
#                 signature = signature_data[user_id][i]
#                 eeg = eeg_data[user_id][i]
#                 label = labels[user_id][i]
                
#                 # Convert to tensors
#                 signature_tensor = torch.FloatTensor(signature)
#                 eeg_tensor = torch.FloatTensor(eeg)
#                 label_tensor = torch.LongTensor([label])
                
#                 self.samples.append((signature_tensor, eeg_tensor, label_tensor))
    
#     def __len__(self):
#         return len(self.samples)
    
#     def __getitem__(self, idx):
#         return self.samples[idx]

# def collate_fn(batch):
    
#     signatures, eegs, labels = zip(*batch)
    
#     # Pad sequences to the same length
#     signatures_padded = torch.nn.utils.rnn.pad_sequence(signatures, batch_first=True)
#     eegs_padded = torch.nn.utils.rnn.pad_sequence(eegs, batch_first=True)
    
#     labels = torch.cat(labels)
    
#     return signatures_padded, eegs_padded, labels

In [None]:
# class SignatureEEGTransformer(nn.Module):
#     def __init__(self, signature_dim=7, eeg_channels=10, d_model=128, nhead=4, num_layers=3, num_classes=2):
#         super(SignatureEEGTransformer, self).__init__()
        
#         # Signature embedding
#         self.signature_embedding = nn.Linear(signature_dim, d_model)
        
#         # EEG embedding
#         self.eeg_embedding = nn.Linear(eeg_channels, d_model)
        
#         # Positional encoding
#         self.positional_encoding = PositionalEncoding(d_model)
        
#         # Transformer encoder
#         encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
#         # Classifier
#         self.classifier = nn.Sequential(
#             nn.Linear(d_model * 2, d_model),  # *2 because we concatenate signature and EEG features
#             nn.ReLU(),
#             nn.Linear(d_model, num_classes)
#         )
    
#     def forward(self, signature, eeg):
#         # Signature processing
#         signature_embedded = self.signature_embedding(signature)
#         signature_embedded = self.positional_encoding(signature_embedded)
        
#         # EEG processing
#         eeg_embedded = self.eeg_embedding(eeg)
#         eeg_embedded = self.positional_encoding(eeg_embedded)
        
#         # Transformer expects (seq_len, batch, features)
#         signature_embedded = signature_embedded.permute(1, 0, 2)
#         eeg_embedded = eeg_embedded.permute(1, 0, 2)
        
#         # Process through transformer
#         signature_features = self.transformer_encoder(signature_embedded)
#         eeg_features = self.transformer_encoder(eeg_embedded)
        
#         # Average over time dimension
#         signature_features = signature_features.mean(dim=0)
#         eeg_features = eeg_features.mean(dim=0)
        
#         # Concatenate features
#         combined_features = torch.cat([signature_features, eeg_features], dim=1)
        
#         # Classify
#         output = self.classifier(combined_features)
        
#         return output

# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         return x + self.pe[:x.size(0), :]

In [None]:
# def train_model(signature_data, eeg_data, labels, epochs=20, batch_size=32):
#     # Create dataset
#     dataset = SignatureEEGDataset(signature_data, eeg_data, labels)
    
#     # Split into train and validation
#     train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
    
#     # Create dataloaders
#     train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
#     val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
#     # Initialize model
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model = SignatureEEGTransformer().to(device)
    
#     # Loss and optimizer
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001)
    
#     # Training loop
#     for epoch in range(epochs):
#         model.train()
#         train_loss = 0.0
        
#         for signatures, eegs, labels in train_loader:
#             signatures, eegs, labels = signatures.to(device), eegs.to(device), labels.to(device)
            
#             optimizer.zero_grad()
            
#             outputs = model(signatures, eegs)
#             loss = criterion(outputs, labels)
            
#             loss.backward()
#             optimizer.step()
            
#             train_loss += loss.item()
        
#         # Validation
#         model.eval()
#         val_loss = 0.0
#         correct = 0
#         total = 0
        
#         with torch.no_grad():
#             for signatures, eegs, labels in val_loader:
#                 signatures, eegs, labels = signatures.to(device), eegs.to(device), labels.to(device)
                
#                 outputs = model(signatures, eegs)
#                 loss = criterion(outputs, labels)
                
#                 val_loss += loss.item()
#                 _, predicted = torch.max(outputs.data, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()
        
#         print(f'Epoch {epoch+1}/{epochs}')
#         print(f'Train Loss: {train_loss/len(train_loader):.4f}')
#         print(f'Val Loss: {val_loss/len(val_loader):.4f}')
#         print(f'Val Accuracy: {100*correct/total:.2f}%')
#         print('-' * 50)
    
#     return model

In [None]:
# NOTE(review): `train_model` only exists as commented-out code in the cell above, so this
# call raised NameError on execution (and would also have replaced the `model` trained by
# the signature transformer cell). Disabled until the definition is restored.
# model = train_model(normalized_sign_data_dict, normalized_eeg_data_dict, user_labels)

In [None]:
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras import layers, models

# def prepare_data_with_masking(signature_data, eeg_data, labels_dict):
#     """
#     Prepares data with dynamic padding and preserves original lengths for masking.
#     Returns:
#         - Padded sequences
#         - Sequence length arrays (for masking)
#         - Labels
#     """
#     # Initialize lists
#     X_signature, X_eeg = [], []
#     len_signature, len_eeg = [], []
#     y = []
    
#     for user_id in labels_dict.keys():
#         for i in range(len(labels_dict[user_id])):
#             # Signature data
#             sig = signature_data[user_id][i]
#             X_signature.append(sig)
#             len_signature.append(len(sig))
            
#             # EEG data
#             eeg = eeg_data[user_id][i]
#             X_eeg.append(eeg)
#             len_eeg.append(len(eeg))
            
#             # Label
#             y.append(labels_dict[user_id][i])
    
#     # Pad sequences to max length in dataset
#     max_len_sig = max(len_signature)
#     max_len_eeg = max(len_eeg)
    
#     X_signature_pad = tf.keras.preprocessing.sequence.pad_sequences(
#         X_signature, maxlen=max_len_sig, dtype='float32', padding='post'
#     )
    
#     X_eeg_pad = tf.keras.preprocessing.sequence.pad_sequences(
#         X_eeg, maxlen=max_len_eeg, dtype='float32', padding='post'
#     )
    
#     return (
#         X_signature_pad, np.array(len_signature),
#         X_eeg_pad, np.array(len_eeg),
#         np.array(y)
#     )

In [None]:
# def create_masked_model(signature_shape, eeg_shape):
#     """Creates a model with masking layers to ignore padding"""
#     # Signature branch
#     signature_input = layers.Input(shape=signature_shape, name='signature_input')
#     signature_length = layers.Input(shape=(1,), name='signature_length', dtype='int32')
    
#     sig = layers.Masking(mask_value=0.0)(signature_input)
#     sig = layers.Conv1D(32, 5, activation='relu', padding='same')(sig)
#     sig = layers.MaxPooling1D(2)(sig)
#     sig = layers.Conv1D(64, 5, activation='relu', padding='same')(sig)
#     sig = layers.MaxPooling1D(2)(sig)
#     sig = layers.GlobalAveragePooling1D()(sig)
    
#     # EEG branch
#     eeg_input = layers.Input(shape=eeg_shape, name='eeg_input')
#     eeg_length = layers.Input(shape=(1,), name='eeg_length', dtype='int32')
    
#     eeg = layers.Masking(mask_value=0.0)(eeg_input)
#     eeg = layers.Conv1D(32, 5, activation='relu', padding='same')(eeg)
#     eeg = layers.MaxPooling1D(2)(eeg)
#     eeg = layers.Conv1D(64, 5, activation='relu', padding='same')(eeg)
#     eeg = layers.MaxPooling1D(2)(eeg)
#     eeg = layers.GlobalAveragePooling1D()(eeg)
    
#     # Combine branches
#     combined = layers.concatenate([sig, eeg])
#     combined = layers.Dense(128, activation='relu')(combined)
#     outputs = layers.Dense(1, activation='sigmoid')(combined)
    
#     # Model with length inputs
#     model = models.Model(
#         inputs=[signature_input, signature_length, eeg_input, eeg_length],
#         outputs=outputs
#     )
    
#     model.compile(optimizer='adam',
#                  loss='binary_crossentropy',
#                  metrics=['accuracy'])
    
#     return model

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Serve signature/EEG arrays, their length arrays and labels in batches.

    The five inputs are treated as parallel, index-aligned collections: item
    ``i`` of each one belongs to the same sample. Batches are produced in
    storage order (no shuffling is performed here).

    Args:
        X_sig: Sliceable collection of signature samples.
        len_sig: Sliceable collection aligned with ``X_sig``
            (presumably the unpadded sequence lengths -- confirm with caller).
        X_eeg: Sliceable collection of EEG samples.
        len_eeg: Sliceable collection aligned with ``X_eeg``
            (presumably the unpadded sequence lengths -- confirm with caller).
        y: Sliceable collection of labels; its length defines the sample count.
        batch_size: Number of samples per batch (the last batch may be smaller).
        **kwargs: Forwarded to the Keras base-class constructor.

    Raises:
        ValueError: If ``batch_size`` is not positive or the inputs do not all
            contain the same number of samples.
    """

    def __init__(self, X_sig, len_sig, X_eeg, len_eeg, y, batch_size=32, **kwargs):
        # Newer Keras versions expect the base constructor to run and warn
        # otherwise; with no extra kwargs this is also valid on older versions.
        super().__init__(**kwargs)

        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Misaligned inputs would silently pair samples with the wrong labels.
        n_samples = len(y)
        named_inputs = (
            ("X_sig", X_sig),
            ("len_sig", len_sig),
            ("X_eeg", X_eeg),
            ("len_eeg", len_eeg),
        )
        for name, values in named_inputs:
            if len(values) != n_samples:
                raise ValueError(
                    f"{name} has {len(values)} samples but y has {n_samples}"
                )

        self.X_sig = X_sig
        self.len_sig = len_sig
        self.X_eeg = X_eeg
        self.len_eeg = len_eeg
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.y) / self.batch_size))

    def __getitem__(self, idx):
        """Return ``([sig, len_sig, eeg, len_eeg], y)`` for batch number ``idx``."""
        # One slice object keeps all five collections aligned.
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)

        batch_inputs = [
            self.X_sig[batch],
            self.len_sig[batch],
            self.X_eeg[batch],
            self.len_eeg[batch],
        ]
        return batch_inputs, self.y[batch]

In [None]:
# def train_authentication_model(signature_data, eeg_data, labels_dict):
#     # Prepare data
#     X_signature, X_eeg, y = prepare_data(signature_data, eeg_data, labels_dict)
    
#     # Split into train and test sets
#     (X_signature_train, X_signature_test, 
#      X_eeg_train, X_eeg_test, 
#      y_train, y_test) = train_test_split(X_signature, X_eeg, y, test_size=0.2, random_state=42)
    
#     # Create model
#     signature_shape = X_signature_train.shape[1:]
#     eeg_shape = X_eeg_train.shape[1:]
#     model = create_dual_input_model(signature_shape, eeg_shape)
    
#     # Train model
#     history = model.fit(
#         [X_signature_train, X_eeg_train],
#         y_train,
#         epochs=50,
#         batch_size=32,
#         validation_data=([X_signature_test, X_eeg_test], y_test),
#         callbacks=[
#             tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
#             tf.keras.callbacks.ReduceLROnPlateau(factor=0.1, patience=3)
#         ]
#     )
    
#     return model, history

In [None]:
# NOTE(review): in this part of the notebook `create_masked_model` appears only
# as commented-out code, and `tf` / `train_test_split` are not in the import
# cell at the top -- confirm all names used below are defined before running
# this cell on a fresh kernel.

# 1. Prepare data
X_sig, len_sig, X_eeg, len_eeg, norm_y = prepare_data_with_masking(signature_data, eeg_data, labels_dict)

# 2. Split data
# A fixed random_state makes the 80/20 partition identical on every re-run,
# so reported metrics are comparable between executions.
(X_sig_train, X_sig_test,
 len_sig_train, len_sig_test,
 X_eeg_train, X_eeg_test,
 len_eeg_train, len_eeg_test,
 y_train, y_test) = train_test_split(
    X_sig, len_sig, X_eeg, len_eeg, norm_y,
    test_size=0.2,
    random_state=42,
)

# 3. Create model
# Per-sample shapes (everything after the batch axis) define the model inputs.
model = create_masked_model(X_sig_train.shape[1:], X_eeg_train.shape[1:])

# 4. Create generators
train_gen = DataGenerator(X_sig_train, len_sig_train, X_eeg_train, len_eeg_train, y_train)
val_gen = DataGenerator(X_sig_test, len_sig_test, X_eeg_test, len_eeg_test, y_test)

# 5. Train
# restore_best_weights=True rolls the model back to its best validation epoch
# when early stopping fires, instead of keeping the weights of the last
# (worse) epoch.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
    ]
)

In [None]:
def evaluate_model(model, X_signature_test, X_eeg_test, y_test):
    """Print evaluation metrics, a classification report and a confusion matrix.

    Args:
        model: Object exposing ``evaluate(inputs, y)`` and ``predict(inputs)``.
        X_signature_test: Signature test inputs, passed as the first model input.
        X_eeg_test: EEG test inputs, passed as the second model input.
        y_test: Ground-truth labels for the test samples.

    Returns:
        None. All results are written to standard output.
    """
    # Imported up front so a missing dependency fails before the (potentially
    # slow) evaluation and prediction passes.
    from sklearn.metrics import classification_report, confusion_matrix

    test_inputs = [X_signature_test, X_eeg_test]

    # evaluate() may return a bare scalar (loss only) or a list whose length
    # depends on how many metrics the model was compiled with, so print only
    # the entries that exist instead of indexing positions 0..2 blindly.
    # NOTE(review): labels assume the order loss, accuracy, AUC -- confirm
    # against the metrics passed to model.compile().
    results = np.atleast_1d(model.evaluate(test_inputs, y_test))
    for label, value in zip(("Loss", "Accuracy", "AUC"), results):
        print(f"Test {label}: {value:.4f}")

    # Threshold the model outputs at 0.5 to obtain hard 0/1 predictions.
    y_pred = model.predict(test_inputs)
    y_pred = (y_pred > 0.5).astype(int)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))