# References

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint
import constants
import os
from dotenv import load_dotenv
import matplotlib.cm as cm
import scipy.io as sp
import pprint as pp
from scipy.signal import welch
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

In [3]:
import sys
# Display configuration: print numpy arrays in full (no summarisation), on wide lines and
# without scientific notation for small values; let pandas show up to 500 characters per column.
np.set_printoptions(threshold=sys.maxsize, linewidth=300, suppress=True)
pd.set_option('display.max_colwidth', 500)

# Constants

In [4]:
# Sampling arithmetic for the dataset: 1 second = 128 frames, so 10 seconds = 1280 frames.
# We can use this to find the number of seconds that were taken to record the signature.

recording_samp_rate = 128 # frames per second
per_phase_frames = 1280 # frames (10 seconds at 128 frames per second)
max_seq_len_for_data = 3000 # frames; signature recordings longer than this are skipped as outliers

# Fetching raw data (Sign + EEG)

## Processing raw files

In [5]:
# Load variables from the project's .env file, then read the dataset root directory.
load_dotenv()
dataset_path = os.getenv('DATASET_PATH')  # None when DATASET_PATH is not set

def get_dataset_files_and_user_ids(data_category = constants.GENUINE, data_type = constants.TRAIN):
    """Walk ``dataset_path`` and collect the ``.mat`` files of one category.

    Args:
        data_category: ``constants.GENUINE`` or ``constants.FORGED``. Only
            directories whose name equals this category contribute files.
        data_type: Currently unused (the train/test split lookup is disabled).

    Returns:
        A ``(files_mat, user_ids, labels)`` tuple where ``files_mat`` holds the
        full paths of the matching ``.mat`` files, ``labels`` holds
        ``data_category`` once per file, and ``user_ids`` holds the name of
        every visited directory that is neither a category directory nor
        ``'SignEEGv1.0'``.
    """
    known_categories = (constants.GENUINE, constants.FORGED)
    non_user_dirs = known_categories + ('SignEEGv1.0',)

    files_mat = []
    labels = []
    user_ids = []

    for root, _dirs, files in os.walk(dataset_path):
        folder = os.path.basename(root)

        # A directory contributes files only when it is a category directory
        # AND it is the category that was asked for.
        if folder in known_categories and folder == data_category:
            matches = [os.path.join(root, name) for name in files if name.endswith('.mat')]
            files_mat.extend(matches)
            labels.extend([data_category] * len(matches))

        # Every other directory name (except the dataset root name) is recorded as a user id.
        if folder not in non_user_dirs:
            user_ids.append(folder)

    return files_mat, user_ids, labels

## For debugging

In [None]:
# # All files

# files_mat_genuine, user_ids_genuine, genuine_labels = get_dataset_files_and_user_ids(data_category=constants.GENUINE)
# files_mat_forged, user_ids_forged, forged_labels = get_dataset_files_and_user_ids(data_category=constants.FORGED)

# files_mat_genuine.extend(files_mat_forged)
# files_mat_appended = files_mat_genuine
# genuine_labels.extend(forged_labels)
# labels_appended = genuine_labels

# sign_data_dict, eeg_data_dict, labels = get_sig_eeg_data_dicts()

# # print(labels_appended)
# # # print(len(labels_appended))

In [None]:

# files_all = np.array(files_mat_appended)
# labels_all = np.array(labels_appended)

# indices = np.arange(len(files_all))
# np.random.shuffle(indices)

# files_mat_appended = files_all[indices]
# labels_appended = labels_all[indices]

In [None]:
# # print(len(files_mat_appended))
# to_check = sp.loadmat(files_mat_appended[0])
# print("User ID: ", to_check['subject']['SubjectID'])
# print("Sign Data: ", to_check['subject']['SignWacom'][0][0].shape)
# print("EEG Data: ", to_check['subject']['ICA_EEG'][0][0].shape)
# sign_data_test = to_check['subject']['SignWacom'][0][0]
# eeg_data_test = to_check['subject']['ICA_EEG'][0][0]

# # np.delete(sign_data_test, 0, axis = 1)


In [None]:
# print(sign_data_test.shape)
# eeg_data_test = eeg_data_test.T
# total_frames_eeg = eeg_data_test.shape[0]
# roi_frames_start = -(total_frames_eeg % per_phase_frames)
# print(eeg_data_test[roi_frames_start:].shape)
# # print(eeg_data_test[(total_frames_eeg % per_phase_frames) - 1 :, :].T.shape)

In [None]:
# small note: np.delete(axis = 1) will delete a column, axis = 0 will delete a row. be careful

In [None]:
# normalized_sign_data = np.delete(sign_data_test, 0, axis = 1)
# print(normalized_sign_data)

## Getting list of sign data, eeg data and label for each file

In [6]:
def get_sig_eeg_raw_data(mat_files, labels):
    """Load signature and EEG tensors from a list of ``.mat`` files.

    Args:
        mat_files: Iterable of paths to ``.mat`` files.
        labels: Iterable of category labels, aligned with ``mat_files``.

    Returns:
        A list with one dict per kept file, with keys ``'sign_data'``
        (float32 tensor, first column of ``SignWacom`` removed),
        ``'eeg_data'`` (float32 tensor, trailing slice of the transposed
        ``ICA_EEG`` array), ``'user_id'`` (str), ``'label'`` (1 for
        ``constants.GENUINE``, otherwise 0) and ``'file'``. Files whose
        signature has more than ``max_seq_len_for_data`` rows are skipped.
    """
    records = []

    for path, category in zip(mat_files, labels):
        subject = sp.loadmat(path)['subject']
        subject_id = str(subject['SubjectID'][0][0][0])
        pen_samples = subject['SignWacom'][0][0]
        eeg_frames = subject['ICA_EEG'][0][0].T

        # Drop the first (unwanted) column of the signature samples.
        pen_tensor = torch.from_numpy(np.delete(pen_samples, 0, axis=1)).to(dtype=torch.float32)

        # Keep only the trailing frames left over after whole phases of
        # `per_phase_frames`; this is treated as the signing region of interest.
        # NOTE(review): when the frame count is an exact multiple of
        # `per_phase_frames` the start index is 0, so the slice keeps the whole
        # recording rather than an empty/last-phase window — confirm intent.
        if per_phase_frames > 0:
            roi_start = -(eeg_frames.shape[0] % per_phase_frames)
        else:
            roi_start = 0
        eeg_tensor = torch.from_numpy(eeg_frames[roi_start:]).to(dtype=torch.float32)

        # Overlong signatures are considered outliers and left out.
        if pen_tensor.shape[0] <= max_seq_len_for_data:
            records.append({
                'sign_data': pen_tensor,
                'eeg_data': eeg_tensor,
                'user_id': subject_id,
                'label': 1 if category == constants.GENUINE else 0,
                'file': path
            })

    return records


# Sign Data Classification Process

## Sign data preprocessing

In [7]:
def normalize_sign_data_dict(sign_data):
    """Z-score every column of a signature tensor except the first two.

    Args:
        sign_data: 2-D tensor of shape (frames, columns). Columns 0 and 1 are
            passed through untouched; columns 2 onward are standardised
            independently over the frame axis.

    Returns:
        A float32 tensor of the same shape as ``sign_data``. Columns with zero
        standard deviation, and all columns of a single-frame input, are only
        mean-centred (their divisor is 1), so they come out as 0 instead of
        NaN/inf.
    """
    passthrough = sign_data[:, 0:2]
    features = sign_data[:, 2:]

    mean = torch.mean(features, dim=0)
    if features.shape[0] > 1:
        std = torch.std(features, dim=0)
    else:
        # The unbiased std of a single sample is NaN, which the zero check
        # below would not catch; use a unit divisor instead.
        std = torch.ones_like(mean)
    # Constant columns: avoid dividing by zero.
    std = torch.where(std == 0, torch.ones_like(std), std)

    normalized = (features - mean) / std
    return torch.cat([passthrough, normalized], dim=1).to(dtype=torch.float32)


## Sign Data Feature Extraction

In [8]:
def get_sign_data_features(sign_data):
    """Build the per-frame feature sequence and the summary token for one signature.

    Column usage, as read by this function: column 0 identifies the stroke,
    column 1 is 1 while the pen is down, columns 2/3 are x/y, and columns
    4/5/6 are pressure/azimuth/altitude.

    Args:
        sign_data: 2-D float tensor (frames, 7). ``torch.gradient`` is applied
            along the frame axis, so at least two frames are required.

    Returns:
        ``(sign_data_aug, cls_token)`` where ``sign_data_aug`` is the
        normalised input with velocity, acceleration and log curvature radius
        appended as three extra columns, and ``cls_token`` is a float32 vector
        of 14 summary values (stroke count, height, width, height/width ratio,
        centroid x/y, mean pressure/azimuth/altitude, mean vx/vy/ax/ay, max
        pressure). The mean-based entries are NaN when no frame has the pen
        down.
    """
    # normalize_sign_data_dict already returns a float32 tensor, so it is used
    # directly; re-wrapping it in torch.tensor() only copies and warns.
    normalized_sign_data = normalize_sign_data_dict(sign_data)
    norm_x = normalized_sign_data[:, 2]
    norm_y = normalized_sign_data[:, 3]

    # First and second finite differences (unit spacing between frames).
    vx = torch.gradient(norm_x)[0]
    vy = torch.gradient(norm_y)[0]
    velocity = torch.sqrt(vx**2 + vy**2)
    ax = torch.gradient(vx)[0]
    ay = torch.gradient(vy)[0]
    acceleration = torch.sqrt(ax**2 + ay**2)

    avg_vx = torch.mean(vx)
    avg_vy = torch.mean(vy)
    avg_ax = torch.mean(ax)
    avg_ay = torch.mean(ay)

    # Log curvature radius: log(|speed / d(theta)|), with exact zeros replaced
    # by a tiny value so neither the division nor the log blows up.
    tiny = 1e-10
    v_t = torch.where(velocity == 0, torch.full_like(velocity, tiny), velocity)
    theta = torch.atan2(vy, vx)
    dtheta = torch.gradient(theta)[0]
    dtheta = torch.where(dtheta == 0, torch.full_like(dtheta, tiny), dtheta)
    log_curv_radius = torch.log(torch.abs(v_t / dtheta) + tiny)

    # Static (whole-signature) features, computed over pen-down frames only.
    pendown_frames = normalized_sign_data[:, 1] == 1
    pen_down_norm = normalized_sign_data[pendown_frames]
    num_strokes = torch.unique(pen_down_norm[:, 0]).shape[0]
    x_down = pen_down_norm[:, 2]
    y_down = pen_down_norm[:, 3]
    sign_centroid = torch.stack([torch.mean(x_down), torch.mean(y_down)]).to(dtype=torch.float32)

    zero = torch.tensor(0.0, dtype=torch.float32)
    sign_height = torch.max(y_down) - torch.min(y_down) if y_down.shape[0] > 0 else zero
    sign_width = torch.max(x_down) - torch.min(x_down) if x_down.shape[0] > 0 else zero
    height_width_ratio = sign_height / sign_width if sign_width != 0 else zero

    # Pen attributes are summarised from the raw (un-normalised) input.
    pen_down_raw = sign_data[pendown_frames]
    pressure = pen_down_raw[:, 4]
    azimuth = pen_down_raw[:, 5]
    altitude = pen_down_raw[:, 6]
    avg_pressure = torch.mean(pressure)
    avg_azimuth = torch.mean(azimuth)
    avg_altitude = torch.mean(altitude)
    # torch.max raises on an empty tensor, hence the explicit guard.
    max_pressure = torch.max(pressure) if pressure.numel() > 0 else zero

    static_values = [
        num_strokes, sign_height, sign_width, height_width_ratio,
        sign_centroid[0], sign_centroid[1],
        avg_pressure, avg_azimuth, avg_altitude,
        avg_vx, avg_vy, avg_ax, avg_ay, max_pressure,
    ]
    cls_token = torch.stack([torch.as_tensor(value, dtype=torch.float32) for value in static_values])

    sign_data_aug = torch.cat(
        [normalized_sign_data, velocity.unsqueeze(1), acceleration.unsqueeze(1), log_curv_radius.unsqueeze(1)],
        dim=1,
    )

    return sign_data_aug, cls_token

In [None]:
# def get_sign_max_seq_len(sign_data):
#     max_len = 0
#     for user in sign_data.keys():
#         for data in sign_data[user]['data']:
#             max_len = max(max_len, data.shape[0])
#     return max_len

In [None]:
# def get_time_taken_for_each_sign(normalized_eeg_data_dict, sign_features_dict):
#     sign_stats = {}
#     for user in normalized_eeg_data_dict.keys():
#         user_eeg_data = normalized_eeg_data_dict[user]
#         eeg_frames_for_sign = user_eeg_data[0].shape[0]
#         time_taken_for_sign = eeg_frames_for_sign / recording_samp_rate
#         if user not in sign_stats.keys():
#             sign_stats[user] = []
#         sign_stats[user].append(time_taken_for_sign)
#     return sign_stats

In [None]:
# sign_times_for_users = get_time_taken_for_each_sign(normalized_eeg_data_dict)

## Prepare sign dataset

In [9]:
def attach_attention_tokens_and_padding(data, max_len):
    """Zero-pad a sequence along the time axis and build its attention mask.

    Args:
        data: 2-D tensor of shape (seq_len, feat_dim).
        max_len: Target number of rows after padding.

    Returns:
        ``(padded_data, attention_mask)``. ``padded_data`` has ``max_len``
        rows, with zeros appended after the original rows. ``attention_mask``
        is a float32 vector of length ``max_len + 1`` holding 1 for the first
        ``seq_len + 1`` positions (the extra one is for the cls token) and 0
        for the rest.
    """
    seq_len = data.shape[0]

    # F.pad takes pads for the last dim first: (feat_left, feat_right, time_top, time_bottom).
    rows_to_add = max_len - seq_len
    padded_data = torch.nn.functional.pad(data, (0, 0, 0, rows_to_add), mode='constant', value=0)

    # Positions 0..seq_len are real (cls token + sequence); everything after is padding.
    positions = torch.arange(max_len + 1)
    attention_mask = (positions <= seq_len).to(dtype=torch.float32)

    return padded_data, attention_mask


In [None]:
# def prepare_sign_dataset_with_all_parts(final_sign_data):
#     final_sign_dataset = {}
#     final_sign_dataset['cls_tokens'] = []
#     final_sign_dataset['data'] = []
#     final_sign_dataset['attention_masks'] = []
#     final_sign_dataset['labels'] = []
#     for user in final_sign_data:
#         final_sign_dataset['cls_tokens'].extend(final_sign_data[user]['cls_tokens'])
#         final_sign_dataset['data'].extend(final_sign_data[user]['data'])
#         final_sign_dataset['attention_masks'].extend(final_sign_data[user]['attention_masks'])
#         final_sign_dataset['labels'].extend(final_sign_data[user]['labels'])
#     final_sign_dataset['cls_tokens'] = torch.stack(final_sign_dataset['cls_tokens'])
#     final_sign_dataset['data'] = torch.stack(final_sign_dataset['data'])
#     final_sign_dataset['attention_masks'] = torch.stack(final_sign_dataset['attention_masks'])
#     final_sign_dataset['labels'] = torch.stack(final_sign_dataset['labels'])
#     return final_sign_dataset

In [None]:
# # For single data

# sign_dict_sample, eeg_dict_sample, label = get_sig_eeg_data_dicts([files_mat_appended[0]], [1])
# # print(sign_dict_sample['000000000200894'][0].shape)
# normalized_sign_data_sample = normalize_sign_data_dict(sign_dict_sample)
# sign_with_features = get_sign_data_features(normalized_sign_data_sample)
# # print()
# final_sign_data = sign_attach_attention_tokens_and_labels(sign_with_features, label)
# final_sign_dataset = prepare_sign_dataset_with_all_parts(final_sign_data)
# # print("Max len: ", get_sign_max_seq_len(sign_with_features))
# # print(final_sign_data)

## Sign Transformer

In [None]:
# num_classes will be 2 (0 = unauthenticated, 1 = authenticated)

In [None]:
''' current data format:

Just for reference, so that I don't forget later

original data: 

{
    user_id: string,
    data: [ [ [], [], [] ... ], [ [], [], [] ... ] ...  ], size: (30 * 1200 * 7) = (num_samples_per_user * time_series_len * num_features)
}

extracted data:

{
    cls_tokens: tensor([ , , , , ... ], [ , , , ,  ....], [ , , , , ...] ... ), size: (total_num_samples * num_features),
    data: tensor([ [], [], [] ... ], [ [], [], [] ... ] ...), size: (total_num_samples * time_series_len * num_features),
    attention_masks: tensor([1, 1, 1, ...], [1, 1, 1, ...] ... ), size: (total_num_samples * (time_series_len + 1)), the extra position is for the cls token,
    labels: tensor([1, 0, 1, 0 ...]), size: (total_num_samples, )
}
'''

class SignatureDataset(Dataset):
    """Index-aligned view over padded sequences, cls tokens, masks and labels.

    ``input_data`` must provide the keys ``'data'``, ``'cls_tokens'``,
    ``'labels'`` and ``'attention_masks'``; each value is indexed with the
    same sample index.
    """

    def __init__(self, input_data, num_classes):
        sequences = input_data['data']
        first_sequence = sequences[0]

        # Sequence length and feature width are read from the first sample.
        self.seq_len, self.ts_dim = first_sequence.shape[0], first_sequence.shape[1]
        self.num_classes = num_classes

        self.x_ts = sequences
        self.cls_token = input_data['cls_tokens']
        self.labels = input_data['labels']
        self.attention_mask = input_data['attention_masks']

    def __len__(self):
        # One label per sample.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(
            x_ts=self.x_ts[idx],
            cls_token=self.cls_token[idx],
            labels=self.labels[idx],
            attention_mask=self.attention_mask[idx],
        )
        return sample


In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional-encoding table.

    The table has ``max_len + 1`` positions (one more than the longest
    sequence, leaving room for a prepended token) and is stored as the
    non-trainable buffer ``pe`` of shape (1, max_len + 1, d_model).

    Args:
        d_model: Embedding width. Both even and odd values are supported.
        max_len: Longest sequence length, not counting the extra position.
        dropout: Accepted for signature compatibility; it is not applied.
    """

    def __init__(self, d_model, max_len, dropout = 0.1):
        super().__init__()
        num_positions = max_len + 1
        pe = torch.zeros(num_positions, d_model)
        position = torch.arange(0, num_positions, dtype = torch.float).unsqueeze(1)
        divterm = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # Credits to hkproj@github for this as https://github.com/hkproj/pytorch-transformer/blob/main/model.py
        angles = position * divterm
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so trim the angles to the number of cosine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the encodings for the first ``x.shape[1]`` positions.

        Only the slice of the table is returned, shape (1, x.shape[1], d_model);
        the caller is responsible for adding it to ``x``.
        """
        return self.pe[:, :x.shape[1]]

In [10]:
class SignatureTransformer(nn.Module):
    """Transformer encoder over a time series with a prepended summary token.

    The summary ("cls") feature vector and every time step are projected to
    ``d_model``, concatenated (cls first), given positional encodings and run
    through a ``nn.TransformerEncoder``. The encoder output at position 0 is
    the sequence representation.

    Args:
        input_dim: Feature width of each time step.
        cls_dim: Width of the summary feature vector.
        d_model: Internal embedding width.
        num_classes: Number of output classes; only used when
            ``use_classifier`` is True.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        max_seq_len: Longest time-series length (without the cls position).
        dropout: Dropout used inside the encoder layers.
        use_classifier: When False (default) ``forward`` returns the
            ``d_model``-wide cls embedding, for use in a multimodal model.
            When True a small MLP head is added and ``forward`` returns
            ``num_classes`` logits, for single-modality training.
    """

    def __init__(self, input_dim, cls_dim, d_model, num_classes, num_heads, num_layers, max_seq_len, dropout = 0.1, use_classifier = False):
        super().__init__()
        self.d_model = d_model
        self.positional_encoding = PositionalEncoding(d_model, max_seq_len)
        self.cls_proj = nn.Linear(cls_dim, d_model)
        self.input_projection = nn.Linear(input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dropout = dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)
        # Created last so that the default configuration initialises exactly
        # the same parameters, in the same order, as before.
        if use_classifier:
            self.classifier = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU(), nn.Linear(d_model, num_classes))
        else:
            self.classifier = None

    def forward(self, x_ts, cls_token, attn_mask = None):
        """Encode a batch.

        Args:
            x_ts: (batch, t, input_dim) time series.
            cls_token: (batch, cls_dim) summary features.
            attn_mask: Optional (batch, t + 1) mask, non-zero for real
                positions (cls token included) and 0 for padding.

        Returns:
            (batch, d_model) cls embedding, or (batch, num_classes) logits
            when the model was built with ``use_classifier=True``.
        """
        # Non-finite inputs are zeroed so they cannot poison the attention scores.
        x_ts = torch.nan_to_num(x_ts, nan=0.0, posinf=0.0, neginf=0.0)
        cls_token = torch.nan_to_num(cls_token, nan=0.0, posinf=0.0, neginf=0.0)

        x_proj = self.input_projection(x_ts)
        cls_proj = self.cls_proj(cls_token).unsqueeze(1)
        x = torch.cat([cls_proj, x_proj], dim=1)
        x = x + self.positional_encoding(x)

        # src_key_padding_mask uses the opposite convention: True = ignore.
        padding_mask = (attn_mask == 0) if attn_mask is not None else None

        x = self.transformer(x, src_key_padding_mask=padding_mask)
        cls_output = x[:, 0, :]

        if self.classifier is not None:
            return self.classifier(cls_output)
        return cls_output

In [None]:
def sign_training_loop(model, dataloader, optimizer, loss_fn, device, num_epochs = 10):
    """Train ``model`` and save its ``state_dict`` when training finishes.

    Args:
        model: Module called as ``model(x_ts, cls_token, attention_mask)``
            whose output is passed to ``loss_fn`` together with the labels.
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``'x_ts'``, ``'cls_token'``, ``'labels'`` and ``'attention_mask'``.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        device: Device the model and batches are moved to.
        num_epochs: Number of passes over ``dataloader``.

    The weights are written to ``model<MMDDYYYY-HHMMSS>.pth`` inside the
    directory named by the ``MODEL_PATH`` environment variable. If it is unset
    the current directory is used; the directory is created when missing.
    """
    model.to(device)
    model.train()

    # Resolve the output directory before training so that a missing
    # MODEL_PATH cannot discard a finished run at save time.
    model_path = os.getenv("MODEL_PATH") or "."
    os.makedirs(model_path, exist_ok=True)

    for epoch in range(num_epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0
        for batch in dataloader:
            x_ts = batch['x_ts'].to(device)
            cls_token = batch['cls_token'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            optimizer.zero_grad()
            logits = model(x_ts, cls_token, attention_mask)
            loss = loss_fn(logits, labels)

            if torch.isnan(loss):
              # Abandon the rest of this epoch; the statistics below cover
              # only the batches completed so far.
              print("NaN loss detected!")
              print("Labels:", labels)
              print("Logits:", logits)
              break

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # added to check for exploding gradients
            optimizer.step()

            total_loss += loss.item() * x_ts.size(0)
            preds = logits.argmax(dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += x_ts.size(0)

        # Guard both ratios: an empty loader or a NaN on the first batch
        # leaves total_samples at 0.
        avg_loss = total_loss / total_samples if total_samples > 0 else 0
        accuracy = total_correct / total_samples if total_samples > 0 else 0
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f} - Acc: {accuracy:.4f}")
    torch.save(model.state_dict(), os.path.join(model_path, f"model{datetime.now().strftime('%m%d%Y-%H%M%S')}.pth"))

In [None]:
# # For all data

# files_mat_genuine, user_ids_genuine, genuine_labels = get_dataset_files_and_user_ids(data_category=constants.GENUINE)
# files_mat_forged, user_ids_forged, forged_labels = get_dataset_files_and_user_ids(data_category=constants.FORGED)

# files_mat_genuine.extend(files_mat_forged)
# files_mat_appended = files_mat_genuine
# genuine_labels.extend(forged_labels)
# labels_appended = genuine_labels

# # shuffling to prevent overfitting
# files_all = np.array(files_mat_appended)
# labels_all = np.array(labels_appended)

# indices = np.arange(len(files_all))
# np.random.shuffle(indices)

# files_mat_appended = files_all[indices]
# labels_appended = labels_all[indices]

# sign_data_dict, eeg_data_dict, user_labels = get_sig_eeg_raw_data(files_mat_appended, labels_appended)
# normalized_sign_data_dict = normalize_sign_data_dict(sign_data_dict)
# sign_data_with_features = get_sign_data_features(sign_data_dict) # changing the get features function by shifting normalization logic into it
# final_signature_data = sign_attach_attention_tokens_and_labels(sign_data_with_features, user_labels)
# final_sign_dataset_for_all_users = prepare_sign_dataset_with_all_parts(final_signature_data)

## Sign Data Feed to Transformer

In [None]:
# Single-modality training run on the signature data.
# NOTE(review): `final_sign_dataset_for_all_users`, the per-user form of
# `sign_data_with_features` and `get_sign_max_seq_len` are only produced/defined in cells
# that are currently commented out, so this cell fails on a fresh kernel unless they are restored.
# NOTE(review): SignatureTransformer as constructed here returns the d_model-wide cls
# embedding, not `num_classes` logits — confirm that is what `loss_fn` should receive.
input_data = final_sign_dataset_for_all_users
ts_dim = input_data['data'][0].size(1)
cls_dim = input_data['cls_tokens'][0].size(0)
d_model = 64
num_classes = 2
seq_len = get_sign_max_seq_len(sign_data_with_features)
batch_size = 8

dataset = SignatureDataset(input_data, num_classes)
# dataset.__getitem__(0)['x_ts'].shape
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

sign_model = SignatureTransformer(input_dim=ts_dim, cls_dim=cls_dim, d_model=d_model, num_classes=num_classes, num_heads=4, num_layers=4, max_seq_len=seq_len)
optimizer = optim.Adam(sign_model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

sign_training_loop(sign_model, dataloader, optimizer, loss_fn, device, num_epochs=10)

## Sign - Misc

In [None]:
# final_signature_data = sign_attach_attention_tokens_and_labels(sign_data_with_features, user_labels)
# final_sign_dataset_for_all_users = prepare_sign_dataset_with_all_parts(final_signature_data)

In [None]:
# print(len(final_signature_data['002108410100044']['cls_tokens']))
# print(len(final_signature_data['002108410100044']['data']))
# print(len(final_signature_data['002108410100044']['attention_masks']))

# print(final_signature_data['002108410100044']['cls_tokens'][0].shape)
# print(final_signature_data['002108410100044']['data'][0].shape)
# print(final_signature_data['002108410100044']['attention_masks'][0].shape)


# Sanity check: all four parts of the flattened sign dataset should report the same count.
# NOTE(review): `final_sign_dataset_for_all_users` is only assigned in commented-out cells.
print(len(final_sign_dataset_for_all_users['cls_tokens']))
print(len(final_sign_dataset_for_all_users['data']))
print(len(final_sign_dataset_for_all_users['attention_masks']))
print(len(final_sign_dataset_for_all_users['labels']))

In [None]:


# print(len(final_sign_dataset_for_all_users['cls_tokens']))
# print(len(final_sign_dataset_for_all_users['data']))
# print(len(final_sign_dataset_for_all_users['attention_masks']))

# print(final_sign_dataset_for_all_users['cls_tokens'][0].shape)
# print(final_sign_dataset_for_all_users['data'][0].shape)
# print(final_sign_dataset_for_all_users['attention_masks'][0].shape)

In [None]:
# print(normalized_sign_data_dict['000000000200894'][0].shape)

In [None]:
# normalized_sign_data_dict = normalize_sign_data_dict(sign_data_dict)
# # print(sign_data_dict['000000000200894'])

In [None]:
# print(len(sign_times_for_users))


# EEG Data Classification Process

## EEG Data Preprocessing

In [None]:
def normalize_eeg_data_dict(eeg_data_dict):
    """Z-score every EEG recording column-wise, keeping the per-user layout.

    Args:
        eeg_data_dict: Mapping of user id to a list of 2-D tensors.

    Returns:
        A new mapping with the same keys and list lengths, where each tensor
        has its column mean subtracted and is divided by its column standard
        deviation (a divisor of 1 is used for columns whose std is 0).
    """
    def _zscore(recording):
        centre = recording.mean(dim=0, keepdim=True)
        spread = recording.std(dim=0, keepdim=True)
        unit = torch.tensor(1.0, dtype=spread.dtype, device=spread.device)
        # Constant columns would divide by zero; leave them mean-centred only.
        safe_spread = torch.where(spread == 0, unit, spread)
        return (recording - centre) / safe_spread

    return {
        user_id: [_zscore(recording) for recording in recordings]
        for user_id, recordings in eeg_data_dict.items()
    }

## EEG Feature Extraction

In [11]:
def get_nth_difference_mean_for_signal(input_signal, n):
    """Sum of absolute lag-``n`` differences divided by ``len(signal) - n``.

    Differences are taken along the first axis. For a 1-D signal this is the
    mean absolute lag-``n`` difference; for a multi-column signal the sum runs
    over all columns while the divisor counts rows only.
    """
    signal = torch.as_tensor(input_signal)
    lagged_gap = (signal[n:] - signal[:-n]).abs()
    num_pairs = signal.shape[0] - n
    return lagged_gap.sum() / num_pairs

def normalize_for_eeg_related_data(data):
    """Return ``data`` as a float32 tensor, z-scored along the first axis.

    Columns whose standard deviation is 0 are divided by 1 instead, so they
    come out mean-centred rather than NaN/inf.
    """
    values = torch.as_tensor(data, dtype=torch.float32)
    column_std = values.std(dim=0)
    unit = torch.tensor(1.0, dtype=column_std.dtype, device=column_std.device)
    column_std = torch.where(column_std == 0, unit, column_std)
    return (values - values.mean(dim=0)) / column_std

def get_eeg_data_features(eeg_data, fs=recording_samp_rate):
    """Normalise one EEG recording and compute its summary token.

    Args:
        eeg_data: 2-D tensor of shape (frames, channels).
        fs: Sampling rate passed to ``scipy.signal.welch``.

    Returns:
        ``(normalized_signal, cls_token)``. ``normalized_signal`` is the
        column-wise z-scored recording. ``cls_token`` is a float32 vector of
        length ``6 + channels``: overall mean, overall std, the lag-1 and
        lag-2 absolute-difference statistics of the raw and of the normalised
        signal, then one power-weighted mean frequency per channel.
    """
    signal_mean = torch.mean(eeg_data)
    signal_std = torch.std(eeg_data)

    raw_diff_1 = get_nth_difference_mean_for_signal(eeg_data, 1)
    raw_diff_2 = get_nth_difference_mean_for_signal(eeg_data, 2)

    normalized_signal = normalize_for_eeg_related_data(eeg_data)
    norm_diff_1 = get_nth_difference_mean_for_signal(normalized_signal, 1)
    norm_diff_2 = get_nth_difference_mean_for_signal(normalized_signal, 2)

    eeg_data = torch.as_tensor(eeg_data, dtype=torch.float32)
    # welch defaults to nperseg=256 and warns (then clamps) when the signal is
    # shorter; clamping up front gives the same spectrum without the warning.
    nperseg = min(256, eeg_data.shape[0])

    fw_powers = []
    for ch in range(eeg_data.shape[1]):
        # Welch works on (and returns) numpy arrays, so convert at the boundary.
        f, Pxx = welch(eeg_data[:, ch].cpu().numpy(), fs=fs, nperseg=nperseg)
        f = torch.from_numpy(f).to(eeg_data.device)
        Pxx = torch.from_numpy(Pxx).to(eeg_data.device)
        total_power = torch.sum(Pxx)
        if total_power > 0:
            # Power-weighted mean frequency of this channel.
            fw_power = torch.sum(f * Pxx) / total_power
        else:
            fw_power = torch.tensor(0.0, device=eeg_data.device)
        # Cast so every entry has the same dtype before stacking.
        fw_powers.append(fw_power.to(dtype=torch.float32))

    summary_stats = [signal_mean, signal_std, raw_diff_1, raw_diff_2, norm_diff_1, norm_diff_2]
    cls_token = torch.cat([
        torch.stack([torch.as_tensor(stat, dtype=torch.float32) for stat in summary_stats]),
        torch.stack(fw_powers),
    ])
    return normalized_signal, cls_token

In [None]:
# def get_eeg_max_seq_len(eeg_feature_data):
#     max_len = 0
#     for user in eeg_feature_data:
#         for eeg_data in eeg_feature_data[user]['data']:
#             max_len = max(max_len, eeg_data.shape[0])
#     return max_len

## EEG Dataset Preparation

In [None]:
def eeg_attach_attention_tokens_and_labels(eeg_feature_data, labels_dict, user_ids_list = None):
    """Pad every user's EEG sequences to a common length and build attention masks.

    Args:
        eeg_feature_data: Mapping of user id to a dict with ``'data'`` (list of
            2-D tensors, time on the first axis) and ``'cls_tokens'``.
        labels_dict: Mapping of user id to that user's labels.
        user_ids_list: Users to include in the result. Defaults to every user
            in ``eeg_feature_data``.

    Returns:
        Mapping of user id to a dict with ``'data'`` (zero-padded tensors),
        ``'cls_tokens'``, ``'attention_masks'`` (float32 vectors of length
        ``max_seq_len + 1`` with 1 on the first ``seq_len + 1`` positions) and
        ``'labels'``.
    """
    if user_ids_list is None:
        user_ids_list = list(eeg_feature_data.keys())

    # The common length is the longest sequence over all users in
    # eeg_feature_data, computed here so the function is self-contained.
    max_seq_len = max(
        (data.shape[0] for user in eeg_feature_data for data in eeg_feature_data[user]['data']),
        default=0,
    )
    print("Max seq len: ", max_seq_len)

    final_eeg_data = {}
    for user in user_ids_list:
        padded_sequences = []
        attention_masks = []
        for data in eeg_feature_data[user]['data']:
            seq_len = data.shape[0]
            pad_len = max_seq_len - seq_len
            padded_sequences.append(torch.nn.functional.pad(data, (0, 0, 0, pad_len), mode='constant', value=0))
            attention_mask = torch.zeros(max_seq_len + 1, dtype=torch.float32)
            attention_mask[:seq_len + 1] = 1  # +1 for the cls token position
            attention_masks.append(attention_mask)

        final_eeg_data[user] = {
            'data': padded_sequences,
            'cls_tokens': eeg_feature_data[user]['cls_tokens'],
            'attention_masks': attention_masks,
            'labels': labels_dict[user],
        }
    return final_eeg_data


In [None]:
# def prepare_eeg_dataset_with_all_parts(final_eeg_data):
#     final_eeg_dataset = {}
#     final_eeg_dataset['cls_tokens'] = []
#     final_eeg_dataset['data'] = []
#     final_eeg_dataset['attention_masks'] = []
#     final_eeg_dataset['labels'] = []
#     for user in final_eeg_data:
#         final_eeg_dataset['cls_tokens'].extend(final_eeg_data[user]['cls_tokens'])
#         final_eeg_dataset['data'].extend(final_eeg_data[user]['data'])
#         final_eeg_dataset['attention_masks'].extend(final_eeg_data[user]['attention_masks'])
#         final_eeg_dataset['labels'].extend(final_eeg_data[user]['labels'])
#     final_eeg_dataset['cls_tokens'] = torch.stack(final_eeg_dataset['cls_tokens'])
#     final_eeg_dataset['data'] = torch.stack(final_eeg_dataset['data'])
#     final_eeg_dataset['attention_masks'] = torch.stack(final_eeg_dataset['attention_masks'])
#     final_eeg_dataset['labels'] = torch.stack(final_eeg_dataset['labels'])
#     return final_eeg_dataset

## EEG Classification

In [None]:
# All EEG data: gather genuine + forged files, shuffle them, then build the EEG-only dataset.
# NOTE(review): the last five lines use an older per-user dict pipeline. In the current
# notebook `get_sig_eeg_raw_data` returns a single list, `get_eeg_data_features` takes one
# tensor, `eeg_attach_attention_tokens_and_labels` declares a third parameter, and
# `prepare_eeg_dataset_with_all_parts` is commented out — this cell needs updating before it can run.

files_mat_genuine, user_ids_genuine, genuine_labels = get_dataset_files_and_user_ids(data_category=constants.GENUINE)
files_mat_forged, user_ids_forged, forged_labels = get_dataset_files_and_user_ids(data_category=constants.FORGED)

# Append the forged entries to the genuine lists (in place) so files and labels stay aligned.
files_mat_genuine.extend(files_mat_forged)
files_mat_appended = files_mat_genuine
genuine_labels.extend(forged_labels)
labels_appended = genuine_labels

# Shuffle files and labels with one shared permutation so genuine and forged samples are interleaved.
# NOTE(review): no random seed is set, so the order differs between runs.
files_all = np.array(files_mat_appended)
labels_all = np.array(labels_appended)

indices = np.arange(len(files_all))
np.random.shuffle(indices)

files_mat_appended = files_all[indices]
labels_appended = labels_all[indices]

sign_data_dict, eeg_data_dict, labels = get_sig_eeg_raw_data(files_mat_appended, labels_appended)
normalized_eeg_data_dict = normalize_eeg_data_dict(eeg_data_dict)
eeg_data_with_features = get_eeg_data_features(normalized_eeg_data_dict)
eeg_final_data = eeg_attach_attention_tokens_and_labels(eeg_data_with_features, labels)
eeg_final_dataset = prepare_eeg_dataset_with_all_parts(eeg_final_data)

In [None]:
# Single-modality training run on the EEG data, reusing the signature dataset/model/loop classes.
# NOTE(review): `get_eeg_max_seq_len` is only defined in a commented-out cell, and
# `eeg_final_dataset` comes from the EEG pipeline cell above, which relies on an older data layout.
# NOTE(review): SignatureTransformer as constructed here returns the d_model-wide cls
# embedding, not `num_classes` logits — confirm that is what `loss_fn` should receive.
input_data = eeg_final_dataset
ts_dim = input_data['data'][0].size(1)
cls_dim = input_data['cls_tokens'][0].size(0)
d_model = 64
num_classes = 2
seq_len = get_eeg_max_seq_len(eeg_data_with_features)
batch_size = 8

dataset = SignatureDataset(input_data, num_classes)
# dataset.__getitem__(0)['x_ts'].shape
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

eeg_model = SignatureTransformer(input_dim=ts_dim, cls_dim=cls_dim, d_model=d_model, num_classes=num_classes, num_heads=4, num_layers=4, max_seq_len=seq_len)
optimizer = optim.Adam(eeg_model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

sign_training_loop(eeg_model, dataloader, optimizer, loss_fn, device, num_epochs=10)

# Sign + EEG Classification

## Getting the EEG and Sign data

In [12]:
# Combined Sign + EEG pipeline: collect files, shuffle, load, extract features, pad, and
# finally split the per-sample records into parallel lists.
files_mat_genuine, user_ids_genuine, genuine_labels = get_dataset_files_and_user_ids(data_category=constants.GENUINE)
files_mat_forged, user_ids_forged, forged_labels = get_dataset_files_and_user_ids(data_category=constants.FORGED)

# Append the forged entries to the genuine lists (in place) so files and labels stay aligned.
files_mat_genuine.extend(files_mat_forged)
files_mat_appended = files_mat_genuine
genuine_labels.extend(forged_labels)
labels_appended = genuine_labels

# Shuffle files and labels with one shared permutation so genuine and forged samples are interleaved.
# NOTE(review): no random seed is set, so the order differs between runs.
files_all = np.array(files_mat_appended)
labels_all = np.array(labels_appended)

indices = np.arange(len(files_all))
np.random.shuffle(indices)

files_mat_appended = files_all[indices]
labels_appended = labels_all[indices]

# Load every file, then replace the raw tensors in each record with their feature-augmented
# versions and attach the two summary ("cls") tokens. Records are updated in place.
raw_data = get_sig_eeg_raw_data(files_mat_appended, labels_appended)
for i in range(len(raw_data)):
    sign_data_with_features, sign_cls_token = get_sign_data_features(raw_data[i]['sign_data'])
    eeg_data_with_features, eeg_cls_token = get_eeg_data_features(raw_data[i]['eeg_data'])
    raw_data[i]['sign_data'] = sign_data_with_features
    raw_data[i]['sign_cls_token'] = sign_cls_token
    raw_data[i]['eeg_data'] = eeg_data_with_features
    raw_data[i]['eeg_cls_token'] = eeg_cls_token

# Longest sequence per modality; every sample is padded to these lengths below.
sign_max_seq_len = max([data['sign_data'].shape[0] for data in raw_data])
eeg_max_seq_len = max([data['eeg_data'].shape[0] for data in raw_data])

# Pad each sequence and store its attention mask (mask length is max length + 1 for the cls token).
for i in range(len(raw_data)):
    sign_data = raw_data[i]['sign_data']
    eeg_data = raw_data[i]['eeg_data']
    sign_data, sign_attention_mask = attach_attention_tokens_and_padding(sign_data, sign_max_seq_len)
    eeg_data, eeg_attention_mask = attach_attention_tokens_and_padding(eeg_data, eeg_max_seq_len)
    raw_data[i]['sign_data'] = sign_data
    raw_data[i]['eeg_data'] = eeg_data
    raw_data[i]['sign_attention_mask'] = sign_attention_mask
    raw_data[i]['eeg_attention_mask'] = eeg_attention_mask

# Column-wise view of the records: one list per field, all in the same sample order.
# Note that `sign_data` / `eeg_data` are rebound here from single tensors to lists of tensors.
sign_data = [data['sign_data'] for data in raw_data]
eeg_data = [data['eeg_data'] for data in raw_data]
sign_attention_masks = [data['sign_attention_mask'] for data in raw_data]
eeg_attention_masks = [data['eeg_attention_mask'] for data in raw_data]
sign_cls_tokens = [data['sign_cls_token'] for data in raw_data]
eeg_cls_tokens = [data['eeg_cls_token'] for data in raw_data]
labels = [data['label'] for data in raw_data]
files = [data['file'] for data in raw_data]

  normalized_sign_data = torch.tensor(normalized_sign_data, dtype=torch.float32)
  f, Pxx = welch(eeg_data[:, ch].cpu().numpy(), fs=fs)
  f, Pxx = welch(eeg_data[:, ch].cpu().numpy(), fs=fs)
  f, Pxx = welch(eeg_data[:, ch].cpu().numpy(), fs=fs)


In [13]:
# Sanity check of the pipeline output: container/element types first ...
print("Sign data type: ", type(sign_data))
print("EEG data type: ", type(eeg_data))
print("Sign attention mask type: ", type(sign_attention_masks[0]))
print("EEG attention mask type: ", type(eeg_attention_masks[0]))
print("Sign cls token type: ", type(sign_cls_tokens[0]))
print("EEG cls token type: ", type(eeg_cls_tokens[0]))
print("Labels type: ", type(labels))
print("Files type: ", type(files))


# ... then the shapes of the first sample. `labels` and `files` are plain lists, so their
# "shape" lines report the number of samples.
print("Sign data shape: ", sign_data[0].shape)
print("EEG data shape: ", eeg_data[0].shape)
print("Sign attention mask shape: ", sign_attention_masks[0].shape)
print("EEG attention mask shape: ", eeg_attention_masks[0].shape)
print("Sign cls token shape: ", sign_cls_tokens[0].shape)
print("EEG cls token shape: ", eeg_cls_tokens[0].shape)
print("Labels shape: ", len(labels))
print("Files shape: ", len(files))

Sign data type:  <class 'list'>
EEG data type:  <class 'list'>
Sign attention mask type:  <class 'torch.Tensor'>
EEG attention mask type:  <class 'torch.Tensor'>
Sign cls token type:  <class 'torch.Tensor'>
EEG cls token type:  <class 'torch.Tensor'>
Labels type:  <class 'list'>
Files type:  <class 'list'>
Sign data shape:  torch.Size([2996, 10])
EEG data shape:  torch.Size([1247, 5])
Sign attention mask shape:  torch.Size([2997])
EEG attention mask shape:  torch.Size([1248])
Sign cls token shape:  torch.Size([14])
EEG cls token shape:  torch.Size([11])
Labels shape:  2042
Files shape:  2042


## Redesigning the Transformer model implementation

In [13]:
class SignatureEEGDataset(Dataset):
    """Index-aligned signature + EEG samples served as one dictionary per index.

    Args:
        input_data: mapping with the keys ``sign_data``, ``eeg_data``,
            ``sign_attention_masks``, ``eeg_attention_masks``,
            ``sign_cls_tokens``, ``eeg_cls_tokens`` and ``labels``. Every value
            is indexed by sample position.
        num_classes: number of target classes; stored on the instance only.

    The per-modality ``*_seq_len`` / ``*_ts_dim`` attributes are read from the
    first two dimensions of the first sample of that modality.
    """

    # Keys of the dictionary returned by __getitem__; each key is also the name
    # of the instance attribute holding the corresponding per-sample container.
    _ITEM_FIELDS = (
        'sign_x_ts',
        'sign_cls_token',
        'sign_attention_mask',
        'eeg_x_ts',
        'eeg_cls_token',
        'eeg_attention_mask',
        'labels',
    )

    def __init__(self, input_data, num_classes):
        sign_series = input_data['sign_data']
        eeg_series = input_data['eeg_data']
        sign_masks = input_data['sign_attention_masks']
        eeg_masks = input_data['eeg_attention_masks']
        sign_cls = input_data['sign_cls_tokens']
        eeg_cls = input_data['eeg_cls_tokens']
        targets = input_data['labels']

        # Signature modality.
        self.sign_x_ts = sign_series
        self.sign_cls_token = sign_cls
        self.sign_attention_mask = sign_masks
        first_sign = sign_series[0]
        self.sign_seq_len = first_sign.shape[0]
        self.sign_ts_dim = first_sign.shape[1]

        # EEG modality.
        self.eeg_x_ts = eeg_series
        self.eeg_cls_token = eeg_cls
        self.eeg_attention_mask = eeg_masks
        first_eeg = eeg_series[0]
        self.eeg_seq_len = first_eeg.shape[0]
        self.eeg_ts_dim = first_eeg.shape[1]

        self.num_classes = num_classes
        self.labels = targets

    def __len__(self):
        # The label container defines how many samples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pull entry `idx` out of every per-sample container.
        return {field: getattr(self, field)[idx] for field in self._ITEM_FIELDS}

In [14]:
class PositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal positional-encoding table.

    Args:
        d_model: width of each encoding vector. Both even and odd values are
            supported.
        max_len: longest sequence length expected. The table holds
            ``max_len + 1`` positions (presumably to leave room for a
            prepended CLS token; confirm against the caller).
        dropout: accepted for signature compatibility; not used by this class.

    ``forward`` returns only the slice of the table that matches the input's
    sequence length; adding it to the input is left to the caller.
    """

    def __init__(self, d_model, max_len, dropout = 0.1):
        super().__init__()
        pe = torch.zeros(max_len + 1, d_model)
        position = torch.arange(0, max_len + 1, dtype = torch.float).unsqueeze(1)
        divterm = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # Credits to hkproj@github for this as https://github.com/hkproj/pytorch-transformer/blob/main/model.py
        angles = position * divterm  # (max_len + 1, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so drop the surplus frequency instead of failing on a
        # shape mismatch. For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # leading batch axis of size 1 for broadcasting
        # Buffer: follows .to(device) and is saved in state_dict, but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the encodings for the first ``x.shape[1]`` positions, shape (1, x.shape[1], d_model)."""
        return self.pe[:, :x.shape[1]]

In [15]:
class SignatureEEGTransformer(nn.Module):
    """Two-branch classifier over a signature stream and an EEG stream.

    Each modality goes through its own ``SignatureTransformer`` (defined
    elsewhere in this notebook). The two branch outputs are concatenated along
    dim 1 and passed to a small MLP head that emits ``num_classes`` logits.

    Args:
        sign_input_dim / eeg_input_dim: per-step feature width of each stream.
        sign_cls_dim / eeg_cls_dim: width of each stream's CLS token vector.
        d_model: embedding width shared by both branches.
        num_classes: number of output logits.
        num_heads, num_layers, dropout: forwarded unchanged to both branches.
        sign_max_seq_len / eeg_max_seq_len: maximum sequence length per stream.
    """

    def __init__(self, sign_input_dim, sign_cls_dim, eeg_input_dim, eeg_cls_dim, d_model, num_classes, num_heads, num_layers, sign_max_seq_len, eeg_max_seq_len, dropout = 0.1):
        super().__init__()

        # NOTE: the attribute name `sign_transfomer` is misspelled, but it is
        # kept as-is because state_dict keys are derived from it.
        self.sign_transfomer = SignatureTransformer(
            sign_input_dim, sign_cls_dim, d_model, num_classes,
            num_heads, num_layers, sign_max_seq_len, dropout,
        )
        self.eeg_transformer = SignatureTransformer(
            eeg_input_dim, eeg_cls_dim, d_model, num_classes,
            num_heads, num_layers, eeg_max_seq_len, dropout,
        )

        # Fusion head: the concatenated branch outputs are 2 * d_model wide.
        self.classifier = nn.Sequential(
            nn.Linear(d_model * 2, d_model),
            nn.ReLU(),
            nn.Linear(d_model, num_classes),
        )

    def forward(self, sign_x_ts, sign_cls_token, eeg_x_ts, eeg_cls_token, sign_attn_mask = None, eeg_attn_mask = None):
        """Return classification logits for a batch of paired signature/EEG inputs."""
        # Each branch presumably returns a (batch, d_model) embedding, which is
        # what the 2 * d_model classifier input implies; verify against SignatureTransformer.
        sign_embedding = self.sign_transfomer(sign_x_ts, sign_cls_token, sign_attn_mask)
        eeg_embedding = self.eeg_transformer(eeg_x_ts, eeg_cls_token, eeg_attn_mask)
        fused = torch.cat((sign_embedding, eeg_embedding), dim = 1)
        return self.classifier(fused)

In [16]:
# Bundle the prepared per-sample lists under the keys SignatureEEGDataset reads.
input_data = dict(
    sign_data=sign_data,
    eeg_data=eeg_data,
    sign_attention_masks=sign_attention_masks,
    eeg_attention_masks=eeg_attention_masks,
    sign_cls_tokens=sign_cls_tokens,
    eeg_cls_tokens=eeg_cls_tokens,
    labels=labels,
)

num_classes = 2
batch_size = 8

multimodal_dataset = SignatureEEGDataset(input_data, num_classes)
multimodal_dataloader = DataLoader(multimodal_dataset, batch_size=batch_size, shuffle=True)

# Model dimensions come from the first sample of each modality. The dataset
# already records the (seq_len, feature_dim) pair of that first sample, so
# those values are reused; the CLS widths are read from the first CLS token.
sign_ts_dim = multimodal_dataset.sign_ts_dim
sign_cls_dim = input_data['sign_cls_tokens'][0].shape[0]
sign_seq_len = multimodal_dataset.sign_seq_len
eeg_ts_dim = multimodal_dataset.eeg_ts_dim
eeg_cls_dim = input_data['eeg_cls_tokens'][0].shape[0]
eeg_seq_len = multimodal_dataset.eeg_seq_len



In [18]:
# Train the two-branch signature/EEG classifier and report per-epoch metrics
# computed on the training batches themselves (no held-out split in this cell).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SignatureEEGTransformer(
    sign_input_dim=sign_ts_dim,
    sign_cls_dim=sign_cls_dim,
    eeg_input_dim=eeg_ts_dim,
    eeg_cls_dim=eeg_cls_dim,
    d_model=64,
    num_classes=num_classes,  # same value the dataset was built with
    num_heads=4,
    num_layers=2,
    sign_max_seq_len=sign_seq_len,
    eeg_max_seq_len=eeg_seq_len,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_labels = []
    all_preds = []

    # Print a batch to check alignment
    for batch in multimodal_dataloader:
        print(batch['sign_x_ts'].shape, batch['eeg_x_ts'].shape, batch['labels'])
        break

    pbar = tqdm(enumerate(multimodal_dataloader), total=len(multimodal_dataloader), desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_idx, batch in pbar:
        sign_x_ts = batch['sign_x_ts'].to(device)
        sign_cls_token = batch['sign_cls_token'].to(device)
        sign_attention_mask = batch['sign_attention_mask'].to(device)
        eeg_x_ts = batch['eeg_x_ts'].to(device)
        eeg_cls_token = batch['eeg_cls_token'].to(device)
        eeg_attention_mask = batch['eeg_attention_mask'].to(device)
        # Deliberately NOT named `labels`: that notebook-level name holds the
        # full label list used to build `input_data`. Rebinding it here would
        # leave it pointing at the last batch's tensor, so re-running the
        # dataset cell afterwards would silently build a tiny dataset.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(sign_x_ts, sign_cls_token, eeg_x_ts, eeg_cls_token, sign_attention_mask, eeg_attention_mask)
        loss = loss_fn(logits, batch_labels)
        loss.backward()
        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # loss is a batch mean; weight by batch size so the epoch average is per-sample.
        total_loss += loss.item() * batch_labels.size(0)
        preds = logits.argmax(dim=1)
        all_labels.extend(batch_labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

        pbar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})

    avg_loss = total_loss / len(all_labels) if all_labels else 0
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    print("\n" + "="*50)
    print(f"Epoch {epoch+1}/{num_epochs} Summary:")
    print(f"{'Loss':<10}{'Accuracy':<12}{'Precision':<12}{'Recall':<10}{'F1-Score':<10}")
    print(f"{avg_loss:<10.4f}{acc:<12.4f}{prec:<12.4f}{rec:<10.4f}{f1:<10.4f}")
    print("="*50 + "\n")

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 1, 1, 1, 1, 1, 0])


Epoch 1/20: 100%|██████████| 256/256 [02:53<00:00,  1.48it/s, Batch Loss=0.7140]



Epoch 1/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6920    0.5122      0.5238      0.5158    0.5198    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 0, 1, 0, 1, 0, 0])


Epoch 2/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.6998]



Epoch 2/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6793    0.5744      0.5760      0.6383    0.6055    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 0, 1, 1, 1, 0, 1, 0])


Epoch 3/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.5305]



Epoch 3/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6668    0.5926      0.6037      0.5933    0.5985    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 0, 1, 0, 0, 0, 0, 1])


Epoch 4/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=1.0045]



Epoch 4/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6549    0.6131      0.6324      0.5828    0.6066    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 1, 0, 1, 0, 1, 1, 1])


Epoch 5/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.5213]



Epoch 5/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6475    0.6396      0.6608      0.6077    0.6331    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 1, 0, 1, 0, 1, 0, 1])


Epoch 6/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.6490]



Epoch 6/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6363    0.6410      0.6563      0.6268    0.6412    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 0, 0, 0, 0, 1, 1])


Epoch 7/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.5431]



Epoch 7/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6255    0.6459      0.6620      0.6297    0.6454    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 1, 1, 0, 1, 1, 1, 0])


Epoch 8/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.4896]



Epoch 8/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6157    0.6577      0.6727      0.6450    0.6585    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 1, 0, 1, 1, 0, 0])


Epoch 9/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.4931]



Epoch 9/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.6068    0.6729      0.6763      0.6919    0.6840    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 0, 1, 1, 0, 1, 0, 0])


Epoch 10/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.8635]



Epoch 10/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5954    0.6836      0.6829      0.7129    0.6976    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 1, 0, 1, 1, 1, 1])


Epoch 11/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.2135]



Epoch 11/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5887    0.6851      0.6879      0.7043    0.6960    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 0, 1, 1, 0, 1, 1, 1])


Epoch 12/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.6911]



Epoch 12/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5811    0.6993      0.7035      0.7129    0.7082    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 1, 1, 0, 1, 0, 1, 1])


Epoch 13/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.5146]



Epoch 13/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5800    0.6978      0.6949      0.7301    0.7121    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 0, 1, 1, 1, 0, 0, 1])


Epoch 14/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.2633]



Epoch 14/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5666    0.7189      0.7131      0.7541    0.7330    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([0, 0, 1, 0, 1, 1, 1, 0])


Epoch 15/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.4035]



Epoch 15/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5630    0.7209      0.7165      0.7522    0.7339    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 0, 1, 1, 0, 0, 0])


Epoch 16/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.3545]



Epoch 16/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5577    0.7189      0.7068      0.7703    0.7372    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 1, 0, 0, 0, 0, 1, 1])


Epoch 17/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.1966]



Epoch 17/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5556    0.7199      0.7113      0.7617    0.7357    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 0, 1, 0, 0, 1, 0])


Epoch 18/20: 100%|██████████| 256/256 [02:51<00:00,  1.49it/s, Batch Loss=0.3866]



Epoch 18/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5503    0.7228      0.7187      0.7531    0.7355    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 1, 0, 1, 1, 0, 1])


Epoch 19/20: 100%|██████████| 256/256 [02:52<00:00,  1.48it/s, Batch Loss=0.2743]



Epoch 19/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5483    0.7209      0.7049      0.7818    0.7414    

torch.Size([8, 2996, 10]) torch.Size([8, 1247, 5]) tensor([1, 0, 0, 1, 1, 0, 0, 1])


Epoch 20/20: 100%|██████████| 256/256 [02:52<00:00,  1.48it/s, Batch Loss=0.1536]


Epoch 20/20 Summary:
Loss      Accuracy    Precision   Recall    F1-Score  
0.5394    0.7341      0.7282      0.7665    0.7469    






## EEG - Misc

In [None]:
# # For debugging
# def get_max_attention_token_len(attention_tokens):
#     max_len = 0
#     for item in attention_tokens:
#         max_len = max(max_len, item.shape[0])
#     return max_len

In [None]:
# print(get_max_attention_token_len(eeg_final_dataset['attention_masks']))

In [None]:
# # print(normalized_sign_data_dict['000000000200894'][0])
# # print(normalized_eeg_data_dict['000000000200894'][0])
# # print(user_labels['000000000200894'][0])
# eeg_data = get_eeg_data_features(eeg_data_dict)


In [None]:
# sign_data_dict, eeg_data_dict, labels = get_sig_eeg_data_dicts(files_mat_appended, labels_appended)

In [None]:
# # Single data
# user_id = '002108410100048'
# single_eeg_data = {}
# single_eeg_data[user_id] = eeg_data_dict[user_id]

# normalized_eeg_data_dict = normalize_eeg_data_dict(single_eeg_data)
# eeg_data_with_features = get_eeg_data_features(normalized_eeg_data_dict)
# eeg_final_data = eeg_attach_attention_tokens_and_labels(eeg_data_with_features, labels)
# eeg_final_dataset = prepare_eeg_dataset_with_all_parts(eeg_final_data)

In [None]:

# print(eeg_data_with_features['002108410100048']['data'][2].shape)

In [None]:
# max_len = get_eeg_max_seq_len(eeg_data_with_features)
# print(max_len)

In [None]:
# print(eeg_final_data['002108410100048']['labels'][0])


In [None]:

# print(eeg_final_dataset['labels'].shape[0])

# Misc - Trials

In [None]:
# import torch
# import torch.nn.functional as F
# import matplotlib.pyplot as plt

# x1 = torch.tensor([[1.0, 2.0],
#                    [3.0, 1.0],
#                    [0.0, 0.0]])

# x2 = torch.tensor([[2.0, 1.0],
#                    [0.0, 3.0],
#                    [1.0, 1.0]])
# distances = F.pairwise_distance(x1, x2)

# print("Pairwise distances:", distances)

In [None]:
# According to the paper: The instances corresponding to the last EEG activity for each subject were interpolated to match the length of the longest genuine instance for that subject.

In [None]:
# import torch
# import numpy as np
# from torch.utils.data import Dataset, DataLoader
# from sklearn.model_selection import train_test_split
# import torch.nn as nn
# import torch.optim as optim

# class SignatureEEGDataset(Dataset):
#     def __init__(self, signature_data, eeg_data, labels):

#         self.samples = []
        
#         # Combine the data
#         for user_id in signature_data:
#             # Ensure we have matching signature, EEG, and label entries
#             n_samples = len(signature_data[user_id])
#             for i in range(n_samples):
#                 signature = signature_data[user_id][i]
#                 eeg = eeg_data[user_id][i]
#                 label = labels[user_id][i]
                
#                 # Convert to tensors
#                 signature_tensor = torch.FloatTensor(signature)
#                 eeg_tensor = torch.FloatTensor(eeg)
#                 label_tensor = torch.LongTensor([label])
                
#                 self.samples.append((signature_tensor, eeg_tensor, label_tensor))
    
#     def __len__(self):
#         return len(self.samples)
    
#     def __getitem__(self, idx):
#         return self.samples[idx]

# def collate_fn(batch):
    
#     signatures, eegs, labels = zip(*batch)
    
#     # Pad sequences to the same length
#     signatures_padded = torch.nn.utils.rnn.pad_sequence(signatures, batch_first=True)
#     eegs_padded = torch.nn.utils.rnn.pad_sequence(eegs, batch_first=True)
    
#     labels = torch.cat(labels)
    
#     return signatures_padded, eegs_padded, labels

In [None]:
# class SignatureEEGTransformer(nn.Module):
#     def __init__(self, signature_dim=7, eeg_channels=10, d_model=128, nhead=4, num_layers=3, num_classes=2):
#         super(SignatureEEGTransformer, self).__init__()
        
#         # Signature embedding
#         self.signature_embedding = nn.Linear(signature_dim, d_model)
        
#         # EEG embedding
#         self.eeg_embedding = nn.Linear(eeg_channels, d_model)
        
#         # Positional encoding
#         self.positional_encoding = PositionalEncoding(d_model)
        
#         # Transformer encoder
#         encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
#         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
#         # Classifier
#         self.classifier = nn.Sequential(
#             nn.Linear(d_model * 2, d_model),  # *2 because we concatenate signature and EEG features
#             nn.ReLU(),
#             nn.Linear(d_model, num_classes)
#         )
    
#     def forward(self, signature, eeg):
#         # Signature processing
#         signature_embedded = self.signature_embedding(signature)
#         signature_embedded = self.positional_encoding(signature_embedded)
        
#         # EEG processing
#         eeg_embedded = self.eeg_embedding(eeg)
#         eeg_embedded = self.positional_encoding(eeg_embedded)
        
#         # Transformer expects (seq_len, batch, features)
#         signature_embedded = signature_embedded.permute(1, 0, 2)
#         eeg_embedded = eeg_embedded.permute(1, 0, 2)
        
#         # Process through transformer
#         signature_features = self.transformer_encoder(signature_embedded)
#         eeg_features = self.transformer_encoder(eeg_embedded)
        
#         # Average over time dimension
#         signature_features = signature_features.mean(dim=0)
#         eeg_features = eeg_features.mean(dim=0)
        
#         # Concatenate features
#         combined_features = torch.cat([signature_features, eeg_features], dim=1)
        
#         # Classify
#         output = self.classifier(combined_features)
        
#         return output

# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         return x + self.pe[:x.size(0), :]

In [None]:
# def train_model(signature_data, eeg_data, labels, epochs=20, batch_size=32):
#     # Create dataset
#     dataset = SignatureEEGDataset(signature_data, eeg_data, labels)
    
#     # Split into train and validation
#     train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)
    
#     # Create dataloaders
#     train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
#     val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    
#     # Initialize model
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model = SignatureEEGTransformer().to(device)
    
#     # Loss and optimizer
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001)
    
#     # Training loop
#     for epoch in range(epochs):
#         model.train()
#         train_loss = 0.0
        
#         for signatures, eegs, labels in train_loader:
#             signatures, eegs, labels = signatures.to(device), eegs.to(device), labels.to(device)
            
#             optimizer.zero_grad()
            
#             outputs = model(signatures, eegs)
#             loss = criterion(outputs, labels)
            
#             loss.backward()
#             optimizer.step()
            
#             train_loss += loss.item()
        
#         # Validation
#         model.eval()
#         val_loss = 0.0
#         correct = 0
#         total = 0
        
#         with torch.no_grad():
#             for signatures, eegs, labels in val_loader:
#                 signatures, eegs, labels = signatures.to(device), eegs.to(device), labels.to(device)
                
#                 outputs = model(signatures, eegs)
#                 loss = criterion(outputs, labels)
                
#                 val_loss += loss.item()
#                 _, predicted = torch.max(outputs.data, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()
        
#         print(f'Epoch {epoch+1}/{epochs}')
#         print(f'Train Loss: {train_loss/len(train_loader):.4f}')
#         print(f'Val Loss: {val_loss/len(val_loader):.4f}')
#         print(f'Val Accuracy: {100*correct/total:.2f}%')
#         print('-' * 50)
    
#     return model

In [None]:
# NOTE(review): the only `train_model` definition visible in this part of the
# notebook is commented out (previous cell), so this call raises NameError
# unless the function is defined elsewhere. It also rebinds the global `model`.
model = train_model(normalized_sign_data_dict, normalized_eeg_data_dict, user_labels)

In [None]:
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras import layers, models

# def prepare_data_with_masking(signature_data, eeg_data, labels_dict):
#     """
#     Prepares data with dynamic padding and preserves original lengths for masking.
#     Returns:
#         - Padded sequences
#         - Sequence length arrays (for masking)
#         - Labels
#     """
#     # Initialize lists
#     X_signature, X_eeg = [], []
#     len_signature, len_eeg = [], []
#     y = []
    
#     for user_id in labels_dict.keys():
#         for i in range(len(labels_dict[user_id])):
#             # Signature data
#             sig = signature_data[user_id][i]
#             X_signature.append(sig)
#             len_signature.append(len(sig))
            
#             # EEG data
#             eeg = eeg_data[user_id][i]
#             X_eeg.append(eeg)
#             len_eeg.append(len(eeg))
            
#             # Label
#             y.append(labels_dict[user_id][i])
    
#     # Pad sequences to max length in dataset
#     max_len_sig = max(len_signature)
#     max_len_eeg = max(len_eeg)
    
#     X_signature_pad = tf.keras.preprocessing.sequence.pad_sequences(
#         X_signature, maxlen=max_len_sig, dtype='float32', padding='post'
#     )
    
#     X_eeg_pad = tf.keras.preprocessing.sequence.pad_sequences(
#         X_eeg, maxlen=max_len_eeg, dtype='float32', padding='post'
#     )
    
#     return (
#         X_signature_pad, np.array(len_signature),
#         X_eeg_pad, np.array(len_eeg),
#         np.array(y)
#     )

In [None]:
# def create_masked_model(signature_shape, eeg_shape):
#     """Creates a model with masking layers to ignore padding"""
#     # Signature branch
#     signature_input = layers.Input(shape=signature_shape, name='signature_input')
#     signature_length = layers.Input(shape=(1,), name='signature_length', dtype='int32')
    
#     sig = layers.Masking(mask_value=0.0)(signature_input)
#     sig = layers.Conv1D(32, 5, activation='relu', padding='same')(sig)
#     sig = layers.MaxPooling1D(2)(sig)
#     sig = layers.Conv1D(64, 5, activation='relu', padding='same')(sig)
#     sig = layers.MaxPooling1D(2)(sig)
#     sig = layers.GlobalAveragePooling1D()(sig)
    
#     # EEG branch
#     eeg_input = layers.Input(shape=eeg_shape, name='eeg_input')
#     eeg_length = layers.Input(shape=(1,), name='eeg_length', dtype='int32')
    
#     eeg = layers.Masking(mask_value=0.0)(eeg_input)
#     eeg = layers.Conv1D(32, 5, activation='relu', padding='same')(eeg)
#     eeg = layers.MaxPooling1D(2)(eeg)
#     eeg = layers.Conv1D(64, 5, activation='relu', padding='same')(eeg)
#     eeg = layers.MaxPooling1D(2)(eeg)
#     eeg = layers.GlobalAveragePooling1D()(eeg)
    
#     # Combine branches
#     combined = layers.concatenate([sig, eeg])
#     combined = layers.Dense(128, activation='relu')(combined)
#     outputs = layers.Dense(1, activation='sigmoid')(combined)
    
#     # Model with length inputs
#     model = models.Model(
#         inputs=[signature_input, signature_length, eeg_input, eeg_length],
#         outputs=outputs
#     )
    
#     model.compile(optimizer='adam',
#                  loss='binary_crossentropy',
#                  metrics=['accuracy'])
    
#     return model

In [None]:
# NOTE(review): `tf` is only imported in a commented-out line in the visible
# part of this notebook; confirm tensorflow is imported before this cell runs.
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves consecutive fixed-size batches.

    Args:
        X_sig, len_sig: signature inputs and their per-sample lengths.
        X_eeg, len_eeg: EEG inputs and their per-sample lengths.
        y: targets; its length defines the number of samples.
        batch_size: number of samples per batch (the last batch may be shorter).
        **kwargs: forwarded to the Keras base-class constructor.

    All five containers are sliced with the same index window, so they are
    assumed to be index-aligned and to support slice indexing.
    """

    def __init__(self, X_sig, len_sig, X_eeg, len_eeg, y, batch_size=32, **kwargs):
        # Newer Keras versions expect subclasses to run the base constructor.
        super().__init__(**kwargs)
        self.X_sig = X_sig
        self.len_sig = len_sig
        self.X_eeg = X_eeg
        self.len_eeg = len_eeg
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches, counting a trailing partial batch.
        return int(np.ceil(len(self.y) / self.batch_size))

    def __getitem__(self, idx):
        # One shared window keeps inputs, lengths and targets aligned.
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        batch_inputs = [
            self.X_sig[window],
            self.len_sig[window],
            self.X_eeg[window],
            self.len_eeg[window],
        ]
        return batch_inputs, self.y[window]

In [None]:
# def train_authentication_model(signature_data, eeg_data, labels_dict):
#     # Prepare data
#     X_signature, X_eeg, y = prepare_data(signature_data, eeg_data, labels_dict)
    
#     # Split into train and test sets
#     (X_signature_train, X_signature_test, 
#      X_eeg_train, X_eeg_test, 
#      y_train, y_test) = train_test_split(X_signature, X_eeg, y, test_size=0.2, random_state=42)
    
#     # Create model
#     signature_shape = X_signature_train.shape[1:]
#     eeg_shape = X_eeg_train.shape[1:]
#     model = create_dual_input_model(signature_shape, eeg_shape)
    
#     # Train model
#     history = model.fit(
#         [X_signature_train, X_eeg_train],
#         y_train,
#         epochs=50,
#         batch_size=32,
#         validation_data=([X_signature_test, X_eeg_test], y_test),
#         callbacks=[
#             tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
#             tf.keras.callbacks.ReduceLROnPlateau(factor=0.1, patience=3)
#         ]
#     )
    
#     return model, history

In [None]:
# NOTE(review): `prepare_data_with_masking` and `create_masked_model` are only
# present as commented-out code in the visible part of this notebook, and no
# live import of `train_test_split` / `tf` is visible here; confirm they are
# defined before running this cell. The cell also rebinds the global `model`.

# 1. Prepare data
X_sig, len_sig, X_eeg, len_eeg, norm_y = prepare_data_with_masking(signature_data, eeg_data, labels_dict)

# 2. Split data
# A fixed random_state makes the 80/20 split reproducible across runs.
(X_sig_train, X_sig_test, 
 len_sig_train, len_sig_test,
 X_eeg_train, X_eeg_test,
 len_eeg_train, len_eeg_test,
 y_train, y_test) = train_test_split(X_sig, len_sig, X_eeg, len_eeg, norm_y, test_size=0.2, random_state=42)

# 3. Create model
# Input shapes drop the leading sample axis.
model = create_masked_model(X_sig_train.shape[1:], X_eeg_train.shape[1:])

# 4. Create generators
train_gen = DataGenerator(X_sig_train, len_sig_train, X_eeg_train, len_eeg_train, y_train)
val_gen = DataGenerator(X_sig_test, len_sig_test, X_eeg_test, len_eeg_test, y_test)

# 5. Train
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5)
    ]
)

In [None]:
def evaluate_model(model, X_signature_test, X_eeg_test, y_test):
    """Print evaluation metrics, a classification report and a confusion matrix.

    Args:
        model: object exposing ``evaluate(inputs, y)`` and ``predict(inputs)``.
            ``evaluate`` may return a single scalar or a sequence of scalars;
            ``predict`` must return an array that supports ``> 0.5``.
        X_signature_test: signature inputs, passed as the first model input.
        X_eeg_test: EEG inputs, passed as the second model input.
        y_test: ground-truth labels.

    Returns:
        None. Everything is written to stdout.
    """
    from sklearn.metrics import classification_report, confusion_matrix

    inputs = [X_signature_test, X_eeg_test]
    results = model.evaluate(inputs, y_test)

    # Print only the values the model actually reported. Indexing a fixed
    # position (e.g. the AUC slot) fails for models compiled with fewer metrics.
    metric_names = ("Test Loss", "Test Accuracy", "Test AUC")
    for metric_name, value in zip(metric_names, np.atleast_1d(results)):
        print(f"{metric_name}: {value:.4f}")

    # You can add more evaluation metrics as needed
    y_pred = model.predict(inputs)
    # Threshold the predicted scores at 0.5 to get hard 0/1 labels.
    y_pred = (y_pred > 0.5).astype(int)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))