In [1]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score, accuracy_score
from torch import zeros
from torch.utils.data import DataLoader
from torchsampler import ImbalancedDatasetSampler
from tqdm.auto import tqdm
from yaml import load as load_yaml, FullLoader

from dataset import KIDataset, k_fold_cross_validator
from models.rocket import ROCKET
from processor.processor import Leif
from utils.const import SEED
from utils.data import binarize
from utils.misc import set_random_state
from utils.path import config_path

# Fix the random state once, before anything stochastic runs (project helper)
set_random_state(SEED)

# NOTE: the `eyetrackpdc` conda environment has to be active *before* the
# Jupyter kernel is started. A `!conda activate eyetrackpdc` line in a cell only
# runs inside a short-lived subshell and cannot change the environment of the
# kernel that is already running, so it is intentionally not executed here.

# Data parameters
# When True, the datasets are binarized after the cross-validation split
BINARY_CLF = True

# Rocket parameters
# Forwarded to ROCKET as `n_kernels`
NUM_KERNELS = 1000
# Forwarded to ROCKET as `normalize`
NORMALIZE = True

# Classifier parameters
# Forwarded to RidgeClassifier as `alpha` (regularization strength)
REG_FACTOR = 1e5

# Number of folds in cross validation
K = 5

## Initialize Datasets and Dataloaders

In [2]:
# Parse the YAML settings for the Leif processor
leif_config_file = f'{config_path}/leif.yaml'
with open(leif_config_file, mode='r') as config_stream:
    config = load_yaml(config_stream, Loader=FullLoader)

# Build the data processor from the parsed settings
processor = Leif(config)

# Both datasets share the same processor:
#   train_val_ds -> the `train=True` data, later split into folds
#   test_ds      -> the `train=False` data
train_val_ds = KIDataset(train=True, data_processor=processor)
test_ds = KIDataset(train=False, data_processor=processor)

loading files: 0files [00:00, ?files/s]

skipped 1 files (position max value outlier)
skipped 5 files (position snr outlier)
skipped 41 files (position mean velocity outlier)
skipped 1 files (drift max value outlier)
skipped 3 files (drift snr outlier)
skipped 1 files (drift mean velocity outlier)
skipped 2 files (no target movement)


segmenting time series: 100%|██████████| 156/156 [00:09<00:00, 15.91ts/s]


loading files: 0files [00:00, ?files/s]

segmenting time series: 100%|██████████| 89/89 [00:05<00:00, 15.76ts/s]


In [3]:
# Per-fold validation metrics: weighted F1 score and accuracy
scores, accuracies = zeros(K), zeros(K)

# The test set is not part of the folds, so it only needs to be binarized once
# (in place) instead of once per fold.
if BINARY_CLF:
    binarize(test_ds)

for i, (train_ds, val_ds) in tqdm(enumerate(k_fold_cross_validator(train_val_ds, k=K)), unit='fold', total=K):
    # Reset the random state at the start of every fold (same seed each time)
    set_random_state(SEED)

    # Binarize dataset after split to make sure split is stratified w.r.t all three classes
    if BINARY_CLF:
        for ds in (train_ds, val_ds):
            binarize(ds)

    # Initialize Dataloaders; the batch size equals the dataset size, so a single
    # batch holds the whole (re-sampled, for training) dataset
    train_dl = DataLoader(train_ds,
                          batch_size=train_ds.x.shape[0],
                          sampler=ImbalancedDatasetSampler(train_ds, callback_get_label=lambda item: item.y))
    val_dl = DataLoader(val_ds, batch_size=val_ds.x.shape[0])

    # Initialize Rocket
    rocket = ROCKET(c_in=train_ds.x.shape[1],
                    seq_len=train_ds.x.shape[2],
                    n_kernels=NUM_KERNELS,
                    normalize=NORMALIZE)

    # Initialize Classifier
    clf = RidgeClassifier(alpha=REG_FACTOR, random_state=SEED)

    # Batch is entire dataset
    train_batch = next(iter(train_dl))
    val_batch = next(iter(val_dl))

    # Perform ROCKET transformation stage on train set
    features = rocket(train_batch.x)

    # Fit Classifier
    clf.fit(features, train_batch.y.numpy())

    # Perform ROCKET transformation stage on validation set
    val_features = rocket(val_batch.x)

    # Make predictions on validation set. RidgeClassifier.predict returns labels
    # from the original label space (clf.classes_), so the predictions can be
    # compared with the targets directly - no remapping from {-1, 1} is needed.
    pred = clf.predict(val_features)

    # Compute F1 score and accuracy for this fold
    val_targets = val_batch.y.numpy()
    scores[i] = f1_score(val_targets, pred, average='weighted')
    accuracies[i] = accuracy_score(val_targets, pred)

# Print average fold score
print(f'average fold f1 score: {scores.mean()}')
# Print score for each fold
print('\n'.join([f'fold {i}: {score}' for i, score in enumerate(scores)]))
# Print average fold accuracy
print(f'average fold accuracy: {accuracies.mean():.2%}')
# Print accuracy for each fold
print('\n'.join([f'fold {i}: {accuracy:.2%}' for i, accuracy in enumerate(accuracies)]))

  0%|          | 0/5 [00:00<?, ?fold/s]

average fold f1 score: 0.6133731603622437
fold 0: 0.5948056578636169
fold 1: 0.6530392169952393
fold 2: 0.6237348318099976
fold 3: 0.5800867676734924
fold 4: 0.6151996850967407
average fold accuracy: 60.98%
fold 0: 58.49%
fold 1: 66.76%
fold 2: 61.65%
fold 3: 57.04%
fold 4: 60.95%
