In [3]:
import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.utils
import sklearn.metrics
import iisignature
import torch
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List, Optional, Dict, Set, Callable, Any
from joblib import Memory, Parallel, delayed
import tslearn
import tslearn.metrics
from tslearn.datasets import UCR_UEA_datasets
import sigkernel
import scipy
from scipy.interpolate import interp1d
from numba import njit
import pickle

from signature import streams_to_sigs, transform_stream
from conformance import BaseclassConformanceScore, stream_to_torch
from kernels import linear_kernel_gram, rbf_kernel_gram, poly_kernel_gram
from kernels import pairwise_kernel_gram, integral_kernel_gram, sig_kernel_gram
from experiment_code import print_dataset_stats

## Find all tslearn datasets

In [4]:
# _datasets = [
#             'ArticularyWordRecognition', 
#             'BasicMotions', 
#             'Cricket',
#             #'ERing',
#             'Libras', 
#             'NATOPS', 
#             'RacketSports',     
#             'FingerMovements',
#             'Heartbeat',
#             'SelfRegulationSCP1', 
#             'UWaveGestureLibrary'
#             ]

# import tslearn
# UCR_UEA_datasets = tslearn.datasets.UCR_UEA_datasets()

# for dataset_name in UCR_UEA_datasets.list_multivariate_datasets():
# #for dataset_name in _datasets:
#     print("Dataset:", dataset_name)
#     dataset = UCR_UEA_datasets.load_dataset(dataset_name)
#     if dataset[0] is not None:
#         X_train, y_train, X_test, y_test = dataset
#         num_classes = len(np.unique(y_train))
#         N_train, T, d = X_train.shape
#         N_test, _, _  = X_test.shape
        
#         print("Number of Classes:", num_classes)
#         print("Dimension of path:", d)
#         print("Length:", T)
#         print("Train Size, Test Size", N_train, N_test)
#         print()
#     else:
#         print("No dataset found")
#         print()

#yes
# Dataset: ArticularyWordRecognition
# Number of Classes: 25
# Dimension of path: 9
# Length: 144
# Train Size, Test Size 275 300

# Dataset: AtrialFibrillation
# No dataset found

#yes
# Dataset: BasicMotions
# Number of Classes: 4
# Dimension of path: 6
# Length: 100
# Train Size, Test Size 40 40

# Dataset: CharacterTrajectories
# No dataset found

#yes
# Dataset: Cricket
# Number of Classes: 12
# Dimension of path: 6
# Length: 1197
# Train Size, Test Size 108 72

# Dataset: DuckDuckGeese
# No dataset found

# Dataset: EigenWorms
# Number of Classes: 5
# Dimension of path: 6
# Length: 17984
# Train Size, Test Size 128 131

#why not
# Dataset: Epilepsy
# Number of Classes: 4
# Dimension of path: 3
# Length: 206
# Train Size, Test Size 137 138

#longLength
# Dataset: EthanolConcentration
# Number of Classes: 4
# Dimension of path: 3
# Length: 1751
# Train Size, Test Size 261 263

# Dataset: ERing
# No dataset found

#big
# Dataset: FaceDetection
# Number of Classes: 2
# Dimension of path: 144
# Length: 62
# Train Size, Test Size 5890 3524

#yes
# Dataset: FingerMovements
# Number of Classes: 2
# Dimension of path: 28
# Length: 50
# Train Size, Test Size 316 100

#why not, maybe big length
# Dataset: HandMovementDirection
# Number of Classes: 4
# Dimension of path: 10
# Length: 400
# Train Size, Test Size 160 74

#smallTrain
# Dataset: Handwriting
# Number of Classes: 26
# Dimension of path: 3
# Length: 152
# Train Size, Test Size 150 850

#yes
# Dataset: Heartbeat
# Number of Classes: 2
# Dimension of path: 61
# Length: 405
# Train Size, Test Size 204 205

#big
# Dataset: InsectWingbeat
# Number of Classes: 10
# Dimension of path: 200
# Length: 22
# Train Size, Test Size 25000 25000

# Dataset: JapaneseVowels
# No dataset found

#yes
# Dataset: Libras
# Number of Classes: 15
# Dimension of path: 2
# Length: 45
# Train Size, Test Size 180 180

# TODO: this dataset should be included in the experiments
# Dataset: LSST
# Number of Classes: 14
# Dimension of path: 6
# Length: 36
# Train Size, Test Size 2459 2466

#length
# Dataset: MotorImagery
# Number of Classes: 2
# Dimension of path: 64
# Length: 3000
# Train Size, Test Size 278 100

#yes
# Dataset: NATOPS
# Number of Classes: 6
# Dimension of path: 24
# Length: 51
# Train Size, Test Size 180 180

#TODO NOT TSLEARN. LENGTH WRONG
# Dataset: PenDigits
# Number of Classes: 10
# Dimension of path: 2
# Length: 8
# Train Size, Test Size 7494 3498

#highDim
# Dataset: PEMS-SF
# Number of Classes: 7
# Dimension of path: 963
# Length: 144
# Train Size, Test Size 267 173

#dim=1, big length
# Dataset: Phoneme
# Number of Classes: 39
# Dimension of path: 1
# Length: 1024
# Train Size, Test Size 214 1896

#yes
# Dataset: RacketSports
# Number of Classes: 4
# Dimension of path: 6
# Length: 30
# Train Size, Test Size 151 152

#yes
# Dataset: SelfRegulationSCP1
# Number of Classes: 2
# Dimension of path: 6
# Length: 896
# Train Size, Test Size 268 293

# Dataset: SelfRegulationSCP2
# Number of Classes: 2
# Dimension of path: 7
# Length: 1152
# Train Size, Test Size 200 180

# Dataset: SpokenArabicDigits
# No dataset found

#long, also very small set
# Dataset: StandWalkJump
# Number of Classes: 3
# Dimension of path: 4
# Length: 2500
# Train Size, Test Size 12 15

#yes
# Dataset: UWaveGestureLibrary
# Number of Classes: 8
# Dimension of path: 3
# Length: 315
# Train Size, Test Size 120 320


# (tslearn) Cross Validation on Train

In [6]:
# NOTE(review): this import lives mid-notebook rather than in the top import
# cell; the cell only works once `cross_validation` is importable.
from cross_validation import cv_tslearn

# Run cross-validation (on the training data, per the section heading) for every
# selected dataset/kernel pair. Datasets and kernels are switched on or off by
# (un)commenting entries in the two lists below.
# NOTE(review): the saved output under this cell reports "signature pde" while
# the list below selects "signature pde rbf" -- the stored output may be stale;
# re-run the cell to confirm.
cv_best_models = cv_tslearn(
    dataset_names = [
        #'ArticularyWordRecognition', 
        #'BasicMotions', 
        #'Cricket',
         ##########'ERing', # can't find this dataset
        'Libras', 
        #'NATOPS', 
        #'RacketSports',     
        #'FingerMovements',
        #'Heartbeat',
        #'SelfRegulationSCP1', 
        #'UWaveGestureLibrary'
        ],
    kernel_names = [
        #"linear",
        "rbf",
        #"poly",
        "gak",
        #"truncated sig",
        #"truncated sig rbf",
        #"truncated sig poly",
        #"signature pde",
        "signature pde rbf",
        #"signature pde poly",
        #"integral linear",
        "integral rbf",
        #"integral poly",
        ],
        k=4,                # presumably the number of CV folds -- TODO confirm in cv_tslearn
        n_repeats=1,        # presumably how many times the CV is repeated -- TODO confirm
        n_jobs_repeats=1,   # presumably parallel workers across repeats -- TODO confirm
        n_jobs_gram=4,      # presumably parallel workers for Gram-matrix computation -- TODO confirm
        verbose=False,
        )

# Dumps the whole nested result dict (dataset -> 'kernel_results' -> kernel ->
# label -> parameters); the output is large.
print(cv_best_models)

Dataset: Libras


Label for rbf: 100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


Time taken for kernel rbf: 12.121767520904541 seconds


Label for gak: 100%|██████████| 15/15 [00:15<00:00,  1.03s/it]


Time taken for kernel gak: 15.490598678588867 seconds


Label for signature pde: 100%|██████████| 15/15 [00:55<00:00,  3.68s/it]


Time taken for kernel signature pde: 55.266334533691406 seconds


Label for integral rbf: 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]

Time taken for kernel integral rbf: 11.5567786693573 seconds
Time taken for dataset Libras: 94.43749403953552 seconds



{'Libras': {'kernel_results': {'rbf': {'1': {'sigma': 1.0, 'kernel_name': 'rbf', 'normal_class_label': '1', 'threshold': 3, 'CV_train_auc': 0.8988095238095238, 'auc_params': array([0.86507937, 0.85714286, 0.8452381 , 0.86507937, 0.8968254 ,
       0.89880952]), 'auc_thresh': array([0.70634921, 0.84722222, 0.89880952, 0.8968254 , 0.73214286,
       0.75793651, 0.86507937, 0.86309524, 0.8452381 ])}, '10': {'sigma': 0.36787944117144233, 'kernel_name': 'rbf', 'normal_class_label': '10', 'threshold': 5, 'CV_train_auc': 0.8908730158730159, 'auc_params': array([0.8531746 , 0.83928571, 0.82936508, 0.78769841, 0.89087302,
       0.88492063]), 'auc_thresh': array([0.81150794, 0.83333333, 0.72222222, 0.82738095, 0.89087302,
       0.8531746 , 0.80753968, 0.88492063, 0.82539683])}, '11': {'sigma': 0.006737946999085467, 'kernel_name': 'rbf', 'normal_class_label': '11', 'threshold




### Print CV results

In [8]:
def average_labels(labelwise_dict: Dict[str, Dict[str, Any]],
                   field: str):
    """Return the mean of ``field`` taken across all labels.

    Args:
        labelwise_dict: Mapping from label to that label's parameter dict.
        field: Key to read from every per-label parameter dict.

    Returns:
        ``np.mean`` over the collected per-label values along axis 0, i.e. a
        scalar when the field holds scalars and an element-wise average when
        it holds equally shaped arrays.

    Raises:
        KeyError: If ``field`` is missing from any per-label dict.
    """
    per_label_values = []
    for label_params in labelwise_dict.values():
        per_label_values.append(label_params[field])
    # axis=0 averages over the label dimension and keeps any array shape intact.
    return np.mean(per_label_values, axis=0)


def print_cv_tslearn_results(
        dataset_kernel_label_paramdict : Dict[str, Dict[str, Any]],
        ):
    """Print dataset statistics and label-averaged CV AUCs for each kernel.

    Args:
        dataset_kernel_label_paramdict: Mapping from dataset name to a result
            dict. Each result dict is read for the keys ``'kernel_results'``
            (kernel name -> label -> parameter dict), ``'num_classes'``,
            ``'ts_length'``, ``'N_train'`` and ``'path dim'``. Every per-label
            parameter dict must provide ``'CV_train_auc'``, ``'auc_params'``
            and ``'auc_thresh'``; kernels whose name contains
            ``"truncated sig"`` must additionally provide ``'auc_truncs'``.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    # `print_dataset_stats` comes from the notebook's top-level import cell, so
    # it is no longer re-imported on every loop iteration.
    for dataset_name, results in dataset_kernel_label_paramdict.items():
        print(dataset_name)
        kernelwise_dict = results["kernel_results"]
        n_classes = results['num_classes']
        ts_length = results['ts_length']
        n_train = results['N_train']
        path_dim = results['path dim']
        # No test-set size is read from the CV results, so a placeholder is shown.
        print_dataset_stats(n_classes, path_dim, ts_length, n_train, "unknown")
        for kernel_name, labelwise_dict in kernelwise_dict.items():
            final_auc_avgs = average_labels(labelwise_dict, "CV_train_auc")
            params_auc_avgs = average_labels(labelwise_dict, "auc_params")
            thresh_auc_avgs = average_labels(labelwise_dict, "auc_thresh")
            print(f"\n{kernel_name}")
            print("final_auc_avgs", final_auc_avgs)
            print("params_auc_avgs", params_auc_avgs)
            print("thresh_auc_avgs", thresh_auc_avgs)
            # Only the truncated-signature kernels carry per-truncation AUCs.
            if "truncated sig" in kernel_name:
                trunc_auc_avgs = average_labels(labelwise_dict, "auc_truncs")
                print("trunc_auc_avgs", trunc_auc_avgs)
        print("\nEnd dataset \n\n\n")

# Summarise the cross-validation results held in `cv_best_models`: dataset
# statistics followed by the label-averaged AUCs of every kernel.
print_cv_tslearn_results(cv_best_models)

Libras
Number of Classes: 15
Dimension of path: 2
Length: 45
Train: 180
Test: unknown

rbf
final_auc_avgs 0.958862433862434
params_auc_avgs [0.93465608 0.93518519 0.91746032 0.88955026 0.91653439 0.90582011]
thresh_auc_avgs [0.75621693 0.8265873  0.87407407 0.90502646 0.90727513 0.91468254
 0.9212963  0.93227513 0.91150794]

gak
final_auc_avgs 0.9142857142857144
params_auc_avgs [0.91428571]
thresh_auc_avgs [0.6457672  0.72857143 0.80820106 0.83769841 0.80238095 0.79404762
 0.74365079 0.73783069 0.57089947]

signature pde
final_auc_avgs 0.896031746031746
params_auc_avgs [0.89603175]
thresh_auc_avgs [0.83677249 0.84444444 0.85357143 0.86428571 0.87050265 0.87103175
 0.87685185 0.88148148 0.87579365]

integral rbf
final_auc_avgs 0.953042328042328
params_auc_avgs [0.93518519 0.9281746  0.91560847 0.88915344 0.87896825 0.88108466]
thresh_auc_avgs [0.72579365 0.80806878 0.85925926 0.87275132 0.8984127  0.91071429
 0.91494709 0.9281746  0.90793651]

End dataset 





# (tslearn) Validate on Test

In [9]:
# NOTE(review): mid-notebook import; `experiment_code` is already imported from
# in the top import cell, so this could live there as well.
from experiment_code import validate_tslearn

# Presumably evaluates the CV-selected parameters in `cv_best_models` on the
# test split (per the section heading) -- confirm in experiment_code.
test_results_tslearn = validate_tslearn(cv_best_models)

Libras
Number of Classes: 15
Dimension of path: 2
Length: 45
Train: 180
Test: 180
Covariance operator numerical rank = 3
Covariance operator numerical rank = 5
Covariance operator numerical rank = 2
Covariance operator numerical rank = 8
Covariance operator numerical rank = 8
Covariance operator numerical rank = 9
Covariance operator numerical rank = 1
Covariance operator numerical rank = 8
Covariance operator numerical rank = 9
Covariance operator numerical rank = 7
Covariance operator numerical rank = 8
Covariance operator numerical rank = 6
Covariance operator numerical rank = 7
Covariance operator numerical rank = 5
Covariance operator numerical rank = 6


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 162.83it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2174.22it/s]


Covariance operator numerical rank = 3


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1536.92it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2224.32it/s]


Covariance operator numerical rank = 7


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1599.86it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2238.07it/s]


Covariance operator numerical rank = 2


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1779.10it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2368.27it/s]


Covariance operator numerical rank = 5


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1662.39it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2220.34it/s]


Covariance operator numerical rank = 8


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1572.21it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2386.06it/s]


Covariance operator numerical rank = 5


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1762.27it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2579.99it/s]


Covariance operator numerical rank = 1


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1340.80it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2427.81it/s]


Covariance operator numerical rank = 8


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1886.37it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2446.10it/s]


Covariance operator numerical rank = 2


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 2115.19it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2493.76it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1831.28it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2422.16it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1738.99it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2521.18it/s]


Covariance operator numerical rank = 7


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1666.19it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2372.03it/s]


Covariance operator numerical rank = 6


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1700.40it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2533.08it/s]


Covariance operator numerical rank = 2


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 1852.72it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:00<00:00, 2346.92it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 299.87it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 298.52it/s]


Covariance operator numerical rank = 1


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 239.01it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 289.07it/s]


Covariance operator numerical rank = 8


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 288.31it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 285.43it/s]


Covariance operator numerical rank = 6


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 273.28it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 294.38it/s]


Covariance operator numerical rank = 6


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 286.56it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 300.73it/s]


Covariance operator numerical rank = 3


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 308.83it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:08<00:00, 243.34it/s]


Covariance operator numerical rank = 1


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 203.97it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:08<00:00, 249.78it/s]


Covariance operator numerical rank = 7


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 223.12it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 290.77it/s]


Covariance operator numerical rank = 3


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 300.60it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 306.43it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 253.41it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 273.00it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 245.83it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 272.11it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 291.06it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 298.99it/s]


Covariance operator numerical rank = 9


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 307.61it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 273.57it/s]


Covariance operator numerical rank = 5


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 257.46it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 273.17it/s]


Covariance operator numerical rank = 4


Kernel Gram Matrix: 100%|██████████| 78/78 [00:00<00:00, 258.69it/s]
Kernel Gram Matrix: 100%|██████████| 2160/2160 [00:07<00:00, 280.74it/s]


Covariance operator numerical rank = 4
Covariance operator numerical rank = 3
Covariance operator numerical rank = 6
Covariance operator numerical rank = 2
Covariance operator numerical rank = 8
Covariance operator numerical rank = 6
Covariance operator numerical rank = 8
Covariance operator numerical rank = 2
Covariance operator numerical rank = 8
Covariance operator numerical rank = 8
Covariance operator numerical rank = 8
Covariance operator numerical rank = 8
Covariance operator numerical rank = 6
Covariance operator numerical rank = 6
Covariance operator numerical rank = 5
Covariance operator numerical rank = 6


### Print test results

In [10]:
def print_experiment_results(experiments, round_digits=5):
    """Print dataset statistics and the per-kernel test scores.

    Args:
        experiments: Mapping from dataset name to a result dict that is read
            for the keys ``"num_classes"``, ``"path dim"``, ``"ts_length"``,
            ``"N_train"``, ``"N_test"`` and ``"results"``. ``"results"`` maps a
            kernel name to a score container indexed as ``scores[row, col]``.
        round_digits: Number of decimals passed to ``round`` for each score.

    Returns:
        None. Everything is written to stdout.
    """
    # (printed label, row, column) for each reported score, in print order.
    score_layout = (
        ("Conformance AUC:", 0, 0),
        ("Mahalanobis AUC:", 1, 0),
        ("Conformance PR AUC:", 0, 1),
        ("Mahalanobis PR AUC:", 1, 1),
    )

    for dataset_name, results in experiments.items():
        # Dataset header
        print("Dataset:", dataset_name)
        print_dataset_stats(
            results["num_classes"],
            results["path dim"],
            results["ts_length"],
            results["N_train"],
            results["N_test"],
        )

        # One section per kernel
        for kernel_name, scores in results["results"].items():
            print("\nKernel:", kernel_name)
            for label, row, col in score_layout:
                print(label, round(scores[row, col], round_digits))

        print("\nEnd Dataset\n\n\n")


# Print the scores stored in `test_results_tslearn` for every dataset and kernel.
print_experiment_results(test_results_tslearn)

Dataset: Libras
Number of Classes: 15
Dimension of path: 2
Length: 45
Train: 180
Test: 180

Kernel: rbf
Conformance AUC: 0.9297
Mahalanobis AUC: 0.67533
Conformance PR AUC: 0.9907
Mahalanobis PR AUC: 0.94469

Kernel: gak
Conformance AUC: 0.85539
Mahalanobis AUC: 0.09514
Conformance PR AUC: 0.97347
Mahalanobis PR AUC: 0.83018

Kernel: signature pde
Conformance AUC: 0.87255
Mahalanobis AUC: 0.83267
Conformance PR AUC: 0.98736
Mahalanobis PR AUC: 0.98396

Kernel: integral rbf
Conformance AUC: 0.92371
Mahalanobis AUC: 0.74358
Conformance PR AUC: 0.99101
Mahalanobis PR AUC: 0.96143

End Dataset



