In [1]:
import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.utils
import sklearn.metrics
import iisignature
import torch
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List, Optional, Dict, Set, Callable, Any
from joblib import Memory, Parallel, delayed
import tslearn
import tslearn.metrics
from tslearn.datasets import UCR_UEA_datasets
import sigkernel
import scipy
from scipy.interpolate import interp1d
from numba import njit
import pickle

from signature import streams_to_sigs, transform_stream
from conformance import BaseclassConformanceScore, stream_to_torch
from kernels import linear_kernel_gram, rbf_kernel_gram, poly_kernel_gram
from kernels import pairwise_kernel_gram, integral_kernel_gram, sig_kernel_gram
from experiment_code import print_dataset_stats

## Find all tslearn datasets

In [None]:
# _datasets = [
#             'ArticularyWordRecognition', 
#             'BasicMotions', 
#             'Cricket',
#             #'ERing',
#             'Libras', 
#             'NATOPS', 
#             'RacketSports',     
#             'FingerMovements',
#             'Heartbeat',
#             'SelfRegulationSCP1', 
#             'UWaveGestureLibrary'
#             ]

# import tslearn
# UCR_UEA_datasets = tslearn.datasets.UCR_UEA_datasets()

# for dataset_name in UCR_UEA_datasets.list_multivariate_datasets():
# #for dataset_name in _datasets:
#     print("Dataset:", dataset_name)
#     dataset = UCR_UEA_datasets.load_dataset(dataset_name)
#     if dataset[0] is not None:
#         X_train, y_train, X_test, y_test = dataset
#         num_classes = len(np.unique(y_train))
#         N_train, T, d = X_train.shape
#         N_test, _, _  = X_test.shape
        
#         print("Number of Classes:", num_classes)
#         print("Dimension of path:", d)
#         print("Length:", T)
#         print("Train Size, Test Size", N_train, N_test)
#         print()
#     else:
#         print("No dataset found")
#         print()

#yes
# Dataset: ArticularyWordRecognition
# Number of Classes: 25
# Dimension of path: 9
# Length: 144
# Train Size, Test Size 275 300

# Dataset: AtrialFibrillation
# No dataset found

#yes
# Dataset: BasicMotions
# Number of Classes: 4
# Dimension of path: 6
# Length: 100
# Train Size, Test Size 40 40

# Dataset: CharacterTrajectories
# No dataset found

#yes
# Dataset: Cricket
# Number of Classes: 12
# Dimension of path: 6
# Length: 1197
# Train Size, Test Size 108 72

# Dataset: DuckDuckGeese
# No dataset found

# Dataset: EigenWorms
# Number of Classes: 5
# Dimension of path: 6
# Length: 17984
# Train Size, Test Size 128 131

#why not
# Dataset: Epilepsy
# Number of Classes: 4
# Dimension of path: 3
# Length: 206
# Train Size, Test Size 137 138

#longLength
# Dataset: EthanolConcentration
# Number of Classes: 4
# Dimension of path: 3
# Length: 1751
# Train Size, Test Size 261 263

# Dataset: ERing
# No dataset found

#big
# Dataset: FaceDetection
# Number of Classes: 2
# Dimension of path: 144
# Length: 62
# Train Size, Test Size 5890 3524

#yes
# Dataset: FingerMovements
# Number of Classes: 2
# Dimension of path: 28
# Length: 50
# Train Size, Test Size 316 100

#why not, maybe big length
# Dataset: HandMovementDirection
# Number of Classes: 4
# Dimension of path: 10
# Length: 400
# Train Size, Test Size 160 74

#smallTrain
# Dataset: Handwriting
# Number of Classes: 26
# Dimension of path: 3
# Length: 152
# Train Size, Test Size 150 850

#yes
# Dataset: Heartbeat
# Number of Classes: 2
# Dimension of path: 61
# Length: 405
# Train Size, Test Size 204 205

#big
# Dataset: InsectWingbeat
# Number of Classes: 10
# Dimension of path: 200
# Length: 22
# Train Size, Test Size 25000 25000

# Dataset: JapaneseVowels
# No dataset found

#yes
# Dataset: Libras
# Number of Classes: 15
# Dimension of path: 2
# Length: 45
# Train Size, Test Size 180 180

#TODO I SHOULD INCLUDE
# Dataset: LSST
# Number of Classes: 14
# Dimension of path: 6
# Length: 36
# Train Size, Test Size 2459 2466

#length
# Dataset: MotorImagery
# Number of Classes: 2
# Dimension of path: 64
# Length: 3000
# Train Size, Test Size 278 100

#yes
# Dataset: NATOPS
# Number of Classes: 6
# Dimension of path: 24
# Length: 51
# Train Size, Test Size 180 180

#TODO NOT TSLEARN. LENGTH WRONG
# Dataset: PenDigits
# Number of Classes: 10
# Dimension of path: 2
# Length: 8
# Train Size, Test Size 7494 3498

#highDim
# Dataset: PEMS-SF
# Number of Classes: 7
# Dimension of path: 963
# Length: 144
# Train Size, Test Size 267 173

#dim=1, big length
# Dataset: Phoneme
# Number of Classes: 39
# Dimension of path: 1
# Length: 1024
# Train Size, Test Size 214 1896

#yes
# Dataset: RacketSports
# Number of Classes: 4
# Dimension of path: 6
# Length: 30
# Train Size, Test Size 151 152

#yes
# Dataset: SelfRegulationSCP1
# Number of Classes: 2
# Dimension of path: 6
# Length: 896
# Train Size, Test Size 268 293

# Dataset: SelfRegulationSCP2
# Number of Classes: 2
# Dimension of path: 7
# Length: 1152
# Train Size, Test Size 200 180

# Dataset: SpokenArabicDigits
# No dataset found

#long, also very small set
# Dataset: StandWalkJump
# Number of Classes: 3
# Dimension of path: 4
# Length: 2500
# Train Size, Test Size 12 15

#yes
# Dataset: UWaveGestureLibrary
# Number of Classes: 8
# Dimension of path: 3
# Length: 315
# Train Size, Test Size 120 320


# (tslearn) Cross Validation on Train

In [2]:
from cross_validation import cv_tslearn

# Run cross-validation (per the section heading: on the training data) for the
# selected datasets and kernels. Entries that are commented out are options
# switched off for this run; uncomment a line to include it.
# NOTE(review): k / n_repeats / n_jobs_repeats are presumably the number of
# folds, the number of CV repetitions and the number of parallel workers --
# confirm against cross_validation.cv_tslearn.
cv_best_models = cv_tslearn(
    dataset_names=[
        # 'ArticularyWordRecognition',
        # 'BasicMotions',
        # 'Cricket',
        # 'ERing',  # can't find dataset
        'Libras',
        # 'NATOPS',
        # 'RacketSports',
        # 'FingerMovements',
        # 'Heartbeat',
        # 'SelfRegulationSCP1',
        # 'UWaveGestureLibrary',
    ],
    kernel_names=[
        # "linear",
        "rbf",
        # "poly",
        # "gak",
        # "truncated sig",
        # "truncated sig rbf",
        # "truncated sig poly",
        # "signature pde",
        # "signature pde rbf",
        # "signature pde poly",
        # "integral linear",
        # "integral rbf",
        # "integral poly",
    ],
    k=4,
    n_repeats=5,
    n_jobs_repeats=4,
)

# Dump the raw nested result (dataset -> 'kernel_results' -> kernel -> label
# -> parameter dict, as seen in the recorded output). This is verbose.
print(cv_best_models)

Dataset: Libras


Label for rbf: 100%|██████████| 15/15 [00:23<00:00,  1.55s/it]

Time taken for kernel rbf: 23.18761444091797 seconds
Time taken for dataset Libras: 23.19777750968933 seconds



{'Libras': {'kernel_results': {'rbf': {'1': {'sigma': 1.0, 'kernel_name': 'rbf', 'normal_class_label': '1', 'threshold': 8, 'CV_train_auc': 0.8432539682539686, 'auc_params': array([0.83690476, 0.83650794, 0.80039683, 0.76269841, 0.82579365,
       0.84325397]), 'auc_thresh': array([0.61904762, 0.79801587, 0.82579365, 0.7734127 , 0.73373016,
       0.79365079, 0.79920635, 0.84325397, 0.83650794])}, '10': {'sigma': 1.0, 'kernel_name': 'rbf', 'normal_class_label': '10', 'threshold': 6, 'CV_train_auc': 0.869047619047619, 'auc_params': array([0.86785714, 0.86388889, 0.8484127 , 0.79880952, 0.84960317,
       0.86904762]), 'auc_thresh': array([0.75039683, 0.83134921, 0.80039683, 0.84444444, 0.84960317,
       0.86904762, 0.85277778, 0.86785714, 0.83134921])}, '11': {'sigma': 0.006737946999085467, 'kernel_name': 'rbf', 'normal_class_label': '11', 'threshold': 3, 'CV_train_auc': 0.9




In [None]:
def average_labels(
        labelwise_dict: Dict[str, Dict[str, Any]],
        field: str,
        ):
    """Average one field across all labels.

    Args:
        labelwise_dict: Maps each label to its parameter dict.
        field: Key looked up in every parameter dict.

    Returns:
        The result of ``np.mean(..., axis=0)`` over the collected values, so
        array-valued fields are averaged element-wise across labels.

    Raises:
        KeyError: If ``field`` is absent from any parameter dict.
    """
    per_label_values = []
    for param_dict in labelwise_dict.values():
        per_label_values.append(param_dict[field])
    # axis=0 collapses the label axis only.
    return np.mean(per_label_values, axis=0)


def print_cv_tslearn_results(
        dataset_kernel_label_paramdict : Dict[str, Dict[str, Dict[str, Any]]],
        ):
    """Print a per-dataset, per-kernel summary of cross-validation AUCs.

    For every dataset this prints the dataset name and its statistics (the
    test-set size is passed as the literal "unknown"). Then, for each kernel,
    it prints the label-averaged "CV_train_auc", "auc_params" and
    "auc_thresh" entries. Kernels whose name contains "truncated sig" also
    get the label-averaged "auc_truncs" entry.

    Args:
        dataset_kernel_label_paramdict: Maps dataset name to a dict holding
            "kernel_results" (kernel name -> label -> parameter dict) plus
            the keys "num_classes", "ts_length", "N_train" and "path dim".

    Returns:
        None; everything is written to stdout.
    """
    # (printed heading, parameter-dict key) pairs reported for every kernel.
    reported_fields = (
        ("final_auc_avgs", "CV_train_auc"),
        ("params_auc_avgs", "auc_params"),
        ("thresh_auc_avgs", "auc_thresh"),
    )

    for dataset_name, dataset_entry in dataset_kernel_label_paramdict.items():
        print(dataset_name)
        per_kernel = dataset_entry["kernel_results"]
        class_count = dataset_entry['num_classes']
        series_length = dataset_entry['ts_length']
        train_size = dataset_entry['N_train']
        path_dimension = dataset_entry['path dim']

        from experiment_code import print_dataset_stats
        print_dataset_stats(class_count, path_dimension, series_length, train_size, "unknown")

        for kernel_name, labelwise_dict in per_kernel.items():
            # Compute every average before printing anything for this kernel.
            averaged = [
                (heading, average_labels(labelwise_dict, key))
                for heading, key in reported_fields
            ]
            print(f"\n{kernel_name}")
            for heading, value in averaged:
                print(heading, value)
            if "truncated sig" in kernel_name:
                print("trunc_auc_avgs", average_labels(labelwise_dict, "auc_truncs"))
        print("\nEnd dataset \n\n\n")

# Print the label-averaged summary of the results held in `cv_best_models`.
print_cv_tslearn_results(cv_best_models)

# (tslearn) Validate on Test

In [21]:
from experiment_code import validate_tslearn

# Hand the cross-validation results to the validation step (per the section
# heading: evaluation on the test data). As the last expression in the cell,
# the return value is displayed; the recorded output shows one entry per
# dataset with a 'results' dict keyed by kernel name plus dataset statistics.
# NOTE(review): the meaning of the rows/columns of the per-kernel array is not
# visible from this notebook -- confirm in experiment_code.validate_tslearn.
validate_tslearn(cv_best_models)

Libras
Number of Classes: 15
Dimension of path: 2
Length: 45
Train: 180
Test: 180
Covariance operator numerical rank = 11
Covariance operator numerical rank = 11
Covariance operator numerical rank = 9
Covariance operator numerical rank = 11
Covariance operator numerical rank = 10
Covariance operator numerical rank = 11
Covariance operator numerical rank = 9
Covariance operator numerical rank = 11
Covariance operator numerical rank = 11
Covariance operator numerical rank = 10
Covariance operator numerical rank = 11
Covariance operator numerical rank = 11
Covariance operator numerical rank = 11
Covariance operator numerical rank = 11
Covariance operator numerical rank = 11


{'Libras': {'results': {'rbf': array([[0.93875661, 0.99490023],
          [0.7223545 , 0.95405502]])},
  'num_classes': 15,
  'path dim': 2,
  'ts_length': 45,
  'N_train': 180,
  'N_test': 180}}

# Print CV results

Libras
Number of Classes: 15
Dimension of path: 2
Length: 45
Train: 180
Test: unknown

rbf
final_auc_avgs 0.9479100529100528
params_auc_avgs [0.93777778 0.93301587 0.91843915 0.8876455  0.87642857 0.89793651]
thresh_auc_avgs [0.7357672  0.83878307 0.87706349 0.89272487 0.90283069 0.91624339
 0.92103175 0.92666667 0.89433862]

End dataset 



