In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics
from typing import List, Optional, Dict, Set, Callable, Any
from joblib import Memory, Parallel, delayed
import tslearn
import tslearn.metrics
from tslearn.datasets import UCR_UEA_datasets
from scipy.interpolate import interp1d
from numba import njit

from cross_validation import cv_tslearn, print_cv_results
from eval_on_test import validate_tslearn, print_test_results
from utils import load_from_pickle, save_to_pickle

# Enumerate all multivariate UCR/UEA datasets available through tslearn

In [None]:
# _datasets = [
#             'ArticularyWordRecognition', 
#             'BasicMotions', 
#             'Cricket',
#             #'ERing',
#             'Libras', 
#             'NATOPS', 
#             'RacketSports',     
#             'FingerMovements',
#             'Heartbeat',
#             'SelfRegulationSCP1', 
#             'UWaveGestureLibrary'
#             ]

# import tslearn
# UCR_UEA_datasets = tslearn.datasets.UCR_UEA_datasets()

# for dataset_name in UCR_UEA_datasets.list_multivariate_datasets():
# #for dataset_name in _datasets:
#     print("Dataset:", dataset_name)
#     dataset = UCR_UEA_datasets.load_dataset(dataset_name)
#     if dataset[0] is not None:
#         X_train, y_train, X_test, y_test = dataset
#         num_classes = len(np.unique(y_train))
#         N_train, T, d = X_train.shape
#         N_test, _, _  = X_test.shape
        
#         print("Number of Classes:", num_classes)
#         print("Dimension of path:", d)
#         print("Length:", T)
#         print("Train Size, Test Size", N_train, N_test)
#         print()
#     else:
#         print("No dataset found")
#         print()

#yes
# Dataset: ArticularyWordRecognition
# Number of Classes: 25
# Dimension of path: 9
# Length: 144
# Train Size, Test Size 275 300

# Dataset: AtrialFibrillation
# No dataset found

#yes
# Dataset: BasicMotions
# Number of Classes: 4
# Dimension of path: 6
# Length: 100
# Train Size, Test Size 40 40

# Dataset: CharacterTrajectories
# No dataset found

#yes
# Dataset: Cricket
# Number of Classes: 12
# Dimension of path: 6
# Length: 1197
# Train Size, Test Size 108 72

# Dataset: DuckDuckGeese
# No dataset found

# Dataset: EigenWorms
# Number of Classes: 5
# Dimension of path: 6
# Length: 17984
# Train Size, Test Size 128 131

#why not
# Dataset: Epilepsy
# Number of Classes: 4
# Dimension of path: 3
# Length: 206
# Train Size, Test Size 137 138

#longLength
# Dataset: EthanolConcentration
# Number of Classes: 4
# Dimension of path: 3
# Length: 1751
# Train Size, Test Size 261 263

# Dataset: ERing
# No dataset found

#big
# Dataset: FaceDetection
# Number of Classes: 2
# Dimension of path: 144
# Length: 62
# Train Size, Test Size 5890 3524

#yes
# Dataset: FingerMovements
# Number of Classes: 2
# Dimension of path: 28
# Length: 50
# Train Size, Test Size 316 100

#why not, maybe big length
# Dataset: HandMovementDirection
# Number of Classes: 4
# Dimension of path: 10
# Length: 400
# Train Size, Test Size 160 74

#smallTrain
# Dataset: Handwriting
# Number of Classes: 26
# Dimension of path: 3
# Length: 152
# Train Size, Test Size 150 850

#yes
# Dataset: Heartbeat
# Number of Classes: 2
# Dimension of path: 61
# Length: 405
# Train Size, Test Size 204 205

#big
# Dataset: InsectWingbeat
# Number of Classes: 10
# Dimension of path: 200
# Length: 22
# Train Size, Test Size 25000 25000

# Dataset: JapaneseVowels
# No dataset found

#yes
# Dataset: Libras
# Number of Classes: 15
# Dimension of path: 2
# Length: 45
# Train Size, Test Size 180 180

#TODO I SHOULD INCLUDE
# Dataset: LSST
# Number of Classes: 14
# Dimension of path: 6
# Length: 36
# Train Size, Test Size 2459 2466

#length
# Dataset: MotorImagery
# Number of Classes: 2
# Dimension of path: 64
# Length: 3000
# Train Size, Test Size 278 100

#yes
# Dataset: NATOPS
# Number of Classes: 6
# Dimension of path: 24
# Length: 51
# Train Size, Test Size 180 180

#yes
# Dataset: PenDigits
# Number of Classes: 10
# Dimension of path: 2
# Length: 8
# Train Size, Test Size 7494 3498

#TODO SHOULD INCLUDE highDim
# Dataset: PEMS-SF
# Number of Classes: 7
# Dimension of path: 963
# Length: 144
# Train Size, Test Size 267 173

#NO, dim=1, big length, large num classes
# Dataset: Phoneme
# Number of Classes: 39
# Dimension of path: 1
# Length: 1024
# Train Size, Test Size 214 1896

#yes
# Dataset: RacketSports
# Number of Classes: 4
# Dimension of path: 6
# Length: 30
# Train Size, Test Size 151 152

#yes
# Dataset: SelfRegulationSCP1
# Number of Classes: 2
# Dimension of path: 6
# Length: 896
# Train Size, Test Size 268 293

# Dataset: SelfRegulationSCP2
# Number of Classes: 2
# Dimension of path: 7
# Length: 1152
# Train Size, Test Size 200 180

# Dataset: SpokenArabicDigits
# No dataset found

#NO, long, also very small set
# Dataset: StandWalkJump
# Number of Classes: 3
# Dimension of path: 4
# Length: 2500
# Train Size, Test Size 12 15

#yes
# Dataset: UWaveGestureLibrary
# Number of Classes: 8
# Dimension of path: 3
# Length: 315
# Train Size, Test Size 120 320


# Cross Validation on Train

In [2]:
# Run cross-validation (cv_tslearn, imported from cross_validation) for the
# datasets and kernels left uncommented below. Commented-out entries are kept
# as a menu of alternatives to toggle on.
#
# NOTE(review): the output recorded under this cell for the current
# configuration (PenDigits, "linear") reports `min std: 0.0`, shows warnings on
# the `(train - mean) / std` lines, and then ends with
# `LinAlgError: SVD did not converge`, so the assignment to cv_best_models
# presumably never completed in that run. Looks like a division by a zero
# standard deviation during normalisation -- confirm inside cross_validation.
cv_best_models = cv_tslearn(
    dataset_names = [
        #'ArticularyWordRecognition', 
        #'BasicMotions', 
        #'Cricket',
             #########'ERing', #can't find dataset
        #'Libras', 
        #'NATOPS', 
        #'RacketSports',     
        #'FingerMovements',
        #'Heartbeat',
        #'SelfRegulationSCP1', 
        #'UWaveGestureLibrary',
        'PenDigits',
        #'LSST',
        #'EthanolConcentration',
        ],
    kernel_names = [
        "linear",
        #"rbf",
        #"poly",
        #"gak",
        #"truncated sig",
        #"truncated sig rbf",
        #"signature pde rbf",
        #"integral linear",
        #"integral rbf",
        #"integral poly",
        ],
        # presumably the number of CV folds and repetitions -- TODO confirm
        # against cv_tslearn's signature
        k=5,
        n_repeats=1,
        # presumably worker counts for the repeats and for Gram-matrix
        # computation -- TODO confirm against cv_tslearn's signature
        n_jobs_repeats=1,
        n_jobs_gram=1,
        verbose=False,
        )

Dataset: PenDigits
Number of Classes: 10
Dimension of path: 2
Length: 8
Train: 7494
Test: N/A


Label for linear:   0%|          | 0/10 [00:00<?, ?it/s]

min std: 11.470955930690362 max std: 27.137755545206648
min std: 11.819009126053183 max std: 26.85940196098031
min std: 11.436556767816313 max std: 26.97159376212377
min std: 11.84980067908839 max std: 26.41815986538572
min std: 11.838578159396002 max std: 26.702939880735283


Label for linear:  10%|█         | 1/10 [00:02<00:23,  2.57s/it]

min std: 3.101986693181394 max std: 36.60268902320962
min std: 2.9157549685067816 max std: 36.84233336853625
min std: 2.9463782294493064 max std: 36.02928755209803
min std: 3.203352090567901 max std: 36.029251153367724
min std: 3.1900655677836345 max std: 36.165512160151266


Label for linear:  20%|██        | 2/10 [00:05<00:20,  2.55s/it]

min std: 2.4762431380814514 max std: 19.38517811545415
min std: 2.453294299833928 max std: 19.92814223540346
min std: 2.303967792117276 max std: 19.810825552431407
min std: 2.4096335247712424 max std: 19.68122584558303
min std: 1.5658584878833264 max std: 20.165494763983986


Label for linear:  30%|███       | 3/10 [00:07<00:17,  2.53s/it]

min std: 1.5833863354641486 max std: 19.7081368627563
min std: 1.3616780484940139 max std: 19.824049780921325
min std: 1.387658265978323 max std: 19.638820857884813
min std: 1.5894263092193213 max std: 19.57766823152702
min std: 1.4708529648647857 max std: 19.818816356453617


  train = (train - mean) / std
  test = (test - mean) / std
  test = (test - mean) / std
Label for linear:  40%|████      | 4/10 [00:10<00:16,  2.68s/it]


min std: 0.0 max std: 30.353891096476314
std: [[[23.78657237  2.2577478 ]
  [17.07604237  9.51937915]
  [10.18737067 10.89957558]
  [30.3538911  10.83667781]
  [19.29319568 18.14079283]
  [13.64034775 15.48424339]
  [17.34263448  7.24088828]
  [25.89026894  0.        ]]]


LinAlgError: SVD did not converge

##### Print CV results

In [None]:
# Needs cv_best_models from the cross-validation cell above; the name only
# exists if that cell ran to completion.
print_cv_results(cv_best_models)

# Validate on Test

In [None]:
# Hands the cross-validation output to validate_tslearn (imported from
# eval_on_test); needs cv_best_models from the cross-validation cell above.
test_results = validate_tslearn(cv_best_models, n_jobs=1, verbose=False)

##### Print test results

In [None]:
# Needs test_results from the validation cell above.
print_test_results(test_results)

# Read CV data from file and print results

In [None]:
def read_dicts_from_pickle(paths:List[str]) -> Dict:
    """Load one mapping per path and merge them into a single dict.

    Each entry of ``paths`` is handed to ``load_from_pickle`` (imported from
    ``utils``); whatever it returns is folded into the result with
    ``dict.update``, in the order the paths are given. When the same key
    occurs in more than one file, the value from the later path wins.

    Args:
        paths: File paths to read, in merge order. An empty list yields an
            empty dict.

    Returns:
        A new dict containing the union of all loaded entries.

    NOTE(review): load_from_pickle presumably unpickles the file -- only
    point this at files from a trusted source.
    """
    merged: Dict = {}
    for pickle_path in paths:
        merged.update(load_from_pickle(pickle_path))
    return merged

# Load the cross validation results
# The listed pickle files are merged into one dict by read_dicts_from_pickle;
# if two files share a key, the entry from the file listed later replaces the
# earlier one. Paths are relative, so they resolve against the notebook's
# working directory.
cv_results = read_dicts_from_pickle(
    [
    "../Data/cv_results_Heart.pkl", 
    "../Data/cv_results_UWave_BM.pkl",
    ])
print_cv_results(cv_results)

In [None]:
test_results = read_dicts_from_pickle([]