In [None]:
import time
import numpy as np
from numba import njit
from typing import Callable, List, Any, Optional
import sigkernel

import numpy as np
import pandas as pd
import sklearn.metrics
from typing import List, Optional, Dict, Set, Callable, Any
from joblib import Memory, Parallel, delayed
import tslearn
import tslearn.metrics
from tslearn.datasets import UCR_UEA_datasets
from scipy.interpolate import interp1d
from numba import njit
import numba as nb
import pickle

from experiments.experiment_code import print_dataset_stats, run_all_kernels

from models.kernels import linear_kernel_gram, pairwise_kernel_gram
from models.normalize_streams import normalize_streams

# Test variability of sig between datasets

$  \|x\|_{TV} = \sum_{i=1}^L \|x_{i}- x_{i-1}\|_{R^d} $

$ \|y\|_{R^d} = \sqrt{\sum_{k=1}^d y_k^2} $

In [11]:
from experiments.experiment_code import normalize_streams, calc_grams



# def do_trunc_sig_gram(train, test, factor:float = 1.0):
#     ORDER = 10
#     ker = lambda X, Y: linear_kernel_gram(X, Y, param_dict["sigma"], custom_factor=factor) #TODO assumes fixed length
#     return case_truncated_sig(train, test, ORDER, 
#                                 linear_kernel_gram, sig_kernel_only_last, 
#                                 n_jobs, verbose)



def total_variation(X:np.ndarray, 
                    channelwise:bool = False,
                    mean:bool = True,
                    ):
    """Calculates the total variation of time series.
    X has shape (..., T, d)"""
    #Total Variation
    diffs = np.diff(X, axis=-2)
    if channelwise:
        TV = np.abs(diffs).sum(axis=-2) #shape (..., d)
    else:
        TV = np.linalg.norm(diffs, axis=-1).sum(axis=-1) #shape (...,)
    
    # Average
    if mean:
        ndim = TV.ndim - int(channelwise)
        if ndim > 0:
            TV = np.mean(TV, axis=tuple(range(ndim)))

    return TV


def mean_distance_between_times(X:np.ndarray, 
                                n_samples_N:int = 100,
                                n_samples_T:int = 100):
    """ X shape (N, T, d)"""

    #Sample at timesteps and instances
    N, T, d = X.shape
    n_samples_N = min(n_samples_N, N)
    n_samples_T = min(n_samples_T, T)
    choice_N = np.random.choice(N, size=n_samples_N, replace=False)
    choice_T = np.random.choice(T, size=n_samples_T, replace=False)
    X = X[choice_N][:, choice_T] #shape (n_samples_N, n_samples_T, d)

    # #out: (N, T, T))
    # new = X.transpose(1, 0, 2)
    # xx = linear_kernel_gram(new, new, diag=True, divide_by_dims=False)
    # xy = linear_kernel_gram(new, new, diag=False, divide_by_dims=False)
    # norms_squared = -2*xy + xx[:, np.newaxis] + xx[np.newaxis, :]
    # return np.mean(np.sqrt(norms_squared))

    #out: (N, T, T)) TAKE MAXIMUM ALONG T's
    new = X.transpose(1, 0, 2)
    xx = linear_kernel_gram(new, new, diag=True, divide_by_dims=False)
    xy = linear_kernel_gram(new, new, diag=False, divide_by_dims=False)
    norms_squared = -2*xy + xx[:, np.newaxis] + xx[np.newaxis, :]
    max_distances = np.max(np.sqrt(norms_squared), axis=(-1,-2))
    return np.mean(max_distances)



def test_variability(dataset_name:str):
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(dataset_name)
    print(dataset_name)
    unique_labels = np.unique(y_train)
    num_classes = len(unique_labels)
    N_train, T, d = X_train.shape

    corpus, test = normalize_streams(X_train, X_test)
    print("corpus", corpus.shape)
    s = tslearn.metrics.sigma_gak(dataset=corpus,
          n_samples=100,
          random_state=0)
 
    ### calculate the kernel
    choice = np.random.choice(N_train, size=20)
    choice_test = np.random.choice(len(X_test), size=8)
    TRAIN = np.array([corpus[i] for i in choice])
    TEST = np.array([test[i] for i in choice_test])
    # param_dict = {"kernel_name": "gak",
    #                 "gak_factor" : 1}
    param_dict = {"kernel_name": "truncated sig",
                    "order" : 5}
    # param_dict = {"kernel_name": "signature pde",
    #                 "dyadic_order" : 3}
    vv, uv = calc_grams(TRAIN, TEST, param_dict, sig_kernel_only_last=False, n_jobs=4, verbose=False)
    print(uv.shape)
    abs = np.mean(np.abs(uv), axis=(-1,-2))
    print("abs", abs)
    print("\n")
    pass

def med_time_dist(corpus:np.ndarray, 
                n_samples_N:int = 100,
                n_samples_T:int = 70):
    """ corpus shape (N, T, d)"""  #for flattened kernels, first reshape to (N, 1, T*d)

    #Sample at timesteps and instances
    N, T, d = corpus.shape
    n_samples_N = min(n_samples_N, N)
    n_samples_T = min(n_samples_T, T)
    choice_N = np.random.choice(N, size=n_samples_N, replace=False)
    choice_T = np.random.choice(T, size=n_samples_T, replace=False)
    X = corpus[choice_N][:, choice_T] #shape (n_samples_N, n_samples_T, d)

    # calculate ||x_i - x_j||^2 for all i,j
    X = X.reshape(-1, d)
    xx = linear_kernel_gram(X, X, diag=True, divide_by_dims=True)  #shape (N1, ...)
    xy = linear_kernel_gram(X, X, diag=False, divide_by_dims=True) #shape (N1, N2, ...)
    yy = xx  #shape (N2, ...)
    norms_squared = -2*xy + xx[:, np.newaxis] + yy[np.newaxis, :] 
    return np.median(norms_squared), np.mean(norms_squared)


def test_var(dataset_name:str):
    print(dataset_name)
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(dataset_name)
    for label in np.unique(y_train):
        X = X_train[y_train == label]
        corpus, test = normalize_streams(X, X_test)
    med, mean = med_time_dist(X_train) # need to do separately for each corpus
    print("med", med, "mean", mean)
    pass

for dataset_name in [
        'Epilepsy',                    # N_corpus = 34
        'EthanolConcentration',        # N_corpus = 65
        'FingerMovements',             # N_corpus = 158
        'HandMovementDirection',       # N_corpus = 40
        'Heartbeat',                   # N_corpus = 102
        'LSST',                        # N_corpus = 176
        'MotorImagery',                # N_corpus = 139
        'NATOPS',                      # N_corpus = 30
        'PenDigits',                   # N_corpus = 749
        'PEMS-SF',                     # N_corpus = 38
        'PhonemeSpectra',              # N_corpus = 85
        'RacketSports',                # N_corpus = 38
        'SelfRegulationSCP1',          # N_corpus = 134
        ]:
    test_var(dataset_name)

med 1.0595666666666665 mean 1.496789435131975
med 86804647.75481951 mean 254880023.8247414
med 1876.9955357142849 mean 2769.522326863764
med 38043.00582078421 mean 55708.49103754038
med 0.026847782270745905 mean 0.5367304634136321
med 1315.1765663387378 mean 3025866.949560729


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f663e7ffe90>>
Traceback (most recent call last):
  File "/home/nikita/Code/kernel-timeseries-anomaly-detection/.conda/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


In [None]:
# ArticularyWordRecognition T=144, d=9
# (10, 8, 9)
# abs [1.29565776 2.27435506 2.87972308 4.07389224 5.01688708 6.38028238
#  7.11759463 7.86446547 8.08651308 8.29304982]


# BasicMotions T=100, d=6
# (10, 8, 9)
# abs [1.04759194e+00 7.15178976e+01 1.40339471e+02 4.10876657e+03
#  1.92255294e+04 2.50874495e+05 1.49520768e+06 1.27200498e+07
#  7.06710438e+07 4.39333097e+08]


# Libras T=45, d=2
# (10, 8, 9)
# abs [1.89531427 3.83885875 4.15721433 6.99827019 7.40133744 8.93425362
#  9.22978483 9.44218473 9.4780136  9.49202141]


# NATOPS T=51, d=24
# (10, 8, 9)
# abs [ 1.01604486  1.53247836  2.82986402  4.49605307  7.90352287 12.42609689
#  18.56948677 24.6321559  29.69169966 33.1874835 ]


# RacketSports T=30, d=6
# (10, 8, 9)
# abs [ 1.16522057  4.40396722  9.04935699 15.67076359 30.32986175 39.15169919
#  61.60201888 72.30843763 90.79824697 99.62780377]


# FingerMovements T=50, d=28
# (10, 8, 9)
# abs [0.97984797 1.04719032 1.04531916 1.04813437 1.04803513 1.04810413
#  1.04810097 1.048102   1.04810194 1.04810195]


# Heartbeat T=405, d=61
# (10, 8, 9)
# abs [1.08550586e+00 7.23266419e+02 9.05013282e+04 5.82884262e+06
#  8.83717077e+08 6.35578512e+10 2.92053048e+12 8.94131107e+13
#  4.99193194e+15 3.01476687e+17]


# UWaveGestureLibrary T=315, d=3
# (10, 8, 9)
# abs [  1.20473682   9.60296452  14.84277035  39.26442477  67.65798534
#  121.57540175 170.10270738 231.90516802 276.56580554 318.60532653]

$          <x, x>  + <y, y> - <x, y>  - <y, x>  =  <x-y, x> + <x-y, y> = <x-y, x-y>      $

# Plot datasets

In [None]:
from experiments.experiment_code import normalize_streams
import plotly.express as px

def plot_dataset(dataset_name:str):
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(dataset_name)
    print(dataset_name)
    unique_labels = np.unique(y_train)
    num_classes = len(unique_labels)
    N_train, T, d = X_train.shape

    corpus, test = normalize_streams(X_train, X_test)

    choice = np.random.choice(N_train, size=9)
    TRAIN = np.array([corpus[i] for i in choice])
    fig = px.line(TRAIN[0])
    fig.show()



for dataset_name in [
        'ArticularyWordRecognition', 
        'BasicMotions', 
         ###'Cricket',             # fuck cricket, too big and n_samples=10...
         ##########'ERing', #cant find dataset
        'Libras', 
        'NATOPS', 
        'RacketSports',     
        'FingerMovements',      # estimates a bit low, 10e-3
        'Heartbeat',
        'SelfRegulationSCP1',   # CAN RESAMPLE 2x or even 3x, 4x
        'UWaveGestureLibrary',
        "PenDigits",
        ""
        ]:
    plot_dataset(dataset_name)

In [6]:
from models.normalize_streams import normalize_streams

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("UWaveGestureLibrary")
import plotly.express as px
print(X_train.shape)
idx = 5
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [7]:
import plotly.express as px
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("SelfRegulationSCP1")
import plotly.express as px
print(X_train.shape)
idx = 2
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

(268, 896, 6)


(268, 69, 7)


In [8]:
import plotly.express as px
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("SelfRegulationSCP2")
import plotly.express as px
print(X_train.shape)
idx = 3
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

(200, 1152, 7)


(200, 68, 8)


In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("Heartbeat")
import plotly.express as px
print(X_train.shape)
idx = 2
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("BasicMotions")
import plotly.express as px
print(X_train.shape)
idx = 8
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()


In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("Libras")
import plotly.express as px
print(X_train.shape)
idx = 2
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("EthanolConcentration")
import plotly.express as px
print(X_train.shape)
idx = 200
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("HandMovementDirection")
import plotly.express as px
print(X_train.shape)
idx = 20
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("LSST")
import plotly.express as px
print(X_train.shape)
idx = 28
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("PenDigits")
import plotly.express as px
print(X_train.shape)
idx = 7
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("MotorImagery")
import plotly.express as px
print(X_train.shape)
idx = 7
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset("Epilepsy")
import plotly.express as px
print(X_train.shape)
idx = 10
fig = px.line(X_train[idx])
fig.show()
X_train, X_test = normalize_streams(X_train, X_test)
print(X_train.shape)
fig = px.line(X_train[idx])
fig.show()

In [None]:
from experiments.experiment_code import normalize_streams
import plotly.express as px
from tslearn.datasets import UCR_UEA_datasets

In [None]:
# Epilepsy 34
# EthanolConcentration 65
# FaceDetection 2945
# FingerMovements 158
# HandMovementDirection 40
# Heartbeat 102
# LSST 176
# MotorImagery 139
# NATOPS 30
# PenDigits 749
# PEMS-SF 38
# PhonemeSpectra 85
# RacketSports 38
# SelfRegulationSCP1 134

for dataset_name in [
    "Epilepsy",
    "EthanolConcentration",
    "FaceDetection",
    "FingerMovements",
    "HandMovementDirection",
    "Heartbeat",
    "LSST",
    "MotorImagery",   #NO --- 3000 length too big, too oscillatory
    "NATOPS",
    "PenDigits",
    "PEMS-SF",
    "PhonemeSpectra",
    "RacketSports",
    "SelfRegulationSCP1",
]:
    X_train, y_train, X_test, y_test = UCR_UEA_datasets().load_dataset(dataset_name)
    print(dataset_name)
    print(X_train.shape)
    idx=0
    px.line(X_train[idx]).show()
    X_train, X_test = normalize_streams(X_train, X_test)
    print(X_train.shape)
    px.line(X_train[idx]).show()
