In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import jax
import jax.numpy as jnp
import jax.lax as lax
from jaxtyping import Array, Float, Int, PRNGKeyArray
import aeon
import pandas as pd
from preprocessing.timeseries_augmentation import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from sklearn.metrics import accuracy_score, f1_score
from aeon.datasets.tsc_datasets import multivariate_equal_length
from aeon.datasets import load_regression, load_classification
from sklearn.linear_model import RidgeClassifierCV

from features.sig import SigTransform, LogSigTransform
from features.base import TimeseriesFeatureTransformer, TabularTimeseriesFeatures, RandomGuesser
from features.sig_neural import RandomizedSignature, TimeInhomogenousRandomizedSignature
from features.SWIM_controlled_resnet import SampledControlledResNet
from features.rocket_wrappers import RocketWrapper
from utils.utils import print_name, print_shape

# Pin JAX to the CPU backend for every computation in this notebook.
jax.config.update('jax_platform_name', 'cpu') # Used to set the platform (cpu, gpu, etc.)
# Compact array printing: 3 decimals, and arrays with more than 10 elements
# are summarized with "..." instead of being printed in full.
np.set_printoptions(precision=3, threshold=10) # Print options

2024-09-23 16:01:23.471521: W external/xla/xla/service/gpu/nvptx_compiler.cc:718] The NVIDIA driver's CUDA version is 12.4 which is older than the ptxas CUDA version (12.5.82). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


# Download dataset code

In [2]:
def get_aeon_dataset(
        dataset_name:str,
        normalize:bool = True,
        TSER_or_MTSC:str = "MTSC",
        extract_path = "/home/nikita/hdd/Data/",
        max_T:int = 1000,
        ):
    """Loads a dataset from the UCR/UEA archive using
    the aeon library.

    Args:
        dataset_name (str): Name of the dataset
        normalize (bool): Whether to apply 'normalize_streams' (fitted on
            the train set data) followed by clipping to [-5, 5].
        TSER_or_MTSC (str): Whether to load a "TSER" (regression) or
            "MTSC" (classification) dataset.
        extract_path (str): Path to the directory where the datasets are stored.
            Note that the datasets are stored in a subdirectory called
            "TSER" or "MTSC". A trailing separator is optional.
        max_T (int): Maximum length of the time series, forwarded to
            'normalize_streams'. Only used when 'normalize' is True.

    Returns:
        Tuple: 4-tuple of the form (X_train, y_train, X_test, y_test).
            The X arrays have their last two axes swapped relative to what
            aeon returns (presumably (N, D, T) -> (N, T, D); TODO confirm).
            For "MTSC", labels are integer ids that mean the same class in
            both splits.

    Raises:
        ValueError: If TSER_or_MTSC is neither "TSER" nor "MTSC".
    """
    if TSER_or_MTSC == "TSER":
        load_fun = load_regression
    elif TSER_or_MTSC == "MTSC":
        load_fun = load_classification
    else:
        raise ValueError("TSER_or_MTSC must be either 'TSER' or 'MTSC'")

    #download
    # os.path.join tolerates a missing trailing separator on extract_path;
    # the final "" keeps the trailing separator the original string had.
    data_dir = os.path.join(extract_path, TSER_or_MTSC, "")
    X_train, y_train = load_fun(dataset_name, split="train", extract_path=data_dir)
    X_test, y_test = load_fun(dataset_name, split="test", extract_path=data_dir)

    if TSER_or_MTSC == "MTSC":
        # y is a list of string labels. Number them from 0 to n_labels using
        # ONE shared mapping: encoding each split on its own would give the
        # same class different ids whenever the label sets differ.
        classes, y_train = np.unique(y_train, return_inverse=True)
        y_test = np.asarray(y_test)
        # Labels seen only in the test split get ids after the train classes,
        # so the train encoding stays contiguous in 0..n_train_classes-1.
        unseen = np.setdiff1d(y_test, classes)
        classes = np.concatenate([classes, unseen])
        order = np.argsort(classes, kind="stable")
        y_test = order[np.searchsorted(classes, y_test, sorter=order)]

    #to jax array NOTE hardcoded no gradients
    X_train = lax.stop_gradient(jnp.array(X_train).transpose(0,2,1))
    X_test = lax.stop_gradient(jnp.array(X_test).transpose(0,2,1))
    y_train = lax.stop_gradient(jnp.array(y_train))
    y_test = lax.stop_gradient(jnp.array(y_test))

    #normalize
    if normalize:
        X_train, X_test = normalize_streams(X_train, X_test, max_T)
        # Clip outliers after normalization.
        c = 5.0
        X_train = X_train.clip(-c, c)
        X_test = X_test.clip(-c, c)

    return X_train, y_train, X_test, y_test


# df = []
# for dataset_name in list(multivariate_equal_length):
#     print(dataset_name)
#     X_train, y_train, X_test, y_test = get_aeon_dataset(dataset_name)

#     N1, T, D = X_train.shape
#     N2, T, D = X_test.shape
#     metadata_row = {
#                     "Dataset": dataset_name,
#                     "N_train": N1,
#                     "N_test": N2,
#                     "T": T,
#                     "D": D,
#                     "y_train_shape": y_train.shape,
#                     "n_labels": len(np.unique(y_train)),}
#     df.append(metadata_row)
# df = pd.DataFrame(df).set_index("Dataset")
# df.to_pickle("MTSC_dataset_metadata.pkl")

In [3]:
# Per-dataset metadata (N_train, N_test, T, D, label shape, n_labels), cached
# by the commented-out cell above via df.to_pickle("MTSC_dataset_metadata.pkl").
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_meta = pd.read_pickle("MTSC_dataset_metadata.pkl")
df_meta.head()

Unnamed: 0_level_0,N_train,N_test,T,D,y_train_shape,n_labels
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PhonemeSpectra,3315,3353,217,11,"(3315, 1)",39
UWaveGestureLibrary,120,320,315,3,"(120, 1)",8
AtrialFibrillation,15,15,640,2,"(15, 1)",3
FaceDetection,5890,3524,62,144,"(5890, 1)",2
MotorImagery,278,100,1000,64,"(278, 1)",2


In [4]:
# Large datasets: at least 2000 training samples, ordered by training-set size.
# The metadata frame gets its own name so that 'large_datasets' is only ever
# a list of dataset names.
large_meta = df_meta.query("N_train >= 2000").sort_values("N_train")
print(large_meta)
large_datasets = list(large_meta.index)

                N_train  N_test    T    D y_train_shape  n_labels
Dataset                                                          
LSST               2459    2466   36    6     (2459, 1)        14
PhonemeSpectra     3315    3353  217   11     (3315, 1)        39
FaceDetection      5890    3524   62  144     (5890, 1)         2
PenDigits          7494    3498    8    2     (7494, 1)        10


In [5]:
# Small datasets: 100 to 1000 training samples and fewer than 100 channels,
# ordered by training-set size. Inside DataFrame.query, '&' binds like 'and',
# so the expression reads (100 <= N_train <= 1000) and (D < 100).
# The metadata frame gets its own name so that 'small_datasets' is only ever
# a list of dataset names.
small_meta = df_meta.query("100 <= N_train <= 1000 & D < 100").sort_values("N_train")
print(small_meta)
small_datasets = list(small_meta.index)

                           N_train  N_test     T   D y_train_shape  n_labels
Dataset                                                                     
Cricket                        108      72   599   6      (108, 1)        12
UWaveGestureLibrary            120     320   315   3      (120, 1)         8
EigenWorms                     128     131  1000   6      (128, 1)         5
Epilepsy                       137     138   206   3      (137, 1)         4
Handwriting                    150     850   152   3      (150, 1)        26
RacketSports                   151     152    30   6      (151, 1)         4
HandMovementDirection          160      74   400  10      (160, 1)         4
Libras                         180     180    45   2      (180, 1)        15
NATOPS                         180     180    51  24      (180, 1)         6
SelfRegulationSCP2             200     180   576   7      (200, 1)         2
Heartbeat                      204     205   405  61      (204, 1)         2

# Experiment code

In [6]:
def run_1model_1dataset(
        model: TimeseriesFeatureTransformer,
        X_train: Float[Array, "N1  T  D"],
        y_train: Float[Array, "N1  1"],
        X_test: Float[Array, "N2  T  D"],
        y_test: Float[Array, "N2"],
        normalize_features: bool,
        apply_augment_time: bool,
        apply_basepoint: bool,
        ):
    """Fits one feature model, then a ridge classifier on its features.

    Args:
        model: Feature extractor exposing fit(X, y) and transform(X).
        X_train: Training time series.
        y_train: Training labels.
        X_test: Test time series.
        y_test: Test labels.
        normalize_features: If True, pass the extracted features through
            'normalize_mean_std_traindata' before the classifier.
        apply_augment_time: If True, apply 'augment_time' to both splits.
        apply_basepoint: If True, apply 'add_basepoint_zero' to both splits
            (done before the time augmentation).

    Returns:
        Tuple: (train_acc, test_acc, alpha, t_fit_feat, t_trans_feat, t_ridge),
            where alpha is the regularization strength selected by
            RidgeClassifierCV and the t_* entries are wall-clock seconds.
            t_trans_feat also covers the debug printing and the optional
            feature normalization.
    """
    if apply_basepoint:
        X_train = add_basepoint_zero(X_train)
        X_test = add_basepoint_zero(X_test)
    if apply_augment_time:
        X_train = augment_time(X_train)
        X_test  = augment_time(X_test)

    #fit feature model
    t0 = time.time()
    model.fit(X_train, y_train)

    #obtain features
    t1 = time.time()
    feat_train = model.transform(X_train)
    feat_test = model.transform(X_test)
    print(model)
    print_name(feat_train)

    if normalize_features:
        feat_train, feat_test = normalize_mean_std_traindata(feat_train, feat_test)

    #fit ridge classifier
    t2 = time.time()
    # Flatten the labels: with (N, 1)-shaped labels, 'pred == y' below would
    # broadcast to an (N, N) matrix and the "accuracy" would be meaningless.
    y_train = np.array(y_train).ravel()
    y_test = np.array(y_test).ravel()
    feat_train = np.array(feat_train)
    feat_test = np.array(feat_test)
    ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 30))
    ridge.fit(feat_train, y_train)

    # predict
    pred_test = ridge.predict(feat_test)
    pred_train = ridge.predict(feat_train)
    test_acc = np.mean(pred_test == y_test)
    train_acc = np.mean(pred_train == y_train)
    alpha = ridge.alpha_
    t3 = time.time()

    t_fit_feat = t1 - t0
    t_trans_feat = t2 - t1
    t_ridge = t3 - t2
    return train_acc, test_acc, alpha, t_fit_feat, t_trans_feat, t_ridge



def run_allmodels_1dataset(
        X_train: Float[Array, "N1  T  D"],
        y_train: Float[Array, "N1  1"],
        X_test: Float[Array, "N2  T  D"],
        y_test: Float[Array, "N2"],
        normalize_features: bool,
        apply_augment_time: bool,
        apply_basepoint: bool,
        n_features: int = 100,
        max_batch: int = 1000,
        seed: int = 999,
        ):
    """Evaluates every feature model in the benchmark on one dataset.

    Args:
        X_train: Training time series.
        y_train: Training labels.
        X_test: Test time series.
        y_test: Test labels.
        normalize_features: Forwarded to 'run_1model_1dataset'.
        apply_augment_time: Forwarded to 'run_1model_1dataset'.
        apply_basepoint: Forwarded to 'run_1model_1dataset'.
        n_features: Feature count handed to the random-feature models.
        max_batch: Batch-size argument handed to the feature models.
        seed: Seed of the PRNG key. The same key is passed to every
            stochastic model.

    Returns:
        Tuple: (model_names, results), where results[i] is the tuple
            returned by 'run_1model_1dataset' for model_names[i].
    """
    prng_key = jax.random.PRNGKey(seed)
    models = [
        ("Random Guesser", RandomGuesser(prng_key, n_features, max_batch)),
        ("Tabular", TabularTimeseriesFeatures(max_batch)),
        ("Sig", SigTransform(2, max_batch)),
        #("Log Sig", LogSigTransform(2, max_batch)),
        ("Randomized Signature", RandomizedSignature(
            prng_key,
            n_features,
            max_batch,
            )),
        ("Time Inhomogenous Randomized Signature", TimeInhomogenousRandomizedSignature(
            prng_key,
            n_features,
            max_batch,
            )),
        ("Sampled Controlled ResNet", SampledControlledResNet(
            prng_key,
            n_features,
            jnp.tanh,
            max_batch,
            transform_label_to_onehot = True,
            )),
        # NOTE(review): the meaning of these two positional arguments is not
        # visible here; confirm against RocketWrapper before changing them.
        ("Rocket", RocketWrapper(
            20000,
            10000000,
            )),
        ]

    model_names = [name for name, _ in models]
    results = [
        run_1model_1dataset(
            model, X_train, y_train, X_test, y_test,
            normalize_features, apply_augment_time, apply_basepoint
            )
        for _, model in models
        ]
    return model_names, results



def run_all_experiments(
        datasets: List[str],
        name_save: str,
        normalize_X: bool = True,
        normalize_features: bool = False,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        max_T = 100,
        ):
    """Runs every model on every dataset and stores the results table.

    Args:
        datasets: Names of the datasets to load with 'get_aeon_dataset'.
        name_save: Suffix of the output file "TSER_results_{name_save}.pkl".
        normalize_X: Forwarded to 'get_aeon_dataset' as its 'normalize' flag.
        normalize_features: Forwarded to 'run_allmodels_1dataset'.
        apply_augment_time: Forwarded to 'run_allmodels_1dataset'.
        apply_basepoint: Forwarded to 'run_allmodels_1dataset'.
        max_T: Maximum series length, forwarded to the loader and to
            'normalize_streams'.

    Returns:
        pd.DataFrame: One row per dataset, with columns keyed by
            (attribute, model name) and sorted; the same frame is pickled
            to disk.

    Raises:
        ValueError: If 'datasets' is empty.
    """
    datasets = list(datasets)
    if not datasets:
        # Fail before doing any work instead of crashing later in pd.concat.
        raise ValueError("run_all_experiments needs at least one dataset name")

    # Run experiments
    experiments = {}
    for dataset_name in datasets:
        t0 = time.time()
        print(dataset_name)
        X_train, y_train, X_test, y_test = get_aeon_dataset(dataset_name, normalize_X, max_T=max_T)
        # NOTE(review): this second normalization runs even when normalize_X
        # is False, and re-normalizes data the loader has already normalized
        # and clipped. Kept as-is so the saved results stay reproducible;
        # confirm whether it is intended.
        X_train, X_test = normalize_streams(X_train, X_test, max_T=max_T)
        print(X_train.shape, X_test.shape)
        results = run_allmodels_1dataset(
            X_train, y_train, X_test, y_test,
            normalize_features, apply_augment_time, apply_basepoint,
            )
        jax.clear_caches()
        experiments[dataset_name] = results
        print(dataset_name, "Elapsed time", time.time()-t0)

    # Save results
    # experiments maps dataset name -> (model_names, results), where each
    # entry of results is a tuple ordered like 'attributes' below.
    attributes = ["acc_train", "acc_test", "alpha", "t_fit_feat", "t_trans_feat", "t_ridge"]
    data_list = []
    for dataset_name, (model_names, results) in experiments.items():
        dataset_data = {}
        for attr_idx, attribute in enumerate(attributes):
            for model_idx, model_name in enumerate(model_names):
                dataset_data[(attribute, model_name)] = results[model_idx][attr_idx]
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    # The "TSER_" prefix is kept because later cells read this exact file name.
    df.to_pickle(f"TSER_results_{name_save}.pkl")
    return df

In [7]:
# Run the full benchmark on the small datasets with max_T=100; this also
# writes "TSER_results_small.pkl".
df = run_all_experiments(small_datasets, "small", max_T=100)

Cricket
(108, 100, 6) (72, 100, 6)
RandomGuesser(max_batch=1000, n_features=100,
              seed=Array([  0, 999], dtype=uint32))
(108, 100) feat_train
[[ 0.3    0.025  3.56  ...  1.641 -0.29   1.591]
 [ 1.421 -0.594  1.536 ...  0.588 -1.833  0.743]
 [-0.076 -1.144 -1.099 ... -0.504  1.089  0.821]
 ...
 [-0.313  2.361 -0.116 ... -1.376 -0.888 -0.256]
 [ 0.589  0.727  0.973 ... -1.807  0.499  0.449]
 [ 2.993  0.378 -0.269 ...  0.392  0.444 -0.793]] 

TabularTimeseriesFeatures(max_batch=1000)
(108, 606) feat_train
[[ 0.     0.     0.    ... -0.838  0.628 -0.295]
 [ 0.     0.     0.    ...  0.594 -1.081 -0.031]
 [ 0.     0.     0.    ... -0.298  0.631  0.092]
 ...
 [ 0.     0.     0.    ...  0.09   0.699 -0.405]
 [ 0.     0.     0.    ...  0.128  0.986 -0.469]
 [ 0.     0.     0.    ... -0.118  1.329 -0.491]] 

SigTransform(max_batch=1000, trunc_level=2)
(108, 42) feat_train
[[-9.203e-01 -5.993e-01 -3.394e-01 ... -5.215e-01 -5.750e+00  4.353e-02]
 [-3.035e-01  4.650e-01 -4.776e-02 ... 

In [8]:
# Reload the results table written by run_all_experiments(..., "small").
# NOTE(review): the recorded outputs below list a "MultiRocket" column while
# run_allmodels_1dataset names that model "Rocket"; the pickle may predate
# the current code. Re-run the experiment cell to confirm.
df_small = pd.read_pickle("TSER_results_small.pkl")

In [9]:
# Mean test accuracy of each model, averaged over the datasets (rows).
df_small["acc_test"].mean()

MultiRocket                               0.723939
Random Guesser                            0.281465
Randomized Signature                      0.465592
Sampled Controlled ResNet                 0.629924
Sig                                       0.545245
Tabular                                   0.654787
Time Inhomogenous Randomized Signature    0.475510
dtype: float64

In [10]:
# Average rank of each model: models are ranked within every dataset
# (rank 1 = highest test accuracy), then the ranks are averaged over datasets.
df_small["acc_test"].rank(axis=1, ascending=False).mean()

MultiRocket                               1.96875
Random Guesser                            6.62500
Randomized Signature                      5.31250
Sampled Controlled ResNet                 2.87500
Sig                                       3.87500
Tabular                                   2.50000
Time Inhomogenous Randomized Signature    4.84375
dtype: float64

In [11]:
# Per-dataset ranks of the models by test accuracy (rank 1 = best); tied
# models share the average of their ranks, hence values such as 3.5.
df_small["acc_test"].rank(axis=1, ascending=False)

Unnamed: 0,MultiRocket,Random Guesser,Randomized Signature,Sampled Controlled ResNet,Sig,Tabular,Time Inhomogenous Randomized Signature
Cricket,1.0,7.0,5.0,4.0,2.0,3.0,6.0
UWaveGestureLibrary,1.0,7.0,6.0,2.0,5.0,3.0,4.0
EigenWorms,1.0,6.0,6.0,3.5,2.0,3.5,6.0
Epilepsy,1.0,7.0,2.0,3.0,5.0,4.0,6.0
Handwriting,1.0,7.0,5.0,4.0,2.5,2.5,6.0
RacketSports,1.0,7.0,6.0,3.0,4.0,2.0,5.0
HandMovementDirection,5.0,4.0,3.0,2.0,7.0,1.0,6.0
Libras,1.0,7.0,6.0,2.0,5.0,3.0,4.0
NATOPS,2.0,7.0,6.0,3.0,4.0,1.0,5.0
SelfRegulationSCP2,1.0,7.0,6.0,5.0,3.0,4.0,2.0


In [12]:
# Raw test accuracies: one row per dataset, one column per model.
df_small["acc_test"]

Unnamed: 0,MultiRocket,Random Guesser,Randomized Signature,Sampled Controlled ResNet,Sig,Tabular,Time Inhomogenous Randomized Signature
Cricket,0.972222,0.041667,0.736111,0.847222,0.930556,0.875,0.347222
UWaveGestureLibrary,0.946875,0.153125,0.378125,0.825,0.59375,0.81875,0.7125
EigenWorms,0.816794,0.419847,0.419847,0.48855,0.580153,0.48855,0.419847
Epilepsy,0.992754,0.210145,0.768116,0.652174,0.5,0.507246,0.304348
Handwriting,0.372941,0.031765,0.135294,0.151765,0.165882,0.165882,0.090588
RacketSports,0.842105,0.25,0.486842,0.690789,0.605263,0.796053,0.559211
HandMovementDirection,0.27027,0.283784,0.310811,0.418919,0.189189,0.594595,0.216216
Libras,0.95,0.072222,0.288889,0.75,0.355556,0.6,0.583333
NATOPS,0.866667,0.15,0.227778,0.766667,0.644444,0.877778,0.483333
SelfRegulationSCP2,0.6,0.438889,0.5,0.511111,0.527778,0.516667,0.555556
