In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import jax
import jax.numpy as jnp
import jax.lax as lax
from jaxtyping import Array, Float, Int, PRNGKeyArray
import aeon
import pandas as pd
from preprocessing.timeseries_augmentation import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from aeon.datasets.tser_datasets import tser_soton
from aeon.datasets import load_regression, load_classification
from sklearn.linear_model import RidgeCV

from features.sig import SigTransform, LogSigTransform
from features.base import TimeseriesFeatureTransformer, TabularTimeseriesFeatures, RandomGuesser
from features.sig_neural import RandomizedSignature, TimeInhomogenousRandomizedSignature
from features.SWIM_controlled_resnet import SampledControlledResNet
from features.rocket_wrappers import MultiRocketWrapper
from utils.utils import print_name, print_shape

# Select the JAX platform for this session ('gpu' here; 'cpu' is the other option named below).
# NOTE(review): presumably requires a visible CUDA device -- confirm behaviour on CPU-only machines.
jax.config.update('jax_platform_name', 'gpu') # Used to set the platform (cpu, gpu, etc.)
# Compact array printing: 3 decimals, and arrays with more than 5 elements are summarized.
np.set_printoptions(precision=3, threshold=5) # Print options

2024-09-20 17:34:04.245781: W external/xla/xla/service/gpu/nvptx_compiler.cc:718] The NVIDIA driver's CUDA version is 12.4 which is older than the ptxas CUDA version (12.5.82). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


# Download dataset code

In [2]:
def get_aeon_dataset(
        dataset_name:str,
        normalize:bool = True,
        TSER_or_TSC:str = "TSER",
        extract_path = "/home/nikita/hdd/Data/",
        max_T:int = 1000,
        ):
    """Fetch the train/test split of an aeon dataset as JAX arrays.

    Args:
        dataset_name (str): Name of the dataset, forwarded to the aeon loader.
        normalize (bool): If True, X goes through ``normalize_streams``
            (together with ``max_T``), y goes through
            ``normalize_mean_std_traindata``, and X is finally clipped
            to the interval [-5, 5].
        TSER_or_TSC (str): "TSER" selects ``load_regression``, "TSC" selects
            ``load_classification``.
        extract_path (str): Directory handed to the aeon loader as
            ``extract_path`` (where the datasets are stored).
        max_T (int): Maximum length of the time series; forwarded to
            ``normalize_streams``, which is expected to pool longer series
            down. Only used when ``normalize`` is True.

    Returns:
        Tuple: 4-tuple of the form (X_train, y_train, X_test, y_test).
            The last two axes of the loaded X arrays are swapped and
            y receives a trailing axis of size 1.

    Raises:
        ValueError: If ``TSER_or_TSC`` is neither "TSER" nor "TSC".
    """
    # Reject unknown task types before touching any loader.
    if TSER_or_TSC not in ("TSER", "TSC"):
        raise ValueError("TSER_or_TSC must be either 'TSER' or 'TSC'")
    loader = load_regression if TSER_or_TSC == "TSER" else load_classification

    # Download (or read from extract_path) both splits: train first, then test.
    raw_X_train, raw_y_train = loader(dataset_name, split="train", extract_path=extract_path)
    raw_X_test, raw_y_test = loader(dataset_name, split="test", extract_path=extract_path)

    def _to_streams(raw_X):
        # Swap the last two axes; gradients are deliberately cut off.
        return lax.stop_gradient(jnp.array(raw_X).transpose(0,2,1))

    def _to_targets(raw_y):
        # Column vector of targets, also without gradients.
        return lax.stop_gradient(jnp.array(raw_y))[:, None]

    X_train = _to_streams(raw_X_train)
    X_test = _to_streams(raw_X_test)
    y_train = _to_targets(raw_y_train)
    y_test = _to_targets(raw_y_test)

    if not normalize:
        return X_train, y_train, X_test, y_test

    # Normalize inputs and targets, then clip the inputs to a fixed range.
    X_train, X_test = normalize_streams(X_train, X_test, max_T)
    y_train, y_test = normalize_mean_std_traindata(y_train, y_test)
    clip_bound = 5.0
    X_train = X_train.clip(-clip_bound, clip_bound)
    X_test = X_test.clip(-clip_bound, clip_bound)
    return X_train, y_train, X_test, y_test


# df = []
# for dataset_name in list(tser_soton):
#     print(dataset_name)
#     X_train, y_train, X_test, y_test = get_aeon_dataset(dataset_name)

#     N1, T, D = X_train.shape
#     N2, T, D = X_test.shape
#     metadata_row = {
#                     "Dataset": dataset_name,
#                     "N_train": N1,
#                     "N_test": N2,
#                     "T": T,
#                     "D": D,
#                     "y_train_shape": y_train.shape,
#                     "n_labels": len(np.unique(y_train)),}
#     df.append(metadata_row)
# df = pd.DataFrame(df).set_index("Dataset")
# df.to_pickle("TSER_dataset_metadata.pkl")


In [3]:
# Load the per-dataset metadata table (N_train, N_test, T, D, ...) that the
# commented-out loop above wrote to "TSER_dataset_metadata.pkl".
df_meta = pd.read_pickle("TSER_dataset_metadata.pkl")
df_meta.head()

Unnamed: 0_level_0,N_train,N_test,T,D,y_train_shape,n_labels
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FloodModeling1,471,202,266,1,"(471, 1)",85
BarCrawl6min,140,61,360,3,"(140, 1)",137
TetuanEnergyConsumption,254,110,144,5,"(254, 1)",254
GasSensorArrayEthanol,324,140,938,1,"(324, 1)",4
FloodModeling3,429,184,266,1,"(429, 1)",76


In [4]:
# Datasets with at least 2000 training samples, smallest first.
large_datasets_meta = df_meta.query("N_train >= 2000").sort_values("N_train")
print(large_datasets_meta)
# Keep the metadata frame and the list of dataset names under separate names,
# so `large_datasets` is always a list of str and never a DataFrame.
large_datasets = list(large_datasets_meta.index)

                               N_train  N_test     T  D y_train_shape  \
Dataset                                                                 
LPGasMonitoringHomeActivity       2017     865   100  1     (2017, 1)   
MethaneMonitoringHomeActivity     2017     865   100  1     (2017, 1)   
BenzeneConcentration              3349    5163   240  8     (3349, 1)   
LiveFuelMoistureContent           3493    1510   365  7     (3493, 1)   
MadridPM10Quality                 4845    2077   168  3     (4845, 1)   
BIDMC32RR                         5471    2399  1000  2     (5471, 1)   
BIDMC32HR                         5550    2399  1000  2     (5550, 1)   
BIDMC32SpO2                       5550    2399  1000  2     (5550, 1)   
BeijingPM10Quality               11918    5048    24  9    (11918, 1)   
BeijingPM25Quality               11918    5048    24  9    (11918, 1)   
ElectricMotorTemperature         15503    6645    60  6    (15503, 1)   
DailyTemperatureLatitude         27440   11760   36

In [5]:
# Datasets with more than 1000 and fewer than 2000 training samples, smallest first.
medium_datasets_meta = df_meta.query("1000 < N_train < 2000").sort_values("N_train")
print(medium_datasets_meta)
# Keep the metadata frame and the list of dataset names under separate names,
# so `medium_datasets` is always a list of str and never a DataFrame.
medium_datasets = list(medium_datasets_meta.index)

                              N_train  N_test     T  D y_train_shape  n_labels
Dataset                                                                       
BeijingIntAirportPM25Quality     1099     472    24  6     (1099, 1)       970
WaveDataTension                  1325     568    57  1     (1325, 1)      1325
ParkingBirmingham                1391     597    14  1     (1391, 1)      1385
DhakaHourlyAirQuality            1447     621    24  1     (1447, 1)      1255
MagnesiumConcentration           1560     669   895  1     (1560, 1)       904
PotassiumConcentration           1561     669   895  1     (1561, 1)       853
PhosphorusConcentration          1573     675   895  1     (1573, 1)       877
IEEEPPG                          1768    1328  1000  5     (1768, 1)      1035


In [6]:
# Datasets with at most 1000 training samples, smallest first.
small_datasets_meta = df_meta.query("N_train <= 1000").sort_values("N_train")
print(small_datasets_meta)
# Keep the metadata frame and the list of dataset names under separate names,
# so `small_datasets` is always a list of str and never a DataFrame.
small_datasets = list(small_datasets_meta.index)


                              N_train  N_test    T   D y_train_shape  n_labels
Dataset                                                                       
NaturalGasPricesSentiment          65      28   20   1       (65, 1)        65
CardanoSentiment                   74      33   24   2       (74, 1)        66
AppliancesEnergy                   95      42  144  24       (95, 1)        93
DailyOilGasPrices                 133      58   30   2      (133, 1)       133
Covid3Month                       140      61   84   1      (140, 1)       110
BarCrawl6min                      140      61  360   3      (140, 1)       137
Covid19Andalusia                  142      62   91   1      (142, 1)       142
AcousticContaminationMadrid       166      72  365   1      (166, 1)       114
BinanceCoinSentiment              184      79   24   2      (184, 1)       173
SteamPredictor                    210      90  168   4      (210, 1)       210
BitcoinSentiment                  232     100   24  

# Experiment code

In [7]:
def run_1model_1dataset(
        model: TimeseriesFeatureTransformer,
        X_train: Float[Array, "N1  T  D"],
        y_train: Float[Array, "N1  1"],
        X_test: Float[Array, "N2  T  D"],
        y_test: Float[Array, "N2  1"],
        normalize_features: bool = False,
        normalize_y: bool = True,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        ):
    """Fit one feature model plus a ridge-regression head and score it.

    The feature model is fitted on the training data, both splits are
    transformed into features, and a ``RidgeCV`` (30 log-spaced alphas
    between 1e-3 and 1e3) is fitted on the training features.

    Args:
        model: Feature extractor; must provide ``fit(X, y)`` and ``transform(X)``.
        X_train: Training time series.
        y_train: Training targets.
        X_test: Test time series.
        y_test: Test targets.
        normalize_features (bool): If True, features are passed through
            ``normalize_mean_std_traindata`` before the ridge fit.
        normalize_y (bool): If True, targets are passed through
            ``normalize_mean_std_traindata``.
        apply_augment_time (bool): If True, ``augment_time`` is applied to
            both X arrays.
        apply_basepoint (bool): If True, ``add_basepoint_zero`` is applied to
            both X arrays (before the optional time augmentation).

    Returns:
        Tuple: (train_rmse, test_rmse, alpha, t_fit_feat, t_trans_feat, t_ridge)
            where ``alpha`` is the regularization chosen by ``RidgeCV`` and the
            three ``t_*`` entries are wall-clock durations in seconds for
            fitting the feature model, computing the features, and fitting and
            evaluating the ridge model.
    """
    if apply_basepoint:
        X_train = add_basepoint_zero(X_train)
        X_test = add_basepoint_zero(X_test)
    if apply_augment_time:
        X_train = augment_time(X_train)
        X_test  = augment_time(X_test)
    if normalize_y:
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)

    #fit feature model
    t0 = time.perf_counter()
    model.fit(X_train, y_train)

    #obtain features
    t1 = time.perf_counter()
    feat_train = model.transform(X_train)
    feat_test = model.transform(X_test)
    if normalize_features:
        feat_train, feat_test = normalize_mean_std_traindata(feat_train, feat_test)
    # JAX dispatches work asynchronously: without this barrier the clock below
    # would stop before the features exist, and their cost would be billed to
    # the ridge step when np.array() forces the computation. Non-JAX outputs
    # pass through untouched.
    # NOTE(review): any work still pending from model.fit is billed to the
    # transform step; confirm whether the models block inside fit().
    jax.block_until_ready((feat_train, feat_test))

    #fit ridge regression
    t2 = time.perf_counter()
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    feat_train = np.array(feat_train)
    feat_test = np.array(feat_test)
    ridge = RidgeCV(alphas=np.logspace(-3, 3, 30))
    ridge.fit(feat_train, y_train)

    # predict
    pred_test = ridge.predict(feat_test)
    pred_train = ridge.predict(feat_train)
    test_rmse = np.sqrt(mean_squared_error(y_test, pred_test))
    train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
    alpha = ridge.alpha_
    t3 = time.perf_counter()

    t_fit_feat = t1 - t0
    t_trans_feat = t2 - t1
    t_ridge = t3 - t2
    return train_rmse, test_rmse, alpha, t_fit_feat, t_trans_feat, t_ridge



def run_allmodels_1dataset(
        X_train: Float[Array, "N1  T  D"],
        y_train: Float[Array, "N1  1"],
        X_test: Float[Array, "N2  T  D"],
        y_test: Float[Array, "N2  1"],
        normalize_features: bool = False,
        normalize_y: bool = True,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        ):
    """Evaluate every benchmark feature model on a single dataset.

    All models are constructed up front (sharing one PRNG key built from
    seed 999) and then passed one by one to ``run_1model_1dataset``
    together with the data and the preprocessing flags.

    Args:
        X_train: Training time series.
        y_train: Training targets.
        X_test: Test time series.
        y_test: Test targets.
        normalize_features (bool): Forwarded to ``run_1model_1dataset``.
        normalize_y (bool): Forwarded to ``run_1model_1dataset``.
        apply_augment_time (bool): Forwarded to ``run_1model_1dataset``.
        apply_basepoint (bool): Forwarded to ``run_1model_1dataset``.

    Returns:
        Tuple: (model_names, results) -- the display names of the models and,
            in the same order, the tuple returned by ``run_1model_1dataset``
            for each of them.
    """
    seed_key = jax.random.PRNGKey(999)
    batch_cap = 1000
    feature_dim = 100

    # (display name, model instance) pairs in evaluation order.
    # The "Log Sig" model (LogSigTransform(4, batch_cap)) is currently disabled.
    named_models = [
        ("Random Guesser", RandomGuesser(seed_key, 100, batch_cap)),
        ("Tabular", TabularTimeseriesFeatures(batch_cap)),
        ("Sig", SigTransform(4, batch_cap)),
        ("Randomized Signature",
            RandomizedSignature(seed_key, feature_dim, batch_cap)),
        ("Time Inhomogenous Randomized Signature",
            TimeInhomogenousRandomizedSignature(seed_key, feature_dim, batch_cap)),
        ("Sampled Controlled ResNet",
            SampledControlledResNet(seed_key, feature_dim, jnp.tanh, batch_cap)),
        ("MultiRocket", MultiRocketWrapper(50000, 10000000)),
        ]

    model_names = [label for label, _ in named_models]
    results = [
        run_1model_1dataset(
            feature_model, X_train, y_train, X_test, y_test,
            normalize_features, normalize_y, apply_augment_time, apply_basepoint
            )
        for _, feature_model in named_models
        ]
    return model_names, results



def run_all_experiments(
        datasets: List[str],
        name_save: str,
        normalize_X: bool = True,
        normalize_features: bool = False,
        normalize_y: bool = True,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        max_T: int = 100,
        ):
    """Benchmark all feature models on several datasets and save the results.

    Args:
        datasets (List[str]): Names of the datasets to load with
            ``get_aeon_dataset``.
        name_save (str): Suffix of the output file; results are pickled to
            ``TSER_results_{name_save}.pkl`` in the working directory.
        normalize_X (bool): Whether the loaded data is normalized. When False
            the data is used exactly as returned by
            ``get_aeon_dataset(..., normalize=False)``.
        normalize_features (bool): Forwarded to ``run_allmodels_1dataset``.
        normalize_y (bool): Forwarded to ``run_allmodels_1dataset``.
        apply_augment_time (bool): Forwarded to ``run_allmodels_1dataset``.
        apply_basepoint (bool): Forwarded to ``run_allmodels_1dataset``.
        max_T (int): Forwarded to ``get_aeon_dataset`` and ``normalize_streams``.

    Returns:
        pd.DataFrame: One row per dataset; columns are a two-level index of
            (attribute, model name) sorted lexicographically, with attributes
            RMSE_train, RMSE_test, alpha, t_fit_feat, t_trans_feat, t_ridge.
    """
    # Run experiments. Errors are deliberately not caught, so a failing
    # dataset aborts the whole run instead of being silently skipped.
    experiments = {}
    for dataset_name in tqdm(datasets):
        t0 = time.time()
        print(dataset_name)
        X_train, y_train, X_test, y_test = get_aeon_dataset(dataset_name, normalize_X, max_T=max_T)
        if normalize_X:
            # Previously this second normalization ran unconditionally, which
            # made normalize_X=False a no-op. It is kept for normalize_X=True
            # so that the default pipeline produces the same numbers as before.
            # NOTE(review): get_aeon_dataset has already normalized and
            # clipped the data at this point; consider dropping this pass.
            X_train, X_test = normalize_streams(X_train, X_test, max_T=max_T)
            y_train, y_test = normalize_mean_std_traindata(y_train, y_test)
        results = run_allmodels_1dataset(
            X_train, y_train, X_test, y_test,
            normalize_features, normalize_y, apply_augment_time, apply_basepoint,
            )
        experiments[dataset_name] = results
        print(dataset_name, "Elapsed time", time.time()-t0)

    # Save results
    # experiments maps dataset name -> (model_names, results), where results[i]
    # holds the attributes below for model_names[i], in this order.
    attributes = ["RMSE_train", "RMSE_test", "alpha", "t_fit_feat", "t_trans_feat", "t_ridge"]
    data_list = []
    for dataset_name, (model_names, results) in experiments.items():
        dataset_data = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle(f"TSER_results_{name_save}.pkl")
    return df

In [8]:
# Benchmark all feature models on the medium-sized datasets; run_all_experiments
# also pickles the resulting table to "TSER_results_medium.pkl".
df = run_all_experiments(medium_datasets, "medium", max_T=100)

  0%|          | 0/8 [00:00<?, ?it/s]

BeijingIntAirportPM25Quality


 12%|█▎        | 1/8 [00:14<01:39, 14.26s/it]

BeijingIntAirportPM25Quality Elapsed time 14.260642051696777
WaveDataTension


 25%|██▌       | 2/8 [00:28<01:25, 14.17s/it]

WaveDataTension Elapsed time 14.10701060295105
ParkingBirmingham


 38%|███▊      | 3/8 [00:41<01:08, 13.75s/it]

ParkingBirmingham Elapsed time 13.24158000946045
DhakaHourlyAirQuality


 50%|█████     | 4/8 [00:56<00:56, 14.24s/it]

DhakaHourlyAirQuality Elapsed time 14.991873264312744
MagnesiumConcentration


 62%|██████▎   | 5/8 [01:23<00:56, 18.75s/it]

MagnesiumConcentration Elapsed time 26.756635665893555
PotassiumConcentration


 75%|███████▌  | 6/8 [01:44<00:39, 19.58s/it]

PotassiumConcentration Elapsed time 21.170787572860718
PhosphorusConcentration


 88%|████████▊ | 7/8 [02:07<00:20, 20.71s/it]

PhosphorusConcentration Elapsed time 23.031734228134155
IEEEPPG


100%|██████████| 8/8 [02:47<00:00, 20.92s/it]

IEEEPPG Elapsed time 39.74850535392761
                               RMSE_test                                      \
                             MultiRocket Random Guesser Randomized Signature   
BeijingIntAirportPM25Quality    0.762462       0.998072             0.837342   
WaveDataTension                 1.016923       1.088821             0.919409   
ParkingBirmingham               0.621193       1.067190             1.020380   
DhakaHourlyAirQuality           0.092696       1.040910             0.321173   
MagnesiumConcentration          0.375519       0.985363             0.665889   
PotassiumConcentration          0.587728       0.739649             0.671364   
PhosphorusConcentration         0.636007       0.937045             0.885559   
IEEEPPG                         1.232147       1.540215             1.303069   

                                                                             \
                             Sampled Controlled ResNet        Sig   Tabular   
Be




In [13]:
# Reload the saved medium-dataset results from disk.
df_med = pd.read_pickle("TSER_results_medium.pkl")

In [14]:
# Mean test RMSE per model across the medium datasets.
# Note: the "Sig" mean is dominated by its IEEEPPG score (29.34), so the
# rank-based comparison in the following cells is the more robust summary.
df_med["RMSE_test"].mean()

MultiRocket                               0.665585
Random Guesser                            1.049658
Randomized Signature                      0.828023
Sampled Controlled ResNet                 0.782996
Sig                                       4.380231
Tabular                                   0.738496
Time Inhomogenous Randomized Signature    0.806068
dtype: float64

In [15]:
# Average rank per model across datasets (rank 1 = lowest test RMSE on a
# dataset; tied models share the smallest rank of their group).
df_med["RMSE_test"].rank(axis=1, method='min').mean()

MultiRocket                               1.875
Random Guesser                            6.875
Randomized Signature                      4.250
Sampled Controlled ResNet                 3.125
Sig                                       5.750
Tabular                                   2.250
Time Inhomogenous Randomized Signature    3.875
dtype: float64

In [19]:
# Per-dataset ranks of the models by test RMSE (1 = best on that dataset).
df_med["RMSE_test"].rank(axis=1, method='min')

Unnamed: 0,MultiRocket,Random Guesser,Randomized Signature,Sampled Controlled ResNet,Sig,Tabular,Time Inhomogenous Randomized Signature
BeijingIntAirportPM25Quality,2.0,7.0,3.0,4.0,6.0,1.0,5.0
WaveDataTension,6.0,7.0,4.0,3.0,5.0,1.0,2.0
ParkingBirmingham,1.0,7.0,6.0,2.0,4.0,5.0,3.0
DhakaHourlyAirQuality,1.0,7.0,5.0,3.0,6.0,2.0,4.0
MagnesiumConcentration,2.0,7.0,5.0,3.0,6.0,1.0,4.0
PotassiumConcentration,1.0,7.0,4.0,2.0,6.0,3.0,5.0
PhosphorusConcentration,1.0,7.0,5.0,3.0,6.0,2.0,4.0
IEEEPPG,1.0,6.0,2.0,5.0,7.0,3.0,4.0


In [20]:
# Raw test RMSE of every model on every medium dataset.
df_med["RMSE_test"]

Unnamed: 0,MultiRocket,Random Guesser,Randomized Signature,Sampled Controlled ResNet,Sig,Tabular,Time Inhomogenous Randomized Signature
BeijingIntAirportPM25Quality,0.762462,0.998072,0.837342,0.882803,0.89376,0.74868,0.888635
WaveDataTension,1.016923,1.088821,0.919409,0.88304,0.989143,0.821681,0.872344
ParkingBirmingham,0.621193,1.06719,1.02038,0.765438,0.919772,1.000532,0.879767
DhakaHourlyAirQuality,0.092696,1.04091,0.321173,0.224035,0.43596,0.198016,0.275619
MagnesiumConcentration,0.375519,0.985363,0.665889,0.531011,0.849769,0.356027,0.544208
PotassiumConcentration,0.587728,0.739649,0.671364,0.623145,0.698375,0.647635,0.676318
PhosphorusConcentration,0.636007,0.937045,0.885559,0.835389,0.910623,0.820897,0.84763
IEEEPPG,1.232147,1.540215,1.303069,1.519105,29.344443,1.314504,1.464025
