In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import jax
import jax.numpy as jnp
import jax.lax as lax
from jaxtyping import Array, Float, Int, PRNGKeyArray
import aeon
import pandas as pd
from preprocessing.timeseries_augmentation import normalize_mean_std_traindata, normalize_streams, augment_time, add_basepoint_zero
from aeon.regression.sklearn import RotationForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from aeon.datasets.tser_datasets import tser_soton
from aeon.datasets import load_regression, load_classification
from sklearn.linear_model import RidgeCV

from features.sig import SigTransform, LogSigTransform
from features.base import TimeseriesFeatureTransformer, TabularTimeseriesFeatures, RandomGuesser
from features.sig_neural import RandomizedSignature, TimeInhomogenousRandomizedSignature
from features.SWIM_controlled_resnet import SampledControlledResNet
from features.rocket_wrappers import MultiRocketWrapper
from utils.utils import print_name, print_shape

# Global notebook configuration.
# NOTE(review): the JAX platform is hard-coded to 'gpu'; presumably this notebook is
# meant to run on a CUDA machine -- confirm before running on a CPU-only host.
jax.config.update('jax_platform_name', 'gpu') # Used to set the platform (cpu, gpu, etc.)
# Print floats with 3 decimals and summarize arrays that have more than 5 elements.
np.set_printoptions(precision=3, threshold=5) # Print options

2024-09-20 13:20:46.372853: W external/xla/xla/service/gpu/nvptx_compiler.cc:718] The NVIDIA driver's CUDA version is 12.4 which is older than the ptxas CUDA version (12.5.82). Because the driver is older than the ptxas version, XLA is disabling parallel compilation, which may slow down compilation. You should update your NVIDIA driver or use the NVIDIA-provided CUDA forward compatibility packages.


# Download dataset code

In [2]:
def get_aeon_dataset(
        dataset_name:str,
        normalize:bool = True,
        TSER_or_TSC:str = "TSER",
        extract_path = "/home/nikita/hdd/Data/",
        max_T:int = 1000,
        clip_value:Optional[float] = 5.0,
        ):
    """Loads the train/test split of a dataset through the aeon loaders
    (``load_regression`` for TSER, ``load_classification`` for TSC) and
    returns it as JAX arrays.

    Args:
        dataset_name (str): Name of the dataset
        normalize (bool): Whether to normalize the data. When True, the
            streams are passed through ``normalize_streams``, the targets
            through ``normalize_mean_std_traindata`` (statistics presumably
            taken from the train split -- see those helpers), and the
            streams are then clipped to ``[-clip_value, clip_value]``.
        TSER_or_TSC (str): Whether to load a TSER or TSC dataset.
        extract_path (str): Path to the directory where the datasets are stored.
            Note that the datasets are stored in a subdirectory called "TSER" or "TSC".
        max_T (int): Maximum length of the time series. If the time series is longer
            than this value, it will be average pooled down. The value is only
            forwarded to ``normalize_streams``, so it has no effect when
            ``normalize`` is False.
        clip_value (float | None): Symmetric bound applied to the normalized
            streams. ``None`` disables clipping. Ignored when ``normalize``
            is False. Defaults to 5.0 (the previously hard-coded value).

    Returns:
        Tuple: 4-tuple of the form (X_train, y_train, X_test, y_test).
            The loaded X arrays have their last two axes swapped
            (``transpose(0, 2, 1)``); the y arrays get a trailing axis of
            size 1.

    Raises:
        ValueError: If ``TSER_or_TSC`` is neither 'TSER' nor 'TSC'.
    """
    if TSER_or_TSC == "TSER":
        load_fun = load_regression
    elif TSER_or_TSC == "TSC":
        load_fun = load_classification
    else:
        raise ValueError("TSER_or_TSC must be either 'TSER' or 'TSC'")

    # Download (or read from extract_path) both splits.
    X_train, y_train = load_fun(dataset_name, split="train", extract_path=extract_path)
    X_test, y_test = load_fun(dataset_name, split="test", extract_path=extract_path)

    # To JAX arrays. NOTE: gradients are explicitly stopped on all outputs.
    X_train = lax.stop_gradient(jnp.array(X_train).transpose(0,2,1))
    X_test = lax.stop_gradient(jnp.array(X_test).transpose(0,2,1))
    y_train = lax.stop_gradient(jnp.array(y_train))[:, None]
    y_test = lax.stop_gradient(jnp.array(y_test))[:, None]

    if normalize:
        X_train, X_test = normalize_streams(X_train, X_test, max_T)
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)
        if clip_value is not None:
            # Bound the normalized streams so extreme values cannot dominate.
            X_train = jnp.clip(X_train, -clip_value, clip_value)
            X_test = jnp.clip(X_test, -clip_value, clip_value)

    return X_train, y_train, X_test, y_test


# for dataset_name in list(tser_soton):
#     print(dataset_name)
#     X_train, y_train, X_test, y_test = get_aeon_dataset(dataset_name)
#     N1, T, D = X_train.shape
#     N2, T, D = X_test.shape
#     print(f"Ntrain={N1}, Ntest={N2}, T={T}, D={D}, ytrainshape={y_train.shape}, nuniquelabels={jnp.unique(y_train).shape}")
#     print("\n")

In [3]:
############ dataset details ############

# TetuanEnergyConsumption
# Ntrain=254, Ntest=110, T=144, D=5, ytrainshape=(254,), nuniquelabels=(254,)


# HouseholdPowerConsumption1
# Ntrain=745, Ntest=686, T=1440, D=5, ytrainshape=(745,), nuniquelabels=(745,)


# ChilledWaterPredictor
# Ntrain=321, Ntest=138, T=168, D=4, ytrainshape=(321,), nuniquelabels=(303,)


# NewsTitleSentiment
# Ntrain=58213, Ntest=24951, T=144, D=3, ytrainshape=(58213,), nuniquelabels=(10552,)


# ZincConcentration
# Ntrain=445, Ntest=191, T=2307, D=1, ytrainshape=(445,), nuniquelabels=(314,)


# AustraliaRainfall
# Ntrain=112186, Ntest=48081, T=24, D=3, ytrainshape=(112186,), nuniquelabels=(782,)


# BeijingIntAirportPM25Quality
# Ntrain=1099, Ntest=472, T=24, D=6, ytrainshape=(1099,), nuniquelabels=(970,)


# WaveDataTension
# Ntrain=1325, Ntest=568, T=57, D=1, ytrainshape=(1325,), nuniquelabels=(1325,)


# WindTurbinePower
# Ntrain=596, Ntest=256, T=144, D=1, ytrainshape=(596,), nuniquelabels=(596,)


# BeijingPM25Quality
# Ntrain=11918, Ntest=5048, T=24, D=9, ytrainshape=(11918,), nuniquelabels=(575,)


# FloodModeling2
# Ntrain=466, Ntest=201, T=266, D=1, ytrainshape=(466,), nuniquelabels=(27,)


# FloodModeling3
# Ntrain=429, Ntest=184, T=266, D=1, ytrainshape=(429,), nuniquelabels=(76,)


# PotassiumConcentration
# Ntrain=1561, Ntest=669, T=3578, D=1, ytrainshape=(1561,), nuniquelabels=(853,)


# BIDMC32SpO2
# Ntrain=5550, Ntest=2399, T=4000, D=2, ytrainshape=(5550,), nuniquelabels=(281,)


# MadridPM10Quality
# Ntrain=4845, Ntest=2077, T=168, D=3, ytrainshape=(4845,), nuniquelabels=(4825,)


# SodiumConcentration
# Ntrain=424, Ntest=183, T=1716, D=1, ytrainshape=(424,), nuniquelabels=(373,)


# IEEEPPG
# Ntrain=1768, Ntest=1328, T=1000, D=5, ytrainshape=(1768,), nuniquelabels=(1035,)


# Covid19Andalusia
# Ntrain=142, Ntest=62, T=91, D=1, ytrainshape=(142,), nuniquelabels=(142,)


# BitcoinSentiment
# Ntrain=232, Ntest=100, T=24, D=2, ytrainshape=(232,), nuniquelabels=(228,)


# SulphurConcentration
# Ntrain=444, Ntest=191, T=2307, D=1, ytrainshape=(444,), nuniquelabels=(377,)


# BIDMC32HR
# Ntrain=5550, Ntest=2399, T=4000, D=2, ytrainshape=(5550,), nuniquelabels=(1373,)


# DailyTemperatureLatitude
# Ntrain=27440, Ntest=11760, T=365, D=1, ytrainshape=(27440,), nuniquelabels=(954,)


# MetroInterstateTrafficVolume
# Ntrain=849, Ntest=365, T=24, D=4, ytrainshape=(849,), nuniquelabels=(838,)


# SierraNevadaMountainsSnow
# Ntrain=350, Ntest=150, T=30, D=3, ytrainshape=(350,), nuniquelabels=(195,)


# LiveFuelMoistureContent
# Ntrain=3493, Ntest=1510, T=365, D=7, ytrainshape=(3493,), nuniquelabels=(796,)


# PrecipitationAndalusia
# Ntrain=470, Ntest=202, T=365, D=4, ytrainshape=(470,), nuniquelabels=(467,)


# MethaneMonitoringHomeActivity
# Ntrain=2017, Ntest=865, T=100, D=1, ytrainshape=(2017,), nuniquelabels=(2017,)


# PhosphorusConcentration
# Ntrain=1573, Ntest=675, T=3578, D=1, ytrainshape=(1573,), nuniquelabels=(877,)


# BenzeneConcentration
# Ntrain=3349, Ntest=5163, T=240, D=8, ytrainshape=(3349,), nuniquelabels=(2194,)


# CopperConcentration
# Ntrain=440, Ntest=189, T=2542, D=1, ytrainshape=(440,), nuniquelabels=(310,)


# ManganeseConcentration
# Ntrain=427, Ntest=184, T=1716, D=1, ytrainshape=(427,), nuniquelabels=(396,)


# BIDMC32RR
# Ntrain=5471, Ntest=2399, T=4000, D=2, ytrainshape=(5471,), nuniquelabels=(502,)


# AcousticContaminationMadrid
# Ntrain=166, Ntest=72, T=365, D=1, ytrainshape=(166,), nuniquelabels=(114,)


# VentilatorPressure
# Ntrain=52815, Ntest=22635, T=80, D=2, ytrainshape=(52815,), nuniquelabels=(38384,)


# AppliancesEnergy
# Ntrain=95, Ntest=42, T=144, D=24, ytrainshape=(95,), nuniquelabels=(93,)


# SteamPredictor
# Ntrain=210, Ntest=90, T=168, D=4, ytrainshape=(210,), nuniquelabels=(210,)


# CalciumConcentration
# Ntrain=444, Ntest=191, T=2307, D=1, ytrainshape=(444,), nuniquelabels=(405,)


# NewsHeadlineSentiment
# Ntrain=58213, Ntest=24951, T=144, D=3, ytrainshape=(58213,), nuniquelabels=(28562,)


# IronConcentration
# Ntrain=427, Ntest=184, T=1716, D=1, ytrainshape=(427,), nuniquelabels=(361,)


# ElectricityPredictor
# Ntrain=567, Ntest=243, T=168, D=4, ytrainshape=(567,), nuniquelabels=(567,)


# BeijingPM10Quality
# Ntrain=11918, Ntest=5048, T=24, D=9, ytrainshape=(11918,), nuniquelabels=(710,)


# HotwaterPredictor
# Ntrain=245, Ntest=106, T=168, D=4, ytrainshape=(245,), nuniquelabels=(245,)


# DailyOilGasPrices
# Ntrain=133, Ntest=58, T=30, D=2, ytrainshape=(133,), nuniquelabels=(133,)


# FloodModeling1
# Ntrain=471, Ntest=202, T=266, D=1, ytrainshape=(471,), nuniquelabels=(85,)


# OccupancyDetectionLight
# Ntrain=237, Ntest=103, T=60, D=3, ytrainshape=(237,), nuniquelabels=(100,)


# PPGDalia
# Ntrain=43215, Ntest=21482, T=256, D=4, ytrainshape=(43215,), nuniquelabels=(43211,)


# GasSensorArrayAcetone
# Ntrain=324, Ntest=140, T=7500, D=1, ytrainshape=(324,), nuniquelabels=(4,)


# HouseholdPowerConsumption2
# Ntrain=745, Ntest=686, T=1440, D=5, ytrainshape=(745,), nuniquelabels=(740,)


# BoronConcentration
# Ntrain=438, Ntest=188, T=2542, D=1, ytrainshape=(438,), nuniquelabels=(235,)


# ElectricMotorTemperature
# Ntrain=15503, Ntest=6645, T=60, D=6, ytrainshape=(15503,), nuniquelabels=(15502,)


# AluminiumConcentration
# Ntrain=440, Ntest=189, T=2542, D=1, ytrainshape=(440,), nuniquelabels=(381,)


# ParkingBirmingham
# Ntrain=1391, Ntest=597, T=14, D=1, ytrainshape=(1391,), nuniquelabels=(1385,)


# LPGasMonitoringHomeActivity
# Ntrain=2017, Ntest=865, T=100, D=1, ytrainshape=(2017,), nuniquelabels=(2017,)


# GasSensorArrayEthanol
# Ntrain=324, Ntest=140, T=7500, D=1, ytrainshape=(324,), nuniquelabels=(4,)


# MagnesiumConcentration
# Ntrain=1560, Ntest=669, T=3578, D=1, ytrainshape=(1560,), nuniquelabels=(904,)


# BinanceCoinSentiment
# Ntrain=184, Ntest=79, T=24, D=2, ytrainshape=(184,), nuniquelabels=(173,)


# DhakaHourlyAirQuality
# Ntrain=1447, Ntest=621, T=24, D=1, ytrainshape=(1447,), nuniquelabels=(1255,)


# SolarRadiationAndalusia
# Ntrain=470, Ntest=202, T=365, D=2, ytrainshape=(470,), nuniquelabels=(470,)


# EthereumSentiment
# Ntrain=249, Ntest=107, T=24, D=2, ytrainshape=(249,), nuniquelabels=(241,)


# Covid3Month
# Ntrain=140, Ntest=61, T=84, D=1, ytrainshape=(140,), nuniquelabels=(110,)


# NaturalGasPricesSentiment
# Ntrain=65, Ntest=28, T=20, D=1, ytrainshape=(65,), nuniquelabels=(65,)


# BarCrawl6min
# Ntrain=140, Ntest=61, T=360, D=3, ytrainshape=(140,), nuniquelabels=(137,)


# CardanoSentiment
# Ntrain=74, Ntest=33, T=24, D=2, ytrainshape=(74,), nuniquelabels=(66,)

In [4]:
# Subset of TSER dataset names with small training sets (at most 470 train
# samples each, per the dataset details listed in the previous cell).
small_datasets = [
    "CardanoSentiment", 
    "BarCrawl6min", 
    "NaturalGasPricesSentiment",
    "Covid3Month",
    "EthereumSentiment",
    "SolarRadiationAndalusia",
    "GasSensorArrayEthanol",
    "DailyOilGasPrices",
    "AppliancesEnergy",
    ]

# Experiment code

In [5]:
def run_1model_1dataset(
        model: TimeseriesFeatureTransformer,
        X_train: Float[Array, "N1  T  D"],
        y_train: Float[Array, "N1  1"],
        X_test: Float[Array, "N2  T  D"],
        y_test: Float[Array, "N2  1"],
        normalize_features: bool = False,
        normalize_y: bool = True,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        ):
    """Fits one feature model on one dataset and scores a RidgeCV readout.

    Args:
        model: Feature transformer; its ``fit(X, y)`` and ``transform(X)``
            methods are called.
        X_train, y_train, X_test, y_test: Train/test streams and targets.
        normalize_features (bool): Whether to pass the extracted features
            through ``normalize_mean_std_traindata``.
        normalize_y (bool): Whether to pass the targets through
            ``normalize_mean_std_traindata``.
        apply_augment_time (bool): Whether to apply ``augment_time`` to the
            streams.
        apply_basepoint (bool): Whether to apply ``add_basepoint_zero`` to
            the streams (done before time augmentation).

    Returns:
        Tuple: (train_rmse, test_rmse, alpha, t_fit_feat, t_trans_feat, t_ridge)
            where alpha is the regularization selected by RidgeCV and the
            three timings are wall-clock seconds for fitting the feature
            model, computing the features (including their optional
            normalization), and fitting + evaluating the ridge regression.
    """
    if apply_basepoint:
        X_train = add_basepoint_zero(X_train)
        X_test = add_basepoint_zero(X_test)
    if apply_augment_time:
        X_train = augment_time(X_train)
        X_test  = augment_time(X_test)
    if normalize_y:
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)

    # Fit feature model. perf_counter is monotonic, unlike time.time().
    t0 = time.perf_counter()
    model.fit(X_train, y_train)

    # Obtain features.
    t1 = time.perf_counter()
    feat_train = model.transform(X_train)
    feat_test = model.transform(X_test)
    if normalize_features:
        feat_train, feat_test = normalize_mean_std_traindata(feat_train, feat_test)
    # JAX dispatches work asynchronously: without an explicit sync the feature
    # computation could still be running when t2 is taken and its cost would be
    # attributed to the ridge stage (np.array below forces the sync).
    # Non-JAX outputs pass through block_until_ready unchanged.
    feat_train, feat_test = jax.block_until_ready((feat_train, feat_test))

    # Fit ridge regression on host (NumPy) copies.
    t2 = time.perf_counter()
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    feat_train = np.array(feat_train)
    feat_test = np.array(feat_test)
    ridge = RidgeCV(alphas=np.logspace(-3, 3, 30))
    ridge.fit(feat_train, y_train)

    # Predict and score.
    pred_test = ridge.predict(feat_test)
    pred_train = ridge.predict(feat_train)
    test_rmse = np.sqrt(mean_squared_error(y_test, pred_test))
    train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
    alpha = ridge.alpha_
    t3 = time.perf_counter()

    t_fit_feat = t1 - t0
    t_trans_feat = t2 - t1
    t_ridge = t3 - t2
    return train_rmse, test_rmse, alpha, t_fit_feat, t_trans_feat, t_ridge



def run_allmodels_1dataset(
        X_train: Float[Array, "N1  T  D"],
        y_train: Float[Array, "N1  1"],
        X_test: Float[Array, "N2  T  D"],
        y_test: Float[Array, "N2  1"],
        normalize_features: bool = True,
        normalize_y: bool = True,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        ):
    """Evaluates every feature model of the benchmark on a single dataset.

    Each model is built with the same PRNG key and handed, together with
    the data and the preprocessing flags, to ``run_1model_1dataset``.

    Returns:
        Tuple: (model_names, results) -- two lists of equal length where
            results[i] is the tuple returned by ``run_1model_1dataset``
            for the model called model_names[i].
    """
    prng_key = jax.random.PRNGKey(999)
    max_batch = 1000
    n_features = 100

    # (display name, feature transformer) pairs, evaluated in this order.
    named_models = (
        ("Random Guesser", RandomGuesser(prng_key, 100, max_batch)),
        ("Tabular", TabularTimeseriesFeatures(max_batch)),
        ("Sig", SigTransform(4, max_batch)),
        #("Log Sig", LogSigTransform(4, max_batch)),
        ("Randomized Signature",
            RandomizedSignature(prng_key, n_features, max_batch)),
        ("Time Inhomogenous Randomized Signature",
            TimeInhomogenousRandomizedSignature(prng_key, n_features, max_batch)),
        ("Sampled Controlled ResNet",
            SampledControlledResNet(prng_key, n_features, jnp.tanh, max_batch)),
        ("MultiRocket", MultiRocketWrapper(50000, 10000000)),
    )

    flags = (normalize_features, normalize_y, apply_augment_time, apply_basepoint)
    model_names = [label for label, _ in named_models]
    results = [
        run_1model_1dataset(transformer, X_train, y_train, X_test, y_test, *flags)
        for _, transformer in named_models
    ]
    return model_names, results



def run_all_experiments(
        datasets: List[str],
        normalize_X: bool = True,
        normalize_features: bool = False,
        normalize_y: bool = True,
        apply_augment_time: bool = False,
        apply_basepoint: bool = True,
        max_T = 100,
        ):
    """Runs all models on every dataset and collects the results in a DataFrame.

    Datasets whose training set has more than 1000 samples are skipped.
    Exceptions raised while processing a dataset are not caught, so a single
    failure aborts the whole run.

    Args:
        datasets (List[str]): Dataset names passed to ``get_aeon_dataset``.
        normalize_X (bool): Forwarded as the ``normalize`` argument of
            ``get_aeon_dataset``.
        normalize_features, normalize_y, apply_augment_time, apply_basepoint:
            Forwarded to ``run_allmodels_1dataset``.
        max_T: Forwarded to ``get_aeon_dataset`` and ``normalize_streams``.

    Returns:
        pd.DataFrame: One row per evaluated dataset, with (attribute, model
            name) columns sorted by label. The frame is also printed and
            pickled to "TSER_results.pkl". If no dataset was evaluated, an
            empty DataFrame is returned and the results file is left untouched.
    """
    # Run experiments
    experiments = {}
    for dataset_name in tqdm(datasets):
        t0 = time.time()
        print(dataset_name)
        X_train, y_train, X_test, y_test = get_aeon_dataset(dataset_name, normalize_X, max_T=max_T)
        print("X_train shape", X_train.shape, "y_test shape", y_test.shape)
        #TODO FILTER START
        if X_train.shape[0] > 1000:
            print("Skipping dataset", dataset_name)
            continue
        #TODO FILTER END
        # NOTE(review): the two calls below run regardless of `normalize_X`, and
        # repeat what get_aeon_dataset already did when `normalize_X` is True.
        # Kept as-is to preserve the published results -- confirm whether
        # this is intended.
        X_train, X_test = normalize_streams(X_train, X_test, max_T=max_T)
        y_train, y_test = normalize_mean_std_traindata(y_train, y_test)
        results = run_allmodels_1dataset(
            X_train, y_train, X_test, y_test, 
            normalize_features, normalize_y, apply_augment_time, apply_basepoint,
            )
        experiments[dataset_name] = results
        print(dataset_name, "Elapsed time", time.time()-t0)

    # pd.concat raises ValueError on an empty list; bail out early and do not
    # overwrite a previously saved results file with nothing.
    if not experiments:
        print("No datasets were evaluated; nothing to save.")
        return pd.DataFrame()

    # Save results
    # experiments maps dataset name -> (model_names, results), where results[i]
    # is the tuple returned for model_names[i], ordered like `attributes`.
    attributes = ["RMSE_train", "RMSE_test", "alpha", "t_fit_feat", "t_trans_feat", "t_ridge"]
    data_list = []
    for dataset_name, (model_names, results) in experiments.items():
        dataset_data = {
            (attribute, model_name): results[model_idx][attr_idx]
            for attr_idx, attribute in enumerate(attributes)
            for model_idx, model_name in enumerate(model_names)
        }
        data_list.append(pd.DataFrame(dataset_data, index=[dataset_name]))

    # Combine all datasets into a single DataFrame
    df = pd.concat(data_list)
    df = df.sort_index(axis=1)
    print(df)
    df.to_pickle("TSER_results.pkl")
    return df

In [6]:
#df = run_all_experiments(tser_soton)

In [7]:
# Load the results table that run_all_experiments saves to "TSER_results.pkl".
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
df = pd.read_pickle("TSER_results.pkl")

In [8]:
# Test-set RMSE: one row per dataset, one column per model.
df["RMSE_test"]

Unnamed: 0,MultiRocket,Random Guesser,Randomized Signature,Sampled Controlled ResNet,Sig,Tabular,Time Inhomogenous Randomized Signature
Covid19Andalusia,0.478009,0.941507,0.802195,1.032347,0.729921,0.839759,0.901075
MetroInterstateTrafficVolume,1.17911,0.969703,0.954417,0.959055,0.95559,0.95149,0.963759
WindTurbinePower,0.158138,1.06034,2.668295,1.062872,0.740283,1.018327,1.047106
ElectricityPredictor,1.420269,1.178119,1.049715,1.15499,1.161169,1.104583,1.155629
BoronConcentration,0.519036,0.340614,0.380499,0.328043,0.324365,0.322991,0.336314
HouseholdPowerConsumption1,0.279803,0.778683,0.538582,0.780107,1.038575,0.220358,0.791675
ManganeseConcentration,0.765767,0.997768,0.919123,0.8249,0.930177,0.815962,0.836522
HouseholdPowerConsumption2,0.898125,1.237572,1.135112,1.21645,1.562218,0.973609,1.267563
TetuanEnergyConsumption,0.627258,0.984222,0.676474,0.75149,0.765065,0.627147,0.737154
AluminiumConcentration,0.545124,1.024684,0.865221,0.583082,0.91522,0.480519,0.652138


In [9]:
# Rank the models within each dataset (axis=1): rank 1 is the lowest test RMSE,
# and tied models share the smallest rank of their group (method='min').
df["RMSE_test"].rank(axis=1, method='min')

Unnamed: 0,MultiRocket,Random Guesser,Randomized Signature,Sampled Controlled ResNet,Sig,Tabular,Time Inhomogenous Randomized Signature
Covid19Andalusia,1.0,6.0,3.0,7.0,2.0,4.0,5.0
MetroInterstateTrafficVolume,7.0,6.0,2.0,4.0,3.0,1.0,5.0
WindTurbinePower,1.0,5.0,7.0,6.0,2.0,3.0,4.0
ElectricityPredictor,7.0,6.0,1.0,3.0,5.0,2.0,4.0
BoronConcentration,7.0,5.0,6.0,3.0,2.0,1.0,4.0
HouseholdPowerConsumption1,2.0,4.0,3.0,5.0,7.0,1.0,6.0
ManganeseConcentration,1.0,7.0,5.0,3.0,6.0,2.0,4.0
HouseholdPowerConsumption2,1.0,5.0,3.0,4.0,7.0,2.0,6.0
TetuanEnergyConsumption,2.0,7.0,3.0,5.0,6.0,1.0,4.0
AluminiumConcentration,2.0,7.0,5.0,3.0,6.0,1.0,4.0


In [10]:
# Average rank of each model over all datasets (lower is better).
df["RMSE_test"].rank(axis=1, method='min').mean(axis=0)

MultiRocket                               3.157895
Random Guesser                            5.657895
Randomized Signature                      3.315789
Sampled Controlled ResNet                 4.394737
Sig                                       4.421053
Tabular                                   2.210526
Time Inhomogenous Randomized Signature    4.842105
dtype: float64