In [6]:
import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.utils
import sklearn.metrics
import iisignature
import torch
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List, Optional, Dict, Set, Callable, Any
from joblib import Memory, Parallel, delayed
import tslearn
import tslearn.metrics
from tslearn.datasets import UCR_UEA_datasets
import sigkernel
import scipy
from scipy.interpolate import interp1d
from numba import njit
import pickle
import time

from models.signature import streams_to_sigs, transform_stream
from models.conformance import BaseclassConformanceScore, stream_to_torch
from models.kernels import linear_kernel_gram, rbf_kernel_gram, poly_kernel_gram
from models.kernels import pairwise_kernel_gram, integral_kernel_gram, sig_kernel_gram
from experiment_code import print_dataset_stats

from models.signature import transform_stream

# PenDigits dataset (Variable Length) 

* We can't use the tslearn version of this dataset, since it interpolates the time series and homogenizes them all to the same length.

In [None]:
############################################################################################## |
################################### PenDigits experiments #################################### |
############################################################################################## \/

def run_pendigits_experiments(df:pd.DataFrame, 
                              kernel_names:List[str],
                              stream_transforms = ["time_enhance", "min_max_normalize"],):
    """Calculates AUCs for each kernel on the PenDigits dataset.

    Args:
        df: DataFrame with columns ["data", "label", "subset"]. Each entry
            of "data" is a timeseries of shape (T_i, d) of variable length.
            Rows are selected by subset == "train" and subset == "test".
            The frame is only read; it is never modified.
        kernel_names: Kernel names, each forwarded to `run_single_kernel`.
        stream_transforms: Transform names forwarded to `transform_stream`
            for every timeseries. The default list is only read here.

    Returns:
        Dict with keys "results" (kernel name -> scores returned by
        `run_single_kernel`), "num_classes", "dim", "ts_length",
        "N_train" and "N_test".

    Raises:
        ValueError: If `df` has no rows with subset == "train".
    """
    # Transform into a local Series instead of assigning back to df["data"].
    # Writing into the caller's frame would make a second call (or a re-run
    # of the notebook cell) transform already-transformed streams again.
    transformed = df["data"].apply(lambda x : transform_stream(x, stream_transforms))

    # Positional boolean masks, so selection does not depend on index alignment.
    train_mask = (df["subset"] == "train").to_numpy()
    test_mask = (df["subset"] == "test").to_numpy()

    # Gather dataset info
    X_train = transformed[train_mask].values
    y_train = np.array(df["label"][train_mask].values)
    X_test = transformed[test_mask].values
    y_test = np.array(df["label"][test_mask].values)
    if len(X_train) == 0:
        # The state dimension below is read from the first training stream.
        raise ValueError('df contains no rows with subset == "train"')

    # Labels are collected over the whole frame (train and test together).
    labels = sorted(df["label"].unique())
    num_classes = len(labels)
    d = X_train[0].shape[1]
    T = "variable length"
    N_train = len(X_train)
    N_test = len(X_test)
    print_dataset_stats(num_classes, d, T, N_train, N_test)

    # Run each kernel
    kernel_results = {}
    for kernel_name in kernel_names:
        print(kernel_name)
        scores = run_single_kernel(X_train, y_train, X_test, y_test, labels, 
                        kernel_name, variable_length=True, normalize=False,
                        trunc_sig_dim_bound=200, SVD_max_rank=None)
        kernel_results[kernel_name] = scores

    # Log results
    pendigits_results = {"results": kernel_results, 
                         "num_classes": num_classes,
                         "dim": d,
                         "ts_length":T, 
                         "N_train":N_train, 
                         "N_test":N_test}
    return pendigits_results

# pendigits_results = run_pendigits_experiments(
#     df_pendigits_raw, 
#     kernel_names=[
#         #"gak",
#         "truncated signature", 
#         #"signature pde", 
#         #"signature pde RBF"
#         ],
#         )

In [None]:
# print_experiment_results({"PenDigits": pendigits_results})