Note that to run this notebook you should first download the multivariate datasets from the UEA repository and unzip them into the ../data directory: http://www.timeseriesclassification.com/dataset.php

In [4]:
from sktime.utils.load_data import load_from_tsfile_to_dataframe
from sklearn import preprocessing
import time
import pickle
import os
import numpy as np
from tslearn.metrics import dtw
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

In [7]:
# Names of the UEA multivariate time series datasets; each name is used to
# build the "../data/<name>/<name>_TRAIN.ts" / "_TEST.ts" paths when loading.
datasets = [
    "ArticularyWordRecognition",
    "AtrialFibrillation",
    "BasicMotions",
    "CharacterTrajectories",
    "Cricket",
    "DuckDuckGeese",
    "EigenWorms",
    "Epilepsy",
    "ERing",
    "EthanolConcentration",
    "FaceDetection",
    "FingerMovements",
    "HandMovementDirection",
    "Handwriting",
    "Heartbeat",
    "InsectWingbeat",
    "JapaneseVowels",
    "Libras",
    "LSST",
    "MotorImagery",
    "NATOPS",
    "PEMS-SF",
    "PenDigits",
    "PhonemeSpectra",
    "RacketSports",
    "SelfRegulationSCP1",
    "SelfRegulationSCP2",
    "SpokenArabicDigits",
    "StandWalkJump",
    "UWaveGestureLibrary",
]

In [8]:
# Datasets excluded from the augmentation loop (presumably because their
# series have unequal lengths -- confirm against the data files).
irregular_datasets = [
    "JapaneseVowels",
    "CharacterTrajectories",
    "SpokenArabicDigits",
]

In [9]:
# Datasets excluded from the augmentation loop (presumably because they are
# too expensive to process -- the name suggests size is the reason).
large_datasets = [
    "DuckDuckGeese",
    "InsectWingbeat",
    "PEMS-SF",
    "FaceDetection",
    "MotorImagery",
    "Handwriting",
    "Heartbeat",
    "PenDigits",
    "PhonemeSpectra",
    "LSST",
    "EigenWorms",
    "FingerMovements",
]

In [10]:
def ts_df_to_array(ts_df, index):
    """Convert one row of a nested time series DataFrame into a NumPy array.

    Each cell of ``ts_df`` is expected to expose ``.values`` (e.g. a
    ``pandas.Series`` holding one dimension of one case).

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Nested frame with one row per case and one column per dimension.
    index : int
        Positional index of the row (case) to convert.

    Returns
    -------
    numpy.ndarray
        The cells' values stacked in column order; when every cell has the
        same length this is a 2-D array of shape (n_columns, series_length).
    """
    case_row = ts_df.iloc[index]
    per_dimension_values = [cell.values for cell in case_row]
    return np.array(per_dimension_values)

In [27]:
def ts_df_to_arrays(ts_df, swapaxes=False):
    """Convert every row of a nested time series DataFrame into one array.

    Parameters
    ----------
    ts_df : pandas.DataFrame
        Nested frame with one row per case and one column per dimension.
    swapaxes : bool, optional
        When truthy, the first two axes of each per-case array are swapped,
        turning (n_columns, series_length) into (series_length, n_columns).

    Returns
    -------
    numpy.ndarray
        The per-case arrays, in row order, passed through ``np.array``.
    """
    def _case_array(row_number):
        # One case as returned by ts_df_to_array, optionally transposed.
        case = ts_df_to_array(ts_df, row_number)
        return np.swapaxes(case, 0, 1) if swapaxes else case

    return np.array([_case_array(row_number) for row_number in range(len(ts_df))])

In [255]:
def softdtw_augment_train_set(x_train, y_train, classes, num_synthetic_ts, max_neighbors=5):
    """Create synthetic time series per class as weighted soft-DTW barycenters.

    For each class, ``num_synthetic_ts`` samples are generated. Each sample is
    built by picking a random series of the class, finding its nearest
    neighbours within the class under the soft-DTW metric, and computing the
    soft-DTW barycenter of the picked series and those neighbours. The
    barycenter weights decay with distance as ``0.5 ** (d / d_nn)``, where
    ``d_nn`` is the second-smallest returned distance, and are normalised to
    sum to one.

    Parameters
    ----------
    x_train : array-like
        3-D collection of training series indexed by case first; the caller
        in this notebook passes (n_cases, series_length, n_dimensions).
    y_train : array-like
        Class label of every case in ``x_train``.
    classes : iterable
        Labels to generate synthetic samples for.
    num_synthetic_ts : int
        Number of synthetic series to generate for each class.
    max_neighbors : int, optional
        Exclusive upper bound on the number of neighbours: the count is drawn
        uniformly from ``1 .. max_neighbors - 1`` (at least 1) and is also
        capped at ``class size - 1`` so the neighbour search never asks for
        more series than the class contains.

    Returns
    -------
    tuple of numpy.ndarray
        ``(synthetic_x_train, synthetic_y_train)``. Classes with fewer than
        two training series are skipped and contribute no samples.
    """
    from tslearn.neighbors import KNeighborsTimeSeries
    from tslearn.barycenters import softdtw_barycenter
    from tslearn.metrics import gamma_soft_dtw

    x_train = np.asarray(x_train)
    # Ensures `y_train == c` is an element-wise comparison even for a list.
    y_train = np.asarray(y_train)

    # synthetic train set and labels
    synthetic_x_train = []
    synthetic_y_train = []
    # loop through each class
    for c in classes:
        # get the MTS for this class
        c_x_train = x_train[np.where(y_train == c)]
        class_size = len(c_x_train)
        if class_size < 2:
            # A barycenter needs the representative plus at least one other
            # series; this also covers labels absent from y_train.
            continue
        # compute appropriate gamma for softdtw for the entire class
        class_gamma = gamma_soft_dtw(c_x_train)
        # Largest neighbour count that can be requested: bounded by
        # max_neighbors (exclusive) and by the other members of the class.
        neighbor_cap = min(max(int(max_neighbors) - 1, 1), class_size - 1)
        # loop through the number of synthetic examples needed
        generated_samples = 0
        while generated_samples < num_synthetic_ts:
            # Choose a random representative for the class; the slice keeps
            # the leading axis so it can be used as a one-element query set.
            representative_index = np.random.randint(class_size)
            random_representative = c_x_train[representative_index:representative_index + 1]
            # Choose a random number of neighbors in 1..neighbor_cap
            random_number_of_neighbors = int(np.random.randint(1, neighbor_cap + 1))
            # One extra neighbour is requested so the result has room for the
            # representative itself in addition to the chosen neighbours.
            knn = KNeighborsTimeSeries(n_neighbors=random_number_of_neighbors + 1, metric='softdtw', metric_params={'gamma': class_gamma}).fit(c_x_train)
            neighbor_distances, neighbor_indices = knn.kneighbors(X=random_representative, return_distance=True)
            neighbor_indices = np.asarray(neighbor_indices[0])
            neighbor_distances = np.asarray(neighbor_distances[0], dtype=float)
            # Second-smallest distance is used as the scale of the decay.
            nearest_neighbor_distance = np.sort(neighbor_distances)[1]
            random_neighbors = np.asarray(c_x_train[neighbor_indices], dtype=float)
            # Distance-based weights: a series at the scale distance gets
            # half the (unnormalised) weight of a series at distance zero.
            with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
                weights = np.exp(np.log(0.5) * neighbor_distances / nearest_neighbor_distance)
                weights = weights / np.sum(weights)
            if not np.all(np.isfinite(weights)):
                # Degenerate scale (e.g. duplicate series giving a zero
                # distance) would produce NaN weights; weight all equally.
                weights = np.full(len(neighbor_distances), 1.0 / len(neighbor_distances))
            # Compute the soft-DTW barycenter with a gamma value specific to
            # the selected neighbors
            random_neighbors_gamma = gamma_soft_dtw(random_neighbors)
            generated_sample = softdtw_barycenter(random_neighbors, weights=weights, gamma=random_neighbors_gamma)
            synthetic_x_train.append(generated_sample)
            synthetic_y_train.append(c)
            # Repeat until you have the desired number of synthetic samples for each class
            generated_samples += 1
    # return the synthetic set
    return np.array(synthetic_x_train), np.array(synthetic_y_train)

In [260]:
# Number of synthetic series generated for each class of every dataset.
num_synthetic_ts = 1000
# Datasets named in either exclusion list are not augmented.
skipped_datasets = set(large_datasets) | set(irregular_datasets)
# Make sure the output directory exists so the results of a long run are not
# lost to a missing-directory error at save time.
os.makedirs("../syntheticdata", exist_ok=True)

for dataset_name in datasets:
    if dataset_name in skipped_datasets:
        continue

    # Build the output paths before the expensive augmentation so that any
    # problem with the file name surfaces immediately instead of after the
    # synthetic data has been computed.
    # NOTE(review): `dba_iters` and `limit_N` are not defined in any cell
    # visible in this notebook and are not used by the soft-DTW augmentation;
    # they must already exist in the kernel or this raises NameError. Confirm
    # the intended values (or drop them from the file name) before relying on
    # Restart & Run All.
    synthetic_x_path = "../syntheticdata/%s_softdtw_synthetic_x_train_%d_%d_%s.pkl" % (dataset_name, num_synthetic_ts, dba_iters, str(limit_N))
    synthetic_y_path = "../syntheticdata/%s_softdtw_synthetic_y_train_%d_%d_%s.pkl" % (dataset_name, num_synthetic_ts, dba_iters, str(limit_N))

    print("-----------")
    # process_time() measures CPU time of this process, not wall-clock time.
    start = time.process_time()
    print(dataset_name)
    train_x, train_y = load_from_tsfile_to_dataframe("../data/%s/%s_TRAIN.ts" % (dataset_name, dataset_name))
    # The test split is loaded and converted but not used further in this cell.
    test_x, test_y = load_from_tsfile_to_dataframe("../data/%s/%s_TEST.ts" % (dataset_name, dataset_name))

    # After swapping axes each case is indexed as (time step, dimension).
    train_x = ts_df_to_arrays(train_x, swapaxes=True)
    test_x = ts_df_to_arrays(test_x, swapaxes=True)

    classes = np.unique(train_y)

    num_replicates = train_x.shape[0]
    print("# replicates: %d" % (num_replicates))
    num_dimensions = train_x.shape[2]
    print("# dimensions: %d" % (num_dimensions))
    len_series = train_x.shape[1]
    print("length of series: %d" % (len_series))
    num_classes = len(classes)
    print("# classes: %d" % (num_classes))
    total_size = num_replicates*num_dimensions*len_series
    print("total 'size': %d" % (total_size))

    synthetic_x_train, synthetic_y_train = softdtw_augment_train_set(train_x,
                                                                     train_y,
                                                                     classes,
                                                                     num_synthetic_ts)

    # Context managers close the files even if pickling fails.
    with open(synthetic_x_path, 'wb') as x_file:
        pickle.dump(synthetic_x_train, x_file)
    with open(synthetic_y_path, 'wb') as y_file:
        pickle.dump(synthetic_y_train, y_file)

    print("Time (s): %f" % (time.process_time() - start))
    print("-----------")

-----------
ArticularyWordRecognition
# replicates: 275
# dimensions: 9
length of series: 144
# classes: 25
total 'size': 356400
Time (s): 3485.125000
-----------
-----------
AtrialFibrillation
# replicates: 15
# dimensions: 2
length of series: 640
# classes: 3
total 'size': 19200
Time (s): 6193.250000
-----------
-----------
BasicMotions
# replicates: 40
# dimensions: 6
length of series: 100
# classes: 4
total 'size': 24000
Time (s): 384.640625
-----------
-----------
Cricket
# replicates: 108
# dimensions: 6
length of series: 1197
# classes: 12
total 'size': 775656
Time (s): 143778.359375
-----------
-----------
Epilepsy
# replicates: 137
# dimensions: 3
length of series: 206
# classes: 4
total 'size': 84666
Time (s): 1791.171875
-----------
-----------
ERing
# replicates: 30
# dimensions: 4
length of series: 65
# classes: 6
total 'size': 7800
Time (s): 39.671875
-----------
-----------
EthanolConcentration
# replicates: 261
# dimensions: 3
length of series: 1751
# classes: 4
total '