In [1]:
from sktime.utils.load_data import load_from_tsfile_to_dataframe
import time
import pickle
import numpy as np
from tslearn.metrics import gamma_soft_dtw
from tslearn.metrics import cdist_soft_dtw_normalized
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

In [5]:
def ts_df_to_array(ts_df, index):
    """Stack every column of one row of a nested DataFrame into one array.

    Each cell of ``ts_df`` must expose ``.values`` (for example a
    ``pd.Series`` holding one series).

    Args:
        ts_df: DataFrame whose cells each hold one series.
        index: Positional index of the row to convert.

    Returns:
        ``np.ndarray`` built from the ``.values`` of each cell in the row,
        in column order.
    """
    # The column count is read from the first row, exactly as the row is
    # re-fetched per column below, so edge cases behave as before.
    num_columns = len(ts_df.iloc[0])
    stacked = []
    for column in range(num_columns):
        cell = ts_df.iloc[index].iloc[column]
        stacked.append(cell.values)
    return np.array(stacked)

In [6]:
def ts_df_to_arrays(ts_df, swapaxes=False):
    """Convert every row of a nested DataFrame into an array and stack them.

    Args:
        ts_df: DataFrame accepted by ``ts_df_to_array``.
        swapaxes: When truthy, axes 0 and 1 of each per-row array are
            swapped before stacking.

    Returns:
        ``np.ndarray`` holding one converted array per row of ``ts_df``,
        in row order.
    """
    def _convert(row_position):
        # One row -> one array, optionally with its first two axes swapped.
        sample = ts_df_to_array(ts_df, row_position)
        return np.swapaxes(sample, 0, 1) if swapaxes else sample

    return np.array([_convert(row_position) for row_position in range(len(ts_df))])

In [15]:
def plot_softdtw_tsne_3d(dataset_name, train_x, test_x, synthetic_x_train, train_y, test_y, synthetic_y_train, dtw_distance_matrices, augmentation_degree):
    """Embed train/test/synthetic series with t-SNE and show a 3-D scatter.

    The pairwise normalized soft-DTW matrix over all samples (train, then
    test, then synthetic) is fetched from ``dtw_distance_matrices`` when
    available, otherwise computed and stored there. The matrix is reduced
    to three components with t-SNE and plotted with Plotly Express.

    Args:
        dataset_name: Used as the first-level cache key and the plot title.
        train_x, test_x, synthetic_x_train: Sequences of series; they are
            concatenated in this order.
        train_y, test_y, synthetic_y_train: Labels matching the three
            sample sequences above.
        dtw_distance_matrices: Nested dict
            ``{dataset_name: {augmentation_degree: matrix}}`` used as a
            cache. It is updated in place when a matrix is computed.
        augmentation_degree: Second-level cache key.

    Returns:
        The Plotly figure (it is also displayed via ``fig.show()``).
    """
    total_x = np.array(list(train_x) + list(test_x) + list(synthetic_x_train))
    total_y = np.array(list(train_y) + list(test_y) + list(synthetic_y_train))

    per_dataset_cache = dtw_distance_matrices.setdefault(dataset_name, {})
    X = per_dataset_cache.get(augmentation_degree)
    # Recompute when nothing is cached, or when the cached matrix was built
    # for a different number of samples and therefore cannot line up with
    # the labels below.
    # NOTE(review): the cache key ignores which synthetic samples were drawn;
    # a same-sized but different sample set will still reuse the old matrix.
    if X is None or len(X) != len(total_x):
        # gamma is only needed for a fresh distance computation, so it is
        # not evaluated on a cache hit.
        gamma = gamma_soft_dtw(total_x)
        X = cdist_soft_dtw_normalized(total_x, gamma=gamma)
        per_dataset_cache[augmentation_degree] = X

    # NOTE(review): X is handed to TSNE as a plain feature matrix (no
    # metric="precomputed") — confirm this is the intended use.
    X_embedded = TSNE(n_components=3).fit_transform(X)
    train_test = ['train_real']*len(train_x) + ['test']*len(test_x) + ['train_synthetic']*len(synthetic_x_train)
    real_synthetic = ['real']*(len(train_x) + len(test_x)) + ['synthetic']*len(synthetic_x_train)

    # Build the frame column by column so the coordinates stay numeric.
    # Packing strings and floats into a single np.array first would promote
    # every value, including x/y/z, to a string. Labels are kept as strings
    # so they are still coloured as discrete categories.
    df = pd.DataFrame({
        'label': total_y.astype(str),
        'x': X_embedded[:, 0],
        'y': X_embedded[:, 1],
        'z': X_embedded[:, 2],
        'train_test': train_test,
        'real_synthetic': real_synthetic,
    })

    # No plt.figure() here: the plot is a Plotly figure, and an extra empty
    # matplotlib figure per call would only accumulate unused figures.
    fig = px.scatter_3d(df, x="x", y="y", z="z", color="label", symbol="train_test", title=dataset_name, symbol_map={"train_real": "circle", "train_synthetic": "circle-open", "test": "cross"})
    fig.show()

    return fig

In [24]:
# Build t-SNE figures for each dataset: one for the real data only and one
# per augmentation degree with synthetic samples added.
# NOTE(review): `large_datasets`, `irregular_datasets` and
# `dtw_distance_matrices` are not defined in the visible cells; they must
# already exist in the kernel for this cell to run — confirm.
# NOTE(review): no random seed is set, so the synthetic samples drawn below
# differ between runs.
figs = {}
for dataset_name in ['AtrialFibrillation', 'Epilepsy']:
    if dataset_name in large_datasets + irregular_datasets:
        continue
    num_synthetic_ts = 1000
    dba_iters = 5
    limit_N = False

    print("-----------")
    start = time.process_time()
    print(dataset_name)
    train_x, train_y = load_from_tsfile_to_dataframe("../data/%s/%s_TRAIN.ts" % (dataset_name, dataset_name))
    test_x, test_y = load_from_tsfile_to_dataframe("../data/%s/%s_TEST.ts" % (dataset_name, dataset_name))

    train_x = ts_df_to_arrays(train_x, swapaxes=True)
    test_x = ts_df_to_arrays(test_x, swapaxes=True)

    num_replicates = train_x.shape[0]
    print("# replicates: %d" % (num_replicates))
    num_dimensions = train_x.shape[2]
    print("# dimensions: %d" % (num_dimensions))
    len_series = train_x.shape[1]
    print("length of series: %d" % (len_series))
    num_classes = len(np.unique(train_y))
    print("# classes: %d" % (num_classes))
    total_size = num_replicates*num_dimensions*len_series
    print("total 'size': %d" % (total_size))

    # Load the pre-generated synthetic series and labels. The context
    # managers close each file as soon as it has been read.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    synthetic_args = (dataset_name, num_synthetic_ts, dba_iters, str(limit_N))
    with open("../syntheticdata/%s_softdtw_synthetic_x_train_%d_%d_%s.pkl" % synthetic_args, 'rb') as x_file:
        synthetic_x_train = pickle.load(x_file)
    with open("../syntheticdata/%s_softdtw_synthetic_y_train_%d_%d_%s.pkl" % synthetic_args, 'rb') as y_file:
        synthetic_y_train = pickle.load(y_file)

    # Baseline figure: real train/test data only, cached under degree -1.
    augmentation_degree = -1
    fig = plot_softdtw_tsne_3d(dataset_name, train_x, test_x, [], train_y, test_y, [], dtw_distance_matrices, augmentation_degree)
    figs["%s_%f" % (dataset_name, augmentation_degree)] = fig

    # Class statistics depend only on train_y, so compute them once per
    # dataset rather than once per augmentation degree.
    # find highest represented class, save this number
    labels, counts = np.unique(train_y, return_counts=True)
    max_counts = np.max(counts)
    # find the number of synthetic examples you need to add to balance
    balance_counts = max_counts - counts

    # loop over desired augmentation degrees
    for augmentation_degree in [4.0]:
        print("Augmentation degree: %f" % augmentation_degree)

        # loop over all labels
        synthetic_samples = []
        synthetic_labels = []
        for label, balance_count in zip(labels, balance_counts):
            synthetic_data = synthetic_x_train[synthetic_y_train == label]

            # compute the number of synthetic samples to choose based on balance_count and augmentation_degree
            samples_to_draw = balance_count + int(augmentation_degree * max_counts)

            # randomly sample the synthetic data (without replacement, so
            # this raises if fewer synthetic samples exist than requested)
            indices = np.arange(len(synthetic_data))
            random_indices = np.random.choice(indices, size=samples_to_draw, replace=False)
            random_synthetic_samples = synthetic_data[random_indices]

            # add synthetic samples for this label
            synthetic_samples += list(random_synthetic_samples)
            synthetic_labels += [label]*samples_to_draw

        # augment training set with all synthetic samples
        # (not read again in this cell; kept in case other cells use them)
        augmented_train_x = np.array(list(train_x) + list(synthetic_samples))
        augmented_train_y = np.array(list(train_y) + list(synthetic_labels))

        fig = plot_softdtw_tsne_3d(dataset_name, train_x, test_x, synthetic_samples, train_y, test_y, synthetic_labels, dtw_distance_matrices, augmentation_degree)
        figs["%s_%f" % (dataset_name, augmentation_degree)] = fig

    # process_time() reports CPU time of this process, not wall-clock time.
    print("Time (s): %f" % (time.process_time() - start))
    print("-----------")

-----------
AtrialFibrillation
# replicates: 15
# dimensions: 2
length of series: 640
# classes: 3
total 'size': 19200


Augmentation degree: 4.000000


Time (s): 2.484375
-----------
-----------
Epilepsy
# replicates: 137
# dimensions: 3
length of series: 206
# classes: 4
total 'size': 84666


Augmentation degree: 4.000000


Time (s): 18.328125
-----------


<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [25]:
# Persist the collected figures. The context manager flushes and closes the
# file as soon as the dump finishes instead of leaving the handle open.
with open("../results/tsne_figs.pkl", 'wb') as figs_file:
    pickle.dump(figs, figs_file)