In [89]:
""" This file vectorizes persistence diagrams and their signatures with the ATOL algorithm."""

' This file vectorizes persistence diagrams and their signatures with the ATOL algorithm.'

In [1]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension, PersistenceImage
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from sklearn.cluster import KMeans
from gudhi.representations.vector_methods import Atol
import warnings

In [2]:
# TODO change deprecated behaviour

# Load Data

In [81]:
# Select the signal type to process: "EEG" or "EMG".
# The value is used to build the input and output file paths below.

data_type = "EEG"
#data_type = "EMG"

In [82]:
# Choose the subject (individual) whose data is processed; used in the file paths below
subject = "m294"

In [83]:
# Class labels; one diagram file per label is loaded and vectorized below
label_list = [0, 1, 2, 3, 4]

In [84]:
# Load the persistence diagrams of the chosen subject and data type.
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load trusted files.

train_persistence_diagrams = {}  # label -> persistence diagrams of that class (train split)
test_persistence_diagrams = {}   # label -> persistence diagrams of that class (test split)


for label in label_list:
    train_persistence_diagrams[label] = np.load(
        f"Embeddings_and_Persistence_Diagrams/{subject!s}/Train/{data_type!s}/PD{label!s}.npy",
        allow_pickle=True,
    )
    test_persistence_diagrams[label] = np.load(
        f"Embeddings_and_Persistence_Diagrams/{subject!s}/Test/{data_type!s}/PD{label!s}.npy",
        allow_pickle=True,
    )

In [85]:
# Load the shortened persistence diagrams of the chosen subject and data type.
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load trusted files.

train_shortened_diagrams = {}  # label -> shortened diagrams of that class (train split)
test_shortened_diagrams = {}   # label -> shortened diagrams of that class (test split)


for label in label_list:
    train_shortened_diagrams[label] = np.load(
        f"Embeddings_and_Persistence_Diagrams/{subject!s}/Train/{data_type!s}/Shortened_Diagrams{label!s}.npy",
        allow_pickle=True,
    )
    test_shortened_diagrams[label] = np.load(
        f"Embeddings_and_Persistence_Diagrams/{subject!s}/Test/{data_type!s}/Shortened_Diagrams{label!s}.npy",
        allow_pickle=True,
    )

In [86]:
# Load the indices of the diagrams that were removed when the shortened diagrams were built.
# NOTE: allow_pickle=True unpickles arbitrary objects, so only load trusted files.

train_removed_indices = {}  # label -> removed indices of that class (train split)
test_removed_indices = {}   # label -> removed indices of that class (test split)


for label in label_list:
    train_removed_indices[label] = np.load(
        f"Embeddings_and_Persistence_Diagrams/{subject!s}/Train/{data_type!s}/Removed_Indices{label!s}.npy",
        allow_pickle=True,
    )
    test_removed_indices[label] = np.load(
        f"Embeddings_and_Persistence_Diagrams/{subject!s}/Test/{data_type!s}/Removed_Indices{label!s}.npy",
        allow_pickle=True,
    )

# Set parameters and important functions

In [87]:
# Choose the dimensionality of the vectorization (passed to KMeans as the number of clusters)

# In the later classification, a dimension of 4 already works approximately optimally, at least for persistence diagrams
vector_dim = 4  

In [88]:
# Instantiate the gtda.diagrams signature transformers that are vectorized below.
# HK and PI use explicit sigma / n_bins values; the others use the library defaults.

HK = HeatKernel(sigma=0.00003, n_bins=100)
BC = BettiCurve()
SH = Silhouette()
PL = PersistenceLandscape()
PI = PersistenceImage(sigma=0.00003, n_bins=100)

In [89]:
# TODO these functions deal with HK and PI as global variables, which is not ideal. 

def train_atol(training_data, label_list, vector_dim, type_of_data_to_vectorize = None):
    """ Trains the ATOL model with the training data of all labels.

    Parameters:
    - training_data (dict): Maps each label to the training persistence diagrams of that class
    (array-like with one entry per diagram).
    - label_list (list): List of labels (keys of training_data) to train on, e.g. [1, 3, 5, 7].
    - vector_dim (int): Dimension the vectorizations should have, e.g. 4.
    Used as the number of KMeans clusters.
    - type_of_data_to_vectorize (object): Signature transformer with a fit_transform method
    (e.g. a HeatKernel, BettiCurve, Silhouette, PersistenceLandscape or PersistenceImage instance),
    or None if the persistence diagrams themselves are used for training.

    Returns
    - atol_vectoriser (object): Fitted Atol() object, used to vectorize data later.
    """

    # Decide by class rather than by comparing against the notebook-global HK / PI instances,
    # so that the function works with any HeatKernel / PersistenceImage object.
    is_image_like = isinstance(type_of_data_to_vectorize, (HeatKernel, PersistenceImage))

    # Concatenate the training data of all labels
    all_training_data = []

    for label in label_list:
        if type_of_data_to_vectorize is None:
            all_training_data.extend(training_data[label])

        # HK & PI have a different shape than the other signatures
        elif is_image_like:
            # NOTE(review): the [0] keeps only the signature of the FIRST diagram of each label.
            # Kept as in the original; confirm whether training on all diagrams was intended.
            all_training_data.extend(type_of_data_to_vectorize.fit_transform(training_data[label])[0])
        else:
            all_training_data.extend(type_of_data_to_vectorize.fit_transform(training_data[label]))

    # Train the Atol vectorizer with all training data (fixed seed for reproducibility)
    atol_vectoriser = Atol(quantiser=KMeans(n_clusters=vector_dim, random_state=202006))
    atol_vectoriser.fit(X=all_training_data)

    return atol_vectoriser


In [90]:
def create_vectorizations(data_to_vectorize, atol_vectoriser, label_list, type_of_data_to_vectorize = None):
    """ Creates ATOL vectorizations from persistence diagrams or from their signatures.

    Parameters:
    - data_to_vectorize (dict): Maps each label to the persistence diagrams of that class
    (array-like with one entry per diagram).
    - atol_vectoriser (object): Fitted Atol() object; called on each diagram / signature.
    - label_list (list): List of labels (keys of data_to_vectorize), e.g. [1, 3, 5, 7].
    - type_of_data_to_vectorize (object): Signature transformer with a fit_transform method
    (e.g. a HeatKernel, BettiCurve, Silhouette, PersistenceLandscape or PersistenceImage instance),
    or None if the persistence diagrams are vectorized directly.

    Returns
    - If type_of_data_to_vectorize is None: dictionary with labels as keys and the list of
    vectorizations (one per diagram) as values.
    - Otherwise: list with one dictionary per homology dimension (3 in total), each with labels
    as keys and the list of vectorizations (one per diagram) as values.
    """

    # If we are directly vectorizing persistence diagrams
    if type_of_data_to_vectorize is None:
        return {
            label: [atol_vectoriser(diagram) for diagram in data_to_vectorize[label]]
            for label in label_list
        }

    # If we are vectorizing signatures
    num_homology_dims = 3

    # Decide by class rather than by comparing against the notebook-global HK / PI instances
    is_image_like = isinstance(type_of_data_to_vectorize, (HeatKernel, PersistenceImage))

    # One dictionary per homology dimension, with labels as keys
    vectorizations = [{} for _ in range(num_homology_dims)]

    for label in label_list:
        for dim in range(num_homology_dims):
            vectorizations[dim][label] = []

        for diagram in data_to_vectorize[label]:
            # Compute the signature once per diagram and reuse it for every homology dimension.
            # We get a deprecation warning if we do not convert the diagram to a numeric type explicitly.
            # NOTE(review): the transformer is re-fitted on every single diagram — confirm this is intended.
            signature = type_of_data_to_vectorize.fit_transform([diagram.astype("float")])[0]

            for dim in range(num_homology_dims):
                if is_image_like:
                    # HK & PI signatures are passed to ATOL as they are
                    points = signature[dim]
                else:
                    # The other signatures are passed as a single row
                    points = signature[dim].reshape(1, -1)

                vectorizations[dim][label].append(atol_vectoriser(points))

    return vectorizations


In [91]:
# Initialize the dictionaries that collect all vectorizations.
# Keys are the type of vectorized data ("PD", "HK", "BC", "SH", "PL"), filled in by the cells below.

train_all_vectorizations = {}
test_all_vectorizations = {}

In [92]:
# Flag that makes sure the zero embeddings at the removed indices are inserted only once.
# NOTE: re-running this cell resets the flag and allows a second insertion.

already_inserted = False

# Vectorize persistence diagrams directly

In [93]:
# Fit ATOL on the training persistence diagrams
atol_vectoriser = train_atol(
    training_data=train_persistence_diagrams,
    label_list=label_list,
    vector_dim=vector_dim,
)

# Vectorize the train and test diagrams with the fitted model
train_all_vectorizations["PD"] = create_vectorizations(
    data_to_vectorize=train_persistence_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
)
test_all_vectorizations["PD"] = create_vectorizations(
    data_to_vectorize=test_persistence_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
)

# Vectorize Signatures

## Heatkernel vectorizations

In [94]:
# Fit ATOL on the heat kernel signatures of the shortened training diagrams
atol_vectoriser = train_atol(
    training_data=train_shortened_diagrams,
    label_list=label_list,
    vector_dim=vector_dim,
    type_of_data_to_vectorize=HK,
)
# Vectorize the heat kernels of the train and test diagrams with the fitted model
train_all_vectorizations["HK"] = create_vectorizations(
    data_to_vectorize=train_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=HK,
)
test_all_vectorizations["HK"] = create_vectorizations(
    data_to_vectorize=test_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=HK,
)

## Betti Curve Vectorizations

In [95]:
# Fit ATOL on the Betti curves of the shortened training diagrams
atol_vectoriser = train_atol(
    training_data=train_shortened_diagrams,
    label_list=label_list,
    vector_dim=vector_dim,
    type_of_data_to_vectorize=BC,
)

# Vectorize the Betti curves of the train and test diagrams with the fitted model
train_all_vectorizations["BC"] = create_vectorizations(
    data_to_vectorize=train_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=BC,
)
test_all_vectorizations["BC"] = create_vectorizations(
    data_to_vectorize=test_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=BC,
)

## Vectorize Silhouettes

In [96]:
# Fit ATOL on the silhouettes of the shortened training diagrams
atol_vectoriser = train_atol(
    training_data=train_shortened_diagrams,
    label_list=label_list,
    vector_dim=vector_dim,
    type_of_data_to_vectorize=SH,
)

# Vectorize the silhouettes of the train and test diagrams with the fitted model
train_all_vectorizations["SH"] = create_vectorizations(
    data_to_vectorize=train_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=SH,
)
test_all_vectorizations["SH"] = create_vectorizations(
    data_to_vectorize=test_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=SH,
)

## Persistence Landscapes Vectorization

In [97]:
# Fit ATOL on the persistence landscapes of the shortened training diagrams
atol_vectoriser = train_atol(
    training_data=train_shortened_diagrams,
    label_list=label_list,
    vector_dim=vector_dim,
    type_of_data_to_vectorize=PL,
)

# Vectorize the landscapes of the train and test diagrams with the fitted model
train_all_vectorizations["PL"] = create_vectorizations(
    data_to_vectorize=train_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=PL,
)
test_all_vectorizations["PL"] = create_vectorizations(
    data_to_vectorize=test_shortened_diagrams,
    atol_vectoriser=atol_vectoriser,
    label_list=label_list,
    type_of_data_to_vectorize=PL,
)

## Persistence Image Vectorization

This step takes a long time to compute, so the code in the cell below is commented out.

In [98]:
# Train ATOL
#atol_vectoriser = train_atol(train_shortened_diagrams, label_list, vector_dim, PI)

# Create vectorizations
#train_all_vectorizations["PI"] = create_vectorizations(train_shortened_diagrams, atol_vectoriser, label_list, PI)
#test_all_vectorizations["PI"] = create_vectorizations(test_shortened_diagrams, atol_vectoriser, label_list, PI)

# Create dataframes and save

In [99]:
def insert_zeros_for_removed_indices(all_vectorizations, removed_indices, vector_dim, label_list=None):
    """ Inserts zero embeddings at the places where diagrams that were too short were removed before.

    The lists inside all_vectorizations are modified in place, so calling this function twice
    on the same data inserts the zero embeddings twice.

    Parameters:
    - all_vectorizations (dictionary): Types of vectorized data ("PD", "BC", "HK", ...) as keys.
    The "PD" entry is left untouched; every other entry is a list with one dictionary per
    homology dimension, each with labels as keys and a list of vectorizations as values.
    - removed_indices (dictionary): Labels as keys, indices of the removed diagrams as values.
    The indices are inserted in the given order (presumably ascending original positions —
    verify against the notebook that creates them).
    - vector_dim (int): Dimension the vectorizations should have, e.g. 4.
    - label_list (list, optional): Labels to process. Defaults to the keys of removed_indices.

    Returns
    - all_vectorizations (dictionary): The same object that was passed in, with zero embeddings inserted.
    """

    # Do not rely on a notebook-global label list
    if label_list is None:
        label_list = list(removed_indices.keys())

    for type_of_data_to_vectorize, vectorizations in all_vectorizations.items():

        # Persistence diagrams do not use the shortened diagrams, so nothing was removed there
        if str(type_of_data_to_vectorize) == "PD":
            continue

        # Signatures: one dictionary per homology dimension
        for vectorizations_of_dim in vectorizations:
            for label in label_list:
                vectorization = vectorizations_of_dim[label]

                for idx in removed_indices[label]:
                    vectorization.insert(idx, np.zeros(vector_dim))

    return all_vectorizations

In [100]:
# Re-insert zero vectorizations for the outlier diagrams that were removed earlier because
# they were too small. The flag makes sure this in-place insertion happens only once.

if not already_inserted:
    train_all_vectorizations = insert_zeros_for_removed_indices(
        train_all_vectorizations, train_removed_indices, vector_dim
    )
    test_all_vectorizations = insert_zeros_for_removed_indices(
        test_all_vectorizations, test_removed_indices, vector_dim
    )

    # Remember that the insertion has been done
    already_inserted = True

In [101]:
def create_feature_df(data_type, all_vectorizations, vector_dim, num_diagrams, label):
    """
    Build the feature DataFrame of a single label from all vectorizations.

    Parameters:
    - data_type (str): Prefix of every feature column name (e.g. "EEG")
    - all_vectorizations (dictionary): all vectorizations, keyed by the type of vectorized data
    - vector_dim (int): number of coordinates per vectorization (e.g. 5)
    - num_diagrams (int): number of rows of the DataFrame (diagrams of this label)
    - label (int): label whose vectorizations are used; also written to the "Label" column

    Returns:
    - Feature DataFrame (DataFrame) with one column per coordinate and a "Label" column
    """

    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))
    coordinates = range(vector_dim)

    for kind, vectors in all_vectorizations.items():

        # "PD" vectorizations are keyed by label only (no split by homology dimension)
        if str(kind) == "PD":
            for coord in coordinates:
                column = f"{data_type!s}_PD_Vectorization_Coord_{coord!s}"
                feature_df[column] = [vec[coord] for vec in vectors[label]]
            continue

        # Signature vectorizations: one set of columns per homology dimension
        for hom_dim in range(3):
            for coord in coordinates:
                column = f"{data_type!s}_{kind!s}_Vectorization_Dim_{hom_dim!s}Coord_{coord!s}"
                feature_df[column] = [vec[coord] for vec in vectors[hom_dim][label]]

    # Same label for every row
    feature_df["Label"] = label

    return feature_df

In [102]:
# Build one feature dataframe per label, for the train and the test split
train_dataframes = {}
test_dataframes = {}

for label in label_list:
    train_dataframes[label] = create_feature_df(
        data_type,
        train_all_vectorizations,
        vector_dim,
        len(train_persistence_diagrams[label]),
        label,
    )
    test_dataframes[label] = create_feature_df(
        data_type,
        test_all_vectorizations,
        vector_dim,
        len(test_persistence_diagrams[label]),
        label,
    )

In [103]:
# Concatenate and save features of training persistence diagrams.
# The labels are taken from label_list instead of being hard-coded as 0..4.
train_feature_df = pd.concat([train_dataframes[lbl] for lbl in label_list], ignore_index=True)
train_feature_df.to_csv("Features/"+str(subject)+"/Train/"+str(data_type)+"/Vectorization_Features.csv")

# Concatenate and save features of test persistence diagrams
test_feature_df = pd.concat([test_dataframes[lbl] for lbl in label_list], ignore_index=True)
test_feature_df.to_csv("Features/"+str(subject)+"/Test/"+str(data_type)+"/Vectorization_Features.csv")