In [776]:
""" This file vectorizes persistence diagrams and their signatures with the ATOL algorithm."""

' This file vectorizes persistence diagrams and their signatures with the ATOL algorithm.'

In [777]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension, PersistenceImage
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from sklearn.cluster import KMeans
from gudhi.representations.vector_methods import Atol
import warnings

# Load Data

In [778]:
# Choose whether to process EEG or EMG data.
# The value is inserted into the input and output file paths used below.

data_type = "EEG"
#data_type = "EMG"

In [779]:
# Choose the subject (individual) to process; the ID is inserted into the file paths used below.
subject = "m292"

In [780]:
# Labels to process; the data of label n is addressed through the dictionary key "Label_<n>".
label_list = [0, 1, 2, 3, 4]

In [781]:
# Load the persistence diagrams of all labels for the chosen subject and data type.
# The file holds a pickled dictionary, hence allow_pickle=True; .item() turns the
# loaded array back into that dictionary.

persistence_diagrams = np.load(
    f'Embeddings_and_Persistence_Diagrams/{subject}/{data_type}/Persistence_Diagrams_All_Labels.npy',
    allow_pickle=True,
).item()

In [782]:
# Load the extended persistence diagrams of all labels (same layout as above:
# a pickled dictionary that .item() unwraps from the loaded array).
extended_persistence_diagrams = np.load(
    f'Embeddings_and_Persistence_Diagrams/{subject}/{data_type}/Extended_Persistence_Diagrams_All_Labels.npy',
    allow_pickle=True,
).item()

In [783]:
# For every label, keep only the first element ([0]) of each stored entry.
# NOTE: `persistence_diagrams` is rebound to the reshaped result, so re-running this
# cell without reloading the data would apply the [0] indexing a second time.
reshaped_persistence_diagrams = {
    f"Label_{label}": [stored_entry[0] for stored_entry in list(persistence_diagrams[f"Label_{label}"])]
    for label in label_list
}

persistence_diagrams = reshaped_persistence_diagrams

## Get training indices

In [784]:
def load_indices(subject):
    """Load the train and validation index dictionaries of one subject.

    Parameters:
    - subject: Subject identifier; it is converted to a string and used as the
      folder name below "Train_Test_Splitting/".

    Returns
    - (train_indices, validation_indices): the two dictionaries stored in
      "Train_Indices_All_Labels_All_Folds.npy" and
      "Validation_Indices_All_Labels_All_Folds.npy".
    """
    split_dir = f"Train_Test_Splitting/{subject}"

    def _read_index_dict(file_name):
        # The .npy file wraps a pickled dictionary; .item() unwraps it again.
        return np.load(f"{split_dir}/{file_name}", allow_pickle=True).item()

    return (
        _read_index_dict("Train_Indices_All_Labels_All_Folds.npy"),
        _read_index_dict("Validation_Indices_All_Labels_All_Folds.npy"),
    )


# Index dictionaries of the chosen subject; used below to select the diagrams of each label and fold.
train_indices_dict, validation_indices_dict = load_indices(subject)

# Set parameters and important functions

In [785]:
# Choose the dimensionality of the vectorization (used as the number of KMeans clusters for ATOL)

# In the later classification, a dimension of 4 already works approximately optimally,
# at least for persistence diagrams
vector_dim = 4

In [786]:
# TODO these functions deal with HK and PI as global variables, which is not ideal. 

def train_atol(training_data, label_list, vector_dim, type_of_data_to_vectorize = None):
    """ Trains the ATOL model with the training data of all labels.

    Parameters:
    - training_data (dictionary): Data used for training. Maps "Label_<n>" to the
      persistence diagrams of that label.
    - label_list (list): List of labels (e.g. [1, 3, 5, 7]).
    - vector_dim (int): Dimension the vectorizations should have, e.g. 4. Used as the
      number of KMeans clusters of the ATOL quantiser.
    - type_of_data_to_vectorize (object): Transformer object with a fit_transform method
      that turns the diagrams of one label into signatures, or None if the model is
      trained directly on the entries of training_data.

    Returns
    - atol_vectoriser (object): Atol() object; trained model to vectorize the data to vectorize later.
    """

    # Concatenate all training data
    all_training_data = []

    for label in label_list:
        label_data = training_data["Label_"+str(label)]

        if not type_of_data_to_vectorize:
            all_training_data.extend(label_data)

        # HeatKernel & PersistenceImage outputs have a different shape than the other
        # signatures. The transformer is recognised by its class instead of being compared
        # with notebook-level variables (HK, PI) that may not exist.
        elif isinstance(type_of_data_to_vectorize, (HeatKernel, PersistenceImage)):
            all_training_data.extend(type_of_data_to_vectorize.fit_transform(label_data)[0])
        else:
            all_training_data.extend(type_of_data_to_vectorize.fit_transform(label_data))

    # Train Atol vectorizer with all training data (fixed random_state for reproducibility)
    atol_vectoriser = Atol(quantiser=KMeans(n_clusters=vector_dim, random_state=202006))
    atol_vectoriser.fit(X=all_training_data)

    return atol_vectoriser


In [787]:
def create_vectorizations(data_to_vectorize, atol_vectoriser, label_list, type_of_data_to_vectorize = None):
    """ Creates vectorizations from persistence diagrams or from their signatures.

    Parameters:
    - data_to_vectorize (dictionary): Data to vectorize. Maps "Label_<n>" to the
      persistence diagrams of that label.
    - atol_vectoriser (object): Atol() object; trained model to vectorize the data_to_vectorize.
      It is called once per diagram (or once per signature component).
    - label_list (list): List of labels (e.g. [1, 3, 5, 7]).
    - type_of_data_to_vectorize (object): Transformer object with a fit_transform method
      that turns a diagram into its signature, or None if we are directly vectorizing
      the data_to_vectorize.

    Returns
    - Vectorization (dictionary).
      Without a transformer: {"Label_<n>": [vector per diagram]}.
      With a transformer: {"Hom_Dim_<k>": {"Label_<n>": [vector per diagram]}} for k = 0, 1, 2.
    """

    # If we are directly vectorizing persistence diagrams
    if not type_of_data_to_vectorize:
        return {
            "Label_"+str(label): [atol_vectoriser(diagram) for diagram in data_to_vectorize["Label_"+str(label)]]
            for label in label_list
        }

    # If we are vectorizing features
    num_hom_dims = 3  # homology dimensions 0, 1 and 2

    # HeatKernel & PersistenceImage signatures are passed on per homology dimension as they
    # are; all other signatures are reshaped to a single row first. The transformer is
    # recognised by its class instead of being compared with notebook-level variables
    # (HK, PI) that may not exist.
    keeps_signature_shape = isinstance(type_of_data_to_vectorize, (HeatKernel, PersistenceImage))

    vectorizations = {"Hom_Dim_"+str(hom_dim): {} for hom_dim in range(num_hom_dims)}

    for label in label_list:
        label_key = "Label_"+str(label)
        vectors_per_hom_dim = [[] for _ in range(num_hom_dims)]

        for diagram in data_to_vectorize[label_key]:
            # The signature does not depend on the homology dimension, so compute it once
            # per diagram and reuse it for all dimensions.
            signature = type_of_data_to_vectorize.fit_transform([diagram])[0]

            for hom_dim in range(num_hom_dims):
                component = signature[hom_dim]
                if not keeps_signature_shape:
                    component = component.reshape(1, -1)
                vectors_per_hom_dim[hom_dim].append(atol_vectoriser(component))

        for hom_dim in range(num_hom_dims):
            vectorizations["Hom_Dim_"+str(hom_dim)][label_key] = vectors_per_hom_dim[hom_dim]

    return vectorizations


In [788]:
# One (initially empty) dictionary per fold; the vectorizations of each fold are stored in it later

all_vectorizations = {"Fold_"+str(fold_idx): {} for fold_idx in range(5)}

In [789]:
def get_all_train_persistence_diagrams(persistence_diagrams, train_indices_dict, subject, label_list):
    """ Selects, for every fold and label, the persistence diagrams listed in an index dictionary.

    Despite its name, the function works for any index dictionary of the same layout
    (it is also called with the validation indices).

    Parameters:
    - persistence_diagrams (dictionary): Maps "Label_<n>" to the indexable collection of
      diagrams of that label.
    - train_indices_dict (dictionary): Maps "Label_<n>" to a dictionary that maps each fold
      key (e.g. "Fold_0") to the indices of the diagrams to select.
    - subject: Not used; kept so that existing calls keep working.
    - label_list (list): List of labels (e.g. [1, 3, 5, 7]).

    Returns
    - train_diagrams (dictionary): {fold key: {"Label_<n>": [selected diagrams]}}.
    """

    # Seed the result with the folds of "Label_0" when that label exists, so that there is
    # one entry per fold even if label_list is empty. Folds of other labels are added on demand.
    train_diagrams = {fold_key: {} for fold_key in train_indices_dict.get("Label_0", {})}

    for label in label_list:
        label_key = "Label_"+str(label)

        # Use the indices stored under the fold key itself rather than rebuilding the key
        # from the enumeration position, which only matches when the keys are exactly
        # "Fold_0", "Fold_1", ... in this order.
        for fold_key, fold_indices in train_indices_dict[label_key].items():
            train_diagrams.setdefault(fold_key, {})[label_key] = [
                persistence_diagrams[label_key][diagram_idx] for diagram_idx in fold_indices
            ]

    return train_diagrams

# Vectorize persistence diagrams

In [790]:
# Retrieve the training persistence diagrams of every fold and label
train_diagrams = get_all_train_persistence_diagrams(persistence_diagrams, train_indices_dict, subject, label_list)

In [791]:
# For every fold: fit ATOL on the training diagrams of that fold only,
# then vectorize the entire data set with the fitted model.
for fold_idx in range(5):
    atol_vectoriser = train_atol(
        training_data=train_diagrams[f"Fold_{fold_idx}"],
        label_list=label_list,
        vector_dim=vector_dim,
    )

    all_vectorizations[f"Fold_{fold_idx}"]["PD"] = create_vectorizations(
        data_to_vectorize=persistence_diagrams,
        atol_vectoriser=atol_vectoriser,
        label_list=label_list,
    )

In [792]:
# Training data for the final test set: for every label, the "Fold_0" training diagrams
# followed by the "Fold_0" validation diagrams.
validation_diagrams = get_all_train_persistence_diagrams(persistence_diagrams, validation_indices_dict, subject, label_list)

final_test_train_diagrams = {
    0: {
        f"Label_{label}": train_diagrams["Fold_0"][f"Label_{label}"] + validation_diagrams["Fold_0"][f"Label_{label}"]
        for label in label_list
    }
}
    

In [793]:
# Fit ATOL on the combined training data of the final test
atol_vectoriser = train_atol(
    training_data=final_test_train_diagrams[0],
    label_list=label_list,
    vector_dim=vector_dim,
)

# Vectorize the entire data set with the fitted model
final_test_vectorizations = {
    "PD": create_vectorizations(
        data_to_vectorize=persistence_diagrams,
        atol_vectoriser=atol_vectoriser,
        label_list=label_list,
    )
}

# Create dataframes and save

In [794]:
# TODO not necessary anymore?

def insert_zeros_for_removed_indices(all_vectorizations, removed_indices, vector_dim, fold_idx):
    """ Inserts zero vectors at the positions of diagrams that were removed before because they were too short.

    The lists stored in all_vectorizations are modified in place (list.insert) and the
    same dictionary is returned. Entries stored under the key "PD" are left untouched.

    Parameters:
    - all_vectorizations (dictionary): Vectorizations, accessed as
      all_vectorizations["Fold_<fold_idx>"][type][dim][label] for dim in 0, 1, 2.
    - removed_indices: Indexable by label; removed_indices[label] is iterated to get the
      positions at which a zero vector is inserted.
    - vector_dim (int): Dimension the vectorizations should have, e.g. 4. Length of the inserted zero vectors.
    - fold_idx: Fold whose vectorizations are processed.

    Returns
    - all_vectorizations (dictionary): The dictionary passed in, after the insertions.

    NOTE(review): label_list is not a parameter; it is read from the notebook's global scope.
    NOTE(review): the entries are indexed with the plain values of dim and label, while
    create_vectorizations builds the keys "Hom_Dim_<k>" and "Label_<n>" -- confirm which key
    format the input is expected to have before using this function.
    """

    for type_of_data_to_vectorize in all_vectorizations["Fold_"+str(fold_idx)].keys(): # the keys are the types of the data that was vectorized (PD, BC, HK,...)
        for label in label_list: # labels

            # Persistence diagrams do not use the shortened diagrams
            if str(type_of_data_to_vectorize) == "PD":                
                pass
                
            # If type_of_data_to_vectorize is a signature
            else:
                for dim in range(3): # homology dimension

                    vectorization = all_vectorizations["Fold_"+str(fold_idx)][type_of_data_to_vectorize][dim][label]
                
                    # Each insertion shifts the following elements by one position
                    # presumably the indices are in ascending order -- TODO confirm
                    for idx in removed_indices[label]:
                        vectorization.insert(idx, np.zeros(vector_dim))

                    # vectorization is the list object that was read above and modified in place
                    all_vectorizations["Fold_"+str(fold_idx)][type_of_data_to_vectorize][dim][label] = vectorization

    return all_vectorizations

In [795]:
def create_feature_df(data_type, all_vectorizations, vector_dim, num_diagrams, label, fold_idx):
    """
    Create DataFrame for one label from its vectorizations.

    Parameters:
    - data_type: Prefix of the feature column names (e.g. "EEG").
    - all_vectorizations (dictionary): all vectorizations for all datatypes (keys); every value
      maps "Label_<n>" to the list of vectors of that label.
    - vector_dim (int): dimension of the vectorization (e.g. 5); one column is created per coordinate
    - num_diagrams (int): How many diagrams are there in total? Length of the DataFrame index;
      the vector lists must have this length.
    - label (int): Label for which we want to create a dataframe. 0, 1, 2, 3 or 4
    - fold_idx: Value written into the "Fold" column.

    Returns:
    - Feature DataFrame (DataFrame) with the columns
      "<data_type>_<key>_Vectorization_Coord_<i>", "Label" and "Fold".
    """

    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))
    label_key = "Label_"+str(label)

    for type_of_data_to_vectorize, vectorizations_by_label in all_vectorizations.items():
        # The column name carries the key of the vectorization type ("PD" for persistence
        # diagrams) so that several types do not overwrite each other's columns.
        column_prefix = str(data_type)+"_"+str(type_of_data_to_vectorize)+"_Vectorization_Coord_"

        for dim in range(vector_dim):
            feature_df[column_prefix+str(dim)] = [arr[dim] for arr in vectorizations_by_label[label_key]]

    # Label and fold are the same for all rows
    feature_df["Label"] = label
    feature_df["Fold"] = fold_idx

    return feature_df

In [796]:
# One feature DataFrame per fold and label
dataframes = {
    f"Fold_{fold_idx}": {
        f"Label_{label}": create_feature_df(
            data_type,
            all_vectorizations[f"Fold_{fold_idx}"],
            vector_dim,
            len(persistence_diagrams[f"Label_{label}"]),
            label,
            fold_idx,
        )
        for label in label_list
    }
    for fold_idx in range(5)
}

In [797]:
# Concatenate all dataframes to one

fold_dataframes = {}

for fold_idx in range(5):
    current_df = dataframes["Fold_"+str(fold_idx)]
    # Stack the labels in the order of label_list (the list the dataframes were built from)
    # instead of a hard-coded 0..4.
    fold_dataframes["Fold_"+str(fold_idx)] = pd.concat(
        [current_df["Label_"+str(label)] for label in label_list], ignore_index=True)

# Stack the folds in the order in which they were added above
feature_df = pd.concat(list(fold_dataframes.values()), ignore_index=True)


In [798]:
# Save the features of all folds for the chosen subject and data type
feature_df.to_csv(f"Features/{subject}/{data_type}/Vectorization_Features.csv")

In [799]:
# Final Test Set
# NOTE: `dataframes` and `feature_df` from the cells above are overwritten here.

dataframes = {}

for label in label_list:
    # The fold index -1 marks the rows as belonging to the final test
    dataframes["Label_"+str(label)] = create_feature_df(data_type, \
            final_test_vectorizations, vector_dim, len(persistence_diagrams["Label_"+str(label)]), label, -1)


# Stack the labels in the order of label_list instead of a hard-coded 0..4
feature_df = pd.concat([dataframes["Label_"+str(label)] for label in label_list], ignore_index=True)


In [800]:
# Save the features of the final test for the chosen subject and data type
feature_df.to_csv(f"Features/{subject}/{data_type}/Vectorization_Features_for_Final_Test.csv")