In [1]:
""" This file vectorizes persistence diagrams and their signatures with the ATOL algorithm."""

' This file vectorizes persistence diagrams and their signatures with the ATOL algorithm.'

In [2]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension, PersistenceImage
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from sklearn.cluster import KMeans
from gudhi.representations.vector_methods import Atol
import warnings

# Load Data

In [3]:
# Choose if you want to look at EEG or EMG data

data_type = "EEG"
#data_type = "EMG"

In [4]:
# choose individuum
subject = "m294"

In [5]:
label_list = [0, 1, 2, 3, 4]

In [6]:
# Load persistence diagrams

persistence_diagrams  = np.load('Embeddings_and_Persistence_Diagrams/'+str(subject)+'/'+str(data_type)+'/Persistence_Diagrams_All_Labels.npy', \
    allow_pickle=True).item() # .item() to convert the dtype to dict again

In [7]:
extended_persistence_diagrams  = np.load('Embeddings_and_Persistence_Diagrams/'+str(subject)+'/'+str(data_type)+'/Extended_Persistence_Diagrams_All_Labels.npy', \
    allow_pickle=True).item() # .item() to convert the dtype to dict again

In [8]:
# TODO do this in Preprocessing_And_Computing_...

reshaped_persistence_diagrams = {}

for label in label_list:
    reshaped_persistence_diagrams["Label_"+str(label)] = [persistence_diagram[0] for persistence_diagram in list(persistence_diagrams["Label_"+str(label)])]

persistence_diagrams = reshaped_persistence_diagrams

## Get training indices

In [9]:
def load_indices(subject):
    train_indices = np.load("Train_Test_Splitting/"+str(subject)+"/Train_Indices_All_Labels_All_Folds.npy", allow_pickle=True).item()

    return train_indices


train_indices_dict = load_indices(subject)

# Set parameters and important functions

In [10]:
# Choose dimensionality of the vectorization

# Later in the classification, a dimension of 4 works already works approx. optimally at least for persistence diagrams
vector_dim = 4

In [11]:
# Define all data type objects

HK = HeatKernel(sigma=0.00003, n_bins=100)
BC = BettiCurve()
SH = Silhouette()
PL = PersistenceLandscape()
PI = PersistenceImage(sigma=0.00003, n_bins=100)

In [12]:
# TODO these functions deal with HK and PI as global variables, which is not ideal. 

def train_atol(training_data, label_list, vector_dim, type_of_data_to_vectorize = None):
    """ Trains the ATOl model with the training data.
    
    Parameters:
    - training_data (dictionary of np.ndarrays of np.ndarrays of np.ndarrays): Data used for training. 
    Shape (labels, #persistence diagrams/features, shape of persistence diagram/feature).
    - label_list (list): List of labels (e.g. [1, 3, 5, 7].
    - vector_dim (int): Dimension the vectorizations should have, e.g. 4.
    - type_of_data_to_vectorize (object): either "HK", "BC", "SH" or "PL", or None if we are directly vectorizing the data_to_vectorize.

    Returns
    - atol_vectoriser (object): Atol() object; trained model to vectorize the data to vectorize later.
    """
    
    # Concatenate all training data
    all_training_data = []
    
    for label in label_list:
        if not type_of_data_to_vectorize:
            all_training_data.extend(training_data["Label_"+str(label)])
            
         # HK & PI have a different shape than the other signatures
        elif type_of_data_to_vectorize == HK or type_of_data_to_vectorize == PI:
            all_training_data.extend(type_of_data_to_vectorize.fit_transform(training_data["Label_"+str(label)])[0])
        else:
            all_training_data.extend(type_of_data_to_vectorize.fit_transform(training_data["Label_"+str(label)]))
            
    # Train Atol vectorizer with all training data
    atol_vectoriser = Atol(quantiser=KMeans(n_clusters=vector_dim, random_state=202006))
    atol_vectoriser.fit(X=all_training_data).centers

    return atol_vectoriser


In [13]:
def create_vectorizations(data_to_vectorize, atol_vectoriser, label_list, type_of_data_to_vectorize = None):
    """ Creates vectorizations from signatures.
    
    Parameters:
    - data_to_vectorize (dictionary of np.ndarrays of np.ndarrays of np.ndarrays): Data to vectorize. 
    Shape (labels, #persistence diagrams/features, shape of persistence diagram/feature).
    - atol_vectoriser (object): Atol() object; trained model to vectorize the data_to_vectorize.
    - label_list (list): List of labels (e.g. [1, 3, 5, 7].
    - type_of_data_to_vectorize (object): either "HK", "BC", "SH" or "PL", or None if we are directly vectorizing the data_to_vectorize.

    Returns
    - Vectorization. Shape (Number of homology dimensions, number of labels, data_to_vectorize, length of vectorization)
    """

    
    # If we are directly vectorizing persistence diagrams
    
    if not type_of_data_to_vectorize: 
        
        vectorizations = {} # initialize dictionary with labels as keys and vectorizations as values

        for label in label_list:
            vectorizations["Label_"+str(label)] = []
            for diagram in data_to_vectorize["Label_"+str(label)]:
                vectorization = atol_vectoriser(diagram)
                vectorizations["Label_"+str(label)].append(vectorization)

        return vectorizations


    # If we are vectorizing features
    
    vectorizations = {} # initialize dictionary

    
    for hom_dim in range(3):
        vectorizations["Hom_Dim_"+str(hom_dim)] = {}
        
        for label in label_list:

            # Initialize list of vectorizations
            vectorizations["Hom_Dim_"+str(hom_dim)]["Label_"+str(label)] = []
    
            for diagram in data_to_vectorize["Label_"+str(label)]:
                # We get a depreciation warning if we do not convert the diagram to a numeric type explicitly
                signature = type_of_data_to_vectorize.fit_transform([diagram])
 
                if type_of_data_to_vectorize == HK or type_of_data_to_vectorize == PI:

                    vector = atol_vectoriser(signature[0][hom_dim])
                    vectorizations["Hom_Dim_"+str(hom_dim)]["Label_"+str(label)].append(vector)
                else:
                    vector = atol_vectoriser(signature[0][hom_dim].reshape(1, -1))
                    vectorizations["Hom_Dim_"+str(hom_dim)]["Label_"+str(label)].append(vector)

    return vectorizations


In [14]:
# Initialize dictionaries with all vectorizations

all_vectorizations = {}

for fold_idx in range(5):
    all_vectorizations["Fold_"+str(fold_idx)] = {}

In [15]:
def get_all_train_persistence_diagrams(persistence_diagrams, train_indices_dict, subject, label_list):

    train_diagrams = {}

    for fold_idx, fold_key in enumerate(train_indices_dict["Label_0"].keys()):
        train_diagrams[fold_key] = {}

    
    # Initialize dictionarys with folds as keys and the train/validation sets/ their labels as values
    for label in label_list:
        for fold_idx, fold_key in enumerate(train_indices_dict["Label_"+str(label)].keys()):
            train_diagrams[fold_key]["Label_"+str(label)] = [persistence_diagrams["Label_"+str(label)][train_idx] for train_idx in train_indices_dict["Label_"+str(label)]["Fold_"+str(fold_idx)]]

    return train_diagrams

# Vectorize persistence diagrams directly

In [16]:
# Retrieve train persistence diagrams for each fold
train_diagrams = get_all_train_persistence_diagrams(persistence_diagrams, train_indices_dict, subject, label_list)

In [17]:
vector_dim = 3

In [18]:
# Train ATOL
for fold_idx in range(5):
    atol_vectoriser = train_atol(train_diagrams["Fold_"+str(fold_idx)], label_list, vector_dim)

    #Apply trained model to the entire data
    all_vectorizations["Fold_"+str(fold_idx)]["PD"] = create_vectorizations(persistence_diagrams, atol_vectoriser, label_list)

# Vectorize Signatures

In [19]:
# Retrieve extended train persistence diagrams for each fold
train_diagrams = get_all_train_persistence_diagrams(extended_persistence_diagrams, train_indices_dict, subject, label_list)

## Heatkernel vectorizations

In [20]:
# Train ATOL
for fold_idx in range(5):
    atol_vectoriser = train_atol(train_diagrams["Fold_"+str(fold_idx)], label_list, vector_dim, HK)

    #Apply trained model to the entire data
    all_vectorizations["Fold_"+str(fold_idx)]["HK"] = create_vectorizations(extended_persistence_diagrams, atol_vectoriser, label_list, HK)

## Betti Curve Vectorizations

In [21]:
# Train ATOL
for fold_idx in range(5):
    atol_vectoriser = train_atol(train_diagrams["Fold_"+str(fold_idx)], label_list, vector_dim, BC)

    #Apply trained model to the entire data
    all_vectorizations["Fold_"+str(fold_idx)]["BC"] = create_vectorizations(extended_persistence_diagrams, atol_vectoriser, label_list, BC)

## Vectorize Silhouettes

In [22]:
# Train ATOL
for fold_idx in range(5):
    atol_vectoriser = train_atol(train_diagrams["Fold_"+str(fold_idx)], label_list, vector_dim, SH)

    #Apply trained model to the entire data
    all_vectorizations["Fold_"+str(fold_idx)]["SH"] = create_vectorizations(extended_persistence_diagrams, atol_vectoriser, label_list, SH)

## Persistence Landscapes Vectorization

In [23]:
# Train ATOL
for fold_idx in range(5):
    atol_vectoriser = train_atol(train_diagrams["Fold_"+str(fold_idx)], label_list, vector_dim, PL)

    #Apply trained model to the entire data
    all_vectorizations["Fold_"+str(fold_idx)]["PL"] = create_vectorizations(extended_persistence_diagrams, atol_vectoriser, label_list, PL)

## Persistence Image Vectorization

Takes long to compute.

In [24]:
# Train ATOL
#atol_vectoriser = train_atol(train_shortened_diagrams, label_list, vector_dim, PI)

# Create vectorizations
#train_all_vectorizations["PI"] = create_vectorizations(train_shortened_diagrams, atol_vectoriser, label_list, PI)
#test_all_vectorizations["PI"] = create_vectorizations(test_shortened_diagrams, atol_vectoriser, label_list, PI)

# Create dataframes and save

In [25]:
def insert_zeros_for_removed_indices(all_vectorizations, removed_indices, vector_dim, fold_idx):
    """ Inserts zero embeddings to the places where diagrams that were to short were removed before.
    
    Parameters:
    - 
    - vector_dim (int): Dimension the vectorizations should have, e.g. 4.

    Returns
    - 
    """

    for type_of_data_to_vectorize in all_vectorizations["Fold_"+str(fold_idx)].keys(): # the keys are the types of the data that was vectorized (PD, BC, HK,...)
        for label in label_list: # labels

            # Persistence diagrams do not use the shortened diagrams
            if str(type_of_data_to_vectorize) == "PD":                
                pass
                
            # If type_of_data_to_vectorize is a signature
            else:
                for dim in range(3): # homology dimension

                    vectorization = all_vectorizations["Fold_"+str(fold_idx)][type_of_data_to_vectorize][dim][label]
                
                    for idx in removed_indices[label]:
                        vectorization.insert(idx, np.zeros(vector_dim))

                    all_vectorizations["Fold_"+str(fold_idx)][type_of_data_to_vectorize][dim][label] = vectorization

    return all_vectorizations

In [65]:
def create_feature_df(data_type, all_vectorizations, vector_dim, num_diagrams, label, fold_idx):
    """
    Create DataFrame for each label from features.

    Parameters:
    - all_vectorizations (dictionary): all vectorizations for all datatypes (keys) 
    - vector_dim (int): dimension of the vectorization (e.g. 5)
    - num_diagrams (int): How many diagrams are there in total?
    - label (int): Label for which we want to create a dataframe. 0, 1, 2, 3 or 4

    Returns:
    - Feature DataFrame (DataFrame)
    """
    
    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))

    for type_of_data_to_vectorize in all_vectorizations.keys():
        print(type_of_data_to_vectorize)
        # Persistence diagrams are shaped differently (not separated according to homology dimension)
        if type_of_data_to_vectorize == "PD":
            for dim in range(vector_dim):
                feature_df[str(data_type)+"_PD_Vectorization_Coord_"+str(dim)] = [arr[dim] for arr in \
                                                                    all_vectorizations[type_of_data_to_vectorize]["Label_"+str(label)]]
        

        
        else:
            for hom_dim in range(3):
                for vector_dim in range(vector_dim):
                    print(type_of_data_to_vectorize)
                    feature_df[str(data_type)+"_"+str(type_of_data_to_vectorize)+"_Vectorization_Dim_"+str(hom_dim)+"Coord_"+str(vector_dim)] = \
                            [arr[vector_dim] for arr in all_vectorizations[type_of_data_to_vectorize]["Hom_Dim_"+str(hom_dim)]["Label_"+str(label)]]
    
    
    # Label
    feature_df["Label"] = label
    feature_df["Fold"] = fold_idx


    return feature_df

In [66]:
dataframes = {}

for fold_idx in range(5):
    dataframes["Fold_"+str(fold_idx)] = {}
    for label in label_list:
        dataframes["Fold_"+str(fold_idx)]["Label_"+str(label)] = create_feature_df(data_type, \
                all_vectorizations["Fold_"+str(fold_idx)], vector_dim, len(persistence_diagrams["Label_"+str(label)]), label, fold_idx)

PD
HK
HK
HK
HK
HK
HK
HK
BC
SH
PL
Index(['EEG_PD_Vectorization_Coord_0', 'EEG_PD_Vectorization_Coord_1',
       'EEG_PD_Vectorization_Coord_2', 'EEG_HK_Vectorization_Dim_0Coord_0',
       'EEG_HK_Vectorization_Dim_0Coord_1',
       'EEG_HK_Vectorization_Dim_0Coord_2',
       'EEG_HK_Vectorization_Dim_1Coord_0',
       'EEG_HK_Vectorization_Dim_1Coord_1',
       'EEG_HK_Vectorization_Dim_2Coord_0', 'Label', 'Fold'],
      dtype='object')
PD
HK
HK
HK
HK
HK
HK
HK
BC
SH
PL
Index(['EEG_PD_Vectorization_Coord_0', 'EEG_PD_Vectorization_Coord_1',
       'EEG_PD_Vectorization_Coord_2', 'EEG_HK_Vectorization_Dim_0Coord_0',
       'EEG_HK_Vectorization_Dim_0Coord_1',
       'EEG_HK_Vectorization_Dim_0Coord_2',
       'EEG_HK_Vectorization_Dim_1Coord_0',
       'EEG_HK_Vectorization_Dim_1Coord_1',
       'EEG_HK_Vectorization_Dim_2Coord_0', 'Label', 'Fold'],
      dtype='object')
PD
HK
HK
HK
HK
HK
HK
HK
BC
SH
PL
Index(['EEG_PD_Vectorization_Coord_0', 'EEG_PD_Vectorization_Coord_1',
       'EEG_PD_V

In [39]:
# Concatenate all dataframe to one

fold_dataframes = {}

for fold_idx in range(5):
    current_df = dataframes["Fold_"+str(fold_idx)]
    fold_dataframes["Fold_"+str(fold_idx)] = pd.concat([current_df["Label_"+str(0)], current_df["Label_"+str(1)], current_df["Label_"+str(2)], current_df["Label_"+str(3)], current_df["Label_"+str(4)]], ignore_index=True)

feature_df = pd.concat([fold_dataframes["Fold_"+str(0)], fold_dataframes["Fold_"+str(1)], fold_dataframes["Fold_"+str(2)], \
                        fold_dataframes["Fold_"+str(3)], fold_dataframes["Fold_"+str(4)]], ignore_index=True)


In [29]:
feature_df.to_csv("Features/"+str(subject)+"/"+str(data_type)+"/Vectorization_Features.csv")

In [52]:
all_vectorizations["Fold_"+str(0)]["BC"]

{'Hom_Dim_0': {'Label_0': [array([1.79180773e-12, 8.98732506e-04, 1.20398372e-08]),
   array([1.44465135e-05, 2.52099728e-01, 1.29750058e-02]),
   array([0.00060951, 0.34648331, 0.19287885]),
   array([3.15482511e-06, 2.92568756e-01, 3.97628978e-03]),
   array([9.70899779e-07, 8.24881041e-02, 1.19212717e-03]),
   array([2.85661678e-11, 9.92271553e-04, 1.14219021e-07]),
   array([9.68705877e-10, 7.84106866e-03, 2.87243443e-06]),
   array([4.34392319e-12, 3.45260815e-03, 3.07562722e-08]),
   array([1.17319648e-08, 8.64301812e-03, 2.25701399e-05]),
   array([9.14085092e-06, 8.58316211e-01, 7.68251749e-03]),
   array([3.00079762e-07, 1.57935799e-01, 5.28296581e-04]),
   array([1.24665241e-36, 6.10761457e-22, 7.46388437e-32]),
   array([7.71548428e-11, 9.04437645e-03, 3.89830884e-07]),
   array([5.85352364e-23, 3.76321251e-10, 1.89437398e-18]),
   array([1.37792310e-15, 1.07278296e-05, 1.50003224e-11]),
   array([3.47105009e-16, 3.14802333e-05, 5.70962041e-12]),
   array([5.18926469e-07, 1.