In [1]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 

In [109]:
# Signal modality to analyse: "EEG" or "EMG".
# The value is inserted into the input/output paths and the feature column names further down.
# NOTE(review): the previous comment stated this setting has no effect yet (pending anesthesia data);
# confirm that "EMG" inputs exist on disk before switching.
data_type: str = "EEG"

In [110]:
# Identifier of the individual to process; used as a directory name in the paths below.
subject: str = "m292"

In [111]:
# Class labels for which persistence diagrams and fold indices are loaded.
label_list = list(range(5))

# Number of cross-validation folds; must equal the value used by the notebook that created the persistence diagrams.
n_folds: int = 5

In [130]:
# Load persistence diagrams and the train/validation indices of every fold.
#
# NOTE(review): np.load(..., allow_pickle=True) unpickles arbitrary Python objects;
# only load files that were produced by a trusted source.
# NOTE(review): the test file name has no underscore before the label ("PD_Label<label>.npy")
# whereas the train file name has one ("PD_Label_<label>.npy"). Both are kept exactly as they were;
# confirm that they match the files on disk.

diagram_root = f"Embeddings_and_Persistence_Diagrams/{subject}"
train_root = f"{diagram_root}/Train/{data_type}"

# All four dictionaries are keyed by label.
test_persistence_diagrams = {}  # label -> persistence diagrams of the final test set
persistence_diagrams_for_cross_validation = {}  # label -> persistence diagrams used for cross validation
train_indices_dict_for_folds = {label: {} for label in label_list}  # label -> {fold -> train indices}
validation_indices_dict_for_folds = {label: {} for label in label_list}  # label -> {fold -> validation indices}

for label in label_list:

    # Final test set (.item() turns the loaded object array back into the stored dict)
    test_persistence_diagrams[label] = np.load(
        f"{diagram_root}/Test/{data_type}/PD_Label{label}.npy", allow_pickle=True
    ).item()

    # All train persistence diagrams (.item() as above)
    persistence_diagrams_for_cross_validation[label] = np.load(
        f"{train_root}/PD_Label_{label}.npy", allow_pickle=True
    ).item()

    # Train and validation indices of every fold for this label
    for fold in range(n_folds):
        fold_root = f"{train_root}/Fold_{fold}"

        train_indices_dict_for_folds[label][fold] = np.load(
            f"{fold_root}/Train/Indices_Label_{label}.npy", allow_pickle=True
        )

        validation_indices_dict_for_folds[label][fold] = np.load(
            f"{fold_root}/Validation/Indices_Label_{label}.npy", allow_pickle=True
        )

# Summary Statistics

In [131]:
def compute_summary_statistics(persistence_diagrams):
    """
    Compute summary statistics for every persistence diagram in a list.

    Each diagram is processed on its own: every transformer is fitted and applied
    to a batch that contains only that single diagram.

    Parameters:
    - persistence_diagrams (list): persistence diagrams

    Returns:
    Tuple of three lists, each holding one entry per input diagram:
    - Persistence entropies (results of PersistenceEntropy.fit_transform)
    - Amplitudes (results of Amplitude.fit_transform)
    - Numbers of points (results of NumberOfPoints.fit_transform)

    Complex polynomial coefficients are not computed and not returned.
    """

    PE = PersistenceEntropy()
    AM = Amplitude()
    NP = NumberOfPoints()

    persistence_entropies = []
    amplitudes = []
    nos_points = []

    for diagram in persistence_diagrams:
        # The transformers work on batches, so wrap the single diagram in a list
        batch = [diagram]
        persistence_entropies.append(PE.fit_transform(batch))
        amplitudes.append(AM.fit_transform(batch))
        nos_points.append(NP.fit_transform(batch))

    return persistence_entropies, amplitudes, nos_points

In [156]:
# Summary statistics, keyed by label
features_for_test_set = {}  # label -> statistics of the final test set
features_for_cross_validation = {}  # label -> {PD_idx -> statistics of that entry}

for label in label_list:

    # Final test set
    features_for_test_set[label] = compute_summary_statistics(test_persistence_diagrams[label][0])

    # Cross validation data: one result per persistence diagram index
    cross_validation_diagrams = persistence_diagrams_for_cross_validation[label][0]
    features_for_cross_validation[label] = {
        PD_idx: compute_summary_statistics(cross_validation_diagrams[PD_idx])
        for PD_idx in cross_validation_diagrams.keys()
    }

In [133]:
def compute_largest_persistence(persistence_diagrams, homology_dimensions=(0, 1, 2)):
    """
    Computes the persistence of the most prominent point of each dimension in each diagram

    A diagram is a 2D array whose column 2 holds the homology dimension of a point;
    the persistence of a point is column 1 minus column 0.

    Parameters:
    - persistence_diagrams (list): persistence diagrams
    - homology_dimensions (iterable of int): homology dimensions to evaluate, default (0, 1, 2)

    Returns:
    List with one list per homology dimension (in the order of `homology_dimensions`).
    Every inner list has exactly one entry per input diagram: the largest persistence
    of that dimension, or 0.0 if the diagram has no point of that dimension.
    """

    largest_persistences = []  # one list per homology dimension
    for homology_dimension in homology_dimensions:
        largest_persistences_of_hom_dim = []
        for diagram in persistence_diagrams:
            # only look at holes of our homology dimension
            filtered_diagram = diagram[diagram[:, 2] == homology_dimension]

            if len(filtered_diagram) > 0:
                differences = filtered_diagram[:, 1] - filtered_diagram[:, 0]
                largest_persistences_of_hom_dim.append(np.max(differences))
            else:
                # Keep the list aligned with the diagrams: skipping the entry would shift
                # all following values to the wrong diagram. No feature means persistence 0.
                largest_persistences_of_hom_dim.append(0.0)

        largest_persistences.append(largest_persistences_of_hom_dim)

    return largest_persistences

In [139]:
# Largest persistences, keyed by label
largest_persistences_for_test_set = {}  # label -> largest persistences of the final test set
largest_persistences_for_cross_validation = {}  # label -> {PD_idx -> largest persistences of that entry}

for label in label_list:

    # Final test set
    largest_persistences_for_test_set[label] = compute_largest_persistence(test_persistence_diagrams[label][0])

    # Cross validation data: one result per persistence diagram index
    cross_validation_diagrams = persistence_diagrams_for_cross_validation[label][0]
    largest_persistences_for_cross_validation[label] = {
        PD_idx: compute_largest_persistence(cross_validation_diagrams[PD_idx])
        for PD_idx in cross_validation_diagrams.keys()
    }

In [145]:
def split_features(features, train_indices_dict, test_indices_dict, label):
    """
    Split the features of one label into a train and a test part for every fold.

    Parameters:
    - features (dict): features[label][PD_idx] is an indexable feature entry
    - train_indices_dict (dict): train_indices_dict[label][fold_idx] holds the PD indices of the train part
    - test_indices_dict (dict): test_indices_dict[label][fold_idx] holds the PD indices of the test part
    - label: label whose features are split

    Returns:
    Tuple of two dicts keyed by fold index (0 .. number of folds - 1):
    - train features: list of features[label][PD_idx][0] for every train index of the fold
    - test features: list of features[label][PD_idx][0] for every test index of the fold

    NOTE(review): only element [0] of each feature entry is collected. For the tuples returned by
    compute_summary_statistics this is the list of persistence entropies only; confirm this is intended.
    """

    train_features = {}
    test_features = {}

    # The number of folds is taken from the requested label (not from a fixed label 0),
    # so the function also works when label 0 is not present in the index dictionaries.
    for fold_idx in range(len(train_indices_dict[label])):
        train_features[fold_idx] = [
            features[label][PD_idx][0] for PD_idx in train_indices_dict[label][fold_idx]
        ]
        test_features[fold_idx] = [
            features[label][PD_idx][0] for PD_idx in test_indices_dict[label][fold_idx]
        ]

    return train_features, test_features

In [158]:
# Summary statistics per label, split into train and validation part of every fold

train_features_for_cross_validation = {}  # label -> {fold -> train features}
validation_features_for_cross_validation = {}  # label -> {fold -> validation features}

for label in label_list:
    train_part, validation_part = split_features(
        features_for_cross_validation,
        train_indices_dict_for_folds,
        validation_indices_dict_for_folds,
        label=label,
    )
    train_features_for_cross_validation[label] = train_part
    validation_features_for_cross_validation[label] = validation_part

In [159]:
# Largest persistences per label, split into train and validation part of every fold

train_largest_persistences_for_cross_validation = {}  # label -> {fold -> train values}
validation_largest_persistences_for_cross_validation = {}  # label -> {fold -> validation values}

for label in label_list:
    train_part, validation_part = split_features(
        largest_persistences_for_cross_validation,
        train_indices_dict_for_folds,
        validation_indices_dict_for_folds,
        label=label,
    )
    train_largest_persistences_for_cross_validation[label] = train_part
    validation_largest_persistences_for_cross_validation[label] = validation_part

# Concatenate Features to one DataFrame

In [160]:
def choose_column_in_matrix(matrix, i):
    """
    Pick element `i` from the first item of every row.

    Parameters:
    - matrix (iterable): rows; each row is indexable and its first item is indexable as well
    - i (int): position to read inside the first item of each row

    Returns:
    - List with row[0][i] for every row, in row order
    """
    column = []
    for row in matrix:
        first_item = row[0]
        column.append(first_item[i])
    return column

In [161]:
def create_feature_df(subject, data_type, persistence_entropies, amplitudes, nos_points, persistences, label, train):
    """
    Build the feature DataFrame of one label.

    Parameters:
    - subject: value written to the "Subject" column
    - data_type: prefix of all feature column names
    - persistence_entropies (list): persistence entropies, one entry per row; entry[0][d] is read for d = 0, 1, 2
    - amplitudes (list): amplitudes, same layout
    - nos_points (list): numbers of points, same layout
    - persistences (sequence): persistences[d] holds the values of the largest-persistence column of dimension d
    - label: value written to the "Label" column
    - train: value written to the "Train" column

    Returns:
    - Feature DataFrame (DataFrame)
    """

    feature_df = pd.DataFrame()
    prefix = str(data_type)

    # One column per hole dimension (0, 1, 2) for each family of summary statistics
    statistic_families = (
        ("_Persistence Entropy_Dim_", persistence_entropies),
        ("_Amplitude_Dim_", amplitudes),
        ("_No_Points_Dim_", nos_points),
    )
    for column_stem, values in statistic_families:
        for dim in range(3):
            feature_df[prefix + column_stem + str(dim)] = choose_column_in_matrix(list(values), dim)

    # Largest persistences are already organised per dimension
    for dim in range(3):
        feature_df[prefix + "_Largest_Persistence_Dim_" + str(dim)] = persistences[dim]

    # Constant bookkeeping columns
    feature_df["Label"] = label
    feature_df["Subject"] = subject
    feature_df["Train"] = train

    return feature_df

In [162]:
def _gather_fold_features(summary_by_idx, largest_by_idx, indices):
    """
    Concatenate the features of all persistence diagram entries selected by `indices`.

    Parameters:
    - summary_by_idx (dict): PD_idx -> (persistence entropies, amplitudes, numbers of points),
      i.e. the result of compute_summary_statistics for that entry
    - largest_by_idx (dict): PD_idx -> list of three lists (one per homology dimension),
      i.e. the result of compute_largest_persistence for that entry
    - indices (iterable): PD indices that belong to one part (train or validation) of one fold

    Returns:
    Tuple (persistence_entropies, amplitudes, nos_points, largest_persistences) in the
    layout that create_feature_df expects
    """
    persistence_entropies, amplitudes, nos_points = [], [], []
    largest_persistences = [[], [], []]

    for PD_idx in indices:
        summary = summary_by_idx[PD_idx]
        persistence_entropies.extend(summary[0])
        amplitudes.extend(summary[1])
        nos_points.extend(summary[2])

        for dim in range(3):
            largest_persistences[dim].extend(largest_by_idx[PD_idx][dim])

    return persistence_entropies, amplitudes, nos_points, largest_persistences


test_dataframes = {}  # label -> DataFrame of the final test set

train_dataframes = {}  # label -> {fold -> DataFrame of the train part}
validation_dataframes = {}  # label -> {fold -> DataFrame of the validation part}

for label in label_list:
    # Final test set
    persistence_entropies = features_for_test_set[label][0]
    amplitudes = features_for_test_set[label][1]
    nos_points = features_for_test_set[label][2]

    test_dataframes[label] = create_feature_df(subject, data_type, persistence_entropies, amplitudes, nos_points,
                                               largest_persistences_for_test_set[label], label, False)

    # Initialize dict with folds as key for each label
    train_dataframes[label] = {}
    validation_dataframes[label] = {}

    # Folds
    # The features are gathered directly from the per-index results. The lists produced by
    # split_features only contain element [0] of every entry, so indexing them with 0, 1, 2
    # would select the first three PD indices instead of the three feature families.
    for fold in range(n_folds):
        # Train sets
        persistence_entropies, amplitudes, nos_points, largest_persistences = _gather_fold_features(
            features_for_cross_validation[label],
            largest_persistences_for_cross_validation[label],
            train_indices_dict_for_folds[label][fold],
        )

        train_dataframes[label][fold] = create_feature_df(subject, data_type, persistence_entropies, amplitudes, nos_points,
                                                          largest_persistences, label, True)

        # Validation sets
        # NOTE(review): the "Train" flag stays False for validation rows; confirm this is the intended meaning.
        persistence_entropies, amplitudes, nos_points, largest_persistences = _gather_fold_features(
            features_for_cross_validation[label],
            largest_persistences_for_cross_validation[label],
            validation_indices_dict_for_folds[label][fold],
        )

        validation_dataframes[label][fold] = create_feature_df(subject, data_type, persistence_entropies, amplitudes, nos_points,
                                                               largest_persistences, label, False)






In [163]:
# Concatenate and save features of final test set
test_feature_df = pd.concat([test_dataframes[label] for label in label_list], ignore_index=True)
test_feature_df.to_csv("Features/"+str(subject)+"/Test/"+str(data_type)+"/Topological_Summary_Statistics.csv")


# Concatenate and save features of the single folds
# Each fold gets its own train and validation frames (not the test frames).
for fold in range(n_folds):
    fold_directory = "Features/"+str(subject)+"/Train/"+str(data_type)+"/Fold_"+str(fold)

    # Train
    train_feature_df = pd.concat([train_dataframes[label][fold] for label in label_list], ignore_index=True)
    train_feature_df.to_csv(fold_directory+"/Train/Topological_Summary_Statistics.csv")

    # Validation
    validation_feature_df = pd.concat([validation_dataframes[label][fold] for label in label_list], ignore_index=True)
    validation_feature_df.to_csv(fold_directory+"/Validation/Topological_Summary_Statistics.csv")
