In [329]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
import random
from sklearn import preprocessing

In [189]:
# Import persistence diagrams
persistence_diagrams1 = np.load("PD1.npy", allow_pickle=True)
persistence_diagrams3 = np.load("PD3.npy", allow_pickle=True)
persistence_diagrams5 = np.load("PD5.npy", allow_pickle=True)

# Import point cloud embeddings
embeddings1 = np.load("Embeddings1.npy", allow_pickle=True)
embeddings3 = np.load("Embeddings3.npy", allow_pickle=True)
embeddings5 = np.load("Embeddings5.npy", allow_pickle=True)

In [190]:
# We will look at 0, 1 and 2 dimensional holes
homology_dimensions = [0, 1, 2]

# We will use a Vietoris Rips filtrations
persistence = VietorisRipsPersistence(
    homology_dimensions=homology_dimensions, n_jobs=10
)

# Computing the distance to the Wasserstein Barycenter

For the next analyses, we will need to compare persistence diagrams (pairwise). Giotto TDA needs all diagrams to have the same length for this (to have the same amount of holes). We will therefore cut off some "noise" (holes with low persistence) such that all persistence diagrams have the same length.

In [92]:
def cut_diagrams(persistence_diagrams, no_holes_per_dimension):
    # no_holes_per_dimension is a list indicating how many holes for each dimension there should be left
    shortened_diagrams = []

    removed_indices = []
    for diagram_idx, diagram in enumerate(persistence_diagrams): # There are no_segments many diagrams per label at max (as chosen in the beginning)
        most_significant_holes_per_diagram = []
        should_be_appended = True
        holes = {}
        for hole_dimension, number_of_holes in zip(range(3), no_holes_per_dimension):
            # the third entry of each point (hole) in a diagram indicates its dimensionality
            holes[hole_dimension] = diagram[np.where(diagram[:, 2] == hole_dimension)[0]]

            if number_of_holes > len(holes[hole_dimension]):
                print("There is a diagram shorter than the shortened diagrams, which will be removed")
                print("It has " + str(len(holes[hole_dimension])) + " holes of dimension " + str(hole_dimension))
                should_be_appended = False 
                removed_indices.append(diagram_idx)
                
            # The first and second entries of each hole indicate its birth and death, the difference is the persistence
            large_persistence_indices = np.argsort(holes[hole_dimension][:, 0] - holes[hole_dimension][:, 1])[-number_of_holes:]
            
            # For each dimension, getting the holes with the above indices (the holes with the largest persistence)
            significant_holes_with_hole_dimension = holes[hole_dimension][large_persistence_indices, :]
            most_significant_holes_per_diagram.extend(significant_holes_with_hole_dimension)

        if should_be_appended:
            shortened_diagrams.append(most_significant_holes_per_diagram)

    return shortened_diagrams, removed_indices

In [388]:
no_holes_per_dimension = [120, 60, 7] 

# Label 1

shortened_diagrams1, removed_indices1 = cut_diagrams(persistence_diagrams1, no_holes_per_dimension)

# Remove embeddings of diagrams that were removed 
embeddings1 =  np.delete(embeddings1, removed_indices1, 0)

In [389]:
# Label 3

shortened_diagrams3, removed_indices3 = cut_diagrams(persistence_diagrams3, no_holes_per_dimension)

# Remove embeddings of diagrams that were removed 
embeddings3 =  np.delete(embeddings3, removed_indices3, 0)

There is a diagram shorter than the shortened diagrams, which will be removed
It has 4 holes of dimension 2


In [390]:
# Label 5

shortened_diagrams5, removed_indices5 = cut_diagrams(persistence_diagrams5, no_holes_per_dimension)

# Remove embeddings of diagrams that were removed 
embeddings5 =  np.delete(embeddings5, removed_indices5, 0)

In [96]:
# Label 7

# One "outlier" diagram only has one 2-dimensional hole
# Later, such outliers should be deleted before computing the pairwise distances
# between all diagrams, because eventually all diagrams should be shortened to the
# length of the shortest diagram

no_holes_per_dimension7 = [50, 9, 1] 

#shortened_diagrams7 = cut_diagrams(persistence_diagrams7, no_holes_per_dimension7)

## Wasserstein Barycenter

The Wasserstein Barycenter is the most representative persistence diagram in a set of diagrams (of one class), so the one with the lowest overall (Wasserstein) distance to all other diagrams. Because it takes long to compute, we will for now only use a part of the data as training data. For now, these training samples can also be in the test set of the simple classifier in the end.

### Computing the Wasserstein Barycenter for all labels

In [374]:
# Randomly select 100 diagrams as training data
random_diagrams1 = random.sample(shortened_diagrams1, 200)

# Finding the most representative diagram for label 1

# First, compute the pairwise Wasserstein distances
wasserstein_distance1 = PairwiseDistance(metric="wasserstein").fit_transform(random_diagrams1)

# For each hole, calculate the sum of distances to all other holes
sum_distances = [sum(dist) for dist in wasserstein_distance1]

# Find the index of the Wasserstein barycenter
most_representative_index = np.argmin(sum_distances)

# Unshortened Wasserstein Barycenter for label 1
representative_diagram1 = persistence.fit_transform_plot([embeddings1[most_representative_index]])

KeyboardInterrupt: 

In [None]:
# Finding the most representative diagram for label 3

# Randomly select 100 diagrams as training data
random_diagrams3 = random.sample(shortened_diagrams3, 200)

# First, compute the pairwise Wasserstein distances
wasserstein_distance3 = PairwiseDistance(metric="wasserstein").fit_transform(random_diagrams3)

# For each hole, calculate the sum of distances to all other holes
sum_distances = [sum(dist) for dist in wasserstein_distance3]
most_representative_index = np.argmin(sum_distances)

# Wasserstein Barycenter for label 3
representative_diagram3 = persistence.fit_transform_plot([embeddings3[most_representative_index]])

In [None]:
# Finding the most representative diagram for label 5

# Randomly select 20 diagrams as training data
random_diagrams5 = random.sample(shortened_diagrams5, 30)


# First, compute the pairwise Wasserstein distances
wasserstein_distance5 = PairwiseDistance(metric="wasserstein").fit_transform(random_diagrams5)

# For each hole, calculate the sum of distances to all other holes
sum_distances = [sum(dist) for dist in wasserstein_distance5]

# Find the index of the Wasserstein barycenter
most_representative_index = np.argmin(sum_distances)

# Unshortened Wasserstein Barycenter for label 5
representative_diagram5 = persistence.fit_transform_plot([embeddings5[most_representative_index]])

### Computing the distances of all diagrams to the most representative diagram of each class

In [None]:
shortened_barycenter1, _ = cut_diagrams([representative_diagram1[0]], no_holes_per_dimension)
shortened_barycenter3, _ = cut_diagrams([representative_diagram3[0]], no_holes_per_dimension)
shortened_barycenter5, _ = cut_diagrams([representative_diagram5[0]], no_holes_per_dimension)

In [None]:
# Class 1 barycenter
label1_distances_to_barycenter1 = []
label3_distances_to_barycenter1 = []
label5_distances_to_barycenter1 = []


for diagram in shortened_diagrams1:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter1[0]])
    label1_distances_to_barycenter1.append(max(dist[0]))

print("Mean distance of class 1 diagrams to class 1 Wasserstein barycenter: " + str(statistics.mean(label1_distances_to_barycenter1)))

for diagram in shortened_diagrams3:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter1[0]])
    label3_distances_to_barycenter1.append(max(dist[0]))

print("Mean distance of class 3 diagrams to class 1 Wasserstein barycenter: " + str(statistics.mean(label3_distances_to_barycenter1)))

for diagram in shortened_diagrams5:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter1[0]])
    label5_distances_to_barycenter1.append(max(dist[0]))

print("Mean distance of class 5 diagrams to class 1 Wasserstein barycenter: " + str(statistics.mean(label5_distances_to_barycenter1)))


In [345]:
# Class 3 barycenter
label1_distances_to_barycenter3 = []
label3_distances_to_barycenter3 = []
label5_distances_to_barycenter3 = []


for diagram in shortened_diagrams1:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter3[0]])
    label1_distances_to_barycenter3.append(max(dist[0]))

print("Mean distance of class 1 diagrams to class 3 Wasserstein barycenter: " + str(statistics.mean(label1_distances_to_barycenter3)))

for diagram in shortened_diagrams3:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter3[0]])
    label3_distances_to_barycenter3.append(max(dist[0]))

print("Mean distance of class 3 diagrams to class 3 Wasserstein barycenter: " + str(statistics.mean(label3_distances_to_barycenter3)))

for diagram in shortened_diagrams5:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter3[0]])
    label5_distances_to_barycenter3.append(max(dist[0]))

print("Mean distance of class 5 diagrams to class 3 Wasserstein barycenter: " + str(statistics.mean(label5_distances_to_barycenter3)))


Mean distance of class 1 diagrams to class 3 Wasserstein barycenter: 0.0004955658647319321
Mean distance of class 3 diagrams to class 3 Wasserstein barycenter: 0.00015663768206736951
Mean distance of class 5 diagrams to class 3 Wasserstein barycenter: 0.00033686900717187865


In [346]:
# Class 5 barycenter

label1_distances_to_barycenter5 = []
label3_distances_to_barycenter5 = []
label5_distances_to_barycenter5 = []


for diagram in shortened_diagrams1:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter5[0]])
    label1_distances_to_barycenter5.append(max(dist[0]))

print("Mean distance of class 1 diagrams to class 5 Wasserstein barycenter: " + str(statistics.mean(label1_distances_to_barycenter5)))

for diagram in shortened_diagrams3:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter5[0]])
    label3_distances_to_barycenter5.append(max(dist[0]))

print("Mean distance of class 3 diagrams to class 5 Wasserstein barycenter: " + str(statistics.mean(label3_distances_to_barycenter5)))

for diagram in shortened_diagrams5:
    dist = PairwiseDistance(metric="wasserstein").fit_transform([diagram, shortened_barycenter5[0]])
    label5_distances_to_barycenter5.append(max(dist[0]))

print("Mean distance of class 5 diagrams to class 5 Wasserstein barycenter: " + str(statistics.mean(label5_distances_to_barycenter5)))

Mean distance of class 1 diagrams to class 5 Wasserstein barycenter: 0.00037817846104274837
Mean distance of class 3 diagrams to class 5 Wasserstein barycenter: 0.0002033517016367501
Mean distance of class 5 diagrams to class 5 Wasserstein barycenter: 0.0002101560610054421


In [367]:
min1 = min([min(label1_distances_to_barycenter1), min(label3_distances_to_barycenter1), min(label5_distances_to_barycenter1)])
min3 = min([min(label1_distances_to_barycenter3), min(label3_distances_to_barycenter3), min(label5_distances_to_barycenter3)])
min5 = min([min(label1_distances_to_barycenter5), min(label3_distances_to_barycenter5), min(label5_distances_to_barycenter5)])

max1 = max([max(label1_distances_to_barycenter1), max(label3_distances_to_barycenter1), max(label5_distances_to_barycenter1)])
max3 = max([max(label1_distances_to_barycenter3), max(label3_distances_to_barycenter3), max(label5_distances_to_barycenter3)])
max5 = max([max(label1_distances_to_barycenter5), max(label3_distances_to_barycenter5), max(label5_distances_to_barycenter5)])


normalized_label5_distances_to_barycenter1 = statistics.mean((label5_distances_to_barycenter1-min1)/(max1-min1))
normalized_label5_distances_to_barycenter3 = statistics.mean((label5_distances_to_barycenter1-min3)/(max3-min3))
normalized_label5_distances_to_barycenter5 = statistics.mean((label5_distances_to_barycenter1-min5)/(max5-min5))

In [368]:
print(normalized_label5_distances_to_barycenter1)
print(normalized_label5_distances_to_barycenter3)
print(normalized_label5_distances_to_barycenter5)

0.23665597351082332
0.3369633206464957
0.3133326662347334


In [None]:
scaled_mean_class1_to_center1 = statistics.mean(label1_distances_to_barycenter1)/mean1
scaled_mean_class1_to_center3 = statistics.mean(label1_distances_to_barycenter3)/mean1
scaled_mean_class1_to_center5 = statistics.mean(label1_distances_to_barycenter5)/mean1

print("Scaled mean distance of class 1 diagrams to class 5 Wasserstein barycenter: " + str(scaled_mean_class1_to_center1))
print("Scaled mean distance of class 1 diagrams to class 5 Wasserstein barycenter: " + str(scaled_mean_class1_to_center1))
print("Scaled mean distance of class 1 diagrams to class 5 Wasserstein barycenter: " + str(scaled_mean_class1_to_center1))

In [353]:
scaled_mean_class3_to_center1 = statistics.mean(label3_distances_to_barycenter1)/mean3
scaled_mean_class3_to_center3 = statistics.mean(label3_distances_to_barycenter3)/mean3
scaled_mean_class3_to_center5 = statistics.mean(label3_distances_to_barycenter5)/mean3

print("Scaled mean distance of class 3 diagrams to class 1 Wasserstein barycenter: " + str(scaled_mean_class1_to_center1))
print("Scaled mean distance of class 3 diagrams to class 3 Wasserstein barycenter: " + str(scaled_mean_class1_to_center3))
print("Scaled mean distance of class 3 diagrams to class 5 Wasserstein barycenter: " + str(scaled_mean_class1_to_center5))

Scaled mean distance of class 3 diagrams to class 1 Wasserstein barycenter: 0.30579016421076805
Scaled mean distance of class 3 diagrams to class 3 Wasserstein barycenter: 1.9163863829342114
Scaled mean distance of class 3 diagrams to class 5 Wasserstein barycenter: 1.4624414323883521


In [360]:
scaled_mean_class5_to_center1 = statistics.mean(label5_distances_to_barycenter1)/mean1
scaled_mean_class5_to_center3 = statistics.mean(label5_distances_to_barycenter3)/mean3
scaled_mean_class5_to_center5 = statistics.mean(label5_distances_to_barycenter5)/mean5

print("Scaled mean distance of class 5 diagrams to class 1 Wasserstein barycenter: " + str(scaled_mean_class5_to_center1))
print("Scaled mean distance of class 5 diagrams to class 5 Wasserstein barycenter: " + str(scaled_mean_class5_to_center3))
print("Scaled mean distance of class 5 diagrams to class 3 Wasserstein barycenter: " + str(scaled_mean_class5_to_center5))

Scaled mean distance of class 5 diagrams to class 1 Wasserstein barycenter: 0.7662792555015087
Scaled mean distance of class 5 diagrams to class 5 Wasserstein barycenter: 1.0217723840965898
Scaled mean distance of class 5 diagrams to class 3 Wasserstein barycenter: 0.7963611897675604


## Save Wasserstein distances

In [299]:
# Add the removed diagrams again, with a distance that is higher than the maximal distance out of the other diagrams
# At a later step, the "outlier" diagrams should be labelled as outliers beforehand

# Compute maximal distances per class
maximal_distance1 = np.max(all_distances_to_barycenter1)
maximal_distance3 = np.max(all_distances_to_barycenter3)
maximal_distance5 = np.max(all_distances_to_barycenter5)

# Add twice the maximal distance to the indices that were removed before
for idx in removed_indices1:
    all_distances_to_barycenter1.insert(idx, maximal_distance1*2)

for idx in removed_indices3:
    all_distances_to_barycenter3.insert(idx, maximal_distance3*2)

for idx in removed_indices5:
    all_distances_to_barycenter5.insert(idx, maximal_distance5*2)

In [300]:
feature_df1 = pd.DataFrame()
feature_df1["Distance_to_Wasserstein_Barycenter1"] = label1_distances_to_barycenter1
feature_df1["Distance_to_Wasserstein_Barycenter3"] = label1_distances_to_barycenter3
feature_df1["Distance_to_Wasserstein_Barycenter5"] = label1_distances_to_barycenter5
feature_df1["Label"] = 1

feature_df3 = pd.DataFrame()
feature_df3["Distance_to_Wasserstein_Barycenter1"] = label3_distances_to_barycenter1
feature_df3["Distance_to_Wasserstein_Barycenter3"] = label3_distances_to_barycenter3
feature_df3["Distance_to_Wasserstein_Barycenter5"] = label3_distances_to_barycenter5
feature_df3["Label"] = 3

feature_df5 = pd.DataFrame()
feature_df5["Distance_to_Wasserstein_Barycenter1"] = label5_distances_to_barycenter1
feature_df5["Distance_to_Wasserstein_Barycenter3"] = label5_distances_to_barycenter3
feature_df5["Distance_to_Wasserstein_Barycenter5"] = label5_distances_to_barycenter5
feature_df5["Label"] = 5

In [301]:
feature_df = pd.concat([feature_df1, feature_df3, feature_df5])

In [302]:
feature_df.to_csv("Wasserstein_Features.csv")

## Save shortened diagrams

In [386]:
# TODO Still have to add removed diagrams again?

In [391]:
np.save('Shortened_PD1.npy', np.array(shortened_diagrams1, dtype=object), allow_pickle=True)
np.save('Shortend_PD3.npy', np.array(shortened_diagrams3, dtype=object), allow_pickle=True)
np.save('Shortened_PD5.npy', np.array(removed_indices5, dtype=object), allow_pickle=True)