In [87]:
""" This file vectorizes persistence diagrams (ATOL)"""

' This file vectorizes persistence diagrams (ATOL)'

In [88]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from sklearn.cluster import KMeans
from gudhi.representations.vector_methods import Atol

In [89]:
# Load the stored persistence diagrams (train and test split for labels 1, 3, 5, 7)
def _load_diagrams(path):
    """Read one ``.npy`` file of persistence diagrams from ``path``."""
    # allow_pickle=True lets NumPy unpickle object arrays; unpickling can execute
    # arbitrary code, so only use this on files produced by this project.
    return np.load(path, allow_pickle=True)


# Label 1
train_persistence_diagrams_label_1 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Train_PD1.npy")
test_persistence_diagrams_label_1 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Test_PD1.npy")

# Label 3
train_persistence_diagrams_label_3 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Train_PD3.npy")
test_persistence_diagrams_label_3 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Test_PD3.npy")

# Label 5
train_persistence_diagrams_label_5 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Train_PD5.npy")
test_persistence_diagrams_label_5 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Test_PD5.npy")

# Label 7
train_persistence_diagrams_label_7 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Train_PD7.npy")
test_persistence_diagrams_label_7 = _load_diagrams("Embeddings_and_Persistence_Diagrams/Test_PD7.npy")


In [90]:
# Choose dimensionality of the vectorization.
# vector_dim is used below as the number of KMeans clusters of the ATOL quantiser
# and as the number of feature columns written per diagram.

vector_dim = 4  # Later in the classification, this dimension reportedly already works approx. optimally (TODO confirm: the original note omitted the number)

In [91]:
# Pool the training diagrams of all four labels. Only training data is used to
# fit the vectoriser; the test diagrams are transformed with the fitted model.
all_persistence_diagrams = []  # all training data
for label_diagrams in (
    train_persistence_diagrams_label_1,
    train_persistence_diagrams_label_3,
    train_persistence_diagrams_label_5,
    train_persistence_diagrams_label_7,
):
    all_persistence_diagrams.extend(label_diagrams)

# Fit ATOL with a KMeans quantiser that has `vector_dim` clusters.
# The fixed random_state keeps the learned cluster centres reproducible.
atol_vectoriser = Atol(quantiser=KMeans(n_clusters=vector_dim, random_state=202006))
atol_vectoriser.fit(X=all_persistence_diagrams)


def _vectorize_diagrams(vectoriser, diagrams):
    """Return a list with one vectorization per diagram, in input order.

    Parameters:
    - vectoriser: fitted ATOL vectoriser; calling it on a single diagram yields
      that diagram's feature vector.
    - diagrams (iterable): persistence diagrams to vectorize.

    Returns:
    - list of vectorizations, same length and order as ``diagrams``.
    """
    return [vectoriser(diagram) for diagram in diagrams]


# Apply trained model to the entire data
# Label 1
train_atol_vectorization_label_1 = _vectorize_diagrams(atol_vectoriser, train_persistence_diagrams_label_1)
test_atol_vectorization_label_1 = _vectorize_diagrams(atol_vectoriser, test_persistence_diagrams_label_1)

# Label 3
train_atol_vectorization_label_3 = _vectorize_diagrams(atol_vectoriser, train_persistence_diagrams_label_3)
test_atol_vectorization_label_3 = _vectorize_diagrams(atol_vectoriser, test_persistence_diagrams_label_3)

# Label 5
train_atol_vectorization_label_5 = _vectorize_diagrams(atol_vectoriser, train_persistence_diagrams_label_5)
test_atol_vectorization_label_5 = _vectorize_diagrams(atol_vectoriser, test_persistence_diagrams_label_5)

# Label 7
train_atol_vectorization_label_7 = _vectorize_diagrams(atol_vectoriser, train_persistence_diagrams_label_7)
test_atol_vectorization_label_7 = _vectorize_diagrams(atol_vectoriser, test_persistence_diagrams_label_7)


In [92]:
def create_feature_df(atol_vectorization, vector_dim, removed_indices, num_diagrams, label):
    """
    Create the feature DataFrame for one label from its vectorizations.

    Parameters:
    - atol_vectorization (sequence of array-like): one feature vector per diagram
      that was vectorized; each vector needs at least ``vector_dim`` entries.
      The input is not modified.
    - vector_dim (int): number of vector entries exported as columns (e.g. 5)
    - removed_indices (iterable of int): positions, within the full list of
      ``num_diagrams`` diagrams, of diagrams that have no vectorization. Their
      rows are filled with zeros. Ignored when ``atol_vectorization`` already
      contains ``num_diagrams`` vectors (nothing is missing in that case).
    - num_diagrams (int): How many diagrams are there in total? This is the
      number of rows of the returned DataFrame.
    - label (int): Label for which we want to create a dataframe. 1, 3, 5 or 7.

    Returns:
    - Feature DataFrame (DataFrame) with columns ``Vectorization_Dim_0`` ...
      ``Vectorization_Dim_{vector_dim - 1}`` and ``Label``, indexed 0..num_diagrams-1.

    Raises:
    - ValueError: if the vectorizations plus the removed positions do not add up
      to ``num_diagrams``, or a removed position lies outside ``[0, num_diagrams)``.
    """
    vectors = list(atol_vectorization)

    if len(vectors) != num_diagrams:
        # Some diagrams have no vectorization: re-insert zero rows at their positions.
        # (The previous implementation called np.insert and discarded the returned
        # copy, so the padding never took effect.)
        removed = {int(idx) for idx in removed_indices}

        out_of_range = sorted(idx for idx in removed if not 0 <= idx < num_diagrams)
        if out_of_range:
            raise ValueError(
                "removed_indices outside [0, " + str(num_diagrams) + "): " + str(out_of_range)
            )
        if len(vectors) + len(removed) != num_diagrams:
            raise ValueError(
                "Expected " + str(num_diagrams) + " diagrams but got "
                + str(len(vectors)) + " vectorizations and "
                + str(len(removed)) + " removed indices"
            )

        # Walk over the final row positions; removed positions get a zero vector,
        # all others consume the available vectorizations in their original order.
        zero_vector = np.zeros(vector_dim)
        remaining = iter(vectors)
        vectors = [
            zero_vector if position in removed else next(remaining)
            for position in range(num_diagrams)
        ]

    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))

    for dim in range(vector_dim):
        feature_df["Vectorization_Dim_" + str(dim)] = [vec[dim] for vec in vectors]

    # Label
    feature_df["Label"] = label

    return feature_df

In [93]:
# Every loaded diagram is vectorized in this notebook, so each vectorization list
# already has one entry per diagram and there are no rows to pad with zeros.
# The `removed_*_indices_label_*` names previously passed here are not defined in
# any cell of this notebook and raised NameError after "Restart Kernel -> Run All".
NO_REMOVED_INDICES = []

# Label 1
train_df_label_1 = create_feature_df(train_atol_vectorization_label_1, vector_dim, NO_REMOVED_INDICES, len(train_persistence_diagrams_label_1), 1)
test_df_label_1 = create_feature_df(test_atol_vectorization_label_1, vector_dim, NO_REMOVED_INDICES, len(test_persistence_diagrams_label_1), 1)

# Label 3
train_df_label_3 = create_feature_df(train_atol_vectorization_label_3, vector_dim, NO_REMOVED_INDICES, len(train_persistence_diagrams_label_3), 3)
test_df_label_3 = create_feature_df(test_atol_vectorization_label_3, vector_dim, NO_REMOVED_INDICES, len(test_persistence_diagrams_label_3), 3)

# Label 5
train_df_label_5 = create_feature_df(train_atol_vectorization_label_5, vector_dim, NO_REMOVED_INDICES, len(train_persistence_diagrams_label_5), 5)
test_df_label_5 = create_feature_df(test_atol_vectorization_label_5, vector_dim, NO_REMOVED_INDICES, len(test_persistence_diagrams_label_5), 5)

# Label 7
train_df_label_7 = create_feature_df(train_atol_vectorization_label_7, vector_dim, NO_REMOVED_INDICES, len(train_persistence_diagrams_label_7), 7)
test_df_label_7 = create_feature_df(test_atol_vectorization_label_7, vector_dim, NO_REMOVED_INDICES, len(test_persistence_diagrams_label_7), 7)


In [94]:
# Stack the per-label training frames into one table (fresh 0..n-1 index) and save it
train_frames = [train_df_label_1, train_df_label_3, train_df_label_5, train_df_label_7]
train_feature_df = pd.concat(train_frames, ignore_index=True)
train_feature_df.to_csv("Features/Train_Vectorization_Features.csv")

# Same for the test persistence diagrams
test_frames = [test_df_label_1, test_df_label_3, test_df_label_5, test_df_label_7]
test_feature_df = pd.concat(test_frames, ignore_index=True)
test_feature_df.to_csv("Features/Test_Vectorization_Features.csv")