In [87]:
""" This file vectorizes persistence diagrams (ATOL)"""

' This file vectorizes persistence diagrams (ATOL)'

In [2]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from sklearn.cluster import KMeans
from gudhi.representations.vector_methods import Atol

In [3]:
# Class labels for which persistence diagrams are loaded, vectorized and exported below
label_list = [1, 3, 5, 7]

In [7]:
# Read the stored persistence diagrams from disk, one .npy file per label and split.
# Both dictionaries map a label to the array of diagrams loaded for that class.
# NOTE: allow_pickle=True unpickles arbitrary objects - only load files you trust.

train_persistence_diagrams = dict()
test_persistence_diagrams = dict()

for label in label_list:
    train_path = f"Embeddings_and_Persistence_Diagrams/Train_PD{label}.npy"
    test_path = f"Embeddings_and_Persistence_Diagrams/Test_PD{label}.npy"
    train_persistence_diagrams[label] = np.load(train_path, allow_pickle=True)
    test_persistence_diagrams[label] = np.load(test_path, allow_pickle=True)


In [8]:
# Choose dimensionality of the vectorization
# (also used as the number of KMeans clusters of the Atol quantiser)

vector_dim = 4  # Later in the classification, a small dimension like this already works approx. optimally (NOTE(review): the original comment omitted the exact value - confirm)

In [10]:
# Pool the training diagrams of all labels into one flat list; the Atol
# quantiser is fitted on the training data only.
all_train_persistence_diagrams = [
    diagram
    for label in label_list
    for diagram in train_persistence_diagrams[label]
]

# Train Atol vectorizer with all training data.
# The KMeans quantiser uses `vector_dim` clusters and a fixed random_state
# so that the fit is reproducible.
atol_vectoriser = Atol(quantiser=KMeans(n_clusters=vector_dim, random_state=202006))
atol_vectoriser.fit(X=all_train_persistence_diagrams)


def _vectorize_by_label(vectoriser, diagrams_by_label, labels):
    """
    Apply a fitted vectoriser to every diagram, grouped by label.

    Parameters:
    - vectoriser (callable): fitted vectoriser, called once per diagram
    - diagrams_by_label (dict): maps each label to an iterable of diagrams
    - labels (iterable): labels to process

    Returns:
    - dict mapping each label to the list of vectorizations of its diagrams,
      in the same order as the diagrams
    """
    return {
        label: [vectoriser(diagram) for diagram in diagrams_by_label[label]]
        for label in labels
    }


# Apply trained model to the entire data (train and test split alike)
train_atol_vectorizations = _vectorize_by_label(atol_vectoriser, train_persistence_diagrams, label_list)
test_atol_vectorizations = _vectorize_by_label(atol_vectoriser, test_persistence_diagrams, label_list)


In [23]:
def create_feature_df(atol_vectorization, vector_dim, num_diagrams, label):
    """
    Build the feature DataFrame of a single label from its vectorizations.

    Parameters:
    - atol_vectorization (list): one indexable vector per diagram; each vector
      must provide at least `vector_dim` entries
    - vector_dim (int): number of vector components to export as columns
    - num_diagrams (int): number of rows of the DataFrame; expected to equal
      len(atol_vectorization)
    - label (int): label written into every row of the "Label" column

    Returns:
    - Feature DataFrame (DataFrame) indexed 0..num_diagrams-1 with the columns
      "Vectorization_Dim_0", ..., "Vectorization_Dim_<vector_dim-1>" and "Label"
    """
    # One column per vector component, collected across all diagrams
    component_columns = {
        f"Vectorization_Dim_{component}": [vector[component] for vector in atol_vectorization]
        for component in range(vector_dim)
    }

    feature_df = pd.DataFrame(component_columns, index=np.arange(num_diagrams))

    # The scalar label is broadcast to every row
    feature_df["Label"] = label

    return feature_df

In [26]:
# Build one feature DataFrame per label for the training split.
# The row count is taken from the number of loaded diagrams of that label.
train_dataframes = {
    label: create_feature_df(
        train_atol_vectorizations[label],
        vector_dim,
        len(train_persistence_diagrams[label]),
        label,
    )
    for label in label_list
}

# Same for the test split
test_dataframes = {
    label: create_feature_df(
        test_atol_vectorizations[label],
        vector_dim,
        len(test_persistence_diagrams[label]),
        label,
    )
    for label in label_list
}


In [94]:
# Concatenate and save features of the training persistence diagrams.
# The per-label frames are stacked in label_list order; every per-label frame
# has its own 0-based index, so ignore_index=True gives the combined frame one
# continuous index.
train_feature_df = pd.concat([train_dataframes[label] for label in label_list], ignore_index=True)
train_feature_df.to_csv("Features/Train_Vectorization_Features.csv")

# Concatenate and save features of the test persistence diagrams
test_feature_df = pd.concat([test_dataframes[label] for label in label_list], ignore_index=True)
test_feature_df.to_csv("Features/Test_Vectorization_Features.csv")