In [1]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from numpy.linalg import norm
from scipy.stats import skew, kurtosis

# Load Data and set important variables

In [70]:
# Choose the subject whose persistence diagrams are loaded and whose features are saved
subject = "m294"

In [71]:
# Labels to process; each one is used as a key into the per-label dictionaries
label_list = [0, 1, 2, 3, 4]

In [72]:
# Load persistence diagrams
# The .npy file stores a pickled Python object (hence allow_pickle=True and .item());
# the result is indexed by label and then by diagram position further down.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.

persistence_diagrams = np.load("Embeddings_and_Persistence_Diagrams/"+str(subject)+"/Persistence_Diagrams.npy", allow_pickle=True).item()

# Define Function Wrapper for All Statistics

In [88]:
def signature_statistics_wrapper(func):
    """Turn a per-signature statistic into a per-label, window-averaged statistic.

    ``func(signature, homology_dimension)`` computes the statistic of a single
    signature for a single homology dimension. The decorated function is called
    with keyword arguments instead:

    - signatures (dict): ``signatures[label][i]`` is the signature of the i-th
      persistence diagram of ``label``.
    - label_list (iterable): labels to process.
    - persistence_diagrams (dict, optional): only used to count the diagrams of
      each label; when omitted, ``len(signatures[label])`` is used instead.
    - amount_to_average (int, optional): how many consecutive signatures are
      averaged into one value. Defaults to 80 (original note: the imaging data
      has 20Hz, so 80 images correspond to 4 seconds).

    All other positional and keyword arguments are accepted and ignored; the
    wrapper itself supplies ``signature`` and ``homology_dimension`` (0 and 1).

    Returns:
    - dict mapping ``"Label_<label>"`` to a two-element list
      ``[averages for dimension 0, averages for dimension 1]`` with one entry
      per complete window. Signatures that do not fill a complete window at the
      end are dropped.

    Raises:
    - TypeError if ``signatures`` or ``label_list`` is missing.
    """
    import functools

    # functools.wraps keeps the name and docstring of the decorated statistic
    @functools.wraps(func)
    def compute_statistics_for_signature(*args, **kwargs):

        # Get variables
        signatures = kwargs.pop('signatures', None)
        label_list = kwargs.pop('label_list', None)
        persistence_diagrams = kwargs.pop('persistence_diagrams', None)
        amount_to_average = kwargs.pop('amount_to_average', 80)

        if signatures is None or label_list is None:
            raise TypeError("'signatures' and 'label_list' must be passed as keyword arguments")

        overall_statistics = {}

        for label in label_list:
            label_signatures = signatures[label]

            # The diagrams are only needed to know how many signatures exist for this label
            if persistence_diagrams is None:
                number_of_diagrams = len(label_signatures)
            else:
                number_of_diagrams = len(persistence_diagrams[label])
            number_of_windows = number_of_diagrams // amount_to_average

            # Averaged statistics, one entry per window
            average_statistics_dim_0 = []
            average_statistics_dim_1 = []

            for window in range(number_of_windows):
                first_position = amount_to_average * window

                # Statistics of every single signature inside the current window
                single_statistics_dim0 = []
                single_statistics_dim1 = []

                for position in range(first_position, first_position + amount_to_average):
                    sgn = label_signatures[position].astype("float")

                    # Apply function to compute statistics
                    single_statistics_dim0.append(func(sgn, 0))
                    single_statistics_dim1.append(func(sgn, 1))

                average_statistics_dim_0.append(np.mean(single_statistics_dim0, axis=0))
                average_statistics_dim_1.append(np.mean(single_statistics_dim1, axis=0))

            # Both dimensions form the statistics entry of the current label
            overall_statistics["Label_" + str(label)] = [average_statistics_dim_0, average_statistics_dim_1]

        return overall_statistics

    return compute_statistics_for_signature


In [74]:
def precompute_signature(SG, label_list = label_list, persistence_diagrams = persistence_diagrams):
    """Compute one signature per persistence diagram.

    Parameters:
    - SG: object with a ``fit_transform`` method; it is called separately on every
      single diagram (wrapped in a one-element list).
    - label_list (iterable): labels to process. The default is the module-level
      ``label_list`` as it existed when this function was defined.
    - persistence_diagrams (dict): ``persistence_diagrams[label][idx]`` is one diagram.
      The default is bound at definition time as well.

    Returns:
    - dict of dicts: ``signatures[label][idx]`` is the result of ``SG.fit_transform``
      for diagram ``idx`` of ``label``.
    """
    return {
        label: {
            position: SG.fit_transform([persistence_diagrams[label][position].astype("float")])
            for position in range(int(len(persistence_diagrams[label])))
        }
        for label in label_list
    }

# HeatKernel Statistics

In [82]:
# Heat-kernel transformer applied to every persistence diagram
HK = HeatKernel(sigma=50000, n_bins=100)

# One heat kernel per diagram, stored as heatkernels[label][idx]
heatkernels = precompute_signature(HK)

In [83]:
# Collects the heat-kernel statistics, keyed by statistic name
heat_kernel_statistics = {}

# Intensity

In a way, the Heat Kernel shows an "average distribution" of the persistence diagrams for each label, separated by hole dimensionality.

In [84]:
@signature_statistics_wrapper
def heat_kernel_intensity(heatkernel, homology_dimension):
    """ Mean of the strictly positive entries of one heat kernel.

    Only positive entries are used; the original author notes that the mean over
    all entries would otherwise always be zero.

    Parameters:
    - heatkernel: heat kernel of a single diagram; ``heatkernel[0][homology_dimension]``
      is the 2D kernel of that homology dimension
    - homology_dimension (int): which homology dimension to look at (the wrapper passes 0 and 1)

    Returns:
    - mean of the positive entries of the selected kernel
    """
    kernel = np.asarray(heatkernel[0][homology_dimension])
    positive_entries = kernel[kernel > 0]

    return np.mean(positive_entries)

In [85]:
# The wrapper only reads signatures, label_list and persistence_diagrams;
# heatkernel and homology_dimension are placeholders that it ignores (it supplies both itself).
heat_kernel_statistics["intensity"] = heat_kernel_intensity(signatures = heatkernels, label_list = label_list, persistence_diagrams = persistence_diagrams, heatkernel = None, homology_dimension = None)

## Maximum and Minimum

In [89]:
@signature_statistics_wrapper
def heat_kernel_max(heatkernel, homology_dimension):
    """ Maximum of the strictly positive entries of one heat kernel.

    Parameters:
    - heatkernel: heat kernel of a single diagram; ``heatkernel[0][homology_dimension]``
      is the 2D kernel of that homology dimension
    - homology_dimension (int): which homology dimension to look at (the wrapper passes 0 and 1)

    Returns:
    - largest positive entry of the selected kernel
    """
    kernel = np.asarray(heatkernel[0][homology_dimension])
    positive_entries = kernel[kernel > 0]

    return np.max(positive_entries)

In [90]:
@signature_statistics_wrapper
def heat_kernel_min(heatkernel, homology_dimension):
    """ Minimum of the strictly positive entries of one heat kernel.

    Only positive entries are used; the original author notes that the minimum over
    all entries would otherwise always be the negative of the maximum.

    Parameters:
    - heatkernel: heat kernel of a single diagram; ``heatkernel[0][homology_dimension]``
      is the 2D kernel of that homology dimension
    - homology_dimension (int): which homology dimension to look at (the wrapper passes 0 and 1)

    Returns:
    - smallest positive entry of the selected kernel
    """
    kernel = np.asarray(heatkernel[0][homology_dimension])
    positive_entries = kernel[kernel > 0]

    return np.min(positive_entries)

In [None]:
# heatkernel and homology_dimension are placeholders: the wrapper ignores them and
# calls the statistic itself for homology dimensions 0 and 1.
heat_kernel_statistics["maximum"] = heat_kernel_max(signatures = heatkernels, label_list = label_list, persistence_diagrams = persistence_diagrams, heatkernel = None, homology_dimension = None)
heat_kernel_statistics["minimum"] = heat_kernel_min(signatures = heatkernels, label_list = label_list, persistence_diagrams = persistence_diagrams, heatkernel = None, homology_dimension = None)

# Betti Curve Features

In [48]:
# Betti-curve transformer with its default settings
BC = BettiCurve()

# One Betti curve result per diagram, stored as betti_curves[label][idx]
betti_curves = precompute_signature(BC)

In [49]:
# Collects the Betti-curve statistics, keyed by statistic name
betti_curve_statistics = {}

## L1 Norm

Using the L1 norm of some of the features as an additional ML feature improves the accuracy a bit.

In [50]:
@signature_statistics_wrapper
def L1_norm(signature, homology_dimension):
    """ L1 norm of one signature curve.

    Parameters:
    - signature: signature of a single diagram; ``signature[0][homology_dimension]``
      is the curve of that homology dimension
    - homology_dimension (int): which homology dimension to look at (the wrapper passes 0 and 1)

    Returns:
    - sum of the absolute values of the selected curve
    """
    # The second argument of norm is the order: 1 selects the L1 norm.
    # The homology dimension selects the curve, it must not be used as the order.
    return norm(signature[0][homology_dimension], 1)

In [51]:
# signature and homology_dimension are placeholders: the wrapper ignores them and
# calls the statistic itself for homology dimensions 0 and 1.
betti_curve_statistics["L1"] = L1_norm(signatures = betti_curves, label_list = label_list, persistence_diagrams = persistence_diagrams, signature = None, homology_dimension = None)

## Mean, Standard deviation, Skewness and Kurtosis

In [52]:
@signature_statistics_wrapper
def signature_mean(signature, homology_dimension):
    """ Arithmetic mean of one signature curve.

    Parameters:
    - signature: signature of a single diagram
    - homology_dimension (int): which curve of the signature to use

    Returns:
    - mean of ``signature[0][homology_dimension]``
    """
    curve = signature[0][homology_dimension]

    return statistics.mean(curve)

In [53]:
@signature_statistics_wrapper
def signature_standard_deviation(signature, homology_dimension):
    """ Sample standard deviation of one signature curve.

    Parameters:
    - signature: signature of a single diagram
    - homology_dimension (int): which curve of the signature to use

    Returns:
    - sample standard deviation of ``signature[0][homology_dimension]``
    """
    curve = signature[0][homology_dimension]

    return statistics.stdev(curve)


In [54]:
@signature_statistics_wrapper
def signature_skewness(signature, homology_dimension):
    """ Skewness of one signature curve (scipy.stats.skew with default settings).

    Parameters:
    - signature: signature of a single diagram
    - homology_dimension (int): which curve of the signature to use

    Returns:
    - skewness of ``signature[0][homology_dimension]``
    """
    curve = signature[0][homology_dimension]

    return skew(curve)

In [55]:
@signature_statistics_wrapper
def signature_kurtosis(signature, homology_dimension):
    """ Kurtosis of one signature curve (scipy.stats.kurtosis with default settings).

    Parameters:
    - signature: signature of a single diagram
    - homology_dimension (int): which curve of the signature to use

    Returns:
    - kurtosis of ``signature[0][homology_dimension]``
    """
    curve = signature[0][homology_dimension]

    return kurtosis(curve)

In [56]:
# Window-averaged moments of the Betti curves. signature and homology_dimension are
# placeholders that the wrapper ignores.
betti_curve_statistics["Mean"] = signature_mean(signatures = betti_curves, label_list = label_list, persistence_diagrams = persistence_diagrams, signature = None, homology_dimension = None)
betti_curve_statistics["Standard_Deviation"] = signature_standard_deviation(signatures = betti_curves, label_list = label_list, persistence_diagrams = persistence_diagrams, signature = None, homology_dimension = None)
betti_curve_statistics["Skewness"] = signature_skewness(signatures = betti_curves, label_list = label_list, persistence_diagrams = persistence_diagrams, signature = None, homology_dimension = None)
betti_curve_statistics["Kurtosis"] = signature_kurtosis(signatures = betti_curves, label_list = label_list, persistence_diagrams = persistence_diagrams, signature = None, homology_dimension = None)

# Silhouette Features

In [57]:
# Silhouette transformer with its default settings; one result per diagram,
# stored as silhouettes[label][idx]
SH = Silhouette()
silhouettes = precompute_signature(SH)

# Collects the silhouette statistics, keyed by statistic name
silhouette_statistics = {}

L1 norm, mean, SD, Skewness, Kurtosis

In [58]:
# compute_statistics_for_signature only exists inside signature_statistics_wrapper,
# so the decorated statistic functions are called directly, exactly as for the Betti curves.
silhouette_statistics["L1"] = L1_norm(signatures = silhouettes, label_list = label_list, persistence_diagrams = persistence_diagrams)

silhouette_statistics["Mean"] = signature_mean(signatures = silhouettes, label_list = label_list, persistence_diagrams = persistence_diagrams)
silhouette_statistics["Standard_Deviation"] = signature_standard_deviation(signatures = silhouettes, label_list = label_list, persistence_diagrams = persistence_diagrams)
silhouette_statistics["Skewness"] = signature_skewness(signatures = silhouettes, label_list = label_list, persistence_diagrams = persistence_diagrams)
silhouette_statistics["Kurtosis"] = signature_kurtosis(signatures = silhouettes, label_list = label_list, persistence_diagrams = persistence_diagrams)

# Save Signature Features

In [59]:
def create_feature_df(heat_kernel_statistics, betti_curve_statistics, silhouette_statistics, num_diagrams, label):
    """
    Build the feature DataFrame of a single label.

    Parameters:
    - heat_kernel_statistics (dict): ``statistic name -> {"Label_<label>": [dim 0 values, dim 1 values]}``
    - betti_curve_statistics (dict): same layout, for the Betti curves
    - silhouette_statistics (dict): same layout, for the silhouettes
    - num_diagrams (int): number of rows of the DataFrame; every value list must have this length
    - label (int): label whose statistics are used; also written to the "Label" column

    Returns:
    - Feature DataFrame (DataFrame) with one column per statistic and homology
      dimension (0 and 1), followed by the "Label" column
    """

    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))
    label_key = "Label_"+str(label)

    # Column-name prefix paired with the statistics it belongs to, in output order
    statistic_groups = (
        ("HeatKernel_Statistic_", heat_kernel_statistics),
        ("Betti_Curve_Statistic_", betti_curve_statistics),
        ("Silhouette_Statistic_", silhouette_statistics),
    )

    for prefix, group in statistic_groups:
        for stat in group.keys():
            for homology_dim in (0, 1):
                column = prefix+str(stat)+"Dim"+str(homology_dim)
                feature_df[column] = group[stat][label_key][homology_dim]

    # Label
    feature_df["Label"] = label

    return feature_df

In [60]:
# One feature DataFrame per label, keyed by "Label_<label>"
dataframes = {}

for label in label_list:
    label_key = "Label_"+str(label)
    # Row count = number of averaged windows of this label, read from the computed
    # statistics instead of a hard-coded value (every statistic has the same length per label)
    num_windows = len(next(iter(heat_kernel_statistics.values()))[label_key][0])
    dataframes[label_key] = create_feature_df(heat_kernel_statistics, betti_curve_statistics, silhouette_statistics, num_windows, label)

In [61]:
# Concatenate the per-label DataFrames in label_list order and save them as one CSV
feature_df = pd.concat([dataframes["Label_"+str(label)] for label in label_list], ignore_index=True)
feature_df.to_csv("Features/"+str(subject)+"/Signature_Statistics.csv")