In [1]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, ForgetDimension
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from numpy.linalg import norm

# Load Data and set important variables

In [2]:
# Choose the subject (individual) whose data is processed.
# The identifier is inserted into the input path of the persistence diagrams
# and into the output path of the feature CSV.
subject = "m294"

In [3]:
# Labels to process. Each label is used as a key into persistence_diagrams and
# appears as "Label_<label>" in the feature dictionaries built below.
label_list = [0, 1, 2, 3, 4]

In [4]:
# Load persistence diagrams
# The file stores a pickled Python object (hence allow_pickle=True); .item() unwraps
# the single stored object from the array returned by np.load.
# Presumably this is a dict keyed by label, since it is indexed as
# persistence_diagrams[label] further down -- verify against the notebook that writes it.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.

persistence_diagrams = np.load("Embeddings_and_Persistence_Diagrams/"+str(subject)+"/Persistence_Diagrams.npy", allow_pickle=True).item()

# HeatKernel Intensity

In a way, the heat kernel shows an "average distribution" of the persistence diagrams for each label, separated by homology dimension.

In [5]:
# Heat-kernel transformer that is fitted and applied to one persistence diagram at a time.
# NOTE(review): the rationale for sigma=0.00003 and n_bins=100 is not documented here -- confirm.
HK = HeatKernel(sigma=0.00003, n_bins=100)

In [6]:
def heat_kernel_intensity(heatkernel, homology_dimension):
    """ Computes the mean intensity of a heat kernel over its strictly positive entries.

    Only positive values are averaged because, as noted by the original author,
    the mean over all values would otherwise always be zero.

    Parameters:
    - heatkernel (array-like): heat kernel of all homology dimensions for a single
      diagram, indexed as heatkernel[0][homology_dimension] -> 2D grid of values
    - homology_dimension (int): Which homology dimension to look at (0, 1 or 2)

    Returns:
    - mean of the positive entries of the selected grid, or 0.0 when the grid has
      no positive entry (instead of NaN plus a "Mean of empty slice" warning,
      which would turn every downstream window average into NaN)
    """

    grid = np.asarray(heatkernel[0][homology_dimension])

    # A boolean mask selects the positive entries in row-major order, i.e. the same
    # values in the same order as iterating over the rows and then the entries.
    positives = grid[grid > 0]

    if positives.size == 0:
        # An all-zero (or non-positive) kernel carries no intensity.
        return 0.0

    return positives.mean()

In [7]:
# NOTE(review): FG is not referenced by any other cell visible in this notebook;
# remove it if it is indeed unused.
FG = ForgetDimension()

In [8]:
# Number of consecutive persistence diagrams that are averaged into one feature value.
# The imaging data has 20Hz, therefore we have to average 80 images to get 4 seconds
amount_to_average = 80

kernel_intensity_dim0 = {}
kernel_intensity_dim1 = {}

for label in label_list:

    label_key = "Label_"+str(label)
    diagrams_of_label = persistence_diagrams[label]

    # Only complete windows are used; trailing diagrams that do not fill a window are dropped
    n_windows = int(len(diagrams_of_label)/amount_to_average)

    window_means_dim0 = []
    window_means_dim1 = []

    for window in range(n_windows):

        first_diagram = amount_to_average*window

        intensities_dim0 = []
        intensities_dim1 = []

        # The heat kernel is fitted on every diagram individually
        for offset in range(amount_to_average):
            heatkernel = HK.fit_transform([diagrams_of_label[first_diagram+offset].astype("float")])
            intensities_dim0.append(heat_kernel_intensity(heatkernel, 0))
            intensities_dim1.append(heat_kernel_intensity(heatkernel, 1))

        # One averaged intensity per window and homology dimension
        window_means_dim0.append(np.mean(intensities_dim0, axis=0))
        window_means_dim1.append(np.mean(intensities_dim1, axis=0))

    kernel_intensity_dim0[label_key] = window_means_dim0
    kernel_intensity_dim1[label_key] = window_means_dim1

# Position in the list corresponds to the homology dimension (0, then 1)
kernel_densities = [kernel_intensity_dim0, kernel_intensity_dim1]

# L1 norms of Features

Using the L1 norms of some of the signatures as additional ML features improves the accuracy slightly.

In [9]:
# Maps a signature name to the per-label L1-norm features returned by
# compute_L1_norm_for_signature; filled in by the cells below.
L1_norms = {}

In [10]:
def compute_L1_norm_for_signature(persistence_diagrams, label_list, SG, amount_to_average=80):
    """
    Compute window-averaged L1 norms of a signature for homology dimensions 0 and 1.

    For every label, the diagrams are split into consecutive windows of
    amount_to_average diagrams. The signature is computed for every diagram
    individually, its L1 norm is taken per homology dimension, and the norms
    are averaged over the window. Diagrams at the end that do not fill a
    complete window are ignored.

    Parameters:
    - persistence_diagrams (mapping): maps each label to an indexable sequence of
      persistence diagrams (arrays supporting .astype)
    - label_list (list): labels to process
    - SG (transformer): object with a fit_transform method that takes a list with one
      diagram and returns something indexed as result[0][homology_dimension]
    - amount_to_average (int): number of consecutive diagrams per window. The default
      of 80 corresponds to 4 seconds of imaging data recorded at 20Hz.

    Returns:
    - dict mapping "Label_<label>" to a list [norms_dim0, norms_dim1], where each
      entry is a list with one averaged L1 norm per window

    Raises:
    - ValueError if amount_to_average is smaller than 1
    """

    if amount_to_average < 1:
        raise ValueError("amount_to_average must be at least 1")

    L1_norms = {}

    for label in label_list:

        diagrams_of_label = persistence_diagrams[label]
        n_windows = len(diagrams_of_label) // amount_to_average

        L1_norm_dim0 = []
        L1_norm_dim1 = []

        for idx in range(n_windows):

            norm_dim0 = []
            norm_dim1 = []

            for counter in range(amount_to_average):

                diagram = diagrams_of_label[amount_to_average*idx+counter].astype("float")

                # NOTE(review): the transformer is re-fitted on every single diagram, so any
                # fit-dependent sampling is presumably chosen per diagram -- confirm this is intended.
                signature = SG.fit_transform([diagram])
                norm_dim0.append(norm(signature[0][0], 1))
                norm_dim1.append(norm(signature[0][1], 1))

            L1_norm_dim0.append(np.mean(norm_dim0, axis=0))
            L1_norm_dim1.append(np.mean(norm_dim1, axis=0))

        # Position in the list corresponds to the homology dimension (0, then 1)
        L1_norms["Label_"+str(label)] = [L1_norm_dim0, L1_norm_dim1]

    return L1_norms
    

## Persistence Landscape

In [11]:
# Persistence-landscape transformer with the library's default parameters
PL = PersistenceLandscape()

In [12]:
# NOTE(review): the persistence-landscape norms are stored under the key "PD". The key ends up
# in the CSV column names ("L1_Norm_PDDim0", "L1_Norm_PDDim1"), so rename it only together with
# every consumer of the saved features.
L1_norms["PD"] = compute_L1_norm_for_signature(persistence_diagrams, label_list, PL)

## Betti Curve

In [13]:
# Betti-curve transformer with the library's default parameters
BC = BettiCurve()

In [14]:
# Window-averaged L1 norms of the Betti curves; the key "BC" becomes part of the
# CSV column names ("L1_Norm_BCDim0", "L1_Norm_BCDim1").
L1_norms["BC"] = compute_L1_norm_for_signature(persistence_diagrams, label_list, BC)

# Save Signature Features

In [15]:
def create_feature_df(kernel_densities, L1_norms, num_diagrams, label):
    """
    Assemble the feature table of a single label.

    Parameters:
    - kernel_densities (list of dicts): heat kernel intensities; entry i maps
      "Label_<label>" to the intensities of homology dimension i
    - L1_norms (dict): maps a signature name to a dict that maps "Label_<label>"
      to a list holding the L1 norms of homology dimension 0 and 1
    - num_diagrams (int): number of rows of the resulting DataFrame; every feature
      list of the label has to contain exactly this many values
    - label (int): label for which the DataFrame is created

    Returns:
    - Feature DataFrame (DataFrame) with one column per kernel intensity and
      L1 norm feature, plus a constant "Label" column
    """

    label_key = "Label_"+str(label)
    n_homology_dims = 2

    # The index fixes the number of rows up front
    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))

    # Heat kernel intensities, one column per homology dimension
    for homology_dim in range(n_homology_dims):
        column = "Kernel_Intensity_Dim"+str(homology_dim)
        feature_df[column] = kernel_densities[homology_dim][label_key]

    # L1 norms, one column per signature and homology dimension
    for signature, norms_by_label in L1_norms.items():
        for homology_dim in range(n_homology_dims):
            column = "L1_Norm_"+str(signature)+"Dim"+str(homology_dim)
            feature_df[column] = norms_by_label[label_key][homology_dim]

    # Label
    feature_df["Label"] = label

    return feature_df

In [16]:
# One feature DataFrame per label
dataframes = {}

for label in label_list:
    label_key = "Label_"+str(label)

    # Derive the number of rows from the computed features instead of hard-coding it,
    # so that subjects with a different number of averaged windows work as well.
    num_windows = len(kernel_densities[0][label_key])

    dataframes[label_key] = create_feature_df(kernel_densities, L1_norms, num_windows, label)

In [17]:
# Concatenate and save features of training persistence diagrams
# The frames are stacked in the order of label_list, so the set of labels is defined in one place only.
feature_df = pd.concat([dataframes["Label_"+str(label)] for label in label_list], ignore_index=True)

# NOTE(review): the target directory has to exist already; to_csv does not create it.
feature_df.to_csv("Features/"+str(subject)+"/Signature_Statistics.csv")