In [732]:
import numpy as np
import pyedflib
import statistics
import plotly.graph_objects as go
import pandas as pd
from gtda.time_series import SingleTakensEmbedding
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceEntropy, Amplitude, NumberOfPoints, ComplexPolynomial, PersistenceLandscape, HeatKernel, Silhouette, BettiCurve, PairwiseDistance, PersistenceImage 
from gtda.plotting import plot_point_cloud, plot_heatmap, plot_diagram
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, FastICA
from gtda.pipeline import Pipeline 
from numpy.linalg import norm

pd.set_option('display.max_columns', None)

# Load Data and set important variables

In [733]:
# TODO change warnings

In [734]:
# Choose if you want to look at EEG or EMG data

data_type = "EEG"
data_type = "EMG"

In [735]:
# choose individuum
subject = "m292"

In [736]:
label_list = [0, 1, 2, 3, 4]

In [737]:
# Load persistence diagrams

persistence_diagrams  = np.load('Embeddings_and_Persistence_Diagrams/'+str(subject)+'/'+str(data_type)+'/Persistence_Diagrams_All_Labels.npy', \
    allow_pickle=True).item() # .item() to convert the dtype to dict again

In [738]:
# TODO do this in Preprocessing_And_Computing_...

reshaped_persistence_diagrams = {}

for label in label_list:
    reshaped_persistence_diagrams["Label_"+str(label)] = [persistence_diagram[0] for persistence_diagram in list(persistence_diagrams["Label_"+str(label)])]

persistence_diagrams = reshaped_persistence_diagrams

# HeatKernel Intensity

In a way, the Heat Kernel shows an "average distribution" of the persistence diagrams for each label, seperated per hole dimensionality.

In [739]:
HK = HeatKernel(sigma=0.00003, n_bins=100)

In [740]:
def heat_kernel_intensity(heatkernel, homology_dimension):
    """ Computes mean intensity of a heatkernel. Only takes positive values because otherwise the mean would
      always be zero.

    Parameters:
    - heatkernel (list of lists): heatkernel of all homology dimensions
    - homology_dimension (int): Which homology dimension to look at (0, 1 or 2)

    Returns:
    - mean intensity of heatkernel of homology dimension homology_dimension
    """
    
    positives = [x for inner_list in heatkernel[0][homology_dimension] for x in inner_list if x > 0]

    # Return 0 intensity if there are no positives (=> if there are no holes of homology_dimension in the persistence diagram)
    return np.mean(positives) if len(positives) > 0 else 0

In [741]:
def compute_all_heat_kernel_intensities(label_list, persistence_diagrams):
    """
    Compute heat kernel intensity for each dimension and label.

    Parameters:
    - label_list (list): List of labels.
    - persistence_diagrams (dict): Dictionary containing persistence diagrams for each label.

    Returns:
    - kernel_densities (list): List containing dictionaries of kernel intensities for each dimension.
    """
    kernel_densities = []

    for dim in range(3):
        kernel_intensity_dim = {}

        for label in label_list:
            kernel_intensity_dim["Label_" + str(label)] = []

            for diagram in persistence_diagrams["Label_" + str(label)]:
                heatkernel = HK.fit_transform([diagram.astype("float")])
                kernel_intensity_dim["Label_" + str(label)].append(heat_kernel_intensity(heatkernel, dim))

        
        kernel_densities.append(kernel_intensity_dim)

    return kernel_densities

kernel_densities = compute_all_heat_kernel_intensities(label_list, persistence_diagrams)

# L1 norms of Features

Using the L1 norm of the some features as additional ML feature improves the accuracy by a bit.

In [742]:
L1_norms = {}

In [743]:
def compute_L1_norm_for_signature(persistence_diagrams, label_list, SG):

    L1_norms = {}
    
    for label in label_list:
        
        L1_norms["Label_"+str(label)] = []
        
        L1_norm_dim0 = []
        L1_norm_dim1 = []
        L1_norm_dim2 = []

        for diagram in persistence_diagrams["Label_"+str(label)]:
            signature = SG.fit_transform([diagram.astype("float")])
            L1_norm_dim0.append(norm(signature[0][0], 1))
            L1_norm_dim1.append(norm(signature[0][1], 1))
            L1_norm_dim2.append(norm(signature[0][2], 1))

        L1_norms["Label_"+str(label)].append(L1_norm_dim0)
        L1_norms["Label_"+str(label)].append(L1_norm_dim1)
        L1_norms["Label_"+str(label)].append(L1_norm_dim2)

    return L1_norms
    

## Persistence Landscape

In [744]:
PL = PersistenceLandscape()

In [745]:
L1_norms["PD"] = compute_L1_norm_for_signature(persistence_diagrams, label_list, PL)

## Betti Curve

In [746]:
BC = BettiCurve()

In [747]:
L1_norms["BC"] = compute_L1_norm_for_signature(persistence_diagrams, label_list, BC)

# Entries of vectorizations as direct features

In [748]:
def compute_vectorizations(persistence_diagrams, label_list, SG):
    
    vectorizations = {}

    for label in label_list:
        
        vectorizations["Label_"+str(label)] = []
    
        for diagram in persistence_diagrams["Label_"+str(label)]:
            
            signature = SG.fit_transform([diagram.astype("float")])

            vectorizations["Label_"+str(label)].append(signature)

    return vectorizations



In [749]:
def reshape_vectorizations_type_1(vectorizations, label_list):
    """
    Reshape vectorizations of shape (num_persistence_diagrams, 1, 3, 100) to separate dimensions for each label.

    Parameters:
    - vectorizations (dict): Dictionary containing vectorizations for each label.
    - label_list (list): List of labels.

    Returns:
    - reshaped_vectorizations (dict): Dictionary containing reshaped vectorizations for each label and dimension.
    """
    # Initialize dictionary to store reshaped vectorizations
    reshaped_vectorizations = {}
    
    # Iterate over each label
    for label in label_list:
        # Initialize dictionary to store reshaped vectorizations for the current label
        reshaped_vectorizations["Label_" + str(label)] = {}

        for coordinate_idx in range(100):

            # For each vectorization coordinate (there are 100), initialize one dictionary
            # which will contain lists of 74 vectorization coordinates (the "vectorization_idx"st coordinate
            # of the 74 persistence images) as values and the homology dimensions as keys
            reshaped_vectorizations["Label_" + str(label)]["Coordinate_" + str(coordinate_idx)] = {}
 
            for hom_dim in range(3):
                # Initialize list to store reshaped vectorizations for the current homology dimension
                reshaped_vectorizations["Label_" + str(label)]["Coordinate_" + str(coordinate_idx)]["Hom_Dim_" + str(hom_dim)] = []

    
    # Iterate over each label
    for label in label_list:
        # Iterate over each vectorization for the current label
        for coordinate_idx in range(100):
            
            for hom_dim in range(3):
                
                for vectorization_idx in range(len(vectorizations["Label_" + str(label)])):

                    # Append the component corresponding to the current homology dimension to the list
                    reshaped_vectorizations["Label_" + str(label)]["Coordinate_" + str(coordinate_idx)]["Hom_Dim_" + str(hom_dim)].append(
                        vectorizations["Label_" + str(label)][vectorization_idx][0][hom_dim][coordinate_idx])

    return reshaped_vectorizations


In [750]:
# Initialize vectorizations
vectorizations = {}

## Persistence Landscape

In [751]:
vectorizations_before_reshaping = compute_vectorizations(persistence_diagrams, label_list, PL)
vectorizations["PL"] = reshape_vectorizations_type_1(vectorizations_before_reshaping, label_list)

## Betti Curve

In [765]:
vectorizations_before_reshaping = compute_vectorizations(persistence_diagrams, label_list, BC)
vectorizations["BC"] = reshape_vectorizations_type_1(vectorizations_before_reshaping, label_list)

## Persistence Image

In [767]:
#PI = PersistenceImage(sigma=0.1, n_bins=100)

In [768]:
# TODO create function reshape_vectorizations_type_2() for reshaping persistence image vectorizations

#vectorizations_before_reshaping = compute_vectorizations(persistence_diagrams, label_list, PI)
#vectorizations["PI"] = reshape_vectorizations_type_2(vectorizations_before_reshaping, label_list)

## Silhouette

In [769]:
SH = Silhouette()

In [770]:
vectorizations_before_reshaping = compute_vectorizations(persistence_diagrams, label_list, SH)
vectorizations["SH"] = reshape_vectorizations_type_1(vectorizations_before_reshaping, label_list)

# Save Signature Features

In [771]:
def create_feature_df(data_type, kernel_densities, L1_norms, num_diagrams, vectorizations, label):
    """
    Create DataFrame for each label from features

    Parameters:
    - kernel_densities (list): intensities of heatkernel
    - L1_norms (list): L1 norms of signatures
    - num_diagrams (int): How many diagrams are there in total?
    - label (int): Label for which we want to create a dataframe. 1, 3, 5 or 7.

    Returns:
    - Feature DataFrame (DataFrame)
    """
    
    feature_df = pd.DataFrame(index=np.arange(0, num_diagrams))

    for homology_dim in range(3):
        feature_df[str(data_type)+"_Kernel_Intensity_Dim_"+str(homology_dim)] = kernel_densities[homology_dim]["Label_"+str(label)]

    for signature in L1_norms.keys():
        for homology_dim in range(3):
            feature_df[str(data_type)+"_L1_Norm_"+str(signature)+"_Dim_"+str(homology_dim)] = L1_norms[signature]["Label_"+str(label)][homology_dim]

    for signature in vectorizations.keys():
        for hom_dim in range(3):
            for coordinate_idx in range(100):
                feature_df[str(data_type)+"_"+str(signature)+"_Vectorization_Coordinate_"+str(coordinate_idx)+\
                "_Homology_Dim_"+str(homology_dim)] = vectorizations[signature]["Label_"+str(label)]["Coordinate_" + \
                str(coordinate_idx)]["Hom_Dim_" + str(hom_dim)]
        
    # Label
    feature_df["Label"] = label

    return feature_df

In [772]:
dataframes = {}

for label in label_list:
    dataframes["Label_"+str(label)] = create_feature_df(data_type, kernel_densities, L1_norms, len(persistence_diagrams["Label_"+str(label)]), vectorizations, label)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [759]:
df = dataframes["Label_"+str(0)] 

df["EMG_Kernel_Intensity_Dim_0"]

0     4.410392e+07
1     1.946777e+08
2     2.190124e+08
3     1.703430e+08
4     1.946777e+08
          ...     
69    2.190124e+08
70    1.460083e+08
71    1.703430e+08
72    1.460083e+08
73    1.703430e+08
Name: EMG_Kernel_Intensity_Dim_0, Length: 74, dtype: float64

In [760]:
# Concatenate and save features of training persistence diagrams
# TODO make the creation of this dataframe nicer
feature_df = pd.concat([dataframes["Label_"+str(0)], dataframes["Label_"+str(1)], dataframes["Label_"+str(2)], dataframes["Label_"+str(3)], dataframes["Label_"+str(4)]], ignore_index=True)
feature_df.to_csv("Features/"+str(subject)+"/"+str(data_type)+"/Signature_Statistics.csv")

In [775]:
feature_df.columns

Index(['EMG_Kernel_Intensity_Dim_0', 'EMG_Kernel_Intensity_Dim_1',
       'EMG_Kernel_Intensity_Dim_2', 'EMG_L1_Norm_PD_Dim_0',
       'EMG_L1_Norm_PD_Dim_1', 'EMG_L1_Norm_PD_Dim_2', 'EMG_L1_Norm_BC_Dim_0',
       'EMG_L1_Norm_BC_Dim_1', 'EMG_L1_Norm_BC_Dim_2',
       'EMG_PL_Vectorization_Coordinate_0_Homology_Dim_2',
       ...
       'EMG_SH_Vectorization_Coordinate_91_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_92_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_93_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_94_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_95_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_96_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_97_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_98_Homology_Dim_2',
       'EMG_SH_Vectorization_Coordinate_99_Homology_Dim_2', 'Label'],
      dtype='object', length=310)