# Consistency: Are the explanations consistent for similar classifiers?

* Train many logistic regression classifiers with different hyperparameters
* Get explanations for each sample in the test set and each classifier
* Report Lipschitz estimate for explanations of each classifier

Methodology for Sample Selection:
* For each class, randomly select one true positive sample. Then select the 2 additional true positive samples of that class with the smallest Euclidean distance to the initially chosen sample.


Data Formats Before Wrappers: 
* Timeseries: 3D numpy array

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import gc
import itertools
import logging
from multiprocessing import Pool
import functools
import sys 
import random
from pathlib import Path
import time
import mlrose_ky as mlrose
#import mlrose
import h5py

from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, Dropout, BatchNormalization, Activation, Add, Flatten, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (ModelCheckpoint, TensorBoard, ReduceLROnPlateau,
                                        CSVLogger, EarlyStopping)
from tensorflow.keras.models import load_model

from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import NuSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
import tensorflow
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm import tqdm

import ecg_analysis.data 
import ecg_analysis.classifier
import explainers_benchmarking as explainers

import datasets

from explainable_model_ECG import ClfModel as ClfModel
from explainable_data_ECG import ClfData as ClfData

2025-04-09 20:31:33.262525: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-09 20:31:33.266045: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-09 20:31:33.277525: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744245093.296444 2479619 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744245093.302218 2479619 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 20:31:33.322924: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

## Step 1: Load in Testing Data, Select Samples of Interest

In [3]:
# Load the ECG test set, run the pretrained model on it, and save the raw
# per-class scores to disk so later cells can reload them with np.load.

# load testing dataset 
path_to_hdf5_test = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/CODE/ecg_tracings.hdf5"
dataset_name_test = "tracings"  

# Import data. seq is an instance of datasets.ECGSequence; it is reused by
# later cells (seq._getsample_) to pull individual test points.
seq = datasets.ECGSequence(path_to_hdf5_test, dataset_name_test)  # no batch size passed, so the class default applies

# load pretrained model (compiled on the next statement)
model_path = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/PretrainedModels/model/model.hdf5"
pre_model = load_model(model_path)  

# compile and apply model to testing dataset
pre_model.compile(loss='binary_crossentropy', optimizer=Adam())
model_predictions = pre_model.predict(seq,verbose=1)   # model_predictions is a numpy array of per-class scores; the recorded run printed shape (827, 6)

# sanity check: shape and the first five rows only (never dump the whole array)
print(model_predictions.shape)
print(model_predictions[:5])

# Persist the raw scores as .npy (this writes a numpy array, not a dataframe)
np.save("/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/dnn_output.npy", model_predictions)
print("Output predictions saved")

# expected model input shape; the recorded run printed (None, 4096, 12)
print(pre_model.input_shape)


W0000 00:00:1744245105.843797 2479619 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step
(827, 6)
[[1.4243224e-06 1.0710077e-07 2.6336946e-07 4.5377445e-07 9.4853863e-07
  6.4135390e-09]
 [2.8897351e-02 2.0066681e-03 3.1778637e-01 2.8277384e-05 4.8343472e-02
  3.2049985e-04]
 [3.1124635e-04 2.9402861e-05 4.1752292e-06 1.9712777e-05 9.3489951e-03
  2.4932497e-05]
 [2.3969111e-09 1.7344941e-09 6.9393674e-10 8.1738605e-10 5.6821343e-09
  2.7672636e-10]
 [5.3062342e-04 3.5334501e-06 3.3941697e-07 1.4301397e-06 2.2422880e-04
  4.7077333e-06]]
Output predictions saved
(None, 4096, 12)


In [4]:
def ECG_one_d_labels(model_predictions, onehot_labels = True):
    '''
    Purpose: collapse an (N, 6) array of per-class scores into a length-N list of class labels.

    Class 0 is the synthetic "no abnormality" class that this function prepends;
    classes 1..6 correspond to input columns 0..5.

    Input:
    model_predictions: 2D array (N, 6). Either model probabilities
        (onehot_labels=False) or 0/1 indicator labels (onehot_labels=True).
    onehot_labels: True  -> a column is "on" when its value equals 1
                   False -> a column is "on" when its probability reaches its threshold

    Output:
    list of length N holding one entry per sample:
    - onehot_labels=True : the class index, or a sorted tuple of class indices when
      several columns are on (multi-label ground truth); 0 when no column is on
    - onehot_labels=False: a single class index; when several classes reach their
      threshold, the one whose probability exceeds its threshold by the largest
      margin wins

    Comments:
    With onehot_labels=True the input shape is printed (kept from the original
    implementation; later cells show it in their output).
    '''
    model_predictions = np.asarray(model_predictions)

    # Per-row margins (score - threshold); only needed for the probability branch.
    margins = None

    if not onehot_labels:
        # decision thresholds for the six abnormality classes
        threshold = np.array([0.124, 0.07, 0.05, 0.278, 0.390, 0.174])
        # "normal" score: 1 when no class exceeds its threshold, decreasing as
        # classes exceed their thresholds by a larger fraction of the remaining range
        exceedances = 1 - (np.maximum((model_predictions - threshold), 0) / (1 - threshold))
        normal_prob = np.mean(exceedances, axis = 1, keepdims = True)  # (N, 1)
        # prepend the normal score as class 0
        probability_n = np.column_stack((normal_prob, model_predictions))
        # class 0 only passes when its score is exactly 1 (nothing else passed)
        new_threshold = np.array([1, 0.124, 0.07, 0.05, 0.278, 0.390, 0.174])

        mask = probability_n >= new_threshold
        # BUG FIX: the tie-break below must use the probabilities, not the boolean
        # mask. Subtracting the thresholds from the mask always favoured the
        # passing class with the smallest threshold.
        margins = probability_n - new_threshold
    else:
        print(model_predictions.shape)

        mask = model_predictions == 1

        # rows without any positive column are assigned to class 0
        no_positive_class = ~mask.any(axis=1)

        # prepend the class-0 indicator column
        mask = np.column_stack((no_positive_class, mask))

    sample_classes = []
    for row_idx, row in enumerate(mask):
        passing_indices = np.where(row)[0]
        if len(passing_indices) > 1:  # more than one class passes
            if not onehot_labels:
                # keep the passing class with the largest exceedance over its threshold
                best = passing_indices[np.argmax(margins[row_idx][passing_indices])]
                sample_classes.append(best)
            else:
                # multi-label ground truth: keep all classes, sorted ascending
                sample_classes.append(tuple(sorted(passing_indices)))
        elif len(passing_indices) == 0:  # no passes
            sample_classes.append(0)
        else:
            sample_classes.append(passing_indices[0])

    return sample_classes


In [5]:
# Build 1D predicted and true label lists for the test set.

# reload the raw scores saved by the prediction cell
model_predictions = np.load("/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/dnn_output.npy")
# y_pred: one class index per sample, obtained by thresholding the probabilities
y_pred = ECG_one_d_labels(model_predictions, onehot_labels = False)

# y_true: gold-standard annotations, read as a 2D array of 0/1 indicators
y_true_2D = pd.read_csv('/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/CODE/annotations/gold_standard.csv').values
# convert 2D to 1D; samples with several positive columns become tuples of class indices
y_true = ECG_one_d_labels(y_true_2D, onehot_labels = True)

(827, 6)


In [6]:
# select indices/conditions for CoMTE: keep test samples whose true and
# predicted class both equal the selected values below
true_select = 0 #UPDATE HERE FOR OTHER CLASSES
pred_select = 0 #UPDATE HERE FOR OTHER CLASSES

# find relevant indices
# NOTE: this prints one line per test sample, and the loop variables
# (idx, true, pred) stay bound in the notebook namespace after the loop.
# Multi-label samples have a tuple as `true` and therefore never equal an
# integer true_select.
indices_test = []
for idx, (true, pred) in enumerate(zip(y_true, y_pred)):
    print(f"Index:{idx}, True Label: {true}, Predicted Label: {pred}") # print elements
    if true ==  true_select and pred == pred_select:
        indices_test.append(idx)   
        
print('\n\n\n')
print(f"The {indices_test} indices match the case defined above:\n(true_select = {true_select}, pred_select = {pred_select})")

Index:0, True Label: 0, Predicted Label: 0
Index:1, True Label: 3, Predicted Label: 3
Index:2, True Label: 0, Predicted Label: 0
Index:3, True Label: 0, Predicted Label: 0
Index:4, True Label: 0, Predicted Label: 0
Index:5, True Label: 0, Predicted Label: 0
Index:6, True Label: 0, Predicted Label: 0
Index:7, True Label: 0, Predicted Label: 0
Index:8, True Label: 0, Predicted Label: 0
Index:9, True Label: 0, Predicted Label: 0
Index:10, True Label: 0, Predicted Label: 0
Index:11, True Label: 0, Predicted Label: 0
Index:12, True Label: 1, Predicted Label: 1
Index:13, True Label: 0, Predicted Label: 0
Index:14, True Label: 0, Predicted Label: 0
Index:15, True Label: (np.int64(1), np.int64(3)), Predicted Label: 3
Index:16, True Label: 0, Predicted Label: 0
Index:17, True Label: 0, Predicted Label: 0
Index:18, True Label: (np.int64(1), np.int64(3)), Predicted Label: 3
Index:19, True Label: 0, Predicted Label: 0
Index:20, True Label: 0, Predicted Label: 0
Index:21, True Label: 0, Predicted L

In [7]:
# select testing point

# get testing point; 253 is a hard-coded sample position in the test sequence
# NOTE(review): presumably picked from the indices printed above — confirm
test_point = seq._getsample_(253)
# wrap test point into the dataframe layout used by the explainers
test_df = ClfData.wrap_df_test_point(test_point)

# the recorded run printed (4096, 12)
print(test_df.shape)

(4096, 12)


## Step 2: Select Nearest Neighbor Samples

In [8]:
# Methods for getting nearest neighbor samples
from collections import defaultdict
from sklearn.neighbors import KDTree
class nearest_neighbor_samples:
    '''
        Purpose:
        - find true positives (samples whose label equals the classifier's prediction)
        - index them in one KDTree per class
        - for each class of interest, draw one random true positive and return it
          together with its nearest true-positive neighbours of the same class

        Typical use: nearest_neighbor_samples(clf, timeseries, labels).get_all_samples()
    '''

    def __init__(self, clf, timeseries, labels, silent=True,
                 num_distractors=2, dont_stop=False):
        '''
            clf: wrapped classifier exposing predict(timeseries) and classes_;
                 optionally metrics and window_size
            timeseries: pandas DataFrame holding all samples, addressable by node_id
                 through .loc  # assumes a (node_id, time) MultiIndex — TODO confirm
            labels: DataFrame with a 'label' column, one row per sample
            silent: when False, a log line is emitted once the trees are built
            num_distractors, dont_stop: stored on the instance only; the methods of
                 this class do not read them (see get_all_samples' n_neighbors)
        '''
        self.clf = clf
        self.timeseries = timeseries
        self.labels = labels
        self.silent = silent
        self.num_distractors = num_distractors
        if hasattr(clf, "metrics") and clf.metrics is not None:
            self.metrics = clf.metrics
        else:
            self.metrics = self.clf.steps[0][1].column_names
        self.dont_stop = dont_stop
        if hasattr(clf, "window_size") and clf.window_size is not None:
            self.window_size = clf.window_size
        else:
            self.window_size = len(timeseries.loc[
                timeseries.index.get_level_values('node_id')[0]])
        self.tree = None
        self.per_class_trees = None
        self.per_class_node_indices = None
        # classes for which samples are drawn (class 0 is deliberately left out)
        self.classes_to_test = [1,2,3,4,5,6]

    def _slice_node(self, node_id):
        """Return the rows of self.timeseries that belong to node_id."""
        # The syntax timeseries.loc[[node_id], :, :] is fragile: anything other
        # than ":" after the first indexer crashes, see the Warning at
        # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#using-slicers
        try:
            return self.timeseries.loc[[node_id], :, :]
        except pd.errors.IndexingError:  # try slicing with fallback
            return self.timeseries.loc[[node_id], :]

    def construct_per_class_trees(self):
        """Build one KDTree per class over the flattened true-positive samples.

        If the trees already exist, only their sizes are printed. Classes without
        any true positive get no tree.
        """
        if self.per_class_trees is not None:
            for c, tree in self.per_class_trees.items():
                num_indices = len(tree.data)  # The number of points in the KDTree
                print(f"Class {c} has {num_indices} indices.")
            return
        self.per_class_trees = {}
        self.per_class_node_indices = {c: [] for c in self.clf.classes_}
        print('making predictions for per class trees')
        preds = self.clf.predict(self.timeseries)

        from collections import Counter
        # report how often each class was predicted
        print('Validate Predictions')
        counter = Counter(preds)
        for item, freq in counter.items():
            print(f"{item}: {freq}")

        true_positive_node_ids = {c: [] for c in self.clf.classes_}

        for pred, (idx, row) in zip(preds, self.labels.iterrows()):
            if isinstance(row['label'], tuple):  # skip tuples for now - not sure how to handle them in MLRose Optimization
                continue
            if row['label'] == pred:
                # wrap single datapoints so idx[0] works for both scalar and tuple
                # index labels (numpy integers count as scalars as well)
                if isinstance(idx, (int, np.integer)):
                    idx = [idx]
                true_positive_node_ids[pred].append(idx[0])

        # validation
        print("\nTrue Positive Dictionary Stats")
        for key, value in true_positive_node_ids.items():
            print(f"Key: {key}, Length of value: {len(value)}")

        print('making per class trees')
        print(self.clf.classes_)
        for c in self.clf.classes_:
            print(c)
            dataset = []
            for node_id in true_positive_node_ids[c]:
                sliced_node = self._slice_node(node_id)
                dataset.append(sliced_node.values.T.flatten())
                # position in this list == position of the sample in the class tree
                self.per_class_node_indices[c].append(node_id)
            if dataset:
                self.per_class_trees[c] = KDTree(np.stack(dataset))
        if not self.silent:
            logging.info("Finished constructing per class kdtree")

    def _draw_random_tps(self):
        """Draw one random true positive per class in self.classes_to_test.

        Returns a list of (class_id, tree_position, sample) so callers can
        exclude the drawn sample from a later neighbour query.
        """
        if self.per_class_trees is None:
            self.construct_per_class_trees()

        # print out the structure of the KD tree
        print("\nKDTree structure (underlying tree representation):")
        print(self.per_class_trees)

        drawn = []
        for c in self.classes_to_test:
            if c in self.per_class_trees:  # Ensure the class has a KDTree
                # Get a random position inside the class tree
                random_idx = np.random.randint(len(self.per_class_trees[c].data))
                print(f'random index is {random_idx}')
                random_sample = self._slice_node(self.per_class_node_indices[c][random_idx])
                drawn.append((c, random_idx, random_sample))
            else:
                print(f"No KDTree found for class {c}")
        return drawn

    def get_random_tps(self):
        '''
            Generate one random true-positive sample for each class in
            self.classes_to_test that has a KDTree.

            Returns a list of (class_id, sample) tuples.
        '''
        return [(c, sample) for c, _, sample in self._draw_random_tps()]

    def get_all_samples(self, n_neighbors=2):
        '''
            Generate, per class, the nearest true-positive neighbours of one random
            true positive, followed by that random sample itself.

            n_neighbors: number of neighbours to return per class in addition to
                the random sample (fewer are returned when the class has fewer
                true positives).

            Returns a defaultdict mapping class_id -> list of (class_id, sample).
        '''
        print('Getting all samples')
        # start with constructing per class KD trees
        self.construct_per_class_trees()

        # get random true positives together with their position in the tree
        drawn = self._draw_random_tps()

        distractors = defaultdict(list)
        print(len(drawn))
        for class_id, tree_pos, sample in drawn:
            print(class_id)
            print(sample)

            tree = self.per_class_trees[class_id]
            # The drawn sample is itself part of the tree and would come back as
            # its own nearest neighbour, so ask for one extra hit and drop it.
            # Cap k at the tree size because KDTree.query rejects larger values.
            k = min(n_neighbors + 1, len(tree.data))
            hits = tree.query(sample.values.T.flatten().reshape(1, -1), k=k)[1].flatten()
            neighbour_positions = [int(pos) for pos in hits if pos != tree_pos][:n_neighbors]

            for idx in neighbour_positions:
                print(f'idx for nn distractor is: {idx}')

                try:
                    sliced_distractor = self.timeseries.loc[[self.per_class_node_indices[class_id][idx]], :, :]
                except pd.errors.IndexingError: # try slicing with fallback
                    sliced_distractor = self.timeseries.loc[[self.per_class_node_indices[class_id][idx]], :]
                    # NOTE(review): kept from the original fallback. It re-indexes the
                    # slice by tree position (not node_id) and only works for
                    # single-row slices — confirm this is intended.
                    sliced_distractor['node_id'] = [idx]
                    sliced_distractor.set_index('node_id', inplace=True) # aka sample_id

                distractors[class_id].append((class_id, sliced_distractor))
            distractors[class_id].append((class_id, sample))

        return distractors

In [9]:
# define classifier methods

"""
Part 1: A Classifier that works with COMLEX

The classifier must have 2 capabilities:
1. Predict a class ie: class 0 in classes {0, 1}
2. Predict the probability for each class
-ie: [0.1, 0.9]

and

Be able to execute capability 1 and 2 on a PANDAS dataframe,
returning an array of corresponding predictions.



input:
    samples to be classified (pandas multiindex dataframe)

output: 
    for contrived_classification: length N list of classes

    for contrived_classification_proba: 
            length N list of 1x7 np arrays
"""

# Load and compile the pretrained model. This repeats the load done in the
# test-data cell and rebinds the notebook-global pre_model, which the
# classifier wrappers defined below read.
model_path = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/PretrainedModels/model/model.hdf5"
pre_model = load_model(model_path)  
pre_model.compile(loss='binary_crossentropy', optimizer=Adam())


class BasicClassifier:
    """
    Adapter that exposes the pretrained CNN through two static callables that
    accept pandas dataframes:

    - contrived_classification(pandas_dfs)       -> list of N class indices (0..6)
    - contrived_classification_proba(pandas_dfs) -> (N, 7) array of scores

    Class 0 is a synthetic "normal" class whose score is derived from how far the
    six CNN outputs exceed their decision thresholds; classes 1..6 are the CNN
    outputs in their original order.
    """
    classifier = pre_model  # tensorflow CNN

    # decision thresholds applied to the six CNN outputs
    _THRESHOLDS = np.array([0.124, 0.07, 0.05, 0.278, 0.390, 0.174])
    # directory that receives the scratch HDF5 file handed to datasets.ECGSequence
    _TEMP_DIR = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData"

    @staticmethod
    def _predict_raw(pandas_dfs):
        """Run the CNN on the samples in pandas_dfs and return its raw (N, 6) output.

        The dataframe is reshaped to (N, 4096, 12), written to a scratch HDF5 file
        and fed to the model through datasets.ECGSequence.
        """
        import os
        import tempfile

        # convert 2D pandas df to 3D array (N,4096,12)
        array_3d = pandas_dfs.to_numpy().reshape(-1, 4096, 12)

        temp_dataset_name = "tracings"
        # A unique file per call: a fixed path is overwritten or deleted by
        # concurrent callers (the explainer is created with threads=4).
        fd, temp_path = tempfile.mkstemp(suffix=".hdf5", dir=BasicClassifier._TEMP_DIR)
        os.close(fd)
        try:
            # write and close the file before it is reopened for reading
            with h5py.File(temp_path, 'w') as hdf_file:
                hdf_file.create_dataset(temp_dataset_name, data=array_3d)

            modified_instance = datasets.ECGSequence(temp_path, temp_dataset_name)
            try:
                return pre_model.predict(modified_instance, verbose=0)
            finally:
                modified_instance._closehdf()
        finally:
            # always clean up, also when prediction fails
            if os.path.exists(temp_path):
                os.remove(temp_path)

    @staticmethod
    def _with_normal_class(probability):
        """Prepend the per-sample "normal" score as column 0, giving an (N, 7) array."""
        threshold = BasicClassifier._THRESHOLDS
        # 1 when a class stays below its threshold, dropping towards 0 as the
        # class exceeds its threshold by a larger share of the remaining range
        exceedances = 1 - (np.maximum((probability - threshold), 0) / (1 - threshold))
        normal_prob = np.mean(exceedances, axis=1, keepdims=True)  # (N, 1)
        return np.column_stack((normal_prob, probability))

    @staticmethod
    def contrived_classification(pandas_dfs):
        """Return a list with one class index (0..6) per sample in pandas_dfs.

        A class passes when its score reaches its threshold; class 0 passes only
        with a score of 1. When several classes pass, the one exceeding its
        threshold by the largest margin is returned.
        """
        probability = BasicClassifier._predict_raw(pandas_dfs)
        probability_n = BasicClassifier._with_normal_class(probability)

        new_threshold = np.concatenate(([1.0], BasicClassifier._THRESHOLDS))
        passing_mask = probability_n >= new_threshold

        sample_classes = []
        for row, row_mask in zip(probability_n, passing_mask):
            passing_indices = np.where(row_mask)[0]
            if len(passing_indices) > 1:  # If more than one indices pass
                # append the index that has the highest margin over its threshold
                sample_classes.append(np.argmax(row - new_threshold))
            elif len(passing_indices) == 0:  # no passes
                sample_classes.append(0)
            else:
                sample_classes.append(passing_indices[0])

        return sample_classes

    @staticmethod
    def contrived_classification_proba(pandas_dfs):
        """Return an (N, 7) array: the normal score followed by the six CNN outputs.

        The normal score is computed per sample. The previous implementation
        averaged over all samples and inserted into the flattened array, which was
        only correct for a single sample.
        """
        probability = BasicClassifier._predict_raw(pandas_dfs)
        return BasicClassifier._with_normal_class(probability)




In [10]:
# Wrap the test set and the classifier in the dataframe-based interfaces used
# by the sampler and the explainers.

# convert x_test from np into pandas
hdf5_test_path = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/CODE/ecg_tracings.hdf5"
dataset_name_hdf_tracings = 'tracings'
# read the tracings fully into memory and close the file again
# (the handle was previously left open for the lifetime of the kernel)
with h5py.File(hdf5_test_path, "r") as f:
    x_test = np.array(f[dataset_name_hdf_tracings])
num_features = 12
pd_test_points = ClfData.wrap_df_x(x_test, num_features)

# make y_test (2D one-hot --> 1D)
y_true_2D = pd.read_csv('/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/CODE/annotations/gold_standard.csv').values
y_test = ClfData.wrap_df_y(y_true_2D)


# wrap model: class 0 is the synthetic "normal" class, 1..6 are the CNN outputs
classes_available = [0,1,2,3,4,5,6]
num_columns = 4096
wrapped_classifier = ClfModel(BasicClassifier.classifier,
                            predict_attr=BasicClassifier.contrived_classification,
                            predict_proba_attr=BasicClassifier.contrived_classification_proba,
                            column_attr=pd_test_points.columns.values.tolist(),
                            classes_attr=classes_available,
                            window_size_attr=num_columns)

(827, 6)
(827, 6)
(827,)
(827, 7)


In [11]:
# Build the nearest-neighbour sampler over the wrapped test set
nn_sampler = nearest_neighbor_samples(
    clf=wrapped_classifier,
    timeseries=pd_test_points,
    labels=y_test,
    silent=False,             # show log info
    num_distractors=3,        # stored on the sampler; get_all_samples uses its own neighbour count
    dont_stop=True            # stored on the sampler; not read by the sampler methods in this notebook
)

In [12]:
# samples maps class_id -> list of (class_id, sample dataframe); the draw is random and unseeded
samples = nn_sampler.get_all_samples()

Getting all samples
making predictions for per class trees
Validate Predictions
0: 678
3: 28
1: 17
4: 19
6: 38
2: 39
5: 8

True Positive Dictionary Stats
Key: 0, Length of value: 670
Key: 1, Length of value: 17
Key: 2, Length of value: 28
Key: 3, Length of value: 24
Key: 4, Length of value: 14
Key: 5, Length of value: 7
Key: 6, Length of value: 35
making per class trees
[0, 1, 2, 3, 4, 5, 6]
0
1
2
3
4
5
6

KDTree structure (underlying tree representation):
{0: <sklearn.neighbors._kd_tree.KDTree object at 0xc23ea20>, 1: <sklearn.neighbors._kd_tree.KDTree object at 0xa910be0>, 2: <sklearn.neighbors._kd_tree.KDTree object at 0xc8132c0>, 3: <sklearn.neighbors._kd_tree.KDTree object at 0xbd19e50>, 4: <sklearn.neighbors._kd_tree.KDTree object at 0xbd179f0>, 5: <sklearn.neighbors._kd_tree.KDTree object at 0xc7f81c0>, 6: <sklearn.neighbors._kd_tree.KDTree object at 0xa93dbd0>}
random index is 13
random index is 18
random index is 6
random index is 1
random index is 5
random index is 6
6
1
    

In [13]:
# Inspect the selected samples: number of classes, then type and shape of
# every stored item per class.
print(len(samples))
#print((samples))

for class_id, items in samples.items():
    print(f"\nClass ID: {class_id}")
    print(f"Number of items: {len(items)}")
    for i, (cls, data) in enumerate(items):
        print(f"  Item {i}: type = {type(data)}")
        # only objects exposing .shape (e.g. DataFrames) report it
        if hasattr(data, "shape"):
            print(f"    shape = {data.shape}")

6

Class ID: 1
Number of items: 3
  Item 0: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 1: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 2: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)

Class ID: 2
Number of items: 3
  Item 0: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 1: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 2: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)

Class ID: 3
Number of items: 3
  Item 0: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 1: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 2: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)

Class ID: 4
Number of items: 3
  Item 0: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 1: type = <class 'pandas.core.frame.DataFrame'>
    shape = (4096, 12)
  Item 2: ty

## Step 3: Create Lipschitz Ratios

In [None]:
# constructor for classifiers

In [16]:
# for CoMTE
sys.path.append('/projectnb/peaclab-mon/JLi/projectx/CoMTE_V2_JLi/CoMTE_V2/comlex_core/src')  # Path to the comlex_core directory
import explainers as explainers_V2
import itertools

class BasicData:
    # Container for the training data used by the explainers. Everything is
    # loaded once, when the class body executes.

    # define basic variables
    classes_available = [0,1,2,3,4,5,6]
    num_columns = 4096
    num_features = 12

    # define paths and variables for training data
    path_to_hdf5_test = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/CODE/ecg_tracings.hdf5"
    dataset_name_hdf_tracings = "tracings" 
    training_set_hdf_path = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/combined_V2.hdf5"
    y_train_csv_path = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/labels_combined_V2.csv"
    
    # Read the training tracings fully into memory and close the file again
    # (the handle was previously left open for the lifetime of the kernel).
    # `f` remains a class attribute, now referring to the closed file.
    with h5py.File(training_set_hdf_path, "r") as f:
        timeseries = np.array(f[dataset_name_hdf_tracings])

    # labels for the training samples, passed to ClfData.wrap_df_y by later code
    # NOTE(review): the original comment mentions dropping an ExamID column; no
    # column is dropped here — confirm whether the CSV contains one.
    labels = pd.read_csv(y_train_csv_path)


class BasicComlexInput:
    # Inputs for the CoMTE explainer, built once when the class body executes.

    # 1. wrap training points
    df_train_points = ClfData.wrap_df_x(BasicData.timeseries, BasicData.num_features)
    
    # 2. wrap training labels
    df_train_labels = ClfData.wrap_df_y(BasicData.labels)

    # 3. wrap up the classifier
    # Disabled: the wrapped_classifier built earlier in the notebook is reused instead.
    # note: column_attr, or the corresponding name of the columns in the sample,
    #  is unique to dataframes, and auto-generated by wrap_df_x
    # wrapped_classifier = ClfModel(BasicClassifier.classifier,
    #                             predict_attr=BasicClassifier.contrived_classification,
    #                             predict_proba_attr=BasicClassifier.contrived_classification_proba,
    #                             column_attr=df_train_points.columns.values.tolist(),
    #                             classes_attr=BasicData.classes_available,
    #                             window_size_attr=BasicData.num_columns)    
    
    # get testing point (hard-coded sample position 253 in the notebook-global seq)
    test_point = seq._getsample_(253)
    # wrap test point 
    test_df = ClfData.wrap_df_test_point(test_point)


# set up an optimized search comlex runner over the training data
# note: wrapped_classifier is the notebook-global wrapper created in an earlier
# cell; this cell fails on a fresh kernel unless that cell has run first
comlex = explainers_V2.OptimizedSearch(wrapped_classifier,
                                    BasicComlexInput.df_train_points,
                                    BasicComlexInput.df_train_labels,
                                    silent=True, threads=4, num_distractors=3)

(20000, 6)
(20000, 6)
(20000,)
(20000, 7)


In [None]:
# For every class, compare all pairs of selected samples: the set distance
# between their explanations divided by the Euclidean distance between the
# samples gives one Lipschitz-style ratio per pair.

result_columns = ['class', 'node_id', 'pair_index', 'dist', 'lipschitz']
result_records = []  # collected as plain dicts; the DataFrame is built once at the end

for key, tuple_list in samples.items():
    print(f"Class: {key}")

    # Explain every sample once per class instead of once per pair: each sample
    # takes part in several pairs and explain() is the expensive step.
    explanation_sets = []
    for _, sample_df in tuple_list:
        explanation = comlex.explain(sample_df, to_maximize=key,
                                     return_dist=True, single=True,
                                     savefig=True, train_iter=100,
                                     timeseries=False, filename="sample_result.png")
        # convert into sets for set distance
        explanation_sets.append(set(explanation[1].values.flatten()))

    # Get all 2-tuple combinations
    for pair_index, (pos1, pos2) in enumerate(itertools.combinations(range(len(tuple_list)), 2)):
        t1, t2 = tuple_list[pos1], tuple_list[pos2]

        # Euclidean distance between the flattened samples
        vec1 = t1[1].values.flatten()
        vec2 = t2[1].values.flatten()
        clf_diff = np.linalg.norm(vec1 - vec2)

        # set distance between explanations: size of the symmetric difference
        exp_diff = len(explanation_sets[pos1].symmetric_difference(explanation_sets[pos2]))

        # Lipschitz ratio; undefined for identical samples (zero distance)
        # NOTE(review): the sign is negated as in the original — confirm intent
        if clf_diff == 0:
            lipschitz = np.nan
        else:
            lipschitz = -1 * exp_diff / clf_diff

        result_records.append({
            'class': key,
            'node_id': t1[0],  # NOTE(review): t1[0] holds the class id stored with the sample, not a node id
            'pair_index': pair_index,  # position of the pair within this class
            'dist': clf_diff,          # Euclidean distance between the two samples
            'lipschitz': lipschitz,
        })

results_df = pd.DataFrame(result_records, columns=result_columns)

Class: 1
-------Preliminary Statistics-------
Original Sample Class: [np.int64(1)] 
Sample Probabilities: [[9.2682105e-01 5.0862855e-01 2.4123222e-03 6.3755880e-03 5.7806697e-04
  1.2989482e-02 3.3653807e-05]]
Class of Interest: 1


generating distractors
Class 0 has 16498 indices.
Class 1 has 227 indices.
Class 2 has 449 indices.
Class 3 has 264 indices.
Class 4 has 258 indices.
Class 5 has 246 indices.
Class 6 has 390 indices.
validate distractors probabilities:
Distractor 1 probability: 
[[8.9172411e-01 6.9309807e-01 4.8672420e-05 2.2062271e-05 2.3005109e-06
  1.5616019e-02 4.7103786e-06]]
Distractor 2 probability: 
[[9.1059238e-01 5.9392655e-01 1.8268339e-03 1.4687905e-02 2.9223080e-04
  5.6689147e-02 8.1678838e-05]]
Distractor 3 probability: 
[[8.6975950e-01 8.0854422e-01 3.8052210e-06 6.6586483e-07 8.2461199e-08
  2.7381795e-05 1.6021142e-09]]

processing distractor 1 of 3
Probabilities of distractor sample: [[8.9172411e-01 6.9309807e-01 4.8672420e-05 2.2062271e-05 2.3005109e-06


In [None]:
# SHAP
import shap

"""
SHAP Wrappers for input data, model
"""

class BasicSHAPInput:
    # Inputs for the SHAP explainer, built once when the class body executes.
    classifier = pre_model  # tensorflow CNN

    # wrap training points for SHAP (all of BasicData.timeseries is passed to the wrapper)
    df_train_points_SHAP = ClfData.wrap_df_x_SHAP(BasicData.timeseries, BasicData.num_features)

    # select test point (hard-coded sample position 253 in the notebook-global seq)
    test_point = seq._getsample_(253)
    
    # wrap test point for SHAP
    test_df_SHAP = ClfData.wrap_df_test_point_SHAP(test_point)

class BasicSHAPClassifier:
    """Namespace exposing the pre-trained model that SHAP should explain."""
    classifier = pre_model  # tensorflow CNN

# Background data for the explainer: SHAP background sets are typically a small,
# representative subset of the training data; here the full wrapped training set
# (BasicSHAPInput.df_train_points_SHAP) is used.
# NOTE(review): the bare `.shape` expression below is not the last expression of
# the cell, so the notebook does not display it.
BasicSHAPInput.df_train_points_SHAP.shape

# init explainer: gradient-based SHAP on the CNN, with the wrapped training
# points as background data
explainer = shap.GradientExplainer(BasicSHAPClassifier.classifier, BasicSHAPInput.df_train_points_SHAP)

In [None]:
# SHAP

# Compute SHAP attributions for the single wrapped test point using the
# explainer built in the previous cell, then print the raw values.
shap_values = explainer.shap_values(BasicSHAPInput.test_df_SHAP)
print(shap_values)







In [None]:
# LIME


"""
LIME Wrappers for input data, model
"""

"""
SHAP Wrappers for input data, model
"""

class BasicLIMEClassifier:
    """Prediction wrappers around the pre-trained CNN, in the form LIME expects.

    The class is used purely as a namespace: both prediction functions are
    static and are accessed through the class, never through an instance.
    """
    classifier = pre_model  # tensorflow CNN

    @staticmethod
    def contrived_classification(pandas_dfs):
        """
        Classify samples by thresholding the CNN's per-class probabilities.

        Notes:
            The model outputs six probabilities per sample. A synthetic
            "class 0" probability is prepended, computed as the mean over the
            six classes of ``1 - max(p - threshold, 0) / (1 - threshold)``.
            If several classes reach their threshold, the class with the
            highest exceedance (probability minus threshold) is assigned to
            the sample. If no class reaches its threshold, class 0 is assigned.

        Input:
            pandas_dfs: pandas multiindex array of samples

        Output:
            if there is one sample: function returns the class index
            if there are multiple samples: function returns a list of the
            class indices, one per sample
        """
        # Imported locally: a name bound in the class body (such as a
        # class-level ``import os``) is not visible from inside a method.
        import os

        classifier = pre_model  # tensorflow CNN

        # The samples are written to a temporary HDF5 file so that they can be
        # fed to the model through an ECGSequence.
        temp_path = "/projectnb/peaclab-mon/JLi/projectx/AutoECGDiagnosisData/temporary.hdf5"
        temp_dataset_name = "tracings"
        if os.path.exists(temp_path):
            os.remove(temp_path)

        hdf_file = h5py.File(temp_path, 'w')
        try:
            # NOTE(review): `array_3d` is neither a parameter nor defined in this
            # function, so it must exist as a notebook global. Presumably it
            # should be the (N, 4096, 12) array derived from `pandas_dfs` --
            # confirm and convert explicitly.
            hdf_file.create_dataset(temp_dataset_name, data=array_3d)
            # ECGSequence instance reading the samples back from the HDF5 file
            modified_instance = datasets.ECGSequence(temp_path, temp_dataset_name)
            try:
                # get per-class probabilities
                probability = classifier.predict(modified_instance, verbose=1)
            finally:
                modified_instance._closehdf()
        finally:
            # Always release the handle and delete the temp file, even when
            # dataset creation or prediction fails.
            hdf_file.close()
            if os.path.exists(temp_path):
                os.remove(temp_path)

        # given per-class decision thresholds for the six model outputs
        threshold = np.array([0.124, 0.07, 0.05, 0.278, 0.390, 0.174])

        # generate class 0 probability
        exceedances = 1 - (np.maximum((probability - threshold), 0) / (1 - threshold))
        normal_prob = np.mean(exceedances, axis=1, keepdims=True)  # normal prob should be (N,1)

        # Add normal_prob as a new first column
        probability_n = np.column_stack((normal_prob, probability))

        # thresholds including class 0, which passes only when its value reaches 1
        new_threshold = np.array([1, 0.124, 0.07, 0.05, 0.278, 0.390, 0.174])

        passing_mask = probability_n >= new_threshold
        sample_classes = []

        # A distinct loop name avoids shadowing the mask array being iterated.
        for row, row_mask in zip(probability_n, passing_mask):
            passing_indices = np.where(row_mask)[0]
            if len(passing_indices) > 1:  # more than one class passes
                # pick the class with the highest exceedance
                row_exceedances = row[passing_indices] - new_threshold[passing_indices]
                sample_classes.append(passing_indices[np.argmax(row_exceedances)])
            elif len(passing_indices) == 0:  # no passes
                sample_classes.append(0)
            else:
                sample_classes.append(passing_indices[0])

        # don't return list if only one sample
        if len(sample_classes) == 1:
            sample_classes = sample_classes[0]

        return sample_classes

    # LIME prediction function...
    @staticmethod
    def contrived_classification_proba_LIME(np_2d_array):
        """
        Purpose:
            make prediction and return set of probabilities
        Input:
            np_2d_array: array with one flattened sample per row; each row is
            reshaped to (4096, 12) before prediction
        Return:
            the output of the model's ``predict`` for all rows (single or
            multiple samples)
        """
        # The first axis is the number of samples LIME asked about.
        num_samples = np_2d_array.shape[0]
        print(f"num features is {num_samples}")
        df_reshaped = np_2d_array.reshape(num_samples, 4096, 12)

        # init classifier
        classifier = pre_model  # tensorflow CNN

        # get classification and probability
        probability = classifier.predict(df_reshaped, verbose=1)

        return probability
        
class BasicLIMEInput:
    """Namespace holding the model, data and wrapped classifier used with LIME.

    All attributes are class-level and are evaluated once, when this class
    statement is executed; the class is not meant to be instantiated.
    """
    classifier = pre_model  # tensorflow CNN    

    # wrap training points for LIME
    df_train_points_LIME = ClfData.wrap_df_x_LIME(BasicData.timeseries, BasicData.num_features)
    
    # wrap training labels
    df_train_labels = ClfData.wrap_df_y(BasicData.labels)
    
    # select test point
    # 253 is a hard-coded sample index; BasicSHAPInput uses the same index.
    test_point = seq._getsample_(253)

    # wrap test point for LIME
    test_df_LIME = ClfData.wrap_df_test_point_LIME(test_point)


    # wrap classifier for LIME: the class-prediction and probability functions
    # come from BasicLIMEClassifier, the column names are the 12 ECG leads
    wrapped_classifier_LIME = ClfModel(BasicLIMEClassifier.classifier,
                                predict_attr=BasicLIMEClassifier.contrived_classification,
                                predict_proba_attr=BasicLIMEClassifier.contrived_classification_proba_LIME,
                                column_attr=['DI','DII','DIII','AVR','AVL','AVF','V1','V2','V3','V4','V5','V6'],
                                classes_attr=BasicData.classes_available,
                                window_size_attr=BasicData.num_columns)




import lime.lime_tabular as limetabular

# instantiate LIME tabular explainer
'''
    input: 
    1. flattened data (N,49152) <-- flattened in row-major order (C-style)
    2. model
    3. feature names 
    4. classification
'''
# Build the tabular LIME explainer from the wrapped training points in
# classification mode. Only the training data and the mode are passed here;
# no feature names are supplied.
LIME_explainer = limetabular.LimeTabularExplainer(BasicLIMEInput.df_train_points_LIME, 
                                                  mode = 'classification')

## Original Code

In [None]:
# Root logger: timestamped DEBUG-level messages written to stderr.
logging.basicConfig(
    level=logging.DEBUG,
    stream=sys.stderr,
    format='%(asctime)s %(levelname)-7s %(message)s',
)

# matplotlib is very chatty at DEBUG level, so only let its warnings through.
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)

In [None]:
# Load the train/test time series and labels for the two classes
# 'memorybandwidth' and 'none'.
# NOTE(review): `data_loading` is not among the imports visible at the top of
# the notebook -- confirm where it is imported before running this cell.
timeseries, labels, test_timeseries, test_labels = data_loading.load_hpc_data(
    # Alternative data source, kept for reference:
    # Path('/home/ates/data/taxonomist/'), window=60, skip=60, make_binary=True)
    Path('/projectnb/peaclab-mon/ates/hpas'), classes=['memorybandwidth', 'none'])

In [None]:
from sklearn.utils.validation import check_X_y

!pip install scikit-learn==0.23

In [None]:
# Enumerate every (penalty, tol, solver, C) combination and keep those that
# LogisticRegression accepts: unsupported combinations raise ValueError when
# fitted on a trivial two-sample problem.
penalties = ['l1', 'l2', 'elasticnet', 'none']
tolerances = np.logspace(-8, -1, 8)
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

good_kwargs = []
for p, tol, solver in itertools.product(penalties, tolerances, solvers):
    # C has no effect without a penalty, so a single value is enough there.
    c_values = [1] if p == 'none' else np.logspace(-8, 8, 20)
    for c in c_values:
        kwargs = {'penalty': p, 'tol': tol, 'C': c, 'solver': solver, 'fit_intercept': False}
        try:
            clf = LogisticRegression(**kwargs)
            clf.fit([[1], [0]], [1, 0])
        except ValueError:
            # invalid hyperparameter combination: skip it
            continue
        good_kwargs.append(kwargs)
print(len(good_kwargs))

In [None]:
# Train pipelines over the shuffled hyperparameter settings and keep up to 25
# that (a) score above 0.975 weighted F1 on the test set and (b) have
# coefficients that differ from every pipeline already kept.
used_kwargs, pipelines, scores = [], [], []
random.shuffle(good_kwargs)


# pull out feature extraction


for kwargs in tqdm(good_kwargs):

    p = Pipeline(steps=[
        ('assert1', analysis.classifier.CheckFeatures()),
        ('features', analysis.data.TSFeatureGenerator(threads=1, trim=0)),
        ('assert2', analysis.classifier.CheckFeatures()),
        ('scaler', MinMaxScaler(feature_range=(-1, 1))),
        ('clf', LogisticRegression(**kwargs)),
    ])

    p.fit(timeseries, np.ravel(labels))
    preds = p.predict(test_timeseries)
    score = f1_score(test_labels, preds, average='weighted')

    # Guard 1: discard classifiers that are not accurate enough.
    if not score > 0.975:
        continue

    # Guard 2: discard classifiers whose coefficients are identical to those
    # of a pipeline that was already accepted.
    new_coef = p.named_steps['clf'].coef_
    is_duplicate = any(
        np.linalg.norm(new_coef - kept.named_steps['clf'].coef_) == 0
        for kept in pipelines
    )
    if is_duplicate:
        continue

    scores.append(score)
    pipelines.append(p)
    used_kwargs.append(kwargs)
    if len(pipelines) >= 25:
        break

In [None]:
# Explanation methods to compare, keyed by the name used in the results table
# and in the final plot's `order` argument.
explainer_constructors = {
    'lime': explainers.LimeExplanation,
    'random': explainers.RandomExplanation,
    'shap': explainers.ShapExplanation,
    'our_method': explainers.BruteForceSearch,
}

In [None]:
# For each sampled test node and each explanation method, explain the node with
# every trained pipeline and record the worst-case ratio between
#   (size of the symmetric difference of two explanations) and
#   (Euclidean distance between the two classifiers' coefficients).
# The ratio is stored negated, so the minimum over pairs corresponds to the
# largest Lipschitz estimate.

# The distance between two classifiers depends only on their coefficients, so
# every pairwise distance is computed once here instead of once per node and
# per explainer.
clf_coefs = [pipeline.named_steps['clf'].coef_ for pipeline in pipelines]
clf_pair_dists = {
    (clf1, clf2): np.linalg.norm(clf_coefs[clf1] - clf_coefs[clf2])
    for clf1, clf2 in itertools.combinations(range(len(pipelines)), 2)
}

results = []
node_ids = list(test_timeseries.index.get_level_values('node_id').unique())
# Never request more nodes than exist: random.sample raises ValueError otherwise.
for node_id in tqdm(random.sample(node_ids, min(50, len(node_ids)))):
    x_test = test_timeseries.loc[[node_id], :, :]
    for e, make_explainer in explainer_constructors.items():
        # One explanation (as a set) per pipeline, in pipeline order.
        explanations = [
            set(make_explainer(p, timeseries, labels).explain(x_test))
            for p in pipelines
        ]
        min_ratio = 0
        for (clf1, clf2), clf_diff in clf_pair_dists.items():
            # Number of items that appear in exactly one of the two explanations.
            exp_diff = len(explanations[clf1] ^ explanations[clf2])
            lipschitz = -1 * exp_diff / clf_diff
            if lipschitz < min_ratio:
                min_ratio = lipschitz
        results.append({
            'method': e,
            'node_id': node_id,
            'ratio': min_ratio,
        })

In [None]:
# One row per (method, node_id) with the negated worst-case ratio; the bare
# `df` on the last line makes the notebook display the table.
df = pd.DataFrame(results)
df

In [None]:
# 'ratio' holds the negated value, so flip the sign to get the Lipschitz estimate.
df['lipschitz'] = -df['ratio']

In [None]:
# Box plot of the Lipschitz estimate per explanation method, in a fixed order.
sns.catplot(data=df, x='method', y='lipschitz', kind='box', order=['our_method', 'lime', 'shap', 'random'])#, showfliers=False)