In [2]:
# import modules

import sys
sys.path.append('/projectnb/peaclab-mon/JLi/projectx/CoMTE_V2/comlex_core')  # Path to the comlex_core directory


# treat src as a package since it has an __init__.py file
from src import explainers
from src.explainable_model import ClfModel
from src.explainable_data import ClfData

import numpy as np


"""
This is a basic tutorial on how to use COMLEX in 4 parts

Parts 1 and 2 are to set up a classifier and a dataset used
to train the classifier, which you will usually have already.


Part 3 shows how to connect the classifier and dataset into
a wrapper that can be input to the COMLEX algorithm, by using
helper functions found in COMLEX's explainable_model and
explainable_data files. More info can be found in their
respective files.

Part 4 shows the actual execution of COMLEX, after the input
components have been wrapped up with COMLEX's utility function.
"""


"\nThis is a basic tutorial on how to use COMLEX in 4 parts\n\nParts 1 and 2 are to set up a classifier and a dataset used\nto train the classifier, which you will usually have already.\n\n\nPart 3 shows how to connect the classifier and dataset into\na wrapper that can be input to the COMLEX algorithm, by using\nhelper functions found in COMLEX's explainable_model and\nexplainable_data files. More info can be found in their\nrespective files.\n\nPart 4 shows the actual execution of COMLEX, after the input\ncomponents have been wrapped up with COMLEX's utility function.\n"

In [3]:
"""
Part 1: A Classifier that works with COMLEX

The classifier must have 2 capabilities:
1. Predict a class ie: class 0 in classes {0, 1}
2. Predict the probability for each class
-ie: [0.1, 0.9]

and

Be able to execute capability 1 and 2 on a PANDAS dataframe,
returning an array of corresponding predictions.
"""

class BasicClassifier:
    classifier = {} # nothing here, but sklearn classifiers are well supported

    @staticmethod
    def contrived_classification(pandas_dfs):
        results = []
        for index, pandas_df in pandas_dfs.iterrows():
            running_tot = 0
            #-----
            for val_i in pandas_df:
                running_tot += val_i
            res_val = running_tot % 2
            if res_val > 1:
                results.append(1)
            else:
                results.append(0)
            #-----
            # replace above with function giving class, ie: result=0 in result classes {0,1}
        return results

    @staticmethod
    def contrived_classification_proba(pandas_dfs):
        results = []
        for index, pandas_df in pandas_dfs.iterrows():
            running_tot = 0
            #-----
            for val_i in pandas_df:
                running_tot += val_i
            res_val = running_tot % 2
            results.append([1.0-res_val/2.0, res_val/2.0])
            #-----
            # replace above with function giving probability of classes, ie: [0.1, 0.9] in classes {0,1} meaning 0.1=0, 0.9=1
        return results

In [4]:
"""
Part 2: Training data and labels

[The explanation will use counterfactuals drawn from this input data]

The training data can be should be an iterable of samples
(ie: python array, numpy array, pandas dataframe),
where each sample needs to be the same size array as the others.

The labels should be a corresponding iterable to the samples.

COMLEX will only use samples for which the labels are the same
as the prediction from the trained classifier.

Note:
We don't support variable-length training data at this time,
use a different projection of the data if you have such data.
"""

class BasicData:
    classes_available = [0, 1]

    # Random data is generated here and the labels are calculated.
    # In this case, 20 samples of 8-length arrays.
    num_columns = 8
    num_rand_samples = 20
    
    # create a random dataset
    np_train_points = [np.random.rand(8) for _ in range(num_rand_samples)]


In [5]:
"""
Part 3: Wrapping it up.

The training data, training labels, and trained classifier need to be wrapped up
into a form that can pass through COMLEX.

While wrapping up the training data and labels is relatively straightforward,
wrapping up the classifier is more difficult
"""

class BasicComlexInput:
    # 1. wrap up the training points
    df_train_points = ClfData.wrap_df_x(BasicData.np_train_points, BasicData.num_columns)

    # 2. generate and wrap up the labels
    train_labels = BasicClassifier.contrived_classification(df_train_points)
    np_train_labels = np.array(train_labels)
    df_train_labels = ClfData.wrap_df_y(np_train_labels)

    # 3. wrap up the classifier
    # note: column_attr, or the corresponding name of the columns in the sample,
    #   is unique to dataframes, and auto-generated by wrap_df_x
    wrapped_classifier = ClfModel(BasicClassifier.classifier,
                                predict_attr=BasicClassifier.contrived_classification,
                                predict_proba_attr=BasicClassifier.contrived_classification_proba,
                                column_attr=df_train_points.columns.values.tolist(),
                                classes_attr=BasicData.classes_available,
                                window_size_attr=BasicData.num_columns)


In [6]:
"""
Part 4: Running it through COMLEX

Requires:
1. wrapped classifier
2. wrapped training data
3. wrapped training labels

To run COMLEX:
1. wrap the test point
2. instantiate a comlex runner on the wrapped components
-OptimizedSearch sets up a KDTree for based on the data,
 in order to speed up the search time for the counterfactual
 explanation.
-OptimizedSearch will fallback to BruteForceSearch if it fails
 to find a counterfactual explanation with a predicted
 probability greater than 0.95.
3. use the comlex runner to explain wrapped datapoint
"""

# 1. wrap a test data point whose classification you want to explain
test_point = [0.56, 0.32, 0.22, 0.44, 0.44, 0.11, 0.45, 0.24]
test_df = ClfData.wrap_df_test_point(test_point)

# 2. set up an optimized search comlex runner
comlex = explainers.OptimizedSearch(BasicComlexInput.wrapped_classifier,
                                    BasicComlexInput.df_train_points,
                                    BasicComlexInput.df_train_labels,
                                    silent=True, threads=4, num_distractors=3)

# 3. explain the test point
# make sure target_class != test_df_class, or else comlex.explain does nothing
# test_df_class = contrived_classification(test_df) # = 0
target_class = 1
explanation = comlex.explain(test_df,to_maximize=target_class,
                             return_dist=True,single=True,
                             savefig=True,#train_iter=10,
                             timeseries=False,filename="sample_result.png")

replacements = explanation[0]
distractor_new = explanation[1]
counterfactual_explanation = [point for point in test_point] # make copy of original test data before doing replacements

for replacement_i in replacements:
    counterfactual_explanation[replacement_i] = distractor_new[replacement_i].values[0]

print(f"The classification of {test_point}\n"
      f"can be changed from 0 to {target_class}\n" +
      f"by changing the sample at points {explanation[0]},\n" +
      f"with points from the distractor:\n{explanation[1]}\n\n" +
      f"The modified sample that would lead to a different classification is:\n{counterfactual_explanation}")

print(BasicComlexInput.df_train_points.columns.values.tolist())
print(BasicData.classes_available)
print(BasicData.num_columns)
print(BasicComlexInput.df_train_points)
print("")
print(BasicComlexInput.df_train_labels)

The classification of [0.56, 0.32, 0.22, 0.44, 0.44, 0.11, 0.45, 0.24]
can be changed from 0 to 1
by changing the sample at points {np.int64(1), np.int64(5), np.int64(6)},
with points from the distractor:
                      0         1         2         3        4         5  \
node_id dummy                                                              
3       3      0.364701  0.403356  0.064891  0.354251  0.09459  0.943919   

                      6         7  
node_id dummy                      
3       3      0.666855  0.229296  

The modified sample that would lead to a different classification is:
[0.56, np.float64(0.40335597145457647), 0.22, 0.44, 0.44, np.float64(0.9439191218325991), np.float64(0.666854667545052), 0.24]
[0, 1, 2, 3, 4, 5, 6, 7]
[0, 1]
8
                      0         1         2         3         4         5  \
node_id dummy                                                               
0       0      0.956389  0.056481  0.578514  0.233139  0.060734  0.91691