# How to contribute?

Here we report a simple guide with the key features of RobustX in order to implement your own (robust) counterfactual explanation method.

# Setup preparation

In [16]:
# Import necessary components
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import torch

from robustx.lib.models.pytorch_models.SimpleNNModel import SimpleNNModel
from robustx.datasets.ExampleDatasets import get_example_dataset
from robustx.lib.tasks.ClassificationTask import ClassificationTask
from robustx.generators.CE_methods.Wachter import Wachter
from robustx.generators.CEGenerator import CEGenerator
from robustx.generators.robust_CE_methods.EntropicRiskCE import EntropicRiskCE
from robustx.lib.models.pytorch_models.EnsembleModelWrapper import EnsembleModelWrapper




## Ensemble of single-target models

In [17]:
# Load and preprocess dataset
dl = get_example_dataset("iris")
dl.preprocess(
    impute_strategy_numeric='mean',  # Impute missing numeric values with mean
    scale_method='minmax',           # Apply min-max scaling
    encode_categorical=False         # No categorical encoding needed (since no categorical features)
)

# remove the target column from the dataset that has labels 2
dl.data = dl.data[dl.data['target'] != 2]

# Load model, note some RecourseGenerators may only work with a certain type of model,
# e.g., MCE only works with a SimpleNNModel
n_models = 10
model_ensemble = [SimpleNNModel(4, [10], 2, seed=0) for _ in range(n_models)]

target_column = "target"
X_train, X_test, y_train, y_test = train_test_split(dl.data.drop(columns=[target_column]), dl.data[target_column], test_size=0.35, random_state=0)


# Train each model in the ensemble
all_indexes = np.arange(X_train.shape[0])
for model in model_ensemble:
    np.random.shuffle(all_indexes)
    sampled_indexes = all_indexes[:int(0.8 * len(all_indexes))]
    model.train(X_train.iloc[sampled_indexes], y_train.iloc[sampled_indexes], epochs=50, batch_size=16, verbose=0)
    print(f"model accuracy: {model.compute_accuracy(X_test.values, y_test.values):0.4f}")

emodel = EnsembleModelWrapper(
    model_ensemble=[simple_model._model for simple_model in model_ensemble],
    device='cpu',
    ensemble_type=EnsembleModelWrapper.ENSEMBLE_SINGLE_OUTPUT_SINGLE_TARGET_LIST,
    aggregation_method=EnsembleModelWrapper.AGGREGATION_MAJORITY_VOTE
)
preds = emodel.predict_tensor(torch.tensor(X_test.to_numpy(), dtype=torch.float), apply_softmax=True)
# print(f"ensemble accuracy: {emodel.compute_accuracy(X_test, y_test)}")


# Create task
task = ClassificationTask(emodel, dl)

model accuracy: 0.9714
model accuracy: 0.9714
model accuracy: 0.8857
model accuracy: 0.9429
model accuracy: 0.9714
model accuracy: 0.8857
model accuracy: 0.8571
model accuracy: 0.8857
model accuracy: 0.6000
model accuracy: 0.9714


In [18]:
# Each counterfactual explanation generator takes the task on creation, it can also take a custom distance function, but for now we will use the default one.

ce_gen = EntropicRiskCE(task)


# Get negative instances, the default column_name is always "target" but you can set it to the name of your dataset's target variable
# negs = dl.get_negative_instances(neg_value=0, column_name="target")
negs = dl.data.drop(columns=['target']).head(1)
print("Negative instances shape: ", negs.shape)
print(f"Example of a prediction for a negative instance:\n")
print(negs.head(1))
print("Output: ", emodel.predict(negs.head(1)).values.item())
print("Class: ", int(emodel.predict(negs.head(1)).values.item() > 0.5))  # Assuming binary classification with threshold 0.5

# You can generate for a set of instances stored in a DataFrame
print("\nGenerating counterfactual explanations using EntropicRiskCF for the first 1 negative instances:")
instance = negs.head(1)
ce = ce_gen.generate_for_instance(
    instance=instance,
    target_weights=None,
    tau=0.45,
    max_iter=50,
    device='cpu',
    verbose=False
)
print(ce)
print("Original Output: ", emodel.predict(instance).values.item())
print("CF Output: ", emodel.predict(ce).values.item())

Negative instances shape:  (1, 4)
Example of a prediction for a negative instance:

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.222222             0.625           0.067797          0.041667
Output:  0
Class:  0

Generating counterfactual explanations using EntropicRiskCF for the first 1 negative instances:
          0         1         2         3
0 -0.057043  0.906059  0.347517  0.321656
Original Output:  0
CF Output:  1


In [19]:
# You can also implement a method to generate CEs for all the negative instance in one shot
ces = ce_gen.generate_for_all(
                            target_weights=None,
                            tau=0.8,
                            max_iter=50,
                            device='cpu',
                            verbose=False
)
print("All query outputs are positive? ", np.all(emodel.predict(negs)>0.5))
print("All query outputs are negative? ", np.all(emodel.predict(negs)<0.5))
print("All CF outputs are positive? ", np.all(emodel.predict(ces)>0.5))

All query outputs are positive?  False
All query outputs are negative?  True
All CF outputs are positive?  False


## Single-output model ensemble for multiple targets

In [20]:
# Load and preprocess dataset
dl = get_example_dataset("iris")
dl.preprocess(
    impute_strategy_numeric='mean',  # Impute missing numeric values with mean
    scale_method='minmax',           # Apply min-max scaling
    encode_categorical=False         # No categorical encoding needed (since no categorical features)
)

# remove the target column from the dataset that has labels 2
dl.data = dl.data[dl.data['target'] != 2]

dl.data["t1"] = dl.data["target"]
dl.data["t2"] = np.random.randint(low=0, high=2, size=len(dl.data))
dl.data = dl.data.drop(columns="target")
target_cols = ["t1", "t2"]

X_train, X_test, y_train, y_test = train_test_split(dl.data.drop(columns=target_cols), dl.data[target_cols], test_size=0.35, random_state=0)

dl.data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),t1,t2
0,0.222222,0.625,0.067797,0.041667,0,1
1,0.166667,0.416667,0.067797,0.041667,0,1
2,0.111111,0.5,0.050847,0.041667,0,1
3,0.083333,0.458333,0.084746,0.041667,0,0
4,0.194444,0.666667,0.067797,0.041667,0,1


In [21]:
# Load model, note some RecourseGenerators may only work with a certain type of model,
# e.g., MCE only works with a SimpleNNModel
n_models = 2
model_ensemble = {
    t: [SimpleNNModel(4, [10], 2, seed=0) for _ in range(n_models)]
    for t in target_cols
}


# Train each model in the ensemble
all_indexes = np.arange(X_train.shape[0])
for target, sub_ensemble in model_ensemble.items():
    print(f"Training for target {target}")
    for model in sub_ensemble:
        np.random.shuffle(all_indexes)
        sampled_indexes = all_indexes[:int(0.8 * len(all_indexes))]
        model.train(X_train.iloc[sampled_indexes], y_train[target].iloc[sampled_indexes], epochs=50, batch_size=16, verbose=0)
        print(f"model accuracy: {model.compute_accuracy(X_test.values, y_test[target].values):0.4f}")

model_ensemble = {
    t: [model._model for model in sub_ensemble] for t, sub_ensemble in model_ensemble.items()
}

emodel = EnsembleModelWrapper(
    model_ensemble=model_ensemble,
    device='cpu',
    ensemble_type=EnsembleModelWrapper.ENSEMBLE_SINGLE_OUTPUT_MULTI_TARGET_DICT,
    aggregation_method=EnsembleModelWrapper.AGGREGATION_MAJORITY_VOTE
)
preds = emodel.predict_tensor(torch.tensor(X_test.to_numpy(), dtype=torch.float), apply_softmax=True)
print(f"ensemble accuracy: {emodel.compute_accuracy(X_test, y_test)}")


# Create task
task = ClassificationTask(emodel, dl)

Training for target t1
model accuracy: 0.9714
model accuracy: 0.8857
Training for target t2
model accuracy: 0.4857
model accuracy: 0.5143
ensemble accuracy: {'t1': 0.9714285714285714, 't2': 0.5142857142857142}


In [22]:
ce_gen = EntropicRiskCE(task)


# Get negative instances, the default column_name is always "target" but you can set it to the name of your dataset's target variable
# negs = dl.get_negative_instances(neg_value=0, column_name="target")
negs = dl.data.drop(columns=target_cols).head(1)
print("Negative instances shape: ", negs.shape)
print(f"Example of a prediction for a negative instance:\n")
print(negs.head(1))
print("Output: \n", emodel.predict(negs.head(1)))

# You can generate for a set of instances stored in a DataFrame
print("\nGenerating counterfactual explanations using EntropicRiskCF for the first 1 negative instances:")
instance = negs.head(1)
ce = ce_gen.generate_for_instance(
    instance=instance,
    target_weights=None,
    tau=0.45,
    max_iter=50,
    device='cpu',
    verbose=False
)
print(ce)
print("Original Output: \n", emodel.predict(instance))
print("CF Output: \n", emodel.predict(ce))

Negative instances shape:  (1, 4)
Example of a prediction for a negative instance:

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           0.222222             0.625           0.067797          0.041667
Output: 
    t1  t2
0   0   0

Generating counterfactual explanations using EntropicRiskCF for the first 1 negative instances:
          0         1         2         3
0 -0.131432  0.979998  0.424915  0.399199
Original Output: 
    t1  t2
0   0   0
CF Output: 
    t1  t2
0   1   1


In [23]:
ces = ce_gen.generate(
                        instances=X_test,
                        target_weights=None,
                        tau=0.45,
                        max_iter=50,
                        device='cpu',
                        verbose=False
)
print("All query outputs are positive? ", np.all(emodel.predict(negs)>0.5))
print("All query outputs are negative? ", np.all(emodel.predict(negs)<0.5))
print("All CF outputs are positive? ", np.all(emodel.predict(ces)>0.5))

All query outputs are positive?  False
All query outputs are negative?  True
All CF outputs are positive?  True
