# Tabular claasification: OmniXAI

In this notebook, we examine the explanation methods in OmniXAI applied to three models: Logistic regression, Random Forest, and MLP.

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


2024-06-23 15:17:19.219800: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-23 15:17:19.267095: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Load data, define veriale names, split into train/valid/test

In [3]:
data = pd.read_csv("/home/kxlu/Documents/xai_compare/data/fraud/creditcard.csv")
target = 'Class'
predictors = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',\
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19',\
       'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',\
       'Amount']

from omnixai.data.tabular import Tabular
# data = pd.read_csv("/home/kxlu/Documents/xai_compare/data/fraud/creditcard.csv")
tabular_data = Tabular(
   data,
   target_column='Class'
)
from omnixai.preprocessing.tabular import TabularTransform
# Data preprocessing
transformer = TabularTransform().fit(tabular_data)
class_names = transformer.class_names
x = transformer.transform(tabular_data)
# split data into train, validation, and test
train_df, test_df, train_labels, test_labels = train_test_split(x[:, :-1], x[:, -1], train_size=0.80, random_state=42,shuffle=True)
test_data = transformer.invert(test_df)

X_train = train_df
X_test = test_df
# normalize data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Build logistic regression

In [4]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(fit_intercept=False)
lr.fit(X_train, train_labels)

Build random forest

In [5]:
clf = RandomForestClassifier(n_jobs=5, 
                             random_state=42,
                             criterion='gini',
                             n_estimators=100,
                             verbose=False)
clf.fit(X_train, train_labels)

Build MLP using tf

In [6]:
from tensorflow.keras import regularizers
model = tf.keras.Sequential(
    [
        # Adjusted number of neurons
        tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[-1],),
                              kernel_regularizer=regularizers.l2(0.001)),  # L2 regularization
        tf.keras.layers.Dropout(0.2),  # Adjusted dropout rate
        tf.keras.layers.Dense(64, activation="relu",
                              kernel_regularizer=regularizers.l2(0.001)),  # L2 regularization
        tf.keras.layers.Dropout(0.2),  # Adjusted dropout rate
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-06-23 15:27:51.222364: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2922 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:1d:00.0, compute capability: 8.6


In [7]:
metrics = [
    tf.keras.metrics.FalseNegatives(name="fn"),
    tf.keras.metrics.FalsePositives(name="fp"),
    tf.keras.metrics.TrueNegatives(name="tn"),
    tf.keras.metrics.TruePositives(name="tp"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
]

# compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics = metrics)

# calculate class weights
neg, pos = np.bincount(train_labels.astype("int64"))
total = neg + pos
class_weight = {0: 1, 1: 5}

# train the model
history = model.fit(X_train, train_labels,epochs=5, class_weight=class_weight)

Epoch 1/5


I0000 00:00:1719127677.778314 2010387 service.cc:145] XLA service 0x7fa8a401b680 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1719127677.778348 2010387 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-06-23 15:27:57.841786: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-06-23 15:27:57.989874: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8905


[1m  33/7121[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m33s[0m 5ms/step - fn: 0.0000e+00 - fp: 363.2727 - loss: 1.0158 - precision: 0.0022 - recall: 0.8182 - tn: 179.9091 - tp: 0.8182          

I0000 00:00:1719127681.537561 2010387 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5ms/step - fn: 92.4600 - fp: 1030.4984 - loss: 0.2274 - precision: 0.0837 - recall: 0.4339 - tn: 112744.2891 - tp: 100.7438
Epoch 2/5
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - fn: 45.6310 - fp: 32.5682 - loss: 0.0670 - precision: 0.8061 - recall: 0.7572 - tn: 113738.2734 - tp: 151.5177
Epoch 3/5
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - fn: 39.4792 - fp: 27.1963 - loss: 0.0346 - precision: 0.8639 - recall: 0.8055 - tn: 113744.6484 - tp: 156.6661
Epoch 4/5
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - fn: 41.2967 - fp: 34.2245 - loss: 0.0246 - precision: 0.8191 - recall: 0.8015 - tn: 113727.1016 - tp: 165.3653
Epoch 5/5
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 4ms/step - fn: 40.1449 - fp: 33.8377 - loss: 0.0183 - precision: 0.8203 - recall: 0.7814 - tn: 113737.4531 - tp: 156.5507


### Logistic regression
Error occurs in methods: shap, bias,sensitivity,shap_global
Doesn't support: ce and ig (reuires tf or pytorch)

In [None]:
from omnixai.explainers.tabular import TabularExplainer

# Initialize a TabularExplainer
explainer = TabularExplainer(
  explainers=["lime", "pdp", "ale","l2x","permutation","mace"], # The explainers to apply
  mode="classification",                             # The task type
  data=tabular_data,                                   # The data for initializing the explainers
  model=lr,                                       # The ML model to explain
  preprocess=lambda z: transformer.transform(z),    # Converts raw features into the model inputs
)

In [None]:
test_instances = test_data[:1]
local_explanations = explainer.explain(X=test_instances)
global_explanations = explainer.explain_global(
    params={"pdp": {"features": ["V1","V2"]},
            "ale": {"features": ["V1","V2"]},
            "permutation": {"X": test_data[:100], "y":test_labels[:100]}}
)

In [None]:
local_explanations['l2x'].ipython_plot()
local_explanations['lime'].ipython_plot()
local_explanations['mace'].ipython_plot()
local_explanations['predict'].ipython_plot(index=0)

In [None]:
global_explanations['ale'].ipython_plot()
global_explanations['pdp'].ipython_plot()
global_explanations['permutation'].ipython_plot()

### Random Forest
Error ocuurs in: bias, sensitivity, shap_global
Does not support: ce, ig

In [None]:
from omnixai.explainers.tabular import TabularExplainer

# Initialize a TabularExplainer
explainer = TabularExplainer(
  explainers=["lime", "shap", "pdp", "ale","l2x","permutation","mace"], # The explainers to apply
  mode="classification",                             # The task type
  data=tabular_data,                                   # The data for initializing the explainers
  model=clf,                                       # The ML model to explain
  preprocess=lambda z: transformer.transform(z),    # Converts raw features into the model inputs
)

 |████████████████████████████████████████| 100.0% Complete, Loss 0.0006
L2X prediction model accuracy: 0.998507761396314






In [None]:
test_instances = test_data[:3]
local_explanations = explainer.explain(X=test_instances)
global_explanations = explainer.explain_global(
    params={"pdp": {"features": ["V1","V2"]},
            "ale": {"features": ["V1","V2"]},
            "permutation": {"X": test_data[:100], "y":test_labels[:100]},}
)

100%|██████████| 3/3 [00:05<00:00,  1.72s/it]


In [None]:
from omnixai.explainers.prediction import PredictionAnalyzer

analyzer = PredictionAnalyzer(
    mode="classification",
    test_data=test_data,                           # The test dataset (a `Tabular` instance)
    test_targets=test_labels,                      # The test labels (a numpy array)
    model=clf,                                   # The ML model
    preprocess=lambda z: transformer.transform(z)  # Converts raw features into the model inputs
)
prediction_explanations = analyzer.explain()

In [None]:
local_explanations['l2x'].ipython_plot()
local_explanations['lime'].ipython_plot()
local_explanations['mace'].ipython_plot()
local_explanations['predict'].ipython_plot(index=0)
local_explanations['shap'].ipython_plot

ValueError: Cannot accept list of column references or list of columns for both `x` and `y`.

In [None]:
global_explanations['ale'].ipython_plot()
global_explanations['pdp'].ipython_plot()
global_explanations['permutation'].ipython_plot()

### MLP

Error occurs in bias,shap,shap_global,l2x
Sequential is not treated as keras.model

In [18]:
from omnixai.explainers.tabular import TabularExplainer

# Initialize a TabularExplainer
explainer = TabularExplainer(
  explainers=["sensitivity","lime","pdp", "ale","sensitivity","permutation","mace"], # The explainers to apply
  mode="classification",                             # The task type
  data=tabular_data,                                   # The data for initializing the explainers
  model=model,                                       # The ML model to explain
  preprocess=lambda z: transformer.transform(z),    # Converts raw features into the model inputs
)



In [20]:
test_instances = test_data[:3]
local_explanations = explainer.explain(X=test_instances)
global_explanations = explainer.explain_global(
    params={"pdp": {"features": ["V1","V2"]},
            "ale": {"features": ["V1","V2"]},
            "permutation": {"X": test_data[:100], "y":test_labels[:100]},}
)

                    Prediction probabilties do not sum to 1, and
                    thus does not constitute a probability space.
                    Check that you classifier outputs probabilities
                    (Not log probabilities, or actual class predictions).
                    
                    Prediction probabilties do not sum to 1, and
                    thus does not constitute a probability space.
                    Check that you classifier outputs probabilities
                    (Not log probabilities, or actual class predictions).
                    
                    Prediction probabilties do not sum to 1, and
                    thus does not constitute a probability space.
                    Check that you classifier outputs probabilities
                    (Not log probabilities, or actual class predictions).
                    
  unique_group_names = pd.unique(groups)
  unique_group_names = pd.unique(groups)
  unique_group_names = pd.unique(gro

In [29]:
local_explanations['lime'].ipython_plot()
local_explanations['predict'].ipython_plot(index=0)

mace produce nothing

In [28]:
local_explanations['mace'].ipython_plot()

In [32]:
global_explanations['ale'].ipython_plot()
global_explanations['sensitivity'].ipython_plot()
global_explanations['pdp'].ipython_plot()
global_explanations['permutation'].ipython_plot()