### GPT explainer for income prediction

In [1]:
import sys
import os

# Location of the OmniXAI checkout. Set the OMNIXAI_PATH environment variable to
# point at a different checkout; the previous hard-coded path remains the default.
module_path = os.path.abspath(os.environ.get('OMNIXAI_PATH', 'E:/Codes/OmniXAI/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# For Jupyter notebooks or interactive environments where __file__ is not defined
try:
    # Try to use __file__ if available
    directory = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # If __file__ is not defined (e.g., in Jupyter), use the current working directory
    directory = os.path.abspath('')

# Make the parent of `directory` importable. The membership check keeps the cell
# idempotent: re-running it no longer appends the same entry again.
parent_directory = os.path.dirname(directory)
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

In [2]:
import os

import numpy as np
import pandas as pd
import sklearn
import xgboost
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from omnixai.data.tabular import Tabular
from omnixai.preprocessing.tabular import TabularTransform
from omnixai.explainers.tabular import LLMExplainer

In [3]:
def diabetes_data(file_path):
    """Read the diabetes CSV and return encoded train/test splits.

    Every column except ``'class'`` is treated as a feature. Columns whose
    dtype is ``object`` (and the column named ``'Gender'``) are cast to
    strings and integer-encoded with a ``LabelEncoder``; the ``'class'``
    column is label-encoded as the target. The data is then split 80/20,
    stratified on the encoded target, with ``random_state=42``.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, feature_names)`` where the
        feature arrays come from ``DataFrame.values`` and ``feature_names``
        is the list of feature column names in file order.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder

    frame = pd.read_csv(file_path)

    # Everything but the target column is a feature.
    feature_names = [name for name in frame.columns if name != 'class']
    features = frame[feature_names].copy()
    target = frame['class'].copy()

    # Decide up front which columns need integer encoding, then encode them.
    to_encode = [
        name for name in features.columns
        if features[name].dtype == 'object' or name == 'Gender'
    ]
    # The fitted encoders are kept per column but are not part of the return value.
    label_encoders = {}
    for name in to_encode:
        encoder = LabelEncoder()
        features[name] = encoder.fit_transform(features[name].astype(str))
        label_encoders[name] = encoder

    # Integer-encode the target labels.
    y_encoded = LabelEncoder().fit_transform(target)

    # Stratified 80/20 split; train_test_split yields (X_train, X_test, y_train, y_test).
    splits = train_test_split(
        features.values,
        y_encoded,
        test_size=0.2,
        random_state=42,
        stratify=y_encoded,
    )
    x_train, x_test, y_train, y_test = splits

    return x_train, y_train, x_test, y_test, feature_names

def train_tf_model(x_train, y_train, x_test, y_test):
    """Train a classifier for diabetes prediction.

    A small Keras feed-forward network is trained when TensorFlow can be
    imported; otherwise an XGBoost classifier is trained instead. Only the
    TensorFlow imports are guarded by the ``ImportError`` handler, so an
    error raised while building, fitting or evaluating the network
    propagates to the caller instead of silently triggering the fallback.

    Args:
        x_train: Training features; ``x_train.shape[1]`` is used as the
            number of network inputs.
        y_train: Training labels. The number of distinct values decides
            between a binary (sigmoid) and a multi-class (softmax) head.
        x_test: Held-out features, used for validation and evaluation.
        y_test: Held-out labels.

    Returns:
        The fitted Keras ``Sequential`` model, or the fitted
        ``xgboost.XGBClassifier`` when TensorFlow is not importable.
    """
    try:
        import tensorflow as tf  # noqa: F401 -- imported to probe availability
        from tensorflow.keras import layers, models
        from tensorflow.keras.utils import to_categorical
    except ImportError:
        print("TensorFlow not available, using XGBoost instead")
        import xgboost as xgb

        # Use XGBoost as fallback
        model = xgb.XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
        model.fit(x_train, y_train)

        train_acc = model.score(x_train, y_train)
        test_acc = model.score(x_test, y_test)

        print(f"Train accuracy: {train_acc:.4f}")
        print(f"Test accuracy:  {test_acc:.4f}")

        return model

    num_classes = len(np.unique(y_train))
    is_binary = num_classes == 2

    if is_binary:
        # Binary classification: a single sigmoid output needs column-vector targets.
        y_train_cat = y_train.reshape(-1, 1)
        y_test_cat = y_test.reshape(-1, 1)
    else:
        # Multi-class classification: one-hot targets for the softmax output.
        y_train_cat = to_categorical(y_train, num_classes)
        y_test_cat = to_categorical(y_test, num_classes)

    # Create the model
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1 if is_binary else num_classes,
                     activation='sigmoid' if is_binary else 'softmax')
    ])

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy' if is_binary else 'categorical_crossentropy',
        metrics=['accuracy']
    )

    # Train the model; the held-out split doubles as validation data here.
    model.fit(
        x_train, y_train_cat,
        epochs=100,
        batch_size=32,
        validation_data=(x_test, y_test_cat),
        verbose=0
    )

    # Evaluate the model
    train_loss, train_acc = model.evaluate(x_train, y_train_cat, verbose=0)
    test_loss, test_acc = model.evaluate(x_test, y_test_cat, verbose=0)

    print(f"Train loss: {train_loss:.4f}, train accuracy: {train_acc:.4f}")
    print(f"Test loss:  {test_loss:.4f}, test accuracy:  {test_acc:.4f}")

    return model

In [4]:
file_path = '../data/diabetes.csv'

# Load the encoded diabetes splits and report their shapes.
x_train, y_train, x_test, y_test, feature_names = diabetes_data(file_path)
print(f'x_train shape: {x_train.shape}')
print(f'x_test shape:  {x_test.shape}')

# Fit the classifier (Keras if TensorFlow is importable, otherwise XGBoost).
model = train_tf_model(x_train, y_train, x_test, y_test)

# Used for initializing the explainer
tabular_data = Tabular(x_train, feature_columns=feature_names)

x_train shape: (416, 16)
x_test shape:  (104, 16)
TensorFlow not available, using XGBoost instead
Train accuracy: 1.0000
Test accuracy:  0.9808


The dataset used in this example is for income prediction (https://archive.ics.uci.edu/ml/datasets/adult). We recommend using `Tabular` to represent a tabular dataset, which can be constructed from a pandas dataframe or a numpy array. To create a `Tabular` instance from a numpy array, one needs to specify the data, the feature names, the categorical feature names (if any) and the target/label column name (if any).

In [5]:
# Column names of the Adult data file, in file order; the last one is the target.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]

# Read every field as a string; fields in the file are separated by ", ".
data = np.genfromtxt(
    os.path.join('../data', 'adult.data'),
    delimiter=', ',
    dtype=str,
)

# Positions of the categorical columns within `feature_names`.
tabular_data = Tabular(
    data,
    feature_columns=feature_names,
    categorical_columns=[
        name
        for position, name in enumerate(feature_names)
        if position in (1, 3, 5, 6, 7, 8, 9, 13)
    ],
    target_column='label',
)
print(tabular_data)

      Age         Workclass  fnlwgt   Education Education-Num  \
0      39         State-gov   77516   Bachelors            13   
1      50  Self-emp-not-inc   83311   Bachelors            13   
2      38           Private  215646     HS-grad             9   
3      53           Private  234721        11th             7   
4      28           Private  338409   Bachelors            13   
...    ..               ...     ...         ...           ...   
32556  27           Private  257302  Assoc-acdm            12   
32557  40           Private  154374     HS-grad             9   
32558  58           Private  151910     HS-grad             9   
32559  22           Private  201490     HS-grad             9   
32560  52      Self-emp-inc  287927     HS-grad             9   

           Marital Status         Occupation   Relationship   Race     Sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-managerial        Husband  Wh

`TabularTransform` is a special transform designed for tabular data. By default, it converts categorical features into one-hot encoding, and keeps continuous-valued features (if one wants to normalize continuous-valued features, set the parameter `cont_transform` in `TabularTransform` to `Standard` or `MinMax`). The `transform` method of `TabularTransform` will transform a `Tabular` instance into a numpy array. If the `Tabular` instance has a target/label column, the last column of the transformed numpy array will be the target/label. 

If one wants some other transformations that are not supported in the library, one can simply convert the `Tabular` instance into a pandas dataframe by calling `Tabular.to_pd()` and try different transformations with it.

After data preprocessing, we can train an XGBoost classifier for this task (one may try other classifiers). 

In [6]:
# Seed NumPy's global RNG: train_test_split is called without random_state and
# therefore draws from it, so the seed keeps the split reproducible.
np.random.seed(1)
transformer = TabularTransform().fit(tabular_data)
class_names = transformer.class_names
x = transformer.transform(tabular_data)

# The last column of the transformed array holds the label; the rest are features.
# train_test_split / accuracy_score are imported explicitly in the import cell
# rather than reached through `sklearn.<submodule>`, which a bare `import sklearn`
# does not guarantee to have loaded.
train, test, labels_train, labels_test = train_test_split(
    x[:, :-1], x[:, -1], train_size=0.80
)
print('Training data shape: {}'.format(train.shape))
print('Test data shape:     {}'.format(test.shape))

gbtree = xgboost.XGBClassifier(n_estimators=300, max_depth=5)
gbtree.fit(train, labels_train)
print('Test accuracy: {}'.format(
    accuracy_score(labels_test, gbtree.predict(test))))

Training data shape: (26048, 108)
Test data shape:     (6513, 108)
Test accuracy: 0.865806847842776


The prediction function takes a `Tabular` instance as its inputs, and outputs the class probabilities for classification tasks or the estimated values for regression tasks. In this example, we simply call `transformer.transform` to do data preprocessing followed by the prediction function of `gbtree`.

In [7]:
def predict_function(z):
    """Return `gbtree` class probabilities for `z`.

    `z` is first passed through `transformer.transform`, and the result is
    handed to `gbtree.predict_proba`.
    """
    return gbtree.predict_proba(transformer.transform(z))

To initialize the LLM-based explainer (`LLMExplainer`), we need to set:
  
  - `training_data`: The data used to initialize a SHAP explainer. `training_data` can be the training dataset for training the machine learning model. If the training dataset is too large, `training_data` can be a subset of it by applying `omnixai.sampler.tabular.Sampler.subsample`.
  - `predict_function`: The prediction function corresponding to the model.
  - `mode`: The task type, e.g., "classification" or "regression". It is not passed in the cell below, so the explainer's default is used.
  - `indoxrouter_apikey`: The IndoxRouter API key used to call the language model.
  - `indoxrouter_model`: The model to query through IndoxRouter, e.g., "openai/gpt-4o-mini".

In [8]:
# Read the IndoxRouter API key from the environment instead of embedding it in
# the notebook, so the secret is never saved or committed with the file.
indoxrouter_apikey = os.environ.get("INDOXROUTER_API_KEY")
if not indoxrouter_apikey:
    raise RuntimeError(
        "Set the INDOXROUTER_API_KEY environment variable before running this cell."
    )

explainer = LLMExplainer(
    training_data=tabular_data,
    predict_function=predict_function,
    indoxrouter_apikey=indoxrouter_apikey,
    indoxrouter_model="openai/gpt-4o-mini"
)
# Apply an inverse transform, i.e., converting the numpy array back to `Tabular`
test_instances = transformer.invert(test)
# Single test instance to explain below.
test_x = test_instances[1653]

Using 150 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


We are now ready to generate explanations:

In [9]:
# Generate the explanation for the selected instance and print its text.
explanations = explainer.explain(test_x)
first_explanation = explanations.get_explanations(index=0)
print(first_explanation["text"])

  0%|          | 0/1 [00:00<?, ?it/s]

This example is classified as label_1 primarily due to the high feature importance of "Capital Gain = 15024.0", which has the highest score of 7.0389. This indicates that capital gain is a strong predictor for the positive label. Other contributing factors include age, education level, work class, and marital status, all of which have positive feature importance scores, suggesting they also contribute to the prediction of label_1.

To change the predicted label from label_1 to label_0, you can adjust the feature values as follows: set "Capital Gain" to "3756.0" and "Capital Loss" to "125.125". This modification alters the input in a way that the model predicts label_0 instead of label_1.
