In [1]:
import sys
import os
import shap
import numpy as np
import pandas as pd
import matplotlib

# Use a serif typeface for every matplotlib figure
matplotlib.rc('font', family=['serif'])

# Put the parent directory on sys.path so the `src` imports below resolve
# instead of raising 'ModuleNotFoundError'
sys.path.append(os.path.abspath(os.pardir))

# Import custom modules
from src.preprocessing import *
from src.visualization import *
from src.trainer import *
from src.helpers import *
from src.config import *
from src.paths import paths

In [2]:
# Load the per-fold feature tables; the targets are reduced to the TARGET
# column as NumPy arrays right after reading
X_train_list = [pd.read_csv(csv_path) for csv_path in paths.get('X_train_paths')]
y_train_list = [
    pd.read_csv(csv_path)[TARGET].to_numpy()
    for csv_path in paths.get('y_train_paths')
]

X_val_list = [pd.read_csv(csv_path) for csv_path in paths.get('X_val_paths')]
y_val_list = [
    pd.read_csv(csv_path)[TARGET].to_numpy()
    for csv_path in paths.get('y_val_paths')
]

# Column names are taken from the first training fold
feature_names = X_train_list[0].columns.tolist()

# Concatenate y_val_list into a single list
y_val_agg = concatenate_lists(y_val_list)

# Load the trained models of the 5 folds for every model name
models = {}
for model_name in MODEL_NAMES:
    models[model_name] = [load_model_fold(fold, model_name) for fold in range(5)]

In [3]:
class ShapValuesCalculator:
    """Compute SHAP values for the trained models of every fold.

    Each ``calculate_*`` method builds one kind of SHAP explainer and returns
    a ``(shap_values, interaction_values)`` pair converted to nested Python
    lists; ``interaction_values`` is ``None`` for the explainers that are not
    asked for interaction values here.

    Args:
        trained_models: Mapping ``model_name -> sequence of fitted models``,
            indexed by fold.
    """

    # Model names that are explained with shap.TreeExplainer.
    TREE_BASED_MODELS = ('catboost', 'xgboost', 'lgbm', 'rf')

    def __init__(self, trained_models):
        self.trained_models = trained_models

    def calculate_shap_values_tree_based(self, model_name, fold_index, X_val):
        """Explain one fold of a tree-based model with ``shap.TreeExplainer``.

        Args:
            model_name: Key into ``trained_models``.
            fold_index: Index of the fold whose model is explained.
            X_val: Validation samples to explain.

        Returns:
            Tuple ``(shap_values, interaction_values)`` as nested lists.
        """
        explainer = shap.TreeExplainer(
            model=self.trained_models[model_name][fold_index],
            model_output='raw',
            feature_perturbation='tree_path_dependent',
        )
        shap_values = explainer.shap_values(X_val)
        interaction_values = explainer.shap_interaction_values(X_val)

        # SHAP values for Random Forest will be returned for both class 0 and 1.
        # To make all shapes consistent, only extract SHAP values for class 1
        # (index 1 of the last axis).
        if model_name == 'rf':
            shap_values = shap_values[..., 1]
            interaction_values = interaction_values[..., 1]

        return shap_values.tolist(), interaction_values.tolist()

    def calculate_shap_values_linear(self, model_name, fold_index, X_train, X_val):
        """Explain one fold of a linear model with ``shap.LinearExplainer``.

        Args:
            model_name: Key into ``trained_models``.
            fold_index: Index of the fold whose model is explained.
            X_train: Training samples, used as the explainer's masker.
            X_val: Validation samples to explain.

        Returns:
            Tuple ``(shap_values, None)``; no interaction values are computed.
        """
        explainer = shap.LinearExplainer(
            model=self.trained_models[model_name][fold_index],
            masker=X_train,
            model_output='raw',
            feature_perturbation='correlation_dependent',
        )
        shap_values = explainer.shap_values(X_val)
        return shap_values.tolist(), None

    def calculate_shap_values_svm(self, model_name, fold_index, X_train, X_val):
        """Explain one fold of the SVM with the model-agnostic ``shap.KernelExplainer``.

        The explained function is the second column of ``predict_proba``.

        Args:
            model_name: Key into ``trained_models``.
            fold_index: Index of the fold whose model is explained.
            X_train: Training samples, used as the explainer's background data.
            X_val: Validation samples to explain.

        Returns:
            Tuple ``(shap_values, None)``; no interaction values are computed.
        """
        model = self.trained_models[model_name][fold_index]

        def positive_class_probability(x):
            return model.predict_proba(x)[:, 1]

        explainer = shap.KernelExplainer(
            model=positive_class_probability,
            data=X_train,
        )
        # `nsamples` is a parameter of `shap_values()`; pass it there rather
        # than to the constructor.
        shap_values = explainer.shap_values(X_val, nsamples='auto')
        return shap_values.tolist(), None

    def _explain_fold(self, model_name, fold_index, X_train, X_val):
        """Dispatch to the matching ``calculate_*`` method; ``None`` for unknown names."""
        if model_name in self.TREE_BASED_MODELS:
            return self.calculate_shap_values_tree_based(model_name, fold_index, X_val)
        if model_name == 'lr':
            return self.calculate_shap_values_linear(model_name, fold_index, X_train, X_val)
        if model_name == 'svm':
            return self.calculate_shap_values_svm(model_name, fold_index, X_train, X_val)
        return None

    def process_and_save_shap_values(self, X_train_list, X_val_list, save_paths):
        """Compute SHAP values for every model in ``MODEL_NAMES`` and save them.

        For each model the per-fold results are collected into one list (one
        entry per fold) and written with ``save_json`` to
        ``save_paths[model_name]['sv']`` (SHAP values) and
        ``save_paths[model_name]['iv']`` (interaction values). A file is only
        written when its list is non-empty; model names without a matching
        explainer are skipped.

        Args:
            X_train_list: Training samples, one entry per fold.
            X_val_list: Validation samples, one entry per fold.
            save_paths: Mapping ``model_name -> {'sv': path, 'iv': path}``.
        """
        for model_name in MODEL_NAMES:
            shap_values_list = []
            interaction_values_list = []

            for fold_index, (X_train, X_val) in enumerate(zip(X_train_list, X_val_list)):
                result = self._explain_fold(model_name, fold_index, X_train, X_val)
                if result is None:
                    # No explainer is configured for this model name.
                    continue

                shap_values, interaction_values = result
                if shap_values is not None:
                    shap_values_list.append(shap_values)
                if interaction_values is not None:
                    interaction_values_list.append(interaction_values)

            if shap_values_list:
                save_json(save_paths[model_name]['sv'], shap_values_list)
            if interaction_values_list:
                save_json(save_paths[model_name]['iv'], interaction_values_list)


In [4]:
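# One-off step: uncomment to (re)compute the SHAP values and write them to the
# JSON files that the next cell loads.
# shap_calculator = ShapValuesCalculator(models)
# shap_calculator.process_and_save_shap_values(X_train_list, X_val_list, paths)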

In [5]:
# Containers for the loaded SHAP values and SHAP interaction values,
# keyed by model name
shap_values = dict()
shap_interaction_values = dict()

def get_mean_absolute_shap_values(values_by_folds):
    """Return the mean absolute value of the values pooled over all folds.

    The per-fold entries are merged with ``concatenate_lists`` and the
    absolute values are then averaged along axis 0.
    """
    pooled_magnitudes = np.abs(concatenate_lists(values_by_folds))
    return pooled_magnitudes.mean(axis=0)

for model_name in MODEL_NAMES:
    model_paths = paths[model_name]

    # SHAP values are loaded for every model
    sv_by_folds = load_json(model_paths['sv'])
    shap_values[model_name] = get_mean_absolute_shap_values(sv_by_folds)

    # Interaction values are loaded only when their file exists on disk
    if not os.path.isfile(model_paths['iv']):
        continue
    iv_by_folds = load_json(model_paths['iv'])
    shap_interaction_values[model_name] = get_mean_absolute_shap_values(iv_by_folds)

# Show which models ended up in each dictionary
for loaded in (shap_values, shap_interaction_values):
    print(loaded.keys())

dict_keys(['catboost', 'xgboost', 'lgbm', 'rf', 'svm', 'lr'])
dict_keys(['catboost', 'xgboost', 'lgbm', 'rf'])
