In [1]:
import sys
import os

# Make the project's `src` package importable by appending the parent directory
# to sys.path (avoids 'ModuleNotFoundError' on the `from src...` imports below).
# NOTE: '..' is resolved against the current working directory, so this assumes
# the kernel was started from the folder that contains this notebook.
sys.path.append(os.path.abspath(os.path.join('..')))

import shap
import numpy as np
import pandas as pd

import matplotlib
# Render all matplotlib text with a serif font family
matplotlib.rcParams["font.family"] = ['serif']

# NOTE(review): the wildcard imports hide where names come from. Names used
# later in this notebook (e.g. TARGET, MODEL_NAMES, load_model_fold, save_json,
# load_json) are presumably provided by these modules -- confirm, and consider
# importing them explicitly.
from src.preprocessing import *
from src.visualization import *
from src.trainer import *
from src.helpers import *
from src.config import *
from src.paths import paths

In [2]:
# Load the per-fold feature/target CSV files (one file per fold in each list)
X_train_list = [pd.read_csv(path) for path in paths.get('X_train_paths')]
y_train_list = [pd.read_csv(path) for path in paths.get('y_train_paths')]
X_val_list   = [pd.read_csv(path) for path in paths.get('X_val_paths')]
y_val_list   = [pd.read_csv(path) for path in paths.get('y_val_paths')]

# Derive the number of folds from the data instead of hard-coding it, and fail
# early if the four path lists do not describe the same folds.
n_folds = len(X_train_list)
assert len(y_train_list) == len(X_val_list) == len(y_val_list) == n_folds, \
    "X/y train/val path lists must contain exactly one entry per fold"

# Keep only the TARGET column(s) of each target frame, as NumPy arrays
y_train_list = [y_train[TARGET].to_numpy() for y_train in y_train_list]
y_val_list   = [y_val[TARGET].to_numpy() for y_val in y_val_list]

# Feature names are taken from the first fold's training frame
feature_names = X_train_list[0].columns.to_list()

# Concatenate the validation targets of all folds into a single list
y_val_agg = [label for y_val in y_val_list for label in y_val]

# Load the trained model of every fold: models[model_name][fold]
models = {
    model_name: [load_model_fold(fold, model_name) for fold in range(n_folds)]
    for model_name in MODEL_NAMES
}

In [None]:
# Compute SHAP values (and, for tree-based models, SHAP interaction values) for
# each model on every fold's validation set, then save them as JSON files.
# Results are stored fold by fold (one list entry per fold), which is the layout
# the loading cell expects when it flattens the folds into a single list.
for model_name in MODEL_NAMES:
    sv_by_fold = []
    iv_by_fold = []

    for fold, (X_train, X_val) in enumerate(zip(X_train_list, X_val_list)):
        # Stay None when the corresponding quantity is not computed for this model
        sv = None
        iv = None

        # Tree-based models
        if model_name in ['catboost', 'xgboost', 'lgbm', 'rf']:
            explainer = shap.TreeExplainer(
                model=models[model_name][fold],
                model_output='raw',
                feature_perturbation='tree_path_dependent',
            )
            sv = explainer.shap_values(X_val)
            iv = explainer.shap_interaction_values(X_val)

            if model_name == 'rf':
                # For rf the values are returned for both class 0 and class 1,
                # which adds a trailing dimension, e.g. (55, 20) -> (55, 20, 2).
                # Keep only class 1 so every model yields the same shapes.
                sv = sv[..., 1]
                iv = iv[..., 1]

        # Logistic Regression
        elif model_name == 'lr':
            explainer = shap.LinearExplainer(
                model=models[model_name][fold],
                masker=X_train,
                model_output='raw',
                feature_perturbation='correlation_dependent',
            )
            sv = explainer.shap_values(X_val)
            # LinearExplainer does not support interaction values

        # SVM
        # elif model_name == 'svm':
        #     model = models[model_name][fold]
        #     f = lambda x: model.predict_proba(x)[:,1]
        #     explainer = shap.KernelExplainer(
        #         model=f,
        #         data=X_train,
        #         nsamples='auto'
        #     )
        #     sv = explainer.shap_values(X_val)
        #     # KernelExplainer does not support interaction values

        # Collect this fold's results (previously the 'lr' values were never stored)
        if sv is not None:
            sv_by_fold.append(sv.tolist())
        if iv is not None:
            iv_by_fold.append(iv.tolist())

    # Save SHAP values for this model to a JSON file. When nothing was computed
    # (no explainer branch for this model), do not overwrite an existing file
    # with an empty list; an empty file is still written if none exists yet.
    sv_path = paths[model_name]['sv']
    if sv_by_fold or not os.path.isfile(sv_path):
        save_json(sv_path, sv_by_fold)
    else:
        print(f"No SHAP values computed for '{model_name}'; keeping existing file.")

    # Interaction values are only saved for models that support them
    if len(iv_by_fold) > 0:
        save_json(paths[model_name]['iv'], iv_by_fold)

In [17]:
# Read the saved SHAP outputs back from disk, one entry per model.
# Each JSON file is iterated fold by fold and flattened one level into a single list.
shap_values = {}
shap_interaction_values = {}

for model_name in MODEL_NAMES:
    # SHAP values: merge the per-fold entries into one list
    shap_values[model_name] = [
        row
        for fold_sv in load_json(paths[model_name]['sv'])
        for row in fold_sv
    ]

    # SHAP interaction values: only present on disk for some models
    iv_path = paths[model_name]['iv']
    if os.path.isfile(iv_path):
        shap_interaction_values[model_name] = [
            row
            for fold_iv in load_json(iv_path)
            for row in fold_iv
        ]

print(shap_values.keys())
print(shap_interaction_values.keys())

dict_keys(['catboost', 'xgboost', 'lgbm', 'rf', 'svm', 'lr'])
dict_keys(['catboost', 'xgboost', 'lgbm', 'rf'])
