In [2]:
import xgboost as xgb
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time

Path('xgb_shap_results').mkdir(exist_ok=True)

model = xgb.XGBClassifier()
model.load_model('xgb_model/model.json')

# Load feature information
import pickle
with open('xgb_model/feature_info.pkl', 'rb') as f:
    feature_info = pickle.load(f)

feature_names = feature_info['feature_names']
target_column = feature_info['target_column']
print(f"Feature info loaded: {len(feature_names)} features, target: {target_column}")

X_test = pd.read_csv('xgb_data/X_test_sample.csv')
y_test = pd.read_csv('xgb_data/y_test_sample.csv').iloc[:, 0]
print(f"Test data loaded: {X_test.shape[0]} rows, {X_test.shape[1]} columns")

print("\nCalculating SHAP values")
start_time = time.time()

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

is_multiclass = isinstance(shap_values, list)
if is_multiclass:
    print(f"Multi-class model: {len(shap_values)} classes")
    positive_class_idx = 1
    shap_values_display = shap_values[positive_class_idx]
else:
    print("Binary classification model")
    shap_values_display = shap_values

print(f"SHAP values calculated in {time.time() - start_time:.2f} seconds")

# Feature importance plot
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_display, X_test, plot_type="bar", show=False)
plt.title("XGBoost SHAP Feature Importance")
plt.tight_layout()
plt.savefig('xgb_shap_results/feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()

# Summary plot (impact and direction)
plt.figure(figsize=(12, 10))
shap.summary_plot(shap_values_display, X_test, show=False)
plt.title("XGBoost SHAP Summary")
plt.tight_layout()
plt.savefig('xgb_shap_results/summary_plot.png', dpi=300, bbox_inches='tight')
plt.close()

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': np.abs(shap_values_display).mean(0),
    'direction': np.where(shap_values_display.mean(0) > 0, 'positive', 'negative')
}).sort_values('importance', ascending=False)

importance_df.to_csv('xgb_shap_results/feature_importance.csv', index=False)

print(importance_df.head(10))

# Dependence plots for top features
top_features = importance_df.head(5)['feature'].values
for feature in top_features:
    plt.figure(figsize=(10, 6))
    shap.dependence_plot(feature, shap_values_display, X_test, show=False)
    plt.title(f"SHAP Dependence: {feature}")
    plt.tight_layout()
    plt.savefig(f'xgb_shap_results/dependence_{feature}.png', dpi=300, bbox_inches='tight')
    plt.close()

# Waterfall plot
try:
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    high_prob_idx = np.argsort(y_pred_proba)[-1]
    
    plt.figure(figsize=(12, 8))
    shap.plots.waterfall(shap.Explanation(
        values=shap_values_display[high_prob_idx],
        base_values=explainer.expected_value if not is_multiclass else explainer.expected_value[positive_class_idx],
        data=X_test.iloc[high_prob_idx].values,
        feature_names=feature_names
    ), show=False)
    plt.title("XGBoost Prediction Explanation")
    plt.tight_layout()
    plt.savefig('xgb_shap_results/example_explanation.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # Force plot for the same instance
    plt.figure(figsize=(20, 3))
    shap.force_plot(
        explainer.expected_value if not is_multiclass else explainer.expected_value[positive_class_idx],
        shap_values_display[high_prob_idx],
        X_test.iloc[high_prob_idx],
        matplotlib=True,
        show=False
    )
    plt.title("Force Plot Explanation")
    plt.tight_layout()
    plt.savefig('xgb_shap_results/force_plot.png', dpi=300, bbox_inches='tight')
    plt.close()
except Exception as e:
    print(f"Could not create example explanation: {e}")

XGBoostError: [17:43:58] /Users/runner/work/xgboost/xgboost/src/common/io.cc:146: Opening xgb_model/model.json failed: No such file or directory
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001613ed8e5 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x0000000161530433 xgboost::common::LoadSequentialFile(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>) + 723
  [bt] (2) 3   libxgboost.dylib                    0x00000001614bc631 XGBoosterLoadModel::$_4::operator()() const + 129
  [bt] (3) 4   libxgboost.dylib                    0x00000001614bc1dc XGBoosterLoadModel + 332
  [bt] (4) 5   libffi.8.dylib                      0x00000001047dd972 ffi_call_unix64 + 82
  [bt] (5) 6   ???                                 0x00007ff7bc259370 0x0 + 140701990228848

