In [5]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
import shap  # Assuming SHAP is compatible with your model
from x_driving_env.envs.env import XDrivingEnv
class ExplainableRLAgent:
    def __init__(self, environment):
        self.env = environment
        self.model = self.build_model()
        self.explainer = None  # Initialize SHAP explainer later

    def build_model(self):
        # Create a vectorized environment
        vec_env = make_vec_env(lambda: self.env, n_envs=1)

        # Set up the DQN model
        model = PPO("MlpPolicy", vec_env, verbose=1)
        return model

    def train(self, total_timesteps):
        self.model.learn(total_timesteps=total_timesteps)

        # After training, you can initialize the SHAP explainer
        # Note: This is a placeholder, as setting up SHAP with DQN might require additional steps
        self.explainer = shap.Explainer(self.model.predict, self.env.observation_space)

    def explain_action(self, state):
        # Generate SHAP values for the given state
        shap_values = self.explainer.shap_values(state)
        return shap_values

# Usage
environment = XDrivingEnv(bumps_activated=True)  # Initialize your environment
agent = ExplainableRLAgent(environment)
agent.train(total_timesteps=10)

# Evaluate the policy
mean_reward, std_reward = evaluate_policy(agent.model, agent.env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")

# Example of getting an explanation
state = environment.reset()
explanation = agent.explain_action(state)
print("SHAP Explanation:", explanation)

You provided an OpenAI Gym environment. We strongly recommend transitioning to Gymnasium environments. Stable-Baselines3 is automatically wrapping your environments in a compatibility layer, which could potentially cause issues.


Using cuda device
-----------------------------
| time/              |      |
|    fps             | 29   |
|    iterations      | 1    |
|    time_elapsed    | 69   |
|    total_timesteps | 2048 |
-----------------------------


Evaluation environment is not wrapped with a ``Monitor`` wrapper. This may result in reporting modified episode lengths and rewards, if other wrappers happen to modify these. Consider wrapping environment first with ``Monitor`` wrapper.


Mean reward: -2.500618118152488, Std reward: 0.0


IndexError: tuple index out of range