### Working 

In [2]:
import gym
import gym_sokoban
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.buffers import ReplayBuffer

# Custom reward function can be added in the environment wrapper
class CustomSokobanEnv(gym.Wrapper):
    def __init__(self, env):
        super(CustomSokobanEnv, self).__init__(env)
        self.env = env

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        # Modify the reward here
        # Example: Penalize each step to encourage faster solutions
        reward -= 0.01
        if done and 'all_boxes_on_target' in info and info['all_boxes_on_target']:
            reward += 10.0  # Large reward for solving the puzzle
        return obs, reward, done, info

class PrioritizedReplayBuffer(ReplayBuffer):
    def __init__(self, buffer_size, observation_space, action_space, alpha=0.6):
        super(PrioritizedReplayBuffer, self).__init__(buffer_size, observation_space, action_space)
        self.alpha = alpha
        self.priorities = np.zeros((buffer_size,), dtype=np.float32)

    def add(self, obs, next_obs, action, reward, done):
        max_prio = self.priorities.max() if self.buffer_size > 0 else 1.0
        super().add(obs, next_obs, action, reward, done)
        self.priorities[self.pos - 1] = max_prio

    def sample(self, batch_size, beta=0.4):
        if self.buffer_size == self.size:
            prios = self.priorities
        else:
            prios = self.priorities[:self.pos]

        probs = prios ** self.alpha
        probs /= probs.sum()

        indices = np.random.choice(self.size, batch_size, p=probs)
        samples = super().sample(batch_size)
        total = self.size
        weights = (total * probs[indices]) ** (-beta)
        weights /= weights.max()
        samples['weights'] = weights
        samples['indices'] = indices
        return samples

    def update_priorities(self, batch_indices, batch_priorities):
        self.priorities[batch_indices] = batch_priorities


env = gym.make('Sokoban-small-v1')
env = CustomSokobanEnv(env)

model = DQN("MlpPolicy", env, verbose=1, buffer_size=50000, replay_buffer_class=PrioritizedReplayBuffer)

# Train the model
model.learn(total_timesteps=1000000)

# Save the model
model.save("dqn_sokoban_buffer")

# # Load the trained model
# loaded_model = DQN.load("dqn_sokoban_buffer")

# # Evaluate the trained model
# obs = env.reset()
# while True:
#     action, _states = loaded_model.predict(obs, deterministic=True)
#     action = int(action)
#     obs, reward, done, info = env.step(action)
#     image = env.render(mode='rgb_array')
    
#     print(f"Action: {action}, Reward: {reward}, Done: {done}, Info: {info}")
    
#     if done:
#         print("Episode finished.")
#         break


[[0 0 0 0 0 0 0]
 [0 1 1 2 0 0 0]
 [0 1 1 1 0 0 0]
 [0 1 1 1 0 0 0]
 [0 1 1 2 2 0 0]
 [0 1 1 1 1 0 0]
 [0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0]
 [0 1 1 2 0 0 0]
 [0 1 4 1 0 0 0]
 [0 5 4 1 0 0 0]
 [0 1 4 2 2 0 0]
 [0 1 1 1 1 0 0]
 [0 0 0 0 0 0 0]] {(1, 3): (3, 2), (4, 3): (4, 2), (4, 4): (2, 2)}


NameError: name 'CustomSokobanEnv' is not defined