### Working 

In [3]:
import gym
import gym_sokoban
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import EvalCallback

# Reward shaping: this wrapper adjusts the reward returned by the wrapped environment.
class CustomSokobanEnv(gym.Wrapper):
    """Reward-shaping wrapper for a Sokoban environment.

    On every step the wrapped environment's reward is reduced by
    ``step_penalty``. When the episode ends and the ``info`` dict returned by
    the wrapped environment carries a truthy ``'all_boxes_on_target'`` entry,
    ``solve_bonus`` is added on top.

    Args:
        env: Environment to wrap. Its ``step`` must return the 4-tuple
            ``(obs, reward, done, info)`` with ``info`` being a dict.
        step_penalty: Amount subtracted from the reward on every step.
            Defaults to 0.01.
        solve_bonus: Amount added on the final step of a solved episode.
            Defaults to 10.0.
    """

    def __init__(self, env, step_penalty=0.01, solve_bonus=10.0):
        # gym.Wrapper.__init__ already stores the wrapped env as ``self.env``,
        # so no extra assignment is needed here.
        super().__init__(env)
        self.step_penalty = step_penalty
        self.solve_bonus = solve_bonus

    def step(self, action):
        """Step the wrapped environment and return the shaped transition.

        Args:
            action: Action forwarded unchanged to the wrapped environment.

        Returns:
            The tuple ``(obs, reward, done, info)`` from the wrapped
            environment, with only ``reward`` modified.
        """
        obs, reward, done, info = self.env.step(action)
        # Constant per-step cost, to encourage shorter solutions.
        reward -= self.step_penalty
        # ``get`` covers both "key missing" and "key present but falsy".
        if done and info.get('all_boxes_on_target'):
            reward += self.solve_bonus
        return obs, reward, done, info

# Create the small Sokoban environment and apply the reward-shaping wrapper.
env = CustomSokobanEnv(gym.make('Sokoban-small-v1'))

# DQN hyper-parameters, grouped by the part of the algorithm they configure.
dqn_settings = dict(
    verbose=1,
    # Optimisation
    learning_rate=1e-3,
    gamma=0.99,
    max_grad_norm=10,
    # Replay buffer / update schedule
    buffer_size=50000,
    learning_starts=1000,
    batch_size=32,
    train_freq=4,
    gradient_steps=1,
    # Target network
    tau=0.1,
    target_update_interval=500,
    # Epsilon schedule
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
)
model = DQN("MlpPolicy", env, **dqn_settings)

# Evaluation callback; both the best model and the evaluation logs are
# written to the same directory. Note that it evaluates on the same `env`
# instance that the model above was built with.
log_dir = './logs/'
eval_callback = EvalCallback(
    env,
    best_model_save_path=log_dir,
    log_path=log_dir,
    eval_freq=10000,
    deterministic=True,
    render=False,
)

# Training and saving are currently disabled; uncomment to retrain and
# overwrite the saved model before it is loaded below.
# model.learn(total_timesteps=100000, callback=eval_callback)
# model.save("dqn_sokoban_2")

# Restore the previously saved agent from disk.
loaded_model = DQN.load("dqn_sokoban_2")

# Roll out one episode with the loaded model (deterministic=True), printing
# every transition until the environment reports `done`.
obs = env.reset()
done = False
while not done:
    action, _states = loaded_model.predict(obs, deterministic=True)
    action = int(action)  # hand the environment a plain Python int
    obs, reward, done, info = env.step(action)
    # The frame is captured every step but not used further in this cell.
    image = env.render(mode='rgb_array')

    print(f"Action: {action}, Reward: {reward}, Done: {done}, Info: {info}")

print("Episode finished.")



[[0 0 0 0 0 0 0]
 [0 1 1 2 0 0 0]
 [0 1 1 1 0 0 0]
 [0 1 1 1 0 0 0]
 [0 1 1 2 2 0 0]
 [0 1 1 1 1 0 0]
 [0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0]
 [0 1 1 2 0 0 0]
 [0 1 4 1 0 0 0]
 [0 5 4 1 0 0 0]
 [0 1 4 2 2 0 0]
 [0 1 1 1 1 0 0]
 [0 0 0 0 0 0 0]] {(1, 3): (3, 2), (4, 3): (4, 2), (4, 4): (2, 2)}
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
[[0 0 0 0 0 0 0]
 [0 1 1 2 0 0 0]
 [0 1 1 1 0 0 0]
 [0 1 1 1 0 0 0]
 [0 1 1 2 2 0 0]
 [0 1 1 1 1 0 0]
 [0 0 0 0 0 0 0]] [[0 0 0 0 0 0 0]
 [0 1 1 2 0 0 0]
 [0 1 4 1 0 0 0]
 [0 5 4 1 0 0 0]
 [0 1 4 2 2 0 0]
 [0 1 1 1 1 0 0]
 [0 0 0 0 0 0 0]] {(1, 3): (3, 2), (4, 3): (4, 2), (4, 4): (2, 2)}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}


  logger.warn(


Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', 'action.moved_player': False, 'action.moved_box': False}
Action: 8, Reward: -0.11, Done: False, Info: {'action.name': 'move right', '