In [1]:
import gym
import numpy as np

action_to_vector = {
    0: np.array([0, -1]),  # Up
    1: np.array([1, 0]),   # Right
    2: np.array([0, 1]),   # Down
    3: np.array([-1, 0])   # Left
}

class DotEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        self.width = 64
        self.height = 64
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.height, self.width, 3), dtype=np.uint8)
        self.action_space = gym.spaces.Discrete(4)
        self.dot_position = np.array([0, 0], dtype='int32')
        self.dot_color = np.array([255, 255, 255])
        self.goal_position = np.array([self.width/2, self.height/2], dtype='int32')
        self.viewer = None
        self.reward_range = (-float('inf'), float('inf'))
        self.max_episode_steps = 200
        self.current_step = 0

    def step(self, action):
        # Move the dot according to the action
        if action == 0:  # Up
            self.dot_position[1] = max(self.dot_position[1] - 1, 0)
        elif action == 1:  # Right
            self.dot_position[0] = min(self.dot_position[0] + 1, self.width - 2)
        elif action == 2:  # Down
            self.dot_position[1] = min(self.dot_position[1] + 1, self.height - 2)
        elif action == 3:  # Left
            self.dot_position[0] = max(self.dot_position[0] - 1, 0)

        # Compute the reward based on the new position of the dot
        distance = np.linalg.norm(self.dot_position - self.goal_position)
        reward = 1 if distance < np.linalg.norm(self.dot_position - action_to_vector[action] - self.goal_position) else -1

        # Update the observation
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color

        self.current_step += 1
        done = self.current_step >= self.max_episode_steps or all(self.dot_position == self.goal_position)
        info = {}

        return observation, reward, done, info
    
    def get_obs(self):
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color
        return observation

    def reset(self):
        self.dot_position = np.array([np.random.randint(0, self.width - 2), np.random.randint(0, self.height - 2)])
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color
        self.current_step = 0
        return observation

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.SimpleImageViewer()
        self.viewer.imshow(self.reset())


In [2]:
import gym
from stable_baselines3 import DQN

env = DotEnv()

model = DQN("MlpPolicy", env, verbose=1, gamma=0.0)
model.learn(total_timesteps=100000, log_interval=100)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 198      |
|    ep_rew_mean      | -4.8     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 100      |
|    fps              | 8074     |
|    time_elapsed     | 2        |
|    total_timesteps  | 19786    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | -1.9     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 200      |
|    fps              | 8133     |
|    time_elapsed     | 4        |
|    total_timesteps  | 39786    |
----------------------------------


KeyboardInterrupt: 

In [3]:
import gym
import numpy as np

class ContinuousDotEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        self.width = 64
        self.height = 64
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.height, self.width, 3), dtype=np.uint8)
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
        self.dot_position = np.array([0, 0], dtype='int32')
        self.dot_color = np.array([255, 255, 255])
        self.goal_position = np.array([self.width/2, self.height/2], dtype='int32')
        self.viewer = None
        self.reward_range = (-float('inf'), float('inf'))
        self.max_episode_steps = 200
        self.current_step = 0

    def step(self, action):
        # Move the dot according to the action
        action = np.clip(action, self.action_space.low, self.action_space.high)
        self.dot_position[0] = np.clip(self.dot_position[0] + action[0], 0, self.width - 2)
        self.dot_position[1] = np.clip(self.dot_position[1] + action[1], 0, self.height - 2)

        # Compute the reward based on the new position of the dot
        distance = np.linalg.norm(self.dot_position - self.goal_position)
        reward = 1 if distance < np.linalg.norm(self.dot_position - action - self.goal_position) else -1

        # Update the observation
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color

        self.current_step += 1
        done = self.current_step >= self.max_episode_steps or all(self.dot_position == self.goal_position)
        info = {}

        return observation, reward, done, info
    
    def get_obs(self):
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color
        return observation

    def reset(self):
        self.dot_position = np.array([np.random.randint(0, self.width - 2), np.random.randint(0, self.height - 2)])
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color
        self.current_step = 0
        return observation

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.SimpleImageViewer()
        self.viewer.imshow(self.reset())


In [4]:
import gym

from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

env = ContinuousDotEnv()

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG("MlpPolicy", env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=100000, log_interval=100)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


KeyboardInterrupt: 

In [5]:
class RGBDotEnv(gym.Env):
    """
    An environment with sparse rewards. 
    Three dots are scattered randomly around the screen,
    and the agent gets +1 reward for each one they touch in order.
    """
    metadata = {'render.modes': ['human']}
    colors = [
        np.array([255, 0, 0]),
        np.array([0, 255, 0]),
        np.array([0, 0, 255]),
    ]

    def __init__(self):
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
        self.action_space = gym.spaces.Discrete(4)
        self.dot_position = np.array([0, 0], dtype='int32')
        self.dot_color = np.array([255, 255, 255])
        self.viewer = None
        self.reward_range = (-float('inf'), float('inf'))
        self.current_step = 0
        self.colored_dots = []

    def step(self, action):
        # Move the dot according to the action
        if action == 0:  # Up
            self.dot_position[1] = max(self.dot_position[1] - 1, 0)
        elif action == 1:  # Right
            self.dot_position[0] = min(self.dot_position[0] + 1, 64 - 2)
        elif action == 2:  # Down
            self.dot_position[1] = min(self.dot_position[1] + 1, 64 - 2)
        elif action == 3:  # Left
            self.dot_position[0] = max(self.dot_position[0] - 1, 0)

        # Compute the reward based on the new position of the dot
        reward = 0
        if all(self.dot_position == self.colored_dots[0][0]):
            reward = 1
            self.colored_dots.pop(0)
            
        self.current_step += 1
        done = len(self.colored_dots) == 0 or self.current_step >= 1000
        info = {}

        # Update the observation
        observation = self.get_obs()
        
        return observation, reward, done, info
    
    def render_dot(self, obs, position, color):
        obs[position[1]:position[1]+2, position[0]:position[0]+2] = color
    
    def get_obs(self):
        observation = np.zeros((64, 64, 3), dtype=np.uint8)

        # render the dots in reverse order to ensure that the target is always visible
        for p, c in reversed(self.colored_dots):
            self.render_dot(observation, p, c)

        self.render_dot(observation, self.dot_position, np.array([255, 255, 255]))
        return observation
    
    def rand_point(self):
        return np.array([np.random.randint(0, 64 - 2), np.random.randint(0, 64 - 2)])
        
    def reset(self):
        self.dot_position = self.rand_point()
        self.current_step = 0
        self.colored_dots = []
        for c in self.colors:
            self.colored_dots.append((self.rand_point(), c))
        observation = self.get_obs()
        return observation

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.SimpleImageViewer()
        self.viewer.imshow(self.get_obs())

In [None]:
env = RGBDotEnv()
observation = env.reset()
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()  # Replace with your policy
    observation, reward, done, info = env.step(action)
    total_reward += reward
#     env.render()
print(f'Total reward: {total_reward}')

In [None]:
import matplotlib
matplotlib.rcParams['figure.figsize'] = [10, 8]
import matplotlib.pyplot as plt


In [None]:
plt.imshow(observation)

In [None]:
env = RGBDotEnv()

model = DQN("MlpPolicy", env, verbose=1, gamma=0.0)
model.learn(total_timesteps=10000000, log_interval=10)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.3      |
|    exploration_rate | 0.991    |
| time/               |          |
|    episodes         | 10       |
|    fps              | 8451     |
|    time_elapsed     | 1        |
|    total_timesteps  | 10000    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.2      |
|    exploration_rate | 0.981    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 8706     |
|    time_elapsed     | 2        |
|    total_timesteps  | 20000    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.133    |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes       

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.04     |
|    exploration_rate | 0.829    |
| time/               |          |
|    episodes         | 180      |
|    fps              | 1440     |
|    time_elapsed     | 124      |
|    total_timesteps  | 180000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.02e-07 |
|    n_updates        | 32499    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.05     |
|    exploration_rate | 0.819    |
| time/               |          |
|    episodes         | 190      |
|    fps              | 1412     |
|    time_elapsed     | 134      |
|    total_timesteps  | 190000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.65e-08 |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.06     |
|    exploration_rate | 0.677    |
| time/               |          |
|    episodes         | 340      |
|    fps              | 1207     |
|    time_elapsed     | 281      |
|    total_timesteps  | 340000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.94e-07 |
|    n_updates        | 72499    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.06     |
|    exploration_rate | 0.667    |
| time/               |          |
|    episodes         | 350      |
|    fps              | 1199     |
|    time_elapsed     | 291      |
|    total_timesteps  | 350000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.39e-07 |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.03     |
|    exploration_rate | 0.525    |
| time/               |          |
|    episodes         | 500      |
|    fps              | 1116     |
|    time_elapsed     | 447      |
|    total_timesteps  | 500000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 9.06e-08 |
|    n_updates        | 112499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.04     |
|    exploration_rate | 0.515    |
| time/               |          |
|    episodes         | 510      |
|    fps              | 1112     |
|    time_elapsed     | 458      |
|    total_timesteps  | 510000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.64e-08 |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.09     |
|    exploration_rate | 0.373    |
| time/               |          |
|    episodes         | 660      |
|    fps              | 1062     |
|    time_elapsed     | 620      |
|    total_timesteps  | 660000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 5.01e-07 |
|    n_updates        | 152499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.08     |
|    exploration_rate | 0.364    |
| time/               |          |
|    episodes         | 670      |
|    fps              | 1060     |
|    time_elapsed     | 631      |
|    total_timesteps  | 670000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 4.51e-08 |
|    n_updates      

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.07     |
|    exploration_rate | 0.221    |
| time/               |          |
|    episodes         | 820      |
|    fps              | 1026     |
|    time_elapsed     | 799      |
|    total_timesteps  | 820000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 5.24e-09 |
|    n_updates        | 192499   |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 0.05     |
|    exploration_rate | 0.212    |
| time/               |          |
|    episodes         | 830      |
|    fps              | 1024     |
|    time_elapsed     | 810      |
|    total_timesteps  | 830000   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.8e-08  |
|    n_updates      

In [30]:
class CollectDotsEnv(gym.Env):
    """
    An environment with sparse rewards. 
    Sixteen green dots are scattered randomly around the screen,
    and the agent gets +1 reward for each one they touch.
    """
    metadata = {'render.modes': ['human']}
    colors = [
        np.array([255, 0, 0]),
        np.array([0, 255, 0]),
        np.array([0, 0, 255]),
    ]

    def __init__(self):
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
        self.action_space = gym.spaces.Discrete(4)
        self.dot_position = np.array([0, 0], dtype='int32')
        self.dot_color = np.array([255, 255, 255])
        self.viewer = None
        self.reward_range = (-float('inf'), float('inf'))
        self.current_step = 0
        self.colored_dots = []

    def step(self, action):
        # Move the dot according to the action
        if action == 0:  # Up
            self.dot_position[1] = max(self.dot_position[1] - 1, 0)
        elif action == 1:  # Right
            self.dot_position[0] = min(self.dot_position[0] + 1, 64 - 2)
        elif action == 2:  # Down
            self.dot_position[1] = min(self.dot_position[1] + 1, 64 - 2)
        elif action == 3:  # Left
            self.dot_position[0] = max(self.dot_position[0] - 1, 0)

        # Compute the reward based on the new position of the dot
        reward = 0
        for i, d in enumerate(self.colored_dots):
            if sum(np.abs(self.dot_position - d[0])) <= 1:
                # we hit the dot!
                reward += 1
                self.colored_dots.pop(i)
                break
            
        self.current_step += 1
        done = len(self.colored_dots) == 0 or self.current_step >= 1000
        info = {}

        # Update the observation
        observation = self.get_obs()
        
        return observation, reward, done, info
    
    def render_dot(self, obs, position, color):
        obs[position[1]:position[1]+2, position[0]:position[0]+2] = color
    
    def get_obs(self):
        observation = np.zeros((64, 64, 3), dtype=np.uint8)

        # render the dots in reverse order to ensure that the target is always visible
        for p, c in reversed(self.colored_dots):
            self.render_dot(observation, p, c)

        self.render_dot(observation, self.dot_position, np.array([255, 255, 255]))
        return observation
    
    def rand_point(self):
        return np.array([np.random.randint(0, 64 - 2), np.random.randint(0, 64 - 2)])
        
    def reset(self):
        self.dot_position = self.rand_point()
        self.current_step = 0
        self.colored_dots = []
        for i in range(64):
            self.colored_dots.append((self.rand_point(), np.array([0, 255, 0])))
        observation = self.get_obs()
        return observation

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.SimpleImageViewer()
        self.viewer.imshow(self.get_obs())

In [32]:
env = CollectDotsEnv()
observation = env.reset()
done = False
total_reward = 0
while not done:
    action = env.action_space.sample()  # Replace with your policy
    observation, reward, done, info = env.step(action)
    total_reward += reward
#     env.render()
print(f'Total reward: {total_reward}')

Total reward: 10


In [33]:
env = CollectDotsEnv()

model = DQN("MlpPolicy", env, verbose=1, gamma=0.0)
model.learn(total_timesteps=100000, log_interval=10)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 8.3      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 10       |
|    fps              | 2905     |
|    time_elapsed     | 3        |
|    total_timesteps  | 10000    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1e+03    |
|    ep_rew_mean      | 7.7      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 20       |
|    fps              | 2896     |
|    time_elapsed     | 6        |
|    total_timesteps  | 20000    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 

<stable_baselines3.dqn.dqn.DQN at 0x7fa924500040>

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
env = make_vec_env(CollectDotsEnv, n_envs=4)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)

Using cuda device
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 9.12     |
| time/              |          |
|    fps             | 2054     |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 7.19        |
| time/                   |             |
|    fps                  | 1022        |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.016474059 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explaine

In [35]:

from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
env = make_vec_env(CollectDotsEnv, n_envs=4)

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)

Using cuda device
Wrapping the env in a VecTransposeImage.
------------------------------------
| time/                 |          |
|    fps                | 1212     |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.902   |
|    explained_variance | 0.931    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0011  |
|    value_loss         | 1.97e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 4.5      |
| time/                 |          |
|    fps                | 1216     |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 4000     |
| train/                |          |
|    entropy_loss       | -1.23    |
|    explained_v

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 4.57     |
| time/                 |          |
|    fps                | 1339     |
|    iterations         | 1500     |
|    time_elapsed       | 22       |
|    total_timesteps    | 30000    |
| train/                |          |
|    entropy_loss       | -1.34    |
|    explained_variance | 0.997    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1499     |
|    policy_loss        | -0.0133  |
|    value_loss         | 0.000244 |
------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 1e+03     |
|    ep_rew_mean        | 5         |
| time/                 |           |
|    fps                | 1345      |
|    iterations         | 1600      |
|    time_elapsed       | 23        |
|    total_timesteps    | 32000     |
| train/                |    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 5.18     |
| time/                 |          |
|    fps                | 1361     |
|    iterations         | 2800     |
|    time_elapsed       | 41       |
|    total_timesteps    | 56000    |
| train/                |          |
|    entropy_loss       | -1.2     |
|    explained_variance | 0.995    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2799     |
|    policy_loss        | 0.00575  |
|    value_loss         | 7.15e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 5.18     |
| time/                 |          |
|    fps                | 1363     |
|    iterations         | 2900     |
|    time_elapsed       | 42       |
|    total_timesteps    | 58000    |
| train/                |          |
|

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 5.3      |
| time/                 |          |
|    fps                | 1357     |
|    iterations         | 4100     |
|    time_elapsed       | 60       |
|    total_timesteps    | 82000    |
| train/                |          |
|    entropy_loss       | -0.836   |
|    explained_variance | 0.981    |
|    learning_rate      | 0.0007   |
|    n_updates          | 4099     |
|    policy_loss        | -0.00347 |
|    value_loss         | 2.15e-05 |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 1e+03    |
|    ep_rew_mean        | 5.36     |
| time/                 |          |
|    fps                | 1357     |
|    iterations         | 4200     |
|    time_elapsed       | 61       |
|    total_timesteps    | 84000    |
| train/                |          |
|

<stable_baselines3.a2c.a2c.A2C at 0x7fa8480f88b0>

In [39]:
import gym
import numpy as np

action_to_vector = {
    0: np.array([0, -1]),  # Up
    1: np.array([1, 0]),   # Right
    2: np.array([0, 1]),   # Down
    3: np.array([-1, 0])   # Left
}

class SparseDotEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        self.width = 64
        self.height = 64
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.height, self.width, 3), dtype=np.uint8)
        self.action_space = gym.spaces.Discrete(4)
        self.dot_position = np.array([0, 0], dtype='int32')
        self.dot_color = np.array([255, 255, 255])
        self.goal_position = np.array([self.width/2, self.height/2], dtype='int32')
        self.viewer = None
        self.reward_range = (-float('inf'), float('inf'))
        self.max_episode_steps = 200
        self.current_step = 0

    def step(self, action):
        # Move the dot according to the action
        if action == 0:  # Up
            self.dot_position[1] = max(self.dot_position[1] - 1, 0)
        elif action == 1:  # Right
            self.dot_position[0] = min(self.dot_position[0] + 1, self.width - 2)
        elif action == 2:  # Down
            self.dot_position[1] = min(self.dot_position[1] + 1, self.height - 2)
        elif action == 3:  # Left
            self.dot_position[0] = max(self.dot_position[0] - 1, 0)

        # Update the observation
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color

        self.current_step += 1
        finished = all(self.dot_position == self.goal_position)
        done = self.current_step >= self.max_episode_steps or finished
        info = {}
        
        reward = 1 if finished else 0

        return observation, reward, done, info
    
    def get_obs(self):
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color
        return observation

    def reset(self):
        self.dot_position = np.array([np.random.randint(0, self.width - 2), np.random.randint(0, self.height - 2)])
        observation = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        observation[self.dot_position[1]:self.dot_position[1]+2, self.dot_position[0]:self.dot_position[0]+2] = self.dot_color
        self.current_step = 0
        return observation

    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.SimpleImageViewer()
        self.viewer.imshow(self.reset())


In [40]:
env = SparseDotEnv()

model = DQN("MlpPolicy", env, verbose=1, gamma=0.0)
model.learn(total_timesteps=100000, log_interval=10)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.81     |
| time/               |          |
|    episodes         | 10       |
|    fps              | 9521     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 191      |
|    ep_rew_mean      | 0.05     |
|    exploration_rate | 0.637    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 9538     |
|    time_elapsed     | 0        |
|    total_timesteps  | 3819     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 194      |
|    ep_rew_mean      | 0.0333   |
|    exploration_rate | 0.447    |
| time/               |          |
|    episodes       

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 196      |
|    ep_rew_mean      | 0.05     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 230      |
|    fps              | 9479     |
|    time_elapsed     | 4        |
|    total_timesteps  | 44747    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 196      |
|    ep_rew_mean      | 0.05     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 240      |
|    fps              | 9483     |
|    time_elapsed     | 4        |
|    total_timesteps  | 46747    |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 196      |
|    ep_rew_mean      | 0.04     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes       

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 400      |
|    fps              | 2021     |
|    time_elapsed     | 38       |
|    total_timesteps  | 78548    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 1.48e-06 |
|    n_updates        | 7136     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 200      |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 410      |
|    fps              | 1951     |
|    time_elapsed     | 41       |
|    total_timesteps  | 80548    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 4.22e-07 |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x7fa8480f8d90>