In [1]:
import gymnasium as gym
import numpy as np

In [2]:
env = gym.make("CartPole-v1")
print("Observation space:", env.observation_space)
print("Shape:", env.observation_space.shape)
print("Action space:", env.action_space)

Observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Shape: (4,)
Action space: Discrete(2)


In [3]:
obs = env.reset()
action = env.action_space.sample()
print("Sampled action:", action)
obs, reward, terminated, truncated, info = env.step(action)
print(obs.shape, reward, terminated, truncated, info)

Sampled action: 0
(4,) 1.0 False False {}


In [4]:
class GoLeftEnv(gym.Env):
    """
    Custom Environment that follows gym interface.
    This is a simple env where the agent must learn to go always left.
    """

    metadata = {"render.modes": ["console"]}
    LEFT = 0
    RIGHT = 1

    def __init__(self, grid_size=10):
        super(GoLeftEnv, self).__init__()

        self.grid_size = grid_size
        self.agent_pos = grid_size - 1
        n_actions = 2
        self.action_space = gym.spaces.Discrete(n_actions)
        self.observation_space = gym.spaces.Box(
            low=0, high=self.grid_size, shape=(1,), dtype=np.float32
        )

    def reset(self, seed=0):
        self.agent_pos = self.grid_size - 1
        obs = np.array([self.agent_pos]).astype(np.float32)
        info = {}
        return obs, info

    def step(self, action):
        if action == self.LEFT:
            self.agent_pos -= 1
        elif action == self.RIGHT:
            self.agent_pos += 1
        else:
            raise ValueError(
                f"Received invalid action={action} which is not part of the action space"
            )

        self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)
        terminated = bool(self.agent_pos == 0)
        reward = 1 if self.agent_pos == 0 else 0
        info = {}
        truncated = False
        obs = np.array([self.agent_pos]).astype(np.float32)

        return obs, reward, terminated, truncated, info

    def render(self, mode="console"):
        if mode != "console":
            raise NotImplementedError()
        print("." * self.agent_pos, end="")
        print("x", end="")
        print("." * (self.grid_size - self.agent_pos))

    def close(self):
        pass

In [5]:
env = GoLeftEnv(grid_size=10)
obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

.........x.
Box(0.0, 10.0, (1,), float32)
Discrete(2)
0


In [6]:
from stable_baselines3.common.env_checker import check_env

env = GoLeftEnv()
check_env(env, warn=True)

In [7]:
GO_LEFT = 0
n_steps = 20
for step in range(n_steps):
    print("Step {}".format(step + 1))
    obs, reward, terminated, truncated, info = env.step(GO_LEFT)
    print("obs=", obs, "reward=", reward, "terminated=", terminated)
    env.render()
    if terminated:
        print("Goal reached!", "reward=", reward)
        break

Step 1
obs= [6.] reward= 0 terminated= False
......x....
Step 2
obs= [5.] reward= 0 terminated= False
.....x.....
Step 3
obs= [4.] reward= 0 terminated= False
....x......
Step 4
obs= [3.] reward= 0 terminated= False
...x.......
Step 5
obs= [2.] reward= 0 terminated= False
..x........
Step 6
obs= [1.] reward= 0 terminated= False
.x.........
Step 7
obs= [0.] reward= 1 terminated= True
x..........
Goal reached! reward= 1


In [8]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

env = GoLeftEnv(grid_size=10)
vec_env = make_vec_env(lambda: env, n_envs=1)

In [9]:
model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=25000)
model.save("models/ppo_goleft")

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.8     |
|    ep_rew_mean     | 1        |
| time/              |          |
|    fps             | 978      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 56.4        |
|    ep_rew_mean          | 1           |
| time/                   |             |
|    fps                  | 748         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.019652033 |
|    clip_fraction        | 0.303       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.676      |
|    explained_variance   | 0.019       |
|    learnin

In [10]:
model = PPO.load("models/ppo_goleft")

obs = vec_env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, terminated, truncated = vec_env.step(action)
    print("obs=", obs, "reward=", reward, "terminated=", terminated)
    env.render(mode="console")
    if terminated:
        print("Goal Reached!", "reward=", reward)
        break

Step 1
Action:  [0]
obs= [[8.]] reward= [0.] terminated= [False]
........x..
Step 2
Action:  [0]
obs= [[7.]] reward= [0.] terminated= [False]
.......x...
Step 3
Action:  [0]
obs= [[6.]] reward= [0.] terminated= [False]
......x....
Step 4
Action:  [0]
obs= [[5.]] reward= [0.] terminated= [False]
.....x.....
Step 5
Action:  [0]
obs= [[4.]] reward= [0.] terminated= [False]
....x......
Step 6
Action:  [0]
obs= [[3.]] reward= [0.] terminated= [False]
...x.......
Step 7
Action:  [0]
obs= [[2.]] reward= [0.] terminated= [False]
..x........
Step 8
Action:  [0]
obs= [[1.]] reward= [0.] terminated= [False]
.x.........
Step 9
Action:  [0]
obs= [[9.]] reward= [1.] terminated= [ True]
.........x.
Goal Reached! reward= [1.]
