# Example usage of gymnasium with minigrid

In [4]:
import gymnasium as gym
import minigrid

# Create the environment
env = gym.make("MiniGrid-Empty-5x5-v0", render_mode="human")

# Reset the environment to start a new episode
obs, info = env.reset()

# Render the initial state of the environment
env.render()

# Take num_steps random actions in the environment
for _ in range(50):
    # Sample a random action
    action = env.action_space.sample()
    
    # Step through the environment with the chosen action
    obs, reward, terminated, truncated, info = env.step(action)
    
    # Render the environment after each action
    env.render()
    
    # Check if the episode is done
    if terminated or truncated:
        print("Episode finished")
        break

# Close the environment
env.close()


Episode finished


# Example usage of gymnasium with CartPole-v1 and PPO

Model train

In [1]:
import gymnasium as gym
import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import EvalCallback

from stable_baselines3.common.monitor import Monitor

# Create the Gymnasium environment
env_id = "CartPole-v1"
env = gym.make(env_id)

# Optionally, you can create a vectorized environment for parallel training
# This can speed up training by using multiple environments simultaneously
# Here, we create 4 parallel environments
vec_env = make_vec_env(env_id, n_envs=4)

# Initialize the PPO agent
model = PPO(
    "MlpPolicy",          # Multi-layer Perceptron policy
    vec_env,              # Vectorized environment
    verbose=1,            # Verbosity level (0: no output, 1: info)
    tensorboard_log="./ppo_cartpole_tensorboard/"  # Path for TensorBoard logs
)

# Set up an evaluation callback to monitor the agent's performance
eval_env = gym.make(env_id)
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path='./logs/',
    log_path='./logs/',
    eval_freq=500,        # Evaluate the agent every 500 steps
    n_eval_episodes=5,    # Number of episodes for evaluation
    deterministic=True,
    render=False
)

# Train the agent for a total of 100,000 steps, use the eval_callback every 500 steps
model.learn(total_timesteps=100, callback=eval_callback)

# Save the trained model
model.save("ppo_cartpole")

Using cpu device
Logging to ./ppo_cartpole_tensorboard/PPO_4




Eval num_timesteps=2000, episode_reward=112.60 +/- 13.18
Episode length: 112.60 +/- 13.18
---------------------------------
| eval/              |          |
|    mean_ep_length  | 113      |
|    mean_reward     | 113      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
Eval num_timesteps=4000, episode_reward=126.20 +/- 37.93
Episode length: 126.20 +/- 37.93
---------------------------------
| eval/              |          |
|    mean_ep_length  | 126      |
|    mean_reward     | 126      |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------
New best mean reward!
Eval num_timesteps=6000, episode_reward=141.40 +/- 35.57
Episode length: 141.40 +/- 35.57
---------------------------------
| eval/              |          |
|    mean_ep_length  | 141      |
|    mean_reward     | 141      |
| time/              |          |
|    total_timesteps | 6000     |
------

Model eval

In [2]:
# Load the environment
env_id = "CartPole-v1"
env = gym.make(env_id, render_mode="human")

# To demonstrate loading the model, you can reload it as follows:
model = PPO.load("ppo_cartpole", env=vec_env)

# Evaluate the trained agent
episodes = 10
for episode in range(1, episodes + 1):
    obs, info = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)
        total_reward += reward
        # Optionally, render the environment (requires a display)
        # env.render()
    print(f"Episode {episode}: Total Reward = {total_reward}")

# Close the environments
env.close()
eval_env.close()
vec_env.close()


Episode 1: Total Reward = 283.0
Episode 2: Total Reward = 995.0
Episode 3: Total Reward = 397.0
Episode 4: Total Reward = 423.0
Episode 5: Total Reward = 601.0
Episode 6: Total Reward = 124.0
Episode 7: Total Reward = 278.0
Episode 8: Total Reward = 1015.0
Episode 9: Total Reward = 171.0
Episode 10: Total Reward = 132.0
