In [1]:
import gymnasium as gym
from gymnasium import Wrapper

class CustomTerminationWrapper(Wrapper):
    """Force every episode to last a fixed number of steps.

    The wrapped environment's own ``terminated`` flag is always suppressed, so
    reaching a terminal state of the underlying task never ends the episode.
    After ``max_steps`` calls to :meth:`step` since the last :meth:`reset`, the
    episode is ended by reporting ``truncated=True``.

    The step cap is a time limit rather than a terminal state of the task. The
    Gymnasium step API reserves ``terminated`` for terminal states and
    ``truncated`` for time limits, so the cap is reported through
    ``truncated``. Training code that distinguishes the two (for example to
    decide whether to bootstrap from the final observation) then treats the
    cut-off correctly.

    Args:
        env: The environment to wrap.
        max_steps: Number of steps after which the episode is truncated.
    """

    def __init__(self, env, max_steps):
        super().__init__(env)
        self.max_steps = max_steps
        # Steps taken since the most recent reset().
        self.current_step = 0

    def reset(self, **kwargs):
        """Reset the step counter and the wrapped environment.

        All keyword arguments are forwarded unchanged to the wrapped
        environment's ``reset``; its return value is returned as-is.
        """
        self.current_step = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped environment and apply the fixed-length episode rule.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` is always ``False`` and ``truncated`` is ``True``
            once ``max_steps`` steps have been taken, or when the wrapped
            environment itself reported truncation.
        """
        observation, reward, _env_terminated, truncated, info = self.env.step(action)

        self.current_step += 1

        # Keep any truncation raised by inner wrappers, and add our own cap.
        step_cap_reached = self.current_step >= self.max_steps
        truncated = bool(truncated) or step_cap_reached

        # The environment's own termination is deliberately ignored.
        terminated = False

        return observation, reward, terminated, truncated, info
      
# Default episode step cap intended for CustomTerminationWrapper.
# NOTE(review): no environment is created in this cell, and this value is
# reassigned in the training cell below before the wrapper is built.
max_steps = 500  # Set this to your desired maximum number of steps

In [None]:
from stable_baselines3 import PPO
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
import numpy as np

# Training configuration
max_steps = 350  # Set this to your desired maximum number of steps
SEED = 0  # fixed seed so repeated runs start from the same RNG state
TOTAL_TIMESTEPS = 300000  # environment steps to train for

# Create the environment: MountainCarContinuous with fixed-length episodes
env = CustomTerminationWrapper(
    gym.make('MountainCarContinuous-v0', render_mode='rgb_array'),
    max_steps,
)

# Exploration noise for DDPG: one Ornstein-Uhlenbeck process per action dimension
n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=0.5 * np.ones(n_actions),
)

# Passing `seed` lets SB3 seed its random generators and the environment so the
# run is repeatable (GPU execution may still introduce some nondeterminism).
model = DDPG(
    "MlpPolicy",
    env,
    action_noise=action_noise,
    buffer_size=5000,
    seed=SEED,
    verbose=1,
)

# Train the agent
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Save the trained model
# NOTE(review): the file name says "cartpole" but the environment is
# MountainCarContinuous; left unchanged in case other cells load this path.
model.save("ddpg_cartpole")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 350      |
|    ep_rew_mean     | -7.45    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 188      |
|    time_elapsed    | 7        |
|    total_timesteps | 1400     |
| train/             |          |
|    actor_loss      | 0.00564  |
|    critic_loss     | 1.14e-07 |
|    learning_rate   | 0.001    |
|    n_updates       | 1299     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 350      |
|    ep_rew_mean     | -7.35    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 180      |
|    time_elapsed    | 15       |
|    total_timesteps | 2800     |
| train/             |          |
|    actor_loss      | 0.00187  |
|    critic_loss     

In [None]:
# Play back the trained policy in the model's own (vectorized) environment.
N_PLAYBACK_STEPS = 1000  # how many environment steps to render

vec_env = model.get_env()
obs = vec_env.reset()
for _step in range(N_PLAYBACK_STEPS):
    # Ask the policy for its deterministic action on the current observation
    action, _states = model.predict(obs, deterministic=True)
    # `dones` is not checked: the loop simply runs for a fixed number of steps
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")