In [21]:
import gymnasium as gym

from torch.utils.tensorboard import SummaryWriter

import tianshou as ts
from tianshou.algorithm.modelfree.ddpg import (
    ContinuousDeterministicPolicy,
    ContinuousPolicyWithExplorationNoise,
)
from tianshou.algorithm.modelfree.td3 import TD3
from tianshou.algorithm.optim import AdamOptimizerFactory
from tianshou.data import CollectStats, Collector
from tianshou.trainer import OffPolicyTrainerParams
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ContinuousActorDeterministic, ContinuousCritic
from tianshou.utils.space_info import SpaceInfo
from tianshou.env import DummyVectorEnv
from tianshou.exploration.random import GaussianNoise

import numpy as np

In [22]:
# Add continuous environments
# import bbrl_utils
# import bbrl_gymnasium  # noqa: F401

# bbrl_utils.setup()

# assert "CartPoleContinuous-v1" in gym.envs.registry.keys()

In [23]:
print(gym.envs.registry.keys())
# env = gym.make("MountainCarContinuous-v0")


dict_keys(['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'phys2d/CartPole-v0', 'phys2d/CartPole-v1', 'phys2d/Pendulum-v0', 'LunarLander-v3', 'LunarLanderContinuous-v3', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v3', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v1', 'CliffWalkingSlippery-v1', 'Taxi-v3', 'tabular/Blackjack-v0', 'tabular/CliffWalking-v0', 'Reacher-v2', 'Reacher-v4', 'Reacher-v5', 'Pusher-v2', 'Pusher-v4', 'Pusher-v5', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedPendulum-v5', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'InvertedDoublePendulum-v5', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'HalfCheetah-v5', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Hopper-v5', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Swimmer-v5', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Walker2d-v5', 'Ant-v2', 'Ant-v3', 'Ant-v4', 'Ant-v5', 'Humanoid-v2', 'Humanoid-v3', 

In [24]:
# task = "CartPoleContinuous-v1"
task = "Pendulum-v1"
lr_actor, epoch, batch_size = 1e-3, 10, 64
lr_critic = 1e-3
num_training_envs, num_testing_envs = 5, 50
gamma = 0.99
tau = 0.005
buffer_size = 100_000
epoch_num_steps, collection_step_num_env_steps = 5000, 5

In [11]:
# class ActionToScalar(gym.ActionWrapper):
#     def action(self, act):
#         # act is usually shape (1,) -> convert to a python float
#         return float(np.asarray(act).reshape(-1)[0])


# def make_env(task):
#     env = gym.make(task, disable_env_checker=True)
#     env = ActionToScalar(env)
#     return env

In [25]:
# Environment
env = gym.make(task)
space_info = SpaceInfo.from_env(env)
state_shape = space_info.observation_info.obs_shape
action_shape = space_info.action_info.action_shape

logger = ts.utils.TensorboardLogger(SummaryWriter("log/td3"))

training_envs = DummyVectorEnv(
    [lambda: gym.make(task) for _ in range(num_training_envs)]
)

test_envs = DummyVectorEnv([lambda: gym.make(task) for _ in range(num_testing_envs)])

In [26]:
# Actor
net_actor = Net(
    state_shape=state_shape, action_shape=action_shape, hidden_sizes=[400, 300]
)
actor = ContinuousActorDeterministic(
    preprocess_net=net_actor, action_shape=action_shape
)
actor_optim = AdamOptimizerFactory(lr=lr_actor)

# Critics
net_critic_1 = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=[400, 300],
    concat=True,
)
critic_1 = ContinuousCritic(preprocess_net=net_critic_1)
critic_1_optim = AdamOptimizerFactory(lr=lr_critic)

net_critic_2 = Net(
    state_shape=state_shape,
    action_shape=action_shape,
    hidden_sizes=[400, 300],
    concat=True,
)
critic_2 = ContinuousCritic(preprocess_net=net_critic_2)
critic_2_optim = AdamOptimizerFactory(lr=lr_critic)

# Policy
policy = ContinuousDeterministicPolicy(
    actor=actor, action_space=env.action_space, exploration_noise=GaussianNoise(0, 0.5)
)

# Algorithm
algorithm = TD3(
    policy=policy,
    policy_optim=actor_optim,
    critic=critic_1,
    critic_optim=critic_1_optim,
    critic2=critic_2,
    critic2_optim=critic_2_optim,
    gamma=gamma,
)

# Collectors
training_collector = Collector[CollectStats](
    algorithm,
    training_envs,
    ts.data.VectorReplayBuffer(buffer_size, num_training_envs),
    exploration_noise=True,
)

test_collector = Collector[CollectStats](algorithm, test_envs, exploration_noise=False)


def stop_fn(mean_rewards: float) -> bool:
    if env.spec:
        if not env.spec.reward_threshold:
            return False
        else:
            return mean_rewards >= env.spec.reward_threshold
    return False


# Warm up
training_collector.reset()
training_collector.collect(n_step=10_000, random=True)

off_policy_trainer_params = OffPolicyTrainerParams(
    training_collector=training_collector,
    test_collector=test_collector,
    max_epochs=epoch,
    epoch_num_steps=epoch_num_steps,
    collection_step_num_env_steps=collection_step_num_env_steps,
    test_step_num_episodes=num_testing_envs,
    batch_size=batch_size,
    update_step_num_gradient_steps_per_sample=1.0,
    stop_fn=stop_fn,
    logger=logger,
    test_in_training=True,
)

result = algorithm.run_training(off_policy_trainer_params)

Initial test step: test_reward: -1273.497396 ± 295.436084, best_reward: -1273.497396 ± 295.436084 in #0


Epoch #1: 100%|##########| 5000/5000 [00:14<00:00, 335.16it/s, env_episode=25, env_step=5000, len=200, n_ep=5, n_st=5, rew=-610.13, update_step=1000]


Epoch #1: test_reward: -243.227987 ± 250.064336, best_reward: -243.227987 ± 250.064336 in #1


Epoch #2: 100%|##########| 5000/5000 [00:14<00:00, 354.82it/s, env_episode=50, env_step=10000, len=200, n_ep=5, n_st=5, rew=-192.48, update_step=2000]


Epoch #2: test_reward: -148.954922 ± 83.290718, best_reward: -148.954922 ± 83.290718 in #2


Epoch #3: 100%|##########| 5000/5000 [00:14<00:00, 353.23it/s, env_episode=75, env_step=15000, len=200, n_ep=5, n_st=5, rew=-196.54, update_step=3000]


Epoch #3: test_reward: -169.860997 ± 94.279373, best_reward: -148.954922 ± 83.290718 in #2


Epoch #4: 100%|##########| 5000/5000 [00:14<00:00, 347.37it/s, env_episode=100, env_step=20000, len=200, n_ep=5, n_st=5, rew=-173.96, update_step=4000]


Epoch #4: test_reward: -145.042790 ± 77.540787, best_reward: -145.042790 ± 77.540787 in #4


Epoch #5: 100%|##########| 5000/5000 [00:14<00:00, 345.61it/s, env_episode=125, env_step=25000, len=200, n_ep=5, n_st=5, rew=-114.50, update_step=5000]


Epoch #5: test_reward: -127.696114 ± 80.982816, best_reward: -127.696114 ± 80.982816 in #5


Epoch #6: 100%|##########| 5000/5000 [00:14<00:00, 344.09it/s, env_episode=150, env_step=30000, len=200, n_ep=5, n_st=5, rew=-100.79, update_step=6000]


Epoch #6: test_reward: -150.004917 ± 70.868086, best_reward: -127.696114 ± 80.982816 in #5


Epoch #7: 100%|##########| 5000/5000 [00:14<00:00, 334.89it/s, env_episode=175, env_step=35000, len=200, n_ep=5, n_st=5, rew=-191.50, update_step=7000]


Epoch #7: test_reward: -152.278776 ± 63.310081, best_reward: -127.696114 ± 80.982816 in #5


Epoch #8: 100%|##########| 5000/5000 [00:15<00:00, 316.57it/s, env_episode=200, env_step=40000, len=200, n_ep=5, n_st=5, rew=-150.70, update_step=8000]


Epoch #8: test_reward: -167.232848 ± 78.215232, best_reward: -127.696114 ± 80.982816 in #5


Epoch #9: 100%|##########| 5000/5000 [00:15<00:00, 320.37it/s, env_episode=225, env_step=45000, len=200, n_ep=5, n_st=5, rew=-149.97, update_step=9000]


Epoch #9: test_reward: -139.631572 ± 73.820023, best_reward: -127.696114 ± 80.982816 in #5


Epoch #10: 100%|##########| 5000/5000 [00:15<00:00, 315.88it/s, env_episode=250, env_step=50000, len=200, n_ep=5, n_st=5, rew=-365.04, update_step=10000]


Epoch #10: test_reward: -146.020494 ± 89.321157, best_reward: -127.696114 ± 80.982816 in #5
