In [27]:
import torch
from omegaconf import OmegaConf
from functools import partial

import bbrl_utils
from bbrl_utils.notebook import setup_tensorboard
from bbrl.stats import WelchTTest
from bbrl.agents import Agent, Agents, TemporalAgent
from bbrl.agents.gymnasium import ParallelGymAgent, make_env
from bbrl.workspace import Workspace
from bbrl.utils.replay_buffer import ReplayBuffer
from pmind_utils import (
    DQN,
    DDPG,
    TD3,
    dqn_compute_critic_loss,
    ddqn_compute_critic_loss,
    run_dqn,
    run_ddpg,
    run_td3,
)

bbrl_utils.setup()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Test used algorithms

In [None]:
# Directory handed to the TensorBoard helper.
TENSORBOARD_LOG_DIR = "./outputs/tblogs"
setup_tensorboard(TENSORBOARD_LOG_DIR)

### DQN:

In [None]:
# Settings stored under the `algorithm` key of the DQN configuration.
dqn_algorithm = {
    "seed": 4,
    "max_grad_norm": 0.5,
    "epsilon": 0.02,
    "n_envs": 8,
    "n_steps": 32,
    "n_updates": 32,
    "eval_interval": 2000,
    "learning_starts": 5000,
    "nb_evals": 10,
    "buffer_size": 100_000,
    "batch_size": 256,
    "target_critic_update": 1_000,
    "max_epochs": 3_000,
    "discount_factor": 0.99,
    "architecture": {"hidden_size": [256, 256]},
}

params = {
    "base_dir": "${gym_env.env_name}/dqn-S${algorithm.seed}_${current_time:}",
    # With `collect_stats` enabled, the cumulated reward of every
    # evaluation episode is kept.
    "collect_stats": True,
    "save_best": False,
    "algorithm": dqn_algorithm,
    "gym_env": {"env_name": "CartPole-v1"},
    "optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
}

# Build the learner from the configuration, train it with the plain DQN
# critic loss, then visualise the best agent.
dqn_cfg = OmegaConf.create(params)
dqn = DQN(dqn_cfg)
run_dqn(dqn, dqn_compute_critic_loss)
dqn.visualize_best()

### DDQN:

In [None]:
# Settings stored under the `algorithm` key of the double-DQN configuration.
# NOTE(review): the seed (3) and hidden sizes ([128, 128]) differ from the DQN
# run (seed 4, [256, 256]) that this run is compared against in the Welch
# t-test cell -- confirm the difference is intended.
ddqn_algorithm = {
    "seed": 3,
    "max_grad_norm": 0.5,
    "epsilon": 0.02,
    "n_envs": 8,
    "n_steps": 32,
    "n_updates": 32,
    "eval_interval": 2_000,
    "learning_starts": 5_000,
    "nb_evals": 10,
    "buffer_size": 100_000,
    "batch_size": 256,
    "target_critic_update": 1000,
    "max_epochs": 3_000,
    "discount_factor": 0.99,
    "architecture": {"hidden_size": [128, 128]},
}

params = {
    "base_dir": "${gym_env.env_name}/double-dqn-S${algorithm.seed}_${current_time:}",
    "collect_stats": True,
    "save_best": False,
    "algorithm": ddqn_algorithm,
    "gym_env": {"env_name": "CartPole-v1"},
    "optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
}

# Same learner class as the DQN run; only the critic loss passed to the
# training loop changes.
ddqn_cfg = OmegaConf.create(params)
ddqn = DQN(ddqn_cfg)
run_dqn(ddqn, ddqn_compute_critic_loss)
ddqn.visualize_best()

In [None]:
# Welch t-test on the evaluation rewards collected by the DQN and DDQN runs.
# The legends are given explicitly, as in the DDPG/TD3 comparison, so the
# curves are labelled with the algorithm names.
WelchTTest().plot(
    torch.stack(dqn.eval_rewards),
    torch.stack(ddqn.eval_rewards),
    legends="dqn/ddqn",
    save=False,
)

### DDPG:

In [None]:
# Settings stored under the `algorithm` key of the DDPG configuration.
ddpg_algorithm = {
    "seed": 1,
    "max_grad_norm": 0.5,
    "n_envs": 1,
    "n_steps": 100,
    "nb_evals": 10,
    "discount_factor": 0.98,
    "buffer_size": 2e5,
    "batch_size": 64,
    "tau_target": 0.05,
    "eval_interval": 2_000,
    "max_epochs": 11_000,
    # Minimum number of transitions before learning starts
    "learning_starts": 10000,
    "action_noise": 0.1,
    "architecture": {
        "actor_hidden_size": [400, 300],
        "critic_hidden_size": [400, 300],
    },
}

params = {
    "save_best": False,
    "base_dir": "${gym_env.env_name}/ddpg-S${algorithm.seed}_${current_time:}",
    "collect_stats": True,
    # Set to true to have an insight on the learned policy
    # (but slows down the evaluation a lot!)
    "plot_agents": True,
    "algorithm": ddpg_algorithm,
    "gym_env": {"env_name": "CartPoleContinuous-v1"},
    # The actor and the critic each get their own optimizer entry.
    "actor_optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
    "critic_optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
}

# Build the learner from the configuration, train it, then visualise the
# best agent.
ddpg_cfg = OmegaConf.create(params)
ddpg = DDPG(ddpg_cfg)
run_ddpg(ddpg)
ddpg.visualize_best()

### TD3:

In [None]:
# Hyper-parameters for the TD3 run.

# Settings stored under the `algorithm` key of the TD3 configuration.
td3_algorithm = {
    "seed": 1,
    "max_grad_norm": 0.5,
    "n_envs": 1,
    "n_steps": 100,
    "nb_evals": 10,
    "discount_factor": 0.98,
    "buffer_size": 2e5,
    "batch_size": 64,
    "tau_target": 0.05,
    "eval_interval": 2_000,
    "max_epochs": 11_000,
    # Minimum number of transitions before learning starts
    "learning_starts": 10000,
    "action_noise": 0.1,
    # TD3 SPECIFIC
    "policy_delay": 2,
    "target_policy_noise": 0.2,
    "target_policy_noise_clip": 0.5,
    "architecture": {
        "actor_hidden_size": [400, 300],
        "critic_hidden_size": [400, 300],
    },
}

params = {
    "save_best": False,
    "base_dir": "${gym_env.env_name}/td3-S${algorithm.seed}_${current_time:}",
    "collect_stats": True,
    # Set to true to have an insight on the learned policy
    # (but slows down the evaluation a lot!)
    "plot_agents": True,
    "algorithm": td3_algorithm,
    "gym_env": {"env_name": "CartPoleContinuous-v1"},
    # The actor and the critic each get their own optimizer entry.
    "actor_optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
    "critic_optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
}

# Build the learner from the configuration, train it, then visualise the
# best agent.
td3_cfg = OmegaConf.create(params)
td3 = TD3(td3_cfg)
run_td3(td3)
td3.visualize_best()

In [None]:
# Welch t-test on the evaluation rewards collected by the DDPG and TD3 runs.
ddpg_eval_rewards = torch.stack(ddpg.eval_rewards)
td3_eval_rewards = torch.stack(td3.eval_rewards)
WelchTTest().plot(ddpg_eval_rewards, td3_eval_rewards, legends="ddpg/td3", save=False)

# SANDBOX

In [None]:
# Shortened TD3 run used as a sandbox.
# NOTE(review): this cell rebinds `td3`; re-running the DDPG/TD3 comparison
# cell afterwards would use this shorter run instead of the full one.

# Settings stored under the `algorithm` key of the sandbox configuration.
sandbox_algorithm = {
    "seed": 1,
    "max_grad_norm": 0.5,
    "n_envs": 1,
    "n_steps": 100,
    "nb_evals": 10,
    "discount_factor": 0.98,
    "buffer_size": 2e5,
    "batch_size": 64,
    "tau_target": 0.05,
    "eval_interval": 2_000,
    # NOTE: fewer epochs than the full TD3 run (11_000) to keep this test quick
    "max_epochs": 1000,
    # Minimum number of transitions before learning starts
    "learning_starts": 10000,
    "action_noise": 0.1,
    # TD3 SPECIFIC
    "policy_delay": 2,
    "target_policy_noise": 0.2,
    "target_policy_noise_clip": 0.5,
    "architecture": {
        "actor_hidden_size": [400, 300],
        "critic_hidden_size": [400, 300],
    },
}

params = {
    "save_best": False,
    "base_dir": "${gym_env.env_name}/td3-S${algorithm.seed}_${current_time:}",
    "collect_stats": True,
    # Set to true to have an insight on the learned policy
    # (but slows down the evaluation a lot!)
    "plot_agents": True,
    "algorithm": sandbox_algorithm,
    "gym_env": {"env_name": "CartPoleContinuous-v1"},
    # The actor and the critic each get their own optimizer entry.
    "actor_optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
    "critic_optimizer": {"classname": "torch.optim.Adam", "lr": 1e-3},
}

sandbox_cfg = OmegaConf.create(params)
td3 = TD3(sandbox_cfg)
run_td3(td3)
td3.visualize_best()

Matplotlib backend: inline


  0%|          | 0/1000 [00:00<?, ?it/s]

Video of best agent recorded in folder outputs/CartPoleContinuous-v1/td3-S1_20260207-150620/best_agent


objc[25108]: Class SDLApplication is implemented in both /Users/vlad/Documents/University/Master-MIND/projet-mind/.venv/lib/python3.10/site-packages/cv2/.dylibs/libSDL2-2.0.0.dylib (0x123358890) and /Users/vlad/Documents/University/Master-MIND/projet-mind/.venv/lib/python3.10/site-packages/pygame/.dylibs/libSDL2-2.0.0.dylib (0x12843d2c8). This may cause spurious casting failures and mysterious crashes. One of the duplicates must be removed or renamed.
objc[25108]: Class SDLAppDelegate is implemented in both /Users/vlad/Documents/University/Master-MIND/projet-mind/.venv/lib/python3.10/site-packages/cv2/.dylibs/libSDL2-2.0.0.dylib (0x1233588e0) and /Users/vlad/Documents/University/Master-MIND/projet-mind/.venv/lib/python3.10/site-packages/pygame/.dylibs/libSDL2-2.0.0.dylib (0x12843d318). This may cause spurious casting failures and mysterious crashes. One of the duplicates must be removed or renamed.
objc[25108]: Class SDLTranslatorResponder is implemented in both /Users/vlad/Documents/U

moviepy is not installed, skipping video display


In [26]:
# Keep a handle on the best policy found by the TD3 run; the rollout cell
# below plugs it into an `Agents` pipeline.
best_policy_agent = td3.best_policy

# Hand-written 4-dimensional observation kept around for manual probing.
obs_test = torch.tensor([0.0, 0.0, 0.0, 1.0])

In [36]:
# Query the best policy's network on the all-zero observation.
zero_obs = torch.zeros(4)
td3.best_policy.model(zero_obs)

tensor([0.0728], grad_fn=<TanhBackward0>)

In [30]:
# Roll out the best TD3 policy in 3 parallel environments (random seed 2139)
# and inspect the collected observations and transitions.

epoch_size = 10
multienv_agent = ParallelGymAgent(
    partial(make_env, env_name="CartPoleContinuous-v1", autoreset=True),
    num_envs=3,
).seed(2139)
obs_size, action_dim = multienv_agent.get_obs_and_actions_sizes()
print(f"Environment: observation space in R^{obs_size} and action space R^{action_dim}")

# Chain the environments with the policy and run the pair for `epoch_size` steps.
agents = Agents(multienv_agent, best_policy_agent)
t_agents = TemporalAgent(agents)
workspace = Workspace()
t_agents(workspace, n_steps=epoch_size)
transitions = workspace.get_transitions()

display("Observations (first 4)", workspace["env/env_obs"][:4])

# The second axis of the transitions tensor indexes transitions, not time steps:
# in the recorded run the first three columns are the first transition of each
# of the three environments. Label the pairs by transition index accordingly.
display("Transitions (first 3)")
for transition_idx in range(3):
    display(f"transition {transition_idx}: (s_t, s_t+1)")
    display(transitions["env/env_obs"][:, transition_idx])

Environment: observation space in R^4 and action space R^1


'Observations (first 4)'

tensor([[[-0.0085, -0.0427, -0.0489,  0.0215],
         [ 0.0005,  0.0025, -0.0493, -0.0402],
         [ 0.0080,  0.0203, -0.0023, -0.0085]],

        [[-0.0094, -0.0379, -0.0485, -0.0001],
         [ 0.0006,  0.0061, -0.0501, -0.0601],
         [ 0.0084,  0.0362, -0.0025, -0.0330]],

        [[-0.0101, -0.0351, -0.0485, -0.0185],
         [ 0.0007,  0.0075, -0.0513, -0.0768],
         [ 0.0092,  0.0517, -0.0031, -0.0569]],

        [[-0.0108, -0.0343, -0.0489, -0.0340],
         [ 0.0009,  0.0065, -0.0528, -0.0905],
         [ 0.0102,  0.0667, -0.0043, -0.0804]]])

'Transitions (first 3)'

'(s_0, s_1)'

tensor([[-0.0085, -0.0427, -0.0489,  0.0215],
        [-0.0094, -0.0379, -0.0485, -0.0001]])

'(s_1, s_2)'

tensor([[ 0.0005,  0.0025, -0.0493, -0.0402],
        [ 0.0006,  0.0061, -0.0501, -0.0601]])

'(s_2, s_3)'

tensor([[ 0.0080,  0.0203, -0.0023, -0.0085],
        [ 0.0084,  0.0362, -0.0025, -0.0330]])

In [42]:
# Store the collected transitions in a small replay buffer, then show the
# "action" entry of what `get_shuffled(1)` returns.
rb = ReplayBuffer(max_size=80)
rb.put(transitions)
shuffled_sample = rb.get_shuffled(1)
shuffled_sample["action"]

tensor([[[-0.0484]],

        [[-0.0578]]])