In [22]:
import torch
import torch.optim

from vel.api import TrainingInfo, EpochInfo
from vel.rl.metrics import EpisodeRewardMetric
from vel.storage.streaming.stdout import StdoutStreaming
from vel.util.random import set_seed
from vel.rl.env.mujoco import MujocoEnv
from vel.rl.models.deterministic_policy_model import DeterministicPolicyModelFactory
from vel.rl.models.backbone.mlp import MLPFactory
from vel.rl.reinforcers.buffered_single_off_policy_iteration_reinforcer import (
    BufferedSingleOffPolicyIterationReinforcer, BufferedSingleOffPolicyIterationReinforcerSettings
)
from vel.rl.algo.policy_gradient.ddpg import DeepDeterministicPolicyGradient
from vel.rl.env_roller.single.deque_replay_roller_ou_noise import DequeReplayRollerOuNoise
from vel.optimizers.adam import AdamFactory


# Run everything on CPU device index 0
device = torch.device('cpu', 0)
seed = 1002

# Seed the random generators of the python std lib, numpy and pytorch
# before any environment or model is constructed
set_seed(seed)

# Build the environment factory first, then create the seeded environment from it
env_factory = MujocoEnv('InvertedPendulum-v2')
env = env_factory.instantiate(seed=seed)

# Size the networks from the environment's spaces instead of hard-coding them,
# so switching to another continuous-control environment does not require
# editing these numbers. For InvertedPendulum-v2 this is expected to give the
# original values (4 and 5).
observation_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]

model_factory = DeterministicPolicyModelFactory(
    # Policy network is fed the observation vector
    policy_backbone=MLPFactory(input_length=observation_size, hidden_layers=[64, 64], activation='tanh'),
    # Value network input is assumed to be observation and action concatenated
    # (original 5 = 4 + 1) - TODO confirm against DeterministicPolicyModel
    value_backbone=MLPFactory(
        input_length=observation_size + action_size, hidden_layers=[64, 64], activation='tanh'
    ),
)

# Instantiate the actual model for this environment's action space
model = model_factory.instantiate(action_space=env.action_space)

# Values that both the reinforcer settings and the replay roller receive.
# Declared once so the two copies cannot drift apart when one is tuned.
batch_size = 64
discount_factor = 0.99

# Reinforcer ties together the model, the DDPG algorithm and the replay-buffer roller
reinforcer = BufferedSingleOffPolicyIterationReinforcer(
    device=device,
    settings=BufferedSingleOffPolicyIterationReinforcerSettings(
        batch_rollout_rounds=100,
        batch_training_rounds=50,
        batch_size=batch_size,
        discount_factor=discount_factor
    ),
    environment=env,
    model=model,
    algo=DeepDeterministicPolicyGradient(
        # The algorithm receives the factory (not the instantiated model)
        model_factory=model_factory,
        tau=0.01,
    ),
    env_roller=DequeReplayRollerOuNoise(
        environment=env,
        device=device,
        batch_size=batch_size,
        buffer_capacity=1_000_000,
        buffer_initial_size=2_000,
        noise_std_dev=0.2,
        normalize_observations=True,
        normalize_returns=True,
        discount_factor=discount_factor
    )
)

# Optimizer helper - somewhat unusual regularization settings copied from OpenAI code.
# lr and weight_decay are three-element lists; presumably one entry per layer
# group of the model since layer_groups=True - confirm against AdamFactory.
adam_factory = AdamFactory(
    layer_groups=True,
    lr=[1.0e-4, 1.0e-3, 1.0e-3],
    weight_decay=[0.0, 0.0, 0.001],
    eps=1.0e-4
)
adam_optimizer = adam_factory.instantiate(model)

# Metrics to track: average reward calculated from episodes
tracked_metrics = [EpisodeRewardMetric('episode_rewards')]
# Callbacks to run: print live metrics every epoch to standard output
run_callbacks = [StdoutStreaming()]

# Overall information store for training information
training_info = TrainingInfo(metrics=tracked_metrics, callbacks=run_callbacks)

# Training initialization bookkeeping: set up the info store, let the
# reinforcer register itself against it, then signal that training begins
training_info.initialize()
reinforcer.initialize_training(training_info)
training_info.on_train_begin()

# Let's make 20 batches per epoch to average metrics nicely.
# Declared once so the epoch count below and EpochInfo cannot disagree.
batches_per_epoch = 20

# NOTE(review): the divisor 64 equals the training batch size, but the number
# of environment steps taken per batch is presumably batch_rollout_rounds (100).
# Confirm which was intended; the value is kept as-is (78 epochs).
num_epochs = int(1.0e5 / 64 / batches_per_epoch)

# Normal handrolled training loop; epoch indices are 1-based
for epoch_idx in range(1, num_epochs + 1):
    # Fresh per-epoch bookkeeping object handed to the reinforcer
    epoch_info = EpochInfo(
        training_info=training_info,
        global_epoch_idx=epoch_idx,
        batches_per_epoch=batches_per_epoch,
        optimizer=adam_optimizer
    )

    reinforcer.train_epoch(epoch_info)

# Signal the end of training to the info store once all epochs have run
training_info.on_train_end()







Training:   0%|          | 0/20 [00:00<?, ?batch/s][A[A[A


Training:   5%|▌         | 1/20 [00:01<00:21,  1.16s/batch][A[A[A


Training:  10%|█         | 2/20 [00:01<00:12,  1.45batch/s][A[A[A


Training:  15%|█▌        | 3/20 [00:01<00:09,  1.85batch/s][A[A[A


Training:  20%|██        | 4/20 [00:01<00:07,  2.19batch/s][A[A[A


Training:  25%|██▌       | 5/20 [00:02<00:06,  2.46batch/s][A[A[A


Training:  30%|███       | 6/20 [00:02<00:05,  2.68batch/s][A[A[A


Training:  35%|███▌      | 7/20 [00:02<00:04,  2.85batch/s][A[A[A


Training:  40%|████      | 8/20 [00:02<00:04,  2.99batch/s][A[A[A


Training:  45%|████▌     | 9/20 [00:02<00:03,  3.12batch/s][A[A[A


Training:  50%|█████     | 10/20 [00:03<00:03,  3.23batch/s][A[A[A


Training:  55%|█████▌    | 11/20 [00:03<00:02,  3.33batch/s][A[A[A


Training:  60%|██████    | 12/20 [00:03<00:02,  3.42batch/s][A[A[A


Training:  65%|██████▌   | 13/20 [00:03<00:02,  3.49batch/s][A[A[A


Trainin

In [26]:
# Dump every parameter tensor of the model, one after another, for inspection
for weights in model.parameters():
    print(weights)

Parameter containing:
tensor([[-0.1750, -0.0671, -0.1387, -0.1854],
        [-0.2957, -0.1280, -0.2333,  0.2835],
        [ 0.2045,  0.0682,  0.0352,  0.0887],
        [-0.2051,  0.2991,  0.2412, -0.0116],
        [ 0.0659,  0.0202,  0.0315,  0.1427],
        [-0.3118,  0.2163, -0.0558,  0.1101],
        [ 0.2171, -0.0911, -0.1034,  0.1852],
        [ 0.2501,  0.0834,  0.0867, -0.0581],
        [ 0.0291,  0.0741, -0.2078, -0.4078],
        [ 0.2605, -0.3334, -0.1629,  0.0708],
        [ 0.2930,  0.2330,  0.4691, -0.4598],
        [ 0.1670,  0.0664,  0.1288,  0.2249],
        [ 0.2540, -0.2337,  0.6277,  0.2632],
        [ 0.7580,  0.3034, -0.2812,  0.1298],
        [-0.1975,  0.0587, -0.0375, -0.1541],
        [-0.4660, -0.3170,  0.1192,  0.2015],
        [-0.1727, -0.0962, -0.2104, -0.2884],
        [-0.0923, -0.0706, -0.2292, -0.1339],
        [-0.2768, -0.1294, -0.1045, -0.0542],
        [ 0.2322,  0.0929,  0.1657, -0.0226],
        [-0.1075, -0.0336, -0.1570, -0.1927],
        [ 0.

In [15]:
# Quick probe of a discrete-action environment's action space.
# Bound to its own name so it does not overwrite the `env` that the training
# cell's model and reinforcer were built from (cells share one kernel namespace).
# NOTE(review): CartPole is not a MuJoCo task; MujocoEnv is used here only as
# the environment factory - confirm this is intended.
cartpole_env = MujocoEnv('CartPole-v0').instantiate(seed=0)
cartpole_env.action_space

Discrete(2)