In [1]:
import numpy as np
import tensorflow as tf
import gym

from infrastructure.buffer import ReplayBuffer
from utils.util import build_q_network, build_p_network, OrnsteinUhlenbeckActionNoise
from agents.ddpg_agent import DDPGAgent
from learners.ddpg_learner import DDPGLearner

In [2]:
# Instantiate the task and reset it once; the initial observation returned by
# reset() is not needed here and is discarded.
env = gym.make("InvertedDoublePendulum-v2")
env.reset()

## environment config
# Report both spaces together with their upper/lower bounds. Each entry is a
# (label, value) pair so that the printout is driven by data instead of
# repeated print statements.
for label, value in (
    ('Action Space: ', env.action_space),
    ('Action Space H : ', env.action_space.high),
    ('Action Space L : ', env.action_space.low),
    (None, None),  # blank separator line between the two groups is not printed; see below
    ('Obsrev Space: ', env.observation_space),
    ('Obsrev H: ', env.observation_space.high),
    ('Obsrev L: ', env.observation_space.low),
):
    if label is None:
        # Purely a visual grouping marker in the table above; emits nothing.
        continue
    print(label, value)

## GPU configuration
# Memory growth must be enabled before TensorFlow initialises the GPUs, so this
# has to run ahead of the network construction further down.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

## Agent components
# Problem dimensions are read from the environment so nothing below is tied to
# one particular task.
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

# NOTE(review): 10**4 is presumably the buffer capacity -- confirm against ReplayBuffer.
buffer = ReplayBuffer(10**4)

# Exploration noise. The mean vector is sized from n_actions instead of a
# hard-coded (1, 1), which yields the same shape for this one-dimensional action
# space and stays correct for environments with more action dimensions.
# NOTE(review): the second argument (0.05) is presumably the noise scale -- confirm.
noise = OrnsteinUhlenbeckActionNoise(np.zeros((1, n_actions)), 0.05)

# The policy network receives the upper action bound (first component of
# env.action_space.high) in addition to the state dimension.
q_network = build_q_network(n_states, n_actions)
p_network = build_p_network(n_states, env.action_space.high[0])

# NOTE(review): the positional hyperparameters (0.99, 1e-3, 1e-3, 1e-4) are not
# named at the call site; check DDPGAgent's signature for their meaning before
# tuning them.
pg = DDPGAgent(q_network, p_network, 0.99, 1e-3, 1e-3, 1e-4)

Action Space:  Box(1,)
Action Space H :  [1.]
Action Space L :  [-1.]
Obsrev Space:  Box(11,)
Obsrev H:  [inf inf inf inf inf inf inf inf inf inf inf]
Obsrev L:  [-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf]


In [3]:
# Build the learner from the environment, the replay buffer and the agent.
trainer = DDPGLearner(
    env,
    buffer,
    pg,
    maximize=True,
    batch_size=64,
)

# Run training with the exploration noise and keep whatever train() returns in
# `hist`. As the log below shows, the run pre-samples with a random policy,
# reports per-episode statistics and saves the models to `save_path`.
hist = trainer.train(
    presample=100,
    noise=noise,
    episodes=10,
    interval=1,
    save_path="./",
    save_name="DoubleInvertedPendulum",
)

Start pre-sampling with random policy...
Pre-sampling finished!
Episode: 0, Episode Length: 11, Mean Cost: -9.205122438465317
Models are saved to ./
Episode: 1, Episode Length: 9, Mean Cost: -9.194553257846268
Models are saved to ./
Episode: 2, Episode Length: 9, Mean Cost: -9.182285654400651
Models are saved to ./
Episode: 3, Episode Length: 8, Mean Cost: -9.171710962830465
Models are saved to ./
Episode: 4, Episode Length: 9, Mean Cost: -9.176630289596652
Models are saved to ./
Episode: 5, Episode Length: 6, Mean Cost: -9.156272283504675
Models are saved to ./
Episode: 6, Episode Length: 5, Mean Cost: -9.07329919479283
Models are saved to ./
Episode: 7, Episode Length: 5, Mean Cost: -9.022322890867697
Models are saved to ./
Episode: 8, Episode Length: 4, Mean Cost: -9.073048113464708
Models are saved to ./
Episode: 9, Episode Length: 4, Mean Cost: -8.992952739075568
Models are saved to ./
Training finished!


In [4]:
# Draw a single sample from the exploration-noise object to inspect its shape and magnitude.
noise()

array([[0.02096988]])

In [5]:
# Draw one random action from the environment's action space to inspect its shape and dtype.
env.action_space.sample()

array([0.5834904], dtype=float32)