In [31]:
import numpy as np
import gym
import random

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


import tensorflow.keras.layers as layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [2]:
# Create the Pendulum-v1 environment, overriding its `g` (gravity) setting with 9.81.
env = gym.make('Pendulum-v1', g=9.81)

  logger.warn(


In [3]:
# Show the environment's action space as the cell output.
env.action_space

Box(-2.0, 2.0, (1,), float32)

In [4]:
# Show the environment's observation space as the cell output.
env.observation_space

Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)

In [6]:
# NOTE(review): pygame is not referenced directly in this cell; presumably it is
# imported for the render backend — confirm, then move it to the imports cell.
import pygame

# Random-policy baseline: play a few rendered episodes and print each episode's
# total reward, to have a reference point before training.
episodes = 15
# Draw actions from the environment's declared bounds instead of a hard-coded
# [-1, 1) interval, so the baseline covers the whole legal action range.
action_low = float(env.action_space.low[0])
action_high = float(env.action_space.high[0])
for episode in range(episodes):
    state = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = random.uniform(action_low, action_high)
        # The environment expects a sequence, hence the one-element list.
        n_state, reward, done, info = env.step([action])
        score += reward
    print("Episode {}: {}".format(episode, score))


Episode 0: -1459.2110032050145
Episode 1: -1801.1742549225742
Episode 2: -813.9490472146578
Episode 3: -1306.1234088164608
Episode 4: -1070.2868405347342
Episode 5: -1729.1463001628993
Episode 6: -880.0270921625812
Episode 7: -971.8364744874998
Episode 8: -1381.880612433597
Episode 9: -816.6357347031828
Episode 10: -1395.1626295431997
Episode 11: -1273.873835101292
Episode 12: -780.5112547639702
Episode 13: -1387.6843605455756
Episode 14: -1760.042964704504


In [54]:
def build_agent_model(states, actions, action_bound=2.0):
    """Build the actor network that maps an observation window to a bounded action.

    Args:
        states: Number of features in a single observation.
        actions: Number of action dimensions the network outputs.
        action_bound: Factor the tanh output is multiplied by, so every action
            lies in [-action_bound, action_bound]. Defaults to 2.0, the scale
            that used to be hard-coded.

    Returns:
        A Keras ``Model`` named "pendulum_agent" that takes inputs of shape
        (1, states) and yields ``actions`` values per sample.
    """
    # The leading 1 is a window axis; presumably it mirrors the
    # window_length=1 replay memory used elsewhere in this notebook.
    inputs = layers.Input(shape=(1, states))
    x = layers.Dense(64, activation="relu")(inputs)
    x = layers.Dense(64, activation="relu")(x)
    # Collapse the window axis before the output layer.
    x = layers.Flatten()(x)
    # tanh keeps the raw output in [-1, 1]; the multiplication widens it to the
    # requested action range.
    outputs = layers.Dense(actions, activation="tanh")(x)
    outputs = action_bound * outputs
    return Model(inputs, outputs, name="pendulum_agent")

def build_critic_model(states, actions):
    """Build the critic network that scores a (state window, action) pair.

    Args:
        states: Number of features in a single observation.
        actions: Number of action dimensions.

    Returns:
        A tuple ``(model, actions_input)``: the Keras ``Model`` named
        "pendulum_critic" with inputs ``[state_input, actions_input]`` and a
        single linear output, plus the action ``Input`` tensor itself so the
        caller can hand it to the agent.
    """
    # State branch: one hidden layer applied to the (1, states) window, then
    # flattened so it can be concatenated with the action branch.
    states_input = layers.Input(shape=(1, states), name="state_input")
    states_out = layers.Dense(64, activation="relu")(states_input)
    states_out = layers.Flatten()(states_out)

    # Action branch. The shape must be a tuple: ``(actions)`` is just a
    # parenthesised int, so the trailing comma is required.
    actions_input = layers.Input(shape=(actions,), name="actions_input")
    actions_out = layers.Dense(64, activation="relu")(actions_input)

    # Joint head over both branches, ending in one unbounded value.
    x = layers.Concatenate()([states_out, actions_out])
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dense(128, activation="relu")(x)
    output = layers.Dense(1, activation="linear")(x)
    return Model(inputs=[states_input, actions_input], outputs=output, name="pendulum_critic"), actions_input

In [55]:
# Read the network dimensions from the environment instead of repeating them
# as literals, so the models stay in sync with the observation/action spaces.
nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]
agent = build_agent_model(nb_states, nb_actions)
# The critic builder also returns its action Input tensor, which the DDPG
# agent constructor needs.
critic, actions_input = build_critic_model(nb_states, nb_actions)

In [56]:
# Print the actor's layer-by-layer summary (output shapes and parameter counts).
agent.summary()

Model: "pendulum_agent"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_18 (InputLayer)       [(None, 1, 3)]            0         
                                                                 
 dense_51 (Dense)            (None, 1, 64)             256       
                                                                 
 dense_52 (Dense)            (None, 1, 64)             4160      
                                                                 
 flatten_8 (Flatten)         (None, 64)                0         
                                                                 
 dense_53 (Dense)            (None, 1)                 65        
                                                                 
 tf_op_layer_mul_135 (Tensor  [(None, 1)]              0         
 FlowOpLayer)                                                    
                                                    

In [57]:
# Print the critic's layer-by-layer summary, including how its two inputs connect.
critic.summary()

Model: "pendulum_critic"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 state_input (InputLayer)       [(None, 1, 3)]       0           []                               
                                                                                                  
 dense_54 (Dense)               (None, 1, 64)        256         ['state_input[0][0]']            
                                                                                                  
 actions_input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 flatten_9 (Flatten)            (None, 64)           0           ['dense_54[0][0]']               
                                                                                    

In [58]:
def build_agent(actions):
    """Assemble a DDPG agent around the notebook's actor and critic models.

    Reads the module-level ``agent``, ``critic`` and ``actions_input`` objects,
    so the cell that creates them must have been run first.

    Args:
        actions: Number of action dimensions; also sizes the exploration noise.

    Returns:
        An uncompiled ``DDPGAgent``.
    """
    # Replay buffer holding up to 50,000 entries, with a window length of 1.
    replay_buffer = SequentialMemory(limit=50000, window_length=1)
    # Ornstein-Uhlenbeck process supplied to the agent as its random_process.
    exploration_noise = OrnsteinUhlenbeckProcess(size=actions, theta=.15, mu=0., sigma=.3)
    return DDPGAgent(
        actions,
        agent,
        critic,
        actions_input,
        replay_buffer,
        random_process=exploration_noise,
    )

In [66]:
# Build the agent for a single action dimension. Despite the `dqn` name,
# build_agent returns a DDPGAgent.
dqn = build_agent(1)
# Compile with Adam and track mean absolute error during training.
optimizer = Adam(learning_rate=1e-3)
dqn.compile(optimizer, metrics=["mae"])

In [67]:
# Logger classes from the `callbacks` module; only TrainEpisodeLogger is
# instantiated in this cell.
from callbacks import TrainEpisodeLogger, TrainIntervalLogger, TestLogger
# Callback list passed to dqn.fit in the training cell.
callbacks = [TrainEpisodeLogger()]

In [68]:
# Train for 10,000 steps without rendering; logging is handled by `callbacks`
# (verbose=0 presumably keeps fit's own loggers off — confirm).
# NOTE(review): the recorded log shows 200-step episodes, so the 1000-step cap
# below is presumably never reached — confirm.
# Alternative (longer run, built-in logging):
# dqn.fit(env,nb_steps=100000, visualize=False, verbose=1)
dqn.fit(env,nb_steps=10000,visualize=False,verbose = 0, callbacks=callbacks, nb_max_episode_steps=1000)
# Alternative (longer run, episodes capped at 200 steps):
# dqn.fit(env,nb_steps=100000,visualize=False,verbose = 1,  nb_max_episode_steps=200)

Training for 10000 steps ...


  updates=self.state_updates,


  200/10000: episode: 1, duration: 1.235s, episode steps: 200, steps per second: 162, episode reward: -121.863, mean reward: -0.609 [-12.155, -0.006], mean action: -0.630 [-2.618, 1.538], loss: --, mae: --, mean_q: --
The reward is higher than the best one, saving checkpoint weights
  400/10000: episode: 2, duration: 0.672s, episode steps: 200, steps per second: 298, episode reward: -369.862, mean reward: -1.849 [-14.181, -0.006], mean action: -0.006 [-1.781, 2.392], loss: --, mae: --, mean_q: --
The reward is lower than the best one, checkpoint weights not updated
  600/10000: episode: 3, duration: 0.620s, episode steps: 200, steps per second: 322, episode reward: -121.189, mean reward: -0.606 [-12.109, -0.006], mean action: -0.128 [-1.798, 2.523], loss: --, mae: --, mean_q: --
The reward is higher than the best one, saving checkpoint weights
  800/10000: episode: 4, duration: 0.667s, episode steps: 200, steps per second: 300, episode reward: -2.118, mean reward: -0.011 [-0.028, -0.00

  updates=self.state_updates,


 1200/10000: episode: 6, duration: 6.645s, episode steps: 200, steps per second: 30, episode reward: -130.353, mean reward: -0.652 [-14.011, -0.005], mean action: -0.336 [-2.067, 2.121], loss: 0.312279, mae: 0.392920, mean_q: -9.272600
The reward is lower than the best one, checkpoint weights not updated
 1400/10000: episode: 7, duration: 4.838s, episode steps: 200, steps per second: 41, episode reward: -392.726, mean reward: -1.964 [-14.783, -0.005], mean action: 0.351 [-2.036, 2.291], loss: 0.269493, mae: 0.386166, mean_q: -11.210000
The reward is lower than the best one, checkpoint weights not updated
 1600/10000: episode: 8, duration: 4.878s, episode steps: 200, steps per second: 41, episode reward: -253.828, mean reward: -1.269 [-13.293, -0.003], mean action: -0.213 [-2.589, 1.490], loss: 0.230059, mae: 0.394338, mean_q: -12.881079
The reward is lower than the best one, checkpoint weights not updated
 1800/10000: episode: 9, duration: 5.288s, episode steps: 200, steps per second: 

<keras.callbacks.History at 0x1d59887ca90>

In [27]:
# Evaluate the agent for 100 episodes without rendering.
# NOTE(review): the recorded output (reward 500.000 over 500 steps) does not
# match the negative Pendulum rewards logged elsewhere in this notebook, and the
# execution count is out of sequence — presumably stale output from an earlier
# session; re-run after training.
scores = dqn.test(env, nb_episodes=100, visualize=False)


Testing for 100 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500
Episode 11: reward: 500.000, steps: 500
Episode 12: reward: 500.000, steps: 500
Episode 13: reward: 500.000, steps: 500
Episode 14: reward: 500.000, steps: 500
Episode 15: reward: 500.000, steps: 500
Episode 16: reward: 500.000, steps: 500
Episode 17: reward: 500.000, steps: 500
Episode 18: reward: 500.000, steps: 500
Episode 19: reward: 500.000, steps: 500
Episode 20: reward: 500.000, steps: 500
Episode 21: reward: 500.000, steps: 500
Episode 22: reward: 500.000, steps: 500
Episode 23: reward: 500.000, steps: 500
Episode 24: reward: 500.000, steps: 500
Episode 25: reward: 

In [69]:
# Save the agent's weights to ./model/model.h5f, with overwrite=True.
# NOTE(review): presumably the ./model directory must already exist — confirm.
dqn.save_weights("./model/model.h5f", overwrite=True)

In [16]:
# Load weights into the agent from the same path the save cell writes to.
# NOTE(review): this cell's execution count (16) is out of sequence relative to
# the save cell (69) — confirm it still works on a top-to-bottom run.
dqn.load_weights("./model/model.h5f")


In [71]:
# Render three evaluation episodes, each capped at 100 steps; the return value
# is assigned to `_` to keep it out of the cell output.
_ = dqn.test(env, nb_episodes=3, visualize=True,nb_max_episode_steps=100)

Testing for 3 episodes ...
Episode 1: reward: -238.952, steps: 100
Episode 2: reward: -342.503, steps: 100
Episode 3: reward: -232.792, steps: 100
