In [1]:
import numpy as np
import gym

import warnings
warnings.filterwarnings('ignore')

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

Using TensorFlow backend.


## Set Environment

In [3]:
# Id of the Gym environment to train on; handed to gym.make() in the next cell.
ENV_NAME = "CartPole-v0"

In [4]:
# One seed shared by the environment and NumPy's global RNG, so the value is
# defined in a single place instead of being repeated as a magic number.
# NOTE(review): the TensorFlow backend is not seeded here, so network weight
# initialisation may still differ between runs -- confirm if exact
# reproducibility matters.
SEED = 9

env = gym.make(ENV_NAME)
env.seed(SEED)
np.random.seed(SEED)

# Number of discrete actions; used as the width of the network's output layer.
nb_actions = env.action_space.n

## Build DQN Architecture

In [5]:
# Start from an empty Sequential model; the layers are added in the next cell.
model = Sequential()

In [6]:
# Q-network: flatten the input, pass it through one 16-unit ReLU hidden layer,
# then emit one linear output per action.
# The leading 1 in input_shape presumably has to match the window_length given
# to SequentialMemory (WINDOW_LENGTH, defined in the config cell) -- confirm
# before changing either value.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


## Set Configs

In [7]:
# --- Replay memory (both values go to SequentialMemory) ---
MEMORY_LIMIT = 50000    # passed as `limit`
WINDOW_LENGTH = 1       # passed as `window_length`

# --- Agent / training schedule ---
NB_WARMUP = 10          # passed to DQNAgent as `nb_steps_warmup`
MODEL_UPDATE = 0.01     # passed to DQNAgent as `target_model_update`
NB_STEPS = 5000         # passed to dqn.fit as `nb_steps`

# --- Optimisation ---
LR = 0.001              # learning rate given to Adam
# NOTE: despite its name, this value is passed to dqn.compile() in the
# `metrics` list; it is not used as a loss argument anywhere in this notebook.
LOSS = 'mae'

## Set Memory, Policy & Init DQN

In [8]:
# Experience store handed to the DQN agent; its capacity and window length
# come from the config cell.
memory = SequentialMemory(
    limit=MEMORY_LIMIT,
    window_length=WINDOW_LENGTH,
)

In [9]:
# Epsilon-greedy action selection for the agent. No epsilon is passed, so the
# library default applies.
# NOTE(review): presumably eps=0.1 in keras-rl -- confirm against the installed version.
policy = EpsGreedyQPolicy()

In [10]:
# Assemble the agent from the Q-network, replay memory and exploration policy
# built in the cells above, plus the schedule constants from the config cell.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=NB_WARMUP,
    target_model_update=MODEL_UPDATE,
)

## Set Optimizer

In [11]:
# Compile the agent with an Adam optimizer at the configured learning rate.
# LOSS ('mae') is supplied here as a reported metric.
dqn.compile(optimizer=Adam(lr=LR), metrics=[LOSS])

## Train DQN

In [12]:
# Train the agent for NB_STEPS environment steps, logging one line per episode
# (verbose=2). The History object returned by fit() is kept so it can be
# inspected later, instead of being discarded and shown only as an opaque
# `<keras.callbacks.History ...>` repr.
# NOTE(review): visualize=True renders the environment during training; this
# likely needs a display and slows training -- consider visualize=False for
# headless or faster runs.
training_history = dqn.fit(env, nb_steps=NB_STEPS, visualize=True, verbose=2)

Training for 5000 steps ...
Instructions for updating:
Use tf.cast instead.
   71/5000: episode: 1, duration: 3.245s, episode steps: 71, steps per second: 22, episode reward: 71.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.479 [0.000, 1.000], mean observation: -0.067 [-0.732, 0.411], loss: 0.430876, mean_absolute_error: 0.493167, mean_q: 0.140679
  107/5000: episode: 2, duration: 0.598s, episode steps: 36, steps per second: 60, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.106 [-0.845, 0.214], loss: 0.330674, mean_absolute_error: 0.443146, mean_q: 0.331545
  141/5000: episode: 3, duration: 0.565s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.118 [-0.737, 0.205], loss: 0.276651, mean_absolute_error: 0.452350, mean_q: 0.499531
  181/5000: episode: 4, duration: 0.666s, episode steps: 40, steps per seco

  695/5000: episode: 29, duration: 0.217s, episode steps: 13, steps per second: 60, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.097 [-1.685, 1.005], loss: 0.423695, mean_absolute_error: 2.609868, mean_q: 4.969396
  705/5000: episode: 30, duration: 0.165s, episode steps: 10, steps per second: 61, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.125 [-1.531, 0.978], loss: 0.578688, mean_absolute_error: 2.711513, mean_q: 5.073770
  723/5000: episode: 31, duration: 0.307s, episode steps: 18, steps per second: 59, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.096 [-1.139, 0.738], loss: 0.569537, mean_absolute_error: 2.740453, mean_q: 5.113036
  739/5000: episode: 32, duration: 0.259s, episode steps: 16, steps per second: 62, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean 

 1098/5000: episode: 58, duration: 0.248s, episode steps: 15, steps per second: 60, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.089 [-1.264, 0.810], loss: 1.139373, mean_absolute_error: 4.025370, mean_q: 7.495666
 1114/5000: episode: 59, duration: 0.266s, episode steps: 16, steps per second: 60, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.120 [-0.974, 0.544], loss: 0.908806, mean_absolute_error: 4.022348, mean_q: 7.592436
 1133/5000: episode: 60, duration: 0.322s, episode steps: 19, steps per second: 59, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.096 [-1.069, 0.590], loss: 1.005674, mean_absolute_error: 4.108882, mean_q: 7.726799
 1153/5000: episode: 61, duration: 0.102s, episode steps: 20, steps per second: 196, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean

 1852/5000: episode: 89, duration: 0.194s, episode steps: 9, steps per second: 46, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.139 [-1.760, 2.821], loss: 1.418601, mean_absolute_error: 5.901606, mean_q: 11.350758
 1862/5000: episode: 90, duration: 0.164s, episode steps: 10, steps per second: 61, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.153 [-1.975, 3.115], loss: 1.906922, mean_absolute_error: 5.952682, mean_q: 11.442248
 1873/5000: episode: 91, duration: 0.183s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.118 [-1.792, 2.800], loss: 2.172411, mean_absolute_error: 6.043588, mean_q: 11.571087
 1884/5000: episode: 92, duration: 0.182s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean ac

 3028/5000: episode: 118, duration: 0.883s, episode steps: 49, steps per second: 56, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: -0.014 [-1.186, 0.578], loss: 1.999811, mean_absolute_error: 7.867227, mean_q: 15.118134
 3053/5000: episode: 119, duration: 0.430s, episode steps: 25, steps per second: 58, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: -0.106 [-0.729, 0.375], loss: 2.579384, mean_absolute_error: 8.050916, mean_q: 15.439906
 3097/5000: episode: 120, duration: 0.752s, episode steps: 44, steps per second: 59, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.067 [-0.507, 1.037], loss: 2.962845, mean_absolute_error: 8.042192, mean_q: 15.359389
 3140/5000: episode: 121, duration: 0.729s, episode steps: 43, steps per second: 59, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000],

 4557/5000: episode: 147, duration: 1.510s, episode steps: 82, steps per second: 54, episode reward: 82.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.106 [-0.770, 0.359], loss: 3.906746, mean_absolute_error: 10.318641, mean_q: 20.052610
 4609/5000: episode: 148, duration: 0.890s, episode steps: 52, steps per second: 58, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.162 [-0.802, 0.366], loss: 4.436498, mean_absolute_error: 10.330420, mean_q: 19.978947
 4685/5000: episode: 149, duration: 1.291s, episode steps: 76, steps per second: 59, episode reward: 76.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.078 [-0.550, 0.891], loss: 3.650803, mean_absolute_error: 10.339061, mean_q: 20.154089
 4716/5000: episode: 150, duration: 0.558s, episode steps: 31, steps per second: 56, episode reward: 31.000, mean reward: 1.000 [1.000, 1.00

<keras.callbacks.History at 0xb3a6ec4e0>

## Test DQN

In [16]:
# Number of evaluation episodes, named so it is easy to find and tune.
NB_TEST_EPISODES = 10

# Evaluate the trained agent. The returned History object is kept for later
# inspection rather than being shown only as an opaque repr.
test_history = dqn.test(env, nb_episodes=NB_TEST_EPISODES, visualize=True)

# visualize=True renders the environment; release the rendering resources once
# evaluation is finished so they do not linger after the cell completes.
env.close()

Testing for 10 episodes ...
Episode 1: reward: 70.000, steps: 70
Episode 2: reward: 82.000, steps: 82
Episode 3: reward: 197.000, steps: 197
Episode 4: reward: 94.000, steps: 94
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 143.000, steps: 143
Episode 7: reward: 58.000, steps: 58
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 77.000, steps: 77
Episode 10: reward: 85.000, steps: 85


<keras.callbacks.History at 0xb4290deb8>

---