# Deep Q-Learning (DQN) on CartPole using Keras

In [11]:
# Sanity check: display the path of the Python interpreter this kernel runs on,
# so we can confirm the notebook is using the intended environment.
import sys
sys.executable

'/Users/sandeep/anaconda3/envs/rlenv/bin/python'

In [12]:
import numpy as np
import gym

import keras.backend as k

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory

# Gym environment and action space

In [3]:
ENV_NAME = 'CartPole-v0'
# One seed shared by NumPy and the environment, defined once instead of being
# repeated as a bare literal at each call site.
SEED = 1

# Create the CartPole environment.
env = gym.make(ENV_NAME)

# Seed NumPy's global RNG and the environment with the same value.
# NOTE(review): the Keras backend is not seeded here, so network weight
# initialisation may still differ between runs -- confirm if exact
# reproducibility is required.
np.random.seed(SEED)
env.seed(SEED)

# Number of actions available to the agent in this environment.
nb_actions = env.action_space.n

In [4]:
# Shape fed to the network: a leading axis of length 1 followed by the
# environment's observation shape.
# NOTE(review): the leading 1 presumably corresponds to window_length=1 of the
# replay memory configured further down -- keep the two in sync.
input_shape = (1, *env.observation_space.shape)
print(input_shape)

(1, 4)


# Simple NN model

In [5]:
# Small fully connected Q-network: flatten the observation window, pass it
# through one hidden layer of 16 ReLU units, and emit one linear output per
# action.
model = Sequential()
# Reuse the shape computed in the previous cell instead of rebuilding the same
# expression here.
model.add(Flatten(input_shape=input_shape))
model.add(Dense(16, name="Dense_1"))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
Dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# Exploration policy: epsilon-greedy, with the `eps` attribute annealed linearly
# from 1.00 down to 0.05 over the first 10000 steps; 0.05 is also the value
# used when the agent runs in test mode.
eps_greedy = EpsGreedyQPolicy()
policy = LinearAnnealedPolicy(
    eps_greedy,
    attr='eps',
    value_max=1.00,
    value_min=.05,
    value_test=.05,
    nb_steps=10000,
)

# Replay memory capped at 10000 entries, with a window length of 1.
memory = SequentialMemory(limit=10000, window_length=1)

# Assemble the DQN agent from the network, the replay memory and the policy.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=100,
    policy=policy,
)

# Compile with the Adam optimizer (learning rate 0.001) and track mean absolute
# error as an additional metric.
dqn.compile(Adam(lr=0.001), metrics=['mae'])






In [7]:
# Train the agent for 50000 environment steps. verbose=2 logs one line per
# episode; visualize=True renders the environment while training.
# Keep the returned History object in a variable so it can be inspected later,
# rather than letting its bare repr become the cell output.
train_history = dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

Training for 50000 steps ...




    16/50000: episode: 1, duration: 2.231s, episode steps: 16, steps per second: 7, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.084 [-0.788, 1.345], loss: 1.072774, mean_absolute_error: 0.919518, mean_q: 0.609775, mean_eps: 0.998765
    34/50000: episode: 2, duration: 0.056s, episode steps: 18, steps per second: 323, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.098 [-1.294, 0.559], loss: 0.756542, mean_absolute_error: 0.768471, mean_q: 0.512654, mean_eps: 0.997672
    54/50000: episode: 3, duration: 0.070s, episode steps: 20, steps per second: 287, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.115 [-0.614, 1.473], loss: 0.555532, mean_absolute_error: 0.702276, mean_q: 0.470321, mean_eps: 0.995867




    66/50000: episode: 4, duration: 0.120s, episode steps: 12, steps per second: 100, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.093 [-0.843, 1.525], loss: 0.477063, mean_absolute_error: 0.713304, mean_q: 0.587330, mean_eps: 0.994347
    81/50000: episode: 5, duration: 0.049s, episode steps: 15, steps per second: 306, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.076 [-2.068, 1.368], loss: 0.404321, mean_absolute_error: 0.721789, mean_q: 0.691998, mean_eps: 0.993065
   110/50000: episode: 6, duration: 0.076s, episode steps: 29, steps per second: 382, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.552 [0.000, 1.000], mean observation: -0.023 [-1.430, 0.929], loss: 0.424261, mean_absolute_error: 0.753687, mean_q: 0.746574, mean_eps: 0.990975
   125/50000: episode: 7, duration: 0.039s, episode steps: 15, steps per second: 38

   716/50000: episode: 33, duration: 0.055s, episode steps: 21, steps per second: 382, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.039 [-1.172, 1.703], loss: 1.650535, mean_absolute_error: 3.977781, mean_q: 7.086475, mean_eps: 0.933025
   729/50000: episode: 34, duration: 0.036s, episode steps: 13, steps per second: 359, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.081 [-1.384, 2.057], loss: 1.834634, mean_absolute_error: 4.220618, mean_q: 7.646779, mean_eps: 0.931410
   751/50000: episode: 35, duration: 0.057s, episode steps: 22, steps per second: 385, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.545 [0.000, 1.000], mean observation: -0.123 [-1.498, 0.586], loss: 1.313973, mean_absolute_error: 4.168688, mean_q: 7.990619, mean_eps: 0.929747
   775/50000: episode: 36, duration: 0.073s, episode steps: 24, steps per second:

  1369/50000: episode: 61, duration: 0.077s, episode steps: 31, steps per second: 401, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.419 [0.000, 1.000], mean observation: 0.049 [-1.147, 1.876], loss: 3.744569, mean_absolute_error: 7.154407, mean_q: 13.536773, mean_eps: 0.871465
  1382/50000: episode: 62, duration: 0.034s, episode steps: 13, steps per second: 388, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.125 [-1.324, 2.166], loss: 4.025679, mean_absolute_error: 7.220718, mean_q: 13.802047, mean_eps: 0.869375
  1403/50000: episode: 63, duration: 0.054s, episode steps: 21, steps per second: 388, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.081 [-0.798, 1.470], loss: 2.750321, mean_absolute_error: 7.162532, mean_q: 13.826633, mean_eps: 0.867760
  1425/50000: episode: 64, duration: 0.055s, episode steps: 22, steps per secon

  1954/50000: episode: 88, duration: 0.034s, episode steps: 13, steps per second: 378, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.105 [-0.946, 1.660], loss: 4.821831, mean_absolute_error: 8.597705, mean_q: 16.073599, mean_eps: 0.815035
  2000/50000: episode: 89, duration: 0.131s, episode steps: 46, steps per second: 350, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.543 [0.000, 1.000], mean observation: -0.050 [-1.607, 0.770], loss: 4.187607, mean_absolute_error: 8.590580, mean_q: 16.166035, mean_eps: 0.812232
  2010/50000: episode: 90, duration: 0.027s, episode steps: 10, steps per second: 373, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.133 [-1.669, 0.952], loss: 5.219007, mean_absolute_error: 8.752424, mean_q: 16.107631, mean_eps: 0.809573
  2055/50000: episode: 91, duration: 0.111s, episode steps: 45, steps per sec

  2697/50000: episode: 118, duration: 0.074s, episode steps: 29, steps per second: 392, episode reward: 29.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.517 [0.000, 1.000], mean observation: 0.088 [-0.741, 1.145], loss: 4.803394, mean_absolute_error: 9.274438, mean_q: 17.489754, mean_eps: 0.745210
  2730/50000: episode: 119, duration: 0.093s, episode steps: 33, steps per second: 354, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.047 [-0.992, 1.795], loss: 4.985629, mean_absolute_error: 9.346875, mean_q: 17.508879, mean_eps: 0.742265
  2754/50000: episode: 120, duration: 0.111s, episode steps: 24, steps per second: 217, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.103 [-0.950, 0.442], loss: 3.446511, mean_absolute_error: 9.304729, mean_q: 17.716523, mean_eps: 0.739558
  2786/50000: episode: 121, duration: 0.120s, episode steps: 32, steps per 

  3664/50000: episode: 147, duration: 0.108s, episode steps: 42, steps per second: 390, episode reward: 42.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: 0.042 [-0.999, 1.321], loss: 3.888794, mean_absolute_error: 10.449230, mean_q: 20.052266, mean_eps: 0.653962
  3707/50000: episode: 148, duration: 0.112s, episode steps: 43, steps per second: 383, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.488 [0.000, 1.000], mean observation: 0.042 [-0.804, 1.157], loss: 5.417473, mean_absolute_error: 10.546180, mean_q: 20.071957, mean_eps: 0.649925
  3763/50000: episode: 149, duration: 0.145s, episode steps: 56, steps per second: 387, episode reward: 56.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.100 [-0.986, 0.538], loss: 4.936752, mean_absolute_error: 10.527509, mean_q: 20.103037, mean_eps: 0.645223
  3785/50000: episode: 150, duration: 0.058s, episode steps: 22, steps p

  4622/50000: episode: 174, duration: 0.156s, episode steps: 57, steps per second: 365, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.509 [0.000, 1.000], mean observation: 0.098 [-0.770, 1.088], loss: 6.055066, mean_absolute_error: 11.960460, mean_q: 22.961523, mean_eps: 0.563665
  4681/50000: episode: 175, duration: 0.168s, episode steps: 59, steps per second: 352, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.492 [0.000, 1.000], mean observation: -0.100 [-1.145, 0.526], loss: 4.975235, mean_absolute_error: 12.012402, mean_q: 23.194024, mean_eps: 0.558155
  4696/50000: episode: 176, duration: 0.046s, episode steps: 15, steps per second: 329, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.078 [-0.825, 1.448], loss: 3.997256, mean_absolute_error: 12.070277, mean_q: 23.360219, mean_eps: 0.554640
  4783/50000: episode: 177, duration: 0.232s, episode steps: 87, steps p

  5738/50000: episode: 202, duration: 0.148s, episode steps: 44, steps per second: 297, episode reward: 44.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.477 [0.000, 1.000], mean observation: -0.089 [-0.916, 0.575], loss: 8.152367, mean_absolute_error: 13.935198, mean_q: 26.825813, mean_eps: 0.457028
  5858/50000: episode: 203, duration: 0.334s, episode steps: 120, steps per second: 360, episode reward: 120.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.034 [-0.728, 1.557], loss: 7.179053, mean_absolute_error: 14.055230, mean_q: 27.227070, mean_eps: 0.449238
  5880/50000: episode: 204, duration: 0.068s, episode steps: 22, steps per second: 323, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.115 [-0.985, 0.359], loss: 6.984232, mean_absolute_error: 14.028120, mean_q: 27.220686, mean_eps: 0.442493
  5947/50000: episode: 205, duration: 0.201s, episode steps: 67, step

  7114/50000: episode: 229, duration: 0.340s, episode steps: 120, steps per second: 353, episode reward: 120.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.054 [-0.734, 0.870], loss: 9.942266, mean_absolute_error: 15.795280, mean_q: 30.595701, mean_eps: 0.329918
  7160/50000: episode: 230, duration: 0.129s, episode steps: 46, steps per second: 356, episode reward: 46.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.129 [-1.319, 0.456], loss: 7.985271, mean_absolute_error: 15.923508, mean_q: 31.033326, mean_eps: 0.322033
  7205/50000: episode: 231, duration: 0.134s, episode steps: 45, steps per second: 335, episode reward: 45.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.141 [-0.744, 0.376], loss: 6.283641, mean_absolute_error: 15.912469, mean_q: 31.201680, mean_eps: 0.317710
  7285/50000: episode: 232, duration: 0.254s, episode steps: 80, step

  9027/50000: episode: 256, duration: 0.326s, episode steps: 112, steps per second: 344, episode reward: 112.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.518 [0.000, 1.000], mean observation: 0.079 [-0.578, 0.871], loss: 10.657747, mean_absolute_error: 18.142294, mean_q: 35.290063, mean_eps: 0.147803
  9088/50000: episode: 257, duration: 0.171s, episode steps: 61, steps per second: 357, episode reward: 61.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.541 [0.000, 1.000], mean observation: 0.158 [-0.709, 0.931], loss: 11.987586, mean_absolute_error: 18.025737, mean_q: 35.027315, mean_eps: 0.139585
  9177/50000: episode: 258, duration: 0.226s, episode steps: 89, steps per second: 393, episode reward: 89.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.124 [-0.492, 0.883], loss: 8.842292, mean_absolute_error: 18.156958, mean_q: 35.425089, mean_eps: 0.132460
  9238/50000: episode: 259, duration: 0.157s, episode steps: 61, step

 12341/50000: episode: 283, duration: 0.499s, episode steps: 200, steps per second: 401, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: -0.005 [-0.582, 0.463], loss: 11.750365, mean_absolute_error: 21.603866, mean_q: 42.561425, mean_eps: 0.050000
 12541/50000: episode: 284, duration: 0.509s, episode steps: 200, steps per second: 393, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: -0.184 [-1.181, 0.407], loss: 10.235331, mean_absolute_error: 21.789592, mean_q: 43.079889, mean_eps: 0.050000
 12741/50000: episode: 285, duration: 0.498s, episode steps: 200, steps per second: 402, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.515 [0.000, 1.000], mean observation: 0.149 [-0.729, 1.233], loss: 10.045539, mean_absolute_error: 22.006873, mean_q: 43.534778, mean_eps: 0.050000
 12941/50000: episode: 286, duration: 0.531s, episode steps: 2

 17596/50000: episode: 310, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.157 [-0.411, 0.982], loss: 7.453832, mean_absolute_error: 28.966702, mean_q: 57.926329, mean_eps: 0.050000
 17796/50000: episode: 311, duration: 0.505s, episode steps: 200, steps per second: 396, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.183 [-0.932, 1.142], loss: 6.969012, mean_absolute_error: 29.277246, mean_q: 58.602483, mean_eps: 0.050000
 17996/50000: episode: 312, duration: 0.500s, episode steps: 200, steps per second: 400, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.239 [-0.515, 1.484], loss: 9.558787, mean_absolute_error: 29.447801, mean_q: 58.811481, mean_eps: 0.050000
 18196/50000: episode: 313, duration: 0.518s, episode steps: 200, s

 22996/50000: episode: 337, duration: 0.500s, episode steps: 200, steps per second: 400, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.048 [-0.374, 0.614], loss: 6.506877, mean_absolute_error: 32.527287, mean_q: 65.088384, mean_eps: 0.050000
 23196/50000: episode: 338, duration: 0.492s, episode steps: 200, steps per second: 406, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.152 [-0.477, 0.906], loss: 6.393116, mean_absolute_error: 32.582568, mean_q: 65.166283, mean_eps: 0.050000
 23396/50000: episode: 339, duration: 0.495s, episode steps: 200, steps per second: 404, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.032 [-0.378, 0.448], loss: 4.681974, mean_absolute_error: 32.800659, mean_q: 65.696759, mean_eps: 0.050000
 23596/50000: episode: 340, duration: 0.482s, episode steps: 200, s

 28396/50000: episode: 364, duration: 0.606s, episode steps: 200, steps per second: 330, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.106 [-0.405, 0.663], loss: 3.715341, mean_absolute_error: 35.213592, mean_q: 70.384743, mean_eps: 0.050000
 28596/50000: episode: 365, duration: 0.488s, episode steps: 200, steps per second: 410, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.140 [-0.448, 0.820], loss: 5.648756, mean_absolute_error: 35.366050, mean_q: 70.635232, mean_eps: 0.050000
 28796/50000: episode: 366, duration: 0.495s, episode steps: 200, steps per second: 404, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.066 [-0.435, 0.522], loss: 4.394115, mean_absolute_error: 35.183986, mean_q: 70.309164, mean_eps: 0.050000
 28996/50000: episode: 367, duration: 0.483s, episode steps: 200, s

 33796/50000: episode: 391, duration: 0.478s, episode steps: 200, steps per second: 419, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.031 [-0.702, 0.708], loss: 6.813499, mean_absolute_error: 36.207569, mean_q: 72.179127, mean_eps: 0.050000
 33996/50000: episode: 392, duration: 0.477s, episode steps: 200, steps per second: 420, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.046 [-0.529, 0.537], loss: 5.977262, mean_absolute_error: 36.363814, mean_q: 72.508857, mean_eps: 0.050000
 34196/50000: episode: 393, duration: 0.515s, episode steps: 200, steps per second: 388, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.505 [0.000, 1.000], mean observation: 0.059 [-0.510, 0.657], loss: 5.716766, mean_absolute_error: 36.318783, mean_q: 72.423394, mean_eps: 0.050000
 34396/50000: episode: 394, duration: 0.520s, episode steps: 200, s

 39196/50000: episode: 418, duration: 0.495s, episode steps: 200, steps per second: 404, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.007 [-0.583, 0.603], loss: 10.891399, mean_absolute_error: 36.829249, mean_q: 72.991313, mean_eps: 0.050000
 39396/50000: episode: 419, duration: 0.488s, episode steps: 200, steps per second: 410, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.023 [-0.554, 0.764], loss: 12.868482, mean_absolute_error: 37.123263, mean_q: 73.613216, mean_eps: 0.050000
 39596/50000: episode: 420, duration: 0.487s, episode steps: 200, steps per second: 411, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.008 [-0.754, 0.670], loss: 11.266116, mean_absolute_error: 37.002506, mean_q: 73.570301, mean_eps: 0.050000
 39796/50000: episode: 421, duration: 0.509s, episode steps: 20

 44596/50000: episode: 445, duration: 0.514s, episode steps: 200, steps per second: 389, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.045 [-1.107, 1.168], loss: 11.245718, mean_absolute_error: 38.247369, mean_q: 76.043513, mean_eps: 0.050000
 44796/50000: episode: 446, duration: 0.500s, episode steps: 200, steps per second: 400, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.017 [-1.140, 1.181], loss: 13.377765, mean_absolute_error: 38.428352, mean_q: 76.206466, mean_eps: 0.050000
 44996/50000: episode: 447, duration: 0.513s, episode steps: 200, steps per second: 390, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.060 [-0.447, 0.439], loss: 10.168802, mean_absolute_error: 38.383181, mean_q: 76.327618, mean_eps: 0.050000
 45196/50000: episode: 448, duration: 0.625s, episode steps: 2

 46881/50000: episode: 476, duration: 0.027s, episode steps: 10, steps per second: 377, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.144 [-2.426, 1.539], loss: 7.218714, mean_absolute_error: 38.101210, mean_q: 76.038538, mean_eps: 0.050000
 46891/50000: episode: 477, duration: 0.027s, episode steps: 10, steps per second: 375, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.112 [-2.964, 1.978], loss: 14.289712, mean_absolute_error: 38.612946, mean_q: 77.099623, mean_eps: 0.050000
 46901/50000: episode: 478, duration: 0.027s, episode steps: 10, steps per second: 367, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.150 [-2.704, 1.725], loss: 13.472761, mean_absolute_error: 38.784985, mean_q: 77.337586, mean_eps: 0.050000
 46912/50000: episode: 479, duration: 0.028s, episode steps: 11, ste

 47386/50000: episode: 509, duration: 0.026s, episode steps: 10, steps per second: 390, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.129 [-1.533, 2.515], loss: 6.023609, mean_absolute_error: 38.551815, mean_q: 77.507533, mean_eps: 0.050000
 47396/50000: episode: 510, duration: 0.028s, episode steps: 10, steps per second: 361, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.118 [-1.925, 2.971], loss: 13.862841, mean_absolute_error: 38.977180, mean_q: 78.299212, mean_eps: 0.050000
 47405/50000: episode: 511, duration: 0.024s, episode steps: 9, steps per second: 378, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.150 [-1.782, 2.828], loss: 1.088944, mean_absolute_error: 38.385212, mean_q: 77.082311, mean_eps: 0.050000
 47414/50000: episode: 512, duration: 0.023s, episode steps: 9, steps per 

 48414/50000: episode: 536, duration: 0.513s, episode steps: 200, steps per second: 390, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.001 [-0.485, 0.532], loss: 18.542920, mean_absolute_error: 38.098516, mean_q: 74.820585, mean_eps: 0.050000
 48614/50000: episode: 537, duration: 0.510s, episode steps: 200, steps per second: 392, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.023 [-0.827, 0.560], loss: 17.372756, mean_absolute_error: 37.852730, mean_q: 74.416143, mean_eps: 0.050000
 48814/50000: episode: 538, duration: 0.550s, episode steps: 200, steps per second: 364, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.026 [-0.417, 0.588], loss: 22.856058, mean_absolute_error: 37.702851, mean_q: 73.926269, mean_eps: 0.050000
 49014/50000: episode: 539, duration: 0.519s, episode steps: 2

<keras.callbacks.History at 0x10c6a3d30>

In [9]:
# Evaluate the trained agent for 10 episodes, rendering each one.
# Keep the returned History object in a variable so it can be inspected later,
# rather than letting its bare repr become the cell output.
test_history = dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200


<keras.callbacks.History at 0x10c6a3e80>

In [10]:
# Close the environment now that training and evaluation are finished.
env.close()