In [2]:
import numpy as np
import gym

In [3]:
from keras.models import Sequential
from keras.layers import Dense,Activation,Flatten
from keras.optimizers import Adam

Using TensorFlow backend.


In [5]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [7]:
# setting the environment
env_name = 'CartPole-v0'

In [8]:
# get the environment and extract the number of actions available in cartpole problem
env = gym.make(env_name)

In [9]:
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

In [10]:
# building single hidden layer neural network
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary)

<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x7fa401db2438>>


In [11]:
# creating policy and memory as sequencial memory as we are going to store action and respective reward
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
                                    target_model_update=1e-2, policy=policy)

In [12]:
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [13]:
# Okay, now it's time to learn something! We visualize the training here for show, but this slows down training quite a lot.
dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

Training for 5000 steps ...




   79/5000: episode: 1, duration: 3.733s, episode steps: 79, steps per second: 21, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.426199, mean_absolute_error: 0.494559, mean_q: 0.054424
  111/5000: episode: 2, duration: 0.539s, episode steps: 32, steps per second: 59, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.531 [0.000, 1.000], mean observation: 0.153 [-0.151, 0.650], loss: 0.350789, mean_absolute_error: 0.443192, mean_q: 0.187516
  174/5000: episode: 3, duration: 1.060s, episode steps: 63, steps per second: 59, episode reward: 63.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.508 [0.000, 1.000], mean observation: 0.059 [-0.342, 0.786], loss: 0.312599, mean_absolute_error: 0.469253, mean_q: 0.336845
  208/5000: episode: 4, duration: 0.571s, episode steps: 34, steps per second: 60, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action:

  687/5000: episode: 30, duration: 0.200s, episode steps: 12, steps per second: 60, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.118 [-0.941, 1.699], loss: 0.338919, mean_absolute_error: 2.211718, mean_q: 4.248566
  702/5000: episode: 31, duration: 0.268s, episode steps: 15, steps per second: 56, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.087 [-0.823, 1.424], loss: 0.431989, mean_absolute_error: 2.291770, mean_q: 4.382295
  714/5000: episode: 32, duration: 0.183s, episode steps: 12, steps per second: 65, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.098 [-0.839, 1.536], loss: 0.365381, mean_absolute_error: 2.342580, mean_q: 4.506822
  725/5000: episode: 33, duration: 0.185s, episode steps: 11, steps per second: 60, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean act

 1004/5000: episode: 60, duration: 0.170s, episode steps: 10, steps per second: 59, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.123 [-1.608, 2.497], loss: 0.965405, mean_absolute_error: 3.526048, mean_q: 6.616965
 1013/5000: episode: 61, duration: 0.148s, episode steps: 9, steps per second: 61, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.127 [-1.773, 2.767], loss: 1.043108, mean_absolute_error: 3.549012, mean_q: 6.649749
 1023/5000: episode: 62, duration: 0.166s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.130 [-1.754, 2.734], loss: 1.310334, mean_absolute_error: 3.636896, mean_q: 6.664918
 1034/5000: episode: 63, duration: 0.208s, episode steps: 11, steps per second: 53, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean actio

 1493/5000: episode: 89, duration: 0.824s, episode steps: 49, steps per second: 59, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.087 [-0.630, 0.556], loss: 0.869173, mean_absolute_error: 4.497672, mean_q: 8.500762
 1525/5000: episode: 90, duration: 0.538s, episode steps: 32, steps per second: 59, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.104 [-0.680, 0.246], loss: 0.853077, mean_absolute_error: 4.586233, mean_q: 8.669847
 1553/5000: episode: 91, duration: 0.472s, episode steps: 28, steps per second: 59, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.111 [-0.704, 0.252], loss: 1.131083, mean_absolute_error: 4.758250, mean_q: 8.931754
 1574/5000: episode: 92, duration: 0.347s, episode steps: 21, steps per second: 60, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean 

 1897/5000: episode: 118, duration: 0.168s, episode steps: 10, steps per second: 60, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.132 [-2.564, 1.543], loss: 1.765942, mean_absolute_error: 5.940698, mean_q: 11.185366
 1905/5000: episode: 119, duration: 0.142s, episode steps: 8, steps per second: 56, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.160 [-2.576, 1.578], loss: 2.274333, mean_absolute_error: 5.937957, mean_q: 11.172526
 1915/5000: episode: 120, duration: 0.155s, episode steps: 10, steps per second: 65, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.123 [-2.596, 1.613], loss: 1.805741, mean_absolute_error: 5.920573, mean_q: 11.193787
 1927/5000: episode: 121, duration: 0.204s, episode steps: 12, steps per second: 59, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], 

 2522/5000: episode: 147, duration: 0.519s, episode steps: 31, steps per second: 60, episode reward: 31.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.115 [-0.719, 0.361], loss: 2.154610, mean_absolute_error: 7.004448, mean_q: 13.284513
 2574/5000: episode: 148, duration: 0.877s, episode steps: 52, steps per second: 59, episode reward: 52.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.046 [-0.731, 0.391], loss: 2.437306, mean_absolute_error: 7.091385, mean_q: 13.401912
 2600/5000: episode: 149, duration: 0.435s, episode steps: 26, steps per second: 60, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.112 [-0.959, 0.592], loss: 2.196901, mean_absolute_error: 7.164228, mean_q: 13.608659
 2635/5000: episode: 150, duration: 0.589s, episode steps: 35, steps per second: 59, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000]

 4269/5000: episode: 176, duration: 0.639s, episode steps: 38, steps per second: 59, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.447 [0.000, 1.000], mean observation: -0.151 [-0.743, 0.372], loss: 3.244174, mean_absolute_error: 9.687693, mean_q: 18.653227
 4320/5000: episode: 177, duration: 0.858s, episode steps: 51, steps per second: 59, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.076 [-1.203, 0.435], loss: 3.250243, mean_absolute_error: 9.667204, mean_q: 18.579103
 4451/5000: episode: 178, duration: 2.207s, episode steps: 131, steps per second: 59, episode reward: 131.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.489 [0.000, 1.000], mean observation: -0.050 [-0.687, 0.374], loss: 3.353874, mean_absolute_error: 9.913673, mean_q: 19.080969
 4522/5000: episode: 179, duration: 1.195s, episode steps: 71, steps per second: 59, episode reward: 71.000, mean reward: 1.000 [1.000, 1.00

<keras.callbacks.History at 0x7fa4001b3e80>

In [None]:
# testing
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 54.000, steps: 54
Episode 2: reward: 74.000, steps: 74
