# Deep Q-Learning using Keras

In [6]:
import numpy as np
import gym


from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory

# Gym env and actions

In [7]:
ENV_NAME = "CartPole-v0"

# Build the Gym environment and read how many discrete actions it exposes.
env = gym.make(ENV_NAME)
nb_actions = env.action_space.n

# Fix the seed of NumPy's global RNG and of the environment to the same value (1).
np.random.seed(1)
env.seed(1)

In [8]:
# Network input shape: the observation shape with a leading axis of length 1.
input_shape = (1, *env.observation_space.shape)
print(input_shape)

(1, 4)


# Simple NN model

In [9]:
# Q-network: flatten the input, one hidden Dense layer of 16 ReLU units,
# then a linear Dense layer with one output per action.
model = Sequential()
# Reuse the shape computed in the previous cell rather than rebuilding it here.
model.add(Flatten(input_shape=input_shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# Exploration policy: an epsilon-greedy policy wrapped in LinearAnnealedPolicy,
# which is configured on the 'eps' attribute with max 1.0, min 0.05, a test
# value of 0.05 and a schedule length of 10000 steps.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.00,
    value_min=.05,
    value_test=.05,
    nb_steps=10000,
)

# Experience memory capped at 10000 entries.
# NOTE(review): window_length=1 presumably corresponds to the leading 1 in the
# model's input shape — confirm if either value is changed.
memory = SequentialMemory(
    limit=10000,
    window_length=1,
)

# Assemble the DQN agent from the network, memory and policy defined above.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=100,
    policy=policy,
)

# Compile with Adam (lr=0.001) and track mean absolute error as a metric.
dqn.compile(
    Adam(lr=0.001),
    metrics=['mae'],
)

W1118 20:02:53.499825 4514584000 deprecation_wrapper.py:119] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:159: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W1118 20:02:53.500586 4514584000 deprecation_wrapper.py:119] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:164: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W1118 20:02:53.598199 4514584000 deprecation_wrapper.py:119] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/optimizers.py:711: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1118 20:02:53.634010 4514584000 deprecation.py:506] From /Users/sandeep/anaconda3/envs/py36/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:1247: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with

In [14]:
# Train the agent on the environment for 50000 steps.
# NOTE(review): visualize=True renders the environment during training, which
# presumably slows the run considerably — confirm it is wanted for long runs.
dqn.fit(
    env,
    nb_steps=50000,
    visualize=True,
    verbose=2,
)

In [None]:
# Evaluate the trained agent for 100 episodes with rendering enabled.
# The recorded run of this cell aborted part-way with a TypeError. An exception
# stops "Run All" before any later cell executes, so the environment is closed
# in a finally clause here to guarantee cleanup even when the call raises.
try:
    dqn.test(env, nb_episodes=100, visualize=True)
finally:
    env.close()

Testing for 100 episodes ...
Episode 1: reward: 186.000, steps: 186
Episode 2: reward: 170.000, steps: 170
Episode 3: reward: 172.000, steps: 172
Episode 4: reward: 174.000, steps: 174


TypeError: can only concatenate str (not "bytes") to str

In [13]:
# Close the environment now that training and evaluation are done.
env.close()