In [2]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory

import io
import sys
import csv

In [3]:
# Path environment changed to make things work properly
# export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/usr/lib


# Get the environment and extract the number of actions.
ENV_NAME = 'LunarLander-v2'
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(24))
model.add(Activation('relu'))
model.add(Dense(24))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())


[2019-05-01 11:13:10,403] Making new env: LunarLander-v2
[2019-05-01 11:13:10,544] From C:\Users\nik\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 24)                216       
_________________________________________________________________
activation_1 (Activation)    (None, 24)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
activation_2 (Activation)    (None, 24)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                400       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
__________

In [10]:
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=300000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])


In [11]:
dqn.fit(env, nb_steps=100000, visualize=False, verbose=2)

Training for 100000 steps ...


[2019-05-01 11:15:36,082] From C:\Users\nik\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


    95/100000: episode: 1, duration: 1.353s, episode steps: 95, steps per second: 70, episode reward: -88.944, mean reward: -0.936 [-100.000, 15.241], mean action: 1.663 [0.000, 3.000], mean observation: 0.042 [-2.728, 1.000], loss: 1.696407, mean_absolute_error: 0.542604, mean_q: 0.396878
   235/100000: episode: 2, duration: 0.458s, episode steps: 140, steps per second: 306, episode reward: -211.561, mean reward: -1.511 [-100.000, 3.301], mean action: 1.714 [0.000, 3.000], mean observation: 0.125 [-0.737, 1.023], loss: 35.905678, mean_absolute_error: 1.128150, mean_q: 1.496794
   406/100000: episode: 3, duration: 0.561s, episode steps: 171, steps per second: 305, episode reward: -448.832, mean reward: -2.625 [-100.000, 4.445], mean action: 1.532 [0.000, 3.000], mean observation: 0.133 [-2.001, 2.287], loss: 36.734901, mean_absolute_error: 1.358044, mean_q: 1.258333
   602/100000: episode: 4, duration: 0.648s, episode steps: 196, steps per second: 302, episode reward: -447.745, mean re

<keras.callbacks.History at 0x22976809828>

In [12]:
# After training is done, we save the final weights.

dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME))

# Redirect stdout to capture test results
old_stdout = sys.stdout
sys.stdout = mystdout = io.StringIO()

# Evaluate our algorithm for a few episodes.
dqn.test(env, nb_episodes=200, visualize=True)

# Reset stdout
sys.stdout = old_stdout

results_text = mystdout.getvalue()

# Print results text
print("results")
print(results_text)



[TIP] Next time specify overwrite=True!
results
Testing for 200 episodes ...
Episode 1: reward: 124.098, steps: 625
Episode 2: reward: 59.405, steps: 842
Episode 3: reward: -42.412, steps: 499
Episode 4: reward: 164.400, steps: 422
Episode 5: reward: -119.325, steps: 1000
Episode 6: reward: -179.037, steps: 1000
Episode 7: reward: 0.447, steps: 1000
Episode 8: reward: -15.933, steps: 1000
Episode 9: reward: 217.622, steps: 361
Episode 10: reward: 60.952, steps: 813
Episode 11: reward: -309.825, steps: 963
Episode 12: reward: 97.736, steps: 709
Episode 13: reward: 165.006, steps: 317
Episode 14: reward: 143.592, steps: 312
Episode 15: reward: -140.103, steps: 704
Episode 16: reward: 177.081, steps: 407
Episode 17: reward: 110.878, steps: 766
Episode 18: reward: 19.588, steps: 973
Episode 19: reward: -82.731, steps: 1000
Episode 20: reward: 68.954, steps: 684
Episode 21: reward: 147.419, steps: 578
Episode 22: reward: 65.036, steps: 789
Episode 23: reward: 81.956, steps: 768
Episode 24: 