In [0]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import Callback


import io
import sys
import csv

Using TensorFlow backend.


In [0]:
# Environment note: the dynamic-library fallback path had to be extended in the
# shell before starting the kernel to make things work properly:
#   export DYLD_FALLBACK_LIBRARY_PATH=$DYLD_FALLBACK_LIBRARY_PATH:/usr/lib

# Build the gym environment.
ENV_NAME = "LunarLander-v2"
env = gym.make(ENV_NAME)

# Seed NumPy's global RNG and the environment with the same fixed value.
SEED = 123
np.random.seed(SEED)
env.seed(SEED)

# Size of the environment's action space; used as the width of the network's
# output layer and passed to the agent.
nb_actions = env.action_space.n

In [0]:
# Hyperparameters
# Activation functions: `hidden` for every hidden layer, `output` for the output layer.
hidden, output = 'relu', 'linear'
# Number of units in each hidden layer.
size = 42
# Values intended for the agent's `gamma` argument and the optimizer's `lr` argument.
gamma, lr = 0.99, 0.001

In [0]:
# Build a small fully-connected network:
#   Flatten -> 3 x (Dense(size) + hidden activation) -> Dense(nb_actions) + output activation
model = Sequential()

# The input shape is the observation shape with a leading axis of length 1.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))

# Three identical hidden layers.
for hidden_layer_idx in range(3):
    model.add(Dense(size))
    model.add(Activation(hidden))

# Output layer: one unit per action.
model.add(Dense(nb_actions))
model.add(Activation(output))

Instructions for updating:
Colocations handled automatically by placer.


In [0]:
# Experience memory handed to the agent (capacity 1,000,000, window length 1).
memory = SequentialMemory(
    limit=1000000,
    window_length=1,
)

# Epsilon-greedy policy, created with eps=1.0; the epsilon value is mutated later
# through the policy object's `eps` attribute.
policy = EpsGreedyQPolicy(
    eps=1.0,
)
# Callback that decays the exploration rate of an epsilon-greedy policy.
class epsCallback(Callback):
    """Multiply a policy's ``eps`` attribute by a fixed factor at the start of each episode.

    Parameters
    ----------
    eps_policy : object
        Policy object exposing a mutable ``eps`` attribute (e.g. the
        ``EpsGreedyQPolicy`` instance that is also given to the agent).
    decay_rate : float, optional
        Multiplicative factor applied to ``eps`` on every episode start.
    min_eps : float, optional
        Lower bound for ``eps``. The default of 0.0 leaves the decay
        effectively unbounded for the usual non-negative ``eps``/``decay_rate``.
    """

    def __init__(self, eps_policy, decay_rate=0.97, min_eps=0.0):
        # Let the base Callback set up its own state before adding ours.
        super().__init__()
        self.eps_policy = eps_policy
        self.decay_rate = decay_rate
        self.min_eps = min_eps

    def on_episode_begin(self, episode, logs=None):
        """Decay the policy's epsilon; ``episode`` and ``logs`` are not used."""
        # `logs=None` avoids sharing one mutable default dict between calls.
        decayed_eps = self.eps_policy.eps * self.decay_rate
        self.eps_policy.eps = max(self.min_eps, decayed_eps)

In [0]:
# Configure and compile the agent. Any built-in Keras optimizer and metric can be used.
# `gamma` and `lr` are taken from the hyperparameter cell so that editing them there
# actually changes the agent and the optimizer.
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               gamma=gamma, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=lr), metrics=['mae'])

# Callback that decays the policy's epsilon at the start of every episode.
callback_param = [epsCallback(eps_policy=policy, decay_rate=0.97)]

# Train the agent, then save the final weights.
dqn.fit(env, nb_steps=30000, visualize=False, verbose=2, callbacks=callback_param)
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Redirect stdout into an in-memory buffer so the output of the evaluation run
# (executed after this cell) can be captured and parsed. The original stream is
# kept in `old_stdout` and has to be restored once the evaluation is done.
old_stdout = sys.stdout
sys.stdout = mystdout = io.StringIO()

Training for 30000 steps ...
Instructions for updating:
Use tf.cast instead.




    72/30000: episode: 1, duration: 1.160s, episode steps: 72, steps per second: 62, episode reward: -99.087, mean reward: -1.376 [-100.000, 17.325], mean action: 1.486 [0.000, 3.000], mean observation: 0.051 [-1.619, 1.399], loss: 0.805373, mean_absolute_error: 0.662385, mean_q: 1.020939
   207/30000: episode: 2, duration: 0.513s, episode steps: 135, steps per second: 263, episode reward: -124.534, mean reward: -0.922 [-100.000, 23.228], mean action: 1.452 [0.000, 3.000], mean observation: 0.178 [-3.407, 1.563], loss: 35.888912, mean_absolute_error: 1.244270, mean_q: 2.073455
   276/30000: episode: 3, duration: 0.276s, episode steps: 69, steps per second: 250, episode reward: -101.976, mean reward: -1.478 [-100.000, 7.591], mean action: 1.362 [0.000, 3.000], mean observation: 0.085 [-1.185, 3.992], loss: 28.474173, mean_absolute_error: 1.821950, mean_q: 3.034862
   357/30000: episode: 4, duration: 0.313s, episode steps: 81, steps per second: 259, episode reward: -92.389, mean reward: 

  4104/30000: episode: 29, duration: 1.248s, episode steps: 298, steps per second: 239, episode reward: -4.156, mean reward: -0.014 [-100.000, 20.662], mean action: 1.862 [0.000, 3.000], mean observation: 0.022 [-1.904, 1.395], loss: 8.026385, mean_absolute_error: 19.473810, mean_q: 8.471862
  5104/30000: episode: 30, duration: 5.699s, episode steps: 1000, steps per second: 175, episode reward: -21.777, mean reward: -0.022 [-21.064, 23.706], mean action: 2.063 [0.000, 3.000], mean observation: 0.135 [-0.783, 1.419], loss: 8.056190, mean_absolute_error: 19.821621, mean_q: 8.118248
  6104/30000: episode: 31, duration: 4.609s, episode steps: 1000, steps per second: 217, episode reward: 67.629, mean reward: 0.068 [-23.833, 22.101], mean action: 1.856 [0.000, 3.000], mean observation: 0.129 [-0.569, 1.402], loss: 8.689539, mean_absolute_error: 20.679960, mean_q: 5.887534
  7104/30000: episode: 32, duration: 5.223s, episode steps: 1000, steps per second: 191, episode reward: 31.434, mean rew

 19124/30000: episode: 57, duration: 4.757s, episode steps: 1000, steps per second: 210, episode reward: -46.104, mean reward: -0.046 [-4.533, 4.973], mean action: 1.875 [0.000, 3.000], mean observation: -0.017 [-0.645, 1.411], loss: 7.809074, mean_absolute_error: 22.985445, mean_q: 27.029669
 20124/30000: episode: 58, duration: 5.138s, episode steps: 1000, steps per second: 195, episode reward: -34.449, mean reward: -0.034 [-3.849, 4.558], mean action: 1.883 [0.000, 3.000], mean observation: 0.053 [-0.648, 1.409], loss: 7.840860, mean_absolute_error: 23.575449, mean_q: 28.155304
 21124/30000: episode: 59, duration: 6.047s, episode steps: 1000, steps per second: 165, episode reward: 20.501, mean reward: 0.021 [-24.579, 23.207], mean action: 1.605 [0.000, 3.000], mean observation: 0.060 [-0.581, 1.408], loss: 7.680381, mean_absolute_error: 23.613497, mean_q: 28.330084
 21777/30000: episode: 60, duration: 3.133s, episode steps: 653, steps per second: 208, episode reward: -147.297, mean r

In [0]:

# Run the evaluation episodes. stdout was redirected to `mystdout` at the end of
# the training cell, so whatever dqn.test() prints is captured in that buffer.
try:
    dqn.test(env, nb_episodes=200, visualize=False)
finally:
    # Always restore the real stdout, even if the evaluation raises; otherwise
    # every later print in this kernel would silently end up in the buffer.
    sys.stdout = old_stdout

results_text = mystdout.getvalue()

# Print results text
print("results")
print(results_text)

# Extract a rewards list from the results.
# Episode lines are expected to look like "Episode 1: reward: -123.456, steps: 200"
# (NOTE(review): format assumed from the original positional parsing - confirm
# against the installed keras-rl version). Lines without a "reward:" field, such
# as the header line or blank lines, are skipped instead of raising.
total_rewards = []
for line in results_text.splitlines():
    _, marker, after_marker = line.partition('reward:')
    if not marker:
        continue
    try:
        total_rewards.append(float(after_marker.split(',')[0].strip()))
    except ValueError:
        # Not a numeric reward field; ignore this line.
        continue

# Print rewards and average
print("total rewards", total_rewards)
if total_rewards:
    print("average total reward", np.mean(total_rewards))
else:
    # np.mean([]) would only yield nan plus a RuntimeWarning.
    print("no episode rewards could be parsed from the test output")



In [0]:
# Write total rewards to file, one reward per row.
# The `with` block closes the file even if a write fails, and newline='' is what
# the csv module requires so that no extra blank rows appear on Windows.
with open("lunarlander_rl_rewards.csv", 'w', newline='') as rewards_file:
    rewards_writer = csv.writer(rewards_file)
    rewards_writer.writerows([reward] for reward in total_rewards)