In [2]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.sarsa import SARSAAgent
from rl.memory import EpisodeParameterMemory

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy
from rl.policy import EpsGreedyQPolicy
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [4]:
ENV_NAME = 'CartPole-v0'
SEED = 123           # one seed shared by NumPy and the environment
HIDDEN_UNITS = 16    # width of every hidden Dense layer
NB_HIDDEN_LAYERS = 3

# Get the environment and seed the RNG sources used in this notebook.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)

# Number of discrete actions and length of the observation vector.
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Option 1 (kept for reference, not used): simple model
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))

# Option 2: deep network
model = Sequential()
# Input is the observation with one extra leading axis of size 1
# (presumably matching window_length=1 of the agent memory -- TODO confirm).
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for _ in range(NB_HIDDEN_LAYERS):
    model.add(Dense(HIDDEN_UNITS))
    model.add(Activation('relu'))
# Output layer: one unit per action, normalised with softmax.
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the table).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 16)                80        
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0

In [5]:
# Reference notes: constructor signatures of the two agent classes considered here.
#
# SARSAAgent (rl.agents.sarsa):
#   __init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99, nb_steps_warmup=10, train_interval=1, delta_clip=np.inf, *args, **kwargs)
#
# CEMAgent (rl.agents.cem):
#   __init__(self, model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, noise_decay_const=0.0, noise_ampl=0.0, **kwargs)

In [7]:
# Alternative agent (disabled): SARSA on the same model.
#sars = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=1000, train_interval=50)
#sars.compile(optimizer='adam')

# CEM agent: needs an EpisodeParameterMemory in addition to the model.
memory = EpisodeParameterMemory(window_length=1, limit=2000)
cem = CEMAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    batch_size=50,
    nb_steps_warmup=2000,  # same value as the memory limit above
    train_interval=50,
    elite_frac=0.05,
)
# compile() takes no optimizer argument here (unlike the SARSA variant above).
cem.compile()

In [8]:
# Train the CEM agent on the environment without rendering; verbose=1 prints
# the per-interval progress log.
cem.fit(env, nb_steps=2_000_000, visualize=False, verbose=1)

# Once fit() returns, write the agent's weights to disk, replacing any
# existing file of the same name.
# NOTE(review): the original comment called these the "best" weights; whether
# save_weights() stores the best or the most recent parameters is not visible
# here -- confirm. The "l3k" tag in the filename also does not obviously match
# limit=2000 used for the memory, and the test cell's note says "l2k" --
# confirm the intended run label.
weights_path = f'cem_{ENV_NAME}_l3k_w10_ba50_st2m_params.h5f'
cem.save_weights(weights_path, overwrite=True)

Training for 2000000 steps ...
Interval 1 (0 steps performed)
  193/10000 [..............................] - ETA: 7s - reward: 1.0000



583 episodes - episode_reward: 17.122 [8.000, 107.000] - mean_best_reward: 54.556

Interval 2 (10000 steps performed)
566 episodes - episode_reward: 17.686 [8.000, 100.000] - mean_best_reward: 51.409

Interval 3 (20000 steps performed)
542 episodes - episode_reward: 18.445 [8.000, 109.000] - mean_best_reward: 46.364

Interval 4 (30000 steps performed)
526 episodes - episode_reward: 19.025 [8.000, 119.000] - mean_best_reward: 62.773

Interval 5 (40000 steps performed)
463 episodes - episode_reward: 21.575 [8.000, 179.000] - mean_best_reward: 59.000

Interval 6 (50000 steps performed)
484 episodes - episode_reward: 20.669 [8.000, 187.000] - mean_best_reward: 77.650

Interval 7 (60000 steps performed)
363 episodes - episode_reward: 27.504 [8.000, 131.000] - mean_best_reward: 79.929

Interval 8 (70000 steps performed)
358 episodes - episode_reward: 27.919 [8.000, 152.000] - mean_best_reward: 73.786

Interval 9 (80000 steps performed)
334 episodes - episode_reward: 29.982 [8.000, 200.000] -

In [9]:
# Run the trained agent for 5 rendered test episodes; per-episode reward and
# step count are printed by test() itself.
# The return value is bound to a name so the cell does not echo the History
# object's repr as its output, and the result stays available to later cells.
test_history = cem.test(env, nb_episodes=5, visualize=True)
# l2k_w10_ba50_st2m = 190
# (note from a run tagged "l2k_w10_ba50_st2m"; presumably its test reward -- TODO confirm)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 174.000, steps: 174
Episode 5: reward: 200.000, steps: 200


<tensorflow.python.keras.callbacks.History at 0x7f8ff3efc370>