In [6]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.sarsa import SARSAAgent
from rl.memory import EpisodeParameterMemory

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy
from rl.policy import EpsGreedyQPolicy
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [14]:
# Create the environment and show its observation and action spaces.
env = gym.make("gym_ent:ent-v0")
print(env.observation_space, env.action_space, sep="\n")

# Sizes derived from the spaces; the model and agent cells below read nb_actions.
nb_actions, obs_dim = env.action_space.n, env.observation_space.shape
print(nb_actions, obs_dim, sep="\n")

Discrete(2)
Discrete(5)
5
()


In [23]:
# Option 1 : Simple model
# A single dense layer mapping the observation to one output per action.
# The input shape is derived from the environment instead of being hard-coded:
# obs_dim is () for this environment's Discrete observation space, so
# (1,) + obs_dim evaluates to (1,), the value that was previously written out.
# The leading 1 is presumably keras-rl's window length — TODO confirm.
#
# NOTE(review): this network is handed to SARSAAgent below. A softmax head
# confines every output to (0, 1) with the outputs summing to 1; if the agent
# treats the outputs as action values, a linear head may be more appropriate.
# Left unchanged here because it alters training — confirm the intent.
model = Sequential([
    Flatten(input_shape=(1,) + obs_dim),
    Dense(nb_actions),
    Activation('softmax'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 1)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 10        
_________________________________________________________________
activation_3 (Activation)    (None, 5)                 0         
Total params: 10
Trainable params: 10
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
# SARSA agent built on the network defined in the model cell.
sars = SARSAAgent(
    model=model,
    nb_actions=nb_actions,
    nb_steps_warmup=1000,
    train_interval=50,
)

# The optimizer is selected by name, so its default settings are used.
sars.compile(optimizer='adam')

In [32]:
# Train the SARSA agent on the environment for 200000 steps without rendering.
sars.fit(
    env,
    nb_steps=200000,
    visualize=False,
    verbose=1,
)
# Saving the trained weights is currently disabled. ENV_NAME is not defined in
# the visible cells and must be set before the line below is re-enabled.
#sars.save_weights(f'sars_{ENV_NAME}_params.h5f', overwrite=True)

Training for 200000 steps ...
Interval 1 (0 steps performed)
10000 episodes - episode_reward: 0.842 [-1.000, 1.000]

Interval 2 (10000 steps performed)
10000 episodes - episode_reward: 0.846 [-1.000, 1.000]

Interval 3 (20000 steps performed)
10000 episodes - episode_reward: 0.825 [-1.000, 1.000]

Interval 4 (30000 steps performed)
10000 episodes - episode_reward: 0.845 [-1.000, 1.000]

Interval 5 (40000 steps performed)
10000 episodes - episode_reward: 0.839 [-1.000, 1.000]

Interval 6 (50000 steps performed)
10000 episodes - episode_reward: 0.839 [-1.000, 1.000]

Interval 7 (60000 steps performed)
10000 episodes - episode_reward: 0.836 [-1.000, 1.000]

Interval 8 (70000 steps performed)
10000 episodes - episode_reward: 0.843 [-1.000, 1.000]

Interval 9 (80000 steps performed)
10000 episodes - episode_reward: 0.829 [-1.000, 1.000]

Interval 10 (90000 steps performed)
10000 episodes - episode_reward: 0.848 [-1.000, 1.000]

Interval 11 (100000 steps performed)
10000 episodes - episode_r

<tensorflow.python.keras.callbacks.History at 0x7f53f0745a30>

In [33]:
# Evaluate the trained SARSA agent over five episodes with rendering enabled.
sars.test(
    env,
    nb_episodes=5,
    visualize=True,
)

Testing for 5 episodes ...
Episode 1: reward: 1.000, steps: 1
Episode 2: reward: 1.000, steps: 1
Episode 3: reward: 1.000, steps: 1
Episode 4: reward: 1.000, steps: 1
Episode 5: reward: 1.000, steps: 1


<tensorflow.python.keras.callbacks.History at 0x7f54025ea4f0>

In [38]:
# Give the CEM agent its own freshly initialised network instead of reusing
# `model`, which the SARSA agent above already holds and has trained. Sharing
# one network object couples the two experiments through its weights.
# NOTE(review): CEMAgent is understood to write sampled weights back into the
# network it is given, which would also clobber the SARSA weights — confirm.
# The architecture mirrors the "Option 1" model cell: obs_dim is () for this
# environment, so the input shape is (1,).
cem_model = Sequential([
    Flatten(input_shape=(1,) + obs_dim),
    Dense(nb_actions),
    Activation('softmax'),
])

# Parameter memory for the CEM agent, capped at 2000 entries.
memory = EpisodeParameterMemory(limit=2000, window_length=1)
cem = CEMAgent(model=cem_model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=500, train_interval=50, elite_frac=0.05)
cem.compile()

# Train for 10000 steps without rendering.
cem.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 17.598 seconds


<tensorflow.python.keras.callbacks.History at 0x7f53fc4d4880>

In [39]:
# Evaluate the trained CEM agent over five episodes with rendering enabled.
cem.test(
    env,
    nb_episodes=5,
    visualize=True,
)

Testing for 5 episodes ...
Episode 1: reward: 1.000, steps: 1
Episode 2: reward: 1.000, steps: 1
Episode 3: reward: 1.000, steps: 1
Episode 4: reward: 1.000, steps: 1
Episode 5: reward: 1.000, steps: 1


<tensorflow.python.keras.callbacks.History at 0x7f54025e1130>