In [1]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.sarsa import SARSAAgent
from rl.memory import EpisodeParameterMemory

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy
from rl.policy import EpsGreedyQPolicy
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [2]:
env = gym.make('gym_ent:ent-v1')

print(env.observation_space)
print(env.action_space)

nb_actions = env.action_space.n
obs_dim = env.observation_space.shape
print(nb_actions)
print(obs_dim)
#env.observation_space.shape

Box(8, 2)
Discrete(4)
4
(8, 2)


In [3]:
# Option 1 : Simple model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 4)                 68        
_________________________________________________________________
activation (Activation)      (None, 4)                 0         
Total params: 68
Trainable params: 68
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
sars = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=2000, train_interval=100)
sars.compile(optimizer='adam')

In [5]:
sars.fit(env, nb_steps=400000, visualize=False, verbose=1)
# After training is done, we save the best weights.
#sars.save_weights(f'sars_{ENV_NAME}_params.h5f', overwrite=True)

Training for 400000 steps ...
Interval 1 (0 steps performed)
  184/10000 [..............................] - ETA: 5s - reward: 9.0543  



1097 episodes - episode_reward: 87.422 [0.000, 109.000] - loss: 485.075 - mean_q: 1.000

Interval 2 (10000 steps performed)
1109 episodes - episode_reward: 87.002 [0.000, 109.000] - loss: 288.785 - mean_q: 1.000

Interval 3 (20000 steps performed)
1099 episodes - episode_reward: 88.809 [0.000, 109.000] - loss: 357.866 - mean_q: 1.000

Interval 4 (30000 steps performed)
1104 episodes - episode_reward: 87.684 [0.000, 109.000] - loss: 474.009 - mean_q: 1.000

Interval 5 (40000 steps performed)
1113 episodes - episode_reward: 85.428 [0.000, 109.000] - loss: 442.126 - mean_q: 1.000

Interval 6 (50000 steps performed)
1092 episodes - episode_reward: 88.566 [0.000, 109.000] - loss: 663.419 - mean_q: 1.000

Interval 7 (60000 steps performed)
1094 episodes - episode_reward: 88.574 [0.000, 109.000] - loss: 329.425 - mean_q: 1.000

Interval 8 (70000 steps performed)
1094 episodes - episode_reward: 90.223 [0.000, 109.000] - loss: 749.916 - mean_q: 1.000

Interval 9 (80000 steps performed)
1106 epi

<tensorflow.python.keras.callbacks.History at 0x7fad82a41310>

In [8]:
sars.test(env, nb_episodes=25, visualize=True)
# 8/25

Testing for 25 episodes ...
Episode 1: reward: 109.000, steps: 10
Episode 2: reward: 109.000, steps: 10
Episode 3: reward: 109.000, steps: 10
Episode 4: reward: 109.000, steps: 10
Episode 5: reward: 109.000, steps: 10
Episode 6: reward: 109.000, steps: 10
Episode 7: reward: 109.000, steps: 10
Episode 8: reward: 109.000, steps: 10
Episode 9: reward: 4.000, steps: 5
Episode 10: reward: 109.000, steps: 10
Episode 11: reward: 109.000, steps: 10
Episode 12: reward: 109.000, steps: 10
Episode 13: reward: 8.000, steps: 9
Episode 14: reward: 109.000, steps: 10
Episode 15: reward: 109.000, steps: 10
Episode 16: reward: 109.000, steps: 10
Episode 17: reward: 109.000, steps: 10
Episode 18: reward: 109.000, steps: 10
Episode 19: reward: 109.000, steps: 10
Episode 20: reward: 109.000, steps: 10
Episode 21: reward: 109.000, steps: 10
Episode 22: reward: 109.000, steps: 10
Episode 23: reward: 109.000, steps: 10
Episode 24: reward: 109.000, steps: 10
Episode 25: reward: 109.000, steps: 10


<tensorflow.python.keras.callbacks.History at 0x7fad82a8c220>

In [38]:
memory = EpisodeParameterMemory(limit=2000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=500, train_interval=50, elite_frac=0.05)
cem.compile()

cem.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 17.598 seconds


<tensorflow.python.keras.callbacks.History at 0x7f53fc4d4880>

In [39]:
cem.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 1.000, steps: 1
Episode 2: reward: 1.000, steps: 1
Episode 3: reward: 1.000, steps: 1
Episode 4: reward: 1.000, steps: 1
Episode 5: reward: 1.000, steps: 1


<tensorflow.python.keras.callbacks.History at 0x7f54025e1130>