In [95]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.sarsa import SARSAAgent
from rl.memory import EpisodeParameterMemory

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy
from rl.policy import EpsGreedyQPolicy
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

In [96]:
#from gym import envs
#print(envs.registry.all())

In [97]:
print(env.action_space)
#> Discrete(2)
print(env.observation_space)
#> Box(4,)

Discrete(2)
Box(4,)


In [98]:
ENV_NAME = 'CartPole-v0'


# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)

nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Option 1 : Simple model
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))

# Option 2: deep network
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))


print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_12 (Dense)             (None, 16)                80        
_________________________________________________________________
activation_12 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_13 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_14 (Activation)   (None, 16)               

In [None]:
SARSAAgent

sarsa
__init__(self, model, nb_actions, policy=None, test_policy=None, gamma=.99, nb_steps_warmup=10, train_interval=1, delta_clip=np.inf, *args, **kwargs)
cem
__init__(self, model, nb_actions, memory, batch_size=50, nb_steps_warmup=1000, train_interval=50, elite_frac=0.05, memory_interval=1, theta_init=None, noise_decay_const=0.0, noise_ampl=0.0, **kwargs)        
        

In [20]:
sars = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=1000, train_interval=50)
sars.compile(optimizer='adam')

In [33]:
#sars.fit(env, nb_steps=500000, visualize=False, verbose=1)
# After training is done, we save the best weights.
#sars.save_weights(f'sars_{ENV_NAME}_params.h5f', overwrite=True)

In [113]:
# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=2000, window_length=1)

cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

In [114]:
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=2000000, visualize=False, verbose=1)

# After training is done, we save the best weights.
cem.save_weights(f'cem_{ENV_NAME}_l3k_w10_ba50_st2m_params.h5f', overwrite=True)

Training for 2000000 steps ...
Interval 1 (0 steps performed)
485 episodes - episode_reward: 20.464 [8.000, 147.000] - mean_best_reward: 49.286

Interval 2 (10000 steps performed)
256 episodes - episode_reward: 39.328 [8.000, 200.000] - mean_best_reward: 98.300

Interval 3 (20000 steps performed)
202 episodes - episode_reward: 49.490 [9.000, 200.000] - mean_best_reward: 124.875

Interval 4 (30000 steps performed)
147 episodes - episode_reward: 67.585 [9.000, 193.000] - mean_best_reward: 147.500

Interval 5 (40000 steps performed)
169 episodes - episode_reward: 58.964 [9.000, 200.000] - mean_best_reward: 141.375

Interval 6 (50000 steps performed)
131 episodes - episode_reward: 77.023 [9.000, 200.000] - mean_best_reward: 134.500

Interval 7 (60000 steps performed)
139 episodes - episode_reward: 72.029 [14.000, 200.000] - mean_best_reward: 142.833

Interval 8 (70000 steps performed)
146 episodes - episode_reward: 68.295 [11.000, 164.000] - mean_best_reward: 146.667

Interval 9 (80000 ste

In [94]:
# Finally, evaluate our algorithm for 5 episodes.
cem.test(env, nb_episodes=5, visualize=True)
#sars.test(env, nb_episodes=5, visualize=True)



Testing for 5 episodes ...
Episode 1: reward: 120.000, steps: 120
Episode 2: reward: 128.000, steps: 128
Episode 3: reward: 112.000, steps: 112
Episode 4: reward: 128.000, steps: 128
Episode 5: reward: 118.000, steps: 118


<tensorflow.python.keras.callbacks.History at 0x7ff4e2f05c70>

In [112]:
cem.test(env, nb_episodes=5, visualize=True)

# l5k_w10_ba50_st2m = 90
# l3k_w10_ba50_st2m = 148

Testing for 5 episodes ...
Episode 1: reward: 186.000, steps: 186
Episode 2: reward: 175.000, steps: 175
Episode 3: reward: 151.000, steps: 151
Episode 4: reward: 188.000, steps: 188
Episode 5: reward: 148.000, steps: 148


<tensorflow.python.keras.callbacks.History at 0x7ff4e2c424f0>

In [116]:
cem.test(env, nb_episodes=5, visualize=True)
# l2k_w10_ba50_st2m = 200


Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200


<tensorflow.python.keras.callbacks.History at 0x7ff5ca733ac0>

In [28]:
memory = SequentialMemory(limit=5000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [34]:
# dqn.fit(env, nb_steps=2500, visualize=True, verbose=1)

# # After training is done, we save the final weights.
# dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# # Finally, evaluate our algorithm for 5 episodes.
# dqn.test(env, nb_episodes=5, visualize=True)

In [35]:
#dqn.test(env, nb_episodes=22, visualize=True)

In [78]:
def build_model(state_size, num_actions):
    input = Input(shape=(1,state_size))
    x = Flatten()(input)
    x = Dense(16, activation='relu')(x)
    #x = Dense(32, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(num_actions, activation='linear')(x)
    model = Model(inputs=input, outputs=output)
    print(model.summary())
    return model

In [84]:
memory = SequentialMemory(limit=50000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=.6, value_min=.05, value_test=.02, nb_steps=10000)

dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=20,
               target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [85]:
def build_callbacks(env_name):
    checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(env_name)
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=5000)]
    callbacks += [FileLogger(log_filename, interval=100)]
    return callbacks

In [86]:
callbacks = build_callbacks(ENV_NAME)

dqn.fit(env, nb_steps=150000, visualize=False, verbose=1, callbacks=callbacks)

Training for 150000 steps ...
Interval 1 (0 steps performed)
292 episodes - episode_reward: 34.247 [15.000, 66.000] - loss: 0.650 - mae: 0.600 - mean_q: 0.999 - mean_eps: 0.175

Interval 2 (10000 steps performed)
286 episodes - episode_reward: 34.899 [22.000, 52.000] - loss: 0.581 - mae: 0.553 - mean_q: 0.999 - mean_eps: 0.050

Interval 3 (20000 steps performed)
279 episodes - episode_reward: 35.896 [22.000, 56.000] - loss: 0.558 - mae: 0.537 - mean_q: 0.999 - mean_eps: 0.050

Interval 4 (30000 steps performed)
284 episodes - episode_reward: 35.169 [21.000, 55.000] - loss: 0.548 - mae: 0.531 - mean_q: 0.999 - mean_eps: 0.050

Interval 5 (40000 steps performed)
286 episodes - episode_reward: 35.010 [20.000, 52.000] - loss: 0.543 - mae: 0.527 - mean_q: 0.999 - mean_eps: 0.050

Interval 6 (50000 steps performed)
285 episodes - episode_reward: 35.088 [18.000, 56.000] - loss: 0.539 - mae: 0.524 - mean_q: 0.999 - mean_eps: 0.050

Interval 7 (60000 steps performed)
282 episodes - episode_rewa

<tensorflow.python.keras.callbacks.History at 0x7ff4e914b910>

In [87]:
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 44.000, steps: 44
Episode 2: reward: 35.000, steps: 35
Episode 3: reward: 38.000, steps: 38
Episode 4: reward: 40.000, steps: 40
Episode 5: reward: 30.000, steps: 30


<tensorflow.python.keras.callbacks.History at 0x7ff4e8740c10>