In [1]:
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.agents.sarsa import SARSAAgent
from rl.memory import EpisodeParameterMemory

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy
from rl.policy import EpsGreedyQPolicy
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

from stable_baselines3 import A2C



In [2]:
env = gym.make('fixed_treasure:fixed_treasure-v1')

print(env.observation_space)
print(env.action_space)

nb_actions = env.action_space.n
obs_dim = env.observation_space.shape

Box(2, 2)
Discrete(4)


In [3]:
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(64, activation='relu'))
#model.add(Activation('relu'))
model.add(Dense(32, activation='relu'))
#model.add(Activation('relu'))
model.add(Dense(16, activation='relu'))
#model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

#sars = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=200, train_interval=50)
sars = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=0, train_interval=1)
sars.compile(optimizer='adam')

In [3]:
model = A2C('MlpPolicy', 'fixed_treasure:fixed_treasure-v1', n_steps=1).learn(100000)
#model.save("a2c_fixed_treasure_500k_plusminus100_steps3_no_distance")
model.save("a2c_fixed_treasure_dist1_100k_plusminus100_005step_n_steps1")

In [3]:
model = A2C.load("a2c_fixed_treasure_dist1_100k_plusminus100_005step_n_steps1")

In [4]:
#env = gym.make('random_treasure:random_treasure-v1')

for i in range(3):
    obs = env.reset()
    print("\nstart")
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        print("s:" + str(_states) + " r:" + str(rewards) + " o:" + str(obs))        
        if dones:
            break
        env.render()


start
convict=[9 7] loot=[8 8] reward:1.4000000000000001
s:None r:1.4000000000000001 o:[[9 7]
 [8 8]]
convict=[10  7] loot=[8 8] reward:1.35
s:None r:1.35 o:[[10  7]
 [ 8  8]]
convict=[11  7] loot=[8 8] reward:1.3000000000000003
s:None r:1.3000000000000003 o:[[11  7]
 [ 8  8]]
convict=[12  7] loot=[8 8] reward:1.25
s:None r:1.25 o:[[12  7]
 [ 8  8]]
convict=[13  7] loot=[8 8] reward:1.2000000000000002
s:None r:1.2000000000000002 o:[[13  7]
 [ 8  8]]
convict=[14  7] loot=[8 8] reward:1.1500000000000001
s:None r:1.1500000000000001 o:[[14  7]
 [ 8  8]]
convict=[15  7] loot=[8 8] reward:1.1
s:None r:1.1 o:[[15  7]
 [ 8  8]]
convict=[15  7] loot=[8 8] reward:1.1
s:None r:1.1 o:[[15  7]
 [ 8  8]]
convict=[15  7] loot=[8 8] reward:1.1
s:None r:1.1 o:[[15  7]
 [ 8  8]]
convict=[15  7] loot=[8 8] reward:-100
s:None r:-100 o:[[15  7]
 [ 8  8]]

start
convict=[9 7] loot=[8 8] reward:1.4000000000000001
s:None r:1.4000000000000001 o:[[9 7]
 [8 8]]
convict=[10  7] loot=[8 8] reward:1.35
s:None r:1.

In [3]:
# Option 1 : Simple model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

sars = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=0, train_interval=1)
sars.compile(optimizer='adam')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 4)                 20        
_________________________________________________________________
activation (Activation)      (None, 4)                 0         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
sars.fit(env, nb_steps=300000, visualize=False, verbose=1)

Training for 300000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 15:58 - reward: 1.2000



1000 episodes - episode_reward: -0.445 [-1.000, 2.100] - loss: 0.523 - mean_q: 0.691 - reward: -0.045

Interval 2 (10000 steps performed)
1000 episodes - episode_reward: -0.502 [-1.000, 1.200] - loss: 0.527 - mean_q: 0.712 - reward: -0.050

Interval 3 (20000 steps performed)
1001 episodes - episode_reward: 1.216 [-1.000, 20.900] - loss: 0.710 - mean_q: 0.648 - reward: 0.122

Interval 4 (30000 steps performed)
1000 episodes - episode_reward: 0.706 [-1.000, 2.500] - loss: 0.690 - mean_q: 0.544 - reward: 0.071

Interval 5 (40000 steps performed)
1004 episodes - episode_reward: 1.118 [-1.000, 21.100] - loss: 0.730 - mean_q: 0.490 - reward: 0.112

Interval 6 (50000 steps performed)
1003 episodes - episode_reward: 1.662 [0.800, 18.350] - loss: 0.788 - mean_q: 0.381 - reward: 0.166

Interval 7 (60000 steps performed)
1004 episodes - episode_reward: 1.707 [0.800, 18.050] - loss: 0.782 - mean_q: 0.500 - reward: 0.172

Interval 8 (70000 steps performed)
1002 episodes - episode_reward: 1.608 [0.8

<tensorflow.python.keras.callbacks.History at 0x7fbba00c0940>

In [5]:
sars.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 1.450, steps: 10
Episode 2: reward: 1.450, steps: 10
Episode 3: reward: 1.450, steps: 10
Episode 4: reward: 1.450, steps: 10


TclError: invalid command name ".!frame.!canvas"