In [3]:
from wumpus import *

In [2]:
# Fixed 4x4 board: the explorer starts at the entrance (1,1) heading north;
# one Wumpus, two pits and the gold are placed on fixed squares.
scenario = WumpusWorldScenario(
    agent=Explorer(heading='north', verbose=False),
    objects=[
        (Wumpus(), (1, 3)),
        (Pit(), (3, 3)),
        (Pit(), (3, 1)),
        (Gold(), (2, 3)),
    ],
    entrance=(1, 1),
    width=4,
    height=4,
    trace=False,
)

In [4]:
import gym
from gym import spaces
import numpy as np
from collections import OrderedDict

In [5]:
class WumpusWorld(gym.Env):
    """Gym environment wrapping one fixed 4x4 Wumpus World scenario.

    Observation
        ``np.array([x, y, p0, p1, p2, p3, p4])``: the agent's location followed
        by the first five percept entries cast to ``int``.
        NOTE(review): the percept entries are presumably stench, breeze,
        glitter, bump and scream -- confirm against ``wumpus``.
    Action
        A scalar or one-element array. It is truncated to an integer index into
        ``self.actions``; indices outside the list are executed as 'Wait'.
    Reward
        The change in the agent's ``performance_measure`` caused by the step,
        replaced by the shaping values described in ``_shaped_reward``.
    """
    metadata = {'render.modes': ['human']}

    # Board layout, shared by _reset() and the reward shaping so the two can
    # no longer drift apart.
    WIDTH = 4
    HEIGHT = 4
    ENTRANCE = (1, 1)
    GOLD_LOCATION = (2, 3)

    # Action names passed to the wrapped environment; an integer action is an
    # index into this sequence.
    ACTIONS = ('TurnRight', 'TurnLeft', 'Forward', 'Grab',
               'Climb', 'Shoot', 'Wait')
    _GRAB = ACTIONS.index('Grab')
    _CLIMB = ACTIONS.index('Climb')
    _WAIT = ACTIONS.index('Wait')

    def __init__(self):
        self._reset()
        self.actions = list(self.ACTIONS)
        # A one-element Box (not Discrete) is kept on purpose: the notebook
        # trains continuous-action algorithms against this space, and step()
        # truncates whatever value it receives to an index.
        self.action_space = spaces.Box(low=0, high=len(self.actions)-1, shape=(1,), dtype=np.uint8)
        self.observation_space = spaces.MultiDiscrete([5, 5, 2, 2, 2, 2, 2]) # TODO: Heading variable

    def step(self, action):
        """Execute one action and advance the wrapped environment by one tick.

        Args:
            action: scalar or array-like; only its first element is used, and
                it is truncated to ``int``.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is always an
            empty dict.
        """
        # Take the first element explicitly: int() on a 1-element ndarray is
        # deprecated in recent NumPy releases.
        action = int(np.asarray(action).reshape(-1)[0])
        if not 0 <= action < len(self.actions):
            action = self._WAIT

        self.env.execute_action(self.agent, self.actions[action])
        self.env.time_step += 1
        self.env.exogenous_change()
        reward = self.agent.performance_measure - self.previous_score

        observation = self._state
        reward = self._shaped_reward(action, reward)

        self.previous_score = self.agent.performance_measure
        done = self.env.is_done()

        return observation, reward, done, {}

    def _shaped_reward(self, action, reward):
        """Return the reward for this step after applying reward shaping.

        * Grab on the gold square (first time only): 100, and ``has_gold`` is set.
        * First step that ends on the gold square without grabbing: 50.
        * Any step that ends on the entrance while ``has_gold`` is set: 50.
        * Climb on the entrance without gold: -200.
        * Otherwise the raw score delta ``reward`` is returned unchanged.

        The Grab case used to be nested inside the "first visit" check. The
        gold square can only be entered by moving, so the visit flag was
        already set by the time a Grab could happen and ``has_gold`` was never
        set. The two cases are now tested independently.
        """
        location = self._location
        at_gold = location == self.GOLD_LOCATION

        if at_gold and action == self._GRAB and not self.has_gold:
            # NOTE(review): assumes a Grab on the gold square succeeds; the
            # wrapper does not ask the agent whether it now holds the gold.
            self.has_gold = True
            self.gold_reward_given = True
            return 100
        if at_gold and not self.gold_reward_given:
            self.gold_reward_given = True
            return 50
        if location == self.ENTRANCE:
            if self.has_gold:
                # NOTE(review): this replaces the raw delta on every step that
                # ends at the entrance, including the Climb step itself --
                # confirm a repeatable bonus is intended.
                return 50
            if action == self._CLIMB:
                return -200  # Don't climb without gold :-)
        return reward

    def reset(self):
        """Start a new episode and return its initial observation."""
        self._reset()
        return self._state

    def render(self, mode='human', close=False):
        """Print the wrapped environment's text rendering (arguments are ignored)."""
        print(self.env.to_string())

    def _reset(self):
        """Build a fresh scenario and clear all per-episode bookkeeping."""
        self.scenario = WumpusWorldScenario(
            agent=Explorer(heading='north', verbose=False),
            objects=[(Wumpus(), (1, 3)),
                     (Pit(), (3, 3)),
                     (Pit(), (3, 1)),
                     (Gold(), self.GOLD_LOCATION)],
            width=self.WIDTH,
            height=self.HEIGHT,
            entrance=self.ENTRANCE,
            trace=False
        )
        self.agent = self.scenario.agent
        self.env = self.scenario.env
        self.has_gold = False
        self.gold_reward_given = False
        # Initialised here (not only in reset()) so that step() also works on
        # an environment that has just been constructed.
        self.previous_score = 0

    @property
    def _state(self):
        """Current observation: location followed by five integer percept flags."""
        location = self._location
        percept = self._percept
        return np.array([location[0], location[1]] + [int(percept[i]) for i in range(5)])

    @property
    def _location(self): return self.agent.location

    @property
    def _percept(self): return self.env.percept(self.agent)

    @property
    def spec(self):
        """Minimal stand-in for a gym spec; exposes only ``id``."""
        return WumpusWorld.Spec

    class Spec():
        id = "WumpusWorld"
        
# Shared environment instance; the cells below roll out and train against it.
env = WumpusWorld()
# Smoke test: draw one random action (displayed as the cell's result).
env.action_space.sample()

array([5], dtype=uint8)

In [None]:
# Sanity check: run 20 episodes of at most 100 uniformly sampled actions each,
# printing every step's reward. Uncomment the render/print lines to watch the
# board and the observations.
for episode in range(20):
    observation = env.reset()
    steps_taken = 0
    while steps_taken < 100:
        # env.render()
        # print(observation)
        sampled_action = env.action_space.sample()
        observation, reward, done, info = env.step(sampled_action)
        steps_taken += 1
        print(reward)
        if not done:
            continue
        print("Episode finished after {} timesteps".format(steps_taken))
        break
env.close()

In [None]:
import tensorflow as tf
from rlzoo.common.utils import make_env, set_seed
from rlzoo.algorithms import AC
from rlzoo.common.value_networks import ValueNetwork
from rlzoo.common.policy_networks import StochasticPolicyNetwork

# --- Environment -------------------------------------------------------------
# Train on the WumpusWorld instance created above (instead of a gym.make() env).
env.reset()
obs_space = env.observation_space
act_space = env.action_space

# Reproducibility: seed NumPy and TensorFlow the same way the TD3 cell does.
# The environment itself is not seeded here.
seed = 2
np.random.seed(seed)
tf.random.set_seed(seed)

# --- Networks ----------------------------------------------------------------
num_hidden_layer = 4  # number of hidden layers for the networks
hidden_dim = 64  # dimension of hidden layers for the networks
with tf.name_scope('AC'):
    with tf.name_scope('Critic'):
        # critic network; can be replaced with a customized network
        critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim])
    with tf.name_scope('Actor'):
        # actor network; can be replaced with a customized network
        actor = StochasticPolicyNetwork(obs_space, act_space,
                                        hidden_dim_list=num_hidden_layer * [hidden_dim],
                                        output_activation=tf.nn.tanh)
net_list = [actor, critic]  # list of the networks

# --- Optimizers --------------------------------------------------------------
a_lr, c_lr = 1e-4, 1e-2  # a_lr: learning rate of the actor; c_lr: learning rate of the critic
a_optimizer = tf.optimizers.Adam(a_lr)
c_optimizer = tf.optimizers.Adam(c_lr)
optimizers_list = [a_optimizer, c_optimizer]  # list of optimizers

# --- Model -------------------------------------------------------------------
# Initialize the algorithm model, with algorithm parameters passed in.
# Full list of arguments for the algorithm:
#   net_list: a list of networks (value and policy) used in the algorithm,
#             from common functions or customization
#   optimizers_list: a list of optimizers for all networks and differentiable variables
#   gamma: discounted factor of reward
#   action_range: scale of action values
model = AC(net_list, optimizers_list)

# --- Training ----------------------------------------------------------------
# Full list of parameters for training:
#   env: learning environment
#   train_episodes: total number of episodes for training
#   test_episodes: total number of episodes for testing
#   max_steps: maximum number of steps for one episode
#   save_interval: time steps for saving the weights and plotting the results
#   mode: 'train' or 'test'
#   render: if true, visualize the environment
model.learn(env, train_episodes=5000, max_steps=200,
            save_interval=50, mode='train', render=False)

# --- Evaluation --------------------------------------------------------------
# test after training
model.learn(env, test_episodes=100, max_steps=200, mode='test', render=False)

[TL] Input  input_layer: [None, 7]
[TL] Dense  mlp_layer1: 64 relu
[TL] Dense  mlp_layer2: 64 relu
[TL] Dense  mlp_layer3: 64 relu
[TL] Dense  mlp_layer4: 64 relu
[TL] Dense  dense_7: 1 No Activation
[TL] Input  input_layer: (None, 7)
[TL] Dense  hidden_layer1: 64 relu
[TL] Dense  hidden_layer2: 64 relu
[TL] Dense  hidden_layer3: 64 relu
[TL] Dense  hidden_layer4: 64 relu
[TL] Dense  dense_8: 1 tanh
Training...  | Algorithm: AC  | Environment: WumpusWorld
A Wumpus ate <Explorer>!
Episode: 0/5000  | Episode Reward: -1003.0000  | Running Time: 0.2327
[TL] [*] Saving TL weights into ./model/AC-WumpusWorld/model_actor
[TL] [*] Saved
[TL] [*] Saving TL weights into ./model/AC-WumpusWorld/model_critic
[TL] [*] Saved
A Wumpus ate <Explorer>!
Episode: 1/5000  | Episode Reward: -1003.0000  | Running Time: 0.5015
Episode: 2/5000  | Episode Reward: -206.0000  | Running Time: 0.6710
Episode: 3/5000  | Episode Reward: -209.0000  | Running Time: 0.8758
Episode: 4/5000  | Episode Reward: -200.0000  |

In [5]:
# Imported explicitly so this cell does not depend on an earlier experiment
# cell (or on a wildcard import below) having put `tf` into the namespace.
import tensorflow as tf
from rlzoo.common.utils import make_env
from rlzoo.algorithms.td3.td3 import TD3
from rlzoo.common.value_networks import *
from rlzoo.common.policy_networks import *

# --- Environment -------------------------------------------------------------
# env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized/wrapped environment to run
action_shape = env.action_space.shape
state_shape = env.observation_space.shape
# reproducible
seed = 2
np.random.seed(seed)
tf.random.set_seed(seed)
#env.seed(seed)

# --- Networks ----------------------------------------------------------------
num_hidden_layer = 2  # number of hidden layers for the networks
hidden_dim = 64  # dimension of hidden layers for the networks
with tf.name_scope('TD3'):
    with tf.name_scope('Q_Net1'):
        q_net1 = QNetwork(env.observation_space, env.action_space,
                          hidden_dim_list=num_hidden_layer * [hidden_dim])
    with tf.name_scope('Q_Net2'):
        q_net2 = QNetwork(env.observation_space, env.action_space,
                          hidden_dim_list=num_hidden_layer * [hidden_dim])
    with tf.name_scope('Target_Q_Net1'):
        target_q_net1 = QNetwork(env.observation_space, env.action_space,
                                 hidden_dim_list=num_hidden_layer * [hidden_dim])
    with tf.name_scope('Target_Q_Net2'):
        target_q_net2 = QNetwork(env.observation_space, env.action_space,
                                 hidden_dim_list=num_hidden_layer * [hidden_dim])
    with tf.name_scope('Policy'):
        policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
                                                hidden_dim_list=num_hidden_layer * [hidden_dim])
    with tf.name_scope('Target_Policy'):
        target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
                                                       hidden_dim_list=num_hidden_layer * [hidden_dim])
net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net]

# --- Optimizers --------------------------------------------------------------
q_lr, policy_lr = 3e-4, 3e-4  # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network
q_optimizer1 = tf.optimizers.Adam(q_lr)
q_optimizer2 = tf.optimizers.Adam(q_lr)
policy_optimizer = tf.optimizers.Adam(policy_lr)
optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer]

# --- Model -------------------------------------------------------------------
# Full list of arguments for the algorithm:
#   net_list: a list of networks (value and policy) used in the algorithm,
#             from common functions or customization
#   optimizers_list: a list of optimizers for all networks and differentiable variables
#   state_dim: dimension of state for the environment
#   action_dim: dimension of action for the environment
#   replay_buffer_capacity: the size of buffer for storing explored samples
#   policy_target_update_interval: delayed interval for updating the target policy
#   action_range: value of each action in [-action_range, action_range]
model = TD3(net_list, optimizers_list)

# --- Training ----------------------------------------------------------------
# Full list of parameters for training:
#   env: learning environment
#   train_episodes: total number of episodes for training
#   test_episodes: total number of episodes for testing
#   max_steps: maximum number of steps for one episode
#   batch_size: update batch size
#   explore_steps: for random action sampling in the beginning of training
#   update_itr: repeated updates for single step
#   reward_scale: value range of reward
#   save_interval: timesteps for saving the weights and plotting the results
#   explore_noise_scale: range of action noise for exploration
#   eval_noise_scale: range of action noise for evaluation of action value
#   mode: 'train' or 'test'
#   render: if true, visualize the environment
model.learn(env, train_episodes=5000, max_steps=150, batch_size=64, explore_steps=5000, update_itr=3,
            reward_scale=1., save_interval=100, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train',
            render=False)

# --- Evaluation --------------------------------------------------------------
# test
model.learn(env, test_episodes=10, max_steps=150, mode='test', render=False)

[TL] Input  Act_Input_Layer: (None, 1)
[TL] Input  Obs_Input_Layer: (None, 7)
[TL] Concat concat_1: concat_dim: -1
[TL] Dense  mlp_hidden_layer1: 64 relu
[TL] Dense  mlp_hidden_layer2: 64 relu
[TL] Dense  dense_1: 1 No Activation
[TL] Input  Act_Input_Layer: (None, 1)
[TL] Input  Obs_Input_Layer: (None, 7)
[TL] Concat concat_2: concat_dim: -1
[TL] Dense  mlp_hidden_layer1: 64 relu
[TL] Dense  mlp_hidden_layer2: 64 relu
[TL] Dense  dense_2: 1 No Activation
[TL] Input  Act_Input_Layer: (None, 1)
[TL] Input  Obs_Input_Layer: (None, 7)
[TL] Concat concat_3: concat_dim: -1
[TL] Dense  mlp_hidden_layer1: 64 relu
[TL] Dense  mlp_hidden_layer2: 64 relu
[TL] Dense  dense_3: 1 No Activation
[TL] Input  Act_Input_Layer: (None, 1)
[TL] Input  Obs_Input_Layer: (None, 7)
[TL] Concat concat_4: concat_dim: -1
[TL] Dense  mlp_hidden_layer1: 64 relu
[TL] Dense  mlp_hidden_layer2: 64 relu
[TL] Dense  dense_4: 1 No Activation
[TL] Input  input_layer: (None, 7)
[TL] Dense  hidden_layer1: 64 relu
[TL] Dense

Episode: 61/5000  | Episode Reward: -223.0000  | Running Time: 81.0639
<Explorer> fell into a bottomless pit!
Episode: 62/5000  | Episode Reward: 58.0000  | Running Time: 83.9935
Episode: 63/5000  | Episode Reward: -201.0000  | Running Time: 84.1023
Episode: 64/5000  | Episode Reward: -216.0000  | Running Time: 84.4978
Episode: 65/5000  | Episode Reward: -219.0000  | Running Time: 85.0573
Episode: 66/5000  | Episode Reward: -200.0000  | Running Time: 85.1203
Episode: 67/5000  | Episode Reward: -200.0000  | Running Time: 85.1662
Episode: 68/5000  | Episode Reward: -233.0000  | Running Time: 86.5034
Episode: 69/5000  | Episode Reward: -200.0000  | Running Time: 86.5560
Episode: 70/5000  | Episode Reward: -205.0000  | Running Time: 86.8985
Episode: 71/5000  | Episode Reward: -201.0000  | Running Time: 87.0111
Episode: 72/5000  | Episode Reward: 399.0000  | Running Time: 95.4552
Episode: 73/5000  | Episode Reward: 55.0000  | Running Time: 103.4661
<Explorer> fell into a bottomless pit!
Epi

Episode: 155/5000  | Episode Reward: -221.0000  | Running Time: 187.2478
Episode: 156/5000  | Episode Reward: -201.0000  | Running Time: 187.3606
Episode: 157/5000  | Episode Reward: -215.0000  | Running Time: 187.7152
Episode: 158/5000  | Episode Reward: -224.0000  | Running Time: 188.5799
Episode: 159/5000  | Episode Reward: -200.0000  | Running Time: 188.6444
Episode: 160/5000  | Episode Reward: -200.0000  | Running Time: 188.6910
Episode: 161/5000  | Episode Reward: -279.0000  | Running Time: 192.6922
Episode: 162/5000  | Episode Reward: -217.0000  | Running Time: 193.1562
Episode: 163/5000  | Episode Reward: -200.0000  | Running Time: 193.2334
Episode: 164/5000  | Episode Reward: -201.0000  | Running Time: 193.3444
A Wumpus ate <Explorer>!
Episode: 165/5000  | Episode Reward: -1002.0000  | Running Time: 193.4592
<Explorer> fell into a bottomless pit!
Episode: 166/5000  | Episode Reward: -1014.0000  | Running Time: 193.6837
Episode: 167/5000  | Episode Reward: -215.0000  | Running 

Episode: 248/5000  | Episode Reward: -214.0000  | Running Time: 679.8502
Episode: 249/5000  | Episode Reward: -235.0000  | Running Time: 681.5579
Episode: 250/5000  | Episode Reward: -201.0000  | Running Time: 681.6794
Episode: 251/5000  | Episode Reward: -245.0000  | Running Time: 683.9910
<Explorer> fell into a bottomless pit!
Episode: 252/5000  | Episode Reward: 11.0000  | Running Time: 693.4546
A Wumpus ate <Explorer>!
Episode: 253/5000  | Episode Reward: -1028.0000  | Running Time: 694.6862
Episode: 254/5000  | Episode Reward: -201.0000  | Running Time: 694.8072
Episode: 255/5000  | Episode Reward: -216.0000  | Running Time: 695.2732
Episode: 256/5000  | Episode Reward: -213.0000  | Running Time: 695.5147
Episode: 257/5000  | Episode Reward: -214.0000  | Running Time: 695.8413
Episode: 258/5000  | Episode Reward: -200.0000  | Running Time: 695.9101
Episode: 259/5000  | Episode Reward: -76.0000  | Running Time: 700.3466
Episode: 260/5000  | Episode Reward: -202.0000  | Running Time

KeyboardInterrupt: 