# Examples of Agents in a Number Guessing Environment

Contributor: michaelmenzel@google.com

In [1]:
#@title Imports
!pip install --upgrade -q gym 
!pip install -q dm-acme 
!pip install -q dm-acme[reverb] 
!pip install -q dm-acme[tf]

import logging
logging.basicConfig(level=logging.INFO)

import numpy as np

import gym
from gym import envs
from gym import spaces
from gym.utils import seeding

import tensorflow as tf
print(f'Tensorflow {tf.__version__}')

import tensorflow_probability as tfp
print(f'Tensorflow Probability {tfp.__version__}')

import acme
from acme import environment_loop
from acme import specs
from acme import wrappers
from acme.agents.tf import actors, dqn, dmpo, impala
from acme.tf import networks
from acme.utils import loggers 
print(f'ACME {acme.__version__}')

import sonnet as snt
print(f'Sonnet {snt.__version__}')



INFO:numexpr.utils:NumExpr defaulting to 2 threads.


Tensorflow 2.8.0
Tensorflow Probability 0.15.0
ACME 0.4.0
Sonnet 2.0.0


In [2]:
#@title Available Gym Environments
# Display the ids of all environments registered with Gym as one comma-separated string.
', '.join(spec.id for spec in envs.registry.all())

'CartPole-v0, CartPole-v1, MountainCar-v0, MountainCarContinuous-v0, Pendulum-v1, Acrobot-v1, LunarLander-v2, LunarLanderContinuous-v2, BipedalWalker-v3, BipedalWalkerHardcore-v3, CarRacing-v1, Blackjack-v1, FrozenLake-v1, FrozenLake8x8-v1, CliffWalking-v0, Taxi-v3, Reacher-v2, Pusher-v2, InvertedPendulum-v2, InvertedDoublePendulum-v2, HalfCheetah-v2, HalfCheetah-v3, Hopper-v2, Hopper-v3, Swimmer-v2, Swimmer-v3, Walker2d-v2, Walker2d-v3, Ant-v2, Ant-v3, Humanoid-v2, Humanoid-v3, HumanoidStandup-v2'

## Guessing Game Environment Implementation

Let's define a simple OpenAI Gym environment for a number guessing game. In the environment, an agent has to guess a number between 0 and 100. After each guess it observes whether the guess is lower than (1), equal to (2) or higher than (3) the actual number.
The game ends when either 50 guesses were made or the guessed number is <1% off from the actual number.

In [3]:
class GuessingGame(gym.Env):
    """Number guessing game.

    The object of the game is to move a guess to within 1% of a randomly chosen
    target number within 50 time steps.

    After each step the agent is provided with one of four possible observations
    which indicate where the guess is in relation to the randomly chosen number:
    0 - No guess yet submitted (only after reset)
    1 - Guess is lower than the target
    2 - Guess is equal to the target
    3 - Guess is higher than the target

    The actions are:
    0 - Decrease number by 10
    1 - Decrease number by 1
    2 - Generate random number
    3 - Increase number by 1
    4 - Increase number by 10

    The guess starts at -1 after a reset and is not clamped, so it can leave the
    target range.

    The rewards are:
    (50 - nbr of guesses) if the agent's guess is inside 1% of the target,
    otherwise 1 - |target - guess| / |target - previous guess|, which is positive
    when the step moved the guess closer to the target and negative when it
    moved the guess further away.

    The episode terminates after the agent guesses within 1% of the target or
    50 steps have been taken.

    The agent will need to use a memory of previously submitted actions and observations
    in order to efficiently explore the available actions.
    The purpose is to have agents optimize their exploration parameters (e.g. how far to
    explore from previous actions) based on previous experience. Because the goal changes
    each episode a state-value or action-value function isn't able to provide any additional
    benefit apart from being able to tell whether to increase or decrease the next guess.
    The perfect agent would likely learn the bounds of the action space (without referring
    to them explicitly) and then follow binary tree style exploration towards the goal number.
    """
    def __init__(self):
        # np.random.randint excludes the upper bound, so targets and random
        # guesses are drawn from [0, self.range).
        self.range = 100
        self.bounds = self.range

        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Discrete(4)

        self.number = 0
        self.guess_number = -1
        self.guess_count = 0
        self.guess_max = 50
        self.observation = 0

        self.reset()

    def step(self, action):
        """Applies `action` to the current guess and scores the new guess.

        Args:
            action: one of the five discrete actions listed in the class docstring.

        Returns:
            A tuple (observation, reward, done, info) where info is a dict with
            the target number, the current guess and the number of guesses made.
        """
        assert self.action_space.contains(int(action)), f'invalid action: {action!r}'

        prev_guess_number = self.guess_number

        if action == 0:
            self.guess_number -= 10
        elif action == 1:
            self.guess_number -= 1
        elif action == 2:
            self.guess_number = np.random.randint(0, self.range)
        elif action == 3:
            self.guess_number += 1
        elif action == 4:
            self.guess_number += 10

        if self.guess_number < self.number:
            self.observation = 1
        elif self.guess_number == self.number:
            self.observation = 2
        elif self.guess_number > self.number:
            self.observation = 3

        done = False
        self.guess_count += 1

        tolerance = self.range * 0.01
        if (self.number - tolerance) < self.guess_number < (self.number + tolerance):
            reward = float(self.guess_max - self.guess_count)
            done = True
        else:
            prev_distance = abs(self.number - prev_guess_number)
            distance = abs(self.number - self.guess_number)
            # The previous distance is only zero when step() is called again
            # after the target was already hit; use 1 as the denominator there
            # instead of raising ZeroDivisionError.
            reward = float(1 - distance / (prev_distance if prev_distance else 1))

        if self.guess_count >= self.guess_max:
            done = True

        return self.observation, reward, done, {"number": self.number, "guess_number": self.guess_number, "guesses": self.guess_count}

    def reset(self):
        """Draws a new target, clears the guess state and returns observation 0."""
        self.number = np.random.randint(0, self.range)
        self.guess_number = -1
        self.guess_count = 0
        self.observation = 0
        return self.observation

# Create the environment instance that the following cells interact with
# and print its action and observation spaces.
env = GuessingGame()
print(f'actions: {env.action_space}, observations: {env.observation_space}')

actions: Discrete(5), observations: Discrete(4)


Simple example to interact with the environment:

In [4]:
# Start a fresh episode and take one step with action 2 ("Generate random number").
# The fourth value returned by step() is the info dict; it is bound to `state` here.
env.reset()
obs, reward, done, state = env.step(2)
print(f'observation: {obs}, reward: {reward}, done: {done}, state: {state}')

observation: 1, reward: 0.5352112676056338, done: False, state: {'number': 70, 'guess_number': 37, 'guesses': 1}


## Naive Random Policy Agent

We start with a simple implementation of a static random policy used by an agent. The agent simply loops through up to 50 guesses and picks a random action each time. The loop stops as soon as the guess is right or 50 steps have been taken.

In [5]:
# Play one episode with actions sampled from the action space,
# printing every action and the resulting transition.
env.reset()
while True:
  action = env.action_space.sample()  # random action
  print(f'action: {action}')
  obs, reward, done, state = env.step(action)
  print(f'observation: {obs}, reward: {reward}, done: {done}, state: {state}')
  if done:
    break

action: 3
observation: 1, reward: 0.015873015873015928, done: False, state: {'number': 62, 'guess_number': 0, 'guesses': 1}
action: 1
observation: 1, reward: -0.016129032258064502, done: False, state: {'number': 62, 'guess_number': -1, 'guesses': 2}
action: 0
observation: 1, reward: -0.15873015873015883, done: False, state: {'number': 62, 'guess_number': -11, 'guesses': 3}
action: 1
observation: 1, reward: -0.013698630136986356, done: False, state: {'number': 62, 'guess_number': -12, 'guesses': 4}
action: 2
observation: 1, reward: 0.16216216216216217, done: False, state: {'number': 62, 'guess_number': 0, 'guesses': 5}
action: 0
observation: 1, reward: -0.16129032258064524, done: False, state: {'number': 62, 'guess_number': -10, 'guesses': 6}
action: 1
observation: 1, reward: -0.01388888888888884, done: False, state: {'number': 62, 'guess_number': -11, 'guesses': 7}
action: 4
observation: 1, reward: 0.136986301369863, done: False, state: {'number': 62, 'guess_number': -1, 'guesses': 8}


## Convert the OpenAI Gym Environment to ACME

In [6]:
from typing import Optional
from acme import types
from acme.wrappers import base
import dm_env

class StepLimitWrapper(base.EnvironmentWrapper):
  """A wrapper which truncates episodes at the specified step limit.

  Once `step_limit` steps have been taken in an episode, the timestep returned
  by `step()` is replaced by a truncation timestep carrying the same reward,
  observation and discount. With `step_limit=None` timesteps are passed through
  unchanged.
  """

  def __init__(self, environment: dm_env.Environment,
               step_limit: Optional[int] = None):
    """Wraps `environment`.

    Args:
      environment: the environment to wrap.
      step_limit: maximum number of steps per episode, or None for no limit.
    """
    super().__init__(environment)
    self._step_limit = step_limit
    # Steps taken in the current episode; -1 marks "the previous episode was
    # truncated by this wrapper, so the next step() must start a new one".
    self._elapsed_steps = 0

  def reset(self) -> dm_env.TimeStep:
    """Resets the step counter and the wrapped environment."""
    self._elapsed_steps = 0
    return self._environment.reset()

  def step(self, action: types.NestedArray) -> dm_env.TimeStep:
    """Steps the wrapped environment, truncating at the step limit."""
    if self._elapsed_steps == -1:
      # The wrapped environment does not know the episode was truncated, so
      # start a new episode explicitly instead of continuing the old one.
      timestep = self._environment.reset()
    else:
      timestep = self._environment.step(action)
    if timestep.first():
      # The wrapped environment started a new episode: this is not a step of
      # the episode being counted and it has no reward to pass to truncation().
      self._elapsed_steps = 0
      return timestep
    self._elapsed_steps += 1
    if self._step_limit is not None and self._elapsed_steps >= self._step_limit:
      self._elapsed_steps = -1
      return dm_env.truncation(
          timestep.reward, timestep.observation, timestep.discount)
    return timestep

In [7]:
# Expose the Gym environment through the dm_env interface in single precision
# and cap episodes at the environment's own guess limit.
acme_env = StepLimitWrapper(
    wrappers.SinglePrecisionWrapper(wrappers.GymWrapper(env)),
    env.guess_max)

# The spec describes actions, observations, rewards and discounts for the agents below.
env_spec = specs.make_environment_spec(acme_env)

print('actions:\n', env_spec.actions, '\n')
print('observations:\n', env_spec.observations, '\n')
print('rewards:\n', env_spec.rewards, '\n')
print('discounts:\n', env_spec.discounts, '\n')

actions:
 DiscreteArray(shape=(), dtype=int32, name=action, minimum=0, maximum=4, num_values=5) 

observations:
 DiscreteArray(shape=(), dtype=int32, name=observation, minimum=0, maximum=3, num_values=4) 

rewards:
 Array(shape=(), dtype=dtype('float32'), name='reward') 

discounts:
 BoundedArray(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0) 



In [8]:
def cast_to_float(x):
  """Returns `x` cast to a float32 tensor."""
  as_float32 = tf.cast(x, tf.float32)
  return as_float32

def cast_to_int(x):
  """Returns `x` cast to an int32 tensor."""
  as_int32 = tf.cast(x, tf.int32)
  return as_int32

def convert_to_int_distribution(x):
  """Overwrites the `dtype` attribute of `x` with tf.int32 and returns `x`.

  The object is modified in place; the returned value is the same object.
  NOTE(review): presumably `x` is a distribution object -- whether its `dtype`
  attribute is writable is not visible here; confirm before relying on this.
  """
  setattr(x, 'dtype', tf.int32)
  return x

def select_action_max(x):
  """Returns the index of the largest entry of `x` along its last axis."""
  best_index = tf.math.argmax(x, axis=-1)
  return best_index

## DQN Agent with ACME

### Instantiate a DQN Agent

In [9]:
# Q-network: the observation is cast to float32 and passed through a
# LayerNormMLP whose final layer has one output per discrete action.
dqn_policy_network = snt.Sequential([
    cast_to_float,
    networks.LayerNormMLP((32, 16, env_spec.actions.num_values)),
])

# DQN agent built from the environment spec and the network above;
# no other options are passed, so everything else uses the library defaults.
dqn_agent = dqn.DQN(env_spec, dqn_policy_network)


INFO:absl:Attempting to restore checkpoint: None


### Training Loop

Acme helps with automating the training loop by running either n steps or n episodes. We run a number of episodes and let the agent learn from the experiences.

In [10]:
# Number of training episodes.
episodes = 1000

# Run the DQN agent against the wrapped environment and log to the terminal.
dqn_train_loop = acme.EnvironmentLoop(acme_env, dqn_agent, label='dqn_train_loop', logger=loggers.TerminalLogger())
dqn_train_loop.run(num_episodes=episodes)

INFO:absl:Saving checkpoint: /root/acme/f7f2607a-a38a-11ec-bee9-0242ac1c0002/checkpoints/dqn_learner
INFO:root:Episode Length = 50 | Episode Return = -3.3903257846832275 | Episodes = 1 | Steps = 50 | Steps Per Second = 46.534
INFO:root:Episode Length = 50 | Episode Return = -31.02460479736328 | Episodes = 2 | Steps = 100 | Steps Per Second = 483.146
INFO:root:Episode Length = 50 | Episode Return = -4.912646770477295 | Episodes = 3 | Steps = 150 | Steps Per Second = 537.377
INFO:root:Episode Length = 50 | Episode Return = -2.3742477893829346 | Episodes = 4 | Steps = 200 | Steps Per Second = 486.120
INFO:root:Episode Length = 50 | Episode Return = -7.045987129211426 | Episodes = 5 | Steps = 250 | Steps Per Second = 544.416
INFO:root:Episode Length = 50 | Episode Return = -4.474155902862549 | Episodes = 6 | Steps = 300 | Steps Per Second = 449.569
INFO:root:Episode Length = 50 | Episode Return = -2.2121593952178955 | Episodes = 7 | Steps = 350 | Steps Per Second = 535.652
INFO:root:Episod

### Test the DQN Agent's Learned Policy

We can now simply reuse the trained policy network to build a simple actor.

In [11]:
# Evaluation policy: reuse the trained network and pick the action whose
# output is largest (argmax over the last axis).
dqn_eval_network = snt.Sequential([
  dqn_policy_network,
  select_action_max,
])

# Actor that selects actions with the evaluation network.
dqn_eval_actor = actors.FeedForwardActor(policy_network=dqn_eval_network)

Alternatively, we could also use the agent as actor. However, we need to make sure not to update its state and variables.
Therefore, we disable updates in the following evaluation loop.

In [12]:
# Evaluate the trained agent itself for 10 episodes; should_update=False is
# passed so that the loop does not update the agent.
dqn_eval_loop = acme.EnvironmentLoop(acme_env, dqn_agent, should_update=False, label='dqn_eval_loop', logger=loggers.TerminalLogger())
dqn_eval_loop.run(num_episodes=10)

INFO:root:Episode Length = 2 | Episode Return = 48.5 | Episodes = 1 | Steps = 2 | Steps Per Second = 218.414
INFO:root:Episode Length = 38 | Episode Return = -12.582738876342773 | Episodes = 2 | Steps = 40 | Steps Per Second = 501.402
INFO:root:Episode Length = 18 | Episode Return = 34.520668029785156 | Episodes = 3 | Steps = 58 | Steps Per Second = 363.773
INFO:root:Episode Length = 14 | Episode Return = 38.346534729003906 | Episodes = 4 | Steps = 72 | Steps Per Second = 395.740
INFO:root:Episode Length = 8 | Episode Return = 44.349998474121094 | Episodes = 5 | Steps = 80 | Steps Per Second = 349.500
INFO:root:Episode Length = 15 | Episode Return = 28.82543182373047 | Episodes = 6 | Steps = 95 | Steps Per Second = 463.063
INFO:root:Episode Length = 10 | Episode Return = 41.9289665222168 | Episodes = 7 | Steps = 105 | Steps Per Second = 435.835
INFO:root:Episode Length = 5 | Episode Return = 45.75 | Episodes = 8 | Steps = 110 | Steps Per Second = 354.704
INFO:root:Episode Length = 16 |

Let's reuse the simple evaluation loop from the random policy example to see what the agent does in detail.

In [13]:
# Play one episode on the raw Gym environment with the DQN evaluation actor,
# printing every action and the resulting transition.
obs = env.reset()
print(f'Initial observation: {obs}')
while True:
  action = dqn_eval_actor.select_action(obs)  # action chosen by the evaluation actor
  print(f'action: {action}')
  obs, reward, done, state = env.step(action)
  print(f'observation: {obs}, reward: {reward}, done: {done}, state: {state}')
  if done:
    break

Initial observation: 0
action: 4
observation: 1, reward: 0.47619047619047616, done: False, state: {'number': 20, 'guess_number': 9, 'guesses': 1}
action: 4
observation: 1, reward: 0.9090909090909091, done: False, state: {'number': 20, 'guess_number': 19, 'guesses': 2}
action: 4
observation: 3, reward: -8.0, done: False, state: {'number': 20, 'guess_number': 29, 'guesses': 3}
action: 1
observation: 3, reward: 0.11111111111111116, done: False, state: {'number': 20, 'guess_number': 28, 'guesses': 4}
action: 1
observation: 3, reward: 0.125, done: False, state: {'number': 20, 'guess_number': 27, 'guesses': 5}
action: 1
observation: 3, reward: 0.1428571428571429, done: False, state: {'number': 20, 'guess_number': 26, 'guesses': 6}
action: 1
observation: 3, reward: 0.16666666666666663, done: False, state: {'number': 20, 'guess_number': 25, 'guesses': 7}
action: 1
observation: 3, reward: 0.19999999999999996, done: False, state: {'number': 20, 'guess_number': 24, 'guesses': 8}
action: 1
observa

## IMPALA Agent with ACME

Let's try a different agent approach with IMPALA (or V-trace). Like DQN, IMPALA can find a policy through off-policy experience collection.

In [14]:
# Recurrent policy/value network: cast the observation to float32, flatten it,
# pass it through an MLP with two 32-unit layers and an LSTM with 25 units, and
# finish with a PolicyValueHead sized to the number of discrete actions.
impala_policy_network = snt.DeepRNN([
      cast_to_float,
      snt.Flatten(),
      snt.nets.MLP([32, 32]),
      snt.LSTM(25),
      networks.PolicyValueHead(env_spec.actions.num_values),
  ])

# IMPALA agent using the network above, with sequence length and period both set to 5.
impala_agent = impala.IMPALA(
      environment_spec=env_spec,
      network=impala_policy_network,
      sequence_length=5,
      sequence_period=5,
  )

In [15]:
# Number of training episodes.
episodes = 1000

# Run the IMPALA agent against the wrapped environment and log to the terminal.
# The loop gets its own name so that it does not overwrite `dqn_train_loop`.
impala_train_loop = acme.EnvironmentLoop(acme_env, impala_agent, label='impala_train_loop', logger=loggers.TerminalLogger())
impala_train_loop.run(num_episodes=episodes)

INFO:root:Episode Length = 17 | Episode Return = 34.29985427856445 | Episodes = 1 | Steps = 17 | Steps Per Second = 97.351
INFO:root:Episode Length = 50 | Episode Return = -27.66452980041504 | Episodes = 2 | Steps = 67 | Steps Per Second = 190.573
INFO:tensorflow:Assets written to: /root/acme/f7f2607a-a38a-11ec-bee9-0242ac1c0002/snapshots/network/assets
INFO:root:[Learner] Critic Loss = 38.212 | Entropy Loss = -1.560 | Loss = 19.735 | Policy Gradient Loss = 0.645 | Steps = 1 | Walltime = 0
INFO:root:Episode Length = 50 | Episode Return = -36.963218688964844 | Episodes = 3 | Steps = 117 | Steps Per Second = 8.472
INFO:root:Episode Length = 50 | Episode Return = -44.15215301513672 | Episodes = 4 | Steps = 167 | Steps Per Second = 224.683
INFO:root:Episode Length = 50 | Episode Return = -11.291078567504883 | Episodes = 5 | Steps = 217 | Steps Per Second = 220.122
INFO:root:Episode Length = 50 | Episode Return = -11.518305778503418 | Episodes = 6 | Steps = 267 | Steps Per Second = 212.720


In [16]:
# Stand-alone actor that selects actions with the trained IMPALA network.
impala_eval_actor = impala.IMPALAActor(network=impala_policy_network)

In [17]:
# Evaluate the trained IMPALA agent for 10 episodes; should_update=False is
# passed so that the loop does not update the agent.
impala_eval_loop = acme.EnvironmentLoop(acme_env, impala_agent, should_update=False, label='impala_eval_loop', logger=loggers.TerminalLogger())
impala_eval_loop.run(num_episodes=10)

INFO:root:Episode Length = 50 | Episode Return = -3.770080089569092 | Episodes = 1 | Steps = 50 | Steps Per Second = 209.877
INFO:root:Episode Length = 50 | Episode Return = -3.467578887939453 | Episodes = 2 | Steps = 100 | Steps Per Second = 203.408
INFO:root:Episode Length = 50 | Episode Return = 0.5505331158638 | Episodes = 3 | Steps = 150 | Steps Per Second = 197.990
INFO:root:Episode Length = 1 | Episode Return = 49.0 | Episodes = 4 | Steps = 151 | Steps Per Second = 125.132
INFO:root:Episode Length = 37 | Episode Return = 6.739270210266113 | Episodes = 5 | Steps = 188 | Steps Per Second = 193.471
INFO:root:Episode Length = 50 | Episode Return = -7.1552581787109375 | Episodes = 6 | Steps = 238 | Steps Per Second = 217.213
INFO:root:Episode Length = 26 | Episode Return = 24.74111557006836 | Episodes = 7 | Steps = 264 | Steps Per Second = 201.328
INFO:root:Episode Length = 50 | Episode Return = -12.168447494506836 | Episodes = 8 | Steps = 314 | Steps Per Second = 215.911
INFO:root:E

In [18]:
# Play one episode on the raw Gym environment with the IMPALA evaluation actor,
# printing every action and the resulting transition.
obs = env.reset()
print(f'Initial observation: {obs}')
while True:
  action = impala_eval_actor.select_action(obs)  # action chosen by the evaluation actor
  print(f'action: {action}')
  obs, reward, done, state = env.step(action)
  print(f'observation: {obs}, reward: {reward}, done: {done}, state: {state}')
  if done:
    break

Initial observation: 0
action: 2
observation: 1, reward: 0.6, done: False, state: {'number': 24, 'guess_number': 14, 'guesses': 1}
action: 1
observation: 1, reward: -0.10000000000000009, done: False, state: {'number': 24, 'guess_number': 13, 'guesses': 2}
action: 4
observation: 1, reward: 0.9090909090909091, done: False, state: {'number': 24, 'guess_number': 23, 'guesses': 3}
action: 4
observation: 3, reward: -8.0, done: False, state: {'number': 24, 'guess_number': 33, 'guesses': 4}
action: 1
observation: 3, reward: 0.11111111111111116, done: False, state: {'number': 24, 'guess_number': 32, 'guesses': 5}
action: 1
observation: 3, reward: 0.125, done: False, state: {'number': 24, 'guess_number': 31, 'guesses': 6}
action: 1
observation: 3, reward: 0.1428571428571429, done: False, state: {'number': 24, 'guess_number': 30, 'guesses': 7}
action: 1
observation: 3, reward: 0.16666666666666663, done: False, state: {'number': 24, 'guess_number': 29, 'guesses': 8}
action: 1
observation: 3, rewar

## DMPO Agent with ACME

Now we want to try one of the on-policy approaches using a policy optimization. In the following, a DMPO agent learns how to play the game.

However, DMPO is intended for continuous control while our environment uses a discrete action space.
We treat this as a fun exercise in translating a continuous control agent into a discrete control agent. Let's see how it goes :)


In [19]:
from typing import Callable
from typing import List

from acme import types
from acme import core
from acme.agents import agent

class WrapperAgent(agent.Agent):
  """An agent that forwards every call to a wrapped agent.

  Subclasses override individual methods to change how the wrapped agent
  interacts with the environment loop.

  NOTE(review): `agent.Agent.__init__` is not called here; presumably all agent
  behaviour is meant to come from the wrapped instance -- confirm.
  """

  def __init__(self, agent: agent.Agent):
    """Stores the agent that all calls are forwarded to."""
    self._agent = agent

  def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    """Returns the action the wrapped agent selects for `observation`."""
    return self._agent.select_action(observation)

  def observe_first(self, timestep: dm_env.TimeStep):
    """Forwards the first timestep of an episode to the wrapped agent."""
    self._agent.observe_first(timestep)

  def observe(
      self,
      action: types.NestedArray,
      next_timestep: dm_env.TimeStep,
  ):
    """Forwards an action and the timestep that followed it to the wrapped agent."""
    self._agent.observe(action, next_timestep)

  def update(self):
    """Asks the wrapped agent to perform an update."""
    self._agent.update()

  def get_variables(self, names: List[str]) -> List[List[np.ndarray]]:
    """Returns the wrapped agent's variables for the given collection names."""
    # The result must be returned; without `return` callers always get None.
    return self._agent.get_variables(names)

class PostSelectActionAgentTransformer(WrapperAgent):
  """A wrapper agent that post-processes every selected action.

  The wrapped agent's action is passed through `post_select_action_fn` before
  it is returned; all remaining arguments go to `WrapperAgent`.
  """

  def __init__(self, post_select_action_fn: Callable[[types.NestedArray], types.NestedArray], *args, **kwargs):
    super().__init__(*args, **kwargs)
    self._post_select_action_fn = post_select_action_fn

  def select_action(self, observation: types.NestedArray) -> types.NestedArray:
    raw_action = super().select_action(observation)
    transform = self._post_select_action_fn
    return transform(raw_action)

def convert_action(x):
    """Maps an action tensor to a float32 tensor with a trailing axis of size 1.

    If `x` has rank greater than 1 and more than one entry along its last axis,
    the index of the largest entry along that axis is used; otherwise `x` itself
    is used. In both cases a new last axis is appended and the result is cast to
    float32.
    """
    if tf.rank(x) > 1 and tf.shape(x)[-1] > 1:
        # Vector-valued action: reduce it to the index of its largest component.
        result = tf.cast(tf.expand_dims(tf.math.argmax(x, axis=-1), axis=-1), dtype=tf.float32)
    else:
        # Already a single value per entry: only add the trailing axis.
        result = tf.cast(tf.expand_dims(x, axis=-1), dtype=tf.float32)
    return result

In [20]:
# Policy: a LayerNormMLP followed by a diagonal multivariate normal head with
# one dimension per discrete action.
dmpo_policy_network = snt.Sequential([
    networks.LayerNormMLP((32, 16)),
    networks.MultivariateNormalDiagHead(env_spec.actions.num_values),
])

# Critic: a CriticMultiplexer that passes actions through convert_action,
# followed by a DiscreteValuedHead built from the action spec's minimum,
# maximum and number of values.
# NOTE(review): the DiscreteValuedHead arguments presumably describe the
# support of the value distribution; passing the action range looks
# unintended -- confirm against the Acme documentation.
dmpo_critic_network = snt.Sequential([
    networks.CriticMultiplexer(
        critic_network=networks.LayerNormMLP((256, 128, 1)),
        action_network=convert_action),
    networks.DiscreteValuedHead(env_spec.actions.minimum, env_spec.actions.maximum, env_spec.actions.num_values),
])

# Observations only need to be cast to float32.
dmpo_observation_network = cast_to_float

# Distributional MPO agent built from the three networks, with checkpointing disabled.
dmpo_agent = dmpo.DistributionalMPO(env_spec, 
                                    dmpo_policy_network, 
                                    dmpo_critic_network, 
                                    dmpo_observation_network,
                                    checkpoint=False)
# Wrap the agent so that every selected action is replaced by the int32 index
# of its largest component (np.argmax), which is what the environment expects.
dmpo_discretized_agent = PostSelectActionAgentTransformer(
    post_select_action_fn=lambda x: np.array(np.argmax(x), dtype=np.int32), 
    agent=dmpo_agent)

In [21]:
# Number of training episodes.
episodes = 1000

# Run the discretized DMPO agent against the wrapped environment and log to the terminal.
dmpo_train_loop = acme.EnvironmentLoop(acme_env, dmpo_discretized_agent, label='dmpo_train_loop', logger=loggers.TerminalLogger())
dmpo_train_loop.run(num_episodes=episodes)

INFO:root:Episode Length = 50 | Episode Return = -17.023296356201172 | Episodes = 1 | Steps = 50 | Steps Per Second = 104.358
INFO:root:Episode Length = 50 | Episode Return = -9.756292343139648 | Episodes = 2 | Steps = 100 | Steps Per Second = 442.819
INFO:root:Episode Length = 50 | Episode Return = -22.035137176513672 | Episodes = 3 | Steps = 150 | Steps Per Second = 472.002
INFO:root:Episode Length = 50 | Episode Return = -16.271236419677734 | Episodes = 4 | Steps = 200 | Steps Per Second = 464.709
INFO:root:Episode Length = 50 | Episode Return = -12.974747657775879 | Episodes = 5 | Steps = 250 | Steps Per Second = 545.637
INFO:root:Episode Length = 50 | Episode Return = -11.78605842590332 | Episodes = 6 | Steps = 300 | Steps Per Second = 514.348
INFO:root:Episode Length = 32 | Episode Return = 9.626863479614258 | Episodes = 7 | Steps = 332 | Steps Per Second = 487.534
INFO:root:Episode Length = 50 | Episode Return = -6.09431791305542 | Episodes = 8 | Steps = 382 | Steps Per Second =

In [22]:
# Evaluation policy: cast the observation, run the trained policy network,
# draw a sample from the distribution it returns, and take the index of the
# sample's largest component as the discrete action.
dmpo_eval_network = snt.Sequential([
  dmpo_observation_network,
  dmpo_policy_network,
  lambda x: x.sample(),
  select_action_max,
])

# Actor that selects actions with the evaluation network.
dmpo_eval_actor = actors.FeedForwardActor(policy_network=dmpo_eval_network)

In [23]:
# Evaluate the stand-alone DMPO evaluation actor for 10 episodes with updates disabled.
dmpo_eval_loop = acme.EnvironmentLoop(acme_env, dmpo_eval_actor, should_update=False, label='dmpo_eval_loop', logger=loggers.TerminalLogger())
dmpo_eval_loop.run(num_episodes=10)

INFO:root:Episode Length = 5 | Episode Return = 46.99299621582031 | Episodes = 1 | Steps = 5 | Steps Per Second = 26.609
INFO:root:Episode Length = 6 | Episode Return = 46.048553466796875 | Episodes = 2 | Steps = 11 | Steps Per Second = 655.121
INFO:root:Episode Length = 14 | Episode Return = 39.25684356689453 | Episodes = 3 | Steps = 25 | Steps Per Second = 946.185
INFO:root:Episode Length = 16 | Episode Return = 35.58462905883789 | Episodes = 4 | Steps = 41 | Steps Per Second = 1175.513
INFO:root:Episode Length = 15 | Episode Return = 38.26211929321289 | Episodes = 5 | Steps = 56 | Steps Per Second = 1012.237
INFO:root:Episode Length = 10 | Episode Return = 42.96847915649414 | Episodes = 6 | Steps = 66 | Steps Per Second = 968.102
INFO:root:Episode Length = 12 | Episode Return = 40.75669479370117 | Episodes = 7 | Steps = 78 | Steps Per Second = 950.766
INFO:root:Episode Length = 10 | Episode Return = 43.08636474609375 | Episodes = 8 | Steps = 88 | Steps Per Second = 863.612
INFO:root

In [24]:
# Play one episode on the raw Gym environment with the DMPO evaluation actor,
# printing every action and the resulting transition.
obs = env.reset()
print(f'Initial observation: {obs}')
while True:
  action = dmpo_eval_actor.select_action(obs)  # action chosen by the evaluation actor
  print(f'action: {action}')
  obs, reward, done, state = env.step(action)
  print(f'observation: {obs}, reward: {reward}, done: {done}, state: {state}')
  if done:
    break

Initial observation: 0
action: 4
observation: 1, reward: 0.11627906976744184, done: False, state: {'number': 85, 'guess_number': 9, 'guesses': 1}
action: 3
observation: 1, reward: 0.013157894736842146, done: False, state: {'number': 85, 'guess_number': 10, 'guesses': 2}
action: 4
observation: 1, reward: 0.1333333333333333, done: False, state: {'number': 85, 'guess_number': 20, 'guesses': 3}
action: 2
observation: 1, reward: -0.15384615384615374, done: False, state: {'number': 85, 'guess_number': 10, 'guesses': 4}
action: 4
observation: 1, reward: 0.1333333333333333, done: False, state: {'number': 85, 'guess_number': 20, 'guesses': 5}
action: 4
observation: 1, reward: 0.15384615384615385, done: False, state: {'number': 85, 'guess_number': 30, 'guesses': 6}
action: 4
observation: 1, reward: 0.18181818181818177, done: False, state: {'number': 85, 'guess_number': 40, 'guesses': 7}
action: 4
observation: 1, reward: 0.2222222222222222, done: False, state: {'number': 85, 'guess_number': 50, '