# Examples of Agents in a Number Guessing Environment

Contributors: michaelmenzel@google.com

In [None]:
#@title Imports
!pip3 install --upgrade --quiet gym tf-agents

import numpy as np

import gym
from gym import envs
from gym import spaces
from gym.utils import seeding

import tensorflow as tf

from tf_agents.networks import q_network, q_rnn_network, actor_distribution_network, value_network
from tf_agents.agents.dqn import dqn_agent
from tf_agents.agents import PPOAgent
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics.tf_metrics import AverageEpisodeLengthMetric, AverageReturnMetric, ChosenActionHistogram, NumberOfEpisodes
from tf_agents.utils import common

[K     |████████████████████████████████| 1.6MB 23.8MB/s 
[K     |████████████████████████████████| 1.1MB 47.2MB/s 
[?25h  Building wheel for gym (setup.py) ... [?25l[?25hdone




In [None]:
#@title Available Gym Environments
# Comma-separated ids of every environment registered with Gym.
', '.join(spec.id for spec in envs.registry.all())

'Copy-v0, RepeatCopy-v0, ReversedAddition-v0, ReversedAddition3-v0, DuplicatedInput-v0, Reverse-v0, CartPole-v0, CartPole-v1, MountainCar-v0, MountainCarContinuous-v0, Pendulum-v0, Acrobot-v1, LunarLander-v2, LunarLanderContinuous-v2, BipedalWalker-v3, BipedalWalkerHardcore-v3, CarRacing-v0, Blackjack-v0, KellyCoinflip-v0, KellyCoinflipGeneralized-v0, FrozenLake-v0, FrozenLake8x8-v0, CliffWalking-v0, NChain-v0, Roulette-v0, Taxi-v3, GuessingGame-v0, HotterColder-v0, Reacher-v2, Pusher-v2, Thrower-v2, Striker-v2, InvertedPendulum-v2, InvertedDoublePendulum-v2, HalfCheetah-v2, HalfCheetah-v3, Hopper-v2, Hopper-v3, Swimmer-v2, Swimmer-v3, Walker2d-v2, Walker2d-v3, Ant-v2, Ant-v3, Humanoid-v2, Humanoid-v3, HumanoidStandup-v2, FetchSlide-v1, FetchPickAndPlace-v1, FetchReach-v1, FetchPush-v1, HandReach-v0, HandManipulateBlockRotateZ-v0, HandManipulateBlockRotateZTouchSensors-v0, HandManipulateBlockRotateZTouchSensors-v1, HandManipulateBlockRotateParallel-v0, HandManipulateBlockRotateParallel

## Guessing Game Environment Implementation

Let's define a simple OpenAI Gym environment for a number guessing game. In the environment, an agent has to guess a number between 0 and 100. At each step it either asks for a new random guess or increases or decreases its current guess by 1 or 10, and it then observes whether the guess is lower than (1), equal to (2) or higher than (3) the actual number.
The game ends when either 50 guesses have been made or the guess is less than 1% off from the actual number.

In [None]:
class GuessingGame(gym.Env):
    """Number guessing game.

    A hidden target number is drawn at every reset and the agent steers a
    running guess towards it. The object of the game is to bring the guess
    within 1% of the range around the target in at most ``guess_max`` (50)
    steps. With ``range == 100`` the tolerance is 1 and the comparison is
    strict, so an integer guess only succeeds when it equals the target.

    Observations (``Discrete(4)``) describe the guess relative to the target:
        0 - No guess yet submitted (only after reset)
        1 - Guess is lower than the target
        2 - Guess is equal to the target
        3 - Guess is higher than the target

    Actions (``Discrete(5)``) modify the running guess, which starts at -1:
        0 - Decrease number by 10
        1 - Decrease number by 1
        2 - Generate random number
        3 - Increase number by 1
        4 - Increase number by 10

    Rewards:
        ``guess_max - guess_count`` if the guess is inside the tolerance;
        otherwise the shaped value ``1 - new_distance / previous_distance``,
        which is positive when the step moved the guess closer to the target
        and negative when it moved the guess further away.

    The episode terminates after the agent guesses within the tolerance or
    ``guess_max`` steps have been taken.

    The agent will need to use a memory of previously submitted actions and
    observations in order to efficiently explore the available actions.
    The purpose is to have agents optimize their exploration parameters (e.g.
    how far to explore from previous actions) based on previous experience.
    Because the goal changes each episode a state-value or action-value
    function isn't able to provide any additional benefit apart from being
    able to tell whether to increase or decrease the next guess.
    The perfect agent would likely learn the bounds of the action space
    (without referring to them explicitly) and then follow binary tree style
    exploration towards the goal number.
    """

    def __init__(self):
        """Set up the spaces and counters, then start the first episode."""
        # Exclusive upper bound of the sampled numbers: targets and random
        # guesses are drawn with np.random.randint(0, self.range).
        self.range = 100
        self.bounds = self.range

        self.action_space = spaces.Discrete(5)
        self.observation_space = spaces.Discrete(4)

        self.number = 0
        self.guess_count = 0
        self.guess_max = 50
        self.observation = 0

        self.reset()

    def step(self, action):
        """Apply ``action`` to the running guess and score the result.

        Args:
            action: Member of ``action_space`` (0-4), see the class docstring.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``info`` holds
            the target (``"number"``), the current guess (``"guess_number"``)
            and the number of guesses made so far (``"guesses"``).

        Raises:
            AssertionError: If ``action`` is not contained in ``action_space``.
        """
        assert self.action_space.contains(action), f"invalid action: {action!r}"

        prev_guess_number = self.guess_number

        # Plain comparisons (rather than a dict lookup) keep this working for
        # any integer-like action that action_space.contains() accepts.
        if action == 0:
            self.guess_number -= 10
        elif action == 1:
            self.guess_number -= 1
        elif action == 2:
            self.guess_number = np.random.randint(0, self.range)
        elif action == 3:
            self.guess_number += 1
        elif action == 4:
            self.guess_number += 10

        if self.guess_number < self.number:
            self.observation = 1
        elif self.guess_number == self.number:
            self.observation = 2
        else:
            self.observation = 3

        self.guess_count += 1

        tolerance = self.range * 0.01
        new_distance = abs(self.number - self.guess_number)
        if new_distance < tolerance:
            reward = self.guess_max - self.guess_count
            done = True
        else:
            # Relative improvement over the previous guess. The previous
            # distance is 0 only when step() is called again after the target
            # was already hit; clamp it to the smallest non-zero integer
            # distance so that case yields a finite penalty instead of a
            # ZeroDivisionError.
            prev_distance = abs(self.number - prev_guess_number)
            reward = 1 - new_distance / max(prev_distance, 1)
            done = False

        if self.guess_count >= self.guess_max:
            done = True

        info = {
            "number": self.number,
            "guess_number": self.guess_number,
            "guesses": self.guess_count,
        }
        return self.observation, reward, done, info

    def reset(self):
        """Draw a new target, clear the guess state and return observation 0."""
        self.number = np.random.randint(0, self.range)
        # -1 lies below every possible target, so the first shaped reward
        # never divides by a zero distance.
        self.guess_number = -1
        self.guess_count = 0
        self.observation = 0
        return self.observation

# Single shared environment instance used by every cell below.
env = GuessingGame()
print('actions: {}, observations: {}'.format(env.action_space, env.observation_space))

actions: Discrete(5), observations: Discrete(4)


## Naive Random Policy Agent

We start with a simple implementation of a static random policy used by an agent. The agent loops through up to 50 guesses and picks a random action each time. The loop stops as soon as the guess is right or 50 guesses have been made.

In [None]:
# Play one episode with uniformly sampled actions, logging every transition,
# and stop as soon as the environment reports that the episode is over.
env.reset()
while True:
  action = env.action_space.sample()  # take a random action
  print('action: {}'.format(action))
  obs, reward, done, state = env.step(action)
  print('observation: {}, reward: {}, done: {}, state: {}'.format(obs, reward, done, state))
  if done:
    break

action: 3
observation: 1, reward: 0.012658227848101222, done: False, state: {'number': 78, 'guess_number': 0, 'guesses': 1}
action: 4
observation: 1, reward: 0.1282051282051282, done: False, state: {'number': 78, 'guess_number': 10, 'guesses': 2}
action: 3
observation: 1, reward: 0.014705882352941124, done: False, state: {'number': 78, 'guess_number': 11, 'guesses': 3}
action: 2
observation: 1, reward: 0.28358208955223885, done: False, state: {'number': 78, 'guess_number': 30, 'guesses': 4}
action: 4
observation: 1, reward: 0.20833333333333337, done: False, state: {'number': 78, 'guess_number': 40, 'guesses': 5}
action: 4
observation: 1, reward: 0.26315789473684215, done: False, state: {'number': 78, 'guess_number': 50, 'guesses': 6}
action: 2
observation: 1, reward: -1.25, done: False, state: {'number': 78, 'guess_number': 15, 'guesses': 7}
action: 4
observation: 1, reward: 0.15873015873015872, done: False, state: {'number': 78, 'guess_number': 25, 'guesses': 8}
action: 0
observation:

## DQN Agent with TF-Agents

### Convert the OpenAI Gym Environment to TF Agents

In [None]:
# Wrap the Gym environment for TF-Agents (episode length capped at
# env.guess_max steps), then expose it as a TensorFlow environment.
py_env = suite_gym.wrap_env(env, max_episode_steps=env.guess_max)
tf_env = tf_py_environment.TFPyEnvironment(py_env)
print('actions: {}'.format(tf_env.action_spec()))
print('observations: {}'.format(tf_env.observation_spec()))
print('timespec: {}'.format(tf_env.time_step_spec()))

actions: BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(4))
observations: BoundedTensorSpec(shape=(), dtype=tf.int64, name='observation', minimum=array(0), maximum=array(3))
timespec: TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(), dtype=tf.int64, name='observation', minimum=array(0), maximum=array(3)))


### Instantiate a DQN Agent

In [None]:
observation_spec = tf_env.observation_spec()
action_spec = tf_env.action_spec()

# Feed-forward Q-network: this is the one handed to the agent below.
q_net = q_network.QNetwork(observation_spec, action_spec)

## Optional: Use a RNN-based QNetwork
q_rnn_net = q_rnn_network.QRnnNetwork(
    observation_spec,
    action_spec,
    lstm_size=[150])

# Optional: pass q_rnn_net as q_network to use the RNN-based QNetwork instead.
agent = dqn_agent.DqnAgent(
    tf_env.time_step_spec(),
    action_spec,
    q_network=q_net,
    optimizer=tf.keras.optimizers.Adam(),
    epsilon_greedy=.1,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.Variable(0),
    debug_summaries=True,
    summarize_grads_and_vars=True)

agent.initialize()

### Replay Buffer

We instantiate a replay buffer which stores the experiences collected by an agent. The buffer lets us replay past observations and train agents on that data.
This is useful, for example, when agent A collects the experiences and the data is then used to train agent B.

In [None]:
# Room for the steps of ten maximum-length episodes.
replay_buffer_capacity = 10 * env.guess_max

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity)

### Training Loop

We use a ```DynamicStepDriver``` to run the agent's collect policy for ```collect_steps``` steps and store the trajectories (experiences) in the replay buffer.
We then train our ```QNetwork``` with the trajectories (and smooth with a target ```QNetwork```) which calculates favorable actions using q-values. The favorable actions are then considered as part of a greedy policy.

In [None]:
iterations = 10
collect_steps = replay_buffer_capacity

# metrics
avgReturn = AverageReturnMetric()
avgLength = AverageEpisodeLengthMetric()
actionHist = ChosenActionHistogram()  # created but not attached to the driver
nbrEpisodes = NumberOfEpisodes()
metrics = [avgLength, avgReturn, nbrEpisodes]

# Every collected trajectory updates the metrics and is stored in the buffer.
observers = metrics + [replay_buffer.add_batch]

# The driver's configuration is the same for every iteration, so build it
# once and only call run() inside the loop.
collect_driver = dynamic_step_driver.DynamicStepDriver(
  tf_env,
  agent.collect_policy,
  observers=observers,
  num_steps=collect_steps)

for iteration in range(iterations):

  # Collect trajectories with the driver
  collect_driver.run()

  # DQN agent require 2 steps (current and next) or more (n_steps+1) to calculate loss
  data_iterator = iter(replay_buffer.as_dataset(
      num_parallel_calls=3, sample_batch_size=10, num_steps=2)
    .prefetch(3))

  # Train the DQN agent: one gradient step per sampled batch
  for i in range(collect_steps):
    trajectories, _ = next(data_iterator)
    loss, extra = agent.train(experience=trajectories)

  print(f'iteration: {iteration}, avg return: {avgReturn.result()}, avg length: {avgLength.result()}, # episodes: {nbrEpisodes.result()}')

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


iteration: 0, avg return: 9.885412216186523, avg length: 40.20000076293945, # episodes: 11
iteration: 1, avg return: -102.15940856933594, avg length: 45.0, # episodes: 23
iteration: 2, avg return: 6.598444938659668, avg length: 43.29999923706055, # episodes: 35
iteration: 3, avg return: -5.836390495300293, avg length: 50.0, # episodes: 45
iteration: 4, avg return: 18.867826461791992, avg length: 32.79999923706055, # episodes: 59
iteration: 5, avg return: -4.253633975982666, avg length: 46.099998474121094, # episodes: 70
iteration: 6, avg return: 1.2619245052337646, avg length: 46.70000076293945, # episodes: 81
iteration: 7, avg return: 12.12291145324707, avg length: 39.0, # episodes: 93
iteration: 8, avg return: -2.7912871837615967, avg length: 45.70000076293945, # episodes: 104
iteration: 9, avg return: -1.1686832904815674, avg length: 45.900001525878906, # episodes: 115


### Test the DQN Agent's Learned Policy

In [None]:
# Reset the environment before running the evaluation driver below.
tf_env.reset()

def log_eval(traj):
  """Print the observation, action and reward carried by one trajectory step."""
  line = 'observation: {}, action: {}, reward: {}'.format(
      traj.observation, traj.action, traj.reward)
  print(line)

# Run the agent's (non-collect) policy for env.guess_max steps, logging each
# step through log_eval.
eval_driver = dynamic_step_driver.DynamicStepDriver(
    tf_env,
    agent.policy,
    observers=[log_eval],
    num_steps=env.guess_max)
time_step, policy_state = eval_driver.run()

observation: [0], action: [2], reward: [0.61702126]
observation: [1], action: [2], reward: [0.]
observation: [1], action: [2], reward: [-0.30555555]
observation: [1], action: [2], reward: [0.80851066]
observation: [1], action: [2], reward: [-8.444445]
observation: [1], action: [2], reward: [0.25882354]
observation: [1], action: [2], reward: [0.3968254]
observation: [1], action: [2], reward: [-1.2105263]
observation: [1], action: [2], reward: [0.05952381]
observation: [1], action: [2], reward: [0.9493671]
observation: [3], action: [2], reward: [-19.]
observation: [1], action: [2], reward: [0.2125]
observation: [1], action: [2], reward: [0.7936508]
observation: [1], action: [2], reward: [-5.769231]
observation: [1], action: [2], reward: [0.4318182]
observation: [1], action: [2], reward: [-0.38]
observation: [1], action: [2], reward: [0.85507244]
observation: [1], action: [2], reward: [-6.9]
observation: [1], action: [2], reward: [0.835443]
observation: [1], action: [2], reward: [0.769230

## PPO Agent with TF-Agents


In [None]:
ppo_observation_spec = tf_env.observation_spec()
ppo_action_spec = tf_env.action_spec()

# Actor network: maps observations to a distribution over actions.
# (An optional, currently disabled preprocessing step would scale the
# observation: preprocessing_layers=tf.keras.layers.Lambda(
#     lambda x: x/tf_env.observation_spec().maximum))
actor_net = actor_distribution_network.ActorDistributionNetwork(
    ppo_observation_spec,
    ppo_action_spec)

# Value network: built from the observation spec only.
value_net = value_network.ValueNetwork(ppo_observation_spec)

ppo_agent = PPOAgent(
    tf_env.time_step_spec(),
    ppo_action_spec,
    actor_net=actor_net,
    value_net=value_net,
    optimizer=tf.keras.optimizers.Adam(),
    normalize_observations=False,
    train_step_counter=tf.Variable(0),
    debug_summaries=True,
    summarize_grads_and_vars=True)

ppo_agent.initialize()

In [None]:
# Room for the steps of ten maximum-length episodes.
ppo_replay_buffer_capacity = 10 * env.guess_max

ppo_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=ppo_agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=ppo_replay_buffer_capacity)

In [None]:
iterations = 10
collect_steps = ppo_replay_buffer_capacity

# metrics
avgReturn = AverageReturnMetric()
avgLength = AverageEpisodeLengthMetric()
actionHist = ChosenActionHistogram()  # created but not attached to the driver
nbrEpisodes = NumberOfEpisodes()
metrics = [avgLength, avgReturn, nbrEpisodes]

# Every collected trajectory updates the metrics and is stored in the buffer.
observers = metrics + [ppo_replay_buffer.add_batch]

# The driver's configuration is the same for every iteration, so build it
# once and only call run() inside the loop.
ppo_collect_driver = dynamic_step_driver.DynamicStepDriver(
  tf_env,
  ppo_agent.collect_policy,
  observers=observers,
  num_steps=collect_steps)

for iteration in range(iterations):

  # Collect trajectories with the driver
  ppo_collect_driver.run()

  # Sample batches of 10 two-step windows from the replay buffer.
  # NOTE(review): this mirrors the DQN cell. PPO is usually trained on whole,
  # freshly collected trajectories rather than on random 2-step replay
  # samples - confirm this sampling scheme is intended.
  data_iterator = iter(ppo_replay_buffer.as_dataset(
      num_parallel_calls=3, sample_batch_size=10, num_steps=2)
    .prefetch(3))

  # Train the PPO agent: one train call per sampled batch
  for i in range(collect_steps):
    trajectories, _ = next(data_iterator)
    loss, extra = ppo_agent.train(experience=trajectories)

  print(f'iteration: {iteration}, avg return: {avgReturn.result()}, avg length: {avgLength.result()}, # episodes: {nbrEpisodes.result()}')

iteration: 0, avg return: -10.359271049499512, avg length: 39.70000076293945, # episodes: 12
iteration: 1, avg return: 3.7636241912841797, avg length: 36.0, # episodes: 25
iteration: 2, avg return: 1.787976861000061, avg length: 35.900001525878906, # episodes: 39
iteration: 3, avg return: -2.353590250015259, avg length: 40.70000076293945, # episodes: 52
iteration: 4, avg return: -8.858983993530273, avg length: 44.900001525878906, # episodes: 64
iteration: 5, avg return: -22.469585418701172, avg length: 44.099998474121094, # episodes: 75
iteration: 6, avg return: 8.307008743286133, avg length: 32.099998474121094, # episodes: 92
iteration: 7, avg return: 12.121256828308105, avg length: 27.899999618530273, # episodes: 106
iteration: 8, avg return: -4.355345249176025, avg length: 31.600000381469727, # episodes: 121
iteration: 9, avg return: 7.595143795013428, avg length: 29.5, # episodes: 136


In [None]:
# Reset the environment before running the evaluation driver below.
tf_env.reset()

def log_eval(traj):
  """Print the observation, action and reward carried by one trajectory step."""
  line = 'observation: {}, action: {}, reward: {}'.format(
      traj.observation, traj.action, traj.reward)
  print(line)

# Run the PPO agent's (non-collect) policy for env.guess_max steps, logging
# each step through log_eval.
ppo_eval_driver = dynamic_step_driver.DynamicStepDriver(
    tf_env,
    ppo_agent.policy,
    observers=[log_eval],
    num_steps=env.guess_max)
time_step, policy_state = ppo_eval_driver.run()

observation: [0], action: [4], reward: [0.11494253]
observation: [1], action: [4], reward: [0.12987013]
observation: [1], action: [4], reward: [0.14925373]
observation: [1], action: [4], reward: [0.1754386]
observation: [1], action: [4], reward: [0.21276596]
observation: [1], action: [4], reward: [0.27027026]
observation: [1], action: [4], reward: [0.37037036]
observation: [1], action: [4], reward: [0.5882353]
observation: [1], action: [4], reward: [0.5714286]
observation: [3], action: [1], reward: [0.33333334]
observation: [3], action: [1], reward: [0.5]
observation: [3], action: [1], reward: [38.]
observation: [2], action: [1], reward: [0.]
observation: [0], action: [4], reward: [0.24390244]
observation: [1], action: [4], reward: [0.32258064]
observation: [1], action: [4], reward: [0.47619048]
observation: [1], action: [4], reward: [0.90909094]
observation: [1], action: [4], reward: [-8.]
observation: [3], action: [1], reward: [0.11111111]
observation: [3], action: [1], reward: [0.12