## PPO Agent Training with Action Masking

Here we import the Proximal Policy Optimization (PPO) algorithm from Ray RLlib and train it with a minimal configuration.

Action masking allows the policy to start learning the best strategy from the get-go, without having to first learn the rules of the game.
Enforcing action masking in Ray requires the definition of a custom model, which post-processes the logits coming out of the actual model (here, a Fully Connected Neural Network), setting those of forbidden actions to -inf.

In [1]:
from wildcatter.advanced_environment_for_RLib import AdvancedDriller
import ray
from ray import air, tune
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.models import ModelCatalog
from gym.spaces import Box, Dict
import numpy as np
import matplotlib.pyplot as plt
from time import time

tf1, tf, tfv = try_import_tf(error=True)

class WildcatterActionMaskedModel(TFModelV2):
    """Fully connected policy network whose logits are masked by the observation.

    The observation handed to :meth:`forward` is indexed as a dict with two
    entries: ``"obs"`` (fed to the inner network) and ``"action_mask"``. The
    logarithm of the mask is added to the network output, so an action whose
    mask entry is 0 ends up with a hugely negative logit.
    """

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 true_obs_shape=(11, 40),
                 action_embed_size=4 + 38 + 1,
                 *args, **kwargs):
        """Build the wrapper and the inner fully connected network.

        Args:
            obs_space, action_space, num_outputs, model_config, name:
                Forwarded unchanged to ``TFModelV2.__init__``.
            true_obs_shape: Shape of the unbounded ``Box`` space the inner
                network is built on (the ``"obs"`` entry of the observation).
            action_embed_size: Number of outputs of the inner network.
            *args, **kwargs: Forwarded to ``TFModelV2.__init__``.
        """
        super().__init__(
            obs_space, action_space, num_outputs, model_config, name,
            *args, **kwargs
        )

        # The inner network only sees the real observation, never the mask.
        unbounded_obs_space = Box(-np.inf, np.inf, shape=true_obs_shape)
        self.action_embed_model = FullyConnectedNetwork(
            unbounded_obs_space,
            action_space,
            action_embed_size,
            model_config,
            name + "_action_embed",
        )

    def forward(self, input_dict, state, seq_lens):
        """Return the masked action logits together with the untouched state."""
        observation = input_dict["obs"]
        mask = observation["action_mask"]

        # Raw logits produced by the inner network from the real observation.
        logits, _ = self.action_embed_model({"obs": observation["obs"]})

        # log(1) = 0 leaves a logit unchanged, while log(0) = -inf is clipped
        # to the smallest float32 to keep the sum numerically stable.
        mask_penalty = tf.maximum(tf.math.log(mask), tf.float32.min)
        return logits + mask_penalty, state

    def value_function(self):
        """Delegate to the value function of the inner network."""
        return self.action_embed_model.value_function()


# Make the model available to RLlib under the name used in the trainer config.
ModelCatalog.register_custom_model(
    'wildcatter_masked',
    WildcatterActionMaskedModel,
)

## Setting environment config dictionary

Here we list the config dictionaries for the various environment types and select the one we want to train on.

In [2]:
# Funds, price and cost settings that are identical for every environment type.
_shared_settings = {
    "funds": 20,
    "oil_price": 1,
    "relocation_cost": 0.2,
    "drilling_cost": 0.5,
    "drilling_depth_markup": 0.1,
    # "seed": 0,
}

# Model of type "random" on an 11 x 40 grid.
env_random_config = {
    "model_type": "random",
    "nrow": 11,
    "ncol": 40,
    **_shared_settings,
}

# Model of type "random_pockets" on an 11 x 40 grid.
env_random_pockets_config = {
    "model_type": "random_pockets",
    "nrow": 11,
    "ncol": 40,
    # "nrow": 40,
    # "ncol": 80,
    **_shared_settings,
}

# Model read from a CSV file; swap `model_path` to try the other data sets.
env_2d_from_csv_config = {
    "model_type": "from_csv",
    # "model_path": r"/home/studio-lab-user/sagemaker-studiolab-notebooks/wildcatter-ThreeAmigos/examples/data/2d_two_rectangular_targets.csv",
    # "model_path": r"/home/studio-lab-user/sagemaker-studiolab-notebooks/wildcatter-ThreeAmigos/examples/data/2d_stacked.csv",
    "model_path": r"/home/studio-lab-user/sagemaker-studiolab-notebooks/wildcatter-ThreeAmigos/examples/data/x-sec_targets.csv",
    "delim": ",",
    **_shared_settings,
}

# Pick the environment to train on and instantiate it once, so that the sizes
# needed by the custom model can be read from its spaces.
env_config = env_random_pockets_config
env = AdvancedDriller(env_config)
# Shape of the real observation (without the mask) and number of actions;
# both are passed to the custom model through `custom_model_config`.
true_obs_shape = env.observation_space["obs"].shape
action_embed_size = env.action_space.n

## Loading the PPO trainer

Let's instantiate a PPO trainer with three rollout workers (together with the driver process, this exploits the 4 CPUs available on SageMaker Studio Lab).

In [3]:
import ray.rllib.algorithms.ppo as ppo
from ray.tune.logger import pretty_print

# `ignore_reinit_error` lets this cell be re-run while Ray is already up.
ray.init(ignore_reinit_error=True)

# Settings that differ from the PPO defaults.
special_config = {
    "num_gpus": 0,
    "num_workers": 3,  # Parallel training!
    "env": AdvancedDriller,
    "env_config": env_config,
    "model": {
        "custom_model": "wildcatter_masked",
        "custom_model_config": {
            "true_obs_shape": true_obs_shape,
            "action_embed_size": action_embed_size,
        },
    },
    "horizon": 40,
}

# Start from a copy of the defaults and overlay the settings above.
config = ppo.DEFAULT_CONFIG.copy()
config.update(special_config)
trainer = ppo.PPO(config=config)
# To resume from a previously saved checkpoint instead of starting fresh:
# trainer.restore(my_checkpoint)

2022-09-22 23:58:15,514	INFO worker.py:1518 -- Started a local Ray instance.
2022-09-22 23:58:16,419	INFO algorithm.py:1871 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2022-09-22 23:58:16,420	INFO ppo.py:378 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-09-22 23:58:16,423	INFO algorithm.py:351 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


In [4]:
# Check that action masking works: leave a single action allowed and verify
# that the policy never selects anything else.
env = trainer.env_creator(env_config)
obs = env.reset()
# Zero the mask of every action but the last one. The index of that last
# action is derived from the mask length rather than hard-coded, so the check
# stays valid for environments with a different number of actions (it is 42
# in the run recorded below).
obs['action_mask'][:-1] = 0
only_valid_action = len(obs['action_mask']) - 1
# Let's ask the model to select an action for 10k times
actions = np.array([trainer.compute_single_action(obs) for _ in range(10000)])
# Check that the action being selected has always been the only valid one
all(actions == only_valid_action)

True

In [5]:
# Run 100 training iterations, reporting the mean episode reward on every
# tenth one (iterations 0, 10, ..., 90).
for i in range(100):
    result = trainer.train()
    if i % 10:
        continue
    # Optional periodic checkpoint:
    # checkpoint = trainer.save("my_checkpoint")
    print(f"Epoch: {i}; episode_reward_mean: {result.get('episode_reward_mean')}")

# Save the final trainer state and dump the metrics of the last iteration.
checkpoint = trainer.save("my_checkpoint")
print("Final checkpoint saved")
print(pretty_print(result))

[2m[36m(RolloutWorker pid=428)[0m   ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Epoch: 0; episode_reward_mean: -84.93577981651376
Epoch: 10; episode_reward_mean: 0.5613237639553429
Epoch: 20; episode_reward_mean: 8.912571428571427
Epoch: 30; episode_reward_mean: 14.056223175965664
Epoch: 40; episode_reward_mean: 21.181541218637992
Epoch: 50; episode_reward_mean: 21.170833333333334
Epoch: 60; episode_reward_mean: 28.291585127201564
Epoch: 70; episode_reward_mean: 28.21468253968254
Epoch: 80; episode_reward_mean: 29.847286821705424
Epoch: 90; episode_reward_mean: 31.814534883720928
Final checkpoint saved
agent_timesteps_total: 400200
counters:
  num_agent_steps_sampled: 400200
  num_agent_steps_trained: 400200
  num_env_steps_sampled: 400200
  num_env_steps_trained: 400200
custom_metrics: {}
date: 2022-09-23_00-18-11
done: false
episode_len_mean: 7.7159309021113245
episode_media: {}
episode_reward_max: 56.7
episode_reward_mean: 30.949520153550857
episode_reward_min: -122.1
episodes_this_iter: 521
episodes_total: 62731
experiment_id: 2c0e69614154413e9e426aa023e1c405


In [None]:
# Check policy: build a separate PPO agent, configured for evaluation, that
# uses the same environment and custom model as the trainer.
import ray.rllib.algorithms.ppo as ppo
from ray.tune.logger import pretty_print

special_config = {
    "num_workers": 0,
    "env": AdvancedDriller,
    "env_config": env_config,
    "model": {
        "custom_model": "wildcatter_masked",
        "custom_model_config": {
            "true_obs_shape": true_obs_shape,
            "action_embed_size": action_embed_size,
        },
    },
    "horizon": 40,
    "explore": False,  # Always returns best action
}

# Start from a copy of the PPO defaults and overlay the evaluation settings.
eval_config = ppo.DEFAULT_CONFIG.copy()
eval_config.update(special_config)
agent = ppo.PPO(config=eval_config, env=AdvancedDriller)

In [None]:
# Load into the evaluation agent a checkpoint of an agent that was trained
# for 400 epochs with the standard parameters.
checkpoint_dir = r"/home/studio-lab-user/sagemaker-studiolab-notebooks/wildcatter-ThreeAmigos/examples/my_checkpoint/checkpoint_000400/"
agent.restore(checkpoint_dir)

In [None]:
def _draw_model(environment, wells=()):
    """Open a new figure showing ``environment.state`` and optional well paths.

    Each item of ``wells`` is converted to an array and transposed into its
    row and column coordinates, which are drawn as a thick magenta line on
    top of the image.
    """
    plt.figure(figsize=(20, 20))
    plt.imshow(environment.state, vmin=-10, vmax=2)
    for path in wells:
        rows, cols = np.asarray(path).T
        plt.plot(cols, rows, "-", c="m", linewidth=6)
    n_cols, n_rows = environment.ncol, environment.nrow
    # One tick per cell, with the axes limited to exactly the grid extent.
    plt.xticks(np.arange(0, n_cols, 1.0))
    plt.yticks(np.arange(0, n_rows, 1.0))
    plt.xlim([-0.5, n_cols - 0.5])
    plt.ylim([n_rows - 0.5, -0.5])  # inverted: row 0 is drawn at the top
    plt.grid()


env_config = env_random_pockets_config
# env_config = env_2d_from_csv_config  # alternative: model read from CSV
env = AdvancedDriller(env_config)
print("Beginning Drill Campaign")
obs = env.reset()
score = 0

# Model before any drilling.
_draw_model(env)

# Let the restored agent act until the environment reports the episode is over.
while True:
    action = agent.compute_single_action(obs)
    obs, reward, done, info = env.step(action)
    score += reward
    print(f"Action: {action}; funds: {obs['obs'][-1]}; reward: {reward}; total score: {score}")
    if done:
        break

# Model after the campaign, with the drilled trajectories overlaid.
_draw_model(env, env.trajectory)