## Testing GAIL on a Safety GridWorld Environment

### Defining the environment to test on.

Using a [grid world with lava](https://github.com/maximecb/gym-minigrid#distributional-shift-environment), based on the DeepMind RL Safety Envs, to test generalizability.

In [45]:
%load_ext autoreload

import os
from pathlib import Path

import gym
from gym import spaces

import gym_minigrid
from gym_minigrid.minigrid import WorldObj, IDX_TO_OBJECT, DIR_TO_VEC

import stable_baselines
from stable_baselines import GAIL, PPO2
from stable_baselines.gail import ExpertDataset, generate_expert_traj
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.callbacks import EvalCallback, BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines.bench import Monitor

from rl_baselines_zoo.utils import make_env

from util.stable_baseline_viz import show_videos, record_video

import numpy as np
from functools import reduce
import operator

# ---------------------------------------------------------------------------
# Experiment configuration: environment IDs, file locations and run switches.
# Everything below is read by the later cells of this notebook.
# ---------------------------------------------------------------------------

# the main environment we're testing on
ENV_ID = 'MiniGrid-DistShift1-v0'

# use the slightly different env for later testing of generalization
MODIFIED_ENV_ID = 'MiniGrid-DistShift2-v0'

# file I/O configuration
EXPERIMENT_HOME = 'experiment_data'

# training logging
LOG_DIR_BASE = './logs/'

# these are for data associated with the expert
EXPERT_NAME = 'ppo2'

EXPERT_LOG_DIR = os.path.join(LOG_DIR_BASE, EXPERT_NAME)
EXPERT_DIR = os.path.join(EXPERIMENT_HOME, 'expert_data')
EXPERT_VIDEO_DIR = os.path.join(EXPERT_DIR, 'videos/')

EXPERT_RUN_ID = f'expert_{EXPERT_NAME}_{ENV_ID}'
EXPERT_MODEL_PATH = os.path.join(EXPERT_DIR, f'{EXPERT_RUN_ID}_model')
EXPERT_MODEL_TRACES_PATH = os.path.join(EXPERT_DIR,
                                        f'{EXPERT_RUN_ID}_traces.npz')
EXPERT_BEST_MODEL_PATH = os.path.join(EXPERT_LOG_DIR, 'best_model.zip')

# these are for data associated with the imitation learner
LEARNER_NAME = 'gail'

LEARNER_DIR = os.path.join(EXPERIMENT_HOME, 'learner_data')
LEARNER_LOG_DIR = os.path.join(LOG_DIR_BASE, LEARNER_NAME)
LEARNER_VIDEO_DIR = os.path.join(LEARNER_DIR, 'videos/')

LEARNER_RUN_ID = f'learner_{LEARNER_NAME}_{ENV_ID}'
# (sic) the misspelled name is kept because later cells reference it
LEARNER_MODIFIIED_RUN_ID = f'learner_{LEARNER_NAME}_{MODIFIED_ENV_ID}'
LEARNER_MODEL_PATH = os.path.join(LEARNER_DIR, f'{LEARNER_RUN_ID}_model')
LEARNER_BEST_MODEL_PATH = os.path.join(LEARNER_LOG_DIR, 'best_model.zip')

# need to ensure these directories always exist
Path(LOG_DIR_BASE).mkdir(parents=True, exist_ok=True)

# The per-algorithm log directories are handed to Monitor, the callbacks and
# tensorboard later on, so create them up front as well.
# NOTE(review): stable-baselines' Monitor is expected to write
# `<dir>/monitor.csv` only when the directory already exists (otherwise it
# writes a sibling `<dir>.monitor.csv` file) -- confirm against the installed
# version.
Path(EXPERT_LOG_DIR).mkdir(parents=True, exist_ok=True)
Path(LEARNER_LOG_DIR).mkdir(parents=True, exist_ok=True)

Path(EXPERT_DIR).mkdir(parents=True, exist_ok=True)
Path(EXPERT_VIDEO_DIR).mkdir(parents=True, exist_ok=True)

Path(LEARNER_DIR).mkdir(parents=True, exist_ok=True)
Path(LEARNER_VIDEO_DIR).mkdir(parents=True, exist_ok=True)

# decide whether you want to load in a pre-trained expert for the ENV or if
# you need to learn an expert using RL data
expert_formats = ['pre_trained_model', 'traces_only', 'learn_the_model']
expert_format = expert_formats[2]

# allow use of already trained learner
load_learner = False

# performance evaluation / visualization settings
MAX_VIDEO_LEN = 500
NUM_EVAL_EPISODES = 100

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Underlying MDP definition

In [2]:
MDP_DISCOUNT_FACTOR = 0.99


# this wrapper is essentially defining the MDP's state space
class FlatFullyObsStateWrapper(gym.core.ObservationWrapper):
    """
    Replace the wrapped env's observation with a flat float32 vector:

        [pos_x, pos_y, direction, state_label_ID]

    where state_label_ID is the object-type code of the grid cell the agent
    currently occupies.
    """

    def __init__(self, parent_env):
        super().__init__(parent_env)

        self.parent_env = parent_env

        # One scalar upper bound is shared by every entry of the vector, so
        # take the largest value any of the four components may reach:
        #   * x / y: width and height include a 1-cell border on either
        #     side, hence the `- 2`
        #   * the largest object-type ID known to IDX_TO_OBJECT
        #   * the number of direction vectors (bounds the direction index)
        largest_entry = np.max([
            parent_env.width - 2,
            parent_env.height - 2,
            np.max(list(IDX_TO_OBJECT.keys())),
            len(DIR_TO_VEC),
        ])

        # float dtype on purpose: the GAIL code downstream expects float
        # observations
        self.observation_space = spaces.Box(low=0,
                                            high=largest_entry,
                                            shape=(4,),
                                            dtype='float32')

    def observation(self, obs):
        # the incoming observation is ignored; the state is read straight
        # from the wrapped environment
        wrapped = self.parent_env
        agent_x, agent_y = wrapped.agent_pos[0], wrapped.agent_pos[1]

        # the first channel of the encoded grid cell is the label ID, i.e.
        # IDX_TO_OBJECT[state_label_ID] is something like "lava" or "goal";
        # exposing it is meant to help the learner generalize
        encoded_grid = wrapped.grid.encode()
        state_label_ID, _color, _state = encoded_grid[agent_x, agent_y]

        state = [agent_x, agent_y, wrapped.agent_dir, state_label_ID]
        return np.array(state, dtype=np.float32)
    
class FlatObsWrapper(gym.core.ObservationWrapper):
    """
    Encode mission strings using a one-hot scheme,
    and combine these with observed images into one flat array.

    The resulting observation is the flattened ``obs['image']`` followed by
    a flattened ``(maxStrLen, numCharCodes)`` one-hot matrix of the
    lower-cased mission string.

    :param env: environment whose observation space is a Dict containing an
        ``'image'`` entry and whose observations carry a ``'mission'`` string
    :param maxStrLen: (int) maximum supported mission length in characters
    """

    def __init__(self, env, maxStrLen=96):
        super().__init__(env)

        self.maxStrLen = maxStrLen
        # codes 0-25 are the letters 'a'-'z', code 26 is the space character
        self.numCharCodes = 27

        imgSpace = env.observation_space.spaces['image']
        imgSize = reduce(operator.mul, imgSpace.shape, 1)

        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(imgSize + self.numCharCodes * self.maxStrLen,),
            dtype='float32'
        )

        # last mission string that was encoded (lower-cased) and its encoding
        self.cachedStr = None
        self.cachedArray = None

    def observation(self, obs):
        """
        Flatten the image and append the one-hot encoded mission string.

        Characters other than letters and spaces have no code; their row in
        the one-hot matrix is left all-zero instead of reusing the code of
        the previous character.
        """
        image = obs['image']
        # Normalize BEFORE comparing with the cache: the cache key is stored
        # lower-cased, so comparing the raw string would miss the cache on
        # every step for any mission containing upper-case letters.
        mission = obs['mission'].lower()

        # Cache the last-encoded mission string
        if mission != self.cachedStr:
            assert len(mission) <= self.maxStrLen, 'mission string too long ({} chars)'.format(len(mission))

            strArray = np.zeros(shape=(self.maxStrLen, self.numCharCodes), dtype='float32')

            spaceCode = ord('z') - ord('a') + 1
            for idx, ch in enumerate(mission):
                if 'a' <= ch <= 'z':
                    chNo = ord(ch) - ord('a')
                elif ch == ' ':
                    chNo = spaceCode
                else:
                    # unsupported character (digit, punctuation, ...):
                    # leave this row empty rather than hitting an unbound
                    # or stale chNo
                    continue
                assert chNo < self.numCharCodes, '%s : %d' % (ch, chNo)
                strArray[idx, chNo] = 1

            self.cachedStr = mission
            self.cachedArray = strArray

        obs = np.concatenate((image.flatten(), self.cachedArray.flatten()))

        return obs

### Learning Hyperparams

Learning Model Hyperparameters:

In [58]:
# Settings for both learning algorithms, keyed by '<role>_<algorithm>'.
# 'n_timesteps' and 'eval_freq' are consumed by the training cells; the
# remaining entries are forwarded to the model constructors.
hyperparams = dict(
    expert_ppo2=dict(
        cliprange=0.2,
        ent_coef=0.0,
        gamma=MDP_DISCOUNT_FACTOR,
        lam=0.95,
        learning_rate=0.00025,
        n_steps=128,
        n_timesteps=200_000,
        nminibatches=32,
        noptepochs=10,
        policy='MlpPolicy',
        eval_freq=500,
    ),
    learner_gail=dict(
        policy='MlpPolicy',
        n_timesteps=500_000,
        eval_freq=500,
        gamma=MDP_DISCOUNT_FACTOR,
    ),
)

Environment Hyperparameters:

In [32]:
# Observation wrapper applied to every environment built below.
#
# FlatObsWrapper flattens the agent's image observation and appends a one-hot
# encoding of the mission string. The position-based, labeled state
# representation is FlatFullyObsStateWrapper; swap it in here to train on
# that state instead.
env_wrapper = FlatObsWrapper

# using vectorized env. is SOO much faster, but only some algs support
# it e.g. PPO2
n_envs_expert = 16

### Environment Definition w.r.t. Hyperparameters

In [52]:
# Every environment is passed through `env_wrapper` so that all models see
# the same observation format, then made compatible with stable-baselines.

# multi-env vectorized environment used to train the expert
expert_env = DummyVecEnv([
    make_env(ENV_ID, wrapper_class=env_wrapper, rank=worker_rank)
    for worker_rank in range(n_envs_expert)
])

# single-env copy for online evaluation of the expert
expert_eval_env = DummyVecEnv(
    [make_env(ENV_ID, wrapper_class=env_wrapper, rank=0)])

# the gail learner trains on a non-vectorized environment; Monitor logs its
# episodes under LEARNER_LOG_DIR
learner_env = env_wrapper(Monitor(gym.make(ENV_ID), LEARNER_LOG_DIR))

# single-env vectorized copy for evaluating the learner
learner_eval_env = DummyVecEnv(
    [make_env(ENV_ID, wrapper_class=env_wrapper, rank=0)])

# the different-looking-but-same-goal environment, used to test how well
# the learner generalizes
modified_env = DummyVecEnv(
    [make_env(MODIFIED_ENV_ID, wrapper_class=env_wrapper, rank=0)])



### Get an expert demonstrator and generate expert trajectories

The end goal of this section is to provide the imitation learner with a set of "expert" demonstrations to learn from. This can be accomplished in several ways:

<br>

**Expert Formats:**

*possible formats: `'pre_trained_model', 'traces_only', 'learn_the_model'`*

* `'learn_the_model'`: learning the model just requires choosing the desired RL algorithm and setting its hyperparameters (see above).

* `'pre_trained_model'`: a pre-trained expert must have a saved `stable_baselines` model file at `EXPERT_MODEL_PATH`. See the [saving guide](https://stable-baselines.readthedocs.io/en/master/guide/save_format.html) for info on how to do that.

* `'traces_only'`: demonstration traces must reside in the `npz` archive at `EXPERT_MODEL_TRACES_PATH` and follow the format needed by `stable_baselines.gail.ExpertDataset()`.


<br>

--- 

<br>

**From the docs**:

*The expert dataset is a .npz archive. The data is saved in python dictionary format with keys: actions, episode_returns, rewards, obs, episode_starts.*

*In case of images, obs contains the relative path to the images.*

*obs, actions: shape (N * L, ) + S*

*where N = # episodes, L = episode length and S is the environment observation/action space.*

*S = (1, ) for discrete space*

<br>

In [34]:
# Obtain `expert_model` according to `expert_format`. For 'traces_only' no
# model is created; the demonstrations are read from disk in a later cell.
if expert_format == 'pre_trained_model':
    # reuse a previously saved expert and attach the training environment
    expert_model = PPO2.load(EXPERT_MODEL_PATH)
    expert_model.set_env(expert_env)

elif expert_format == 'learn_the_model':
    expert_hparam = hyperparams['expert_ppo2']

    expert_model = PPO2(expert_hparam['policy'], expert_env,
                        cliprange=expert_hparam['cliprange'],
                        ent_coef=expert_hparam['ent_coef'],
                        gamma=expert_hparam['gamma'],
                        lam=expert_hparam['lam'],
                        learning_rate=expert_hparam['learning_rate'],
                        n_steps=expert_hparam['n_steps'],
                        nminibatches=expert_hparam['nminibatches'],
                        noptepochs=expert_hparam['noptepochs'],
                        verbose=0,
                        tensorboard_log=EXPERT_LOG_DIR)

    # always use deterministic actions for live evaluation
    eval_callback = EvalCallback(expert_eval_env,
                                 best_model_save_path=EXPERT_LOG_DIR,
                                 log_path=EXPERT_LOG_DIR,
                                 eval_freq=expert_hparam['eval_freq'],
                                 deterministic=True, render=False)

    # while learning, periodically evaluate the model on a separate
    # environment and save the best one seen so far
    expert_model.learn(total_timesteps=expert_hparam['n_timesteps'],
                       callback=eval_callback)

    # Switch to the best model found during training. `load` is a
    # classmethod that RETURNS a new model; calling it on the instance and
    # discarding the result (as was done before) leaves the last-iteration
    # weights in place, so the name has to be rebound.
    expert_model = PPO2.load(EXPERT_BEST_MODEL_PATH, env=expert_env)

    # now save the final model so if we like it, we don't need to re-learn it
    # The model will be saved under $EXPERT_MODEL_PATH.zip
    expert_model.save(EXPERT_MODEL_PATH)

Eval num_timesteps=8000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
New best mean reward!
Eval num_timesteps=16000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=24000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=32000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=40000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=48000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=56000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=64000, episode_reward=0.00 +/- 0.00
Episode length: 252.00 +/- 0.00
Eval num_timesteps=72000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=80000, episode_reward=0.00 +/- 0.00
Episode length: 2.00 +/- 0.00
Eval num_timesteps=88000, episode_reward=0.00 +/- 0.00
Episode length: 252.00 +/- 0.00
Eval num_timesteps=96000, episode_reward

If we now have an expert model, sample trajectories from it and save them as demonstrations for the imitation learner. If you just provide traces, we will do nothing and assume the traces exist at `EXPERT_MODEL_TRACES_PATH`.

In [35]:
# Produce (unless traces were supplied directly) and then load the expert
# demonstrations that the imitation learner will train on.
if expert_format != 'traces_only':
    
    # generate_expert_traj did not work with the multi-env vectorized
    # training environment (cause not investigated -- possibly the
    # sub-environments are not kept in sync).
    #
    # The evaluation environment already must only have a singular env, so
    # use it for trace generation.
    expert_model.set_env(expert_eval_env)
    
    # roll out the expert for 100 episodes and store the traces in the
    # archive at EXPERT_MODEL_TRACES_PATH
    generate_expert_traj(expert_model, EXPERT_MODEL_TRACES_PATH,
                         n_episodes=100);
    
# Load the expert dataset (either just generated or provided by the user)
expert_dataset = ExpertDataset(expert_path=EXPERT_MODEL_TRACES_PATH,
                               verbose=1)

actions (1345, 1)
obs (1345, 2739)
rewards (1345, 1)
episode_returns (100,)
episode_starts (1345,)
actions (1345, 1)
obs (1345, 2739)
rewards (1345, 1)
episode_returns (100,)
episode_starts (1345,)
Total trajectories: -1
Total transitions: 1345
Average returns: 0.9519642949104309
Std for returns: 0.011421593223745896


### Training the imitation learner from the expert data

If you want to load in a pre-trained IL model, the model save archive needs to be located at `LEARNER_MODEL_PATH`.

In [43]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    The model is written to ``<log_dir>/best_model`` whenever the mean
    reward of the most recent (up to 100) monitored episodes exceeds the
    best mean seen so far.

    :param check_freq: (int) number of callback calls between two checks
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) print progress information when > 0
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # file stem of the saved model, not a directory
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Make sure the folder that will hold the model exists. This used to
        # create a directory at `save_path` itself, leaving a stray, empty
        # `best_model/` folder next to the saved model file.
        os.makedirs(self.log_dir, exist_ok=True)

    def _on_step(self) -> bool:
        # only do the (comparatively expensive) check every `check_freq` calls
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # no episode has been logged yet
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {}".format(self.num_timesteps))
            print(f"Best mean reward: {self.best_mean_reward} - " + \
                  f"Last mean reward per episode: {mean_reward}")

        # New best model: remember its score and save the agent
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        # returning True lets training continue
        return True

In [69]:
# Obtain `learner_model`: either load a previously saved GAIL learner or
# train one from the expert demonstrations.
if load_learner:
    learner_model = GAIL.load(LEARNER_MODEL_PATH)
    learner_model.set_env(learner_env)
else:
    learner_hparam = hyperparams['learner_gail']

    learner_model = GAIL(learner_hparam['policy'],
                         learner_env,
                         expert_dataset,
                         gamma=learner_hparam['gamma'],
                         verbose=0,
                         tensorboard_log=LEARNER_LOG_DIR)

    # Track the training reward logged by the Monitor wrapper around
    # `learner_env` and keep the best model in LEARNER_LOG_DIR. The callback
    # gets its own name: the callback definitions here were commented out,
    # so `eval_callback` referred to the expert's callback (or to nothing at
    # all when no expert had been trained in this session).
    learner_callback = SaveOnBestTrainingRewardCallback(
        check_freq=learner_hparam['eval_freq'],
        log_dir=LEARNER_LOG_DIR)

    learner_model.learn(total_timesteps=learner_hparam['n_timesteps'],
                        callback=learner_callback)

    # Switch to the best model found during training and save it. `load` is
    # a classmethod that RETURNS a new model, so the name must be rebound;
    # calling it on the instance and dropping the result changes nothing.
    learner_model = GAIL.load(LEARNER_BEST_MODEL_PATH, env=learner_env)
    learner_model.save(LEARNER_MODEL_PATH)

Num timesteps: 0
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00532143
Num timesteps: 0
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00532143
Num timesteps: 0
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 0
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 0
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 0
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 1024
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 1024
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 1024
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 1024
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num timesteps: 1024
Best mean reward: 0.01114286 - Last mean reward per episode: 0.00696429
Num ti

KeyboardInterrupt: 

### Visualizing Learning

#### Visualizing the Learned Expert on the Original Domain

In [36]:
# Record the expert acting in its single-env evaluation environment for at
# most MAX_VIDEO_LEN steps. record_video / show_videos are project helpers
# from util.stable_baseline_viz; presumably the clip is written into
# video_folder with the given prefix -- confirm against the helper.
record_video(expert_model, eval_env=expert_eval_env,
             max_video_length=MAX_VIDEO_LEN,
             video_prefix=EXPERT_RUN_ID, video_folder=EXPERT_VIDEO_DIR)

# presumably displays the recorded clips whose names start with the prefix
show_videos(EXPERT_VIDEO_DIR, prefix=EXPERT_RUN_ID)

Evaluating the performance of the learned expert:

In [38]:
# Score the expert over NUM_EVAL_EPISODES episodes on the original
# environment and report the mean and standard deviation of the reward.
mean_reward, std_reward = evaluate_policy(expert_model, expert_eval_env,
                                          n_eval_episodes=NUM_EVAL_EPISODES)
print(f"expert ({EXPERT_NAME}) mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

expert (ppo2) mean_reward:0.95 +/- 0.00


#### Visualizing the Imitation Learner on the Original Domain

In [13]:
# Record the imitation learner on the original environment.
# NOTE(review): `learner_env` is the non-vectorized, Monitor-wrapped training
# environment, whereas the expert recording passes a DummyVecEnv. Confirm
# that record_video accepts a plain gym env; `learner_eval_env` is the
# vectorized equivalent.
record_video(learner_model, eval_env=learner_env,
             max_video_length=MAX_VIDEO_LEN,
             video_prefix=LEARNER_RUN_ID, video_folder=LEARNER_VIDEO_DIR)

# presumably displays the recorded clips whose names start with the prefix
show_videos(LEARNER_VIDEO_DIR, prefix=LEARNER_RUN_ID)

Evaluating the performance of the imitation learner:

In [None]:
# Score the imitation learner over NUM_EVAL_EPISODES episodes on the original
# environment. `learner_eval_env` is the single-env evaluation copy built in
# the environment-definition cell; the name `eval_env` used here before is
# not defined anywhere in this notebook.
mean_reward, std_reward = evaluate_policy(learner_model, learner_eval_env,
                                          n_eval_episodes=NUM_EVAL_EPISODES)
print(f"learner ({LEARNER_NAME}) mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

#### Visualizing the Imitation Learner on the Original Domain

In [None]:
# Record the imitation learner on the original environment, using the
# single-env evaluation copy `learner_eval_env`; the name `eval_env` used
# here before is not defined anywhere in this notebook.
record_video(learner_model, eval_env=learner_eval_env,
             max_video_length=MAX_VIDEO_LEN,
             video_prefix=LEARNER_RUN_ID, video_folder=LEARNER_VIDEO_DIR)

# presumably displays the recorded clips whose names start with the prefix
show_videos(LEARNER_VIDEO_DIR, prefix=LEARNER_RUN_ID)

Evaluating the performance of the imitation learner:

In [None]:
# Score the imitation learner over NUM_EVAL_EPISODES episodes on the original
# environment. `learner_eval_env` is the single-env evaluation copy built in
# the environment-definition cell; the name `eval_env` used here before is
# not defined anywhere in this notebook.
mean_reward, std_reward = evaluate_policy(learner_model, learner_eval_env,
                                          n_eval_episodes=NUM_EVAL_EPISODES)
print(f"learner ({LEARNER_NAME}) mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

#### Visualizing the Imitation Learner on a Similar, Unseen Domain

Can this fully-observable model that has no look-ahead learn? Shouldn't be able to.

In [None]:
# Record the imitation learner on the modified environment
# (MODIFIED_ENV_ID), which is not the environment it was trained on, to
# inspect how well it generalizes. Clips are stored alongside the
# original-domain ones but under a separate prefix.
record_video(learner_model, eval_env=modified_env,
             max_video_length=MAX_VIDEO_LEN,
             video_prefix=LEARNER_MODIFIIED_RUN_ID,
             video_folder=LEARNER_VIDEO_DIR)

# presumably displays the recorded clips whose names start with the prefix
show_videos(LEARNER_VIDEO_DIR, prefix=LEARNER_MODIFIIED_RUN_ID)

Evaluating the performance of the imitation learner on the unseen domain:

In [None]:
# Score the imitation learner over NUM_EVAL_EPISODES episodes on the modified
# environment and report the mean and standard deviation of the reward.
mean_reward, std_reward = evaluate_policy(learner_model, modified_env,
                                          n_eval_episodes=NUM_EVAL_EPISODES)
print(f"learner ({LEARNER_NAME}) mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")