## Testing GAIL on a Safety GridWorld Environment

### Defining the environment to test on.

Using a [grid world with lava](https://github.com/maximecb/gym-minigrid#distributional-shift-environment), based on the DeepMind RL Safety Envs, to test generalizability.

In [29]:
%load_ext autoreload

import os
from pathlib import Path

import gym
from gym import spaces

import gym_minigrid
from gym_minigrid.wrappers import FullyObsWrapper, FlatObsWrapper
from gym_minigrid.minigrid import WorldObj, IDX_TO_OBJECT, DIR_TO_VEC

from stable_baselines import GAIL, PPO2
from stable_baselines.gail import ExpertDataset, generate_expert_traj
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.callbacks import EvalCallback

from rl_baselines_zoo.utils import make_env

from util.stable_baseline_viz import show_videos, record_video
import numpy as np

# the main environment we're testing on
ENV_ID = 'MiniGrid-DistShift1-v0'

# use the slightly different env for later testing of generalization
MODIFIED_ENV_ID = 'MiniGrid-DistShift2-v0'

# file I/O configuration
SIM_HOME = 'gail_data'

# training logging
LOG_DIR_BASE = './logs/'

# these are for data associated with the expert
EXPERT_NAME = 'ppo2'
EXPERT_LOG_DIR = os.path.join(LOG_DIR_BASE, EXPERT_NAME)
EXPERT_DIR = os.path.join(SIM_HOME, 'expert_data')
EXPERT_RUN_ID = f'expert_{EXPERT_NAME}_{ENV_ID}'
EXPERT_MODEL_PATH = os.path.join(EXPERT_DIR, f'{EXPERT_RUN_ID}_model')
EXPERT_MODEL_TRACES_PATH = os.path.join(EXPERT_DIR, f'{EXPERT_RUN_ID}.npz')

# these are for data associated with the imitation learner
LEARNER_NAME = 'GAIL'
LEARNER_DIR = os.path.join(SIM_HOME, 'learner_data')
LEARNER_LOG_DIR = os.path.join(LOG_DIR_BASE, LEARNER_NAME)
LEARNER_RUN_ID = f'learner_{LEARNER_NAME}_{ENV_ID}'
LEARNER_MODEL_PATH = os.path.join(LEARNER_DIR, f'{LEARNER_RUN_ID}_model')

# need to ensure these directories always exist
Path(EXPERT_DIR).mkdir(parents=True, exist_ok=True)
Path(LEARNER_DIR).mkdir(parents=True, exist_ok=True)
Path(LOG_DIR_BASE).mkdir(parents=True, exist_ok=True)

# decide whether you want to load in a pre-trained expert for the ENV or if
# you need to learn an expert using RL data
expert_formats = ['pre_trained_model', 'traces_only', 'learn_the_model']
expert_format = 'traces_only'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Underlying MDP definition

In [2]:
MDP_DISCOUNT_FACTOR = 0.99

# this function is essentially defining the MDP's state space
class FlatFullyObsStateWrapper(gym.core.ObservationWrapper):
    """
    Returns the agent's position and direction as 1-D vector
    """

    def __init__(self, parent_env):
        super().__init__(parent_env)
        
        self.parent_env = parent_env
        
        # the width / height include 1 cell padding on either side, so
        # an agent coordinate will be from 1 -- [(width/height) - 2]
        agent_max_x = parent_env.width - 2
        agent_max_y = parent_env.height - 2
        possible_state_label_IDs = list(IDX_TO_OBJECT.keys())
        possible_agent_direction_vectors = DIR_TO_VEC
        
        self.observation_space = spaces.Box(
            # direction part of state is from 0-4, so lowest observed value
            # in state vector is 0
            low=0,
            # we are keeping x and y as separate coordinates of the state
            # so the max value is the max of all types of vars in the state
            high=np.max([agent_max_x, agent_max_y,
                         np.max(possible_state_label_IDs),
                         len(possible_agent_direction_vectors)]),
            # we're going to store the state as:
            #     [pos_x, pos_y, direction, state_label_ID]
            shape=(4,),
            # sigh, this needs to be float or GAIl stuff will freak out
            dtype='float32'
        )

    def observation(self, obs):
        agent_x = self.parent_env.agent_pos[0]
        agent_y = self.parent_env.agent_pos[1]
        
        # here we're going to extract the MDP state label for the current 
        # grid location to help the learner generalize
        # 
        # IDX_TO_OBJECT[state_label_ID] is something like "lava" or "goal"
        grid = self.parent_env.grid.encode()
        state_label_ID, _, _ = grid[agent_x, agent_y]

        obs = np.array([agent_x,
                        agent_y,
                        self.parent_env.agent_dir,
                        state_label_ID], dtype=np.float32)

        return obs

### Learning Hyperparams

Learning Model Hyperparameters:

In [None]:
hyperparams = {'expert_ppo2':
                    {'cliprange': 0.2,
                     'ent_coef': 0.0,
                     'gamma': MDP_DISCOUNT_FACTOR,
                     'lam': 0.95,
                     'learning_rate': 0.00025,
                     'n_steps': 128,
                     'n_timesteps': 70_000,
                     'nminibatches': 32,
                     'noptepochs': 10,
                     'policy': 'MlpPolicy'},
               'gail':
                    {'gamma': MDP_DISCOUNT_FACTOR
                    }}

Environment Hyperparameters:

In [20]:
env_wrapper = FlatFullyObsStateWrapper
normalize = False
n_envs = 16

### Environment Definition w.r.t. Hyperparameters

In [21]:
# Here, we need to make it fully observable and then compatible with 
# stable-baselines
env = DummyVecEnv([make_env(ENV_ID, wrapper_class=env_wrapper,
                             rank=i) for i in range(n_envs)])
if normalize:
    env = VecNormalize(env)
    
# need a copy of this environment for online algorithm evaluation
eval_env = DummyVecEnv([make_env(ENV_ID, wrapper_class=env_wrapper,
                                 rank=i) for i in range(1)])
if normalize:
    eval_env = VecNormalize(eval_env)



### Get an expert demonstrator and generate expert trajectories

The end goal of this section is to provide the imitation learner with a set of "expert" demonstrations to learn from. This can be accomplished in several ways:

<br>

**Expert Formats:**

*possible formats: `'pre_trained_model', 'traces_only', 'learn_the_model'`*

* `'learn_the_model'`: learning the model just requires choosing the desired RL algorithm and settings its hyperparameters (see above).

* `'pre_trained_model'`: a pre-trained expert must have a saved `stable_baselines` model file at `EXPERT_MODEL_PATH`. See the [saving guide](https://stable-baselines.readthedocs.io/en/master/guide/save_format.html) for info on how to do that.

* `'traces_only'`: demonstration traces must reside in the `npz` archive at `EXPERT_MODEL_TRACES_PATH` and folow the format needed by `stable_baselines.gail.ExpertDataset()`.


<br>

--- 

<br>

**From the docs**:

*The expert dataset is a .npz archive. The data is saved in python dictionary format with keys: actions, episode_returns, rewards, obs, episode_starts.*

*In case of images, obs contains the relative path to the images.*

*obs, actions: shape (N * L, ) + S*

*where N = # episodes, L = episode length and S is the environment observation/action space.*

*S = (1, ) for discrete space*

<br>

In [30]:
if expert_format == 'pre_trained_model':
    expert_model = PPO2.load(EXPERT_MODEL_PATH)
    expert_model.set_env(env)

elif expert_format == 'learn_the_model':
    expert_hparam = hyperparams['expert_ppo2']

    expert_model = PPO2(expert_hparam['policy'], env,
                        cliprange=expert_hparam['cliprange'],
                        ent_coef=expert_hparam['ent_coef'],
                        gamma=expert_hparam['gamma'],
                        lam=expert_hparam['lam'],
                        learning_rate=expert_hparam['learning_rate'],
                        n_steps=expert_hparam['n_steps'],
                        nminibatches=expert_hparam['nminibatches'],
                        noptepochs=expert_hparam['noptepochs'],
                        verbose=0,
                        tensorboard_log=EXPERT_LOG_DIR)

    # always use deterministic actions for live evaluation
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path=EXPERT_LOG_DIR,
                                 log_path=EXPERT_LOG_DIR, eval_freq=500,
                                 deterministic=True, render=False)

    # while evaluate the model on a new environment and save the best one
    # periodically
    expert_model.learn(total_timesteps=expert_hparam['n_timesteps'],
                       callback=eval_callback)
    
    # now save the final model so if we like it, we don't need to re-learn it
    # The model will be saved under $EXPERT_MODEL_EXPER_MODEL_PATH.zip
    expert_model.save(EXPERT_MODEL_PATH)

If we now have an expert model, sample trajectories from it and save them as demonstrations for the imitation learner. If you just provide traces, we will do nothing and assume the traces exist at `EXPERT_MODEL_TRACES_PATH`.

In [34]:
if expert_format != 'traces_only':
    
    # vectorized environments do not work with generate_expert_traj, not sure
    # why. Seems to be that threads are probably not synced properly somehow
    #
    # The evaluation environment already must only have a singular env, so
    # use it for trace generation.
    expert_model.set_env(eval_env)
    
    # generate trajectories in the environment under the expert_model
    generate_expert_traj(expert_model, EXPERT_MODEL_TRACES_PATH,
                         n_episodes=100);
    
# Load the expert dataset
dataset = ExpertDataset(expert_path=EXPERT_MODEL_TRACES_PATH, verbose=1)

actions (1417, 1)
obs (1417, 4)
rewards (1417, 1)
episode_returns (100,)
episode_starts (1417,)
Total trajectories: -1
Total transitions: 1417
Average returns: 0.9395714378356934
Std for returns: 0.09469160588637068


### Training the imitation learner from the expert data

In [33]:
%autoreload 2

gail_model = GAIL('MlpPolicy', env, dataset, verbose=1,
                  tensorboard_log=LEARNER_LOG_DIR)

# Note: in practice, you need to train for 1M steps to have a working policy
gail_model.learn(total_timesteps=100_000)
gail_model.save()

NameError: name 'expert_model_path' is not defined

### Visualizing Learning

#### Visualizing the Learned Expert on the Original Domain

In [None]:
%autoreload 2

record_video(expert_model, eval_env=vec_env, max_video_length=500,
             video_prefix=f'ppo2_expert_{ENV_ID}')
show_videos('videos', prefix='ppo2_expert')

Evaluating the performance of the learned expert:

In [None]:
mean_reward, std_reward = evaluate_policy(expert_model, vec_env,
                                          n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

#### Visualizing the Imitation Learner on the Orginal Domain

In [None]:
record_video(gail_model, eval_env=vec_env, max_video_length=500,
             video_prefix=f'gail_{ENV_ID}')
show_videos('videos', prefix='gail')

Evaluating the performance of the learned expert:

In [None]:
mean_reward, std_reward = evaluate_policy(gail_model, vec_env,
                                          n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")