## Testing GAIL on a Safety GridWorld Environment

### Defining the RL environment to test on.

We use a [grid world with lava](https://github.com/maximecb/gym-minigrid#distributional-shift-environment), based on the DeepMind RL Safety Envs, to test how well the learned policy generalizes.

In [96]:
%load_ext autoreload

import os
from pathlib import Path

import gym
from gym import spaces

import gym_minigrid
from gym_minigrid.wrappers import FullyObsWrapper, FlatObsWrapper
from gym_minigrid.minigrid import WorldObj, IDX_TO_OBJECT, DIR_TO_VEC

from stable_baselines import GAIL, PPO2
from stable_baselines.gail import ExpertDataset, generate_expert_traj
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

from util.stable_baseline_viz import show_videos, record_video
import numpy as np

# environment IDs and file-path constants
ENV = 'MiniGrid-DistShift1-v0'

# a slightly different layout, kept aside for later generalization tests
EVAL_ENV = 'MiniGrid-DistShift2-v0'

# everything produced by this notebook lives under SIM_HOME
SIM_HOME = 'gail_data'
EXPERT_DIR = os.path.join(SIM_HOME, 'expert_data')
LEARNER_DIR = os.path.join(SIM_HOME, 'learner_data')

# create the output directories (and any missing parents) up front;
# already-existing directories are left untouched
os.makedirs(EXPERT_DIR, exist_ok=True)
os.makedirs(LEARNER_DIR, exist_ok=True)

class FlatObsStateWrapper(gym.core.ObservationWrapper):
    """
    Replaces the wrapped environment's observation with a flat 1-D state
    vector: ``[pos_x, pos_y, direction, state_label_ID]``.

    ``state_label_ID`` is the object-type index (a key of ``IDX_TO_OBJECT``)
    of the grid cell the agent is currently standing on.
    """

    def __init__(self, env):
        """
        :param env: the grid-world environment to wrap; it must expose
            ``width``, ``height``, ``agent_pos``, ``agent_dir`` and ``grid``
        """
        super().__init__(env)

        # read geometry from the innermost env so that this still works when
        # `env` has already been wrapped (e.g. by a time-limit wrapper)
        base_env = self.unwrapped

        # the width / height include 1 cell padding on either side, so
        # an agent coordinate will be from 1 -- [(width/height) - 2]
        agent_max_x = base_env.width - 2
        agent_max_y = base_env.height - 2
        possible_state_label_IDs = list(IDX_TO_OBJECT.keys())
        possible_agent_direction_vectors = DIR_TO_VEC

        self.observation_space = spaces.Box(
            # every component of the state vector is a non-negative index /
            # coordinate, so the lowest observed value is 0
            low=0,
            # x, y, direction and label share a single scalar bound, so the
            # bound is the max over all types of vars in the state.
            # NOTE(review): the direction is presumably an index into
            # DIR_TO_VEC (0 .. len - 1), which makes len(...) a safe but
            # slightly loose upper bound -- kept as-is on purpose.
            high=np.max([agent_max_x, agent_max_y,
                         np.max(possible_state_label_IDs),
                         len(possible_agent_direction_vectors)]),
            # we're going to store the state as:
            #     [pos_x, pos_y, direction, state_label_ID]
            shape=(4,),
            # this needs to be float, otherwise the GAIL code breaks
            dtype=np.float32
        )

    def observation(self, obs):
        """
        Build the flat state vector for the current step.

        :param obs: the wrapped environment's observation. It is ignored:
            the state is read directly from the environment's attributes.
        :return: float32 array ``[pos_x, pos_y, direction, state_label_ID]``
        """
        # Use the environment wrapped by *this* instance. Reading a
        # module-level `env` instead would make every instance report the
        # state of whichever environment that global happens to point to.
        base_env = self.unwrapped

        agent_x = base_env.agent_pos[0]
        agent_y = base_env.agent_pos[1]

        # here we're going to extract the MDP state label for the current
        # grid location to help the learner generalize
        #
        # IDX_TO_OBJECT[state_label_ID] is something like "lava" or "goal"
        encoded_grid = base_env.grid.encode()
        state_label_ID, _, _ = encoded_grid[agent_x, agent_y]

        return np.array([agent_x,
                         agent_y,
                         base_env.agent_dir,
                         state_label_ID], dtype=np.float32)

# Build the training environment: replace MiniGrid's observation with the
# flat state vector, then vectorize and normalize it so that it is
# compatible with stable-baselines
env = FlatObsStateWrapper(gym.make(ENV))
vec_env = VecNormalize(DummyVecEnv([lambda: env]))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Generate expert trajectories (train expert)

Use an off-the-shelf RL policy to be the "expert" to imitate.

In [97]:
# where the recorded expert trajectories are written
expert_model_path = os.path.join(EXPERT_DIR, f'expert_{ENV}.npz')

# PPO2 plays the role of the "expert" whose behavior GAIL will imitate
expert_model = PPO2(policy='MlpPolicy', env=vec_env, verbose=1,
                    tensorboard_log='logs')

# train the expert for n_timesteps, then record n_episodes of its rollouts;
# the trailing ';' keeps the notebook from echoing the returned value
generate_expert_traj(model=expert_model, save_path=expert_model_path,
                     n_timesteps=100, n_episodes=10);

actions (765, 1)
obs (765, 4)
rewards (765, 1)
episode_returns (10,)
episode_starts (765,)


### Training the imitation learner from the expert data

In [99]:
%autoreload 2

# Wrap the recorded expert trajectories and build the GAIL learner on the
# same (vectorized, normalized) training environment
dataset = ExpertDataset(expert_path=expert_model_path, verbose=1)
gail_model = GAIL(policy='MlpPolicy', env=vec_env, expert_dataset=dataset,
                  verbose=1, tensorboard_log='logs')

# Note: 100 timesteps is only a smoke test -- in practice you need to train
# for around 1M steps to get a working policy
gail_model.learn(total_timesteps=100)

# persist the learned imitation policy next to the other learner artifacts
gail_model.save(os.path.join(LEARNER_DIR, f'gail_learned_model_{ENV}'))

actions (765, 1)
obs (765, 4)
rewards (765, 1)
episode_returns (10,)
episode_starts (765,)
Total trajectories: -1
Total transitions: 765
Average returns: 0.0
Std for returns: 0.0
********** Iteration 0 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 2.484 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.077 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0    0.00626          0
         1    0.00103      0.261
         2   4.54e-05      0.411
         3   3.83e-05      0.419
         4   3.28e-05      0.556
         5   2.64e-06      0.624
         6   3.53e-08      0.625
         7   1.73e-07      0.626
         8   4.18e-09      0.626
         9   3.71e-10      0.626
        10   4.91e-11      0.626
[35mdone in 0.123 seconds[0m
Expected: 0.024 Actual: 0.024
Stepsize OK!
[35mvf[0m
[35mdone in 0.079 seconds[0m
[35msampling[0m
[35mdone in 2.390 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.008 seconds[0m
[35mconjugate_gra

### Visualizing Learning

#### Visualizing the Learned Expert on the Original Domain

In [100]:
%autoreload 2

# Record the PPO2 expert acting in the training environment (vec_env).
# NOTE(review): max_video_length is presumably a cap on recorded steps --
# confirm against util.stable_baseline_viz.record_video.
record_video(expert_model, eval_env=vec_env, max_video_length=500,
             video_prefix='ppo2_expert_minigw')
# 'ppo2' is a prefix of the video_prefix used above, so this presumably
# selects the expert recording(s) in the 'videos' folder for display.
show_videos('videos', prefix='ppo2')

#### Visualizing the Imitation Learner on the Original Domain

In [101]:
# Record the GAIL imitation learner acting in the same training environment
# (vec_env) that the expert was recorded on.
# NOTE(review): max_video_length is presumably a cap on recorded steps --
# confirm against util.stable_baseline_viz.record_video.
record_video(gail_model, eval_env=vec_env, max_video_length=500,
             video_prefix='gail_minigw')
# 'gail' is a prefix of the video_prefix used above, so this presumably
# selects the learner recording(s) in the 'videos' folder for display.
show_videos('videos', prefix='gail')