## Testing GAIL on a Safety GridWorld Environment

### Defining the RL environment to test on.

We use a [grid world with lava](https://github.com/maximecb/gym-minigrid#distributional-shift-environment), based on the DeepMind RL Safety Envs, to test generalizability.

In [5]:
%load_ext autoreload

import os
from pathlib import Path

import gym
from gym import spaces
import gym_minigrid
from gym_minigrid.wrappers import FullyObsWrapper, FlatObsWrapper

from stable_baselines import GAIL, PPO2
from stable_baselines.gail import ExpertDataset, generate_expert_traj
from stable_baselines.common.vec_env import DummyVecEnv

from util.stable_baseline_viz import show_videos, record_video
import numpy as np

# filepath constants
ENV = 'MiniGrid-DistShift1-v0'

# a slightly different env, kept for later testing of generalization
EVAL_ENV = 'MiniGrid-DistShift2-v0'

# everything is stored under SIM_HOME, with one subdirectory for the
# expert's data and one for the learner's data
SIM_HOME = 'gail_data'
EXPERT_DIR = os.path.join(SIM_HOME, 'expert_data')
LEARNER_DIR = os.path.join(SIM_HOME, 'learner_data')

# create both data directories (and any missing parents) up front; it is
# not an error if they already exist
for _data_dir in (EXPERT_DIR, LEARNER_DIR):
    Path(_data_dir).mkdir(parents=True, exist_ok=True)

class FlatObsStateWrapper(gym.core.ObservationWrapper):
    """
    Returns the agent's position and direction as 1-D vector

    Every observation produced by the wrapped env is replaced with
    ``[pos_x, pos_y, direction]`` (float32), read from the ``agent_pos``
    and ``agent_dir`` attributes of the underlying env.
    """

    def __init__(self, env):
        """
        :param env: the env to wrap; its underlying (unwrapped) env must
            expose ``width``, ``height``, ``agent_pos`` and ``agent_dir``
        """
        super().__init__(env)

        # grid attributes live on the base env, so read them from there
        # regardless of how many other wrappers sit in between
        base_env = self.unwrapped

        # the width / height include 1 cell padding on either side, so
        # an agent coordinate will be from 1 -- [(width/height) - 2]
        agent_max_x = base_env.width - 2
        agent_max_y = base_env.height - 2

        self.observation_space = spaces.Box(
            # positions start at 1 and the direction starts at 0, so the
            # lowest observed value in the state vector is 0
            low=0,
            # we are keeping x and y as separate coordinates of the state
            # so the max value is the max of all types of vars in the state
            high=max(agent_max_x, agent_max_y),
            # we're going to store the state as [pos_x, pos_y, direction]
            shape=(3,),
            # this needs to be float or the GAIL code will not accept it
            dtype='float32'
        )

    def observation(self, obs):
        """
        Build the flat state vector for the current step.

        :param obs: the observation emitted by the wrapped env; it is
            ignored, because the state is read directly from the env
        :return: float32 array ``[pos_x, pos_y, direction]``
        """

        # read the state from the env wrapped by *this* instance rather than
        # from a module-level ``env`` global, so that several wrapped envs
        # (e.g. a train env and an eval env) each report their own state
        base_env = self.unwrapped

        return np.array([base_env.agent_pos[0],
                         base_env.agent_pos[1],
                         base_env.agent_dir], dtype=np.float32)

# Build the training env: the raw MiniGrid env is wrapped so that it emits
# the flat [pos_x, pos_y, direction] state vector, and is then placed in a
# single-env DummyVecEnv to make it compatible with stable-baselines
env = FlatObsStateWrapper(gym.make(ENV))
vec_env = DummyVecEnv([lambda: env])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




### Generate expert trajectories (train expert)

Use an off-the-shelf RL policy to be the "expert" to imitate.

In [6]:
# Path of the .npz file the expert's recorded trajectories are written to;
# the same path is later passed to ExpertDataset to load them back.
expert_model_path = os.path.join(EXPERT_DIR, f'expert_{ENV}.npz')
# A PPO2 agent with an MLP policy plays the role of the "expert".
expert_model = PPO2('MlpPolicy', env, verbose=1, tensorboard_log='logs')
# Record 10 episodes from the expert into expert_model_path.
# NOTE(review): presumably n_timesteps is the number of steps the expert is
# trained for before recording -- confirm against the stable-baselines docs.
# If so, 100 steps is far too few for a competent expert (the logged average
# return of the recorded data is 0.0).
# The trailing semicolon keeps the notebook from echoing the return value.
generate_expert_traj(expert_model, expert_model_path, n_timesteps=100,
                     n_episodes=10);

Wrapping the env in a DummyVecEnv.
actions (860, 1)
obs (860, 3)
rewards (860, 1)
episode_returns (10,)
episode_starts (860,)


### Training the imitation learner from the expert data

In [7]:
%autoreload 2

# Load the expert trajectories recorded earlier (the .npz file at
# expert_model_path) and define the GAIL model that will imitate them
dataset = ExpertDataset(expert_path=expert_model_path, verbose=1)
model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log='logs')

# Note: in practice, you need to train for 1M steps to have a working policy;
# 100 timesteps only checks that the pipeline runs end to end
model.learn(total_timesteps=100)
# Save the learner under LEARNER_DIR, with the training env id in the name
model.save(os.path.join(LEARNER_DIR, f'gail_learned_model_{ENV}'))

actions (860, 1)
obs (860, 3)
rewards (860, 1)
episode_returns (10,)
episode_starts (860,)
Total trajectories: -1
Total transitions: 860
Average returns: 0.0
Std for returns: 0.0
********** Iteration 0 ************
Optimizing Policy...
[35msampling[0m
[35mdone in 2.008 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.078 seconds[0m
[35mconjugate_gradient[0m
      iter residual norm  soln norm
         0     0.0221          0
         1    0.00285     0.0698
         2   0.000635      0.289
         3    0.00481      0.363
         4   0.000709      0.695
         5    0.00183       1.03
         6   2.56e-05       1.08
         7   0.000434       1.14
         8   6.47e-05       1.17
         9   1.04e-06        1.2
        10   4.98e-06        1.2
[35mdone in 0.122 seconds[0m
Expected: 0.030 Actual: 0.030
Stepsize OK!
[35mvf[0m
[35mdone in 0.074 seconds[0m
[35msampling[0m
[35mdone in 2.003 seconds[0m
[35mcomputegrad[0m
[35mdone in 0.007 seconds[0m
[35mconjugate_gra

### Visualizing Learning

#### Visualizing the Expert on the Original Domain

In [8]:
%autoreload 2

# Record the expert acting in the vectorized training env (vec_env, i.e. the
# original domain ENV). Per the logged output, the video is written to the
# 'videos' directory as gail_minigw-step-0-to-step-500.mp4.
record_video(expert_model, eval_env=vec_env, video_length=500,
             video_prefix='gail_minigw')
# NOTE(review): presumably displays every video in 'videos' whose filename
# starts with 'gail' (which includes the one recorded above) -- confirm
# against util.stable_baseline_viz.
show_videos('videos', prefix='gail')

Saving video to  /home/ferg/GAIL-Formal_Methods/videos/gail_minigw-step-0-to-step-500.mp4
