# Meta learning shared hierarchies

Here is the [link](https://arxiv.org/abs/1710.09767) to the original paper

This notebook follows the OpenAI reference implementation, with one difference:

* we do not use MPI, to keep the code easier to follow

In [1]:
# import all the modules .... 
import tensorflow as tf
import gym
import gym.spaces
import os
import numpy as np
import sys
import subprocess

# OpenAI baselines implementation
from distributions import make_pdtype
import tf_util as U

In [2]:
# --- Run bookkeeping -------------------------------------------------------
M_SEED = 1401          # random seed for the run
NUM_TIMESTEPS = 1e9    # total timestep budget (kept as a float)

# --- Output locations ------------------------------------------------------
# LOG_DIR is wiped and recreated by the training cell further down.
LOG_DIR = "/tmp/mlsh_train"
CKPT_DIR = "/tmp/mlsh_ckpt"

# --- Environment -----------------------------------------------------------
gym_env_name = 'CartPole-v1'

# --- Device settings -------------------------------------------------------
# NOTE: change the values below to match your machine.
NUM_GPU = 0
GLOBAL_STEP_DEVICE = '/cpu:0'
DEVICE_PREFIX = '/device:'    # probably only needed when running on GPU
LOG_DEVICE_PLACEMENT = True   # whether TF should log op -> device placement


In [13]:
# Hyperparameters specific to the hierarchical (master / sub-policy) setup.
# TODO: the first three values were picked without a clear rule -- how should
#       the number of sub-policies, the duration and the rollout count be chosen?
num_subpolicies = 2    # number of sub-policies the master selects between
pri_duration = 1000    # presumably how long one master choice is held -- confirm
num_rollouts = 2000    # number of rollouts

# Training schedule (presumably master-only warm-up followed by joint
# training, as in the MLSH paper -- confirm once the Learner is implemented).
warmup_time = 20
train_time = 30
num_batches = 15

replay = False

In [4]:
# Session configuration. Device-placement logging is driven by the
# LOG_DEVICE_PLACEMENT constant from the config cell instead of a literal,
# so it can be switched off in one place.
tf_config = tf.ConfigProto(log_device_placement=LOG_DEVICE_PLACEMENT)

<img src="files/networks.png">

### Next we are going to implement the policy for our master action

Each policy's parameters will be optimized with [Proximal Policy Optimization](https://arxiv.org/abs/1707.06347)

In [20]:
class Policy(object):
    """
    Master policy: an MLP that chooses which sub-policy to run, together with
    a separate MLP value function over the same observation.

    At the moment only an MLP policy is used (can switch to a CNN when needed).
    Would love to try a Bayesian policy in here. (what's the benefit?)

    Args:
        name - tf variable scope under which all variables are created
        observation - tf placeholder observation tensor; `act` / `debug` feed
            it a single observation with a leading batch axis added
        action_space - gym env action space. Kept for interface compatibility;
            the master distribution is built over sub-policy indices, not
            over environment actions
        hidden_size - number of units in each hidden fc layer
        num_hidden_layers - number of hidden fc layers in each network
        num_subpolicies - number of sub-policies the master chooses between
        gaussian_fixed_var - stored on the instance; not used by the master
    """
    def __init__(self,
                 name,
                 observation,
                 action_space,
                 hidden_size,  # Fully-connected Layer hidden-layer units
                 num_hidden_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True
                 ):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.namescope = tf.get_variable_scope().name
            # NOTE: the original implementation normalizes observations with a
            # running mean / std. That is not done here (open question: why is
            # a running estimate needed?).

            # Value function network: estimates the expected return.
            last_output = observation
            for i in range(num_hidden_layers):
                last_output = tf.layers.dense(last_output,
                                              hidden_size,
                                              name='vfn_fc%i' % (i + 1),
                                              activation=tf.nn.tanh)
            self.value_pred = tf.layers.dense(last_output, 1, name='vfn_final')

            # Master network: the master acts by choosing a sub-policy, so its
            # output has one entry per sub-policy.
            last_output = observation
            for i in range(num_hidden_layers):
                last_output = tf.layers.dense(last_output,
                                              hidden_size,
                                              name='master_%i' % (i + 1),
                                              activation=tf.nn.tanh)
            # No activation on the final layer: despite the attribute name,
            # these are unnormalized scores (logits), one per sub-policy.
            self.policy_selector_prob = tf.layers.dense(last_output,
                                                        num_subpolicies,
                                                        name='master_final')

            # The master's "action" is a sub-policy index, so the distribution
            # must be discrete over `num_subpolicies` choices regardless of the
            # environment's own action space (which may be continuous or have a
            # different number of actions).
            pdtype = make_pdtype(gym.spaces.Discrete(num_subpolicies))
            self.pdtype = pdtype
            self.pd = pdtype.pdfromflat(self.policy_selector_prob)

        # Scope filter with a trailing '/': tf.get_collection filters by
        # prefix, so without it a sibling scope that merely starts with the
        # same characters would be matched as well.
        self._scope_prefix = self.namescope + '/'

        # Sample (stochastic=True) or take the mode (stochastic=False).
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        action = tf.cond(stochastic, lambda: self.pd.sample(), lambda: self.pd.mode())
        # i.e. function([placeholders], [outputs])
        self._act = U.function([stochastic, observation],
                               [action, self.value_pred])

        # debug: same inputs, but returns the raw selector output
        self._debug = U.function([stochastic, observation],
                                 [action, self.policy_selector_prob])

        # Build the re-initialization op once; creating it inside reset()
        # would add a new op to the graph on every call.
        self._reset_op = tf.variables_initializer(self.get_trainable_variables())

    def act(self, stochastic, observation):
        """
        Choose a sub-policy for a single observation.

        Args:
            stochastic - bool, sample from the distribution if True, else take its mode
            observation - a single (unbatched) observation; a batch axis is added here

        Returns:
            (selected sub-policy, value prediction) for that observation,
            i.e. the first batch entry of each network output.
        """
        act1, v_pred1 = self._act(stochastic, observation[None])
        return act1[0], v_pred1[0]

    def get_variables(self):
        """Return all global variables created under this policy's scope."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self._scope_prefix)

    def get_trainable_variables(self):
        """Return the trainable variables created under this policy's scope."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self._scope_prefix)

    def reset(self):
        """Re-initialize this policy's trainable variables in the default session."""
        tf.get_default_session().run(self._reset_op)

    # debug

    def debug(self, stochastic, observation):
        """
        Inspect the master's raw output for a single observation.

        Returns the selector output (one unnormalized score per sub-policy)
        for that observation; the sampled action is discarded.
        """
        _, selection = self._debug(stochastic, observation[None])
        return selection[0]

In [50]:
class SubPolicy(object):
    """
    Sub-policy: an MLP that outputs a distribution over environment actions,
    together with a separate MLP value function over the same observation.

    Args:
        name - tf variable scope under which all variables are created
        observation - tf placeholder observation tensor; `act` feeds it a
            single observation with a leading batch axis added
        action_space - gym env action space; determines the distribution type
        hidden_size - number of units in each hidden fc layer
        num_hidden_layers - number of hidden fc layers in each network
        gaussian_fixed_var - if True and `action_space` is a gym Box, the
            log-std is a standalone trainable variable instead of an output
            of the network
    """
    def __init__(self,
                 name,
                 observation,
                 action_space,
                 hidden_size,
                 num_hidden_layers,
                 gaussian_fixed_var=True):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        with tf.variable_scope(name):
            self.namescope = tf.get_variable_scope().name
            # Scope filter with a trailing '/': tf.get_collection filters by
            # prefix, so without it 'subpolicy_1' would also match the
            # variables of 'subpolicy_10', 'subpolicy_11', ...
            self._scope_prefix = self.namescope + '/'

            # NOTE: the original implementation normalizes observations with a
            # running mean / std. That is not done here (open question: why is
            # a running estimate needed?).

            # value function
            last_out = observation
            for i in range(num_hidden_layers):
                last_out = tf.layers.dense(last_out,
                                           hidden_size,
                                           activation=tf.nn.tanh,
                                           name='vfn_%i' % (i + 1))
            self.value_pred = tf.layers.dense(last_out, 1, name='vfn_final')

            # sub policy fn
            pdtype = make_pdtype(action_space)
            paramshape = pdtype.param_shape()[0]
            self.pdtype = pdtype
            last_out = observation
            for i in range(num_hidden_layers):
                last_out = tf.layers.dense(last_out,
                                           hidden_size,
                                           activation=tf.nn.tanh,
                                           name='policy_%i' % (i + 1))
            if gaussian_fixed_var and isinstance(action_space, gym.spaces.Box):
                # The flat parameter vector is [mean, logstd]: the network
                # produces the first half, the second half is a free variable.
                mean = tf.layers.dense(last_out,
                                       paramshape // 2,
                                       name='policy_final')
                logstd = tf.get_variable(name='logstd',
                                         shape=[1, paramshape // 2],
                                         initializer=tf.zeros_initializer())
                # `mean * 0.0 + logstd` broadcasts the [1, k] logstd to the
                # batch shape of `mean` so the two can be concatenated.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out,
                                          paramshape,
                                          name='policy_final')

            self.pd = pdtype.pdfromflat(pdparam)

            # Sample (stochastic=True) or take the mode (stochastic=False).
            stochastic = tf.placeholder(dtype=tf.bool, shape=())
            action = tf.cond(stochastic, lambda: self.pd.sample(), lambda: self.pd.mode())
            # i.e. function([placeholders], [outputs])
            self._act = U.function([stochastic, observation],
                                   [action, self.value_pred])

        # Build the re-initialization op once; creating it inside reset()
        # would add a new op to the graph on every call.
        self._reset_op = tf.variables_initializer(self.get_trainable_variables())

    def act(self, stochastic, observation):
        """
        Choose an action for a single observation.

        Args:
            stochastic - bool, sample from the distribution if True, else take its mode
            observation - a single (unbatched) observation; a batch axis is added here

        Returns:
            (action, value prediction) for that observation, i.e. the first
            batch entry of each network output.
        """
        act1, v_pred1 = self._act(stochastic, observation[None])
        return act1[0], v_pred1[0]

    def get_variables(self):
        """Return all global variables created under this sub-policy's scope."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self._scope_prefix)

    def get_trainable_variables(self):
        """Return the trainable variables created under this sub-policy's scope."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self._scope_prefix)

    def reset(self):
        """Re-initialize this sub-policy's trainable variables in the default session."""
        tf.get_default_session().run(self._reset_op)

In [57]:
class Learner:
    """Placeholder for the learner (agent); not implemented yet."""

    def __init__(self, env, master_pol, master_oldpol, subpolicies, old_subpolicies,
                 clip_param=0.2, entropy_coeff=0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64):
        """
        Accept the learner's inputs. Currently a stub: no argument is stored
        or used.

        Args:
            env - environment
            master_pol - master policy
            master_oldpol - old master policy
            subpolicies - list of sub-policies
            old_subpolicies - list of old sub-policies
            clip_param, entropy_coeff, optim_epochs, optim_stepsize,
            optim_batchsize - optimisation settings (presumably PPO
                hyperparameters -- confirm once implemented)
        """

In [54]:
# Start every run from an empty log directory.
if tf.gfile.Exists(LOG_DIR):
    tf.gfile.DeleteRecursively(LOG_DIR)
tf.gfile.MakeDirs(LOG_DIR)

tf.reset_default_graph()

# Seed every source of randomness so that Restart & Run All is reproducible.
# The graph-level TF seed is set after the reset and before any op is built,
# because it only applies to ops created afterwards in the default graph.
tf.set_random_seed(M_SEED)
np.random.seed(M_SEED)

env = gym.make(gym_env_name)
try:
    env.seed(M_SEED)
    observ_space = env.observation_space
    action_space = env.action_space

    with tf.Session(config=tf_config) as sess:
        observ_holder = tf.placeholder(dtype=tf.float32,
                                       name='observ',
                                       shape=[None, observ_space.shape[0]])

        # Every network (master / sub-policy, current / old) shares the same
        # input placeholder and the same architecture.
        net_kwargs = dict(observation=observ_holder,
                          action_space=action_space,
                          hidden_size=32,
                          num_hidden_layers=2)

        # PPO master policies (current and old copy)
        policy = Policy(name='master_policy',
                        num_subpolicies=num_subpolicies,
                        **net_kwargs)
        old_policy = Policy(name='old_master_policy',
                            num_subpolicies=num_subpolicies,
                            **net_kwargs)

        # PPO subpolicies (current and old copy of each)
        subpolicies = []
        old_subpolicies = []
        for i in range(num_subpolicies):
            subpolicies.append(SubPolicy(name='subpolicy_%i' % i, **net_kwargs))
            old_subpolicies.append(SubPolicy(name='old_subpolicy_%i' % i, **net_kwargs))

        # create learner (i.e. agent)
finally:
    # Release the environment even if graph construction fails.
    env.close()
    

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
