# Meta learning shared hierarchies

Here is the [link](https://arxiv.org/abs/1710.09767) to the original paper

This notebook follows the OpenAI implementation, with one difference:

* MPI is not used, which keeps the code easier to follow

### Prerequisites

* Set up the `PYTHONPATH` as specified in the [OpenAI code](https://github.com/openai/mlsh)

In [1]:
# import all the modules .... 
import tensorflow as tf
import gym
import gym.spaces

import os
import numpy as np
import sys
import subprocess
import multiprocessing
import math
import time

# OpenAI baselines implementation
from distributions import make_pdtype
import tf_util as U
import misc_util as M
import logger as L
import dataset as D

# meta-learning original code testing environment
import test_envs

In [2]:
# ---- Environment -----------------------------------------------------------
# Name handed to gym.make(); one of the test environments shipped by OpenAI.
gym_env_name = 'MovementBandits-v0'

# ---- Output locations ------------------------------------------------------
LOG_DIR = "/tmp/mlsh_log"
CKPT_DIR = "/tmp/mlsh_ckpt"

# ---- Resuming --------------------------------------------------------------
# Set continue_train to True to resume; CKPT_ITERATION is the outer-loop
# iteration whose checkpoint the run continues from.
continue_train = False
CKPT_ITERATION = 0

# ---- Run-wide settings -----------------------------------------------------
M_SEED = 1401
NUM_TIMESTEPS = 1e9

# ---- Device settings: adjust to the local machine --------------------------
NUM_GPU = 0
DEVICE_PREFIX = '/device:'  # probably only relevant when a GPU is used
GLOBAL_STEP_DEVICE = '/cpu:0'
LOG_DEVICE_PLACEMENT = False


In [3]:
# cpu_count() is halved on the assumption that hyper-threading doubles the
# reported number (adjust if that does not hold for your machine). The result
# is clamped to 1 so a single-CPU machine does not end up with zero workers.
n_cpu = max(1, multiprocessing.cpu_count() // 2)

In [4]:
# Hierarchical-RL specific settings. How best to choose the first three is
# still an open question for this notebook.

# Number of sub-policies the master policy chooses between.
num_subpolicies = 2
# Number of environment steps for which one master choice stays active.
macro_duration = 10
# Number of environment steps collected per rollout segment.
num_rollouts = 2000

# Number of minibatches drawn per epoch when updating a sub-policy.
num_batches = 15

# Each task runs warmup_time + train_time mini-episodes; sub-policies are only
# updated once the mini-episode counter has reached warmup_time.
warmup_time = 20
train_time = 1

# When True the rollout generator renders the environment.
replay = True

<img src="networks.png">

### Policy implementations

* A master policy that chooses which sub-policy acts
* Sub-policies that produce the environment actions
* A learner that manages the losses and gradient updates
* Each policy's parameters are optimized with [Proximal Policy Optimization](https://arxiv.org/abs/1707.06347)

In [5]:
class Policy(object):
    """
    Master policy: one MLP scores the sub-policies and a second, separate MLP
    predicts the value of the observation. Only MLPs are supported for now.

    Args:
        name - for the tf variable scope
        observation - tf placeholder observation tensor
        action_space - gym env action_space; accepted for interface
            compatibility only, the master acts over sub-policy indices
        hidden_size - fc layer hidden units
        num_hidden_layers - number of hidden fc layers in each network
        num_subpolicies - number of sub-policies the master chooses between
        gaussian_fixed_var - stored on the instance; unused by the master
    """
    def __init__(self,
                 name,
                 observation,
                 action_space,
                 hidden_size, # Fully-connected Layer hidden-layer units
                 num_hidden_layers,
                 num_subpolicies,
                 gaussian_fixed_var=True
                ):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_subpolicies = num_subpolicies
        self.gaussian_fixed_var = gaussian_fixed_var
        # Built lazily by reset() and then reused.
        self._reset_op = None

        with tf.variable_scope(name):
            self.namescope = tf.get_variable_scope().name

            # TODO: normalise observations with a running mean and std.

            # value-function network
            last_output = observation
            for i in range(num_hidden_layers):
                last_output = tf.layers.dense(last_output,
                                              hidden_size,
                                              name='vfn_fc%i'%(i+1),
                                              activation=tf.nn.tanh)
            # estimated value, shape [batch, 1]
            self.value_pred = tf.layers.dense(last_output, 1, name='vfn_final')

            # master network: one output per sub-policy
            last_output = observation
            for i in range(num_hidden_layers):
                last_output = tf.layers.dense(last_output,
                                              hidden_size,
                                              name='master_%i'%(i+1),
                                              activation=tf.nn.tanh)
            # No activation, so these are unnormalised scores.
            self.policy_selector_prob = tf.layers.dense(last_output,
                                                        num_subpolicies,
                                                        name='master_final')

            # The master's "action" is a sub-policy index, so its distribution
            # is categorical over num_subpolicies whatever the env's action
            # space is. Building it from the env action space would split the
            # num_subpolicies outputs into mean/std for a Box space.
            pdtype = make_pdtype(gym.spaces.Discrete(num_subpolicies))
            self.pdtype = pdtype
            self.pd = pdtype.pdfromflat(self.policy_selector_prob)

        # Snapshot this policy's own variables now. tf.layers.dense registers
        # its weights in GLOBAL_VARIABLES (not MODEL_VARIABLES), and optimizer
        # slot variables created later share the scope prefix, so a lazy
        # collection lookup would return either nothing or too much.
        self._variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            self.namescope + '/')

        # sample actions
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        action = tf.cond(stochastic, lambda: self.pd.sample(), lambda: self.pd.mode())
        # i.e. function([placeholders], [outputs])
        self._act = U.function([stochastic, observation],
                               [action, self.value_pred])

        # debug
        self._debug = U.function([stochastic, observation],
                                 [action, self.policy_selector_prob])

    def act(self, stochastic, observation):
        """
        Choose a sub-policy for one unbatched observation.

        Returns the chosen sub-policy index and the value prediction for the
        observation (the first row of the batched outputs).
        """
        act1, v_pred1 = self._act(stochastic, observation[None])
        return act1[0], v_pred1[0]

    def get_variables(self):
        """Return the variables created by this policy (a new list)."""
        return list(self._variables)

    def get_trainable_variables(self):
        """Return the trainable variables found under this policy's scope."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.namescope)

    def reset(self):
        """
        Re-initialise the trainable variables in the default session.

        The initializer op is created once and cached, so repeated resets do
        not keep adding ops to the graph.
        """
        if self._reset_op is None:
            self._reset_op = tf.variables_initializer(self.get_trainable_variables())
        tf.get_default_session().run(self._reset_op)

    # debug

    def debug(self, stochastic, observation):
        """
        Return the raw master_final outputs (one score per sub-policy) for a
        single unbatched observation.
        """
        _, selection = self._debug(stochastic, observation[None])
        return selection[0]

In [6]:
class SubPolicy(object):
    """
    Sub-policy: one MLP produces the parameters of the action distribution
    and a second, separate MLP predicts the value of the observation.

    Args:
        name - for the tf variable scope
        observation - tf placeholder observation tensor
        action_space - gym action space the sub-policy acts in
        hidden_size - fc layer hidden units
        num_hidden_layers - number of hidden fc layers in each network
        gaussian_fixed_var - for Box action spaces, use a log-std variable
            that does not depend on the observation
    """
    def __init__(self,
                 name,
                 observation,
                 action_space,
                 hidden_size,
                 num_hidden_layers,
                 gaussian_fixed_var=True):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.gaussian_fixed_var = gaussian_fixed_var
        # Built lazily by reset() and then reused.
        self._reset_op = None

        with tf.variable_scope(name):
            self.namescope = tf.get_variable_scope().name

            # TODO: normalise observations with a running mean and std.

            # value function, output shape [batch, 1]
            last_out = observation
            for i in range(num_hidden_layers):
                last_out = tf.layers.dense(last_out, hidden_size, activation=tf.nn.tanh, name='vfn_%i'%(i+1))
            self.value_pred = tf.layers.dense(last_out, 1, name='vfn_final')

            # sub policy fn
            pdtype = make_pdtype(action_space)
            paramshape = pdtype.param_shape()[0]
            self.pdtype = pdtype
            last_out = observation
            for i in range(num_hidden_layers):
                last_out = tf.layers.dense(last_out,
                                           hidden_size,
                                           activation=tf.nn.tanh,
                                           name='policy_%i'%(i+1))
            if gaussian_fixed_var and isinstance(action_space, gym.spaces.Box):
                # The network outputs only the mean; the log-std is a single
                # trainable row broadcast over the batch via `mean * 0.0`.
                mean = tf.layers.dense(last_out,
                                       paramshape//2,
                                       name='policy_final')
                logstd = tf.get_variable(name='logstd',
                                         shape=[1, paramshape//2],
                                         initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out,
                                          paramshape,
                                          name='policy_final')

            self.pd = pdtype.pdfromflat(pdparam)

            # sample actions
            stochastic = tf.placeholder(dtype=tf.bool, shape=())
            action = tf.cond(stochastic, lambda: self.pd.sample(), lambda: self.pd.mode())
            # i.e. function([placeholders], [outputs])
            self._act = U.function([stochastic, observation],
                                   [action, self.value_pred])

        # Snapshot this policy's own variables now. tf.layers.dense and
        # tf.get_variable register in GLOBAL_VARIABLES (not MODEL_VARIABLES),
        # and optimizer slot variables created later share the scope prefix,
        # so a lazy collection lookup would return nothing or too much.
        self._variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            self.namescope + '/')

    def act(self, stochastic, observation):
        """
        Act on one unbatched observation.

        Returns the action and the value prediction for the observation (the
        first row of the batched outputs).
        """
        act1, v_pred1 = self._act(stochastic, observation[None])
        return act1[0], v_pred1[0]

    def get_variables(self):
        """Return the variables created by this sub-policy (a new list)."""
        return list(self._variables)

    def get_trainable_variables(self):
        """Return the trainable variables found under this scope."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.namescope)

    def reset(self):
        """
        Re-initialise the trainable variables in the default session.

        The initializer op is created once and cached, so repeated resets do
        not keep adding ops to the graph.
        """
        if self._reset_op is None:
            self._reset_op = tf.variables_initializer(self.get_trainable_variables())
        tf.get_default_session().run(self._reset_op)

In [7]:
# utility function
def flatten_lists(listoflists):
    """Concatenate the items of every inner iterable into one flat list."""
    flat = []
    for inner in listoflists:
        flat.extend(inner)
    return flat

In [8]:
class Learner:
    """
    Builds and runs the PPO-style updates for the master policy and for each
    sub-policy (MPI-free version).

    For the master and for every sub-policy the constructor creates the loss
    (see ``policy_loss``), an Adam train op, and a callable that copies the
    current parameters into the matching "old" policy.

    Args:
        env - gym env; only its observation_space is read
        master_pol - master policy being trained
        master_oldpol - reference ("old") copy of the master policy
        subpolicies - list of sub-policies being trained
        old_subpolicies - reference copies, in the same order
        clip_param - clipping range for the ratio and the value update
        entropy_coeff - stored only; no entropy term is part of the loss
        optim_epochs - number of passes over the data per update call
        optim_stepsize - stored only, see the NOTE in __init__
        optim_batchsize - upper bound on the master minibatch size
    """
    def __init__(self,
                 env,
                 master_pol,
                 master_oldpol,
                 subpolicies,
                 old_subpolicies,
                 clip_param=0.2,
                 entropy_coeff=0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64):
        self.master_policy = master_pol
        self.clip_param = clip_param
        self.entropy_coeff = entropy_coeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.num_subpolicies = len(subpolicies)
        self.subpolicies = subpolicies
        obs_space = env.observation_space

        # NOTE(review): optim_stepsize is not passed to the Adam optimizers
        # below, so they use TensorFlow's default learning rate. Left as is
        # to avoid silently changing training; confirm the intended value.

        # ---- master policy (theta) ----
        # The placeholder named 'observation' must already be registered.
        self.master_observation = U.get_placeholder_cached(name='observation')
        self.master_action = self.master_policy.pdtype.sample_placeholder([None])
        self.master_adv = tf.placeholder(dtype=tf.float32, shape=[None]) # target advantage
        self.master_emp_return = tf.placeholder(dtype=tf.float32, shape=[None]) # empirical return
        total_loss = self.policy_loss(master_pol,
                                      master_oldpol,
                                      self.master_observation,
                                      self.master_action,
                                      self.master_adv,
                                      self.master_emp_return,
                                      clip_param)

        self.master_policy_vars_list = master_pol.get_trainable_variables()
        master_grads = tf.gradients(total_loss, self.master_policy_vars_list)
        # we are not using MPI, so a plain TF optimizer applies the gradients
        self.master_adam = tf.train.AdamOptimizer(name='master_adam')
        self.master_train_op = self.master_adam.apply_gradients(
            list(zip(master_grads, self.master_policy_vars_list)))

        self.assign_oldpol_equal_new = self._make_sync(master_oldpol, master_pol)

        # ---- sub-policies (phi) ----
        # Sized from the list that was passed in, not from a notebook global.
        self.subpols_observation = [
            U.get_placeholder(name='sub_ob_%i' % i,
                              dtype=tf.float32,
                              shape=[None] + list(obs_space.shape))
            for i in range(self.num_subpolicies)]
        self.subpols_action = [
            subpolicies[i].pdtype.sample_placeholder([None])
            for i in range(self.num_subpolicies)]
        self.subpols_adv = [
            tf.placeholder(dtype=tf.float32, shape=[None])
            for _ in range(self.num_subpolicies)]
        self.subpols_ret = [
            tf.placeholder(dtype=tf.float32, shape=[None])
            for _ in range(self.num_subpolicies)]

        self.assign_subpols = []
        self.change_subpols = []
        self.subpols_adam = []
        self.subpols_losses = []
        self.subpols_train_op = []

        for i in range(self.num_subpolicies):

            self.subpols_adam.append(tf.train.AdamOptimizer(name='subpol_%i_adam' % i))

            loss = self.policy_loss(subpolicies[i],
                                    old_subpolicies[i],
                                    self.subpols_observation[i],
                                    self.subpols_action[i],
                                    self.subpols_adv[i],
                                    self.subpols_ret[i],
                                    clip_param)
            self.subpols_losses.append(loss)
            vars_list = subpolicies[i].get_trainable_variables()

            self.assign_subpols.append(self._make_sync(old_subpolicies[i], subpolicies[i]))
            # Not used inside this class; kept so the attribute still exists.
            self.zerograd = U.function([], self.nograd(vars_list))

            grads = tf.gradients(loss, vars_list)
            self.subpols_train_op.append(
                self.subpols_adam[i].apply_gradients(list(zip(grads, vars_list))))

        U.initialize()

    def _make_sync(self, old_pi, new_pi):
        """
        Return a callable that copies new_pi's parameters into old_pi.

        The trainable variables are paired up: they are exactly the network
        parameters, whereas other collections under the same scope prefix can
        be empty or can also contain optimizer slot variables.
        """
        return U.function([], [], updates=[
            tf.assign(old_var, new_var)
            for (old_var, new_var) in M.zipsame(old_pi.get_trainable_variables(),
                                                new_pi.get_trainable_variables())])

    def nograd(self, vars_list):
        """return a flat vector of zeros, one entry per element of vars_list"""
        return tf.concat(axis=0, values=[
            tf.reshape(tf.zeros_like(v), [U.numel(v)])
            for v in vars_list
        ])

    def policy_loss(self, pi, old_pi, observation, action, adv, emp_return, clip_param):
        """
        Clipped-surrogate policy loss plus clipped value loss (from the mlsh
        implementation, which differs from baselines/PPO).

        Args:
            pi, old_pi - current and reference policy (need .pd, .value_pred)
            observation - unused here; the policies already hold their input
            action - tensor of taken actions
            adv - tensor of advantages, shape [batch]
            emp_return - tensor of empirical returns, shape [batch]
            clip_param - clipping range

        Returns:
            scalar tensor: policy surrogate loss + value loss
        """
        # The reference log-probability is clipped before forming the ratio.
        old_pi_clipped = tf.clip_by_value(old_pi.pd.logp(action), -20, 20)
        ratio = tf.exp(pi.pd.logp(action) - old_pi_clipped)
        surr1 = ratio * adv # surrogate from conservative policy iteration
        surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
        policy_surrogate = - tf.reduce_mean(tf.minimum(surr1, surr2))

        # Flatten the value heads to [batch]. A [batch, 1] prediction minus a
        # [batch] return broadcasts to [batch, batch] and would compare every
        # prediction with every return.
        value_pred = tf.reshape(pi.value_pred, [-1])
        old_value_pred = tf.reshape(old_pi.value_pred, [-1])

        # this part is different in comparison to openai baseline ppo1.
        vf_loss1 = tf.square(value_pred - emp_return)
        value_pred_clipped = old_value_pred + tf.clip_by_value(value_pred - old_value_pred,
                                                               -clip_param,
                                                               clip_param)
        vf_loss2 = tf.square(value_pred_clipped - emp_return)
        vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_loss1, vf_loss2))
        total_loss = policy_surrogate + vf_loss
        return total_loss

    def update_master_policy(self, seg):
        """
        Run optim_epochs passes of minibatch updates on the master policy.

        Reads 'macro_ob', 'macro_adv', 'macro_ac', 'macro_tdlamret',
        'ep_rets' and 'ep_lens' from seg, logs the mean episode return and
        length, and returns seg['ep_rets'].
        """
        observation = seg["macro_ob"]
        adv = seg["macro_adv"]
        action = seg["macro_ac"]
        td_lambda_return = seg["macro_tdlamret"]

        # Without MPI the advantages are used as they are (no normalisation).
        data = D.Dataset(dict(ob=observation, ac=action, adv=adv, vtarg=td_lambda_return), shuffle=True)
        optim_batchsize = min(self.optim_batchsize, observation.shape[0])

        # TODO: observation update running mean and std.

        # master only: old <- new before optimising
        self.assign_oldpol_equal_new()

        for _ in range(self.optim_epochs):
            for batch in data.iterate_once(optim_batchsize):
                feed_dict = {
                    self.master_observation: batch['ob'],
                    self.master_action: batch['ac'],
                    self.master_adv: batch['adv'],
                    self.master_emp_return: batch['vtarg'],  # critic target
                }
                U.get_session().run([self.master_train_op], feed_dict)

        ep_rets = seg["ep_rets"]
        ep_lens = seg["ep_lens"]

        L.logkv('Mean episode return', np.mean(ep_rets))
        L.logkv('Mean episode length', np.mean(ep_lens))
        L.dumpkvs()
        return ep_rets

    def update_sub_policies(self, test_segs, num_batches, optimize=True):
        """
        Update every sub-policy from its own slice of experience.

        Args:
            test_segs - one dict per sub-policy with 'ob', 'ac', 'adv' and
                'tdlamret'
            num_batches - minibatches drawn per epoch
            optimize - when False nothing is done at all

        Sub-policies whose slice is empty are synced but not trained.
        """
        if not optimize:
            return

        for i in range(self.num_subpolicies):
            test_seg = test_segs[i]
            ob = test_seg['ob']
            num_samples = np.shape(ob)[0]

            # old <- new, also for sub-policies that get no data this round
            self.assign_subpols[i]()

            if num_samples < 1 or self.optim_batchsize <= 0:
                continue

            # standardise the advantages; the floor avoids dividing by zero
            adv = np.array(test_seg['adv'], dtype='float32')
            adv = (adv - adv.mean()) / max(adv.std(), 0.000001)

            test_d = D.Dataset(dict(ob=ob, ac=test_seg['ac'], adv=adv,
                                    vtarg=test_seg['tdlamret']), shuffle=True)
            # At least one row per minibatch: with fewer rows than
            # num_batches the integer division would give empty batches.
            test_batchsize = max(1, num_samples // num_batches)

            # TODO: normalize observations with running mean and std
            for _ in range(self.optim_epochs):
                for test_batch in test_d.iterate_times(test_batchsize, num_batches):
                    feed_dict = {
                        self.subpols_observation[i]: test_batch['ob'],
                        self.subpols_adv[i]: test_batch['adv'],
                        self.subpols_action[i]: test_batch['ac'],
                        self.subpols_ret[i]: test_batch['vtarg'],
                    }
                    U.get_session().run([self.subpols_train_op[i]], feed_dict)
                        
                
                
                    

#### Next, define how experience is rolled out

In [9]:
def traj_segment_generator(policy,
                           sub_policies,
                           env,
                           macro_len, # macro_duration
                           horizon, # num_rollouts
                           stochastic):
    """
    Endless generator of rollout segments of `horizon` environment steps.

    Every `macro_len` steps the master `policy` picks a sub-policy (replaced
    by a uniformly random one with probability 0.1); the active sub-policy
    then produces the environment action at every step.

    Each yielded dict holds copies of:
        'ob', 'rew', 'vpred', 'new', 'ac' - per-step buffers of length horizon
        'macro_ac', 'macro_vpred' - the master's choice and value per macro step
        'ep_rets', 'ep_lens' - return and length of the episodes finished
            since the previous yield
    """
    # One sampled action and one observation serve as templates that fix the
    # shape and dtype of the buffers.
    template_action = env.action_space.sample()
    template_obs = env.reset()
    num_macro_slots = math.ceil(horizon / macro_len)

    finished_returns = []
    finished_lengths = []

    # per-step experience buffers
    obs_buf = np.array([template_obs] * horizon)
    rew_buf = np.zeros(horizon, dtype=np.float32)
    vpred_buf = np.zeros(horizon, dtype=np.float32)
    new_buf = np.zeros(horizon, dtype=np.int32)
    act_buf = np.array([template_action] * horizon)

    # per-macro-step buffers
    macro_act_buf = np.zeros(num_macro_slots, dtype=np.int32)
    macro_vpred_buf = np.zeros(num_macro_slots, dtype=np.float32)

    obs = env.reset()

    step = 0
    active_subpolicy = 0
    macro_value = 0
    running_return = 0
    running_length = 0
    episode_start = True

    while True:
        at_macro_boundary = (step % macro_len == 0)

        if at_macro_boundary:
            # the master chooses, with 10% exploration over sub-policies
            active_subpolicy, macro_value = policy.act(stochastic, obs)
            if np.random.uniform() < 0.1:
                active_subpolicy = np.random.randint(0, len(sub_policies))

        act, value = sub_policies[active_subpolicy].act(stochastic, obs)

        # A full segment has been recorded: hand out copies, because the
        # buffers are overwritten in place afterwards.
        if step > 0 and step % horizon == 0:
            segment = {
                'ob': obs_buf, 'rew': rew_buf, 'vpred': vpred_buf,
                'new': new_buf, 'ac': act_buf,
                'ep_rets': finished_returns, 'ep_lens': finished_lengths,
                'macro_ac': macro_act_buf, 'macro_vpred': macro_vpred_buf}
            yield {name: np.copy(data) for name, data in segment.items()}

            finished_returns = []
            finished_lengths = []

        slot = step % horizon
        obs_buf[slot] = obs
        vpred_buf[slot] = value
        # flag: this step follows an env step that reported done (or is the
        # very first step)
        new_buf[slot] = episode_start
        act_buf[slot] = act

        if at_macro_boundary:
            macro_slot = int(slot / macro_len)
            macro_act_buf[macro_slot] = active_subpolicy
            macro_vpred_buf[macro_slot] = macro_value

        obs, reward, episode_start, info = env.step(act)
        rew_buf[slot] = reward

        # render only until the first episode of the segment has finished
        if replay and len(finished_returns) == 0:
            env.render()

        running_return += reward
        running_length += 1

        # An episode is closed only when it ends on a macro-step boundary.
        if episode_start and (step + 1) % macro_len == 0:
            finished_returns.append(running_return)
            finished_lengths.append(running_length)
            running_length = 0
            running_return = 0
            obs = env.reset()

        step += 1
        

In [10]:
def add_advantage_macro(seg, macro_len, gamma, lam):
    """
    Add the master-level (macro-step) training targets to `seg` in place.

    Rewards are summed over each block of `macro_len` steps and a GAE(lambda)
    recursion is run over those blocks. The value after the last block is
    taken to be 0.

    Writes three keys and returns nothing:
        'macro_adv' - float32 advantages, one per macro step
        'macro_tdlamret' - 'macro_adv' + 'macro_vpred'
        'macro_ob' - every macro_len-th observation
    """
    # episode-start flag and value at each macro step, padded with 0 so the
    # recursion can look one step ahead
    macro_new = np.append(seg['new'][::macro_len], 0)
    macro_values = np.append(seg['macro_vpred'], 0)

    num_macro = int(len(seg['rew']) / macro_len)
    macro_rewards = seg['rew'].reshape(-1, macro_len).sum(axis=1)

    advantages = np.empty(num_macro, dtype=np.float32)
    seg['macro_adv'] = advantages

    running = 0
    for t in range(num_macro - 1, -1, -1):
        # 0 when the next macro step starts a new episode, else 1
        keep = 1 - macro_new[t + 1]
        td_error = macro_rewards[t] + gamma * macro_values[t + 1] * keep - macro_values[t]
        running = td_error + gamma * lam * keep * running
        advantages[t] = running

    seg['macro_tdlamret'] = seg['macro_adv'] + seg['macro_vpred']
    seg['macro_ob'] = seg['ob'][::macro_len]

In [11]:
def split_segments(seg, macro_len, num_subpolicies):
    """
    Split a rollout segment into one dataset per sub-policy.

    Each timestep belongs to the sub-policy the master selected for the
    macro step containing it (seg['macro_ac'][t // macro_len]).

    Args:
        seg - dict with per-step arrays 'ob', 'ac', 'adv', 'tdlamret' and the
            per-macro-step array 'macro_ac'
        macro_len - number of timesteps per macro step
        num_subpolicies - number of sub-policies

    Returns:
        list of num_subpolicies dicts with keys 'ob', 'adv', 'tdlamret' and
        'ac', holding that sub-policy's timesteps in their original order.
        'adv' and 'tdlamret' are float32. A sub-policy that was never
        selected gets zero-length arrays that keep the per-step shape and
        dtype of the inputs.

    Raises:
        IndexError - if 'macro_ac' does not cover every timestep or contains
            an index outside [0, num_subpolicies)
    """
    observations = np.asarray(seg['ob'])
    actions = np.asarray(seg['ac'])
    advs = np.asarray(seg['adv'], dtype=np.float32)
    tdlams = np.asarray(seg['tdlamret'], dtype=np.float32)
    num_steps = len(observations)

    # Sub-policy index of every timestep: each macro choice repeated
    # macro_len times, cut to the number of recorded steps so a partial last
    # macro step adds no padding rows.
    owner = np.repeat(np.asarray(seg['macro_ac']), macro_len)
    if len(owner) < num_steps:
        raise IndexError('macro_ac covers %d timesteps but the segment has %d'
                         % (len(owner), num_steps))
    owner = owner[:num_steps]
    if owner.size and (owner.min() < 0 or owner.max() >= num_subpolicies):
        raise IndexError('macro_ac contains a sub-policy index outside [0, %d)'
                         % num_subpolicies)

    subpolicies = []
    for index in range(num_subpolicies):
        # Boolean indexing copies the rows, so results do not alias seg.
        mine = owner == index
        subpolicies.append({'ob': observations[mine],
                            'adv': advs[mine],
                            'tdlamret': tdlams[mine],
                            'ac': actions[mine]})
    return subpolicies

In [12]:
def prepare_allrollouts(allrollouts, macro_len, gamma, lam, num_subpolicies):
    """
    Merge rollouts, compute per-step GAE targets and split by sub-policy.

    Every later rollout is appended key by key (along axis 0) onto the first
    one, which is modified in place and also gains 'adv' (float32
    GAE(lambda) advantages) and 'tdlamret' ('adv' + 'vpred'). The value
    after the last step is taken to be 0.

    Returns:
        the result of split_segments() on the merged rollout
    """
    merged = allrollouts[0]
    for extra in allrollouts[1:]:
        for key, value in extra.items():
            merged[key] = np.append(merged[key], value, axis=0)

    # Pad the flags and values with 0 so the recursion can look one step
    # ahead.
    starts = np.append(merged['new'], 0)
    values = np.append(merged['vpred'], 0)
    rewards = merged['rew']
    num_steps = len(rewards)

    advantages = np.empty(num_steps, dtype=np.float32)
    merged['adv'] = advantages

    running = 0
    for t in range(num_steps - 1, -1, -1):
        # 0 when the next step starts a new episode, else 1
        keep = 1 - starts[t + 1]
        td_error = rewards[t] + gamma * values[t + 1] * keep - values[t]
        running = td_error + gamma * lam * keep * running
        advantages[t] = running

    merged['tdlamret'] = merged['adv'] + merged['vpred']

    return split_segments(merged, macro_len, num_subpolicies)

In [13]:
# The CSV output is pointed at a file inside LOG_DIR, so make sure that
# directory exists before the logger is built; otherwise this cell depends on
# a previous run having left the directory behind.
# NOTE(review): a later cell deletes and recreates LOG_DIR - confirm that the
# CSV file created here survives that step.
os.makedirs(LOG_DIR, exist_ok=True)

# Log to stdout in human-readable form and to LOG_DIR/log.csv.
L.Logger.DEFAULT = L.Logger.CURRENT = L.Logger(
    dir=None,
    output_formats=[L.HumanOutputFormat(sys.stdout),
                    L.CSVOutputFormat(os.path.join(LOG_DIR, 'log.csv'))])


In [14]:
tf.reset_default_graph()

# Device-placement logging follows the LOG_DEVICE_PLACEMENT switch from the
# configuration cell instead of being forced on. Soft placement lets
# TensorFlow fall back to another device when the requested one is
# unavailable.
tf_config = tf.ConfigProto(log_device_placement=LOG_DEVICE_PLACEMENT,
                           allow_soft_placement=True)
# The thread pools use TensorFlow's defaults; to pin them, also pass
# intra_op_parallelism_threads=n_cpu and inter_op_parallelism_threads=n_cpu.

In [15]:
# Start every run with an empty log directory.
# NOTE(review): the logger cell above already points at LOG_DIR/log.csv;
# wiping the directory here may remove that file - confirm the cell order.
if tf.gfile.Exists(LOG_DIR):
    tf.gfile.DeleteRecursively(LOG_DIR)
tf.gfile.MakeDirs(LOG_DIR)

g = tf.Graph()

# Entered without a `with` block on purpose: the session (and its graph g)
# stays the default for every cell that follows.
tf.Session(config=tf_config, graph=g).__enter__()

env = gym.make(gym_env_name)
observ_space = env.observation_space
action_space = env.action_space

# Master observation input, registered under the name 'observation' so it can
# be looked up again later. The shape is built from the full observation
# shape, the same way as the sub-policy placeholders.
observ_holder = U.get_placeholder(name='observation',
                                  dtype=tf.float32,
                                  shape=[None] + list(observ_space.shape))


[2018-08-24 18:30:11,121] Making new env: MovementBandits-v0


seeded


In [16]:
# PPO master policies
policy = Policy(name='master_policy', 
                observation=observ_holder, 
                action_space=action_space, 
                hidden_size=32, 
                num_hidden_layers=2,
                num_subpolicies=num_subpolicies)
old_policy = Policy(name='old_master_policy',
                    observation=observ_holder,
                    action_space=action_space,
                    hidden_size=32,
                    num_hidden_layers=2,
                    num_subpolicies=num_subpolicies)

In [17]:
# PPO subpolicies
subpolicies = []
old_subpolicies = []
subpol_holders = []
# create placeholders

for i in range(num_subpolicies):
    subpol_holders.append(U.get_placeholder(name='sub_ob_%i' % i, 
                                            dtype=tf.float32,
                                            shape=[None] + list(observ_space.shape)))
    
    subpolicies.append(SubPolicy(name='subpolicy_%i' % i,
                                  observation=subpol_holders[i],
                                  action_space=action_space,
                                  hidden_size=32,
                                  num_hidden_layers=2))
    old_subpolicies.append(SubPolicy(name='old_subpolicy_%i' % i, 
                                     observation=subpol_holders[i],
                                     action_space=action_space,
                                     hidden_size=32,
                                     num_hidden_layers=2))

In [18]:
# create learner (i.e. agent)
learner = Learner(env, policy, old_policy, 
                  subpolicies, old_subpolicies)

In [None]:
rollouts = traj_segment_generator(policy, subpolicies, env,
                                  macro_duration, num_rollouts,
                                  True) # stochastic

start_from = 0
if continue_train:
    # Restore before the loop starts. When resuming, the loop begins at
    # CKPT_ITERATION + 1, so a restore guarded by `x == 0` inside the loop
    # would never run. The file name uses the same zero-padded format as
    # the save call below.
    U.load_state(os.path.join(CKPT_DIR, '%.5i' % int(CKPT_ITERATION)))
    start_from = int(CKPT_ITERATION) + 1


for x in range(start_from, 10000):

    # checkpoint every 5th iteration, before that iteration is trained
    if x % 5 == 0 and x > 3:
        U.save_state(os.path.join(CKPT_DIR, '%.5i'%x))

    # Every iteration is a new task: the master policy is re-initialised and
    # the environment draws a new goal. The sub-policies keep their weights.
    policy.reset()

    env.env.randomizeCorrect()

    print('It is iteration %d so i am changing the goal to %s' % (x, env.env.realgoal))

    mini_episode = 0
    total_means = []

    while mini_episode < warmup_time + train_time:
        mini_episode += 1
        print('mini episode %d ' % (mini_episode))
        # roll out one segment of experience
        rollout = next(rollouts)
        allrollouts = [rollout]

        # train master theta
        add_advantage_macro(rollout, macro_duration, 0.99, 0.98)

        # update_master_policy returns the episode returns of the segment
        ep_reward_mean = learner.update_master_policy(rollout)
        # train subpolicies phi; the learner skips the update until the
        # warm-up period is over
        test_seg = prepare_allrollouts(allrollouts, macro_duration, 0.99, 0.98, num_subpolicies)

        learner.update_sub_policies(test_seg, num_batches, (mini_episode >= warmup_time))
        


It is iteration 0 so i am changing the goal to 0
mini episode 1 
----------------------------------
| Mean episode length | 50       |
| Mean episode return | 4.47     |
----------------------------------
mini episode 2 
----------------------------------
| Mean episode length | 50       |
| Mean episode return | 3.55     |
----------------------------------
mini episode 3 
----------------------------------
| Mean episode length | 50       |
| Mean episode return | 3        |
----------------------------------
mini episode 4 
----------------------------------
| Mean episode length | 50       |
| Mean episode return | 2.35     |
----------------------------------
mini episode 5 
----------------------------------
| Mean episode length | 50       |
| Mean episode return | 4.33     |
----------------------------------
mini episode 6 
----------------------------------
| Mean episode length | 50       |
| Mean episode return | 3.1      |
----------------------------------
mini episode 7 