# Setup

In [None]:
from collections import namedtuple

# Experiment hyper-parameters; one immutable record is shared by master and workers.
Config = namedtuple(
    'Config',
    [
        'env_id',            # gym environment id passed to gym.make
        'population_size',
        'learning_rate',
        'noise_stdev',       # scale applied to the sampled parameter noise
        'snapshot_freq',     # snapshot every N-th task; 0 disables snapshots
        'return_proc_mode',  # how returns are post-processed, e.g. "centered_rank"
        # Fields of the reference implementation that are not ported yet:
        #'timesteps_per_batch',
        #'calc_obstat_prob',
        #'eval_prob',
        #'episode_cutoff_mode'
    ],
)

# Record a worker hands back for one task. The `_n` / `_n2` suffixes follow the
# original naming: one entry per noise sample / one (positive, negative) pair per sample.
Result = namedtuple(
    'Result',
    [
        'worker_id',
        'noise_inds_n',
        'returns_n2',
        'signreturns_n2',
        'lengths_n2',
        'eval_return',
        'eval_length',
        'ob_sum',
        'ob_sumsq',
        'ob_count',
        'task_id',
    ],
)

## Config

In [None]:
# Concrete experiment settings used by every cell below.
config = Config(
    env_id='RoboschoolInvertedPendulum-v1',  # environment handed to gym.make
    return_proc_mode='centered_rank',        # selects the return post-processing branch in the master
    population_size=300,
    snapshot_freq=20,                        # snapshot every 20th task (0 would disable)
    learning_rate=5e-3,
    noise_stdev=2e-2,                        # scale of the parameter perturbations
)

## Environment

Each worker creates its own environment inside the worker method (`run_worker`).
The master creates one as well.

In [None]:
import gym, roboschool # Roboschool import needed to register the environments within gym
# Instantiate the environment named in the config; the policy cell below reads its
# observation and action spaces. The master cell later rebinds `env` to a fresh instance.
env = gym.make(config.env_id)

## Tensorflow Session

In [None]:
import tensorflow as tf
# TF1-style session; the flat get/set helpers further down execute their ops through it.
sess = tf.Session()

## Policy setup

The policy arguments are first stored in module-level variables; the neural network architecture is then created
inside a TensorFlow variable scope.

Currently omitted:
1. Observation normalization
2. Observation clipping
3. _act function
4. set_all_vars

In [None]:
import numpy as np

# Policy hyper-parameters (mirrors the "policy args" of the reference implementation).
args = {
      "ac_bins": "continuous:",
      "ac_noise_std": 0.01,
      #"connection_type": "ff",
      "hidden_dims": [
        256,
        256
      ],
      "nonlin_type": "tanh"
}

ob_space = env.observation_space
ac_space = env.action_space
ac_bins = args["ac_bins"]
ac_noise_std = args["ac_noise_std"]
hidden_dims = args["hidden_dims"]
# Resolve the configured activation *name* to the TensorFlow op; the raw string
# cannot be called on a tensor. Unknown names fail fast with a KeyError.
nonlin = {"tanh": tf.tanh, "relu": tf.nn.relu, "elu": tf.nn.elu}[args["nonlin_type"]]


def _normc_initial_value(shape, std):
    """
    Build the initial value for a dense-layer weight matrix.

    Samples a Gaussian matrix and rescales every column to L2 norm `std`.

    :param shape: [input_dim, output_dim] as plain integers
    :param std: Target L2 norm of each column
    :return: A float32 tf.constant of the given shape
    """
    out = np.random.randn(*shape).astype(np.float32)
    out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
    return tf.constant(out)


with tf.variable_scope("RoboschoolPolicy") as scope:
    # Observation normalization
    #ob_mean = tf.get_variable(
    #    'ob_mean', ob_space.shape, tf.float32, tf.constant_initializer(np.nan), trainable=False)
    #ob_std = tf.get_variable(
    #    'ob_std', ob_space.shape, tf.float32, tf.constant_initializer(np.nan), trainable=False)
    #in_mean = tf.placeholder(tf.float32, ob_space.shape)
    #in_std = tf.placeholder(tf.float32, ob_space.shape)
    #self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
        #tf.assign(ob_mean, in_mean),
        #tf.assign(ob_std, in_std),
    #])

    # Policy network

    # Placeholder for a batch of observations: [batch] + observation shape
    o = tf.placeholder(tf.float32, [None] + list(ob_space.shape))

    # Normalize observation space and clip to [-5.0, 5.0]
    #o = tf.clip_by_value((o - ob_mean) / ob_std, -5.0, 5.0)

    # Feed-forward network: one dense layer + activation per entry of hidden_dims
    x = o
    for ilayer, hd in enumerate(hidden_dims):
        # as_list() yields plain ints, which np.random.randn needs
        in_dim = x.get_shape().as_list()[1]

        # Dense layer. TODO possible replacement tf.keras.layers.Dense
        # The initializer is a Tensor, so the shape is derived from it; get_variable
        # rejects an explicit shape together with a constant initializer.
        w = tf.get_variable("l" + str(ilayer) + "/w", initializer=_normc_initial_value([in_dim, hd], 1.0))
        b = tf.get_variable("l" + str(ilayer) + "/b", [hd], initializer=tf.zeros_initializer())
        dense = tf.matmul(x, w) + b

        x = nonlin(dense)

    # Map to action
    adim = ac_space.shape[0]
    in_dim = x.get_shape().as_list()[1]

    # Output layer with small initial weights (column norm 0.01) and no activation
    w = tf.get_variable("out" + "/w", initializer=_normc_initial_value([in_dim, adim], 0.01))
    b = tf.get_variable("out" + "/b", [adim], initializer=tf.zeros_initializer())
    dense = tf.matmul(x, w) + b

    a = dense

    # TODO _act


# Variables created inside the policy scope
all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope.name)
trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope.name)

# Total number of scalar parameters; this is the length of the flat parameter vector
num_params = sum(int(np.prod(v.get_shape().as_list())) for v in trainable_variables)
    
#self._setfromflat = U.SetFromFlat(self.trainable_variables)
#self._getflat = U.GetFlat(self.trainable_variables)

#placeholders = [tf.placeholder(v.value().dtype, v.get_shape().as_list()) for v in self.all_variables]

# self.set_all_vars = U.function(
#     inputs=placeholders,
#     outputs=[],
#     updates=[tf.group(*[v.assign(p) for v, p in zip(self.all_variables, placeholders)])]
# )

In [None]:
#optimizer = {'sgd': SGD, 'adam': Adam}[exp['optimizer']['type']](policy, **exp['optimizer']['args'])

## Shared Noise

In [None]:
class SharedNoiseTable(object):
    """
    Fixed table of pre-sampled standard-normal noise held in a multiprocessing array.

    Perturbations are addressed by an integer offset into the table, so a
    perturbation can be identified by its index alone.
    """

    def __init__(self, seed=123, count=250000000):
        """
        Allocate the table and fill it with Gaussian noise.

        :param seed: Seed of the RandomState used to fill the table
        :param count: Number of float32 entries. The default is 1 gigabyte of 32-bit
            numbers; filling it temporarily samples 2 gigabytes of 64-bit numbers.
        """
        import ctypes, multiprocessing
        #logger.info('Sampling {} random numbers with seed {}'.format(count, seed))

        # Instantiate an array of C float datatype with size count
        self._shared_mem = multiprocessing.Array(ctypes.c_float, count)

        # View the array as a numpy array (no copy)
        self.noise = np.ctypeslib.as_array(self._shared_mem.get_obj())
        assert self.noise.dtype == np.float32
        self.noise[:] = np.random.RandomState(seed).randn(count)  # 64-bit to 32-bit conversion here
        #logger.info('Sampled {} bytes'.format(self.noise.size * 4))

    def get(self, i, dim):
        """
        Return the `dim` consecutive noise values starting at offset `i`.

        :param i: Start offset into the table
        :param dim: Number of values to return
        :return: A slice (view) of the noise table
        """
        return self.noise[i:i + dim]

    def sample_index(self, stream, dim):
        """
        Draw a random start offset that leaves room for `dim` values.

        :param stream: Random source providing `randint(low, high)`
        :param dim: Length of the slice that will be read from the sampled offset
        :return: An offset in [0, len(noise) - dim]
        """
        return stream.randint(0, len(self.noise) - dim + 1)

# Notebook-wide noise table read by run_worker and the master's gradient estimate.
# Building it allocates and fills the full table (250,000,000 float32 values per the constructor).
noise = SharedNoiseTable()

## Get flat

In [None]:
# Graph op that reshapes every trainable variable to 1-D and joins the pieces
# into a single flat parameter vector.
op_get_flat = tf.concat(
    values=[tf.reshape(variable, shape=[-1]) for variable in trainable_variables],
    axis=0,
)

def get_flat(var_list):
    """
    Evaluate the flat parameter vector of the policy.

    :param var_list: Unused; the op always flattens `trainable_variables`
    :return: The result of running `op_get_flat` in the notebook session
    """
    flat_params = sess.run(op_get_flat)
    return flat_params

## Set from flat

In [None]:
def _create_set_from_flat_op():
    """
    Build the graph op that loads a flat parameter vector into the trainable variables.

    :return: Tuple (theta, op). `theta` is the float32 placeholder that receives the
        flat vector; `op` groups one assign per trainable variable.
    """
    shapes = [v.shape for v in trainable_variables]
    total_size = sum(shape.num_elements() for shape in shapes)

    theta = tf.placeholder(tf.float32, [total_size])

    start = 0
    assigns = []
    for (shape, v) in zip(shapes, trainable_variables):
        size = shape.num_elements()
        # Cut this variable's slice out of the flat vector and restore its shape
        assigns.append(tf.assign(v, tf.reshape(theta[start:start + size], shape)))
        start += size

    assert start == total_size

    # The placeholder is returned as well; the op cannot run without feeding it
    return theta, tf.group(*assigns)


ph_theta, op_set_from_flat = _create_set_from_flat_op()


def set_from_flat(var_list):
    """
    Overwrite all trainable variables with the values of a flat parameter vector.

    :param var_list: Despite its name, the flat float vector to load. It must contain
        as many elements as the trainable variables have in total.
    :return: The result of running the grouped assign op
    """
    return sess.run(op_set_from_flat, feed_dict={ph_theta: var_list})

## Rollout TODO

In [None]:
def _policy_act(ob, random_stream=None):
    """
    Run the notebook's policy network on a batch of observations.

    :param ob: Batch of observations, fed to the placeholder `o`
    :param random_stream: Optional random source; if given, Gaussian noise scaled by
        `ac_noise_std` is added to the actions
    :return: Batch of actions evaluated from the output tensor `a`
    """
    ac = sess.run(a, feed_dict={o: ob})
    if random_stream is not None and ac_noise_std != 0:
        ac = ac + random_stream.randn(*ac.shape) * ac_noise_std
    return ac


def rollout(env, *, render=False, timestep_limit=None, save_obs=False, random_stream=None, act_fn=None):
    """
    Run one episode in `env` and collect its rewards.

    If random_stream is provided, the rollout will take noisy actions with noise drawn from that stream.
    Otherwise, no action noise will be added.

    :param env: Environment offering reset(), step(), render() and spec.tags
    :param render: Render the environment after every step
    :param timestep_limit: Optional cap on the episode length; the environment's own
        limit still applies if it declares one
    :param save_obs: Also return the observations the policy acted on
    :param random_stream: Optional random source for action noise
    :param act_fn: Optional callable `act_fn(ob_batch, random_stream=...)` returning a
        batch of actions. Defaults to the notebook's policy network.
    :return: (rewards, steps), or (rewards, steps, observations) if save_obs is set
    :raises ValueError: If neither the environment nor the caller provides a step limit
    """
    if act_fn is None:
        act_fn = _policy_act

    env_timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    if timestep_limit is None:
        timestep_limit = env_timestep_limit
    elif env_timestep_limit is not None:
        timestep_limit = min(timestep_limit, env_timestep_limit)
    if timestep_limit is None:
        raise ValueError('rollout needs a timestep_limit: the environment does not declare one')

    rews = []
    t = 0
    if save_obs:
        obs = []
    ob = env.reset()
    for _ in range(timestep_limit):
        # ob[None] adds the batch axis the policy expects; [0] strips it again
        ac = act_fn(ob[None], random_stream=random_stream)[0]
        if save_obs:
            obs.append(ob)
        ob, rew, done, _ = env.step(ac)
        rews.append(rew)
        t += 1
        if render:
            env.render()
        if done:
            break
    rews = np.array(rews, dtype=np.float32)
    if save_obs:
        return rews, t, np.array(obs)
    return rews, t


# Worker method


In [None]:
def run_worker(num_jobs, theta): #min_task_runtime=.2):
    """
    Evaluate `num_jobs` antithetic parameter perturbations of `theta`.

    For every job a noise slice is drawn from the shared noise table, scaled by
    config.noise_stdev, and one rollout each is run with `theta + v` and `theta - v`.

    Not ported from the reference worker yet (TODO): the task queue / result queue
    loop, observation statistics, evaluation jobs with noiseless weights, and the
    `min_task_runtime` stopping criterion.

    :param num_jobs: Number of noise samples to evaluate (two rollouts each)
    :param theta: Flat parameter vector the perturbations are centred on
    :return: A Result whose noise_inds_n has one entry per job and whose returns_n2,
        signreturns_n2 and lengths_n2 have shape (num_jobs, 2), ordered (positive, negative)
    """
    assert isinstance(noise, SharedNoiseTable)

    # Setup: every worker call runs on its own environment instance
    env = gym.make(config.env_id)

    # Random stream used to pick offsets into the noise table
    rs = np.random.RandomState()

    # Rollouts with noise
    noise_inds, returns, signreturns, lengths = [], [], [], []

    for _ in range(num_jobs):
        # ------------- Noise sample -------------------------------
        noise_idx = noise.sample_index(rs, num_params)
        v = config.noise_stdev * noise.get(noise_idx, num_params)

        # Evaluate the sampled noise positive
        set_from_flat(theta + v)
        rews_pos, len_pos = rollout(env)

        # Evaluate the sampled noise negative
        set_from_flat(theta - v)
        rews_neg, len_neg = rollout(env)

        # Gather results
        noise_inds.append(noise_idx)
        returns.append([rews_pos.sum(), rews_neg.sum()])
        signreturns.append([np.sign(rews_pos).sum(), np.sign(rews_neg).sum()])
        lengths.append([len_pos, len_neg])

    # The master checks .ndim/.shape/.dtype on these fields, so hand over arrays.
    # reshape(-1, 2) keeps the (n, 2) layout even when num_jobs is 0.
    result = Result(
        worker_id=None,
        noise_inds_n=np.array(noise_inds),
        returns_n2=np.array(returns, dtype=np.float32).reshape(-1, 2),
        signreturns_n2=np.array(signreturns, dtype=np.float32).reshape(-1, 2),
        lengths_n2=np.array(lengths, dtype=np.int32).reshape(-1, 2),
        eval_return=None,
        eval_length=None,
        ob_sum=None,
        ob_sumsq=None,
        ob_count=0,  # no observation statistics are collected yet
        task_id=0
    )

    return result

# Master

In [None]:
import time

# Master-side environment and random stream
env = gym.make(config.env_id)
rs = np.random.RandomState()


# ob_stat = RunningStat(
#     env.observation_space.shape,
#     eps=1e-2  # eps to prevent dividing by zero at the beginning when computing mean/stdev
# )


# The adaptive timestep limit is switched off, so its parameters stay unset
tslimit = None
incr_tslimit_threshold = None
tslimit_incr_ratio = None
adaptive_tslimit = False


# Running totals over all iterations
episodes_so_far = 0
timesteps_so_far = 0
tstart = time.time()

# Incremented once per master iteration; used as the id of the current task
task_counter = 0

# Master training loop: one iteration = one task (parameter broadcast), result
# collection, gradient estimate, optimizer step, logging and optional snapshot.
#
# NOTE(review): several names used below are not defined in the cells shown here:
# pop_item, result_queue, lock, policy, ob_stat, optimizer, logger, tlogger,
# compute_centered_ranks, batched_weighted_sum. config.episodes_per_batch and
# config.l2coeff are not fields of Config either. The loop cannot run until these
# are ported.
while True:
    step_tstart = time.time()

    # Read the current flat parameter vector. The flatten op is built once in the
    # "Get flat" cell; building new graph ops here would grow the graph every iteration.
    theta = sess.run(op_get_flat)

    assert theta.dtype == np.float32

    # Task counter is used to recognize false tasks from previous iterations later
    curr_task_id = task_counter
    task_counter += 1

    # tasks.append(Task(
    #         params=theta,
    #         ob_mean=ob_stat.mean if policy.needs_ob_stat else None,
    #         ob_std=ob_stat.std if policy.needs_ob_stat else None,
    #         timestep_limit=tslimit,
    #         task_id = curr_task_id
    # ))

    # Start workers
    # NOTE(review): nothing is dispatched here yet; run_worker returns its Result
    # directly and does not put it on a queue.

    # Pop off results for the current task
    curr_task_results, eval_rets, eval_lens, worker_ids = [], [], [], []
    num_results_skipped, num_episodes_popped, num_timesteps_popped, ob_count_this_batch = 0, 0, 0, 0
    while num_episodes_popped < config.episodes_per_batch:
        # Wait for a result
        result = pop_item(result_queue, lock)

        assert isinstance(result, Result)
        task_id = result.task_id
        assert isinstance(task_id, int)

        assert (result.eval_return is None) == (result.eval_length is None)
        worker_ids.append(result.worker_id)

        if result.eval_length is not None:
            # This was an eval job
            episodes_so_far += 1
            timesteps_so_far += result.eval_length
            # Store the result only for current tasks
            if task_id == curr_task_id:
                eval_rets.append(result.eval_return)
                eval_lens.append(result.eval_length)
        else:
            # This was a perturbation (training) job
            assert (result.noise_inds_n.ndim == 1 and
                    result.returns_n2.shape == result.lengths_n2.shape == (len(result.noise_inds_n), 2))
            assert result.returns_n2.dtype == np.float32
            # Update counts
            result_num_eps = result.lengths_n2.size
            result_num_timesteps = result.lengths_n2.sum()
            episodes_so_far += result_num_eps
            timesteps_so_far += result_num_timesteps
            # Store results only for current tasks
            if task_id == curr_task_id:
                curr_task_results.append(result)
                num_episodes_popped += result_num_eps
                num_timesteps_popped += result_num_timesteps
                # Update ob stats
                if policy.needs_ob_stat and result.ob_count > 0:
                    ob_stat.increment(result.ob_sum, result.ob_sumsq, result.ob_count)
                    ob_count_this_batch += result.ob_count
            else:
                num_results_skipped += 1

    # Compute skip fraction
    frac_results_skipped = num_results_skipped / (num_results_skipped + len(curr_task_results))
    if num_results_skipped > 0:
        logger.warning('Skipped {} out of date results ({:.2f}%)'.format(
            num_results_skipped, 100. * frac_results_skipped))

    # Assemble results
    noise_inds_n = np.concatenate([r.noise_inds_n for r in curr_task_results])
    returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
    lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
    assert noise_inds_n.shape[0] == returns_n2.shape[0] == lengths_n2.shape[0]
    # Process returns
    if config.return_proc_mode == 'centered_rank':
        proc_returns_n2 = compute_centered_ranks(returns_n2)
    elif config.return_proc_mode == 'sign':
        proc_returns_n2 = np.concatenate([r.signreturns_n2 for r in curr_task_results])
    elif config.return_proc_mode == 'centered_sign_rank':
        proc_returns_n2 = compute_centered_ranks(np.concatenate([r.signreturns_n2 for r in curr_task_results]))
    else:
        raise NotImplementedError(config.return_proc_mode)
    # Compute and take step: weight each noise vector by the difference between the
    # processed returns of its positive and negative rollout
    g, count = batched_weighted_sum(
        proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
        (noise.get(idx, policy.num_params) for idx in noise_inds_n),
        batch_size=500
    )
    g /= returns_n2.size
    assert g.shape == (policy.num_params,) and g.dtype == np.float32 and count == len(noise_inds_n)
    # NOTE(review): the commented-out step below descends on -g with an L2 penalty on
    # theta; the active line scales g by l2coeff instead. Confirm which is intended.
    #update_ratio = optimizer.update(-g + config.l2coeff * theta)
    update_ratio = optimizer.update(config.l2coeff * g)

    # Update ob stat (we're never running the policy in the master, but we might be snapshotting the policy)
    if policy.needs_ob_stat:
        policy.set_ob_stat(ob_stat.mean, ob_stat.std)

    # Update number of steps to take
    if adaptive_tslimit and (lengths_n2 == tslimit).mean() >= incr_tslimit_threshold:
        old_tslimit = tslimit
        tslimit = int(tslimit_incr_ratio * tslimit)
        logger.info('Increased timestep limit from {} to {}'.format(old_tslimit, tslimit))

    step_tend = time.time()
    tlogger.record_tabular("EpRewMean", returns_n2.mean())
    tlogger.record_tabular("EpRewStd", returns_n2.std())
    tlogger.record_tabular("EpLenMean", lengths_n2.mean())

    tlogger.record_tabular("EvalEpRewMean", np.nan if not eval_rets else np.mean(eval_rets))
    tlogger.record_tabular("EvalEpRewStd", np.nan if not eval_rets else np.std(eval_rets))
    tlogger.record_tabular("EvalEpLenMean", np.nan if not eval_rets else np.mean(eval_lens))
    tlogger.record_tabular("EvalPopRank", np.nan if not eval_rets else (
        np.searchsorted(np.sort(returns_n2.ravel()), eval_rets).mean() / returns_n2.size))
    tlogger.record_tabular("EvalEpCount", len(eval_rets))

    tlogger.record_tabular("Norm", float(np.square(policy.get_trainable_flat()).sum()))
    tlogger.record_tabular("GradNorm", float(np.square(g).sum()))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))

    tlogger.record_tabular("EpisodesThisIter", lengths_n2.size)
    tlogger.record_tabular("EpisodesSoFar", episodes_so_far)
    tlogger.record_tabular("TimestepsThisIter", lengths_n2.sum())
    tlogger.record_tabular("TimestepsSoFar", timesteps_so_far)

    num_unique_workers = len(set(worker_ids))
    tlogger.record_tabular("UniqueWorkers", num_unique_workers)
    tlogger.record_tabular("UniqueWorkersFrac", num_unique_workers / len(worker_ids))
    tlogger.record_tabular("ResultsSkippedFrac", frac_results_skipped)
    tlogger.record_tabular("ObCount", ob_count_this_batch)

    tlogger.record_tabular("TimeElapsedThisIter", step_tend - step_tstart)
    tlogger.record_tabular("TimeElapsed", step_tend - tstart)
    tlogger.dump_tabular()

    # Snapshot the policy every snapshot_freq-th task (0 disables snapshots)
    if config.snapshot_freq != 0 and curr_task_id % config.snapshot_freq == 0:
        import os.path as osp
        filename = osp.join(tlogger.get_dir(), 'snapshot_iter{:05d}_rew{}.h5'.format(
            curr_task_id,
            np.nan if not eval_rets else int(np.mean(eval_rets))
        ))
        assert not osp.exists(filename)
        policy.save(filename)
        tlogger.log('Saved snapshot {}'.format(filename))
