# create an ANN for conversion

In this notebook we create an ANN and train it to control a robot. Then we create some validation data for the network. Finally, everything is stored to disk so that it can be used in the next step to convert the ANN to an SNN with snn_toolbox.


## create and train ann 

In [None]:
import csv
import errno
import json
import os
import time

from collections import namedtuple, OrderedDict

import ctypes
import multiprocessing
import numpy as np

import gym, roboschool

In [None]:
import tensorflow as tf
print(tf.keras.__version__)
print(tf.__version__)


In [None]:
def mkdir_p(path):
    """Create ``path`` including missing parent directories (like ``mkdir -p``).

    An already existing directory is silently accepted. Every other ``OSError``
    (for example when ``path`` exists but is not a directory) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        directory_already_there = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not directory_already_there:
            raise

# Directory below which every training run stores its configuration, log and snapshots.
# NOTE: this is an absolute path; change it to a writable folder on your machine.
# (The former `.format(os.getpid())` call was dropped: the string has no placeholder,
# so the call never changed the path.)
main_directory = "/home/jovyan/base_repository/Workspace/ann_for_conversion/"
try:
    mkdir_p(main_directory)
except PermissionError:
    print("The user running this notebook has no permission to create this folder. Please provide a path to a folder"
         + " with write permissions.")

In [None]:
# Hyperparameters of the training run.
Config = namedtuple('Config', (
    'env_id',             # ID handed to gym.make
    'population_size',    # minimum number of episodes gathered per generation
    'timesteps_per_gen',  # minimum number of timesteps gathered per generation
    'num_workers',        # number of worker processes started by the master
    'learning_rate',      # scales the update step when no gradient optimizer is used
    'noise_stdev',        # scales the parameter noise sampled from the noise table
    'snapshot_freq',      # presumably the snapshot interval in generations - usage not visible here
    'return_proc_mode',   # one of the RETURN_PROC_MODE_* constants
    'calc_obstat_prob',   # probability that a rollout records observations for the statistics
    'l2coeff',            # weight decay coefficient
    'eval_prob',          # probability that a worker runs an evaluation rollout without noise
))

# Switches for the optional extensions of the algorithm.
Optimizations = namedtuple('Optimizations', (
    'mirrored_sampling',
    'fitness_shaping',
    'weight_decay',
    'discretize_actions',
    'gradient_optimizer',
    'observation_normalization',
    'divide_by_stdev',
))

# Description of the policy network and of the optional gradient optimizer.
ModelStructure = namedtuple('ModelStructure', (
    'ac_noise_std',    # standard deviation of the noise added to the actions
    'ac_bins',         # number of bins per action dimension when actions are discretized
    'hidden_dims',     # list with the number of units per hidden layer
    'nonlin_type',     # name of the activation function of the hidden layers
    'optimizer',       # OPTIMIZER_ADAM or OPTIMIZER_SGD
    'optimizer_args',  # keyword arguments for the optimizer constructor
))

# Work package published by the master: parameters plus observation statistics.
Task = namedtuple('Task', ('theta', 'ob_mean', 'ob_std', 'task_id'))

# Answer of a worker. Evaluation results only fill the eval_* fields,
# training results fill the remaining ones.
Result = namedtuple('Result', (
    'noise_inds', 'returns', 'signreturns', 'lengths',
    'eval_return', 'eval_length',
    'ob_sum', 'ob_sumsq', 'ob_count',
    'task_id',
    'times_predict',
))

In [None]:
# Which optional extensions of the algorithm are switched on for this run.
optimizations = Optimizations(**{
    # Evaluate every sampled noise vector with positive and negative sign
    'mirrored_sampling': True,
    # Post-process the returns according to config.return_proc_mode
    'fitness_shaping': True,
    # Apply config.l2coeff during the parameter update
    'weight_decay': True,
    # Let the network output discrete action bins instead of continuous actions
    'discretize_actions': False,
    # Use the optimizer from model_structure instead of the plain update
    'gradient_optimizer': False,
    # Normalize observations with running mean/std inside the network
    'observation_normalization': True,
    # Divide the approximated gradient by config.noise_stdev
    'divide_by_stdev': False,
})

In [None]:
# Supported ways of processing the returns when fitness shaping is enabled
RETURN_PROC_MODE_CR = 'centered_rank'
RETURN_PROC_MODE_SIGN = 'sign'
RETURN_PROC_MODE_CR_SIGN = 'centered_sign_rank'

config = Config(
    env_id="RoboschoolAnt-v1",
    population_size=10,
    timesteps_per_gen=10000,
    num_workers=os.cpu_count(),
    learning_rate=0.001,
    noise_stdev=0.02,
    snapshot_freq=5,
    return_proc_mode=RETURN_PROC_MODE_CR,
    calc_obstat_prob=0.01,
    l2coeff=0.005,
    eval_prob=0.003
)

try:
    env = gym.make(config.env_id)
except Exception:
    # Print the hint and then fail right here. Continuing would only postpone the
    # failure to the next statement, which needs `env` (NameError). Catching
    # `Exception` instead of using a bare `except` keeps KeyboardInterrupt working.
    print("Please provide a valid environment ID for the OpenAI Gym. {} is not valid.".format(config.env_id))
    raise

# These are used inside create_model for the input and output dimensions
ob_space = env.observation_space
ac_space = env.action_space

# Sanity checks of the configuration
assert config.population_size > 0
assert config.num_workers > 0
assert config.learning_rate > 0
assert config.noise_stdev != 0
assert config.eval_prob >= 0

if config.return_proc_mode not in (RETURN_PROC_MODE_CR,
                                   RETURN_PROC_MODE_SIGN,
                                   RETURN_PROC_MODE_CR_SIGN):
    raise NotImplementedError

if optimizations.observation_normalization:
    # Without a positive probability no observation statistics would ever be collected
    assert config.calc_obstat_prob > 0

if optimizations.gradient_optimizer:
    assert config.l2coeff > 0

In [None]:
# Supported gradient optimizers (only used when optimizations.gradient_optimizer is set)
OPTIMIZER_ADAM = 'adam'
OPTIMIZER_SGD = 'sgd'

model_structure = ModelStructure(
    ac_noise_std=0.01,
    ac_bins=5,
    hidden_dims=[256, 256],
    nonlin_type='tanh',
    optimizer=OPTIMIZER_ADAM,
    optimizer_args={
        'stepsize': config.learning_rate
    }
)

# Sanity checks of the model description
assert model_structure.ac_noise_std >= 0
assert isinstance(model_structure.hidden_dims, list)
assert all(hd >= 0 for hd in model_structure.hidden_dims)

if optimizations.gradient_optimizer:
    if model_structure.optimizer not in (OPTIMIZER_ADAM, OPTIMIZER_SGD):
        raise NotImplementedError

    try:
        stepsize = model_structure.optimizer_args['stepsize']
    except KeyError:
        # Print the hint and fail here. The optimizer constructors require `stepsize`,
        # so continuing would only move the failure to the start of the training.
        print("Please provide the stepsize parameter.")
        raise
    assert stepsize > 0

if optimizations.discretize_actions:
    assert model_structure.ac_bins > 0

In [None]:
def save_configuration(save_directory):
    """Dump the module-level configuration tuples to ``config.json`` in ``save_directory``.

    The file contains the three sections ``config``, ``model_structure`` and
    ``optimizations`` in this order.
    """
    target_file = os.path.join(save_directory, 'config.json')
    with open(target_file, 'w', encoding='utf-8') as f:
        sections = OrderedDict()
        sections['config'] = config._asdict()
        sections['model_structure'] = model_structure._asdict()
        sections['optimizations'] = optimizations._asdict()

        json.dump(sections, f, ensure_ascii=False, indent=4)

In [None]:
def create_model(initial_weights=None, model_name="model", ob_mean=None, ob_std=None):
    """Build the Keras policy network that maps observations to actions.

    Input and output sizes are read from the module-level ``env``. The hidden layers,
    the activation function and the optional layers are controlled by the module-level
    ``model_structure`` and ``optimizations``.

    Args:
        initial_weights: Optional flat parameter vector. When given it is written into
            the freshly built model with ``set_from_flat``.
        model_name: Name passed to ``tf.keras.Model``.
        ob_mean: Optional observation mean for the normalization layer.
        ob_std: Optional observation standard deviation for the normalization layer.

    Returns:
        The constructed ``tf.keras.Model``.
    """
    # Local import kept from the original notebook (TensorFlow is also imported at the top).
    import tensorflow as tf

    class Normc_initializer(tf.keras.initializers.Initializer):
        """Random normal initializer whose columns are rescaled to the norm ``std``."""

        def __init__(self, std=1.0):
            self.std = std

        def __call__(self, shape, dtype=None, partition_info=None):
            out = np.random.randn(*shape).astype(np.float32)
            # Normalize along axis 0 so every column ends up with norm `std`
            out *= self.std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
            return tf.constant(out)

    class ObservationNormalizationLayer(tf.keras.layers.Layer):
        """Standardizes observations with a fixed mean/std and clips them to [-5, 5]."""

        def __init__(self, ob_mean, ob_std, **kwargs):
            self.ob_mean = ob_mean
            self.ob_std = ob_std
            super(ObservationNormalizationLayer, self).__init__(**kwargs)

        def call(self, x):
            return tf.clip_by_value((x - self.ob_mean) / self.ob_std, -5.0, 5.0)

        # get_config and from_config need to be implemented to be able to serialize the model
        def get_config(self):
            base_config = super(ObservationNormalizationLayer, self).get_config()
            base_config['ob_mean'] = self.ob_mean
            base_config['ob_std'] = self.ob_std
            return base_config

        @classmethod
        def from_config(cls, config):
            return cls(**config)

    class DiscretizeActionsUniformLayer(tf.keras.layers.Layer):
        """Picks the best bin per action dimension and maps it uniformly to [alow, ahigh]."""

        def __init__(self, num_ac_bins, adim, ahigh, alow, **kwargs):
            self.num_ac_bins = num_ac_bins
            self.adim = adim
            # ahigh and alow are NumPy arrays when they come from the environment, but plain
            # lists when the model is loaded from an h5 file. Lists do not support the
            # subtraction used in call(), therefore cast them explicitly.
            self.ahigh = np.array(ahigh)
            self.alow = np.array(alow)
            super(DiscretizeActionsUniformLayer, self).__init__(**kwargs)

        def call(self, x):
            # Reshape to [n x i x j] where n is chosen dynamically, i equals the action
            # dimension and j equals the number of bins
            scores_nab = tf.reshape(x, [-1, self.adim, self.num_ac_bins])
            # Pick the bin with the greatest value
            a = tf.argmax(scores_nab, 2)

            # Map the bin index from [0, num_ac_bins - 1] to the interval [alow, ahigh]
            ac_range_1a = (self.ahigh - self.alow)[None, :]
            return 1. / (self.num_ac_bins - 1.) * tf.keras.backend.cast(a, 'float32') * ac_range_1a + self.alow[None, :]

        # get_config and from_config need to be implemented to be able to serialize the model
        def get_config(self):
            base_config = super(DiscretizeActionsUniformLayer, self).get_config()
            base_config['num_ac_bins'] = self.num_ac_bins
            base_config['adim'] = self.adim
            base_config['ahigh'] = self.ahigh
            base_config['alow'] = self.alow
            return base_config

        @classmethod
        def from_config(cls, config):
            return cls(**config)

    ac_space = env.action_space
    ob_space = env.observation_space

    # Unknown activation names fall back to tanh, exactly as before.
    # Fix: 'elu' used to be mapped to tf.nn.leaky_relu by mistake.
    nonlinearities = {
        'relu': tf.nn.relu,
        'lrelu': tf.nn.leaky_relu,
        'elu': tf.nn.elu,
    }
    nonlin = nonlinearities.get(model_structure.nonlin_type, tf.nn.tanh)

    # Policy network
    input_layer = x = tf.keras.Input(ob_space.shape, dtype=tf.float32)

    if ob_mean is not None and ob_std is not None and optimizations.observation_normalization:
        # Only normalize when no entry of ob_std is zero (the layer divides by ob_std)
        if ob_std.all() != 0:
            x = ObservationNormalizationLayer(ob_mean, ob_std)(x)

    for hd in model_structure.hidden_dims:
        x = tf.keras.layers.Dense(
            hd, activation=nonlin,
            kernel_initializer=Normc_initializer(std=1.0),
            bias_initializer=tf.initializers.zeros())(x)

    # Action dimension and the lowest and highest possible values for an action
    adim, ahigh, alow = ac_space.shape[0], ac_space.high, ac_space.low

    if optimizations.discretize_actions:
        # One score per (action dimension, bin), followed by the discretization layer
        num_ac_bins = int(model_structure.ac_bins)
        x = tf.keras.layers.Dense(
            adim * num_ac_bins,
            kernel_initializer=Normc_initializer(std=0.01),
            bias_initializer=tf.initializers.zeros())(x)
        a = DiscretizeActionsUniformLayer(num_ac_bins, adim, ahigh, alow)(x)
    else:
        # Continuous actions: one linear output unit per action dimension
        a = tf.keras.layers.Dense(
            adim,
            kernel_initializer=Normc_initializer(std=0.01),
            bias_initializer=tf.initializers.zeros())(x)

    model = tf.keras.Model(inputs=input_layer, outputs=a, name=model_name)

    if initial_weights is not None:
        set_from_flat(model, initial_weights)

    return model

In [None]:
def act(ob, model, random_stream=None):
    """Query the model for an action and measure how long the prediction took.

    Args:
        ob: Batch of observations passed to ``model.predict_on_batch``.
        model: Object offering ``predict_on_batch``.
        random_stream: Optional random state. When given (and the configured action noise
            is non-zero) Gaussian noise scaled by ``model_structure.ac_noise_std`` is
            added to the action in place.

    Returns:
        Tuple ``(action, seconds_spent_in_predict)``.
    """
    started_at = time.time()
    action = model.predict_on_batch(ob)
    prediction_time = time.time() - started_at

    noise_requested = random_stream is not None and model_structure.ac_noise_std != 0
    if noise_requested:
        action += random_stream.randn(*action.shape) * model_structure.ac_noise_std
    return action, prediction_time

In [None]:
class RunningStat(object):
    """Accumulates sum, sum of squares and count to derive mean and standard deviation."""

    def __init__(self, shape, eps):
        # The count and the sum of squares both start at eps, the sum starts at zero
        self.count = eps
        self.sum = np.zeros(shape, dtype=np.float32)
        self.sumsq = np.full(shape, eps, dtype=np.float32)

    def increment(self, s, ssq, c):
        """Add a batch given by its sum ``s``, sum of squares ``ssq`` and count ``c``."""
        self.sum += s
        self.sumsq += ssq
        self.count += c

    @property
    def mean(self):
        """Current mean of everything accumulated so far."""
        return self.sum / self.count

    @property
    def std(self):
        """Current standard deviation; the variance is bounded below by 1e-2."""
        mean_of_squares = self.sumsq / self.count
        variance = mean_of_squares - np.square(self.mean)
        return np.sqrt(np.maximum(variance, 1e-2))

In [None]:
def get_initial_weights(ob_mean=None, ob_std=None):
    """Build a fresh model, print its summary and return its weights.

    Args:
        ob_mean: Optional observation mean forwarded to ``create_model``.
        ob_std: Optional observation standard deviation forwarded to ``create_model``.

    Returns:
        The list returned by ``model.get_weights()``.
    """
    fresh_model = create_model(ob_mean=ob_mean, ob_std=ob_std)

    # Show the architecture once
    fresh_model.summary()

    return fresh_model.get_weights()

def initialize_parameter_vector():
    """Create the initial weights in a one-process pool and count the parameters.

    Returns:
        Tuple ``(theta, num_params)`` where ``theta`` is the list of weight arrays
        produced by ``get_initial_weights`` and ``num_params`` the total number of entries.
    """
    if optimizations.observation_normalization:
        initial_ob_stat = RunningStat(
            env.observation_space.shape,
            eps=1e-2  # eps to prevent dividing by zero at the beginning when computing mean/stdev
        )
        worker_args = (initial_ob_stat.mean, initial_ob_stat.std)
    else:
        worker_args = ()

    # The model is built inside a pool process rather than in the calling process
    with multiprocessing.Pool(1) as pool:
        theta = pool.apply(func=get_initial_weights, args=worker_args)

    num_params = sum(np.prod(weights.shape) for weights in theta)
    return theta, num_params

In [None]:
class Optimizer(object):
    """Base class of the gradient optimizers; subclasses implement ``_compute_step``."""

    def __init__(self, num_params):
        self.dim = num_params
        # Number of updates performed so far
        self.t = 0

    def update(self, theta, globalg):
        """Apply one update and return ``(new_theta, update_ratio)``.

        ``update_ratio`` is the norm of the step divided by the norm of ``theta``.
        """
        self.t += 1
        delta = self._compute_step(globalg)
        update_ratio = np.linalg.norm(delta) / np.linalg.norm(theta)
        return theta + delta, update_ratio

    def _compute_step(self, globalg):
        """Return the step for the gradient ``globalg``; must be provided by subclasses."""
        raise NotImplementedError

class SGD(Optimizer):
    """Gradient descent with an exponential moving average of the gradient (momentum)."""

    def __init__(self, num_params, stepsize, momentum=0.9):
        super(SGD, self).__init__(num_params)
        self.stepsize = stepsize
        self.momentum = momentum
        # Moving average of the gradients seen so far
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
        """Update the moving average and return the step against its direction."""
        self.v = self.momentum * self.v + (1. - self.momentum) * globalg
        return -self.stepsize * self.v
        
class Adam(Optimizer):
    """Adam optimizer working on a flat parameter vector."""

    def __init__(self, num_params, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
        super(Adam, self).__init__(num_params)
        self.stepsize, self.epsilon = stepsize, epsilon
        self.beta1, self.beta2 = beta1, beta2
        # Moving averages of the gradient and of the squared gradient
        self.m = np.zeros(self.dim, dtype=np.float32)
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
        """Update both moving averages and return the bias corrected step."""
        # Step size including the bias correction for the current update count self.t
        corrected_stepsize = self.stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
        return -corrected_stepsize * self.m / (np.sqrt(self.v) + self.epsilon)

In [None]:
class SharedNoiseTable(object):
    """Fixed table of Gaussian noise stored in a ``multiprocessing.Array``."""

    def __init__(self, seed=123):
        self.seed = seed
        table_size = 250000
        print('Sampling {} random numbers with seed {}'.format(table_size, self.seed))

        # Array of C floats with table_size entries, wrapped as a NumPy array
        self._shared_mem = multiprocessing.Array(ctypes.c_float, table_size)
        self.noise = np.ctypeslib.as_array(self._shared_mem.get_obj())
        assert self.noise.dtype == np.float32

        # Fill the table in place with noise drawn from a generator seeded with `seed`
        generator = np.random.RandomState(seed)
        self.noise[:] = generator.randn(table_size)
        print('Sampled {} bytes'.format(self.noise.size * 4))

    def get(self, i, dim):
        """Return the slice of ``dim`` noise values starting at index ``i``."""
        end = i + dim
        return self.noise[i:end]

    def sample_index(self, stream, dim):
        """Draw a start index from ``stream`` so that a slice of length ``dim`` fits."""
        exclusive_upper_bound = len(self.noise) - dim + 1
        return stream.randint(0, exclusive_upper_bound)

In [None]:
def get_flat(theta):
    """Concatenate all arrays of ``theta`` into a single one-dimensional vector."""
    flattened_parts = [np.reshape(weights, [-1]) for weights in theta]
    return np.concatenate(flattened_parts, 0)

def set_from_flat(model, theta):
    """Load the flat parameter vector ``theta`` into ``model``.

    The vector is cut into consecutive pieces which are reshaped to the shapes of the
    model's current weights (in the order of ``model.get_weights()``).

    Args:
        model: Object offering ``get_weights`` and ``set_weights``.
        theta: One-dimensional NumPy array holding all parameters.

    Raises:
        AssertionError: If ``theta`` has more entries than the model has parameters.
        ValueError: Raised by ``np.reshape`` if ``theta`` has too few entries.
    """
    shapes = [weights.shape for weights in model.get_weights()]
    total_size = theta.size

    start = 0
    reshapes = []

    # Iterate over the shapes only. The former `zip(shapes, theta)` paired every shape
    # with a scalar entry of the flat vector (which was never used) and stopped early
    # when the vector had fewer entries than the model has weight tensors.
    for shape in shapes:
        size = int(np.prod(shape))
        reshapes.append(np.reshape(theta[start:start + size], shape))
        start += size

    # Every entry of theta must have been consumed
    assert start == total_size
    model.set_weights(reshapes)

In [None]:
def rollout(env,
            model,
            *,
            render=False,
            timestep_limit=None,
            save_obs=False,
            random_stream=None):
    """Run one episode in ``env`` with actions computed by ``model``.

    If random_stream is provided, the rollout will take noisy actions with noise drawn
    from that stream. Otherwise, no action noise will be added.

    Args:
        env: Gym environment; its registered ``max_episode_steps`` tag caps the episode.
        model: Policy model handed to ``act``.
        render: Call ``env.render()`` after every step.
        timestep_limit: Optional additional cap on the number of steps.
        save_obs: Also return the observations the actions were computed from.
        random_stream: Optional random state for the action noise.

    Returns:
        ``(rewards, steps, times_predict)`` or, with ``save_obs``,
        ``(rewards, steps, observations, times_predict)``. ``rewards`` is a float32 array,
        ``times_predict`` the list of prediction durations reported by ``act``.

    Raises:
        AssertionError: Propagated from ``env.step``; it is thrown for example when the
            action contains at least one NaN entry.
    """
    env_timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    # Combine both limits. Either of them may be missing; the old code crashed with a
    # TypeError when the environment had no registered limit.
    if timestep_limit is None:
        timestep_limit = env_timestep_limit
    elif env_timestep_limit is not None:
        timestep_limit = min(timestep_limit, env_timestep_limit)

    rews = []
    times_predict = []
    obs = []
    t = 0
    ob = env.reset()
    # Without any limit the episode runs until the environment reports `done`
    while timestep_limit is None or t < timestep_limit:
        ac, time_predict = act(ob[None], model, random_stream=random_stream)
        ac = ac[0]
        times_predict.append(time_predict)
        if save_obs:
            obs.append(ob)
        ob, rew, done, _ = env.step(ac)
        rews.append(rew)
        t += 1
        if render:
            env.render()
        if done:
            break
    rews = np.array(rews, dtype=np.float32)
    if save_obs:
        return rews, t, np.array(obs), times_predict
    return rews, t, times_predict

In [None]:
def rollout_and_update_ob_stat(env, model, rs, task_ob_stat):
    """Run a noisy rollout and occasionally feed its observations into ``task_ob_stat``.

    With observation normalization enabled, the observations of the rollout are recorded
    with probability ``config.calc_obstat_prob`` (decided with ``rs``) and added to
    ``task_ob_stat``.

    Returns:
        ``(rewards, length, times_predict)`` as produced by ``rollout``.
    """
    # The random number is only drawn when both preceding conditions hold
    record_observations = (optimizations.observation_normalization
                           and config.calc_obstat_prob != 0
                           and rs.rand() < config.calc_obstat_prob)

    if not record_observations:
        return rollout(env, model, random_stream=rs)

    rewards, length, observations, prediction_times = rollout(
        env, model, save_obs=True, random_stream=rs)
    task_ob_stat.increment(
        observations.sum(axis=0), np.square(observations).sum(axis=0), len(observations))
    return rewards, length, prediction_times

In [None]:
def run_worker(task_list, result_queue, stop_work, noise, num_params):
    """Worker loop: evaluate the newest task until ``stop_work`` is set.

    In every iteration the worker reads the last entry of ``task_list``, rebuilds its
    model when the task id changed and then either runs one evaluation rollout with the
    unperturbed parameters (probability ``config.eval_prob``) or one noisy rollout
    (two with mirrored sampling). Each outcome is put into ``result_queue`` as a
    ``Result``. If a rollout raises an ``AssertionError`` the worker puts ``None``
    into the queue and returns.

    Args:
        task_list: Shared list of ``Task`` objects; only the last entry is used.
        result_queue: Queue receiving ``Result`` objects (or ``None`` on failure).
        stop_work: Shared value; a truthy ``.value`` ends the loop.
        noise: ``SharedNoiseTable`` the parameter noise is read from.
        num_params: Length of the flat parameter vector.
    """
    from tensorflow.keras import backend as K
    import tensorflow as tf
    
    print("PID {}: Started worker".format(os.getpid()))
    
    assert isinstance(noise, SharedNoiseTable)

    # Setup
    # Create a new gym environment object because each worker needs its own one
    env = gym.make(config.env_id)
        
    # Random stream used for adding noise to the actions as well as deciding if the observation statistics shall be
    # updated
    rs = np.random.RandomState()
    
    # Waiting time in seconds while the task list is still empty (doubled on every retry)
    wait_time = 1
    
    cached_task = None
    cached_task_id = -1
    model = None
    
    while not bool(stop_work.value):
        # Get the latest Task from the Manager list
        try:
            task = task_list[-1]
        except IndexError:
            if wait_time > 100:
                print("The task list does not get tasks, something went wrong in the Master. Aborting.")
                break
            # NOTE(review): the message shows the value before it is doubled, so the actual sleep is twice
            # as long as printed
            print("Task list is empty, waiting {} seconds before trying again".format(wait_time))
            wait_time *= 2
            time.sleep(wait_time)
            continue
    
        assert isinstance(task, Task)
        task_id = task.task_id
        assert isinstance(task_id, int)
        
        # A new task arrived: drop the old session and rebuild the model with the new parameters and
        # observation statistics
        if task_id != cached_task_id:
            cached_task = task
            cached_task_id = task_id
        
            K.clear_session()
            K.set_session(tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)))
            model = create_model(initial_weights=cached_task.theta, 
                             model_name=str(os.getpid()),
                             ob_mean=cached_task.ob_mean,
                             ob_std=cached_task.ob_std)
        
        if rs.rand() < config.eval_prob:
            # Evaluation sample: unperturbed parameters and no action noise (no random stream is passed)
            set_from_flat(model, cached_task.theta)
            try:
                eval_rews, eval_length, times_predict = rollout(env, model)
            except AssertionError:
                # Signal the failure to the master and terminate this worker
                result_queue.put(None)
                return
            
            result_queue.put(Result(
                noise_inds=None,
                returns=None,
                signreturns=None,
                lengths=None,
                eval_return=eval_rews.sum(),
                eval_length=eval_length,
                ob_sum=None,
                ob_sumsq=None,
                ob_count=None,
                task_id=cached_task_id,
                times_predict=times_predict
            ))
            
        else:
            task_ob_stat = RunningStat(env.observation_space.shape, eps=0.)  # eps=0 because we're incrementing only
            
            noise_inds, returns, signreturns, lengths = [], [], [], []
            times_predict = []
            
            # noise_inds is filled inside the body, so this loop runs exactly once per result
            while not noise_inds:

                # Noise sample
                noise_idx = noise.sample_index(rs, num_params)
                
                epsilon = config.noise_stdev * noise.get(noise_idx, num_params)
                
                # Evaluate the sampled noise
                set_from_flat(model, cached_task.theta + epsilon)
                
                try:
                    rews_pos, len_pos, times_predict_pos = rollout_and_update_ob_stat(env,
                                                                                      model,
                                                                                      rs=rs,
                                                                                      task_ob_stat=task_ob_stat)
                except AssertionError:
                    # Signal the failure to the master and terminate this worker
                    result_queue.put(None)
                    return
                
                # Gather results (one inner list per noise index; mirrored sampling appends a second entry)
                noise_inds.append(noise_idx)
                returns.append([rews_pos.sum()])
                signreturns.append([np.sign(rews_pos).sum()])
                lengths.append([len_pos])
                
                times_predict += times_predict_pos

                # Mirrored sampling also evaluates the noise by subtracting it
                if optimizations.mirrored_sampling:
                    set_from_flat(model, cached_task.theta - epsilon)
                    
                    try:
                        rews_neg, len_neg, times_predict_neg = rollout_and_update_ob_stat(env,
                                                                                          model, 
                                                                                          rs=rs, 
                                                                                          task_ob_stat=task_ob_stat)  
                    except AssertionError:
                        # Signal the failure to the master and terminate this worker
                        result_queue.put(None)
                        return

                    returns[-1].append(rews_neg.sum())
                    signreturns[-1].append(np.sign(rews_neg).sum())
                    lengths[-1].append(len_neg)
                    
                    times_predict += times_predict_neg

            
            # The observation sums are only sent when at least one rollout recorded observations
            result_queue.put(Result(
                noise_inds=np.array(noise_inds),
                returns=np.array(returns, dtype=np.float32),
                signreturns=np.array(signreturns, dtype=np.float32),
                lengths=np.array(lengths, dtype=np.int32),
                eval_return=None,
                eval_length=None,
                ob_sum=None if task_ob_stat.count == 0 else task_ob_stat.sum,
                ob_sumsq=None if task_ob_stat.count == 0 else task_ob_stat.sumsq,
                ob_count=task_ob_stat.count,
                task_id=cached_task_id,
                times_predict=times_predict
            ))

In [None]:
def itergroups(items, group_size):
    """Yield the elements of ``items`` as tuples of ``group_size`` entries.

    The last tuple is shorter when the number of elements is not a multiple of
    ``group_size``; nothing is yielded for an empty input.
    """
    assert group_size >= 1
    pending = []
    for item in items:
        pending.append(item)
        if len(pending) < group_size:
            continue
        yield tuple(pending)
        pending = []
    # Hand out the incomplete rest, if any
    if pending:
        yield tuple(pending)
        
def batched_weighted_sum(weights, vecs, batch_size):
    """Compute ``sum_i weights[i] * vecs[i]`` in batches of at most ``batch_size`` items.

    Returns:
        Tuple ``(weighted_sum, number_of_items_summed)``.
    """
    total, num_items_summed = 0., 0
    weight_batches = itergroups(weights, batch_size)
    vec_batches = itergroups(vecs, batch_size)

    for weight_chunk, vec_chunk in zip(weight_batches, vec_batches):
        assert len(weight_chunk) == len(vec_chunk) <= batch_size
        weight_array = np.asarray(weight_chunk, dtype=np.float32)
        vec_array = np.asarray(vec_chunk, dtype=np.float32)
        # One dot product handles the whole batch
        total += np.dot(weight_array, vec_array)
        num_items_summed += len(weight_chunk)

    return total, num_items_summed

In [None]:
def compute_ranks(x):
    """
    Returns the rank of every entry of the one-dimensional array x, in [0, len(x)).
    Note: This is different from scipy.stats.rankdata, which returns ranks in [1, len(x)].
    """
    assert x.ndim == 1
    n = len(x)
    ascending_order = x.argsort()
    ranks = np.empty(n, dtype=int)
    # The element at sorted position k receives rank k
    ranks[ascending_order] = np.arange(n)
    return ranks


def compute_centered_ranks(x):
    """Replace every entry of ``x`` by its rank, scaled and shifted to [-0.5, 0.5].

    The ranks are computed over all entries of ``x``; the result is a float32 array
    with the shape of ``x``.
    """
    flat_ranks = compute_ranks(x.ravel())
    centered = flat_ranks.reshape(x.shape).astype(np.float32)
    # Scale to [0, 1], then shift to [-0.5, 0.5]
    centered /= (x.size - 1)
    centered -= .5
    return centered

In [None]:
def save_model(save_directory, stop_work, save_tasks_queue):
    """Saving loop: store a model snapshot for every task taken from ``save_tasks_queue``.

    Args:
        save_directory: Folder receiving the ``snapshot_<task_id>.h5`` files.
        stop_work: Shared value; a truthy ``.value`` ends the loop before the next task
            is fetched.
        save_tasks_queue: Joinable queue of ``Task`` objects to be saved.
    """
    import tensorflow as tf
    from tensorflow.keras import backend as K
    print("PID {}: Started saving process".format(os.getpid()))
    while not bool(stop_work.value):
        # Blocks until the next task is available
        save_task = save_tasks_queue.get()
        assert isinstance(save_task, Task)

        # We are creating models in a loop therefore we need to clear the session to avoid build up
        K.clear_session()
        snapshot_model_name = config.env_id + "_Generation_" + str(save_task.task_id)
        model = create_model(initial_weights=save_task.theta,
                             model_name=snapshot_model_name,
                             ob_mean=save_task.ob_mean,
                             ob_std=save_task.ob_std)

        snapshot_file = "snapshot_{:05d}.h5".format(save_task.task_id)
        model.save(os.path.join(save_directory, snapshot_file))
        save_tasks_queue.task_done()

In [None]:
def run_master(max_timesteps=np.inf, seed=123):
    save_directory = os.path.join(main_directory, time.strftime("%Y%m%dT%H%M%S", time.localtime(time.time())))
    mkdir_p(save_directory)
    save_configuration(save_directory)

    rs = np.random.RandomState()

    noise = SharedNoiseTable(seed)

    theta, num_params = initialize_parameter_vector()
    theta = get_flat(theta)

    if optimizations.gradient_optimizer:
        if model_structure.optimizer == OPTIMIZER_ADAM:
            optimizer = Adam(int(num_params), **model_structure.optimizer_args)
        elif model_structure.optimizer == OPTIMIZER_SGD:
            optimizer = SGD(int(num_params), **model_structure.optimizer_args)
        else:
            raise NotImplementedError

    manager = multiprocessing.Manager()
    task_list = manager.list()
    result_queue = multiprocessing.Queue()
    save_tasks_queue = multiprocessing.JoinableQueue()
    stop_work = multiprocessing.Value('i', 0, lock=False)

    # Start workers
    workers = []
    for _ in range(config.num_workers):
        worker = multiprocessing.Process(target=run_worker, args=(task_list, result_queue, stop_work, noise, num_params))
        workers.append(worker)
        worker.start()
        
    save_process = multiprocessing.Process(target=save_model, args=(save_directory, stop_work, save_tasks_queue))
    save_process.start()

    episodes_so_far = 0
    timesteps_so_far = 0
    generations = 0
    tstart = time.time()

    # Only used with observation_normalization optimization
    ob_stat = RunningStat(
        env.observation_space.shape,
        eps=1e-2  # eps to prevent dividing by zero at the beginning when computing mean/stdev
        )

    generation_log = OrderedDict()
    generation_log_file = os.path.join(save_directory, 'log.csv')
    fieldnames = [
        'Generation',
        'GenRewMean', 'GenRewStd', 'GenLenMean', 
        'EvalGenRewardMean', 'EvalGenRewardStd', 'EvalGenLengthMean', 'EvalGenCount',
        'EpisodesThisGen', 'EpisodesSoFar', 'TimestepsThisGen', 'TimestepsSoFar',
        'UniqueWorkers', 'ResultsSkippedFrac', 'ObCount',
        'TimeElapsedThisGen', 'TimeElapsed',
        'TimePredictMin', 'TimePredictMax', 'TimePredictMean', 'TimePredictCount']

    with open(generation_log_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

    while timesteps_so_far < max_timesteps:
        step_tstart = time.time()

        task_list.append(Task(
            theta=theta,
            ob_mean=ob_stat.mean if optimizations.observation_normalization else None,
            ob_std=ob_stat.std if optimizations.observation_normalization else None,
            task_id=generations
        ))

        print("---------------- Generation: {}----------------".format(generations))

        assert theta.dtype == np.float32

        curr_task_results, eval_returns, eval_lengths = [], [], []
        num_results_skipped, num_episodes_popped, num_timesteps_popped, ob_count_this_gen = 0, 0, 0, 0

        times_predict = []
        
        stop_training = False

        print("PID {}: Waiting for results".format(os.getpid()))

        while num_episodes_popped < config.population_size or num_timesteps_popped < config.timesteps_per_gen:
            result = result_queue.get()
            
            if result is None:
                print("Stopping training. The model produced non finite numbers inside the action vector. Try a"
                      + " different configuration.")
                stop_training = True
                break
            
            assert isinstance(result, Result)
            task_id = result.task_id
            assert isinstance(task_id, int)

            assert (result.eval_return is None) == (result.eval_length is None)

            if result.eval_length is not None:
                # The result was an evaluation job therefore do not collect the result only the evaluation
                if task_id == generations:
                    eval_returns.append(result.eval_return)
                    eval_lengths.append(result.eval_length)
                    times_predict += result.times_predict
            else:
                assert result.noise_inds.ndim == 1 and result.returns.dtype == np.float32

                if optimizations.mirrored_sampling:
                    assert result.returns.shape == result.lengths.shape == (len(result.noise_inds), 2)
                else:
                    assert result.returns.shape == result.lengths.shape == (len(result.noise_inds), 1)

                if task_id == generations:
                    curr_task_results.append(result)
                                    
                    # Update counts
                    result_num_eps = result.lengths.size
                    result_num_timesteps = result.lengths.sum()
                    episodes_so_far += result_num_eps
                    timesteps_so_far += result_num_timesteps
                    
                    num_episodes_popped += result_num_eps
                    num_timesteps_popped += result_num_timesteps

                    # Update observation stats if the optimization is used
                    if optimizations.observation_normalization and result.ob_count > 0:
                        ob_stat.increment(result.ob_sum, result.ob_sumsq, result.ob_count)
                        ob_count_this_gen += result.ob_count
                        
                    times_predict += result.times_predict
                else:
                    num_results_skipped += 1

        if stop_training:
            break
            
        print("Gathered results")

        # Compute skip fraction
        frac_results_skipped = num_results_skipped / (num_results_skipped + len(curr_task_results))
        if num_results_skipped > 0:
            print("Skipped {} out of date results ({:.2f}%)".format(
                num_results_skipped, 100. * frac_results_skipped))

        # Assemble results
        noise_inds = np.concatenate([r.noise_inds for r in curr_task_results])
        returns = np.concatenate([r.returns for r in curr_task_results])
        lengths = np.concatenate([r.lengths for r in curr_task_results])
        assert noise_inds.shape[0] == returns.shape[0] == lengths.shape[0]

        # If fitness shaping is turned on rank the results
        if optimizations.fitness_shaping:
            if config.return_proc_mode == RETURN_PROC_MODE_CR:
                proc_returns = compute_centered_ranks(returns)
            # sign and centered_sign_rank are obviously only useful in combination with mirrored sampling
            elif config.return_proc_mode == RETURN_PROC_MODE_SIGN:
                proc_returns = np.concatenate([r.signreturns for r in curr_task_results])
            elif config.return_proc_mode == RETURN_PROC_MODE_CR_SIGN:
                proc_returns = compute_centered_ranks(np.concatenate([r.signreturns for r in curr_task_results]))
            else:
                # Throw error to indicate the false input instead of silently pass on.
                # This should have been already catched in the configuration section, so this here is a misconfiguration.
                raise NotImplementedError
        else:
            proc_returns = returns

        # Mirrored sampling returns a 2D numpy array therefore we need to preprocess it accordingly
        if optimizations.mirrored_sampling:
            # Calculates the difference between the rewards sampled with the positive and negative noise
            proc_returns = proc_returns[:, 0] - proc_returns[:, 1]
        else:
            proc_returns = proc_returns.ravel()

        # Calculate the approximated gradient with a batch variant which saves time on large vectors
        g, count = batched_weighted_sum(
            proc_returns,
            (noise.get(idx, num_params) for idx in noise_inds),
            batch_size=500
        )

        assert g.shape == (num_params,) and g.dtype == np.float32 and count == len(noise_inds)
        
        # Update with the approximated gradient
        g /= returns.size
        
        if optimizations.divide_by_stdev:
            g /= config.noise_stdev
            
        if optimizations.gradient_optimizer:
            step = -g
            if optimizations.weight_decay:
                step += config.l2coeff * theta
            theta, _ = optimizer.update(theta, step)
        else:
            step = g * config.learning_rate
            if optimizations.weight_decay:
                step *= config.l2coeff
            theta += step
        
        step_tend = time.time()

        # Log the generation and print to stdout
        generation_log['Generation'] = generations

        generation_log['GenRewMean'] = returns.mean()
        generation_log['GenRewStd'] = returns.std()
        generation_log['GenLenMean'] = lengths.mean()

        generation_log['EvalGenRewardMean'] = np.nan if not eval_returns else np.mean(eval_returns)
        generation_log['EvalGenRewardStd'] = np.nan if not eval_returns else np.std(eval_returns)
        generation_log['EvalGenLengthMean'] = np.nan if not eval_lengths else np.mean(eval_lengths)
        generation_log['EvalGenCount'] = len(eval_returns)

        generation_log['EpisodesThisGen'] = lengths.size
        generation_log['EpisodesSoFar'] = episodes_so_far
        generation_log['TimestepsThisGen'] = lengths.sum()
        generation_log['TimestepsSoFar'] = timesteps_so_far

        generation_log['UniqueWorkers'] = config.num_workers
        generation_log['ResultsSkippedFrac'] = frac_results_skipped
        generation_log['ObCount'] = ob_count_this_gen

        generation_log['TimeElapsedThisGen'] = step_tend - step_tstart
        generation_log['TimeElapsed'] = step_tend - tstart

        generation_log['TimePredictMin'] = np.amin(times_predict)
        generation_log['TimePredictMax'] = np.amax(times_predict)
        generation_log['TimePredictMean'] = np.mean(times_predict)
        generation_log['TimePredictCount'] = len(times_predict)

        for key, value in generation_log.items():
            print(f'{key:25} {value}')

        # Append the log the csv file
        with open(generation_log_file, 'a', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(generation_log)

        # Note that the model is created with a custom layer and custom initializer, and therefore needs these two
        # custom classes if one wants to load a saved model
        if config.snapshot_freq != 0 and generations % config.snapshot_freq == 0:
            
            save_tasks_queue.put(Task(
                theta=theta,
                ob_mean=ob_stat.mean if optimizations.observation_normalization else None,
                ob_std=ob_stat.std if optimizations.observation_normalization else None,
                task_id=generations
                ))

            print("Saved model in generation {} to {}".format(generations, save_directory))

        generations += 1

    # Quit the multiprocessing data structures and processes
    stop_work.value = 1

    result_queue.close()
    
    for w in workers:
        # Workers are blocking on empty queues and cannot be joined. When attempted they will try to join forever
        # Therefore we terminate the process. This is not crucial since we already saved everything for the last
        # generation.
        w.terminate()
        
    # Save tasks queue is a joinable queue, therefore we can gracefully join the queue
    save_tasks_queue.join()
    save_tasks_queue.close()

    # Like the worker processes a join would result in an indefinite block, since save_tasks_queue is closed and all
    # save jobs have been processed we can terminate the process
    save_process.terminate()

In [None]:
# Start the training run with the master loop defined in the cell above.
# NOTE(review): max_timesteps presumably caps the total number of environment timesteps and
# seed presumably fixes the random number generation - confirm against run_master's signature.
run_master(max_timesteps=170000, seed=123)

## generate validation data

The ANN is now trained. However, we also need some validation data to check that the final SNN works as expected.

The following cells read the latest ANN from disk and let it run once while recording its exact inputs and outputs. This data is then also stored on disk, right next to the ANN itself.

In [None]:
import csv
import json
import os
import re
import time

import gym
import roboschool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from gym import wrappers
from ipywidgets import Video
import ipywidgets as widgets
from multiprocessing import Pool, Process
from IPython.display import display

In [None]:
# %load load-model.py
def load_model(model_path):
    """Load a saved Keras model together with the custom classes it refers to.

    The custom initializer and the two custom layers are declared locally and
    handed to Keras by name through ``custom_objects``.

    Args:
        model_path: Path of the saved model, passed on to
            ``tf.keras.models.load_model``.

    Returns:
        The loaded model, or ``None`` when loading raised an ``OSError`` (the
        error is printed in that case).
    """
    # TensorFlow is imported lazily, i.e. only when a model is actually loaded.
    import tensorflow as tf

    class Normc_initializer(tf.keras.initializers.Initializer):
        """Gaussian initializer whose values are rescaled along axis 0 to an L2 norm of ``std``."""

        def __init__(self, std=1.0):
            self.std = std

        def __call__(self, shape, dtype=None, partition_info=None):
            weights = np.random.randn(*shape).astype(np.float32)
            norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
            weights *= self.std / norms
            return tf.constant(weights)

    class ObservationNormalizationLayer(tf.keras.layers.Layer):
        """Standardizes the input with fixed mean/std and clips the result to [-5, 5]."""

        def __init__(self, ob_mean, ob_std, **kwargs):
            # The statistics are stored first, then the Keras base class is initialised.
            self.ob_mean = ob_mean
            self.ob_std = ob_std
            super().__init__(**kwargs)

        def call(self, x):
            standardized = (x - self.ob_mean) / self.ob_std
            return tf.clip_by_value(standardized, -5.0, 5.0)

        def get_config(self):
            # Serialization hook: the base config is extended by the constructor arguments.
            layer_config = super().get_config()
            layer_config.update(ob_mean=self.ob_mean, ob_std=self.ob_std)
            return layer_config

        @classmethod
        def from_config(cls, config):
            # Counterpart of get_config: the stored entries are the constructor keywords.
            return cls(**config)

    class DiscretizeActionsUniformLayer(tf.keras.layers.Layer):
        """Turns per-bin scores into actions by selecting the highest scoring bin per action dimension."""

        def __init__(self, num_ac_bins, adim, ahigh, alow, **kwargs):
            self.num_ac_bins = num_ac_bins
            self.adim = adim
            # The bounds arrive as plain lists when the layer is rebuilt from a saved config;
            # converting them to arrays keeps the element-wise arithmetic in call() working.
            self.ahigh = np.array(ahigh)
            self.alow = np.array(alow)
            super().__init__(**kwargs)

        def call(self, x):
            # View the flat scores as [n, action dimension, bin], with n inferred dynamically.
            scores_nab = tf.reshape(x, [-1, self.adim, self.num_ac_bins])
            # Index of the best bin for every action dimension.
            best_bin = tf.argmax(scores_nab, 2)

            # Rescale the bin index from [0, num_ac_bins - 1] to the interval [alow, ahigh].
            ac_range_1a = (self.ahigh - self.alow)[None, :]
            bin_fraction = 1. / (self.num_ac_bins - 1.) * tf.keras.backend.cast(best_bin, 'float32')
            return bin_fraction * ac_range_1a + self.alow[None, :]

        def get_config(self):
            # Serialization hook: the base config is extended by the constructor arguments.
            layer_config = super().get_config()
            layer_config.update(
                num_ac_bins=self.num_ac_bins,
                adim=self.adim,
                ahigh=self.ahigh,
                alow=self.alow,
            )
            return layer_config

        @classmethod
        def from_config(cls, config):
            # Counterpart of get_config: the stored entries are the constructor keywords.
            return cls(**config)

    # Keras resolves the custom classes by their class name.
    custom_objects = {
        custom_class.__name__: custom_class
        for custom_class in (
            Normc_initializer,
            ObservationNormalizationLayer,
            DiscretizeActionsUniformLayer,
        )
    }

    try:
        return tf.keras.models.load_model(model_path, custom_objects=custom_objects)
    except OSError as load_error:
        print(load_error)
        return None

In [None]:
def rollout_evaluation(env, model, render=False, timestep_limit=None, random_stream=None,
                       output_directory=None):
    """
    Run one episode with ``model`` acting in ``env`` and store it as validation data.

    Every observation fed to the model and every corresponding model output is collected and
    written to ``x_test.npz`` and ``y_test.npz`` in the output directory.

    Args:
        env: Gym-style environment. ``env.spec.tags`` is consulted for the episode step limit.
        model: Object exposing ``predict_on_batch``; the first row of its output is used as action.
        render: When true, ``env.render()`` is called before every step.
        timestep_limit: Optional cap on the number of steps. The smaller of this value and the
            environment's own limit applies. If neither is set, the episode runs until the
            environment reports ``done``.
        random_stream: Accepted for signature compatibility only. It is not used, i.e. no
            action noise is added.
        output_directory: Directory the two files are written to. When omitted, the
            notebook-level variable ``save_directory`` is used.

    Returns:
        Tuple ``(rewards, length)``: a float32 array with the reward of every step and the
        number of steps taken.

    Raises:
        AssertionError: Propagated from ``env.step``, for example when the action contains NaN.
    """
    # Resolve the target directory up front so a missing one fails before the rollout is run.
    target_directory = save_directory if output_directory is None else output_directory

    env_timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')
    # Either limit may be absent; only the ones that are actually set take part in the minimum.
    known_limits = [limit for limit in (timestep_limit, env_timestep_limit) if limit is not None]
    max_steps = min(known_limits) if known_limits else None

    rews = []
    inputs = []
    results = []
    t = 0

    ob = env.reset()
    while max_steps is None or t < max_steps:
        if render:
            env.render()

        # The model expects a batch, hence the added leading axis.
        batch = ob[None]
        inputs.append(batch)
        x = model.predict_on_batch(batch)
        results.append(x)

        # An AssertionError raised here (e.g. NaN in the action) is left to the caller.
        ob, rew, done, _ = env.step(x[0])
        rews.append(rew)
        t += 1

        if done:
            break

    x_test = np.concatenate(inputs)
    y_test = np.concatenate(results)
    np.savez_compressed(os.path.join(target_directory, 'x_test'), x_test)
    np.savez_compressed(os.path.join(target_directory, 'y_test'), y_test)

    return np.array(rews, dtype=np.float32), t


def run_model(model_file_path, model_file, save_directory, record=False):
    """
    Load a stored model and let it run once in the environment it was configured for.

    The environment id is read from ``config.json`` inside ``model_file_path``. The inputs and
    outputs observed during the run are written to ``save_directory`` by ``rollout_evaluation``.

    Args:
        model_file_path: Directory containing ``config.json`` and the model file.
        model_file: File name of the model inside ``model_file_path``.
        save_directory: Directory for the validation data and, if ``record`` is set, the
            output of the ``wrappers.Monitor`` wrapper.
        record: When true, the environment is wrapped in ``wrappers.Monitor``.

    Returns:
        ``[total_reward, episode_length]``, or ``None`` when the model could not be loaded or
        the rollout raised an ``AssertionError``.
    """
    with open(os.path.join(model_file_path, "config.json"), encoding='utf-8') as f:
        config = json.load(f)

    env = gym.make(config['config']['env_id'])
    try:
        env.reset()

        if record:
            env = wrappers.Monitor(env, save_directory, force=True)

        model = load_model(os.path.join(model_file_path, model_file))
        if model is None:
            # load_model already printed the underlying error.
            print("The model file could not be loaded. Stopping.")
            return None

        try:
            rewards, length = rollout_evaluation(env, model, output_directory=save_directory)
        except AssertionError:
            print("The model file provided produces non finite numbers. Stopping.")
            return None
    finally:
        # Closing the (possibly Monitor-wrapped) environment releases it and lets the
        # wrapper finalize what it recorded.
        env.close()

    print(rewards)
    print([rewards.sum(), length])

    return [rewards.sum(), length]
    


In [None]:
import glob
import os

#ann_path='/home/jovyan/work/evolution-strategies/training_runs/26_11_2019-11h_14m_11s/snapshot_00054.h5'
#ann_name="example_ann"
#config='/home/jovyan/work/evolution-strategies/training_runs/26_11_2019-11h_14m_11s/config.json'

# evolution-strategiestraining_runs
# Pick the newest training run below main_directory (newest according to os.path.getctime).
# Only directories are considered, so a stray file next to the run folders cannot be
# mistaken for a run.
list_of_folders = [
    path for path in glob.glob(os.path.join(main_directory, '*')) if os.path.isdir(path)
]
if not list_of_folders:
    raise FileNotFoundError("No training run folder found in {}".format(main_directory))
latest_folder = max(list_of_folders, key=os.path.getctime)
print(latest_folder)

# Inside that run, pick the newest model snapshot.
list_of_files = glob.glob(os.path.join(latest_folder, 'snapshot_*.h5'))
if not list_of_files:
    raise FileNotFoundError("No snapshot_*.h5 file found in {}".format(latest_folder))
latest_file = max(list_of_files, key=os.path.getctime)
print(latest_file)

In [None]:
import ntpath

# Evaluate the newest snapshot located by the previous cell. To evaluate a specific run
# instead, assign the folder and the snapshot file name by hand, e.g.
#   model_file_path = "/home/jovyan/work/evolution-strategies/training_runs/12_11_2019-13h_08m_06s"
#   model_file_name = "snapshot_00013.h5"
model_file_path = latest_folder
model_file_name = ntpath.basename(latest_file)

# The recording and the validation data are stored in the same directory as the model file.
save_directory = model_file_path

# The run happens directly in the notebook process. Alternative through a worker pool:
#   with Pool(os.cpu_count()) as pool:
#       pool.apply(func=run_model, args=(model_file_path, model_file_name, save_directory, True))
run_model(model_file_path, model_file_name, save_directory, record=True)
