Select your main folder:

In [1]:
cd '/home/molano/priors_project'

/home/molano/priors_project


# Utils

In [18]:
from scipy.signal import lfilter


def update_target_graph(from_scope, to_scope):
    """
    Build the ops that copy one scope's trainable variables onto another's.

    The trainable variables collected under `from_scope` and `to_scope` are
    paired in collection order; every returned op assigns the `from_scope`
    variable to its `to_scope` partner. The workers use this to pull the
    global network parameters into their local copies.
    """
    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(trainable_key, from_scope)
    targets = tf.get_collection(trainable_key, to_scope)
    return [target.assign(source) for source, target in zip(sources, targets)]


def discount(x, gamma):
    """
    Discounted cumulative sum along axis 0:
    out[t] = x[t] + gamma * out[t + 1].
    """
    # run the recursive filter y[n] = x[n] + gamma * y[n-1] over the
    # time-reversed input, then flip the result back to the original order
    reversed_x = x[::-1]
    filtered = lfilter([1], [1, -gamma], reversed_x, axis=0)
    return filtered[::-1]


def normalized_columns_initializer(std=1.0):
    """
    Return a weight initializer whose columns have L2 norm `std`.
    Used for the policy and value output layers.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        # dtype and partition_info are accepted but not used
        weights = np.random.randn(*shape).astype(np.float32)
        # norm of every column (axis 0), kept 2-D so that it broadcasts
        column_norms = np.sqrt(np.square(weights).sum(axis=0, keepdims=True))
        weights *= std / column_norms
        return tf.constant(weights)
    return _initializer


def look_for_folder(main_folder='priors/', exp=''):
    """
    Walk `main_folder` looking for a directory whose name equals `exp`.

    Returns the path of the first matching directory, or '' if there is
    none. In the latter case a message is printed together with the closest
    directory name seen during the walk (the candidate is only printed,
    not returned).
    """
    # imported locally: the Utils cell itself imports neither module, and
    # difflib is not imported anywhere else in the notebook
    import difflib
    import os

    data_path = ''
    possibilities = []
    for root, dirs, files in os.walk(main_folder):
        # last component of the path ('' when root ends with '/')
        folder_name = root[root.rfind('/')+1:]
        possibilities.append(folder_name)
        if folder_name == exp:
            data_path = root
            break

    if data_path == '':
        # cutoff=0. accepts any name, so the single best match is always
        # proposed as long as at least one folder was visited
        candidates = difflib.get_close_matches(exp, possibilities,
                                               n=1, cutoff=0.)
        print(exp + ' NOT FOUND IN ' + main_folder)
        if len(candidates) > 0:
            print('possible candidates:')
            print(candidates)

    return data_path


def list_str(l):
    """
    Join the elements of `l` into a single string separated by
    underscores, e.g. (0.2, 0.8) -> '0.2_0.8'.

    Every element is converted with str(). An empty sequence gives ''
    (instead of raising an IndexError).
    """
    return '_'.join(str(el) for el in l)


def num2str(num):
    """
    Express a number in thousands (truncated toward zero) followed by 'K',
    e.g. 5000 -> '5K'.
    """
    thousands = int(num / 1000)
    return '{}K'.format(thousands)

# Network

In [3]:
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

def RNN_UGRU(inputs, prev_rewards, a_size, num_units):
    """
    Build a single-layer UGRNN (ReLU activation) followed by the policy and
    value read-outs created by process_output.

    inputs: tensor fed to the RNN; a leading dimension of size 1 is added
        to it before it is passed to dynamic_rnn.
    prev_rewards: only used to obtain the sequence length (its first
        dimension).
    a_size: number of actions (size of the policy output).
    num_units: number of units of the recurrent cell.

    Returns (st_init, state_in, state_out, actions, actions_onehot,
    policy, value), where st_init is a zero numpy array to feed into the
    state_in placeholder at the start of a sequence.
    """

    # create a UGRNNCell
    rnn_cell = tf.contrib.rnn.UGRNNCell(num_units, activation=tf.nn.relu)

    # this is the initial state used in the A3C model when training
    # or obtaining an action
    st_init = np.zeros((1, rnn_cell.state_size), np.float32)

    # defining initial state
    state_in = tf.placeholder(tf.float32, [1, rnn_cell.state_size])

    # add a leading dimension of size 1 (used as the batch dimension, since
    # time_major=False below)
    rnn_in = tf.expand_dims(inputs, [0])

    # one-element tensor holding the first dimension of prev_rewards
    # (presumably the number of time steps fed -- TODO confirm)
    step_size = tf.shape(prev_rewards)[:1]

    # 'state' is a tensor of shape [batch_size, cell_state_size]
    # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
    outputs, state_out = tf.nn.dynamic_rnn(rnn_cell, rnn_in,
                                           initial_state=state_in,
                                           sequence_length=step_size,
                                           dtype=tf.float32,
                                           time_major=False)

    # collapse all leading dimensions: one row of num_units per time step
    rnn_out = tf.reshape(outputs, [-1, num_units])

    actions, actions_onehot, policy, value = \
        process_output(rnn_out, outputs, a_size, num_units)

    return st_init, state_in, state_out, actions, actions_onehot, policy, value

def process_output(rnn_out, outputs, a_size, num_units):
    """
    Create the action placeholder and the policy / value output layers that
    read from the recurrent activity.

    rnn_out: tensor given as input to both fully connected layers.
    outputs, num_units: accepted but not used here.
    a_size: number of actions.

    Returns (actions, actions_onehot, policy, value): the placeholder for
    the actions taken, its one-hot encoding, the softmax policy over the
    a_size actions and the (linear, single-unit) value estimate.
    """
    # Actions
    actions = tf.placeholder(shape=[None], dtype=tf.int32)
    actions_onehot = tf.one_hot(actions, a_size, dtype=tf.float32)

    # Output layers for policy and value estimations (neither has biases)
    policy = slim.fully_connected(rnn_out, a_size,
                                  activation_fn=tf.nn.softmax,
                                  weights_initializer=normalized_columns_initializer(0.01),
                                  biases_initializer=None)
    value = slim.fully_connected(rnn_out, 1,
                                 activation_fn=None,
                                 weights_initializer=normalized_columns_initializer(1.0),
                                 biases_initializer=None)

    return actions, actions_onehot, policy, value

  from ._conv import register_converters as _register_converters


# Environment

We create the gym environment (install and import gym first if you have not done so yet).

Remove previous gym-priors folder and create new one:

In [4]:
# Start from a clean slate: delete any previous gym-priors folder and
# recreate the empty package skeleton (directories plus empty files).
! rm -rf gym-priors/
! mkdir gym-priors
! touch gym-priors/README.md
! touch gym-priors/setup.py
! mkdir gym-priors/gym_priors/
! touch gym-priors/gym_priors/__init__.py
! mkdir gym-priors/gym_priors/envs/
! touch gym-priors/gym_priors/envs/__init__.py
! touch gym-priors/gym_priors/envs/priors_env.py

## Auxiliary functions

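Write the packaging files (`setup.py` and the package `__init__.py` modules) that make `gym_priors` installable and register the environment with gym.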

In [5]:
%%writefile gym-priors/setup.py
# Minimal packaging script for the gym_priors package.
from setuptools import setup
# trace printed whenever this script is executed
print('setup!')
setup(name='gym_priors',
      version='0.0.1',
      install_requires=['gym'])  # And any other dependencies required

Overwriting gym-priors/setup.py


In [6]:
%%writefile gym-priors/gym_priors/__init__.py
from gym.envs.registration import register
# Register the task under the id 'priors-v0'; the entry point names the
# PriorsEnv class exposed by the gym_priors.envs package.
register(
    id='priors-v0',
    entry_point='gym_priors.envs:PriorsEnv')

Overwriting gym-priors/gym_priors/__init__.py


In [7]:
%%writefile gym-priors/gym_priors/envs/__init__.py
from gym_priors.envs.priors_env import PriorsEnv

Overwriting gym-priors/gym_priors/envs/__init__.py


## Task

In [8]:
%%writefile gym-priors/gym_priors/envs/priors_env.py
import sys
from gym import spaces
import numpy as np
import gym
import os

# from gym.utils import seeding


class PriorsEnv(gym.Env):
    """
    Two-stimulus discrimination task with trial-history priors.

    There are three discrete actions. The internal state `int_st` holds
    [mean of the stimulus at position 0, mean at position 1, -1]; an action
    indexes this vector, so action 2 (value -1) is "keep fixating" and the
    correct final choice is the position whose mean is 1.0.

    During a trial each observation is [sample 0, sample 1, -1], with the
    samples drawn from unit-variance normals around the two means. Once
    `trial_dur` steps have elapsed the observation is [0, 0, 0] and the next
    action is scored as the decision, after which a new trial starts.

    The position of the 1.0-mean stimulus repeats from one trial to the
    next with probability `rep_prob[i]`, where `i` toggles between 0 and 1
    every `block_dur` trials.
    """
    metadata = {}

    def __init__(self, exp_dur=100, trial_dur=10, upd_net=5,
                 rep_prob=(.2, .8), rewards=(0.1, -0.1, 1.0, -1.0),
                 env_seed='0', block_dur=200, stim_ev=0.5, folder=None):
        """
        Store the task parameters (see update_params for their meaning) and
        initialise the counters and the per-trial records.
        """
        print('init environment!')
        # exp. duration (training will consist in several experiments)
        self.exp_dur = exp_dur
        # num steps per trial
        self.trial_dur = trial_dur
        # rewards given for: stop fixating, keep fixating, correct, wrong
        self.rewards = rewards
        # number of trials per blocks
        self.block_dur = block_dur
        # stimulus evidence: one stimulus always has mean 1, the mean of
        # the other is drawn from a uniform distrib.=U(1-stim_ev, 1).
        # stim_ev must then be between 0 and 1
        self.stim_ev = stim_ev
        # prob. of repeating the stimuli in the positions of previous trial
        self.rep_prob = rep_prob
        # model instance
        self.env_seed = env_seed
        # folder to save data
        self.folder = folder
        # trial-to-trial data are only recorded when there is a folder to
        # save them to; set here so that step() also works when
        # update_params has not been called
        self.sv_data = folder is not None
        # update parameters
        # NOTE(review): stored and read by the worker, but not used inside
        # this class (step() bases `done` on exp_dur) -- confirm intended
        self.upd_net = upd_net

        # num actions
        self.num_actions = 3
        self.action_space = spaces.Discrete(self.num_actions)
        # position of the first stimulus
        self.stms_pos_new_trial = np.random.choice([0, 1])
        # keeps track of the repeating prob of the current block
        self.curr_rep_prob = np.random.choice([0, 1])
        # not read anywhere in this class (kept for backward compatibility)
        self.stm_pos_new_trial = 0
        # steps counter
        self.timestep = 0
        # initialize ground truth state [stim1 mean, stim2 mean, fixation])
        # the network has to output the action corresponding to the stim1 mean
        # that will be always 1.0 (I just initialize here at 0 for convenience)
        self.int_st = np.array([0, 0, -1])
        # accumulated evidence
        self.evidence = 0
        # observation space
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(3, ), dtype=np.float32)
        # number of trials
        self.num_tr = 0

        # trial data to save
        # stimulus evidence
        self.ev_mat = []
        # position of stimulus 1
        self.stm_pos = []
        # performance
        self.perf_mat = []
        # action taken at the decision step of each trial
        self.action = []

    def update_params(self, exp_dur=10**4, trial_dur=10, upd_net=5,
                 rep_prob=(.2, .8), rewards=(0.1, -0.1, 1.0, -1.0),
                 env_seed='0', block_dur=200, stim_ev=0.5, folder='', seed=0):
        """
        this function should be run after creating an environment to set the
        main parameters.
        There does not seem to be easy way of passing those parameters
        when using the make function of the gym toolbox, so this is a way
        around to set the parameters.

        A falsy argument (None, 0, '', ...) keeps the current value, with
        one exception: stim_ev only falls back to the current value when it
        is None, so that 0 can be requested (it is clamped to 1e-4).
        `env_seed` is accepted but ignored; use `seed` to change the seed.
        """
        # we update the network every x trials
        self.upd_net = upd_net or self.upd_net
        # exp. duration (num. trials; training consists in several exps)
        self.exp_dur = exp_dur or self.exp_dur
        # num steps per trial
        self.trial_dur = trial_dur or self.trial_dur
        # rewards given for: stop fixating, keep fixating, correct, wrong
        self.rewards = rewards or self.rewards
        # number of trials per blocks
        self.block_dur = block_dur or self.block_dur
        # stimulus evidence. The lower bound keeps the second stimulus mean
        # strictly below 1.0 (see the assert in new_trial)
        if stim_ev is None:
            stim_ev = self.stim_ev
        self.stim_ev = np.max([stim_ev, 10e-5])
        # prob. of repeating the stimuli in the positions of previous trial
        self.rep_prob = rep_prob or self.rep_prob
        # model seed
        self.env_seed = seed or self.env_seed
        # folder where data will be saved (it has to be stored: it is the
        # one read by save_trials_data and output_stats)
        self.folder = folder or self.folder
        self.sv_data = self.folder is not None
        print('--------------- Priors experiment ---------------')
        print('Duration of each experiment (in trials): ' +
              str(self.exp_dur))
        print('Duration of each trial (in steps): ' + str(self.trial_dur))
        print('Rewards: ' + str(self.rewards))
        print('Duration of each block (in trials): ' + str(self.block_dur))
        print('Repeating probabilities of each block: ' + str(self.rep_prob))
        print('Stim evidence: ' + str(self.stim_ev))
        print('Saving folder: ' + str(self.folder))
        print('--------------- ----------------- ---------------')

    def step(self, action):
        """
        receives an action and returns (observation, reward, done, info).
        `done` is True when a new trial has just started and
        (num_tr - 1) is a multiple of exp_dur. `info` contains:
        'perf' (whether the decision was correct), 'ev' (accumulated
        evidence of the trial) and 'new_trial' (whether this step closed
        the trial and started a new one).
        """
        new_trial = True
        correct = False
        done = False
        # decide which reward and state (new_trial, correct) we are in
        if self.timestep < self.trial_dur:
            if (self.int_st[action] != -1).all():
                # a stimulus position was chosen before the decision step
                reward = self.rewards[0]
            else:
                # don't abort the trial even if the network stops fixating
                reward = self.rewards[1]

            new_trial = False

        else:
            if (self.int_st[action] == 1.0).all():
                reward = self.rewards[2]
                correct = True
            else:
                reward = self.rewards[3]

        # built before new_trial() resets the evidence
        info = {'perf': correct, 'ev': self.evidence, 'new_trial': new_trial}

        if new_trial:
            if self.sv_data:
                # keep main variables of the trial
                self.stm_pos.append(self.stms_pos_new_trial)
                self.perf_mat.append(correct)
                self.action.append(action)
                self.ev_mat.append(self.evidence)
            new_st = self.new_trial()
            # check if it is time to update the network
            done = ((self.num_tr-1) % self.exp_dur == 0) and (self.num_tr != 1)
            # check if it is time to save the trial-to-trial data
            if self.sv_data and self.num_tr % 10000 == 0:
                self.save_trials_data()
                self.output_stats()
        else:
            new_st = self.get_state()

        return new_st, reward, done, info

    def get_state(self):
        """
        Outputs a new observation using stim 1 and 2 means.
        It also outputs a fixation signal that is always -1 except at the
        end of the trial that is 0
        """
        self.timestep += 1
        # if still in the integration period present a new observation
        if self.timestep < self.trial_dur:
            self.state = [np.random.normal(self.int_st[0]),
                          np.random.normal(self.int_st[1]), -1]
        else:
            self.state = [0, 0, 0]

        # update evidence
        self.evidence += self.state[0]-self.state[1]

        return np.reshape(self.state, (3, ))

    def new_trial(self):
        """
        this function creates a new trial, deciding the amount of coherence
        (through the mean of stim 2) and the position of stim 1. Once it has
        done this it calls get_state to get the first observation of the trial
        """
        self.num_tr += 1
        self.timestep = 0
        self.evidence = 0
        # this are the means of the two stimuli
        stim1 = 1.0
        stim2 = np.random.uniform(1-self.stim_ev, 1)
        assert stim2 != 1.0
        self.choices = [stim1, stim2]

        # decide the position of the stims
        # if the block is finished update the prob of repeating
        if self.num_tr % self.block_dur == 0:
            self.curr_rep_prob = int(not self.curr_rep_prob)

        # flip a coin
        repeat = np.random.uniform() < self.rep_prob[self.curr_rep_prob]
        if not repeat:
            self.stms_pos_new_trial = not(self.stms_pos_new_trial)

        aux = [self.choices[x] for x in [int(self.stms_pos_new_trial),
                                         int(not self.stms_pos_new_trial)]]

        self.int_st = np.concatenate((aux, np.array([-1])))

        # get state
        s = self.get_state()

        return s

    def save_trials_data(self):
        """
        save trial-to-trial data for:
        evidence, stim postion, action taken and outcome
        """
        # Periodically save model trials statistics.
        data = {'stims_position': self.stm_pos,
                'action': self.action,
                'performance': self.perf_mat,
                'evidence': self.ev_mat}
        # np.savez does not create missing directories
        os.makedirs(self.folder, exist_ok=True)
        np.savez(self.folder + '/trials_stats_' +
                 str(self.env_seed) + '_' + str(self.num_tr) + '.npz', **data)

    def reset(self):
        """
        starts a new trial and returns its first observation
        """
        return self.new_trial()

    def _as_row(self, values, last=None):
        """
        returns `values` as a (1, n) array; if `last` is given only the
        last `last` entries (columns) are kept
        """
        row = np.reshape(values, (1, len(values))).copy()
        if last is not None:
            # the trials run along axis 1, so that is the axis to trim
            row = row[:, max(0, row.shape[1]-last):]
        return row

    def output_stats(self):
        """
        plot temporary learning and bias curves
        """
        # add current path to sys.path so as to import analyses_priors
        module_dir = os.path.dirname(os.path.realpath(__file__))
        if module_dir not in sys.path:
            sys.path.append(module_dir)
        import analyses_priors as ap
        # plot psycho. curves using only the most recent trials
        per = 20000
        ev = self._as_row(self.ev_mat, per)
        perf = self._as_row(self.perf_mat, per)
        action = self._as_row(self.action, per)
        ap.plot_psychometric_curves(ev, perf, action, blk_dur=self.block_dur,
                                    figs=True, folder=self.folder,
                                    name='psycho_'+str(self.num_tr))
        # plot learning (all trials)
        ev = self._as_row(self.ev_mat)
        perf = self._as_row(self.perf_mat)
        action = self._as_row(self.action)
        stim_pos = self._as_row(self.stm_pos)
        ap.plot_learning(perf, ev, stim_pos, action, folder=self.folder,
                         name='', save_fig=True, view_fig=True)

    def render(self, mode='human'):
        """
        nothing to render
        """
        pass

Overwriting gym-priors/gym_priors/envs/priors_env.py


In [9]:
cd gym-priors/

/home/molano/priors_project/gym-priors


In [10]:
! pip install -e .

Obtaining file:///home/molano/priors_project/gym-priors
[31mtensorflow-gpu 1.11.0 has requirement tensorboard<1.12.0,>=1.11.0, but you'll have tensorboard 1.12.2 which is incompatible.[0m
Installing collected packages: gym-priors
  Found existing installation: gym-priors 0.0.1
    Uninstalling gym-priors-0.0.1:
      Successfully uninstalled gym-priors-0.0.1
  Running setup.py develop for gym-priors
Successfully installed gym-priors
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [11]:
ls

[0m[01;34mgym_priors[0m/  [01;34mgym_priors.egg-info[0m/  README.md  setup.py


In [12]:
cd ..

/home/molano/priors_project


In [13]:
ls

Cosyne_priors.ipynb  [0m[01;34mgym-priors[0m/  gym_priors.ipynb  [01;34mpriors[0m/


# Agent

## Network class

In [14]:
class AC_Network():
    """
    Actor-critic network: a recurrent core that receives the current
    observation, the previous reward and the previous action (one-hot), and
    outputs a policy and a value estimate.

    Networks built under any scope other than 'global' additionally get the
    loss, the gradient ops and an op that applies their gradients to the
    variables of the 'global' scope.
    """
    def __init__(self, a_size, state_size, scope, trainer, num_units, network):
        """
        a_size: number of actions.
        state_size: size of the observation.
        scope: variable scope under which the network is built.
        trainer: optimizer used to apply the gradients (only used when
            scope != 'global').
        num_units: number of units of the recurrent network.
        network: one of 'relu', 'lstm', 'gru', 'ugru'.
        """
        with tf.variable_scope(scope):
            # Input placeholders: observation, previous reward and
            # previous action
            self.st = tf.placeholder(shape=[None, 1, state_size, 1],
                                     dtype=tf.float32)
            self.prev_rewards = tf.placeholder(shape=[None, 1],
                                               dtype=tf.float32)
            self.prev_actions = tf.placeholder(shape=[None],
                                               dtype=tf.int32)

            self.prev_actions_onehot = tf.one_hot(self.prev_actions, a_size,
                                                  dtype=tf.float32)

            # input to the RNN: flattened observation, previous reward and
            # one-hot previous action concatenated along axis 1
            hidden = tf.concat([slim.flatten(self.st), self.prev_rewards,
                                self.prev_actions_onehot], 1)

            # call RNN network
            # NOTE(review): RNN_ReLU, RNN and RNN_GRU are not defined in the
            # part of the notebook shown here -- confirm they exist before
            # selecting 'relu', 'lstm' or 'gru'
            if network == 'relu':
                net = RNN_ReLU
            elif network == 'lstm':
                net = RNN
            elif network == 'gru':
                net = RNN_GRU
            elif network == 'ugru':
                net = RNN_UGRU
            else:
                raise ValueError('Unknown network')

            self.st_init, self.st_in, self.st_out, self.actions,\
                self.actions_onehot, self.policy, self.value =\
                net(hidden, self.prev_rewards, a_size, num_units)

            # Only the worker network needs ops for loss functions
            # and gradient updating.
            if scope != 'global':
                self.target_v = tf.placeholder(shape=[None], dtype=tf.float32)
                self.advantages = tf.placeholder(shape=[None],
                                                 dtype=tf.float32)

                # probability assigned by the policy to the action taken
                self.resp_outputs = \
                    tf.reduce_sum(self.policy * self.actions_onehot, [1])

                # Loss functions (the 1e-7 avoids taking the log of zero)
                self.value_loss = 0.5 * tf.reduce_sum(
                        tf.square(self.target_v -
                                  tf.reshape(self.value, [-1])))
                self.entropy = - tf.reduce_sum(
                        self.policy * tf.log(self.policy + 1e-7))
                self.policy_loss = -tf.reduce_sum(
                        tf.log(self.resp_outputs + 1e-7)*self.advantages)
                # total loss: the entropy term (weight 0.05) is subtracted,
                # so minimising the loss favours higher-entropy policies
                self.loss = 0.5 * self.value_loss +\
                    self.policy_loss -\
                    self.entropy * 0.05

                # Get gradients from local network using local losses
                local_vars = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES, scope)
                self.gradients = tf.gradients(self.loss, local_vars)
                self.var_norms = tf.global_norm(local_vars)
                # clip the gradients to a global norm of 999.0
                grads, self.grad_norms =\
                    tf.clip_by_global_norm(self.gradients, 999.0)

                # Apply local gradients to global network (gradients and
                # global variables are paired by collection order)
                global_vars = tf.get_collection(
                        tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.apply_grads = trainer.apply_gradients(
                        zip(grads, global_vars))


## Worker class

In [15]:
class Worker():
    """
    A3C worker: owns an environment and a local copy of the network,
    collects experience with the local network and uses it to update the
    global network.
    """
    def __init__(self, game, name, a_size, state_size, trainer,
                 model_path, global_epss, data_path, num_units, network):
        """
        game: environment instance used by this worker.
        name: worker index (used for the scope 'worker_<name>' and for the
            summaries folder).
        a_size, state_size, num_units, network: passed on to AC_Network.
        trainer: optimizer shared by the workers.
        model_path: folder where the model checkpoints are saved.
        global_epss: variable counting the episodes run.
        data_path: experiment folder; summaries go to
            ./<data_path>/trains/train_<name>.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.folder = './' + data_path + '/trains/train_' + str(self.number)
        self.model_path = model_path
        self.trainer = trainer
        self.global_epss = global_epss
        self.increment = self.global_epss.assign_add(1)
        self.network = network
        self.eps_rewards = []
        self.eps_mean_values = []

        self.summary_writer = tf.summary.FileWriter(self.folder)

        # Create the local copy of the network and the tensorflow op
        # to copy global parameters to local network
        self.local_AC = AC_Network(a_size, state_size, self.name, trainer,
                                   num_units, network)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = game

    def _state_feed(self, states):
        """
        Reshape one observation (or a stack of them) to the static shape of
        the state placeholder, with the first dimension left free.
        """
        # the environment returns flat vectors, whereas the placeholder may
        # declare extra singleton dimensions; feeding a value whose shape is
        # incompatible with the placeholder raises an error
        st_shape = [-1] + self.local_AC.st.shape.as_list()[1:]
        return np.reshape(states, st_shape)

    def _rnn_state_feed(self, rnn_state):
        """
        Feed-dict entries for the recurrent state: the 'lstm' network has a
        two-part state, the other networks a single tensor.
        """
        if self.network == 'lstm':
            return {self.local_AC.st_in[0]: rnn_state[0],
                    self.local_AC.st_in[1]: rnn_state[1]}
        return {self.local_AC.st_in: rnn_state}

    def train(self, rollout, sess, gamma, bootstrap_value):
        """
        Update the global network with the experience in `rollout`, a list
        of [state, action, reward, value] entries (one per step).

        Returns the value loss, policy loss and entropy (each divided by
        the number of steps) and the gradient and variable norms.
        """
        # unpack column by column: the entries mix arrays and scalars, so
        # the rollout cannot be converted into a regular array in one go
        states = np.array([transition[0] for transition in rollout])
        actions = np.array([transition[1] for transition in rollout])
        rewards = np.array([transition[2] for transition in rollout],
                           dtype=np.float64)
        values = np.array([transition[3] for transition in rollout],
                          dtype=np.float64)

        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = [0] + actions[:-1].tolist()

        self.pr = prev_rewards
        self.pa = prev_actions
        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        # The advantage function uses "Generalized Advantage Estimation"
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards +\
            gamma * self.value_plus[1:] -\
            self.value_plus[:-1]
        advantages = discount(advantages, gamma)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.st: self._state_feed(states),
                     self.local_AC.prev_rewards: np.vstack(prev_rewards),
                     self.local_AC.prev_actions: prev_actions,
                     self.local_AC.actions: actions,
                     self.local_AC.advantages: advantages}
        feed_dict.update(self._rnn_state_feed(self.local_AC.st_init))

        v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.local_AC.value_loss,
                                               self.local_AC.policy_loss,
                                               self.local_AC.entropy,
                                               self.local_AC.grad_norms,
                                               self.local_AC.var_norms,
                                               self.local_AC.apply_grads],
                                              feed_dict=feed_dict)
        aux = len(rollout)
        return v_l / aux, p_l / aux, e_l / aux, g_n, v_n

    def work(self, gamma, sess, coord, saver, train, exp_dur):
        """
        Main loop of the worker: run episodes (an episode ends when the
        environment returns done), train on each of them if `train` is
        True, and periodically write summaries and save the model. The
        loop stops after int(exp_dur / env.upd_net) episodes or when the
        coordinator asks to stop.
        """
        eps_count = sess.run(self.global_epss)
        # periods, in episodes (at least 1: they are used as divisors)
        num_eps_tr_stats = max(1, int(1000/self.env.upd_net))
        num_epss_end = int(exp_dur/self.env.upd_net)
        num_epss_save_model = max(1, int(5000/self.env.upd_net))
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                eps_buffer = []
                eps_values = []
                eps_reward = 0
                d = False
                r = 0
                a = 0

                # get first state
                s = self.env.new_trial()

                rnn_state = self.local_AC.st_init
                net_smmd_act = np.zeros_like(rnn_state)
                while not d:
                    feed_dict = {self.local_AC.st: self._state_feed(s),
                                 self.local_AC.prev_rewards: [[r]],
                                 self.local_AC.prev_actions: [a]}
                    feed_dict.update(self._rnn_state_feed(rnn_state))

                    # Take an action using probs from policy network output
                    a_dist, v, rnn_state_new = sess.run(
                                                        [self.local_AC.policy,
                                                         self.local_AC.value,
                                                         self.local_AC.st_out],
                                                        feed_dict=feed_dict)

                    # sample the action index directly: sampling a
                    # probability and looking its position up afterwards
                    # always picks the first action when two are equal
                    a = np.random.choice(len(a_dist[0]), p=a_dist[0])
                    rnn_state = rnn_state_new
                    net_smmd_act += rnn_state_new
                    # the environment starts the next trial inside step()
                    # when a trial ends; detect it through its trial counter
                    # (the 4th value returned by step is the info dict)
                    num_tr_before = self.env.num_tr
                    s1, r, d, _ = self.env.step(a)
                    nt = self.env.num_tr != num_tr_before
                    # save samples for training the network later
                    eps_buffer.append([s, a, r, v[0, 0]])
                    eps_values.append(v[0, 0])
                    eps_reward += r
                    if nt:
                        # store the summed activity at the end of the trial,
                        # if the environment keeps such a record
                        if hasattr(self.env, 'net_smmd_act'):
                            self.env.net_smmd_act.append(net_smmd_act)
                        net_smmd_act = np.zeros_like(rnn_state)
                    # s1 is already the first observation of the new trial
                    # when a trial has just ended
                    s = s1

                self.eps_rewards.append(eps_reward)
                self.eps_mean_values.append(np.mean(eps_values))

                # Update the network using the experience buffer
                # at the end of the episode
                if len(eps_buffer) != 0 and train:
                    v_l, p_l, e_l, g_n, v_n = \
                        self.train(eps_buffer, sess, gamma, 0.0)

                # Periodically save model parameters and summary statistics.
                if eps_count % num_eps_tr_stats == 0 and eps_count != 0:
                    if eps_count % num_epss_save_model == 0 and\
                       self.name == 'worker_0' and\
                       train and\
                       len(self.eps_rewards) != 0:
                        saver.save(sess, self.model_path +
                                   '/model-' + str(eps_count) + '.cptk')

                    # use the recorded trial durations if the environment
                    # provides them, else its (fixed) trial duration
                    dur_tr = getattr(self.env, 'dur_tr', None)
                    if dur_tr:
                        mean_tr_dur = np.mean(dur_tr[-10:])
                    else:
                        mean_tr_dur = self.env.trial_dur
                    mean_reward = np.mean(self.eps_rewards[-10:])
                    mean_value = np.mean(self.eps_mean_values[-10:])
                    summary = tf.Summary()
                    summary.value.add(tag='Perf/trial_duration',
                                      simple_value=float(mean_tr_dur))
                    summary.value.add(tag='Perf/Reward',
                                      simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Value',
                                      simple_value=float(mean_value))

                    # the performance record is empty when the environment
                    # is not saving data
                    if len(self.env.perf_mat) > 0:
                        performance_aux = \
                            np.vstack(np.array(self.env.perf_mat))
                        for ind_crr in range(performance_aux.shape[1]):
                            mean_performance = \
                                np.mean(performance_aux[:, ind_crr])
                            summary.value.add(
                                    tag='Perf/Perf_' + str(ind_crr),
                                    simple_value=float(mean_performance))

                    if train:
                        summary.value.add(tag='Losses/Value Loss',
                                          simple_value=float(v_l))
                        summary.value.add(tag='Losses/Policy Loss',
                                          simple_value=float(p_l))
                        summary.value.add(tag='Losses/Entropy',
                                          simple_value=float(e_l))
                        summary.value.add(tag='Losses/Grad Norm',
                                          simple_value=float(g_n))
                        summary.value.add(tag='Losses/Var Norm',
                                          simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, eps_count)

                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    sess.run(self.increment)

                eps_count += 1
                if eps_count > num_epss_end:
                    break


# Call the function.

In [16]:
import threading
import multiprocessing
import os
import gym
import gym_priors

def main_priors(load_model=False, train=True, gamma=.8, up_net=5,
                trial_dur=10, rep_prob=(0.2, 0.8), exp_dur=1000,
                rewards=(-0.1, 0.0, 1.0, -1.0), block_dur=200,
                num_units=32, stim_ev=.3, network='ugru',
                learning_rate=1e-3, instance=0):
    """
    Build the global network plus one worker (with its own 'priors-v0'
    environment) per CPU thread and run all the workers in parallel.

    load_model: restore the latest checkpoint found in the model folder
        instead of initialising the variables.
    train: whether the workers update the network ('_test' is appended to
        the experiment folder name otherwise).
    gamma: discount factor.
    up_net, trial_dur, rep_prob, rewards, block_dur, stim_ev: task
        parameters passed to the environment (see PriorsEnv.update_params).
    exp_dur: passed to Worker.work, where it sets the number of episodes
        run. NOTE(review): it is not passed to the environment, which
        keeps the exp_dur default of update_params -- confirm intended.
    num_units: number of units of the recurrent network.
    network: type of recurrent network (see AC_Network).
    learning_rate: learning rate of the Adam optimizer.
    instance: identifier appended to the experiment folder name.

    The experiment folder is built from the parameters above, under
    'priors/'; the parameters are also saved in it as experiment_setup.npz.
    """
    a_size = 3  # number of actions
    state_size = a_size  # number of inputs
    if train:
        test_flag = ''
    else:
        test_flag = '_test'
    data_path = 'priors/' + 'trial_dur_' + str(trial_dur) +\
        '_rep_prob_' + str(list_str(rep_prob)) +\
        '_rewards_' + str(list_str(rewards)) +\
        '_block_dur_' + str(block_dur) + '_stimEv_' + str(stim_ev) +\
        '_gamma_' + str(gamma) + '_num_units_' + str(num_units) +\
        '_up_net_' + str(up_net) + '_network_' \
        + str(network) + '_' + str(instance) + test_flag + '/'

    data = {'trial_dur': trial_dur, 'rep_prob': rep_prob,
            'rewards': rewards, 'stim_ev': stim_ev,
            'block_dur': block_dur, 'gamma': gamma, 'num_units': num_units,
            'up_net': up_net, 'network': network}

    model_path = './' + data_path + '/model_meta_context'

    if not os.path.exists(model_path):
        os.makedirs(model_path)

    np.savez(data_path + '/experiment_setup.npz', **data)

    tf.reset_default_graph()
    with tf.device("/cpu:0"):
        global_episodes = tf.Variable(0, dtype=tf.int32,
                                      name='global_episodes',
                                      trainable=False)
        trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        AC_Network(a_size, state_size, 'global',
                             None, num_units, network)  # Generate global net
        # Set workers to number of available CPU threads
        num_workers = multiprocessing.cpu_count()
        workers = []
        # Create worker classes
        for i in range(num_workers):
            env = gym.make('priors-v0')
            saving_path = './' + data_path + '/trains/train_' + str(i)
            env.update_params(upd_net=up_net, trial_dur=trial_dur,
                              rep_prob=rep_prob, rewards=rewards,
                              block_dur=block_dur, stim_ev=stim_ev,
                              folder=saving_path)

            workers.append(Worker(env, i, a_size, state_size,
                            trainer, model_path, global_episodes,
                            data_path, num_units, network))
        saver = tf.train.Saver(max_to_keep=5)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if load_model:
            print('Loading Model...')
            print(model_path)
            ckpt = tf.train.get_checkpoint_state(model_path)
            # no checkpoint state is returned when nothing has been saved
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise IOError('No checkpoint found in ' + model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        worker_threads = []
        for worker in workers:
            # give the bound method and its arguments to the thread
            # directly: a lambda would look `worker` up only when the
            # thread runs, by which time the loop may have moved on to
            # the next worker
            thread = threading.Thread(
                target=worker.work,
                args=(gamma, sess, coord, saver, train, exp_dur))
            thread.start()
            worker_threads.append(thread)
        coord.join(worker_threads)


In [19]:
# Train a 32-unit 'ugru' network from scratch (instance 123); exp_dur is not
# given, so main_priors uses its default.
main_priors(load_model=False, train=True, gamma=.8, up_net=5,
                trial_dur=10, rep_prob=(0.2, 0.8),
                rewards=(-0.1, 0.0, 1.0, -1.0), block_dur=200,
                num_units=32, stim_ev=.3, network='ugru',
                learning_rate=1e-3, instance=123)

init environment!


NameError: name 'list_str' is not defined

In [None]:
# Create a stand-alone environment and set its parameters to the defaults
# of update_params (which also prints them).
env = gym.make('priors-v0')
env.update_params()

In [None]:
ls