# Reinforcement Learning with Goals
### An implementation of [Learning to Act by Predicting the Future](https://arxiv.org/pdf/1611.01779.pdf)

1. Install requirements with `pip install -r requirements.txt`
2. Run this notebook.
3. Monitor learning progress with `tensorboard --logdir=worker_0:'./train_0',worker_1:'./train_1',worker_2:'./train_2',worker_3:'./train_3',...worker_n:'./train_n'`

In [1]:
import multiprocessing
import os
import random
import threading
import time

import imageio
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

from helper import *


### Helper Functions

#### Calculating Temporal offsets (f)
`get_f` takes a time-series of measurements as well as a set of temporal offsets, and produces the 'f' values for those measurements, which correspond to how the measurements change in the future at each offset.

Given a set of offsets T and a sequence of measurements m in temporal order, `get_f` produces f as follows:

`f = <m_T1  – m_0,m_T2  – m_0… m_Tn  – m_0>`


#### Experience Buffer
`ExperienceBuffer` is used to store a history of experiences that can be randomly drawn from when training the network.

In [2]:
def get_f(m,offsets):
    """Compute, for every time step, how the measurements change at each offset.

    Args:
        m: 2-D numpy array of measurements in temporal order, one row per
            time step and one column per measurement.
        offsets: sequence of positive integer temporal offsets.

    Returns:
        Array of shape [len(m), m.shape[1], len(offsets)] where entry
        [t, :, k] is m[t + offsets[k]] - m[t] whenever that future row exists.
        For the trailing rows that have no such future sample, the value of
        the previous offset column is reused; for the first offset they are
        left at zero.
    """
    n_steps = len(m)
    n_channels = m.shape[1]
    future_deltas = np.zeros([n_steps, n_channels, len(offsets)])
    for k, lag in enumerate(offsets):
        head = slice(None, -lag)   # steps that still have a sample `lag` ahead
        tail = slice(-lag, None)   # the last `lag` steps of the series
        future_deltas[head, :, k] = m[lag:, :] - m[head, :]
        if k > 0:
            # No sample `lag` steps ahead: fall back to the previous offset.
            future_deltas[tail, :, k] = future_deltas[tail, :, k - 1]
    return future_deltas

class ExperienceBuffer():
    """Bounded first-in-first-out store of experiences for random replay.

    The oldest entries are discarded so that the buffer never holds more than
    `buffer_size` experiences.
    """
    def __init__(self, buffer_size = 50000):
        """Create an empty buffer that keeps at most `buffer_size` experiences."""
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self,experience):
        """Append a batch of experiences, evicting the oldest ones on overflow.

        Args:
            experience: iterable of experiences. It is materialised once, so
                one-shot iterators (e.g. a `zip` object) are stored correctly.
        """
        self.buffer.extend(experience)
        # Trim after extending so that a batch larger than the capacity is
        # also cut down to its newest `buffer_size` entries.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self,size):
        """Return `size` experiences drawn without replacement.

        The result is a numpy array reshaped to [size, 5], i.e. each stored
        experience must contain exactly five fields.

        Raises:
            ValueError: (from `random.sample`) if `size` exceeds the number of
                stored experiences.
        """
        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])

### 'Direct Future Prediction' (DFP) Network

This class contains the definition of the neural network in Tensorflow, including the tensorflow ops that will be required for updating the network. In the paper the authors refer to their model as "Direct Future Prediction (DFP)," and so I adopt the same notation.

In [3]:
class DFP_Network():
    """TensorFlow graph of a 'Direct Future Prediction' (DFP) network.

    The constructor builds, inside variable scope `scope`, three fully-connected
    encoders (observation, measurements, goals), a shared hidden layer and two
    linear output streams (expectation and advantages). Their sum is reshaped to
    [batch, num_measurements, a_size, num_offsets] and exposed as `prediction`.

    When `scope == 'global'` the loss, entropy, gradient and apply-gradient ops
    are created as well; networks built under any other scope have no such ops.

    Args:
        a_size: number of discrete actions (size of the action axis and depth
            of the one-hot action encoding).
        scope: name of the variable scope the layers are created in.
        trainer: optimizer object; only its `apply_gradients` method is used,
            and only when `scope == 'global'`.
        num_offsets: number of temporal offsets predicted per measurement.
        num_measurements: number of measurement (and goal) components.
    """
    def __init__(self,a_size,scope,trainer,num_offsets,num_measurements):
        with tf.variable_scope(scope):
            #Inputs and visual encoding layers
            # Observations are fed as [batch, 5, 5, 3]; measurements and goals as
            # [batch, num_measurements].
            self.observation = tf.placeholder(shape=[None,5,5,3],dtype=tf.float32)
            self.measurements = tf.placeholder(shape=[None,num_measurements],dtype=tf.float32)
            self.goals = tf.placeholder(shape=[None,num_measurements],dtype=tf.float32)
            # One ELU fully-connected encoder per input (128 / 64 / 64 units),
            # concatenated along the feature axis and passed through a 256-unit layer.
            self.hidden_o = slim.fully_connected(slim.flatten(self.observation),128,activation_fn=tf.nn.elu)
            self.hidden_m = slim.fully_connected(slim.flatten(self.measurements),64,activation_fn=tf.nn.elu)
            self.hidden_g = slim.fully_connected(slim.flatten(self.goals),64,activation_fn=tf.nn.elu)
            hidden_input = tf.concat([self.hidden_o,self.hidden_m,self.hidden_g],1)
            hidden_output = slim.fully_connected(hidden_input,256,activation_fn=tf.nn.elu)

            #We calculate separate expectation and advantage streams, then combine them later
            #This technique is described in https://arxiv.org/pdf/1511.06581.pdf

            # Both streams are linear (no activation, no bias) and have one output per
            # (action, offset, measurement) combination.
            self.expectation = slim.fully_connected(hidden_output,a_size * num_offsets * num_measurements,
                activation_fn=None,
                biases_initializer=None)
            self.advantages = slim.fully_connected(hidden_output,a_size * num_offsets * num_measurements,
                activation_fn=None,
                biases_initializer=None)
            
            # Centre the advantages: the mean is taken over axis 1 of the flat output,
            # i.e. over all outputs of an example at once.
            self.advantages = self.advantages - tf.reduce_mean(self.advantages,reduction_indices=1,keep_dims=True)
            self.prediction = self.expectation + self.advantages
            
            #Reshape the predictions to be  [measurements x actions x offsets]
            self.prediction = tf.reshape(self.prediction, [-1,num_measurements,a_size,num_offsets])
            
            # We use a softmax with temperature to pick actions. This is instead of e-greedy.
            # For more info on action-selection strategies, see: 
            # goo.gl/oyL5Vx
            # The predictions are summed over the offsets axis (3) and divided by the
            # temperature; the softmax then normalises over the last remaining axis
            # (actions), giving [batch, num_measurements, a_size].
            # The callers in this file feed `temperature` as a one-element list.
            self.temperature = tf.placeholder(shape=[None],dtype=tf.float32)
            self.boltzmann = tf.nn.softmax(tf.reduce_sum(self.prediction,reduction_indices=3)/self.temperature)
            
            self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
            self.actions_onehot = tf.one_hot(self.actions,a_size,dtype=tf.float32)
            
            # Select the predictions relevant to the chosen action.
            # Masking with the one-hot actions and summing over the action axis (2)
            # leaves [batch, num_measurements, num_offsets].
            self.pred_action = tf.reduce_sum(self.prediction * tf.reshape(self.actions_onehot,[-1,1,a_size,1]), [2])
            
            #Only the global network need ops for loss functions and gradient updating.
            if scope == 'global':
                self.target = tf.placeholder(shape=[None,num_measurements,num_offsets],dtype=tf.float32)
                
                #Loss function
                # Sum (not mean) of squared errors over the whole batch.
                self.loss = tf.reduce_sum(tf.squared_difference(self.pred_action,self.target))
                
                #Sparsity of the action distribution
                # Entropy summed over every element of `boltzmann`; 1e-7 guards log(0).
                self.entropy = -tf.reduce_sum(self.boltzmann * tf.log(self.boltzmann + 1e-7)) 

                #Get & apply gradients from network
                global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
                self.gradients = tf.gradients(self.loss,global_vars)
                self.var_norms = tf.global_norm(global_vars)
                # clip_by_global_norm also returns the global gradient norm, which is
                # kept in `grad_norms` for logging; the clip threshold is 9999.0.
                grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,9999.0)
                self.apply_grads = trainer.apply_gradients(list(zip(grads,global_vars)))

### Worker Agent

With asynchronous learning we have multiple 'worker agents,' each of which interacts with its own environment and collects experiences using its own local network. At the end of an episode those experiences are sent to the experience buffer. A random batch of experiences is then drawn from the buffer, and the 'global network' processes it and updates itself with backpropagation. The new global network is then copied over to the worker agent, and the process repeats.

In [4]:
class Worker():
    """Agent that plays episodes with a local DFP network and trains the global one.

    Each worker owns an environment and a local copy of the network. After every
    episode it pushes the collected experiences into the shared experience
    buffer, updates the global network from a random batch of that buffer, and
    re-synchronises its local network from the global one before the next
    episode.
    """
    def __init__(self,game,name,a_size,trainer,model_path,global_episodes,offsets,exp_buff,num_measurements,master,gif_path):
        """Create the worker, its local network and the global-to-local copy op.

        Args:
            game: environment object exposing `reset()` and `step(action)`.
            name: worker index; the worker is called "worker_<name>".
            a_size: number of discrete actions.
            trainer: optimizer passed through to the local `DFP_Network`.
            model_path: directory checkpoints are written to.
            global_episodes: TensorFlow variable counting episodes.
            offsets: temporal offsets used to build prediction targets.
            exp_buff: shared experience buffer (`add` / `sample` / `buffer`).
            num_measurements: number of measurement components.
            master: the global `DFP_Network` that is trained.
            gif_path: directory episode gifs are written to.
        """
        self.name = "worker_" + str(name)
        self.number = name
        self.offsets = offsets
        self.global_net = master
        self.exp_buff = exp_buff
        self.model_path = model_path
        self.gif_path = gif_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_deliveries = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.num_measurements = num_measurements
        self.summary_writer = tf.summary.FileWriter("train_"+str(self.number))

        #Create the local copy of the network and the tensorflow op to copy global parameters to local network
        self.local_DFP = DFP_Network(a_size,self.name,trainer,len(offsets),num_measurements)
        self.update_local_ops = update_target_graph('global',self.name)
        self.env = game

    @staticmethod
    def _sample_action(a_dist,goal):
        """Sample an action index from the goal-weighted action distribution.

        Args:
            a_dist: array of shape [num_measurements, a_size] holding one
                action distribution per measurement.
            goal: array of shape [num_measurements] weighting the measurements.

        Returns:
            Index of the sampled action.
        """
        weighted = goal * a_dist.T          # [a_size, num_measurements]
        probs = np.sum(weighted,1)
        probs /= probs.sum()
        # Sample the index directly. Drawing a probability *value* and looking
        # its index up by equality would always return the first of several
        # equally likely actions.
        return np.random.choice(len(probs),p=probs)

    def train(self,rollout,sess):
        """Store an episode in the experience buffer and update the global network.

        Args:
            rollout: list of per-step records
                [observation, action, measurements, goal, target placeholder].
            sess: TensorFlow session used to run the update.

        Returns:
            Tuple (loss, entropy, gradient norm, variable norm). Loss and
            entropy are divided by the rollout length. All four values are 0
            when the buffer holds 128 experiences or fewer and no update runs.
        """
        rollout = np.array(rollout)
        measurements = np.vstack(rollout[:,2])
        targets = get_f(measurements,self.offsets) #Generate targets using measurements and offsets
        rollout[:,4] = list(zip(targets))
        self.exp_buff.add(list(zip(rollout)))

        #Get a batch of experiences from the buffer and use them to update the global network
        if len(self.exp_buff.buffer) > 128:
            exp_batch = self.exp_buff.sample(128)
            feed_dict = {self.global_net.observation:np.stack(exp_batch[:,0],axis=0),
                self.global_net.measurements:np.vstack(exp_batch[:,2]),
                self.global_net.temperature:[0.1],
                self.global_net.actions:exp_batch[:,1],
                self.global_net.target:np.vstack(exp_batch[:,4]),
                self.global_net.goals:np.vstack(exp_batch[:,3])}
            loss,entropy,g_n,v_n,_ = sess.run([self.global_net.loss,
                self.global_net.entropy,
                self.global_net.grad_norms,
                self.global_net.var_norms,
                self.global_net.apply_grads],feed_dict=feed_dict)
            # NOTE(review): the loss comes from the 128-sample batch but is
            # normalised by the rollout length; kept as-is so logged values
            # stay comparable with earlier runs.
            return loss / len(rollout), entropy / len(rollout), g_n,v_n
        else:
            return 0,0,0,0

    def work(self,sess,coord,saver,train):
        """Run episodes until the coordinator requests a stop.

        Args:
            sess: TensorFlow session shared by all workers.
            coord: `tf.train.Coordinator` polled via `should_stop()`.
            saver: `tf.train.Saver` used by worker_0 to write checkpoints.
            train: when true, the global network is updated after each episode
                and loss statistics are logged.
        """
        episode_count = sess.run(self.global_episodes)
        self.episode_count = episode_count
        total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops) #Copy parameters from global to local network
                episode_buffer = []
                episode_frames = []
                d = False
                t = 0
                temp = 0.25 #How spread out we want our action distribution to be

                s,o_big,m,g,h = self.env.reset()
                self.the_m = m

                while not d:

                    #Here is where our goal-switching takes place
                    # When the battery charge is below 0.3, we set the goal to optimize battery
                    # When the charge is above that value we set the goal to optimize deliveries
                    if m[1] <= .3:
                        self.g = np.array([0.0,1.0])
                    else:
                        self.g = np.array([1.0,0.0])
                    a_dist = sess.run(self.local_DFP.boltzmann,
                        feed_dict={
                        self.local_DFP.temperature:[temp],
                        self.local_DFP.observation:[s],
                        self.local_DFP.measurements:[m],
                        self.local_DFP.goals:[self.g]})
                    # a_dist has a leading batch axis of size 1.
                    a = self._sample_action(a_dist[0],self.g)

                    s1,s1_big,m1,g1,h1,d = self.env.step(a)
                    # The last field is a placeholder that train() replaces with the targets.
                    episode_buffer.append([s,a,np.array(m),self.g,np.zeros(len(self.offsets))])
                    if self.name == 'worker_0' and episode_count % 150 == 0:
                        episode_frames.append(set_image_gridworld(s1_big,m1,t+1,g1,h1))
                    total_steps += 1
                    s = np.copy(s1)
                    m = m1[:]
                    g = g1[:]
                    h = h1
                    t += 1

                    # End the episode once more than 100 steps have been taken
                    # (the check runs after the increment, so at most 101 steps).
                    if t > 100:
                        d = True

                self.episode_deliveries.append(m[0])
                self.episode_lengths.append(t)

                # Update the network using the experience buffer at the end of the episode.
                if train:
                    loss,entropy,g_n,v_n = self.train(episode_buffer,sess)

                # Periodically save gifs of episodes, model parameters, and summary statistics.
                if episode_count % 50 == 0 and episode_count != 0:
                    if episode_count % 2000 == 0 and self.name == 'worker_0' and train:
                        saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk')
                        print("Saved Model")

                    if self.name == 'worker_0' and episode_count % 150 == 0:
                        time_per_step = 0.25
                        self.images = np.array(episode_frames)
                        imageio.mimsave(self.gif_path+'/image'+str(episode_count)+'.gif',self.images, duration=time_per_step)

                    mean_deliveries = np.mean(self.episode_deliveries[-50:])
                    mean_length = np.mean(self.episode_lengths[-50:])
                    summary = tf.Summary()
                    summary.value.add(tag='Performance/Deliveries', simple_value=float(mean_deliveries))
                    summary.value.add(tag='Performance/Length', simple_value=float(mean_length))
                    if train:
                        summary.value.add(tag='Losses/Loss', simple_value=float(loss))
                        summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()
                # Only worker_0 advances the shared episode counter.
                if self.name == 'worker_0':
                    sess.run(self.increment)
                episode_count += 1
                self.episode_count = episode_count

### Training the Network

Hyperparameters

In [6]:
# --- Problem definition ---
a_size = 4  # Size of the discrete action space
num_measurements = 2  # Number of measurement components
offsets = [1,2,4,8,16,32]  # Temporal offsets at which future changes are predicted

# --- Optimisation ---
learning_rate = 1e-3  # Learning rate passed to the optimizer

# --- Run mode ---
train = True  # Update the network while running
load_model = False  # Restore a saved model instead of initialising from scratch

# --- Output locations ---
model_path = './model_goals'  # Directory for model checkpoints
gif_path = './frames_goals'  # Directory for gifs of agent performance

The below code establishes the global tensorflow network, as well as creating and starting each of the workers with their own individual networks.

In [7]:
# Build the global network and the workers, then run every worker on its own
# thread until the coordinator stops them.
tf.reset_default_graph()

exp_buff = ExperienceBuffer()

if not os.path.exists(model_path):
    os.makedirs(model_path)

if not os.path.exists(gif_path):
    os.makedirs(gif_path)

trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
master_network = DFP_Network(a_size,'global',trainer,len(offsets),num_measurements) # Generate global network
with tf.device("/cpu:0"):
    global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False)
    num_workers = 2 #multiprocessing.cpu_count() # Set workers to number of available CPU threads
    workers = []
    # Create worker classes
    # NOTE(review): gameEnv is not imported in any visible cell, and the output
    # recorded for this cell is "NameError: name 'gameEnv' is not defined".
    # Import the environment module before running.
    for i in range(num_workers):
        workers.append(
            Worker(gameEnv(partial=False,size=5),i,a_size,
            trainer,model_path,global_episodes,offsets,
            exp_buff,num_measurements,master_network,gif_path))
    saver = tf.train.Saver(max_to_keep=5)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(model_path)
        # get_checkpoint_state returns None when the directory has no checkpoint;
        # fail with a clear message instead of an AttributeError on None.
        if ckpt is None or not ckpt.model_checkpoint_path:
            raise IOError('No checkpoint found in ' + model_path)
        saver.restore(sess,ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())

    # Start each of the workers on a separate thread
    worker_threads = []
    for worker in workers:
        # Bind the worker through target/args. A lambda closing over the loop
        # variable is late-binding and is only correct if the thread happens to
        # start before the next iteration rebinds `worker`.
        thread = threading.Thread(target=worker.work,args=(sess,coord,saver,train))
        thread.start()
        time.sleep(0.5)
        worker_threads.append(thread)
    coord.join(worker_threads)

NameError: name 'gameEnv' is not defined