In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, DenseLayer
from lasagne.nonlinearities import tanh, softmax

Using gpu device 0: Quadro K1000M (CNMeM is enabled with initial size: 45.0% of memory, cuDNN 5110)


In [7]:
class Agent(object):
    """
    Reinforcement Learning Agent

    This agent can learn to solve reinforcement learning tasks from
    OpenAI Gym by applying the policy gradient method.

    The policy is a feed-forward network with one tanh hidden layer and a
    softmax output over the discrete actions, trained with plain SGD.
    """

    def __init__(self, n_inputs, n_outputs, n_hidden=20):
        """
        Build the policy network and compile the Theano functions.

        n_inputs  -- length of one state vector
        n_outputs -- number of discrete actions (softmax units)
        n_hidden  -- number of units in the hidden layer

        Creates two attributes:
        f_train(states, actions, advantages, learning_rate) applies one SGD
        step and returns the loss; f_eval(states) returns one row of action
        probabilities per state.
        """
        # symbolic variables for state, action, and advantage
        sym_state = T.fmatrix()
        sym_action = T.ivector()
        sym_advantage = T.fvector()
        # policy network
        l_in = InputLayer(shape=(None, n_inputs))
        l_hid = DenseLayer(incoming=l_in, num_units=n_hidden, nonlinearity=tanh, name='hiddenlayer')
        l_out = DenseLayer(incoming=l_hid, num_units=n_outputs, nonlinearity=softmax, name='outputlayer')
        # get network output
        eval_out = lasagne.layers.get_output(l_out, {l_in: sym_state}, deterministic=True)
        # get trainable parameters in the network.
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        # get total number of timesteps
        t_total = sym_state.shape[0]
        # loss function that we'll differentiate to get the policy gradient:
        # minus the advantage-weighted log-probability of the taken actions,
        # averaged over all timesteps in the batch
        loss = -T.log(eval_out[T.arange(t_total), sym_action]).dot(sym_advantage) / t_total
        # learning_rate
        learning_rate = T.fscalar()
        # get gradients
        grads = T.grad(loss, params)
        # update function
        updates = lasagne.updates.sgd(grads, params, learning_rate=learning_rate)
        # declare training and evaluation functions
        self.f_train = theano.function([sym_state, sym_action, sym_advantage, learning_rate], loss, updates=updates, allow_input_downcast=True)
        self.f_eval = theano.function([sym_state], eval_out, allow_input_downcast=True)

    def learn(self, env, n_epochs=100, t_per_batch=10000, traj_t_limit=None,
              learning_rate=0.1, discount_factor=1.0, n_early_stop=0):
        """
        Learn the given environment by the policy gradient method.

        env             -- environment passed through to get_trajectory
        n_epochs        -- maximum number of update steps
        t_per_batch     -- minimum number of timesteps collected per epoch
        traj_t_limit    -- per-trajectory step limit (see get_trajectory)
        learning_rate   -- SGD step size handed to f_train
        discount_factor -- gamma used for the discounted returns
        n_early_stop    -- stop once the mean validation reward has been
                           identical for this many epochs; 0 disables it

        Per-epoch statistics are stored in self.mean_train_rs,
        self.mean_val_rs and self.loss, and a summary line is printed.
        """
        self.mean_train_rs = []
        self.mean_val_rs = []
        self.loss = []
        # range (not xrange) so the loop runs on both Python 2 and 3
        for epoch in range(n_epochs):
            # 1. collect trajectories until we have at least t_per_batch total timesteps
            trajs = []
            t_total = 0
            while t_total < t_per_batch:
                traj = self.get_trajectory(env, traj_t_limit, deterministic=False)
                trajs.append(traj)
                t_total += len(traj["r"])
            all_s = np.concatenate([traj["s"] for traj in trajs])
            all_a = np.concatenate([traj["a"] for traj in trajs])
            # 2. compute cumulative discounted rewards (returns)
            rets = [self._cumulative_discount(traj["r"], discount_factor) for traj in trajs]
            # zero-pad to a common length so returns can be averaged per timestep
            maxlen = max(len(ret) for ret in rets)
            padded_rets = [np.concatenate([ret, np.zeros(maxlen-len(ret))]) for ret in rets]
            # 3. compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # 4. compute advantages
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_adv = np.concatenate(advs)
            # 5. do policy gradient update step
            loss = self.f_train(all_s, all_a, all_adv, learning_rate)
            train_rs = np.array([traj["r"].sum() for traj in trajs]) # trajectory total rewards
            # compute validation reward over 10 greedy (deterministic) rollouts
            val_rs = np.array([self.get_trajectory(env, traj_t_limit, deterministic=True)['r'].sum() for _ in range(10)])
            # update stats
            self.mean_train_rs.append(train_rs.mean())
            self.mean_val_rs.append(val_rs.mean())
            self.loss.append(loss)
            # print stats (single parenthesised argument: same output on Python 2 and 3)
            print('%3d mean_train_r: %6.2f mean_val_r: %6.2f loss: %f' % (epoch+1, train_rs.mean(), val_rs.mean(), loss))
            # check for early stopping: true if the validation reward has not changed in n_early_stop epochs
            if n_early_stop and len(self.mean_val_rs) >= n_early_stop and \
                all([x == self.mean_val_rs[-1] for x in self.mean_val_rs[-n_early_stop:-1]]):
                break

    def get_trajectory(self, env, t_limit=None, render=False, deterministic=True):
        """
        Compute trajectory by iteratively evaluating the agent policy on the environment.

        env           -- object providing reset(), step(a) and, when
                         rendering, render()
        t_limit       -- maximum number of steps; a falsy value falls back
                         to env.spec.timestep_limit
        render        -- call env.render() after every step
        deterministic -- forwarded to get_action

        Returns a dict of arrays: 's' holds the state each action was
        chosen in, 'a' the actions and 'r' the rewards received for them.
        """
        t_limit = t_limit or env.spec.timestep_limit
        s = env.reset()
        states, actions, rewards = [], [], []
        for _ in range(t_limit):
            a = self.get_action(s, deterministic)
            # store the observation BEFORE stepping so that s[t] is the state
            # a[t] was chosen in; the successor state is only used for the
            # next decision
            states.append(s)
            s, r, done, _info = env.step(a)
            actions.append(a)
            rewards.append(r)
            if render:
                env.render()
            if done:
                break
        return {'s': np.array(states), 'a': np.array(actions), 'r': np.array(rewards)}

    def get_action(self, s, deterministic=True):
        """
        Evaluate the agent policy to choose an action, a, given state, s.

        With deterministic=True the most probable action is returned,
        otherwise an action is sampled from the policy distribution.
        """
        # compute action probabilities
        prob_a = self.f_eval(np.asarray(s).reshape(1, -1))
        if deterministic:
            # choose action with highest probability
            return prob_a.argmax()
        # sample action from distribution by inverting the cumulative sum
        cdf = np.cumsum(np.asarray(prob_a))
        hits = cdf > np.random.rand()
        if hits.any():
            return hits.argmax()
        # rounding can leave the cumulative sum just short of the draw; the
        # draw then belongs to the last action, not to action 0 (which is
        # what argmax of an all-False mask would give)
        return np.intp(cdf.size - 1)

    def _cumulative_discount(self, r, gamma):
        """
        Compute the cumulative discounted rewards (returns).

        r_out[i] = r[i] + gamma * r_out[i+1], with r_out[-1] = r[-1].
        An empty reward sequence yields an empty array.
        """
        n = len(r)
        r_out = np.zeros(n, 'float64')
        if n == 0:
            return r_out
        r_out[-1] = r[-1]
        # accumulate backwards from the second-to-last step
        for i in reversed(range(n - 1)):
            r_out[i] = r[i] + gamma * r_out[i+1]
        return r_out

In [11]:
# project-local modules: the World module itself is handed to the agent as
# the environment, and Learner supplies the value used as the number of actions
# NOTE(review): assumes World exposes observation_space / spec / reset / step
# and that Learner.actions is an integer count -- confirm against those modules
import World
import Learner

# init environment
env = World
actions = Learner.actions

# init agent
agent = Agent(
    n_inputs=env.observation_space.shape[0],
    n_outputs=actions,
)

# train agent on the environment
agent.learn(
    env,
    n_epochs=10,
    learning_rate=0.05,
    discount_factor=1,
    t_per_batch=10000,
    traj_t_limit=env.spec.timestep_limit,
    n_early_stop=5,
)

[2017-03-01 18:54:36,799] Making new env: Acrobot-v1


TclError: invalid command name ".139858028161648"

In [None]:
# imported here because no earlier cell brings gym into scope; without it
# gym.make raises NameError on a fresh kernel
import gym

# init environment
env = gym.make('CartPole-v0')
# init agent: one input per observation component, one output per discrete action
agent = Agent(n_inputs=env.observation_space.shape[0],
              n_outputs=env.action_space.n)
# train agent on the environment; stops early once the mean validation
# reward has been identical for 5 epochs
agent.learn(env, n_epochs=100, learning_rate=0.05, discount_factor=1,
            t_per_batch=10000, traj_t_limit=env.spec.timestep_limit, n_early_stop=5)

In [None]:
# learning curves: mean trajectory reward per epoch for the sampled
# (training) rollouts and the deterministic (validation) rollouts
plt.figure(figsize=(10, 5))
plt.plot(agent.mean_train_rs, label='training')
plt.plot(agent.mean_val_rs, label='validation')
plt.xlabel('epochs')
plt.ylabel('mean reward')
# limit the x-axis to the epochs that were actually run
plt.xlim((0, len(agent.mean_val_rs) - 1))
plt.legend(loc=2)
plt.grid()
_ = plt.show()

In [16]:
# review solution
# roll out a single episode of at most 1000 steps, rendering every step;
# get_trajectory defaults to deterministic=True, so the most probable action
# is taken at each step. The returned trajectory is discarded.
agent.get_trajectory(env, t_limit=1000, render=True)
# NOTE(review): presumably closes the render window (older Gym render API) -- confirm
env.render(close=True)