In [45]:
# # in google colab uncomment this

# import os

# os.system('apt-get install -y xvfb')
# os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')
# os.system('apt-get install -y python-opengl ffmpeg')
# os.system('pip install pyglet==1.2.4')

# launch XVFB if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    %env DISPLAY = : 1

### Let's make a TRPO!

In this notebook we will write the code of the one Trust Region Policy Optimization.
As usually, it contains a few different parts which we are going to reproduce.



In [46]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [48]:
import gym

env = gym.make("Acrobot-v1")
env.reset()
observation_shape = env.observation_space.shape
n_actions = env.action_space.n
print("Observation Space", env.observation_space)
print("Action Space", env.action_space)

Observation Space Box(6,)
Action Space Discrete(3)


In [49]:
import matplotlib.pyplot as plt
%matplotlib inline
#plt.imshow(env.render('rgb_array'))

### Step 1: Defining a network

With all it's complexity, at it's core TRPO is yet another policy gradient method. 

This essentially means we're actually training a stochastic policy $ \pi_\theta(a|s) $. 

And yes, it's gonna be a neural network. So let's start by defining one.

In [51]:
class TRPOAgent(nn.Module):
    def __init__(self, state_shape, n_actions, hidden_size=32):
        '''
        Here you should define your model
        You should have LOG-PROBABILITIES as output because you will need it to compute loss
        We recommend that you start simple: 
        use 1-2 hidden layers with 100-500 units and relu for the first try
        '''
        nn.Module.__init__(self)

        #<your network here >
        self.fc1 = nn.Linear(state_shape[0],256)
        self.fc2 = nn.Linear(256,hidden_size)
        self.fc3 = nn.Linear(hidden_size,n_actions)

    def forward(self, states):
        """
        takes agent's observation (Variable), returns log-probabilities (Variable)
        :param state_t: a batch of states, shape = [batch_size, state_shape]
        """

        # Use your network to compute log_probs for given state
        x = F.relu(self.fc1(states))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        #print(x.shape)
        #print(x)
        log_probs = F.log_softmax(x,dim=1)
        return log_probs

    def get_log_probs(self, states):
        '''
        Log-probs for training
        '''

        return self.forward(states)

    def get_probs(self, states):
        '''
        Probs for interaction
        '''

        return torch.exp(self.forward(states))

    def act(self, obs, sample=True):
        '''
        Samples action from policy distribution (sample = True) or takes most likely action (sample = False)
        :param: obs - single observation vector
        :param sample: if True, samples from \pi, otherwise takes most likely action
        :returns: action (single integer) and probabilities for all actions
        '''

        probs = self.get_probs(Variable(torch.FloatTensor([obs]))).data.numpy()

        if sample:
            action = int(np.random.choice(n_actions, p=probs[0]))
        else:
            action = int(np.argmax(probs))

        return action, probs[0]


agent = TRPOAgent(observation_shape, n_actions)

In [52]:
# Check if log-probabilities satisfies all the requirements
log_probs = agent.get_log_probs(Variable(torch.FloatTensor([env.reset()])))
assert isinstance(
    log_probs, Variable) and log_probs.requires_grad, "qvalues must be a torch variable with grad"
assert len(
    log_probs.shape) == 2 and log_probs.shape[0] == 1 and log_probs.shape[1] == n_actions
sums = torch.sum(torch.exp(log_probs), dim=1)
assert (0.999 < sums).all() and (1.001 > sums).all()

# Demo use
print("sampled:", [agent.act(env.reset()) for _ in range(5)])
print("greedy:", [agent.act(env.reset(), sample=False) for _ in range(5)])

sampled: [(2, array([0.35947153, 0.2913053 , 0.3492231 ], dtype=float32)), (2, array([0.357119  , 0.29115525, 0.35172576], dtype=float32)), (0, array([0.35697815, 0.2918423 , 0.3511795 ], dtype=float32)), (0, array([0.35854623, 0.29162666, 0.34982714], dtype=float32)), (2, array([0.3569325 , 0.29200277, 0.35106468], dtype=float32))]
greedy: [(0, array([0.358427 , 0.2915341, 0.3500389], dtype=float32)), (0, array([0.35758692, 0.29114956, 0.3512635 ], dtype=float32)), (0, array([0.3553263 , 0.29117334, 0.35350037], dtype=float32)), (0, array([0.3558737 , 0.29118684, 0.35293943], dtype=float32)), (0, array([0.35740367, 0.29063135, 0.35196495], dtype=float32))]


#### Flat parameters operations

We are going to use it

In [53]:
def get_flat_params_from(model):
    params = []
    for param in model.parameters():
        params.append(param.data.view(-1))

    flat_params = torch.cat(params)
    return flat_params


def set_flat_params_to(model, flat_params):
    prev_ind = 0
    for param in model.parameters():
        flat_size = int(np.prod(list(param.size())))
        param.data.copy_(
            flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
        prev_ind += flat_size

Compute cummulative reward just like you did in vanilla REINFORCE

In [54]:
import scipy.signal


def get_cummulative_returns(r, gamma=1):
    """
    Computes cummulative discounted rewards given immediate rewards
    G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...
    Also known as R(s,a).
    """
    r = np.array(r)
    assert r.ndim >= 1
    return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]

In [55]:
# simple demo on rewards [0,0,1,0,0,1]
get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)

array([1.40049, 1.5561 , 1.729  , 0.81   , 0.9    , 1.     ])

**Rollout**

In [56]:
def rollout(env, agent, max_pathlength=2500, n_timesteps=50000):
    """
    Generate rollouts for training.
    :param: env - environment in which we will make actions to generate rollouts.
    :param: act - the function that can return policy and action given observation.
    :param: max_pathlength - maximum size of one path that we generate.
    :param: n_timesteps - total sum of sizes of all pathes we generate.
    """
    paths = []

    total_timesteps = 0
    while total_timesteps < n_timesteps:
        obervations, actions, rewards, action_probs = [], [], [], []
        obervation = env.reset()
        for _ in range(max_pathlength):
            action, policy = agent.act(obervation)
            obervations.append(obervation)
            actions.append(action)
            action_probs.append(policy)
            obervation, reward, done, _ = env.step(action)
            rewards.append(reward)
            total_timesteps += 1
            if done or total_timesteps == n_timesteps:
                path = {"observations": np.array(obervations),
                        "policy": np.array(action_probs),
                        "actions": np.array(actions),
                        "rewards": np.array(rewards),
                        "cumulative_returns": get_cummulative_returns(rewards),
                        }
                paths.append(path)
                break
    return paths

In [57]:
paths = rollout(env, agent, max_pathlength=5, n_timesteps=100)
print(paths[-1])
assert (paths[0]['policy'].shape == (5, n_actions))
assert (paths[0]['cumulative_returns'].shape == (5,))
assert (paths[0]['rewards'].shape == (5,))
assert (paths[0]['observations'].shape == (5,)+observation_shape)
assert (paths[0]['actions'].shape == (5,))
print('It\'s ok')

{'observations': array([[ 0.99746324, -0.07118348,  0.99723227,  0.07434922,  0.05132186,
         0.07783075],
       [ 0.99791875, -0.06448379,  0.99397806,  0.10957929,  0.01571766,
         0.27064073],
       [ 0.99927421, -0.03809267,  0.99387654,  0.1104963 ,  0.24284108,
        -0.25690373],
       [ 0.99960692,  0.02803571,  0.99990913,  0.01348046,  0.40199609,
        -0.68825459],
       [ 0.99489788,  0.10088709,  0.99333259, -0.11528388,  0.30949653,
        -0.56856268]]), 'policy': array([[0.35865697, 0.2926023 , 0.3487408 ],
       [0.35878274, 0.29423392, 0.3469834 ],
       [0.3570532 , 0.29099935, 0.35194737],
       [0.35443535, 0.2884733 , 0.35709134],
       [0.3530431 , 0.2886237 , 0.35833317]], dtype=float32), 'actions': array([2, 0, 0, 1, 1]), 'rewards': array([-1., -1., -1., -1., -1.]), 'cumulative_returns': array([-5., -4., -3., -2., -1.])}
It's ok


### Step 3: Auxiliary functions

Now let's define the loss functions and something else for actual TRPO training.

The surrogate reward should be
$$J_{surr}= {1 \over N} \sum\limits_{i=0}^N \frac{\pi_{\theta}(s_i, a_i)}{\pi_{\theta_{old}}(s_i, a_i)}A_{\theta_{old}(s_i, a_i)}$$

For simplicity, let's use cummulative returns instead of advantage for now:
$$J'_{surr}= {1 \over N} \sum\limits_{i=0}^N \frac{\pi_{\theta}(s_i, a_i)}{\pi_{\theta_{old}}(s_i, a_i)}G_{\theta_{old}(s_i, a_i)}$$

Or alternatively, minimize the surrogate loss:
$$ L_{surr} = - J'_{surr} $$  


In [58]:
def get_loss(agent, observations, actions, cummulative_returns, old_probs):
    """
    Computes TRPO objective
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns
    :param: old_probs - batch of probabilities computed by old network
    :returns: scalar value of the objective function
    """
    batch_size = observations.shape[0]
    log_probs_all = agent.get_log_probs(observations)
    probs_all = torch.exp(log_probs_all)

    probs_for_actions = probs_all[torch.arange(
        0, batch_size, out=torch.LongTensor()), actions]
    old_probs_for_actions = old_probs[torch.arange(
        0, batch_size, out=torch.LongTensor()), actions]

    # Compute surrogate loss, aka importance-sampled policy gradient
    Loss = -torch.mean((probs_for_actions/old_probs_for_actions)*cummulative_returns)

    assert Loss.shape == torch.Size([])
    return Loss

We can ascend these gradients as long as our $pi_\theta(a|s)$ satisfies the constraint
$$E_{s,\pi_{\Theta_{t}}}\Big[KL(\pi(\Theta_{t}, s) \:||\:\pi(\Theta_{t+1}, s))\Big]< \alpha$$


where

$$KL(p||q) = E _p log({p \over q})$$

In [59]:
def get_kl(agent, observations, actions, cummulative_returns, old_probs):
    """
    Computes KL-divergence between network policy and old policy
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns (we don't need it actually)
    :param: old_probs - batch of probabilities computed by old network
    :returns: scalar value of the KL-divergence
    """
    batch_size = observations.shape[0]
    log_probs_all = agent.get_log_probs(observations)
    probs_all = torch.exp(log_probs_all)

    # Compute Kullback-Leibler divergence (see formula above)
    # Note: you need to sum KL and entropy over all actions, not just the ones agent took
    old_log_probs = torch.log(old_probs+1e-10)

    #kl = <cumpute kullback-leibler >
    kl = torch.sum(old_probs * (old_log_probs-log_probs_all))/batch_size

    assert kl.shape == torch.Size([])
    assert (kl > -0.0001).all() and (kl < 10000).all()
    return kl

In [60]:
def get_entropy(agent, observations):
    """
    Computes entropy of the network policy 
    :param: observations - batch of observations
    :returns: scalar value of the entropy
    """

    observations = Variable(torch.FloatTensor(observations))

    batch_size = observations.shape[0]
    log_probs_all = agent.get_log_probs(observations)
    probs_all = torch.exp(log_probs_all)

    entropy = torch.sum(-probs_all * log_probs_all) / batch_size

    assert entropy.shape == torch.Size([])
    return entropy

**Linear search**

TRPO in its core involves ascending surrogate policy gradient constrained by KL divergence. 

In order to enforce this constraint, we're gonna use linesearch. You can find out more about it [here](https://en.wikipedia.org/wiki/Linear_search)

In [61]:
def linesearch(f, x, fullstep, max_kl):
    """
    Linesearch finds the best parameters of neural networks in the direction of fullstep contrainted by KL divergence.
    :param: f - function that returns loss, kl and arbitrary third component.
    :param: x - old parameters of neural network.
    :param: fullstep - direction in which we make search.
    :param: max_kl - constraint of KL divergence.
    :returns:
    """
    max_backtracks = 10
    loss, _, = f(x)
    for stepfrac in .5**np.arange(max_backtracks):
        xnew = x + stepfrac * fullstep
        new_loss, kl = f(xnew)
        actual_improve = new_loss - loss
        if kl.data.numpy() <= max_kl and actual_improve.data.numpy() < 0:
            x = xnew
            loss = new_loss
    return x

**Conjugate gradients**

Since TRPO includes contrainted optimization, we will need to solve Ax=b using conjugate gradients.

In general, CG is an algorithm that solves Ax=b where A is positive-defined. A is Hessian matrix so A is positive-defined. You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)

In [62]:
from numpy.linalg import inv


def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """
    This method solves system of equation Ax=b using iterative method called conjugate gradients
    :f_Ax: function that returns Ax
    :b: targets for Ax
    :cg_iters: how many iterations this method should do
    :residual_tol: epsilon for stability
    """
    p = b.clone()
    r = b.clone()
    x = torch.zeros(b.size())
    rdotr = torch.sum(r*r)
    for i in range(cg_iters):
        z = f_Ax(p)
        v = rdotr / (torch.sum(p*z) + 1e-8)
        x += v * p
        r -= v * z
        newrdotr = torch.sum(r*r)
        mu = newrdotr / (rdotr + 1e-8)
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x

In [63]:
# This code validates conjugate gradients
A = np.random.rand(8, 8)
A = np.matmul(np.transpose(A), A)


def f_Ax(x):
    return torch.matmul(torch.FloatTensor(A), x.view((-1, 1))).view(-1)


b = np.random.rand(8)

w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),
                        np.transpose(A)), b.reshape((-1, 1))).reshape(-1)
print(w)
print(conjugate_gradient(f_Ax, torch.FloatTensor(b)).numpy())

[-18.67383091 -24.07102509 -42.06491357  11.38999762   1.57374639
  14.77766929   4.70970379  44.64830099]
[-18.680172  -24.069382  -42.058407   11.389226    1.5742568  14.783769
   4.7202454  44.634914 ]


### Step 4: training
In this section we construct the whole update step function.

In [64]:
def update_step(agent, observations, actions, cummulative_returns, old_probs, max_kl):
    """
    This function does the TRPO update step
    :param: observations - batch of observations
    :param: actions - batch of actions
    :param: cummulative_returns - batch of cummulative returns
    :param: old_probs - batch of probabilities computed by old network
    :param: max_kl - controls how big KL divergence may be between old and new policy every step.
    :returns: KL between new and old policies and the value of the loss function.
    """

    # Here we prepare the information
    observations = Variable(torch.FloatTensor(observations))
    actions = torch.LongTensor(actions)
    cummulative_returns = Variable(torch.FloatTensor(cummulative_returns))
    old_probs = Variable(torch.FloatTensor(old_probs))

    # Here we compute gradient of the loss function
    loss = get_loss(agent, observations, actions,
                    cummulative_returns, old_probs)
    grads = torch.autograd.grad(loss, agent.parameters())
    loss_grad = torch.cat([grad.view(-1) for grad in grads]).data

    def Fvp(v):
        # Here we compute Fx to do solve Fx = g using conjugate gradients
        # We actually do here a couple of tricks to compute it efficiently

        kl = get_kl(agent, observations, actions,
                    cummulative_returns, old_probs)

        grads = torch.autograd.grad(kl, agent.parameters(), create_graph=True)
        flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

        kl_v = (flat_grad_kl * Variable(v)).sum()
        grads = torch.autograd.grad(kl_v, agent.parameters())
        flat_grad_grad_kl = torch.cat(
            [grad.contiguous().view(-1) for grad in grads]).data

        return flat_grad_grad_kl + v * 0.1

    # Here we solveolve Fx = g system using conjugate gradients
    stepdir = conjugate_gradient(Fvp, -loss_grad, 10)

    # Here we compute the initial vector to do linear search
    shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True)

    lm = torch.sqrt(shs / max_kl)
    fullstep = stepdir / lm[0]

    neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True)

    # Here we get the start point
    prev_params = get_flat_params_from(agent)

    def get_loss_kl(params):
        # Helper for linear search
        set_flat_params_to(agent, params)
        return [get_loss(agent, observations, actions, cummulative_returns, old_probs),
                get_kl(agent, observations, actions, cummulative_returns, old_probs)]

    # Here we find our new parameters
    new_params = linesearch(get_loss_kl, prev_params, fullstep, max_kl)

    # And we set it to our network
    set_flat_params_to(agent, new_params)

    return get_loss_kl(new_params)

##### Step 5: Main TRPO loop

Here we will train our network!

In [65]:
import time
from itertools import count
from collections import OrderedDict

# this is hyperparameter of TRPO. It controls how big KL divergence may be between old and new policy every step.
max_kl = 0.01
numeptotal = 0  # this is number of episodes that we played.

start_time = time.time()

for i in count(1):

    print("\n********** Iteration %i ************" % i)

    # Generating paths.
    print("Rollout")
    paths = rollout(env, agent)
    print("Made rollout")

    # Updating policy.
    observations = np.concatenate([path["observations"] for path in paths])
    actions = np.concatenate([path["actions"] for path in paths])
    returns = np.concatenate([path["cumulative_returns"] for path in paths])
    old_probs = np.concatenate([path["policy"] for path in paths])

    loss, kl = update_step(agent, observations, actions,
                           returns, old_probs, max_kl)

    # Report current progress
    episode_rewards = np.array([path["rewards"].sum() for path in paths])

    stats = OrderedDict()
    numeptotal += len(episode_rewards)
    stats["Total number of episodes"] = numeptotal
    stats["Average sum of rewards per episode"] = episode_rewards.mean()
    stats["Std of rewards per episode"] = episode_rewards.std()
    stats["Time elapsed"] = "%.2f mins" % ((time.time() - start_time)/60.)
    stats["KL between old and new distribution"] = kl.data.numpy()
    stats["Entropy"] = get_entropy(agent, observations).data.numpy()
    stats["Surrogate loss"] = loss.data.numpy()
    for k, v in stats.items():
        print(k + ": " + " " * (40 - len(k)) + str(v))
    i += 1


********** Iteration 1 ************
Rollout
Made rollout
Total number of episodes:                 101
Average sum of rewards per episode:       -495.029702970297
Std of rewards per episode:               29.82372483909597
Time elapsed:                             0.27 mins
KL between old and new distribution:      0.009998239
Entropy:                                  1.0768735
Surrogate loss:                           248.4067

********** Iteration 2 ************
Rollout
Made rollout
Total number of episodes:                 201
Average sum of rewards per episode:       -500.0
Std of rewards per episode:               0.0
Time elapsed:                             0.53 mins
KL between old and new distribution:      0.009992524
Entropy:                                  1.074311
Surrogate loss:                           249.9983

********** Iteration 3 ************
Rollout
Made rollout
Total number of episodes:                 304
Average sum of rewards per episode:       -485.271844660

Entropy:                                  0.6000928
Surrogate loss:                           56.662956

********** Iteration 20 ************
Rollout
Made rollout
Total number of episodes:                 6153
Average sum of rewards per episode:       -99.6056338028169
Std of rewards per episode:               24.238449269469285
Time elapsed:                             5.40 mins
KL between old and new distribution:      0.009985874
Entropy:                                  0.5592053
Surrogate loss:                           52.54215

********** Iteration 21 ************
Rollout
Made rollout
Total number of episodes:                 6642
Average sum of rewards per episode:       -101.25153374233129
Std of rewards per episode:               30.36559986418591
Time elapsed:                             5.66 mins
KL between old and new distribution:      0.009974219
Entropy:                                  0.54621255
Surrogate loss:                           54.946014

********** Iteration

Std of rewards per episode:               24.981400346989325
Time elapsed:                             10.20 mins
KL between old and new distribution:      0.009985494
Entropy:                                  0.3237124
Surrogate loss:                           48.902946

********** Iteration 39 ************
Rollout
Made rollout
Total number of episodes:                 16210
Average sum of rewards per episode:       -91.0828729281768
Std of rewards per episode:               29.43918007700487
Time elapsed:                             10.47 mins
KL between old and new distribution:      0.009994028
Entropy:                                  0.2742218
Surrogate loss:                           50.0626

********** Iteration 40 ************
Rollout
Made rollout
Total number of episodes:                 16763
Average sum of rewards per episode:       -89.41772151898734
Std of rewards per episode:               26.340190151775115
Time elapsed:                             10.75 mins
KL between

********** Iteration 57 ************
Rollout
Made rollout
Total number of episodes:                 26414
Average sum of rewards per episode:       -84.77015437392797
Std of rewards per episode:               36.28128154492625
Time elapsed:                             15.28 mins
KL between old and new distribution:      0.009993436
Entropy:                                  0.17642756
Surrogate loss:                           49.894226

********** Iteration 58 ************
Rollout
Made rollout
Total number of episodes:                 26989
Average sum of rewards per episode:       -85.96
Std of rewards per episode:               32.105101314818846
Time elapsed:                             15.55 mins
KL between old and new distribution:      0.009977716
Entropy:                                  0.14921056
Surrogate loss:                           48.750237

********** Iteration 59 ************
Rollout
Made rollout
Total number of episodes:                 27595
Average sum of rewards pe

KL between old and new distribution:      0.009976936
Entropy:                                  0.074514754
Surrogate loss:                           42.693653

********** Iteration 76 ************
Rollout
Made rollout
Total number of episodes:                 37942
Average sum of rewards per episode:       -77.24882629107981
Std of rewards per episode:               18.297379881174002
Time elapsed:                             20.34 mins
KL between old and new distribution:      0.009971207
Entropy:                                  0.07387835
Surrogate loss:                           40.646446

********** Iteration 77 ************
Rollout
Made rollout
Total number of episodes:                 38585
Average sum of rewards per episode:       -76.76205287713842
Std of rewards per episode:               16.02110748527538
Time elapsed:                             20.61 mins
KL between old and new distribution:      0.009972673
Entropy:                                  0.07455063
Surrogate l

Average sum of rewards per episode:       -76.88317757009345
Std of rewards per episode:               16.137369718437018
Time elapsed:                             25.14 mins
KL between old and new distribution:      0.009977268
Entropy:                                  0.056727987
Surrogate loss:                           39.95376

********** Iteration 95 ************
Rollout
Made rollout
Total number of episodes:                 50128
Average sum of rewards per episode:       -77.61792452830188
Std of rewards per episode:               19.604026494878905
Time elapsed:                             25.41 mins
KL between old and new distribution:      0.009984415
Entropy:                                  0.055282626
Surrogate loss:                           41.1621

********** Iteration 96 ************
Rollout
Made rollout
Total number of episodes:                 50762
Average sum of rewards per episode:       -77.86593059936908
Std of rewards per episode:               21.5436466990381

Entropy:                                  0.03738677
Surrogate loss:                           40.341644

********** Iteration 113 ************
Rollout
Made rollout
Total number of episodes:                 61926
Average sum of rewards per episode:       -73.73991031390135
Std of rewards per episode:               14.8750020735635
Time elapsed:                             30.21 mins
KL between old and new distribution:      0.009984355
Entropy:                                  0.03715967
Surrogate loss:                           38.266926

********** Iteration 114 ************
Rollout
Made rollout
Total number of episodes:                 62589
Average sum of rewards per episode:       -74.41628959276018
Std of rewards per episode:               15.995432232241383
Time elapsed:                             30.48 mins
KL between old and new distribution:      0.009977124
Entropy:                                  0.036113594
Surrogate loss:                           38.8388

********** It

Average sum of rewards per episode:       -73.40625
Std of rewards per episode:               11.957395459530275
Time elapsed:                             35.00 mins
KL between old and new distribution:      0.009980543
Entropy:                                  0.030549714
Surrogate loss:                           37.568474

********** Iteration 132 ************
Rollout
Made rollout
Total number of episodes:                 74689
Average sum of rewards per episode:       -73.96551724137932
Std of rewards per episode:               19.958435328813735
Time elapsed:                             35.27 mins
KL between old and new distribution:      0.009990061
Entropy:                                  0.032648273
Surrogate loss:                           39.50515

********** Iteration 133 ************
Rollout
Made rollout
Total number of episodes:                 75335
Average sum of rewards per episode:       -76.40247678018576
Std of rewards per episode:               22.068194196715552
Ti

Entropy:                                  0.024795527
Surrogate loss:                           36.487377

********** Iteration 150 ************
Rollout
Made rollout
Total number of episodes:                 86887
Average sum of rewards per episode:       -71.78165938864629
Std of rewards per episode:               10.608780035438373
Time elapsed:                             40.06 mins
KL between old and new distribution:      0.009999356
Entropy:                                  0.02341817
Surrogate loss:                           36.570465

********** Iteration 151 ************
Rollout
Made rollout
Total number of episodes:                 87586
Average sum of rewards per episode:       -70.5307582260372
Std of rewards per episode:               8.99133971330033
Time elapsed:                             40.33 mins
KL between old and new distribution:      0.009993844
Entropy:                                  0.022367336
Surrogate loss:                           35.77621

********** I

Average sum of rewards per episode:       -74.41628959276018
Std of rewards per episode:               14.696995110143057
Time elapsed:                             44.90 mins
KL between old and new distribution:      0.009972454
Entropy:                                  0.021916773
Surrogate loss:                           38.571278

********** Iteration 169 ************
Rollout
Made rollout
Total number of episodes:                 99873
Average sum of rewards per episode:       -71.3603473227207
Std of rewards per episode:               11.810569079948976
Time elapsed:                             45.17 mins
KL between old and new distribution:      0.00999651
Entropy:                                  0.019308679
Surrogate loss:                           36.56501

********** Iteration 170 ************
Rollout
Made rollout
Total number of episodes:                 100543
Average sum of rewards per episode:       -73.62985074626866
Std of rewards per episode:               21.9815459571

KL between old and new distribution:      0.009994368
Entropy:                                  0.024179816
Surrogate loss:                           39.86697

********** Iteration 187 ************
Rollout
Made rollout
Total number of episodes:                 111855
Average sum of rewards per episode:       -76.04314329738058
Std of rewards per episode:               15.193806620172806
Time elapsed:                             50.04 mins
KL between old and new distribution:      0.009968182
Entropy:                                  0.023197375
Surrogate loss:                           39.41072

********** Iteration 188 ************
Rollout
Made rollout
Total number of episodes:                 112512
Average sum of rewards per episode:       -75.10502283105023
Std of rewards per episode:               16.366695655316494
Time elapsed:                             50.31 mins
KL between old and new distribution:      0.009972204
Entropy:                                  0.01856051
Surroga

Total number of episodes:                 124149
Average sum of rewards per episode:       -72.31524926686217
Std of rewards per episode:               11.453541334543168
Time elapsed:                             54.96 mins
KL between old and new distribution:      0.009983369
Entropy:                                  0.01775869
Surrogate loss:                           36.998592

********** Iteration 206 ************
Rollout
Made rollout
Total number of episodes:                 124832
Average sum of rewards per episode:       -72.20790629575403
Std of rewards per episode:               12.482753977424695
Time elapsed:                             55.25 mins
KL between old and new distribution:      0.009990318
Entropy:                                  0.019279605
Surrogate loss:                           37.099815

********** Iteration 207 ************
Rollout
Made rollout
Total number of episodes:                 125500
Average sum of rewards per episode:       -73.85179640718563
Std

KL between old and new distribution:      0.009994264
Entropy:                                  0.016273838
Surrogate loss:                           38.349483

********** Iteration 224 ************
Rollout
Made rollout
Total number of episodes:                 137201
Average sum of rewards per episode:       -72.74778761061947
Std of rewards per episode:               13.239448499852987
Time elapsed:                             60.29 mins
KL between old and new distribution:      0.009971927
Entropy:                                  0.012926938
Surrogate loss:                           37.427284

********** Iteration 225 ************
Rollout
Made rollout
Total number of episodes:                 137885
Average sum of rewards per episode:       -72.10087719298245
Std of rewards per episode:               15.340293970898127
Time elapsed:                             60.57 mins
KL between old and new distribution:      0.009994091
Entropy:                                  0.015343645
Surr

Total number of episodes:                 149553
Average sum of rewards per episode:       -71.9941605839416
Std of rewards per episode:               14.268436742303036
Time elapsed:                             65.28 mins
KL between old and new distribution:      0.009998784
Entropy:                                  0.011070323
Surrogate loss:                           37.32041

********** Iteration 243 ************
Rollout
Made rollout
Total number of episodes:                 150247
Average sum of rewards per episode:       -71.04755043227665
Std of rewards per episode:               12.573784014046632
Time elapsed:                             65.56 mins
KL between old and new distribution:      0.0013052211
Entropy:                                  0.00964906
Surrogate loss:                           36.603058

********** Iteration 244 ************
Rollout
Made rollout
Total number of episodes:                 150952
Average sum of rewards per episode:       -69.92340425531916
Std 

Time elapsed:                             70.25 mins
KL between old and new distribution:      0.009980774
Entropy:                                  0.015592002
Surrogate loss:                           37.441963

********** Iteration 261 ************
Rollout
Made rollout
Total number of episodes:                 162593
Average sum of rewards per episode:       -72.31524926686217
Std of rewards per episode:               12.936865722909372
Time elapsed:                             70.52 mins
KL between old and new distribution:      0.009981825
Entropy:                                  0.01338901
Surrogate loss:                           37.20769

********** Iteration 262 ************
Rollout
Made rollout
Total number of episodes:                 163286
Average sum of rewards per episode:       -71.15151515151516
Std of rewards per episode:               10.44832615060123
Time elapsed:                             70.79 mins
KL between old and new distribution:      0.009978797
Entropy:

********** Iteration 279 ************
Rollout
Made rollout
Total number of episodes:                 174930
Average sum of rewards per episode:       -71.46521739130435
Std of rewards per episode:               9.905470982619837
Time elapsed:                             75.56 mins
KL between old and new distribution:      0.009975279
Entropy:                                  0.014893081
Surrogate loss:                           36.28163

********** Iteration 280 ************
Rollout
Made rollout
Total number of episodes:                 175621
Average sum of rewards per episode:       -71.3603473227207
Std of rewards per episode:               11.721396413740557
Time elapsed:                             75.83 mins
KL between old and new distribution:      0.009982704
Entropy:                                  0.015821517
Surrogate loss:                           36.552353

********** Iteration 281 ************
Rollout
Made rollout
Total number of episodes:                 176309
Average

Std of rewards per episode:               13.896307522783815
Time elapsed:                             80.43 mins
KL between old and new distribution:      0.009995069
Entropy:                                  0.022060169
Surrogate loss:                           37.218857

********** Iteration 298 ************
Rollout
Made rollout
Total number of episodes:                 187917
Average sum of rewards per episode:       -72.96597633136095
Std of rewards per episode:               15.43895295755108
Time elapsed:                             80.70 mins
KL between old and new distribution:      0.009981417
Entropy:                                  0.02134832
Surrogate loss:                           37.951004

********** Iteration 299 ************
Rollout
Made rollout
Total number of episodes:                 188578
Average sum of rewards per episode:       -74.64447806354009
Std of rewards per episode:               15.961728639380961
Time elapsed:                             80.96 mins


Surrogate loss:                           38.05952

********** Iteration 316 ************
Rollout
Made rollout
Total number of episodes:                 200222
Average sum of rewards per episode:       -73.18694362017804
Std of rewards per episode:               22.980266502415585
Time elapsed:                             85.60 mins
KL between old and new distribution:      0.009985786
Entropy:                                  0.018229969
Surrogate loss:                           40.063625

********** Iteration 317 ************
Rollout
Made rollout
Total number of episodes:                 200925
Average sum of rewards per episode:       -70.12517780938833
Std of rewards per episode:               11.188987190837226
Time elapsed:                             85.87 mins
KL between old and new distribution:      0.009971808
Entropy:                                  0.023360485
Surrogate loss:                           35.847263

********** Iteration 318 ************
Rollout
Made rollout
T

Average sum of rewards per episode:       -69.62288135593221
Std of rewards per episode:               11.214542951841237
Time elapsed:                             90.52 mins
KL between old and new distribution:      0.007531218
Entropy:                                  0.014076657
Surrogate loss:                           35.64913

********** Iteration 335 ************
Rollout
Made rollout
Total number of episodes:                 213463
Average sum of rewards per episode:       -69.92340425531916
Std of rewards per episode:               10.33975072952121
Time elapsed:                             90.79 mins
KL between old and new distribution:      0.009970377
Entropy:                                  0.012881282
Surrogate loss:                           35.560097

********** Iteration 336 ************
Rollout
Made rollout
Total number of episodes:                 214172
Average sum of rewards per episode:       -69.52327221438647
Std of rewards per episode:               9.941497845

Entropy:                                  0.016396165
Surrogate loss:                           38.938957

********** Iteration 353 ************
Rollout
Made rollout
Total number of episodes:                 226008
Average sum of rewards per episode:       -70.84051724137932
Std of rewards per episode:               11.172067064383478
Time elapsed:                             95.67 mins
KL between old and new distribution:      0.009987808
Entropy:                                  0.01857441
Surrogate loss:                           36.155834

********** Iteration 354 ************
Rollout
Made rollout
Total number of episodes:                 226688
Average sum of rewards per episode:       -72.53088235294118
Std of rewards per episode:               15.079658653424694
Time elapsed:                             95.94 mins
KL between old and new distribution:      0.0099707
Entropy:                                  0.0158165
Surrogate loss:                           37.718075

**********

Average sum of rewards per episode:       -71.46521739130435
Std of rewards per episode:               12.704995321217607
Time elapsed:                             100.56 mins
KL between old and new distribution:      0.00844771
Entropy:                                  0.013078624
Surrogate loss:                           36.761307

********** Iteration 372 ************
Rollout
Made rollout
Total number of episodes:                 239250
Average sum of rewards per episode:       -69.42394366197183
Std of rewards per episode:               7.919613107436115
Time elapsed:                             100.82 mins
KL between old and new distribution:      0.009994286
Entropy:                                  0.012979067
Surrogate loss:                           35.09308

********** Iteration 373 ************
Rollout
Made rollout
Total number of episodes:                 239957
Average sum of rewards per episode:       -69.72277227722772
Std of rewards per episode:               15.5719647

KL between old and new distribution:      0.009982963
Entropy:                                  0.014304277
Surrogate loss:                           35.43665

********** Iteration 390 ************
Rollout
Made rollout
Total number of episodes:                 251938
Average sum of rewards per episode:       -70.9453237410072
Std of rewards per episode:               19.76938707150431
Time elapsed:                             105.70 mins
KL between old and new distribution:      0.009997852
Entropy:                                  0.013895457
Surrogate loss:                           38.143757

********** Iteration 391 ************
Rollout
Made rollout
Total number of episodes:                 252648
Average sum of rewards per episode:       -69.42394366197183
Std of rewards per episode:               11.15444665692468
Time elapsed:                             105.97 mins
KL between old and new distribution:      0.009990273
Entropy:                                  0.012565585
Surrog

Made rollout
Total number of episodes:                 264566
Average sum of rewards per episode:       -71.04755043227665
Std of rewards per episode:               12.77444714119976
Time elapsed:                             110.61 mins
KL between old and new distribution:      0.009974147
Entropy:                                  0.012299502
Surrogate loss:                           36.56492

********** Iteration 409 ************
Rollout
Made rollout
Total number of episodes:                 265257
Average sum of rewards per episode:       -71.3603473227207
Std of rewards per episode:               11.692222452155272
Time elapsed:                             110.87 mins
KL between old and new distribution:      0.009980133
Entropy:                                  0.011174643
Surrogate loss:                           36.535404

********** Iteration 410 ************
Rollout
Made rollout
Total number of episodes:                 265958
Average sum of rewards per episode:       -70.32810

KeyboardInterrupt: 

# Homework option I: better sampling (10+pts)

In this section, you're invited to implement a better rollout strategy called _vine_.

![img](https://s17.postimg.org/i90chxgvj/vine.png)

In most gym environments, you can actually backtrack by using states. You can find a wrapper that saves/loads states in [the mcts seminar](https://github.com/yandexdataschool/Practical_RL/blob/master/week10_planning/seminar_MCTS.ipynb).

You can read more about in the [TRPO article](https://arxiv.org/abs/1502.05477) in section 5.2.

The goal here is to implement such rollout policy (we recommend using tree data structure like in the seminar above).
Then you can assign cummulative rewards similar to `get_cummulative_rewards`, but for a tree.

__bonus task__ - parallelize samples using multiple cores

# Homework option II (10+pts)

Let's use TRPO to train evil robots! (pick any of two)
* [MuJoCo robots](https://gym.openai.com/envs#mujoco)
* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)

The catch here is that those environments have continuous action spaces. 

Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\pi_\theta(a|s)$. We recommend starting with gaussian policy:

$$\pi_\theta(a|s) = N(\mu_\theta(s),\sigma^2_\theta(s)) = {1 \over \sqrt { 2 \pi {\sigma^2}_\theta(s) } } e^{ (a - 
\mu_\theta(s))^2 \over 2 {\sigma^2}_\theta(s) } $$

In the $\sqrt { 2 \pi {\sigma^2}_\theta(s) }$ clause, $\pi$ means ~3.1415926, not agent's policy.

This essentially means that you will need two output layers:
* $\mu_\theta(s)$, a dense layer with linear activation
* ${\sigma^2}_\theta(s)$, a dense layer with activation tf.exp (to make it positive; like rho from bandits)

For multidimensional actions, you can use fully factorized gaussian (basically a vector of gaussians).

__bonus task__: compare performance of continuous action space method to action space discretization