In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# using tf to normalize
size = 100
adv_n = np.random.rand(100)  

# Option 1

In [3]:
def stddev(v, mean):
#     adv_std = tf.sqrt(tf.reduce_mean(tf.square(v - mean)))
    adv_std = tf.sqrt(tf.reduce_mean(tf.squared_difference(v, mean)))
    return adv_std

adv_mean = tf.reduce_mean(adv_n)
adv_std = stddev(adv_n, adv_mean)

with tf.Session() as sess:
    print("mean, std = %s" % (sess.run([adv_mean, adv_std])))

mean, std = [0.49677381416382793, 0.29401708396386794]


In [4]:
# after normalization to check gaussion
adv_normal_n = (adv_n - adv_mean) / adv_std 
adv_normal_mean = tf.reduce_mean(adv_normal_n)
adv_normal_std = stddev(adv_normal_n, adv_normal_mean)

with tf.Session() as sess:
    print("normalized mean, std = %s" % (sess.run([adv_normal_mean, adv_normal_std])))

normalized mean, std = [3.2862601528904633e-16, 0.9999999999999998]


# Option 2

In [5]:
adv_mean, adv_variance = tf.nn.moments(tf.constant(adv_n, dtype=tf.float32), axes=[0])
adv_std = tf.sqrt(adv_variance)
with tf.Session() as sess:
    print("mean, std = %s" % (sess.run([adv_mean, adv_std])))

mean, std = [0.4967738, 0.29401708]


In [6]:
# after normalization to check gaussion
adv_normal_n = (adv_n - adv_mean) / adv_std 
adv_normal_mean = tf.reduce_mean(adv_normal_n)
adv_normal_std = stddev(adv_normal_n, adv_normal_mean)

with tf.Session() as sess:
    print("normalized mean, std = %s" % (sess.run([adv_normal_mean, adv_normal_std])))

normalized mean, std = [1.4305114e-08, 1.0]


# Handle adv_std $\approx$ 0

In [7]:
# simulate adv_std is very small
# adv_std = tf.constant(1e-8)

In [8]:
# after normalization to check gaussion
adv_normal_n = tf.cond(adv_std < 1e-7, lambda: (adv_n - adv_mean), lambda: (adv_n - adv_mean) / adv_std)
adv_normal_mean = tf.reduce_mean(adv_normal_n)
adv_normal_std = stddev(adv_normal_n, adv_normal_mean)

with tf.Session() as sess:
    print("normalized mean, std = %s" % (sess.run([adv_normal_mean, adv_normal_std])))

normalized mean, std = [1.4305114e-08, 1.0]


## Policy Gradient

Recall that the expression for the policy gradient PG is  
    $$ PG = E_{\tau} [\sum_{t=0}^T \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) * (Q_t - b_t )] $$  
where  
    $ tau=(s_0, a_0, ...) $ is a trajectory,  
    $ Q_t $ is the Q-value at time t, $Q^{\pi}(s_t, a_t)$,  
    and $ b_t $ is a baseline which may depend on $s_t$.   

In [9]:
# global setting of Q_n
N = 5
tau_len = 100
# list of 1D np.array - rewards
re_n = list(np.random.rand(N, tau_len))
gamma = 0.99

In [10]:
gamma ** np.array([3, 2, 1, 0])

array([0.970299, 0.9801  , 0.99    , 1.      ])

### Case 1: trajectory-based PG 
            
(reward_to_go = False)

Instead of $Q^{\pi}(s_t, a_t)$, we use the total discounted reward summed over 
entire trajectory (regardless of which time step the Q-value should be for). 

For this case, the policy gradient estimator is

  $$ E_{\tau} [\sum_{t=0}^T \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) * Ret(\tau)] $$

where

  $ Ret(\tau) = \sum_{t'=0}^T \gamma^{t'} r_{t'} $.

**Thus, you should compute**

  $ Q_t = Ret(\tau) $

In [11]:
# for all reward step t=1:T, they have the same total rewards, just copy * len(re_tau)
Q_n = np.concatenate([[sum(re_tau * (gamma ** np.arange(len(re_tau))[::-1]))] * len(re_tau) for re_tau in re_n])

In [12]:
def sum_discount_rewards(rewards, gamma):
    return sum((gamma**i) * rewards[i] for i in range(len(rewards)))

In [17]:
q_n1 = np.concatenate([[sum_discount_rewards(re_tau, gamma)] * len(re_tau)
                    for re_tau in re_n])

In [16]:
q_n1

array([29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43651937,
       29.43651937, 29.43651937, 29.43651937, 29.43651937, 29.43

In [18]:
Q_n

array([29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58063966,
       29.58063966, 29.58063966, 29.58063966, 29.58063966, 29.58

In [None]:
len(gamma ** np.array(range(len(re_n[0]))[::-1])) == len(re_n[0])

### Case 2: reward-to-go PG 

(reward_to_go = True)

Here, you estimate $Q^{\pi}(s_t, a_t)$ by the discounted sum of rewards starting
from time step t. 

**Thus, you should compute**

  $$ Q_t = \sum_{t'=t}^T \gamma^{(t'-t)} * r_{t'} $$

In [None]:
Q_n_index = [[(np.arange(len(re_tau)-start)) \
        for start in np.arange(len(re_tau))] \
       for re_tau in re_n]

In [None]:
Q_n_index[0][0]

In [None]:
Q_n_index[0][1]

In [None]:
# flatten result to 500
Q_n = np.concatenate([[sum(re_tau[::-1][:len(re_tau)-start] * (gamma ** np.arange(len(re_tau)-start))) \
        for start in np.arange(len(re_tau))] \
       for re_tau in re_n])

In [None]:
assert Q_n[99] == re_n[0][99]

In [None]:
re_n[0][99]

In [None]:
# verify to be consistent with https://github.com/Kelym/DeepRL-UCB2017-Homework/blob/master/hw2/train_pg.py
def discount_rewards_to_go(rewards, gamma):
    res = [] 
    future_reward = 0
    for r in reversed(rewards):
        future_reward = future_reward * gamma + r
        res.append(future_reward)
    return res[::-1]

In [None]:
q_n1 = np.concatenate([discount_rewards_to_go(re_tau, gamma) for re_tau in re_n])

In [None]:
q_n1[98]

In [None]:
Q_n[98]

In [None]:
assert len(q_n1) == len(Q_n)