# Policy Gradient

Reference: [openai-cartpole](https://github.com/kvfrans/openai-cartpole/blob/master/cartpole-policygradient.py)

### Notations
+ num_trans: number of (s, a, r) in the `transitions`, meaning the time steps taken to finish an episode
+ transitions: a list of (s, a, r). Note that `s` is the old state instead of that returned by `env.step(a)`   --- (num_trans, 3)
+ reward: current reward returned by `env.step(action)`
+ rewards: a list of reward in 1 sample.       --- (num_trans, )
+ future_rewards: a list of future discounted reward computed from `rewards`
+ future_rewards_pred: a list of predicted future reward by Value Network based on current state
+ advantages: `future_rewards` - `future_rewards_pred`


### Objectiveness
#### Policy Network
**Maximize** the following expected reward.
$$\bigtriangledown_{\theta}\text{J}(\theta) = \frac{1}{N}\sum_{i=1}^N{ [ \sum_{t=0}^T{ [\bigtriangledown_{\theta} \text{log} \pi_{\theta}(a_{i, t}|s_{i, t}) \text{advantages}(i, t)] }]  }$$

#### Value Network
Predict the future reward as accurately as possible based on current state.
$$\text{loss} = \text{MSE}(\text{future_rewards}, \text{future_rewards_pred})$$

## Codes

Step 1: Import packages.

In [1]:
import numpy as np
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Step 2: Build Policy Network & Value Network.

In [2]:
class Policy(torch.nn.Module):
    '''\pi_{\theta}(s)'''
    def __init__(self, action_size, state_size):
        super(Policy, self).__init__()

        self.action_size = action_size
        self.state_size = state_size
        
        self.fc = nn.Linear(state_size, action_size)
        
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        
    def forward(self, states):
        '''
            states: (batch_size, state_size)
        '''
        prob_actions = self.fc(states) # (batch_size, action_size)
        return F.softmax(prob_actions, dim=1) # (batch_size, action_size)
    
    def fit(self, states, advantages, actions):
        '''
            states: (batch_size, state_size)
            advantages: (batch_size, 1)
            actions: (batch_size, ), 
            
            Note that batch_size is equal to num_trans
        '''
        prob_actions = self.forward(states) # (batch_size, action_size)
        
        # one-hot encoding
        def one_hot(y, nb_digits): 
            y_onehot = torch.FloatTensor(y.size(0), nb_digits).to(device)
            y_onehot.zero_() 
            y_onehot.scatter_(1, y.unsqueeze(1), 1) 
            return y_onehot 
        actions = one_hot(actions, 2) # (batch_size, 2)
        
        # log{{\pi}_{\theta}(a_t|s_t)}
        probs = torch.log(torch.sum(prob_actions * actions, dim=1)) # (batch_size, )
        # The loss of this sample
        loss = - torch.sum(probs * advantages)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        

    
class Value(torch.nn.Module):
    '''estimate reward given state'''
    def __init__(self, state_size, output_size, hidden_size=10):
        super(Value, self).__init__()
        
        self.fc1 = nn.Linear(state_size, hidden_size, bias=True)
        self.fc2 = nn.Linear(hidden_size, output_size, bias=True)
        
        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.1)

    def forward(self, states):
        '''
            states: (batch_size, state_size)
        '''
        net = F.relu(self.fc1(states)) # (batch_size, hidden_size)
        future_rewards_pred = self.fc2(net) # (batch_size, output_size)
        return future_rewards_pred
    
    def fit(self, states, future_rewards):
        '''
            states: (batch_size, state_size)
            future_rewards: (batch_size, 1)
            
            Note that batch_size is equal to num_trans
        '''
        future_rewards_pred = self.forward(states) # (batch_size, 1)
        loss = self.loss_fn(future_rewards_pred, future_rewards)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    

Step 3: Define training processes.
1. Get 1 sample based on Policy Network
2. Compute future discounted rewards
3. Compute advantages based on Value Network
4. Train Value Network based on states & future rewards
5. Train Policy Network based on states, actions & advantages

In [3]:
def run_episode(env, policy, value):

    state = env.reset()
    action_size = env.action_space.n

    states = list()
    actions = list()
    rewards = list()
    total_reward = 0
    
    # --------------- Get 1 Sample ------------------
    for _ in range(200):
        states.append(state) # (state_size). add old state, not current state !!!
        # get action
        state = np.expand_dims(state, axis=0) # (batch_size=1, state_size)

        prob_actions = policy(torch.Tensor(state).to(device))[0] # (action_size)
        #prob_actions = prob_actions.data.numpy()
        
        action = 0 if np.random.random() < prob_actions[0] else 1 # scalar
        
        state, reward, done, _ = env.step(action)
        actions.append(action) # (action_size, ), one_hot encoding
        rewards.append(reward) # scalar
        total_reward += reward
        
        
        if done:
            break
    
    # Compute Future Discounted Reward
    gamma = 0.97 # old gamma=0.8 produces poor result !!!
    def discount(rewards, gamma):
        discounted_rewards = np.zeros_like(rewards)
        discounted_value = 0

        for ri in reversed(range(len(rewards))):
            discounted_value = discounted_value * gamma + rewards[ri]
            discounted_rewards[ri] = discounted_value
        return discounted_rewards

    future_rewards = discount(rewards, gamma) # (num_trans,)
    # Get (s, a, f_r) list
    transitions = [(s, a, f_r) for s, a, f_r in zip(states, actions, future_rewards)] # not rewards !!!

    # Get advantages list
    advantages = list()
    for index, transition in enumerate(transitions):
        state, action, future_reward = transition
        state = np.expand_dims(state, axis=0) # (batch_size=1, state_size)
        # calculate estimated reward from Value Network
        future_reward_pred = value(torch.Tensor(state).to(device)).squeeze().detach() # (batch_size=1,)
        future_reward_pred = future_reward_pred.cpu().numpy()
        advantages.append(future_reward - future_reward_pred) # (num_trans,)
    
    future_rewards = np.expand_dims(future_rewards, axis=1) # (num_trans, 1)
    advantages = np.expand_dims(advantages, axis=1) # (num_trans, 1)
    
    # Convert to tensor
    states = torch.Tensor(states).to(device)
    future_rewards = torch.Tensor(future_rewards).to(device)
    advantages = torch.Tensor(advantages).to(device)
    actions = torch.LongTensor(actions).to(device)
    
    # train Value Network & Policy Network
    value.fit(states, future_rewards)
    policy.fit(states, advantages, actions)

    return total_reward

Step 4: Train & Test

In [4]:
env = gym.make("CartPole-v0")

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

policy = Policy(action_size, state_size).to(device)
value = Value(state_size, output_size=1).to(device)

for i in range(2000):
    total_reward = run_episode(env, policy, value)
    if total_reward == 200:
        print('total reward 200')
        print(i)
        break

t = 0
for _ in range(2000):
    total_reward = run_episode(env, policy, value)
    t += total_reward
print('avg reward after training:', t / 2000)

env.close()

  result = entry_point.load(False)


total reward 200
205
avg reward after training: 158.4035
