# Policy Gradient

## Loss
$$\bigtriangledown_{\theta}\text{J}(\theta) = \frac{1}{N}\sum_{i=1}^N{ [ \sum_{t=0}^T{ [\bigtriangledown_{\theta} \text{log} \pi_{\theta}(a_{i, t}|s_{i, t}) \text{future_rewards}(t)] }] }$$ 

## 1. Import packages

In [0]:
import gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical
import torch.nn.functional as F

## 2. Define constants

In [0]:
gamma = 0.98
num_epochs = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 3. Prepare data

In [0]:
env = gym.make("CartPole-v0")

def discount(rewards, gamma):
    discounted_rewards = np.zeros_like(rewards)
    discounted_value = 0

    for ri in reversed(range(len(rewards))):
        discounted_value = discounted_value * gamma + rewards[ri]
        discounted_rewards[ri] = discounted_value
    return discounted_rewards

def get_sample(env, policy, max_iter=500):
    s = env.reset() # (1, 4)
    ss, aa, rr, log_probs = list(), list(), list(), list()
    for i in range(max_iter):
        a, log_prob = policy.predict(s)
        ns, r, done, _ = env.step(a) # a is 0 or 1
        ss.append(s)
        aa.append(a)
        rr.append(r)
        log_probs.append(log_prob) # log probability of the current action
        s = ns
        if done:
            break

    rr = discount(rr, gamma)
    sample = [(s, a, r) for s, a, r in zip(ss, aa, rr)]
    return sample, log_probs

## 4. Build model

In [0]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(4, 128),
            nn.Linear(128, 2),
        )
        
        self.optimizer = torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.99))
      
    def predict(self, state): # state: (4,) => indicates that the fully-connected layer in PyTorch can receive inputs without batch_size
        outputs = self.fc(torch.Tensor(state).to(device)) # outputs: (1, 2)
        probs = F.softmax(outputs) # !!! Do not use log_softmax before Categorical, since the Categorical requires the softmax result instead of the log + softmax one. Otherwise the score might decreases while training
        a_pred = Categorical(probs).sample()
        return a_pred.cpu().item(), torch.log(probs[a_pred]) # (predicted action: 0 or 1, log of probability of current action)
      
    def fit(self, sample, log_probs): # samples: [(s1, a1, r1), (s2, a2, r2), ...], log_probs: (log_prob1, log_prob2, ...)
        loss = 0
        for (s, a, r), log_prob in zip(sample, log_probs):
            loss += -log_prob * r
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

policy = Policy().to(device)

## Train

In [5]:
score = 0.0
for epoch in range(num_epochs):
    sample, log_probs = get_sample(env, policy)
    policy.fit(sample, log_probs)
    rewards = list(zip(*sample))[1]
    score += sum(rewards)
    if epoch % 100 == 0:
        print('Epoch %d || Average Score: %.6f'%(epoch, score / (epoch + 1)))

  del sys.path[0]


Epoch 0 || Average Score: 4.000000
Epoch 100 || Average Score: 31.099010
Epoch 200 || Average Score: 52.044776
Epoch 300 || Average Score: 59.000000
Epoch 400 || Average Score: 66.256858
Epoch 500 || Average Score: 71.273453
Epoch 600 || Average Score: 73.442596
Epoch 700 || Average Score: 75.724679
Epoch 800 || Average Score: 78.500624
Epoch 900 || Average Score: 80.441731
Epoch 1000 || Average Score: 82.038961
Epoch 1100 || Average Score: 83.705722
Epoch 1200 || Average Score: 85.029975
Epoch 1300 || Average Score: 84.876249
Epoch 1400 || Average Score: 83.565310
Epoch 1500 || Average Score: 83.669554
Epoch 1600 || Average Score: 83.032480
Epoch 1700 || Average Score: 82.449735
Epoch 1800 || Average Score: 82.433648
Epoch 1900 || Average Score: 83.134666
