# Asynchronous Advantage Actor-Critic (A3C)

## Loss
### 1. Policy Network
$$\nabla_{\theta}\text{J}(\theta) = \frac{1}{N}\sum_{i=1}^N{ [ \sum_{t=0}^T{ [\nabla_{\theta} \log \pi_{\theta}(a_{i, t}|s_{i, t}) (r(s_{i, t}, a_{i, t}) + \gamma v(s_{i, t+1}) - v(s_{i, t}))] }] }$$

### 2. Value Network
$$\text{L}(\theta) = \frac{1}{N}\sum_{i=1}^N{ [ \sum_{t=0}^T{[ \text{smooth_l1_loss}(v(s_{i, t}), r(s_{i, t}, a_{i, t}) + \gamma v(s_{i, t+1}) )  ]}] }$$
 
## Points
+ Train a single ActorCritic object whose parameters live in shared memory, using PyTorch's multiprocessing module
+ CPU or GPU: in Colab, the code can only be run on **CPU**

## Reference
+ Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
+ Code: [a3c.py](https://github.com/seungeunrho/minimalRL/blob/master/a3c.py): Note that in our code the optimizer is a member of the ActorCritic object

## 1. Import packages

In [0]:
import gym
import torch
import torch.nn as nn
from torch.distributions import Categorical
import torch.nn.functional as F
import torch.multiprocessing as mp

## 2. Define constants

In [0]:
# Discount factor applied to the bootstrapped next-state value in the TD target.
gamma = 0.98
# Number of episodes each training/testing process plays before it exits.
num_epochs = 3000
# Maximum number of environment steps collected per rollout chunk.
num_rollouts = 5
# Rewards are divided by this value before the training loss is computed.
reward_div = 100
# Number of training processes; one additional process is started for testing.
num_train_processes = 3

# Automatic CUDA selection is left disabled because it reported an error
# (NOTE(review): presumably CUDA + multiprocessing -- confirm); run on CPU instead.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # report error
device = 'cpu'

## 3. Prepare data

In [0]:

def get_sample(env, policy):
    """Play one episode and yield it in chunks of at most ``num_rollouts`` steps.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_obs, reward, done, info)``.
        policy: Object exposing ``sample_action(observation)``.

    Yields:
        A tuple ``(states, actions, rewards, next_states, done_masks)`` of
        tensors on ``device``. ``actions`` is a ``LongTensor``; the others are
        float tensors. ``done_masks`` holds 0.0 for the terminal step and 1.0
        otherwise. The last chunk of an episode may be shorter than
        ``num_rollouts``.
    """
    state = env.reset()  # (state_size, )
    finished = False

    while not finished:
        states, actions, rewards, next_states, masks = [], [], [], [], []

        for _ in range(num_rollouts):
            action = policy.sample_action(state)
            next_state, reward, finished, _info = env.step(action)  # action is 0 or 1

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            # A mask of 0.0 marks the terminal transition.
            masks.append(0.0 if finished else 1.0)

            state = next_state
            if finished:
                break

        yield (
            torch.Tensor(states).to(device),
            torch.LongTensor(actions).to(device),
            torch.Tensor(rewards).to(device),
            torch.Tensor(next_states).to(device),
            torch.Tensor(masks).to(device),
        )

## 4. Build model

In [0]:
class ActorCritic(nn.Module):
    """Actor-critic network with one shared hidden layer and its own optimizer.

    A single ``4 -> 256`` fully-connected layer feeds two heads: ``fc_pi``
    (2 action logits) and ``fc_v`` (1 state value). The Adam optimizer is kept
    as a member so callers can step it through ``model.optimizer``.
    """

    def __init__(self):
        super(ActorCritic, self).__init__()

        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256, 1)

        # Created after the layers so that it covers every parameter above.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.0002, betas=(0.9, 0.99))

    def policy(self, state, softmax_dim=0):
        """Return action probabilities for ``state``.

        Args:
            state: Float tensor whose last dimension has size 4.
            softmax_dim: Dimension over which the logits are normalised; use
                0 for an unbatched ``(4,)`` state and 1 for a ``(B, 4)`` batch.

        Returns:
            Tensor of probabilities with a last dimension of size 2.
        """
        hidden = F.relu(self.fc1(state))  # (..., 256)
        return F.softmax(self.fc_pi(hidden), dim=softmax_dim)

    def sample_action(self, state, softmax_dim=0):
        """Sample a single action index from the current policy.

        Args:
            state: Array-like observation. An unbatched ``(4,)`` input works
                with the default ``softmax_dim`` because ``nn.Linear`` accepts
                inputs without a batch dimension.
            softmax_dim: Action dimension of the policy output; forwarded to
                :meth:`policy`.

        Returns:
            The sampled action as a Python ``int``.
        """
        # Put the input on the same device as the weights instead of relying
        # on a module-level ``device`` variable.
        state = torch.as_tensor(state, dtype=torch.float32, device=self.fc1.weight.device)
        # Sampling needs no gradients, so skip building an autograd graph.
        with torch.no_grad():
            probs = self.policy(state, softmax_dim=softmax_dim)
        return Categorical(probs).sample().item()

    def value(self, state):
        """Return the state-value estimate, shape ``(..., 1)``, for ``state``."""
        hidden = F.relu(self.fc1(state))
        return self.fc_v(hidden)
      
def train(model, rank):
    """Run the actor-critic training loop of one worker process.

    For ``num_epochs`` episodes, the worker collects rollouts with
    ``get_sample`` and performs one optimizer step per rollout on ``model``
    via ``model.optimizer``.

    Args:
        model: Network exposing ``policy``, ``value``, ``sample_action`` and
            ``optimizer``. The launcher in this file passes a model whose
            parameters were moved to shared memory.
        rank: Identifier of this worker; only used in the final log message.
    """
    env = gym.make("CartPole-v0")  # every worker owns its own environment instance

    try:
        for _ in range(num_epochs):
            for s, a, r, ns, done_mask in get_sample(env, model):
                # Scale rewards without mutating the sampled tensor in place.
                r = r / reward_div

                # The TD target is treated as a constant, so compute it
                # without tracking gradients. ``squeeze(1)`` keeps the step
                # dimension even when the rollout contains a single step.
                with torch.no_grad():
                    next_v = model.value(ns).squeeze(1)  # (steps,)
                    td_target = (r + gamma * next_v * done_mask).unsqueeze(1)  # (steps, 1)

                vs = model.value(s)  # (steps, 1)
                advantages = (td_target - vs).detach()  # (steps, 1)

                probs = model.policy(s, softmax_dim=1)  # (steps, action_size=2)
                chosen_probs = probs.gather(1, a.unsqueeze(1))  # (steps, 1)
                loss = torch.mean(-torch.log(chosen_probs) * advantages + F.smooth_l1_loss(vs, td_target))

                model.optimizer.zero_grad()
                loss.backward()
                model.optimizer.step()
    finally:
        # Release the environment even if training raises.
        env.close()

    print("Training process {} reached maximum episode.".format(rank))


def test(model):
    """Play ``num_epochs`` episodes with ``model`` and log the running average score.

    The score of an episode is the sum of the raw (unscaled) rewards yielded
    by ``get_sample``. Every 100 episodes the average over all episodes played
    so far is printed.

    Args:
        model: Policy object passed to ``get_sample`` to choose actions.
    """
    env = gym.make("CartPole-v0")  # the tester owns its own environment instance
    score = 0.0

    try:
        for epoch in range(num_epochs):
            for sample in get_sample(env, model):
                # sample[2] is the reward tensor of this rollout. Reduce it in
                # torch and keep ``score`` a plain Python float rather than
                # letting it turn into a float32 tensor.
                score += sample[2].sum().item()

            if epoch % 100 == 0:
                print('Epoch %d || Average Score: %.6f'%(epoch, score / (epoch + 1)))
    finally:
        # Release the environment even if evaluation raises.
        env.close()

## 5. Train

In [5]:
# Guard the launcher so that child processes which re-import this module
# (the "spawn" start method) do not start another set of processes.
if __name__ == "__main__":
    model = ActorCritic().to(device)
    # Move the parameters to shared memory so every process sees the same weights.
    model.share_memory()

    processes = list()
    # Rank 0 runs the tester; ranks 1..num_train_processes run trainers.
    for rank in range(num_train_processes + 1):
        if rank == 0:
            p = mp.Process(target=test, args=(model,))
        else:
            p = mp.Process(target=train, args=(model, rank))
        p.start()
        processes.append(p)

    # Wait for the tester and all trainers to finish.
    for p in processes:
        p.join()

Epoch 0 || Average Score: 23.000000
Epoch 100 || Average Score: 23.594059
Epoch 200 || Average Score: 54.597015
Epoch 300 || Average Score: 97.312294
Epoch 400 || Average Score: 115.486282
Epoch 500 || Average Score: 130.183640
Epoch 600 || Average Score: 140.603989
Epoch 700 || Average Score: 145.935806
Epoch 800 || Average Score: 152.318359
Epoch 900 || Average Score: 156.624863
Epoch 1000 || Average Score: 159.665329
Epoch 1100 || Average Score: 160.255219
Epoch 1200 || Average Score: 159.769363
Epoch 1300 || Average Score: 160.226746
Epoch 1400 || Average Score: 159.668808
Epoch 1500 || Average Score: 158.459686
Epoch 1600 || Average Score: 159.064331
Epoch 1700 || Average Score: 158.410339
Epoch 1800 || Average Score: 159.218216
Epoch 1900 || Average Score: 158.640717
Epoch 2000 || Average Score: 157.774612
Epoch 2100 || Average Score: 156.679199
Epoch 2200 || Average Score: 157.944122
Epoch 2300 || Average Score: 158.612778
Epoch 2400 || Average Score: 160.226151
Epoch 2500 || Av