# A2C

In [1]:
import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical, Normal

import pybullet_envs
from collections import deque
import time

from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

# Select the compute device once: CUDA when torch reports it available, CPU otherwise.
use_cuda = torch.cuda.is_available()
device   = torch.device("cuda" if use_cuda else "cpu")
# Bare expression so the notebook cell echoes whether CUDA was detected.
use_cuda

False

## for Mac OS

In [2]:
import os
# NOTE(review): presumably works around a duplicate OpenMP runtime abort on macOS
# (see the section heading) — confirm it is still required.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

## Create Environments

In [3]:
from multiprocessing_env import SubprocVecEnv

# Number of environment copies created for SubprocVecEnv below.
num_envs = 12
# Environment id handed to gym.make for every copy and for the single evaluation env.
env_name = "HalfCheetahBulletEnv-v0"

def make_env():
    """Return a zero-argument factory that builds a fresh ``env_name`` environment.

    The environment is not created here; ``gym.make`` only runs when the
    returned callable is invoked.
    """
    def _build():
        return gym.make(env_name)

    return _build

# Build one environment factory per copy, then hand the list to SubprocVecEnv.
envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

# Separate single environment; test_env() rolls out episodes in this one.
env = gym.make(env_name)



In [5]:
def plot(frame_idx, rewards):
    """Redraw the reward curve in the notebook output area.

    Args:
        frame_idx: Value shown in the figure title as the current frame.
        rewards: Sequence of rewards to plot; its last element is shown in
            the title, so it must not be empty.
    """
    # Wipe the previous cell output so the figure is replaced, not appended.
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(131)
    latest_reward = rewards[-1]
    title_text = 'frame %s. reward: %s' % (frame_idx, latest_reward)
    plt.title(title_text)
    plt.plot(rewards)
    plt.show()
    
def test_env(vis=False, policy=None):
    """Roll out one episode in the module-level ``env`` and return its total reward.

    Args:
        vis: When true, ``env.render()`` is called once after the reset and
            again after every step.
        policy: Callable mapping a ``(1, state_size)`` float tensor to a
            ``(distribution, value)`` pair, e.g. an ``ActorCritic`` instance.
            When omitted, the module-level name ``model`` is used, which keeps
            existing ``test_env()`` / ``test_env(vis)`` calls working.

    Returns:
        The sum of the rewards collected until the environment reports done.

    Raises:
        NameError: If ``policy`` is omitted and no global ``model`` exists.
    """
    if policy is None:
        # Backward-compatible fallback to the global the original code relied on.
        policy = model

    state = env.reset()
    if vis:
        env.render()

    done = False
    total_reward = 0
    # Evaluation only: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = policy(state)
            # Drop the batch dimension added above before stepping the single env.
            action = dist.sample().cpu().numpy()[0]
            state, reward, done, _ = env.step(action)
            if vis:
                env.render()
            total_reward += reward
    env.close()
    return total_reward

## Neural Network

In [6]:
def init_weights(m):
    """Initialise an ``nn.Linear`` module in place; leave other modules untouched.

    Weights are drawn from a normal distribution with mean 0 and std 0.1,
    and biases are set to the constant 0.1.
    """
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)


class ActorCritic(nn.Module):
    """Actor-critic network with separate actor and critic MLPs and a Gaussian policy.

    The critic maps a state to a single value. The actor maps a state to a
    tanh-squashed mean; together with a learned, state-independent log standard
    deviation it defines a ``Normal`` distribution over actions.

    Args:
        state_size: Number of input features per state.
        action_size: Number of action dimensions.
        hidden_size: Width of the first hidden layer of both networks. The
            critic's following layers use ``hidden_size // 2`` and
            ``hidden_size // 4`` units (at least 1), so the default of 256
            yields the 256-128-64 critic.
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
        std: Initial value of every entry of the log standard deviation.
    """

    def __init__(self, state_size, action_size, hidden_size=256, seed=100, std=0.0):
        super(ActorCritic, self).__init__()
        self.seed = torch.manual_seed(seed)

        # Derive the layer widths from hidden_size instead of hard-coding them.
        half_width = max(hidden_size // 2, 1)
        quarter_width = max(hidden_size // 4, 1)

        self.critic = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, half_width),
            nn.LeakyReLU(),
            nn.Linear(half_width, quarter_width),
            nn.LeakyReLU(),
            nn.Linear(quarter_width, 1)
        )

        self.actor = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_size),
            nn.Tanh()
        )

        # One learnable log-std per action dimension, shared across all states.
        self.log_std = nn.Parameter(torch.ones(1, action_size) * std)

        self.apply(init_weights)

    def forward(self, x):
        """Return ``(dist, value)`` for a batch of states ``x``.

        ``dist`` is a ``Normal`` whose mean is the actor output and whose std is
        ``exp(log_std)`` expanded to the mean's shape; ``value`` is the critic
        output. The intermediate results are also stored on ``self``.
        """
        self.value = self.critic(x)
        self.mu    = self.actor(x)
        self.std   = self.log_std.exp().expand_as(self.mu)
        self.dist  = Normal(self.mu, self.std)
        return self.dist, self.value

    def learn(self, state_lst, logprob_lst, q_val_lst, entropy, optimizer):
        """Run one optimisation step on the combined actor/critic loss.

        Args:
            state_lst: List of critic value tensors (despite the name, these are
                used as the value estimates); concatenated along dim 0.
            logprob_lst: List of action log-probability tensors.
            q_val_lst: List of return targets; detached before use.
            entropy: Entropy term subtracted from the loss with weight 0.001.
            optimizer: Optimizer that is zeroed, then stepped after backward.

        The loss is ``actor_loss + 0.5 * critic_loss - 0.001 * entropy``. All
        intermediate tensors are kept as attributes on ``self``.
        """
        self.log_probs = torch.cat(logprob_lst)
        self.returns   = torch.cat(q_val_lst).detach()
        self.values    = torch.cat(state_lst)

        self.advantage = self.returns - self.values

        # The advantage is detached in the policy term so that only the critic
        # loss sends gradients into the value network.
        self.actor_loss  = -(self.log_probs * self.advantage.detach()).mean()
        self.critic_loss = self.advantage.pow(2).mean()

        self.loss = self.actor_loss + 0.5 * self.critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        self.loss.backward()
        optimizer.step()

## A2C: Synchronous Advantage Actor Critic

[OpenAI Blog](https://blog.openai.com/baselines-acktr-a2c/#a2canda3c):

The Asynchronous Advantage Actor Critic method (A3C) has been very influential since the paper was published. The algorithm combines a few key ideas:

- An updating scheme that operates on fixed-length segments of experience (say, 20 timesteps) and uses these segments to compute estimators of the returns and advantage function.
- Architectures that share layers between the policy and value function.
- Asynchronous updates.

After reading the paper, AI researchers wondered whether the asynchrony led to improved performance (e.g. “perhaps the added noise would provide some regularization or exploration?”), or if it was just an implementation detail that allowed for faster training with a CPU-based implementation.

As an alternative to the asynchronous implementation, researchers found you can write a synchronous, deterministic implementation that waits for each actor to finish its segment of experience before performing an update, averaging over all of the actors. One advantage of this method is that it can make more effective use of GPUs, which perform best with large batch sizes. This algorithm is naturally called A2C, short for advantage actor critic. (This term has been used in several papers.)



In [7]:
# Network input/output sizes are read from the vectorised environment's spaces.
state_size  = envs.observation_space.shape[0]
action_size = envs.action_space.shape[0]
high_limit = envs.action_space.high[0]
print("input space:", state_size)
# NOTE(review): the label below also says "input space" although it prints action_size.
print("input space:", action_size)
print("high_limit:", high_limit)

#Hyper params:
# NOTE(review): Worker builds ActorCritic with its default width and Adam with a
# literal lr=1e-4, and run() uses a literal n_steps = 5, so hidden_size, lr and
# num_steps below look unused — confirm before relying on them.
hidden_size = 256                                         
lr          = 1e-4
num_steps   = 5
num_episodes = 2000


input space: 26
input space: 6
high_limit: 1.0


In [8]:
def n_step_td(reward_lst, V, gamma=0.99):
    """Compute discounted n-step returns bootstrapped from ``V``.

    Walking the rewards from last to first, each return is
    ``reward + gamma * (return of the following step)``, with ``V`` standing
    in for the return after the final reward.

    Args:
        reward_lst: Rewards in chronological order.
        V: Bootstrap value used after the last reward.
        gamma: Discount factor.

    Returns:
        A list of returns in chronological order, one per reward.
    """
    backward_returns = []
    running = V
    for reward in reversed(reward_lst):
        running = reward + gamma * running
        backward_returns.append(running)
    # Collected last-to-first; flip back to chronological order.
    backward_returns.reverse()
    return backward_returns


def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted returns with per-step masking of the bootstrap term.

    Walking backwards, each return is
    ``rewards[t] + gamma * (following return) * masks[t]``; a mask of 0 at
    step ``t`` therefore drops everything after that step.

    Args:
        next_value: Bootstrap value used after the last reward.
        rewards: Rewards in chronological order.
        masks: Per-step multipliers, indexed in parallel with ``rewards``.
        gamma: Discount factor.

    Returns:
        A list of returns in chronological order, one per reward.
    """
    collected = []
    running = next_value
    idx = len(rewards) - 1
    while idx >= 0:
        running = rewards[idx] + gamma * running * masks[idx]
        collected.append(running)
        idx -= 1
    # Collected last-to-first; return in chronological order.
    return collected[::-1]

In [15]:
#state = envs.reset()

def Worker(num_episodes, n_steps):
    """Train a fresh ActorCritic agent with n-step A2C on the vectorised ``envs``.

    Each episode resets all environments and runs until any of them reports
    done. Within an episode the agent collects rollouts of at most ``n_steps``
    transitions and performs one ``agent.learn`` update per rollout. After 100
    episodes the running mean of the last 100 episode scores is tracked; when
    it reaches 500 the curve is saved to ``./single.npy`` and training stops.

    Args:
        num_episodes: Maximum number of episodes to run.
        n_steps: Maximum rollout length between updates; must be at least 1.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        # An empty rollout would never step the environments, so the episode
        # loop could not make progress.
        raise ValueError("n_steps must be at least 1")

    agent = ActorCritic(state_size, action_size).to(device)
    optimizer = optim.Adam(agent.parameters(), lr=1e-4)

    episode_rewards = deque(maxlen=100)   # sliding window for the 100-episode mean
    start_time = time.time()
    epi_plot = []
    finish = False

    for episode in range(num_episodes):

        states = envs.reset()
        scores = np.zeros(num_envs)                          # running score of each env copy
        dones = np.zeros(num_envs, dtype=bool)
        while not np.any(dones):

            log_probs = []
            values    = []
            nstep_rewards   = []
            masks     = []
            entropy = 0

            for _ in range(n_steps):

                dist, value = agent(torch.FloatTensor(states).to(device))
                actions = dist.sample()

                next_states, rewards, dones, _ = envs.step(actions.cpu().numpy())
                scores += rewards
                states = next_states

                entropy += dist.entropy().mean()

                log_probs.append(dist.log_prob(actions))
                values.append(value)
                nstep_rewards.append(torch.FloatTensor(rewards).unsqueeze(1).to(device))
                # Mask is 0 where an env finished, cutting the bootstrap at that step.
                masks.append(torch.FloatTensor(1 - dones).unsqueeze(1).to(device))

                if np.any(dones):                                  # stop the rollout once any env finished
                    break

            if np.any(dones):
                # The rollout ended the episode: bootstrap from zero. Created on
                # `device` so it can be combined with the reward/mask tensors.
                next_values = torch.zeros(num_envs, 1, device=device)
            else:
                # Bootstrap targets are detached inside learn(), so no graph is needed.
                with torch.no_grad():
                    _, next_values = agent(torch.FloatTensor(next_states).to(device))

            returns = compute_returns(next_values, nstep_rewards, masks)

            agent.learn(values, log_probs, returns, entropy, optimizer)

        episode_rewards.append(scores.mean())
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(episode_rewards)), end="")

        if episode >= 100:
            mean_100_episode_reward = np.mean(episode_rewards)
            epi_plot.append(mean_100_episode_reward)
            if episode % 10 == 0:
                print("Episode: {}, avg score: {:.1f}".format(episode, mean_100_episode_reward))

            if mean_100_episode_reward >= 500:
                finish = True
                print("Solved (1)!!!, Time : {:.2f}".format(time.time() - start_time))
                np.save("./single.npy", np.array(epi_plot))
                break

    # NOTE(review): this closes the module-level single `env`, not the `envs`
    # used for training — confirm which one is meant to be closed here.
    env.close()
    if not finish:
        # Only report failure when the solve threshold was never reached.
        print("Fail... Retry")


In [16]:
# NOTE(review): repeats the value already assigned in the hyper-parameter cell;
# the run(2000) call below passes a literal instead of reading this name.
num_episodes = 2000

def run(num_episodes):
    """Start training through ``Worker`` with a fixed rollout length of 5 steps.

    Args:
        num_episodes: Maximum number of episodes, forwarded to ``Worker``.
    """
    rollout_length = 5  # TODO up to you
    Worker(num_episodes, rollout_length)

# Start training only when this module is the entry point; runs up to 2000 episodes.
if __name__ == '__main__':
    run(2000)

Episode 100	Average Score: -195.80Episode: 100, avg score: -195.8
Episode 110	Average Score: -63.51Episode: 110, avg score: -63.5
Episode 120	Average Score: 49.73Episode: 120, avg score: 49.7
Episode 130	Average Score: 149.89Episode: 130, avg score: 149.9
Episode 140	Average Score: 236.32Episode: 140, avg score: 236.3
Episode 150	Average Score: 300.97Episode: 150, avg score: 301.0
Episode 160	Average Score: 343.97Episode: 160, avg score: 344.0
Episode 170	Average Score: 374.73Episode: 170, avg score: 374.7
Episode 180	Average Score: 401.45Episode: 180, avg score: 401.5
Episode 190	Average Score: 436.17Episode: 190, avg score: 436.2
Episode 200	Average Score: 455.68Episode: 200, avg score: 455.7
Episode 210	Average Score: 496.50Episode: 210, avg score: 496.5
Episode 213	Average Score: 500.65Solved (1)!!!, Time : 1208.22
Fail... Retry
