# "Asynchronous Methods for Deep Reinforcement Learning" paper implementation - https://arxiv.org/pdf/1602.01783.pdf

## This implementation covers only the asynchronous advantage actor-critic (A3C) algorithm from the paper

In [1]:
import torch
import torch.nn.functional as F
import torch.optim as optim

from environments import create_atari_env
from model import ActorCritic

def ensure_shared_grads(model, shared_model):
    """Point the shared model's gradients at the local model's gradients.

    Parameters of the two models are paired positionally. For each pair the
    shared parameter's ``_grad`` is set to the local parameter's ``grad``
    tensor (no copy is made).

    The walk stops at the first shared parameter that already has a gradient:
    that parameter and every one after it are left untouched.

    Args:
        model: Worker-local model whose gradients were just computed.
        shared_model: Model whose parameters receive the gradient references.
    """
    pairs = zip(model.parameters(), shared_model.parameters())
    for local_param, global_param in pairs:
        if global_param.grad is None:
            global_param._grad = local_param.grad
        else:
            # A gradient is already attached: stop without touching the rest.
            return

def train(rank, args, shared_model, counter, lock, optimizer=None):
    """Run one actor-critic training worker; this function never returns.

    The worker repeatedly copies the shared weights into a private model,
    collects a rollout of at most ``args['num-steps']`` environment steps,
    builds the policy/value losses from that rollout, and applies the
    resulting gradients to ``shared_model`` through ``optimizer``.

    Args:
        rank: Worker index; added to ``args['seed']`` to seed torch and the
            environment.
        args: Dict of hyperparameters. Keys read here: ``'seed'``,
            ``'env-name'``, ``'lr'``, ``'num-steps'``,
            ``'max-episode-length'``, ``'gamma'``, ``'gae-lambda'``,
            ``'entropy-coef'``, ``'value-loss-coef'``, ``'max-grad-norm'``.
        shared_model: Model whose ``state_dict`` is loaded before every
            rollout and whose parameters receive the computed gradients.
        counter: Object with an integer ``value`` attribute, incremented once
            per environment step while ``lock`` is held.
        lock: Context manager guarding ``counter``.
        optimizer: Optimizer over ``shared_model.parameters()``. When
            ``None``, a fresh ``optim.Adam`` with ``args['lr']`` is created
            for this worker.
    """
    torch.manual_seed(args['seed'] + rank)

    env = create_atari_env(args['env-name'])
    env.seed(args['seed'] + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args['lr'])

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    # Start as "done" so the recurrent state is initialised on the first pass.
    done = True

    episode_length = 0
    while True:
        # Sync the local copy with the latest shared weights.
        model.load_state_dict(shared_model.state_dict())
        if done:
            # New episode: start from a zero recurrent state.
            # NOTE(review): 256 presumably matches the model's hidden size.
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            # Same episode: keep the state but cut the graph of the
            # previous rollout.
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args['num-steps']):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            # Sample an action from the policy; keep only its log-probability.
            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args['max-episode-length']
            # Clip the reward to [-1, 1].
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # Bootstrap the return from the value estimate of the last state,
        # unless the episode ended (then the tail return is zero).
        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            # Discounted return and squared-error value loss.
            R = args['gamma'] * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized advantage estimate built from TD residuals.
            delta_t = rewards[i] + args['gamma'] * values[i + 1] - values[i]
            gae = gae * args['gamma'] * args['gae-lambda'] + delta_t

            # Policy-gradient term plus entropy bonus; the advantage is
            # detached so it acts as a constant weight.
            policy_loss = policy_loss - log_probs[i] * gae.detach() - args['entropy-coef'] * entropies[i]

        optimizer.zero_grad()
        # The optimizer only owns the shared model's parameters. When
        # zero_grad() resets gradients to None (the default in recent PyTorch
        # releases) it merely drops the shared model's reference and leaves
        # the local gradient tensors intact, so backward() would keep
        # accumulating across rollouts. Clear the local gradients explicitly.
        model.zero_grad()

        (policy_loss + args['value-loss-coef'] * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args['max-grad-norm'])

        ensure_shared_grads(model, shared_model)
        optimizer.step()


In [2]:
import time
from collections import deque
import torch
import torch.nn.functional as F
from environments import create_atari_env
from model import ActorCritic

def test(rank, args, shared_model, counter):
    """Evaluate the shared model forever, printing one line per episode.

    A private model copy is refreshed from ``shared_model`` at the start of
    every episode and always plays the highest-probability action. An episode
    ends when the environment reports done, when it reaches
    ``args['max-episode-length']`` steps, or when the last 100 actions were
    all identical. After each episode a summary is printed and the process
    sleeps for 60 seconds. This function never returns.

    Args:
        rank: Index added to ``args['seed']`` to seed torch and the
            environment.
        args: Dict of settings; reads ``'seed'``, ``'env-name'`` and
            ``'max-episode-length'``.
        shared_model: Model whose ``state_dict`` is loaded per episode.
        counter: Object whose ``value`` attribute is reported as the global
            step count.
    """
    seed = args['seed'] + rank
    torch.manual_seed(seed)

    env = create_atari_env(args['env-name'])
    env.seed(seed)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = torch.from_numpy(env.reset())
    reward_sum = 0
    # Start as "done" so weights and recurrent state are set on the first pass.
    done = True

    start_time = time.time()

    # Sliding window of recent actions, used to detect a stuck agent.
    recent_actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if not done:
            cx, hx = cx.detach(), hx.detach()
        else:
            # New episode: pull the latest weights and zero the recurrent state.
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)

        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
        # Greedy policy: index of the most probable action.
        greedy_index = F.softmax(logit, dim=-1).max(1, keepdim=True)[1]
        action = greedy_index.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args['max-episode-length']
        reward_sum += reward

        recent_actions.append(action[0, 0])
        repeats = recent_actions.count(recent_actions[0])
        if repeats == recent_actions.maxlen:
            # The window is full of one repeated action: end the episode.
            done = True

        if done:
            elapsed_label = time.strftime(
                "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
            total_steps = counter.value
            steps_per_second = counter.value / (time.time() - start_time)
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                elapsed_label, total_steps, steps_per_second,
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            recent_actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)


In [None]:
from __future__ import print_function
import os
import torch
import torch.multiprocessing as mp
from optimizer import SharedAdam
from environments import create_atari_env
from model import ActorCritic

# Settings shared by the launcher, the training workers and the evaluator.
args = {
    "lr": 1e-4,                     # learning rate passed to the optimizer
    "gamma": 0.99,                  # discount factor for returns
    "gae-lambda": 1.0,              # lambda of the advantage estimator
    "entropy-coef": 0.01,           # weight of the entropy term in the policy loss
    "value-loss-coef": 0.5,         # weight of the value loss in the total loss
    "max-grad-norm": 50,            # gradient-norm clipping threshold
    "seed": 1,                      # base random seed (each process adds its rank)
    "num-processes": 4,             # number of training processes to launch
    "num-steps": 20,                # maximum environment steps per rollout
    "max-episode-length": 1000000,  # step count at which an episode is forced to end
    "env-name": "PongDeterministic-v4",  # environment id given to create_atari_env
    "no-shared": False,             # True: each worker creates its own Adam optimizer
}

if __name__ == '__main__':
    # Launcher: build the shared model (and optionally a shared optimizer),
    # start one evaluation process plus args['num-processes'] training
    # processes, then wait on all of them.
    os.environ.update({'OMP_NUM_THREADS': '1', 'CUDA_VISIBLE_DEVICES': ""})

    torch.manual_seed(args['seed'])

    # This environment instance is only used to size the model.
    env = create_atari_env(args['env-name'])
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    # With 'no-shared' set, workers receive None and create their own optimizer.
    optimizer = None
    if not args['no-shared']:
        optimizer = SharedAdam(shared_model.parameters(), lr=args['lr'])
        optimizer.share_memory()

    # Global step counter and the lock that guards its updates.
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    # The evaluator starts first and uses rank == num-processes, so its seed
    # differs from every training worker's seed.
    p = mp.Process(
        target=test,
        args=(args['num-processes'], args, shared_model, counter))
    p.start()
    processes = [p]

    for rank in range(args['num-processes']):
        p = mp.Process(
            target=train,
            args=(rank, args, shared_model, counter, lock, optimizer))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
