To use a batched A2C we have to adapt both the step method and how we store information for the update.
In particular:
1. Instead of using observations of the environment with everything attached, process the information and extract only the useful variables, so that they can be stacked in numpy arrays when possible.
2. The step method works with an observation, but it should work with a batch of states and available actions. It will probably be infeasible to vectorize the selection of the action parameters, because the output and the procedure vary from action to action.
3. The part in which we compute the loss is okay, because we use only the critic to make predictions, so the only problem is to store the information in batches.

NOTE: remember to add all networks to the optimizer initialization

In [1]:
from Utils.train_batched_A2C import *
from AC_modules.BatchedA2C import MoveToBeaconSpatialA2C
import torch

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
def train_batched_A2C(agent, game_params, lr, n_train_processes, max_train_steps, unroll_length,
                      test_interval=100, test_env=None):
    """Train `agent` with batched A2C on several environments stepped in parallel.

    Each iteration collects `unroll_length` steps from all environments, asks the
    agent for the actor/critic losses on that rollout and applies one Adam update.

    Parameters
    ----------
    agent : object exposing `parameters()`, `get_action(s)` and `compute_ac_loss(...)`.
    game_params : forwarded to `ParallelEnv`.
    lr : learning rate for Adam.
    n_train_processes : number of environments forwarded to `ParallelEnv`.
    max_train_steps : training stops once the step counter reaches this value.
        The counter advances by one per batched `envs.step` call, not per
        environment transition.
    unroll_length : number of batched steps collected before every update.
    test_interval : minimum number of steps between two evaluations.
    test_env : environment handed to `test(...)`. When omitted, a module-level
        `test_env` is used if one exists; otherwise evaluation is skipped.

    Returns
    -------
    (score, steps_to_solve, losses, agent), where `losses` is a dict with keys
    `critic_losses`, `actor_losses` and `entropies` (one entry per update).
    """
    # The original body read an undefined name `test_env`; keep honouring a global
    # of that name for existing callers, but prefer the explicit argument.
    if test_env is None:
        test_env = globals().get("test_env")

    envs = ParallelEnv(n_train_processes, game_params)

    score = []
    steps_to_solve = []
    critic_losses = []
    actor_losses = []
    entropies = []

    try:
        # NOTE(review): the L7 reminder asks for all networks to be registered in the
        # optimizer -- confirm agent.parameters() covers every sub-network.
        optimizer = torch.optim.Adam(agent.parameters(), lr=lr)

        step_idx = 0
        next_test_step = test_interval
        s = envs.reset()
        while step_idx < max_train_steps:
            s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = [], [], [], [], []
            log_probs = []
            distributions = []
            for _ in range(unroll_length):
                a, log_prob, probs = agent.get_action(s)
                log_probs.append(log_prob)
                distributions.append(probs)

                s_prime, r, done, _info, bootstrap, s_trg = envs.step(a)
                s_lst.append(s)
                r_lst.append(r)
                done_lst.append(done)
                bootstrap_lst.append(bootstrap)
                s_trg_lst.append(s_trg)

                s = s_prime
                step_idx += 1

            # Lists are stacked time-first; swap the first two axes so that the
            # second stacked axis comes first (presumably the environment axis --
            # TODO confirm against ParallelEnv.step).
            s_lst = np.array(s_lst).transpose(1, 0, 2, 3, 4)
            r_lst = np.array(r_lst).transpose(1, 0)
            done_lst = np.array(done_lst).transpose(1, 0)
            bootstrap_lst = np.array(bootstrap_lst).transpose(1, 0)
            s_trg_lst = np.array(s_trg_lst).transpose(1, 0, 2, 3, 4)

            ### Update time ###
            critic_loss, actor_loss, entropy = agent.compute_ac_loss(
                r_lst, log_probs, distributions, s_lst, done_lst, bootstrap_lst, s_trg_lst
            )

            loss = (critic_loss + actor_loss).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            critic_losses.append(critic_loss.item())
            actor_losses.append(actor_loss.item())
            entropies.append(entropy.item())

            ### Test time ###
            # step_idx only takes multiples of unroll_length here, so an exact
            # `step_idx % test_interval == 0` check silently skips evaluations when
            # the two values do not line up; test as soon as the threshold is crossed.
            if test_env is not None and step_idx >= next_test_step:
                avg_score, avg_steps = test(step_idx, agent, test_env)
                score.append(avg_score)
                steps_to_solve.append(avg_steps)
                next_test_step = (step_idx // test_interval + 1) * test_interval
    finally:
        # Always release the environments, even if training raised.
        envs.close()

    losses = dict(critic_losses=critic_losses, actor_losses=actor_losses, entropies=entropies)
    return score, steps_to_solve, losses, agent

## Line by line

In [3]:
# Side length used for both the screen and the minimap feature layers.
RESOLUTION = 16
# Value later passed as `max_steps` when creating the environments.
MAX_STEPS = 256
game_params = {
    "feature_screen": RESOLUTION,
    "feature_minimap": RESOLUTION,
    "action_space": "FEATURES",
}

In [4]:
# Single environment instance; it is handed to the agent constructor (env=env) below.
env = init_game(game_params, max_steps=MAX_STEPS)

In [5]:
# Hyper-parameters forwarded as keyword arguments to the agent constructor.
HPs = {
    "action_space": 3,
    "n_layers": 2,
    "in_channels": 3,
    "n_channels": 12,
    "linear_size": RESOLUTION,
    "gamma": 0.99,
    "n_steps": 20,
    "pixel_n_residuals": 2,
    "feature_n_residuals": 2,
}

# Use the GPU whenever one is visible to torch, otherwise fall back to the CPU.
HPs['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

print("Using device "+HPs['device'])

lr = 1e-4
agent = MoveToBeaconSpatialA2C(env=env, **HPs)

Using device cuda

arg.name:  select_add
size:  (2,)
Init CategoricalNet for select_add argument

arg.name:  queued
size:  (2,)
Init CategoricalNet for queued argument

arg.name:  screen
size:  (16, 16)
Init SpatialNet for screen argument
Discount factor:  0.99
Action space:  3
n_steps for TD:  20
Device used:  cuda


Architecture: 
 SpatialActorCritic(
  (spatial_features_net): SpatialFeatures(
    (net): Sequential(
      (0): Conv2d(3, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): ResidualConvolutional(
        (net): Sequential(
          (0): LayerNorm((16, 16), eps=1e-05, elementwise_affine=True)
          (1): Conv2d(12, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (2): ReLU()
          (3): Conv2d(12, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
      )
    )
  )
  (nonspatial_features_net): NonSpatialFeatures(
    (pixel_res_block): Sequential(
      (0): ResidualLayer(
        (norm): LayerNorm((256

In [6]:
# Settings for the line-by-line walkthrough of the training loop.
n_train_processes = 2    # number of environments given to ParallelEnv
max_train_steps = 2000   # upper bound on the step counter of the training loop
unroll_length = 240      # batched steps collected before each update
max_steps = MAX_STEPS    # passed to ParallelEnv as its third argument

In [7]:
# Create the parallel environments; the output below shows one "reset done" /
# "info sent" pair per process (two here, matching n_train_processes = 2).
envs = ParallelEnv(n_train_processes, game_params, max_steps)

reset done
info sent
reset done
info sent


In [8]:
# Adam over the parameters of agent.AC.
# NOTE(review): the reminder at the top of these notes asks for all networks to be
# added to the optimizer -- confirm agent.AC.parameters() includes every one of them.
optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)

In [9]:
# reset() returns the batched state and the batched action mask; their contents
# are inspected in the next two cells.
s, a_mask = envs.reset()

In [10]:
s.shape

(2, 3, 16, 16)

In [11]:
a_mask # action is not available = True

array([[False, False,  True],
       [False, False,  True]])

## AC step

To make a batched step we already have the state in the correct shape, but we need to:
1. change get_action_mask of the AC so to work with a list of arrays - DONE
2. sample all parameters every time, so that we don't need to loop between the parameter networks to get the right ones and we can vectorize it
3. select only the parameters that we need

Each action has 0, 1 or 2 parameters associated. We should sample a batch of arguments for each parameter of each action and then access only the correct ones. We need a bit of structure to do that.

(batch_dim, [[params0],[params1],[params2]])
and same thing for the log_prob

Unfortunately we get something like ([[],[(batch_dim, args)],[(batch_dim, arg1),(batch_dim, arg2)]])

In [12]:
# Batched step on the state batch and action mask obtained from envs.reset().
# NOTE: at this stage the call ends with "ValueError: only one element tensors can be
# converted to Python scalars" (see the end of the output below).
agent.step(s, a_mask)


action mask:  tensor([[False, False,  True],
        [False, False,  True]], device='cuda:0')
x.shape:  torch.Size([2, 12, 256])
x.shape:  torch.Size([2, 12, 256])
x.shape:  torch.Size([2, 12])
x.shape:  torch.Size([2, 12])
logits shape:  torch.Size([2, 3])
logits:  tensor([[-1.1504, -0.8252, -1.4051],
        [-1.2276, -0.7689, -1.4126]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)
mask shape:  torch.Size([2, 3])
mask:  tensor([[False, False,  True],
        [False, False,  True]], device='cuda:0')
logits (after mask):  tensor([[-1.1504, -0.8252,    -inf],
        [-1.2276, -0.7689,    -inf]], device='cuda:0',
       grad_fn=<IndexPutBackward>)
log_probs:  tensor([[-0.8689, -0.5437,    -inf],
        [-0.9486, -0.4899,    -inf]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)
spatial_features.shape:  torch.Size([2, 12, 16, 16])
spatial_features:  tensor([[[[ 7.3533e-02,  2.1391e-01,  1.9562e-01,  ...,  1.3956e-01,
            1.3314e-01,  1.0519e-01],
          [ 1.3

probs:  tensor([[0.4194, 0.5806, 0.0000],
        [0.3873, 0.6127, 0.0000]], device='cuda:0', grad_fn=<ExpBackward>)
log_prob:  tensor([-0.5437, -0.4899], device='cuda:0', grad_fn=<IndexBackward>)
action_id:  [7 7]


ValueError: only one element tensors can be converted to Python scalars

In [None]:
import torch
import numpy as np

In [18]:
# Toy 2x3 tensor for experimenting with boolean-mask indexing (random values each run).
t = torch.rand(2,3)
t

tensor([[0.9175, 0.8611, 0.8561],
        [0.5384, 0.0359, 0.7425]])

In [19]:
# Toy boolean mask: two identical rows with only the last entry set to True.
m = np.array([[False, False, True]] * 2)

In [20]:
# Index with a boolean mask converted from numpy: the result is a 1-D tensor holding
# the entries of t at the positions where the mask is True.
t[torch.tensor(m)] 

tensor([0.8561, 0.7425])