# Training

In [1]:
import sys
sys.path.insert(0, "../")
from SC_Utils.game_utils import ObsProcesser, get_action_dict
from SC_Utils.train_MaxEnt import *
from AC_modules.BatchedA2C import SpatialA2C_MaxEnt_v2
import AC_modules.Networks as net
import torch

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Environment parameters
RESOLUTION = 32
game_params = dict(feature_screen=RESOLUTION, feature_minimap=RESOLUTION, action_space="FEATURES") 
game_names = {1:'MoveToBeacon',
              2:'CollectMineralShards',
              3:'DefeatRoaches',
              4:'FindAndDefeatZerglings',
              5:'DefeatZerglingsAndBanelings',
              6:'CollectMineralsAndGas',
              7:'BuildMarines'
              }
map_name = game_names[1]

# Observation Processer parameters
#screen_names = ['visibility_map', 'player_relative', 'selected', 'unit_density', 'unit_density_aa']
#minimap_names = []
#obs_proc_params = {'screen_names':screen_names, 'minimap_names':minimap_names}
obs_proc_params = {'select_all':True}

In [3]:
env = init_game(game_params, map_name)
op = ObsProcesser(**obs_proc_params)
screen_channels, minimap_channels = op.get_n_channels()
in_channels = screen_channels + minimap_channels 

In [4]:
action_names = ['select_army', 'Move_screen','Attack_screen']
#action_names = ['select_army','Move_screen']
#action_names = ['select_army', 'Attack_screen', 'Move_screen', 'select_point', 'select_rect',
#                'move_camera','Stop_quick','Move_minimap','Attack_minimap','HoldPosition_quick']
action_dict = get_action_dict(action_names)
action_space = len(action_dict)

In [5]:
spatial_model = net.FullyConvSpatial
nonspatial_model = net.FullyConvNonSpatial
n_channels = 32
n_features = 256
spatial_dict = {"in_channels":in_channels}
nonspatial_dict = {'resolution':RESOLUTION, 'kernel_size':3, 'stride':2}

In [6]:
HPs = dict(action_space=action_space, gamma=0.99, n_steps=20, H=7e-3, 
           spatial_model=spatial_model, nonspatial_model=nonspatial_model,
           n_features=n_features, n_channels=n_channels, 
           spatial_dict=spatial_dict, nonspatial_dict=nonspatial_dict, 
           action_dict=action_dict)

if torch.cuda.is_available():
    HPs['device'] = 'cuda'
else:
    HPs['device'] = 'cpu'
    
print("Using device "+HPs['device'])

lr = 7e-4

Using device cuda


In [7]:
agent = SpatialA2C_MaxEnt_v2(env=env, **HPs)

In [8]:
unroll_length = 60

train_dict = dict(n_train_processes = 11,
                  max_train_steps = unroll_length*1000,
                  unroll_length = unroll_length,
                  test_interval = unroll_length*10,
                  inspection_interval = unroll_length*10
                  )

In [9]:
env.close()

In [10]:
action_dict

{0: <_Functions.select_army: 7>,
 1: <_Functions.Move_screen: 331>,
 2: <_Functions.Attack_screen: 12>}

In [11]:
agent.AC

SpatialActorCritic_v2(
  (spatial_features_net): FullyConvSpatial(
    (net): Sequential(
      (0): Conv2d(38, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (1): ReLU()
      (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
    )
  )
  (nonspatial_features_net): FullyConvNonSpatial(
    (conv): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2))
      (1): ReLU()
    )
    (net): Sequential(
      (0): Linear(in_features=14400, out_features=256, bias=True)
      (1): ReLU()
    )
  )
  (actor): SharedActor(
    (linear): Linear(in_features=256, out_features=3, bias=True)
  )
  (critic): SharedCritic(
    (net): Linear(in_features=256, out_features=1, bias=True)
  )
  (arguments_networks): ModuleDict(
    (Attack_screen/queued): CategoricalNet(
      (net): Sequential(
        (0): Linear(in_features=256, out_features=256, bias=True)
        (1): ReLU()
        (2): Linear(in_features=256, out_features=2, bias=

In [12]:
agent.AC.act_to_arg_names

{0: ['select_army/select_add'],
 1: ['Move_screen/queued', 'Move_screen/screen'],
 2: ['Attack_screen/queued', 'Attack_screen/screen']}

In [13]:
agent.AC.arguments_dict

{'select_army/select_add': 7,
 'Move_screen/queued': 3,
 'Move_screen/screen': 0,
 'Attack_screen/queued': 3,
 'Attack_screen/screen': 0}

In [None]:
%%time
results = train_batched_A2C(agent, game_params, map_name, lr, 
                            obs_proc_params=obs_proc_params, action_dict=action_dict, **train_dict)

Process ID:  PXFS


# Development

In [15]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F 
from torch.distributions import Categorical

In [16]:
rewards = np.array([[0,0,1,0,1],
                    [0,0,0,0,1]])

done = np.array([[0,0,1,0,1],
                 [0,0,0,0,1]])

B = rewards.shape[0]
T = rewards.shape[1]
n_steps = 3
gamma = 0.99

In [17]:
# produce simil-entropies with gradient associated
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.net = nn.Linear(1,4)
    
    def forward(self, x):
        return F.log_softmax(self.net(x), dim=-1)

In [18]:
states = torch.rand(B,T,1)
actor = Actor()
log_probs = actor(states)
probs = torch.exp(log_probs) 
distr = Categorical(probs=probs)
entropy = distr.entropy() # POSITIVE
entropy.shape

torch.Size([2, 5])

In [28]:
rel_entropy = torch.log(torch.ones(probs.shape)/probs.shape[2]).mean(axis=-1) # negative
rel_entropy

tensor([[-1.3863, -1.3863, -1.3863, -1.3863, -1.3863],
        [-1.3863, -1.3863, -1.3863, -1.3863, -1.3863]])

Now we basically have to add between them the entropy (multiplied by a factor H in general) and the rewards, and then perform what the function compute_n_step_rewards does, but in pytorch, so that the computational graph will be preserved.

```python

def compute_n_step_rewards(self, rewards, done, n_steps=None):
        """
        Computes n-steps discounted reward. 
        Note: the rewards considered are AT MOST n, but can be less for the last n-1 elements.
        """
        if n_steps is None:
            n_steps = self.n_steps
        B = done.shape[0]
        T = done.shape[1]
        
        # Compute episode mask (i-th row contains 1 if col j is in the same episode of col i, 0 otherwise)
        episode_mask = [[] for _ in range(B)]
        last = [-1 for _ in range(B)]
        xs, ys = np.nonzero(done)
        
        # Add done at the end of every batch to avoid exceptions -> not used in real target computations
        xs = np.concatenate([xs, np.arange(B)])
        ys = np.concatenate([ys, np.full(B, T-1)])
        for x, y in zip(xs, ys):
            m = [1 if (i > last[x] and i <= y) else 0 for i in range(T)]
            for _ in range(y-last[x]):
                episode_mask[x].append(m)
            last[x] = y
        episode_mask = np.array(episode_mask)
        
        # Compute n-steps mask and repeat it B times
        n_steps_mask = []
        for i in range(T):
            m = [1 if (j>=i and j<i+n_steps) else 0 for j in range(T)]
            n_steps_mask.append(m)
        n_steps_mask = np.array(n_steps_mask)
        n_steps_mask_b = np.repeat(n_steps_mask[np.newaxis,...] , B, axis=0)
        
        # Broadcast rewards to use multiplicative masks
        rewards_repeated = np.repeat(rewards[:,np.newaxis,:], T, axis=1)
        
        # Exponential discount factor
        Gamma = np.array([self.gamma**i for i in range(T)]).reshape(1,-1)
        n_steps_r = (Gamma*rewards_repeated*episode_mask*n_steps_mask_b).sum(axis=2)/Gamma
        return n_steps_r, episode_mask, n_steps_mask_b
    
```

In [23]:
# Compute episode mask (i-th row contains 1 if col j is in the same episode of col i, 0 otherwise)
episode_mask = [[] for _ in range(B)]
last = [-1 for _ in range(B)]
xs, ys = np.nonzero(done)

# Add done at the end of every batch to avoid exceptions -> not used in real target computations
xs = np.concatenate([xs, np.arange(B)])
ys = np.concatenate([ys, np.full(B, T-1)])
for x, y in zip(xs, ys):
    m = [1 if (i > last[x] and i <= y) else 0 for i in range(T)]
    for _ in range(y-last[x]):
        episode_mask[x].append(m)
    last[x] = y
episode_mask = np.array(episode_mask)
print("episode_mask.shape: ", episode_mask.shape)
episode_mask

episode_mask.shape:  (2, 5, 5)


array([[[1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 1, 1]],

       [[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]]])

In [24]:
# Compute n-steps mask and repeat it B times
n_steps_mask = []
for i in range(T):
    m = [1 if (j>=i and j<i+n_steps) else 0 for j in range(T)]
    n_steps_mask.append(m)
n_steps_mask = np.array(n_steps_mask)
n_steps_mask_b = np.repeat(n_steps_mask[np.newaxis,...] , B, axis=0)
print("n_steps_mask_b.shape: ", n_steps_mask_b.shape)
n_steps_mask_b

n_steps_mask_b.shape:  (2, 5, 5)


array([[[1, 1, 1, 0, 0],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 0, 1]],

       [[1, 1, 1, 0, 0],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 0, 1]]])

In [44]:
r = torch.tensor(rewards, dtype=torch.float32)
augmented_r = r + entropy
rewards_repeated = augmented_r.view(B,1,T).repeat(1,T,1)
rewards_repeated

tensor([[[1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600]],

        [[1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325]]], grad_fn=<RepeatBackward>)

In [34]:
# Broadcast rewards to use multiplicative masks
rewards_repeated = np.repeat(rewards[:,np.newaxis,:], T, axis=1)
print("rewards_repeated.shape: ", rewards_repeated.shape)
print("rewards_repeated: ", rewards_repeated)
rewards_repeated = torch.tensor(rewards_repeated, dtype=float)
augmented_r = rewards_repeated + entropy.view(B,1,T) # broadcasting along correct axis
augmented_r.shape
augmented_r

rewards_repeated.shape:  (2, 5, 5)
rewards_repeated:  [[[0 0 1 0 1]
  [0 0 1 0 1]
  [0 0 1 0 1]
  [0 0 1 0 1]
  [0 0 1 0 1]]

 [[0 0 0 0 1]
  [0 0 0 0 1]
  [0 0 0 0 1]
  [0 0 0 0 1]
  [0 0 0 0 1]]]


tensor([[[1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600],
         [1.3613, 1.3393, 2.3468, 1.2158, 2.3600]],

        [[1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325],
         [1.2363, 1.3442, 1.2788, 1.3432, 2.3325]]], dtype=torch.float64,
       grad_fn=<AddBackward0>)

In [47]:
# Exponential discount factor
Gamma = torch.tensor([gamma**i for i in range(T)]).reshape(1,-1)

In [49]:
t_episode_mask = torch.tensor(episode_mask)
t_n_steps_mask_b = torch.tensor(n_steps_mask_b)

In [55]:
n_steps_r = torch.sum(Gamma*rewards_repeated*t_episode_mask*t_n_steps_mask_b, axis=2)/Gamma
n_steps_r

tensor([[4.9874, 3.6627, 2.3468, 3.5521, 2.3600],
        [3.8204, 3.9266, 4.8946, 3.6523, 2.3325]], grad_fn=<DivBackward0>)

In [70]:
def compute_n_step_rewards(rewards, done, entropies, n_steps=3, gamma=0.99):
        """
        Computes n-steps discounted reward. 
        Note: the rewards considered are AT MOST n, but can be less for the last n-1 elements.
        """
        B = done.shape[0]
        T = done.shape[1]

        # Compute episode mask (i-th row contains 1 if col j is in the same episode of col i, 0 otherwise)
        episode_mask = [[] for _ in range(B)]
        last = [-1 for _ in range(B)]
        xs, ys = np.nonzero(done)

        # Add done at the end of every batch to avoid exceptions -> not used in real target computations
        xs = np.concatenate([xs, np.arange(B)])
        ys = np.concatenate([ys, np.full(B, T-1)])
        for x, y in zip(xs, ys):
            m = [1 if (i > last[x] and i <= y) else 0 for i in range(T)]
            for _ in range(y-last[x]):
                episode_mask[x].append(m)
            last[x] = y
        episode_mask = np.array(episode_mask)

        # Compute n-steps mask and repeat it B times
        n_steps_mask = []
        for i in range(T):
            m = [1 if (j>=i and j<i+n_steps) else 0 for j in range(T)]
            n_steps_mask.append(m)
        n_steps_mask = np.array(n_steps_mask)
        n_steps_mask_b = np.repeat(n_steps_mask[np.newaxis,...] , B, axis=0)

        r = torch.tensor(rewards, dtype=torch.float64)
        augmented_r = r + entropy
        rewards_repeated = augmented_r.view(B,1,T).repeat(1,T,1)

        # Exponential discount factor
        Gamma = torch.tensor([gamma**i for i in range(T)]).reshape(1,-1)
        t_episode_mask = torch.tensor(episode_mask)
        t_n_steps_mask_b = torch.tensor(n_steps_mask_b)
        n_steps_r = torch.sum(Gamma*rewards_repeated*t_episode_mask*t_n_steps_mask_b, axis=2)/Gamma
        
        return n_steps_r, episode_mask, n_steps_mask_b

In [71]:
t_n_steps_r, t_episode_mask, t_n_steps_mask_b = compute_n_step_rewards(rewards, done, entropy)

# Testing

In [72]:
def numpy_check(rewards, done, n_steps=3, gamma=0.99):
    B = done.shape[0]
    T = done.shape[1]

    # Compute episode mask (i-th row contains 1 if col j is in the same episode of col i, 0 otherwise)
    episode_mask = [[] for _ in range(B)]
    last = [-1 for _ in range(B)]
    xs, ys = np.nonzero(done)

    # Add done at the end of every batch to avoid exceptions -> not used in real target computations
    xs = np.concatenate([xs, np.arange(B)])
    ys = np.concatenate([ys, np.full(B, T-1)])
    for x, y in zip(xs, ys):
        m = [1 if (i > last[x] and i <= y) else 0 for i in range(T)]
        for _ in range(y-last[x]):
            episode_mask[x].append(m)
        last[x] = y
    episode_mask = np.array(episode_mask)

    # Compute n-steps mask and repeat it B times
    n_steps_mask = []
    for i in range(T):
        m = [1 if (j>=i and j<i+n_steps) else 0 for j in range(T)]
        n_steps_mask.append(m)
    n_steps_mask = np.array(n_steps_mask)
    n_steps_mask_b = np.repeat(n_steps_mask[np.newaxis,...] , B, axis=0)

    # Broadcast rewards to use multiplicative masks
    rewards_repeated = np.repeat(rewards[:,np.newaxis,:], T, axis=1)

    # Exponential discount factor
    Gamma = np.array([gamma**i for i in range(T)]).reshape(1,-1)
    n_steps_r = (Gamma*rewards_repeated*episode_mask*n_steps_mask_b).sum(axis=2)/Gamma
    return n_steps_r, episode_mask, n_steps_mask_b

In [73]:
np_rewards = rewards + entropy.detach().numpy()
np_rewards

array([[1.36133623, 1.33932781, 2.34680569, 1.21575236, 2.35999632],
       [1.23633766, 1.3441546 , 1.27875113, 1.34319746, 2.33246183]])

In [74]:
n_steps_r, episode_mask, n_steps_mask_b = numpy_check(np_rewards, done)
print("n_steps_r: ", n_steps_r)
print("episode_mask: ", episode_mask)
print("n_steps_mask_b: ", n_steps_mask_b)

n_steps_r:  [[4.98737502 3.66266545 2.34680569 3.55214872 2.35999632]
 [3.8203547  3.92658606 4.89456247 3.65233468 2.33246183]]
episode_mask:  [[[1 1 1 0 0]
  [1 1 1 0 0]
  [1 1 1 0 0]
  [0 0 0 1 1]
  [0 0 0 1 1]]

 [[1 1 1 1 1]
  [1 1 1 1 1]
  [1 1 1 1 1]
  [1 1 1 1 1]
  [1 1 1 1 1]]]
n_steps_mask_b:  [[[1 1 1 0 0]
  [0 1 1 1 0]
  [0 0 1 1 1]
  [0 0 0 1 1]
  [0 0 0 0 1]]

 [[1 1 1 0 0]
  [0 1 1 1 0]
  [0 0 1 1 1]
  [0 0 0 1 1]
  [0 0 0 0 1]]]


In [76]:
t_n_steps_r.detach().numpy()

array([[4.98737498, 3.66266537, 2.34680569, 3.55214874, 2.35999632],
       [3.82035468, 3.92658601, 4.8945626 , 3.6523347 , 2.33246183]])

In [69]:
n_steps_r

array([[4.98737502, 3.66266545, 2.34680569, 3.55214872, 2.35999632],
       [3.8203547 , 3.92658606, 4.89456247, 3.65233468, 2.33246183]])

There seems to be some precision issue, but except for that the result is nearly identical.