In [18]:
import sys

import numpy as np
import torch

sys.path.insert(0, "../")
from AC_modules.BatchedA2C import SpatialA2C
from AC_modules.ActorCriticArchitecture import SpatialActorCritic_v4
from SC_Utils.game_utils import get_action_dict, ObsProcesser
from SC_Utils.train_v4 import *

In [2]:
# --- Environment configuration ---
# One resolution is shared by the screen and the minimap feature layers.
RESOLUTION = 32
game_params = {
    'feature_screen': RESOLUTION,
    'feature_minimap': RESOLUTION,
    'action_space': "FEATURES",
}

# Minigame maps, numbered from 1 in the order they are listed.
game_names = dict(enumerate(
    [
        'MoveToBeacon',
        'CollectMineralShards',
        'DefeatRoaches',
        'FindAndDefeatZerglings',
        'DefeatZerglingsAndBanelings',
        'CollectMineralsAndGas',
        'BuildMarines',
    ],
    start=1,
))
map_name = game_names[6]

# --- Observation processer configuration ---
# Alternative kept for reference: name the layers explicitly instead of selecting all of them.
#screen_names = ['visibility_map', 'player_relative', 'selected', 'unit_density', 'unit_density_aa']
#minimap_names = []
#obs_proc_params = {'screen_names':screen_names, 'minimap_names':minimap_names}
obs_proc_params = {'select_all': True}

In [3]:
# Action set used in this experiment.
action_names = [
    'select_army',
    'Move_screen',
    'Attack_screen',
]
# Other action sets, kept for reference:
#action_names = ['select_army','Move_screen']
#action_names = ['select_army', 'Attack_screen', 'Move_screen', 'select_point', 'select_rect',
#                'move_camera','Stop_quick','Move_minimap','Attack_minimap','HoldPosition_quick']
action_dict = get_action_dict(action_names)
# The size of the action space is the number of entries in action_dict.
action_space = len(action_dict)

# Including player information in the state

In [4]:
env = init_game(game_params, map_name)

In [5]:
obs = env.reset()

In [6]:
player = obs[0].observation['player']
player

NamedNumpyArray([ 1, 50,  0, 12, 15,  0, 12,  0,  0,  0,  0], ['player_id', 'minerals', 'vespene', 'food_used', 'food_cap', 'food_army', 'food_workers', 'idle_worker_count', 'army_count', 'warp_gate_count', 'larva_count'], dtype=int32)

In [7]:
list(player._index_names[0].keys())

['player_id',
 'minerals',
 'vespene',
 'food_used',
 'food_cap',
 'food_army',
 'food_workers',
 'idle_worker_count',
 'army_count',
 'warp_gate_count',
 'larva_count']

Info that I will exclude:
- player_id is always 1
- warp_gate_count and larva_count are useless during minigames

Preprocessing of the rest:
- apply log2 to minerals and vespene: being resources, their values can grow very large (a value of 0 is left as 0)
- cast to float the rest

In [8]:
class PlayerProcesser():
    """Extracts the informative entries of the 'player' vector of an observation."""

    def __init__(self):
        # Keep entries 1..8 of the player vector; entry 0 and entries 9+ are dropped.
        self.useful_indexes = np.arange(1,9)

    def get_player_info(self, obs):
        """
        Returns a tuple (values, names), where
        values = float array with the selected player entries
                 ('minerals' and 'vespene' are replaced by their log2, unless they are 0)
        names = array with the names of the selected entries, in the same order
        """
        raw = obs[0].observation['player'].astype(float)
        selected = raw[self.useful_indexes]
        for resource in ('minerals', 'vespene'):
            amount = selected[resource]
            # log2 is not defined at 0, so a zero amount is written back unchanged
            selected[resource] = np.log2(amount) if amount != 0 else amount
        values = np.array(selected)
        names = np.array(list(selected._index_names[0].keys()))
        return values, names

In [9]:
pp = PlayerProcesser()
pp.get_player_info(obs)

(array([ 5.64385619,  0.        , 12.        , 15.        ,  0.        ,
        12.        ,  0.        ,  0.        ]),
 array(['minerals', 'vespene', 'food_used', 'food_cap', 'food_army',
        'food_workers', 'idle_worker_count', 'army_count'], dtype='<U17'))

In [10]:
env.close()

Merging this new feature in the old ObsProcesser class...

In [11]:
class FullObsProcesser(ObsProcesser):
    """
    ObsProcesser that also exposes the scalar 'player' statistics of the observation.

    The state gains a 'player_features' entry next to 'screen_layers' and
    'minimap_layers', and get_n_channels() returns a third value for it.
    """
    def __init__(self, screen_names=None, minimap_names=None, select_all=False):
        """
        screen_names, minimap_names, select_all are forwarded to ObsProcesser.
        None (the default) stands for an empty list of names.
        """
        # A `[]` default would be a single list object shared by every instance,
        # so a fresh list is created per call instead.
        if screen_names is None:
            screen_names = []
        if minimap_names is None:
            minimap_names = []
        super().__init__(screen_names, minimap_names, select_all)
        # Keep entries 1..8 of the player vector; entry 0 and entries 9+ are dropped.
        self.useful_indexes = np.arange(1,9)
        
    def get_state(self, obs):
        """
        Returns a tuple (state, names) of dictionaries:
        state = {'screen_layers', 'minimap_layers', 'player_features'}
        names = {'screen_names', 'minimap_names', 'player_names'}
        """
        feature_screen = obs[0].observation['feature_screen']
        feature_minimap = obs[0].observation['feature_minimap']
        player_info = obs[0].observation['player'].astype(float)
        
        # The screen/minimap processing methods are not defined in this class
        # (presumably inherited from ObsProcesser).
        screen_layers, screen_names = self._process_screen_features(feature_screen)
        minimap_layers, minimap_names = self._process_minimap_features(feature_minimap)
        player_features, player_names = self._process_player_features(player_info)
        state = {'screen_layers':screen_layers, 'minimap_layers':minimap_layers, 'player_features':player_features}
        names = {'screen_names':screen_names, 'minimap_names':minimap_names, 'player_names':player_names}
        return state, names
    
    def _process_player_features(self, player):
        """
        Selects the useful player entries and applies log2 to 'minerals' and 'vespene'.
        Returns (values, names) as two arrays of the same length.
        """
        x = player[self.useful_indexes]
        for resource in ('minerals', 'vespene'):
            amount = x[resource]
            # log2 is not defined at 0, so a zero amount is left as 0
            x[resource] = np.log2(amount) if amount != 0 else amount
        return np.array(x), np.array(list(x._index_names[0].keys()))
    
    def get_n_channels(self):
        """Returns (screen_channels, minimap_channels, player_channels)."""
        screen_channels, minimap_channels = super().get_n_channels()
        player_channels = len(self.useful_indexes)
        return screen_channels, minimap_channels, player_channels

In [12]:
fop = FullObsProcesser(select_all=True)
state, names = fop.get_state(obs)
fop.get_n_channels()

(26, 12, 8)

In [13]:
state

{'screen_layers': array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 1., 1., ..., 1., 1., 0.],
         [0., 1., 1., ..., 1., 1., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., .

Following the workflow of the algorithm, we now have to modify this function:
```python
def merge_screen_and_minimap(state_dict):
    """
    Previous version of the function: merges the screen and minimap layers of a state.

    Reads state_dict['screen_layers'] and state_dict['minimap_layers'] and returns
    their concatenation (screen first), or the screen alone if the minimap has 0 layers.
    Raises an Exception if both have 0 layers.
    """
    screen = state_dict['screen_layers']
    minimap = state_dict['minimap_layers']
    if len(minimap) > 0:
        try:
            assert screen.shape[-2:] == minimap.shape[-2:], 'different resolutions'
        except:
            # NOTE(review): a failed check is only printed; execution continues
            # to the concatenation below.
            print("Shape mismatch between screen and minimap. They must have the same resolution.")
            print("Screen resolution: ", screen.shape[-2:])
            print("Minimap resolution: ", minimap.shape[-2:])

        state = np.concatenate([screen, minimap])
    elif len(minimap)==0 and len(screen) >0:
        state = screen
    else:
        raise Exception("Both screen and minimap seem to have 0 layers.")
    return state              
```

In [14]:
def merge_screen_and_minimap(state_dict):
    """
    Returns a tuple (state, player), where
    state = (screen+minimap channels, res, res) # no batch dim
    player = (player_features,) # no batch dim

    state_dict must contain 'screen_layers', 'minimap_layers' and 'player_features'.
    The player features are passed through unchanged.

    Raises ValueError if screen and minimap have different resolutions or if
    both have 0 layers.
    """
    screen = state_dict['screen_layers']
    minimap = state_dict['minimap_layers']
    player = state_dict['player_features']

    if len(minimap) > 0:
        screen_res, minimap_res = screen.shape[-2:], minimap.shape[-2:]
        # Fail here with both resolutions in the message, instead of printing and
        # letting np.concatenate raise a generic shape error afterwards.
        if screen_res != minimap_res:
            raise ValueError(
                "Shape mismatch between screen and minimap. They must have the same resolution. "
                "Screen resolution: {}. Minimap resolution: {}.".format(screen_res, minimap_res)
            )
        # Layers are stacked along the first (channel) axis, screen first.
        state = np.concatenate([screen, minimap])
    elif len(screen) > 0:
        state = screen
    else:
        raise ValueError("Both screen and minimap seem to have 0 layers.")
    return state, player

Let's check that the workers can send the state tuple correctly. Then, using 2 parallel envs, let's batch the new state representations together while keeping the spatial state and the player features separate.

In [15]:
n_train_processes = 2
envs = ParallelEnv(n_train_processes, game_params, map_name, obs_proc_params, action_dict)

In [16]:
state, action_mask = envs.reset()

In [17]:
state

{'spatial': array([[[[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 1., 1., ..., 1., 1., 0.],
          [0., 1., 1., ..., 1., 1., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
  

Now we have to redefine the architecture and the step function of the A2C agent. We will use SpatialA2C as a proof of concept, but in the end it should inherit from the best version of A2C at our disposal.

Before we had to define 3 parameters that depended both on the model and on the state representation:
1. in_channels : screen + minimap channels
2. n_channels : number of channels at the end of the FullyConvSpatial
3. n_features : number of features at the end of the FullyConvNonSpatial

Now ideally if we want full control we should specify:
1. in_channels : screen + minimap channels
2. n_channels : number of channels at the end of the FullyConvSpatial
3. in_player : Number of features of the player_info (state['player']) in input
4. out_player : Number of features of the player_info (state['player']) in output
5. cat_n_channels = n_channels + out_player

The way I dealt before with this was:
1. Expose only n_channels and n_features as mandatory parameters
2. Specify all other input and model parameters in the dictionaries of the 2 models

Also now we want a FullyConvSpatial that accepts 2 inputs (spatial_state and player_state) and returns a single spatial_features with n_channels.

The new network looks like this:

```python
class FullyConvPlayerAndSpatial(nn.Module):
    """
    Encodes the spatial state with a convolutional network and the player state
    with a small fully connected network, then concatenates the two along the
    channel dimension.
    """
    def __init__(self, in_channels, in_player, player_features, conv_channels=32):
        """
        in_channels: channels of the spatial state in input
        in_player: number of player features in input
        player_features: number of player features in output
        conv_channels: channels in output from the convolutional network
        """
        super(FullyConvPlayerAndSpatial, self).__init__()
        self.conv_net = FullyConvSpatial(in_channels, conv_channels)
        self.fc_net = nn.Sequential(
                                    nn.Linear(in_player, 64),
                                    nn.ReLU(),
                                    nn.Linear(64, player_features),
                                    nn.ReLU()
                                    )
        
    def forward(self, spatial_state, player_state):
        """
        Returns spatial_features, the output of conv_net concatenated with the
        broadcasted output of fc_net along the channel dimension.
        """
        spatial_x = self.conv_net(spatial_state)
        player_x = self.fc_net(player_state)
        spatial_features = self._cat_player_to_spatial(player_x, spatial_x)
        # Without this return the module would always output None.
        return spatial_features
        
    def _cat_player_to_spatial(self, player_x, spatial_x):
        """ 
        Assume spatial_x of shape (B, conv_channels, H, W).
        Cast player_x from (B, player_features) to (B, player_features, H, W)
        Concatenate spatial_x with the broadcasted player_x along the channel dim.
        """
        # Height and width are read separately so non-square maps work too.
        height, width = spatial_x.shape[-2:]
        player_x = player_x.reshape(player_x.shape[0], player_x.shape[1], 1, 1)
        player_x = player_x.repeat(1, 1, height, width)
        spatial_features = torch.cat([spatial_x, player_x], dim=1)
        return spatial_features
```

In [None]:
# A2C agent for states made of two parts: state['spatial'] and state['player'].
# NOTE(review): work in progress - compute_ac_loss contains a truncated statement,
# so this cell does not parse until that method is finished.
# NOTE(review): Categorical, actions and np are not imported explicitly in this
# notebook; presumably they come from the star import of SC_Utils.train_v4 - confirm.
class FullSpace_A2C(SpatialA2C): 
    def __init__(self, action_space, env, spatial_model, nonspatial_model, spatial_dict, nonspatial_dict, 
                 n_features, n_channels, gamma=0.99, action_dict=None, H=1e-3, n_steps=20, device='cpu'):
        # SpatialA2C.__init__ is not called here; every attribute is set directly.
        self.gamma = gamma
        self.n_actions = action_space
        self.n_steps = n_steps
        # presumably the weight of the entropy term - confirm against SpatialA2C
        self.H = H
        self.AC = SpatialActorCritic_v4(action_space, env, spatial_model, nonspatial_model, spatial_dict, 
                                     nonspatial_dict, n_features, n_channels, action_dict=action_dict)
        self.device = device 
        self.AC.to(self.device) 
        
    # Samples one action (with its arguments) per row of the batch.
    # Returns (action, log_prob, mean entropy), where action is a list of FunctionCall objects.
    def step(self, state, action_mask):
        # Split the state dict and move both parts and the mask to the agent's device as tensors.
        spatial_state = state['spatial']
        player_state = state['player']
        spatial_state = torch.from_numpy(spatial_state).float().to(self.device)
        player_state = torch.from_numpy(player_state).float().to(self.device)
        action_mask = torch.tensor(action_mask).to(self.device)
        
        # The policy now receives the spatial and the player state as two separate inputs.
        log_probs, spatial_features, nonspatial_features = self.AC.pi(spatial_state, player_state, action_mask)
        entropy = self.compute_entropy(log_probs)
        probs = torch.exp(log_probs)
        a = Categorical(probs).sample()
        a = a.detach().cpu().numpy()
        # Log-probability of the sampled main action, one per row.
        log_prob = log_probs[range(len(a)), a]
        
        # The log-probability of the sampled arguments is added to the one of the main action.
        args, args_log_prob, args_entropy = self.get_arguments(spatial_features, nonspatial_features, a)
        log_prob = log_prob + args_log_prob
        # Use only entropy of main actions for regularization
        #entropy = entropy + args_entropy

        # Translate the sampled indices through AC.action_dict and build one FunctionCall per row.
        action_id = np.array([self.AC.action_dict[act] for act in a])
        action = [actions.FunctionCall(action_id[i], args[i]) for i in range(len(action_id))]

        return action, log_prob, torch.mean(entropy)
    
    # Returns (critic_loss, actor_loss, entropy) for a trajectory.
    # NOTE(review): unfinished - see the FIXME and NOTE comments below.
    def compute_ac_loss(self, rewards, log_probs, entropies, states, done, bootstrap, trg_states): 
        # from list of dictionaries of arrays to 2 separate arrays
        spatial_states_lst = [s['spatial'] for s in states] #[(batch, other dims) x traj_len times]
        player_states_lst = [s['player'] for s in states] #[(batch, other dims) x traj_len times]
        # FIXME: the next statement is truncated (syntax error); it still has to build
        # the tensor(s) from the two lists above.
        spatial_states = torch.
        # merge batch and episode dimensions
        # NOTE(review): states is a list of dicts here, so torch.tensor(states) and
        # states.shape look like leftovers from the single-array state - to be rewritten.
        old_states = torch.tensor(states).float().to(self.device).reshape((-1,)+states.shape[2:])

        average_n_steps = False # TRY ME
        if average_n_steps:
            # Use as V target the mean of 1-step to n-step V targets
            V_trg = []
            for n in range(1, self.n_steps + 1):
                n_step_V_trg = self.compute_n_step_V_trg(n, rewards, done, bootstrap, states)
                V_trg.append(n_step_V_trg)
            V_trg = torch.mean(torch.stack(V_trg, axis=0), axis=0)
        else:
            V_trg = self.compute_n_step_V_trg(self.n_steps, rewards, done, bootstrap, states)
            
        ### Wrap variables into tensors - merge batch and episode dimensions ###    
        log_probs = torch.stack(log_probs).to(self.device).transpose(1,0).reshape(-1)
        entropies = torch.stack(entropies, axis=0).to(self.device).reshape(-1)
        
        ### Compute critic and actor losses ###
        # NOTE(review): trg_states is not used anywhere in this method.
        critic_loss = self.compute_critic_loss(old_states, V_trg)
        actor_loss, entropy = self.compute_actor_loss(log_probs, entropies, old_states, V_trg)

        return critic_loss, actor_loss, entropy

In [23]:
states = [state for _ in range(5)]

In [24]:
spatial_states_lst = [s['spatial'] for s in states]
spatial_states_lst

[array([[[[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 1., 1., ..., 1., 1., 0.],
          [0., 1., 1., ..., 1., 1., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
 
         ...,
 
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0.

In [26]:
# Stack the per-step arrays into a single ndarray before building the tensor:
# creating a tensor directly from a list of ndarrays is very slow.
spatial_states = torch.tensor(np.array(spatial_states_lst))
spatial_states.shape

torch.Size([5, 2, 38, 32, 32])

In addition to this, we need to:
1. Store the 2 parts of the state properly (a list of dictionaries would probably do)
2. Obtain the tensors spatial_state and player_state with the "time" dimension too
3. Transpose them so that the batch dimension comes first (check needed) - in any case, their first 2 dimensions should look like those of the old state, so that we can obtain the n_steps_states from them
4. Finish implementing all the changes in the update
5. Fix all the inspections (storage, inspection_step and plots)

# Using all possible actions and then masking out the ones not available