What I do:
- Init CUDA
- Load agent on GPU

Note:
1 MiB = 1 048 576 bytes

In [1]:
import sys
sys.path.insert(0, "../")
from SC_Utils.game_utils import ObsProcesser, get_action_dict
from SC_Utils.train_v2 import *
from AC_modules.BatchedA2C import SpatialA2C, ActionDependentA2C
import AC_modules.Networks as net
import torch
import gc

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

In [3]:
# start with 20MiB /  5932MiB used
! nvidia-smi

Tue Jun 16 17:53:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   32C    P8     6W / 160W |     20MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [4]:
# minimal code to initialize CUDA
a=torch.cuda.FloatTensor(1)

In [5]:
# 414MiB / 5932MiB used -> 394 MiB used for CUDA init
! nvidia-smi

Tue Jun 16 17:53:26 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   33C    P2    24W / 160W |    414MiB /  5932MiB |      5%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [6]:
# --- Environment ------------------------------------------------------------
RESOLUTION = 64  # used for both feature_screen and feature_minimap below
MAX_STEPS = 256
game_params = {
    'feature_screen': RESOLUTION,
    'feature_minimap': RESOLUTION,
    'action_space': "FEATURES",
}
# Minigame names keyed by a 1-based index.
game_names = dict(enumerate(
    ['MoveToBeacon',
     'CollectMineralShards',
     'DefeatRoaches',
     'FindAndDefeatZerglings',
     'DefeatZerglingsAndBanelings',
     'CollectMineralsAndGas',
     'BuildMarines'],
    start=1))
map_name = game_names[2]

# --- Observation processing -------------------------------------------------
screen_names = ['visibility_map', 'player_relative', 'selected', 'unit_density', 'unit_density_aa']
minimap_names = []
obs_proc_params = dict(screen_names=screen_names, minimap_names=minimap_names)
# Alternative configuration: obs_proc_params = {'select_all':True}

env = init_game(game_params, map_name)
op = ObsProcesser(**obs_proc_params)
# Number of input channels = screen layers + minimap layers reported by the processer.
screen_channels, minimap_channels = op.get_n_channels()
in_channels = screen_channels + minimap_channels

# --- Action space -----------------------------------------------------------
action_names = ['select_point', 'Move_screen']
# Larger alternative:
# action_names = ['no_op', 'select_army', 'Attack_screen', 'Move_screen', 'select_point', 'select_rect']
action_dict = get_action_dict(action_names)
action_space = len(action_dict)

# --- Network architecture ---------------------------------------------------
spatial_model = net.FullyConvSpatial
nonspatial_model = net.FullyConvNonSpatial
embed_dim, n_channels, n_features = 8, 32, 256
spatial_dict = dict(in_channels=in_channels)
nonspatial_dict = dict(resolution=RESOLUTION, kernel_size=3, stride=2)

In [7]:
in_channels

7

In [8]:
# Hyper-parameters forwarded to the agent constructor as keyword arguments.
HPs = {
    'action_space': action_space,
    'gamma': 0.99,
    'n_steps': 20,
    'H': 1e-3,
    'spatial_model': spatial_model,
    'nonspatial_model': nonspatial_model,
    'n_features': n_features,
    'n_channels': n_channels,
    'spatial_dict': spatial_dict,
    'nonspatial_dict': nonspatial_dict,
    'action_dict': action_dict,
    'embed_dim': embed_dim,
}

# Run on the GPU whenever one is visible to PyTorch, otherwise fall back to CPU.
HPs['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device "+HPs['device'])

lr = 7e-4

agent = ActionDependentA2C(env=env, **HPs)

Using device cuda


In [9]:
# 430MiB / 5932MiB used -> 16 MiB for loading the agent
### 64 x 64 ###
# 476MiB /  5932MiB -> 62 MiB
! nvidia-smi

Tue Jun 16 17:53:35 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   33C    P2    25W / 160W |    476MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [10]:
# Size of the learnable parameters: bytes per element times number of
# elements, summed over every parameter tensor of the actor-critic network.
byte_memory = sum(p.nelement() * p.element_size() for p in agent.AC.parameters())
print("Memory in bytes: ", byte_memory)
MiB_memory = byte_memory / 2**20  # 1 MiB = 2**20 bytes
print("Memory in MiB: ", MiB_memory)

Memory in bytes:  63638408
Memory in MiB:  60.69031524658203


60.7 MiB of memory for the learnable parameters alone is close enough to the 62 MiB counted by nvidia-smi (in the earlier run noted above: 14.7 MiB against 16 MiB)

In [11]:
# Training schedule: step budget and test interval are expressed as multiples
# of the unroll length.
unroll_length = 30
n_train_processes = 10

train_dict = {
    'n_train_processes': n_train_processes,
    'max_train_steps': 10000 * unroll_length,
    'unroll_length': unroll_length,
    'test_interval': 50 * unroll_length,  # alternative multiplier: 100
}

In [12]:
envs = ParallelEnv(n_train_processes, game_params, map_name, obs_proc_params, action_dict)

In [13]:
optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)

Input shape for forward pass: (b, 7, 32, 32)
Output shapes: (b,) , scalar

In [None]:
# Collect `unroll_length` steps of experience from the parallel environments.
step_idx = 0
# Per-step buffers for states, rewards, done flags, bootstrap flags and target states.
s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = [], [], [], [], []
# Outputs of agent.step that carry gradient information.
log_probs, entropies = [], []

s, a_mask = envs.reset()
for _ in range(unroll_length):
    # Act from the current state under the current action mask.
    a, log_prob, entropy = agent.step(s, a_mask)
    log_probs.append(log_prob)
    entropies.append(entropy)

    # Advance every environment by one step and record the transition.
    s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
    s_lst.append(s)
    r_lst.append(r)
    done_lst.append(done)
    bootstrap_lst.append(bootstrap)
    s_trg_lst.append(s_trg)

    s = s_prime
    step_idx += 1  # alternative increment: n_train_processes


In [None]:
# Walk every object tracked by the garbage collector and add up the size of
# all live tensors (or objects wrapping a tensor in `.data`), printing each one.
byte_memory = 0
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
            byte_memory += obj.element_size()*obj.nelement()
    except Exception:
        # Best-effort scan: some tracked objects raise when inspected, so skip
        # them. Catching Exception (not a bare except) keeps KeyboardInterrupt
        # and SystemExit able to stop the cell.
        continue

In [None]:
byte_memory/2**20

In [None]:
torch.cuda.memory_allocated()/2**20 #912.8

In [19]:
#4252 - 3463 #??
1652 - 867

785

In [49]:
### 1 process ###
# 784MiB /  5932MiB after 1 forward loop -> 354 MiB used
# 786MiB /  5932MiB after 2 forward loop -> 354 + 2MiB used
# 868MiB /  5932MiB after 120 forward loops -> 438 MiB used
# 1038MiB /  5932MiB if executed a second time after backward
# 926MiB /  5932MiB if executed a second time after backward and empty_cache
### 10 processes ###
# 1652MiB /  5932MiB after 120 forward loops -> 1222 MiB used
# 1694MiB /  5932MiB if executed a second time after backward and empty_cache -> 42 MiB more
# 1706
! nvidia-smi

Tue Jun 16 17:52:20 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   33C    P2    24W / 160W |   1706MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [50]:
# Stack the per-step rollout lists into arrays and swap their first two axes.
# NOTE: this rebinds the rollout lists to arrays, so the cell is not
# idempotent - re-run the rollout cell before executing it again.
# NOTE(review): `np` is not imported explicitly in the visible cells; presumably
# it comes in through `from SC_Utils.train_v2 import *` - confirm.
# assumes the resulting leading axes are (process, step) - TODO confirm
s_lst = np.array(s_lst).transpose(1,0,2,3,4)
print('s_lst.shape: ', s_lst.shape)
r_lst = np.array(r_lst).transpose(1,0)
done_lst = np.array(done_lst).transpose(1,0)
bootstrap_lst = np.array(bootstrap_lst).transpose(1,0)
s_trg_lst = np.array(s_trg_lst).transpose(1,0,2,3,4)

# The agent computes the losses from the rollout arrays plus the log-probs and
# entropies that were stored during the forward loop.
critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(r_lst, log_probs, entropies, 
                                                         s_lst, done_lst, bootstrap_lst, s_trg_lst)


# Only critic and actor losses are summed here; entropy_term is returned but
# not added to the optimised loss in this cell.
loss = (critic_loss + actor_loss).mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()

s_lst.shape:  (10, 120, 7, 32, 32)


In [51]:
torch.cuda.memory_allocated()/2**20

58.896484375

In [52]:
torch.cuda.max_memory_allocated()/2**20

3296.91943359375

In [24]:
5932 - 5584

348

In [53]:
### 1 process ###
# 948MiB /  5932MiB with cudnn backend
#1036MiB /  5932MiB after backward with 1 process -> 168 MiB used
# 1094MiB /  5932MiB if executed a second time after another forward cycle
# 1090MiB /  5932MiB if executed a second time after empty_cache and another forward cycle
### 10 processes ###
# 3188MiB /  5932MiB after backward with 10 process -> 1536 MiB used
# 3230MiB /  5932MiB if executed a second time after empty_cache and another forward cycle
! nvidia-smi

Tue Jun 16 17:52:32 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   34C    P2    24W / 160W |   2810MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [54]:
torch.cuda.empty_cache()

In [55]:
# 16MiB /  5932MiB  with cudnn
# 924MiB /  5932MiB after torch.cuda.empty_cache() - freed 112 MiB
# 1080MiB /  5932MiB after torch.cuda.empty_cache() - freed 2108 MiB
# 1104
! nvidia-smi

Tue Jun 16 17:52:36 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   34C    P2    24W / 160W |   1110MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [24]:
del critic_loss, actor_loss, entropy_term, loss
torch.cuda.empty_cache()

In [25]:
# 924MiB /  5932MiB after deleting critic_loss, actor_loss, entropy_term, loss 
# and calling torch.cuda.empty_cache() -> del had no effect
! nvidia-smi

Tue Jun 16 17:27:15 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   32C    P8     6W / 160W |    916MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [26]:
optimizer.zero_grad() # does not change the amount of GPU used

In [27]:
! nvidia-smi

Tue Jun 16 17:27:17 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.59       Driver Version: 440.59       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 2060    Off  | 00000000:B3:00.0 Off |                  N/A |
| 20%   33C    P2    24W / 160W |    916MiB /  5932MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [30]:
torch.cuda.max_memory_allocated()/2**20

355.74951171875

In [32]:
help(torch.cuda.max_memory_allocated)

Help on function max_memory_allocated in module torch.cuda:

max_memory_allocated(device=None)
    Returns the maximum GPU memory occupied by tensors in bytes for a given
    device.
    
    By default, this returns the peak allocated memory since the beginning of
    this program. :func:`~torch.cuda.reset_max_memory_allocated` can be used to
    reset the starting point in tracking this metric. For example, these two
    functions can measure the peak allocated memory usage of each iteration in a
    training loop.
    
    Arguments:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by :func:`~torch.cuda.current_device`,
            if :attr:`device` is ``None`` (default).
    
    .. note::
        See :ref:`cuda-memory-management` for more details about GPU memory
        management.



In [None]:
def train_batched_A2C(agent, game_params, map_name, lr, n_train_processes, max_train_steps, 
                      unroll_length, obs_proc_params, action_dict,
                      test_interval=100, num_tests=5, inspection_interval=200,
                      inspection=False):
    """Train `agent` with batched A2C on parallel environments.

    Each iteration collects `unroll_length` steps from `n_train_processes`
    environments, computes the actor-critic losses through
    `agent.compute_ac_loss` and applies one Adam update to `agent.AC`.

    Parameters
    ----------
    agent : object exposing `AC`, `step(s, a_mask)` and `compute_ac_loss(...)`.
    game_params : dict forwarded to `init_game` and `ParallelEnv`.
    map_name : str, map to play; also used in replay and checkpoint paths.
    lr : learning rate of the Adam optimizer.
    n_train_processes : number of parallel training environments.
    max_train_steps : training stops once the step counter reaches this value.
    unroll_length : number of environment steps collected per update.
    obs_proc_params : dict of keyword arguments for `ObsProcesser`.
    action_dict : action mapping forwarded to the environments and the tests.
    test_interval : a test is run when the step counter is a multiple of this
        value. The counter only advances in blocks of `unroll_length`, so it
        should be a multiple of `unroll_length` for tests to fire as expected.
    num_tests : number of test episodes (also the number of saved replays).
    inspection_interval : with `inspection` enabled, an inspection episode and
        a checkpoint are saved when the step counter is a multiple of this value.
    inspection : bool, default False. Enables inspection and checkpoint saving.
        Previously the body read an undefined name `inspection`; it is now an
        explicit keyword argument.

    Returns
    -------
    score : list of average test scores, one per executed test.
    losses : dict with keys 'critic_losses', 'actor_losses', 'entropies'.
    agent : the trained agent (same object that was passed in).
    PID : identifier returned by `gen_PID`, used in checkpoint file names.
    """
    replay_dict = dict(save_replay_episodes=num_tests,
                       replay_dir='Replays/',
                       replay_prefix='A2C_'+map_name)
    test_env = init_game(game_params, map_name, **replay_dict) # save just test episodes
    op = ObsProcesser(**obs_proc_params)
    envs = ParallelEnv(n_train_processes, game_params, map_name, obs_proc_params, action_dict)

    optimizer = torch.optim.Adam(agent.AC.parameters(), lr=lr)
    PID = gen_PID()
    print("Process ID: ", PID)
    score = []
    critic_losses = []
    actor_losses = []
    entropy_losses = []

    step_idx = 0
    try:
        while step_idx < max_train_steps:
            s_lst, r_lst, done_lst, bootstrap_lst, s_trg_lst = list(), list(), list(), list(), list()
            log_probs = []
            entropies = []
            # NOTE(review): the environments are reset at the start of every
            # unroll, so the state carried over in `s` from the previous unroll
            # is discarded - confirm this is intended rather than resetting
            # once before the loop.
            s, a_mask = envs.reset()
            for _ in range(unroll_length):

                a, log_prob, entropy = agent.step(s, a_mask)
                # variables with gradient
                log_probs.append(log_prob)
                entropies.append(entropy)

                s_prime, r, done, bootstrap, s_trg, a_mask = envs.step(a)
                s_lst.append(s)
                r_lst.append(r)
                done_lst.append(done)
                bootstrap_lst.append(bootstrap)
                s_trg_lst.append(s_trg)

                s = s_prime
                step_idx += 1 #n_train_processes

            # all variables without gradient; the first two axes are swapped
            s_lst = np.array(s_lst).transpose(1,0,2,3,4)
            r_lst = np.array(r_lst).transpose(1,0)
            done_lst = np.array(done_lst).transpose(1,0)
            bootstrap_lst = np.array(bootstrap_lst).transpose(1,0)
            s_trg_lst = np.array(s_trg_lst).transpose(1,0,2,3,4)

            critic_loss, actor_loss, entropy_term = agent.compute_ac_loss(r_lst, log_probs, entropies, 
                                                                     s_lst, done_lst, bootstrap_lst, s_trg_lst)

            # entropy_term is only logged; it is not added to the optimised loss here
            loss = (critic_loss + actor_loss).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            critic_losses.append(critic_loss.item())
            actor_losses.append(actor_loss.item())
            entropy_losses.append(entropy_term.item())

            ### Test time ###
            if step_idx % test_interval == 0:
                avg_score = test(step_idx, agent, test_env, PID, op, action_dict, num_tests)
                if inspection and (step_idx%inspection_interval==0):
                    inspector = inspection_test(step_idx, agent, test_env, PID, op, action_dict)
                    # save episode for inspection and model weights at that point
                    save_path = "../Results/"+map_name+"/Checkpoints/"
                    inspector.save_dict(path=save_path)
                    torch.save(agent.AC.state_dict(), save_path+PID+"_"+str(step_idx))
                score.append(avg_score)
    finally:
        # Release the parallel environments even if training raises midway.
        envs.close()

    losses = dict(critic_losses=critic_losses, actor_losses=actor_losses, entropies=entropy_losses)
    return score, losses, agent, PID

In [None]:
#help(torch.cuda.max_memory_allocated)