# Gradient Exploration

The purpose of this notebook is to explore how gradients can be preserved on and off a GPU, accumulated *within* a batch, and be handled by multiple GPUs.

In [1]:
import os

import torch
import torch.nn as nn
from torchviz import make_dot
from torch import optim

from copy import deepcopy

# Most Minimal Play

- `retain_graph` allows multiple backward passes
- The grad's are accumulated *on the leaf variables*
- `backward` is run on an output variable

## Basics

In [74]:
x = torch.rand(4, requires_grad=True)
y = torch.rand(4, requires_grad=True)

In [75]:
x, y

(tensor([0.4820, 0.1776, 0.9892, 0.9979], requires_grad=True),
 tensor([0.5668, 0.8687, 0.2601, 0.8752], requires_grad=True))

In [82]:
z = x*y

In [83]:
z.backward(torch.FloatTensor([1.0, 1.0, 1.0, 1.0]))

In [84]:
x.grad.data

tensor([1.1336, 1.7374, 0.5203, 1.7504])

In [85]:
y.grad.data

tensor([0.9639, 0.3551, 1.9784, 1.9958])

## GPU on-and-off

In [131]:
x1_cpu = torch.tensor([1., 1., 1., 1.], requires_grad=True)
y1_cpu = torch.tensor([4., 4., 4., 4.], requires_grad=True)
x2_cpu = torch.tensor([1., 1., 1., 1.], requires_grad=True)
y2_cpu = torch.tensor([3., 3., 3., 3.], requires_grad=True)

In [132]:
x1_cuda, y1_cuda = x1_cpu.to("cuda"), y1_cpu.to("cuda")

In [133]:
z1_cuda = x1_cuda * y1_cuda

In [134]:
z1_cpu = z1_cuda.to("cpu")
x1_cpu2, y1_cpu2 = x1_cuda.to("cpu"), y1_cuda.to("cpu")

In [135]:
x2_cuda, y2_cuda = x2_cpu.to("cuda"), y2_cpu.to("cuda")

In [136]:
z2_cuda = x2_cuda * y2_cuda

In [137]:
z1_cuda2 = z1_cpu.to("cuda")

In [138]:
z_total = z1_cuda2 + z2_cuda

In [139]:
del x1_cuda

In [140]:
z_total.sum().backward()

In [141]:
x1_cpu.grad

tensor([4., 4., 4., 4.])

In [142]:
y1_cpu.grad

tensor([1., 1., 1., 1.])

In [143]:
x2_cpu.grad

tensor([3., 3., 3., 3.])

In [144]:
y2_cpu.grad

tensor([1., 1., 1., 1.])

## Memory Test

In [2]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()



In [3]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [67]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [68]:
x1_size = (100000,1000)
y1_size = (100000,1000)
x1 = torch.rand(x1_size, requires_grad=True, device="cuda")
y1 = torch.rand(y1_size, requires_grad=True, device="cuda")
x2 = torch.rand(x1_size, requires_grad=True, device="cuda")
y2 = torch.rand(y1_size, requires_grad=True, device="cuda")
# x1 = torch.rand(x1_size, device="cuda")
# y1 = torch.rand(y1_size, device="cuda")

In [69]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [70]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [71]:
z = x1*y1

In [72]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [73]:
x1 = x1.to("cpu")
y1 = y1.to("cpu")

In [75]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [74]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [76]:
z.sum().backward()

In [77]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

2.6113290786743164


In [78]:
del z

In [79]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [80]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

2.23828125


In [29]:
x1.grad=None
y1.grad=None

In [30]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [31]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [132]:
x1_cuda, y1_cuda = x1_cpu.to("cuda"), y1_cpu.to("cuda")

In [133]:
z1_cuda = x1_cuda * y1_cuda

In [134]:
z1_cpu = z1_cuda.to("cpu")
x1_cpu2, y1_cpu2 = x1_cuda.to("cpu"), y1_cuda.to("cpu")

In [135]:
x2_cuda, y2_cuda = x2_cpu.to("cuda"), y2_cpu.to("cuda")

In [136]:
z2_cuda = x2_cuda * y2_cuda

In [137]:
z1_cuda2 = z1_cpu.to("cuda")

In [138]:
z_total = z1_cuda2 + z2_cuda

# Sequential Model Play

We create a sequential NN, then feed a variable through it

In [11]:
model = nn.Sequential()

In [12]:
model.add_module('W0', nn.Linear(3, 16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16, 1))

In [13]:
x = torch.randn((1,3), requires_grad=True)

In [14]:
x

tensor([[ 1.0646,  0.7194, -0.4448]], requires_grad=True)

In [15]:
y = model(x)

In [16]:
y

tensor([[-0.1016]], grad_fn=<AddmmBackward>)

In [17]:
dict(model.named_parameters())

{'W0.weight': Parameter containing:
 tensor([[ 0.2392,  0.3530, -0.0977],
         [ 0.1958, -0.0073,  0.2408],
         [ 0.0658,  0.4778,  0.1292],
         [ 0.1938, -0.3796,  0.1065],
         [-0.1575,  0.2916,  0.3924],
         [-0.2167, -0.0704, -0.5197],
         [ 0.4216,  0.1047,  0.5400],
         [-0.4827,  0.4868, -0.5655],
         [ 0.3096, -0.3725, -0.3382],
         [ 0.0190, -0.3260, -0.0828],
         [-0.1678,  0.4821,  0.1528],
         [ 0.3671,  0.4760, -0.5092],
         [ 0.2448,  0.2161, -0.1691],
         [-0.4507,  0.2656,  0.4777],
         [-0.1823, -0.2169,  0.4688],
         [-0.2667,  0.5446,  0.2151]], requires_grad=True),
 'W0.bias': Parameter containing:
 tensor([ 0.2466, -0.5018,  0.0103,  0.1711,  0.4140, -0.0998,  0.3709, -0.3463,
          0.4880,  0.1452,  0.3080,  0.0793, -0.5400,  0.1486,  0.2751,  0.0492],
        requires_grad=True),
 'W1.weight': Parameter containing:
 tensor([[-0.1382,  0.0121,  0.1774,  0.1123, -0.0276,  0.1617,  0.0063,

Now we can backprop with a scalar (e.g. mean of y) and check that each variable accumulated a gradient!

In [18]:
y.mean().backward()

In [21]:
x.grad.data

tensor([[-0.0893,  0.0204, -0.1982]])

In [37]:
dict(model.named_parameters()).items()

dict_items([('W0.weight', Parameter containing:
tensor([[ 0.2392,  0.3530, -0.0977],
        [ 0.1958, -0.0073,  0.2408],
        [ 0.0658,  0.4778,  0.1292],
        [ 0.1938, -0.3796,  0.1065],
        [-0.1575,  0.2916,  0.3924],
        [-0.2167, -0.0704, -0.5197],
        [ 0.4216,  0.1047,  0.5400],
        [-0.4827,  0.4868, -0.5655],
        [ 0.3096, -0.3725, -0.3382],
        [ 0.0190, -0.3260, -0.0828],
        [-0.1678,  0.4821,  0.1528],
        [ 0.3671,  0.4760, -0.5092],
        [ 0.2448,  0.2161, -0.1691],
        [-0.4507,  0.2656,  0.4777],
        [-0.1823, -0.2169,  0.4688],
        [-0.2667,  0.5446,  0.2151]], requires_grad=True)), ('W0.bias', Parameter containing:
tensor([ 0.2466, -0.5018,  0.0103,  0.1711,  0.4140, -0.0998,  0.3709, -0.3463,
         0.4880,  0.1452,  0.3080,  0.0793, -0.5400,  0.1486,  0.2751,  0.0492],
       requires_grad=True)), ('W1.weight', Parameter containing:
tensor([[-0.1382,  0.0121,  0.1774,  0.1123, -0.0276,  0.1617,  0.0063,  0.20

In [38]:
for (k, v) in dict(model.named_parameters()).items():
    print(k,v.grad)

W0.weight tensor([[-0.0824, -0.0557,  0.0344],
        [ 0.0110,  0.0074, -0.0046],
        [ 0.1656,  0.1119, -0.0692],
        [ 0.1192,  0.0806, -0.0498],
        [-0.0272, -0.0183,  0.0113],
        [ 0.1683,  0.1137, -0.0703],
        [ 0.0045,  0.0030, -0.0019],
        [ 0.2013,  0.1360, -0.0841],
        [ 0.0669,  0.0452, -0.0279],
        [-0.1071, -0.0724,  0.0447],
        [-0.1608, -0.1086,  0.0672],
        [-0.0410, -0.0277,  0.0171],
        [-0.2067, -0.1397,  0.0864],
        [-0.1796, -0.1213,  0.0750],
        [-0.0441, -0.0298,  0.0184],
        [ 0.1986,  0.1342, -0.0830]])
W0.bias tensor([-0.0774,  0.0103,  0.1556,  0.1120, -0.0255,  0.1581,  0.0042,  0.1891,
         0.0628, -0.1006, -0.1510, -0.0385, -0.1942, -0.1687, -0.0414,  0.1866])
W1.weight tensor([[ 0.6633, -0.3848,  0.3510,  0.0568,  0.2743, -0.1488,  0.5750, -0.2529,
          0.6044, -0.0322,  0.3869,  0.7775, -0.0487, -0.3388, -0.2762,  0.0613]])
W1.bias tensor([1.])


In [39]:
for (k, v) in dict(model.named_parameters()).items():
    print(k,v)

W0.weight Parameter containing:
tensor([[ 0.2392,  0.3530, -0.0977],
        [ 0.1958, -0.0073,  0.2408],
        [ 0.0658,  0.4778,  0.1292],
        [ 0.1938, -0.3796,  0.1065],
        [-0.1575,  0.2916,  0.3924],
        [-0.2167, -0.0704, -0.5197],
        [ 0.4216,  0.1047,  0.5400],
        [-0.4827,  0.4868, -0.5655],
        [ 0.3096, -0.3725, -0.3382],
        [ 0.0190, -0.3260, -0.0828],
        [-0.1678,  0.4821,  0.1528],
        [ 0.3671,  0.4760, -0.5092],
        [ 0.2448,  0.2161, -0.1691],
        [-0.4507,  0.2656,  0.4777],
        [-0.1823, -0.2169,  0.4688],
        [-0.2667,  0.5446,  0.2151]], requires_grad=True)
W0.bias Parameter containing:
tensor([ 0.2466, -0.5018,  0.0103,  0.1711,  0.4140, -0.0998,  0.3709, -0.3463,
         0.4880,  0.1452,  0.3080,  0.0793, -0.5400,  0.1486,  0.2751,  0.0492],
       requires_grad=True)
W1.weight Parameter containing:
tensor([[-0.1382,  0.0121,  0.1774,  0.1123, -0.0276,  0.1617,  0.0063,  0.2020,
          0.0990, -0.100

# Toys

In [2]:
gpu0 = "cuda:0"
gpu1 = "cuda:1"

In [3]:
def mean_grads(model_A, model_B):
    for layer_A, layer_B in zip(model_A.parameters(), model_B.parameters()):
        device_A = layer_A.device
        if layer_A.grad is None:
            layer_A.grad = layer_B.grad/2
        elif layer_B.grad is None:
            layer_A.grad = layer_A.grad/2
        else:
            layer_A.grad = torch.mean(torch.stack([layer_A.grad, layer_B.grad.to(device_A)]), axis=0)

In [4]:
def compare_layers(layer_A, layer_B, eps=1e-4):
    return (layer_A.data - layer_B.data)/(layer_A.data + layer_B.data) < eps

def compare_grads(layer_A, layer_B, eps=1e-4):
    return (layer_A.grad - layer_B.grad)/(layer_A.grad + layer_B.grad) < eps

def compare_models(model_A, model_B, eps=1e-4):
    
    equal_grads = True
    
    zipped_models = zip(model_A.parameters(), model_B.parameters())
    compare_model_list = [compare_layers(layer_A, layer_B).all() for layer_A, layer_B in zipped_models]
    equal_weights = torch.stack(compare_model_list).all()

    if list(model_A.parameters())[0].grad is not None:
        zipped_models = zip(model_A.parameters(), model_B.parameters())
        compare_grad_list = [compare_grads(layer_A, layer_B).all() for layer_A, layer_B in zipped_models]
        equal_grads = torch.stack(compare_grad_list).all()
    
    return equal_weights, equal_grads 

In [5]:
def scatter_add_attention(encoded_nodes, encoded_edges, edge_list):
    start, end = edge_list

    src = encoded_nodes[end]*encoded_edges
    index = start.unsqueeze(-1)
    in_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

    src = encoded_nodes[start]*encoded_edges
    index = end.unsqueeze(-1)
    out_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 
    
    aggr_nodes = in_messages + out_messages
    
    return aggr_nodes

## Single-GPU Toy A

In [63]:
x = torch.randint(0, 10, (10, 3)).float()
M = torch.rand((3,3), requires_grad=True).float()

1, 2.

In [171]:
x = torch.tensor([[1,2,5],
                  [2,3,2],
                  [3,4,3],
                  [4,4,4],
                  [5,5,5],
                  [9,5,2],
                  [1,3,5],
                  [2,5,7],
                  [6,7,8],
                  [1,2,3]], device=gpu0).float()
M = torch.tensor([[0.1, 0.3, 0.5],
                  [0.7, 0.9, 0.2],
                  [0.4, 0.6, 0.8]], requires_grad=True, device=gpu0).float()

3.

In [172]:
output = torch.matmul(x, M)

In [173]:
output

tensor([[ 3.5000,  5.1000,  4.9000],
        [ 3.1000,  4.5000,  3.2000],
        [ 4.3000,  6.3000,  4.7000],
        [ 4.8000,  7.2000,  6.0000],
        [ 6.0000,  9.0000,  7.5000],
        [ 5.2000,  8.4000,  7.1000],
        [ 4.2000,  6.0000,  5.1000],
        [ 6.5000,  9.3000,  7.6000],
        [ 8.7000, 12.9000, 10.8000],
        [ 2.7000,  3.9000,  3.3000]], device='cuda:0', grad_fn=<MmBackward>)

In [174]:
# The "loss" is the average of the size of each vector
loss = output.sum(axis=1).mean()

In [175]:
loss

tensor(18.1800, device='cuda:0', grad_fn=<MeanBackward0>)

In [176]:
loss.backward()

In [177]:
M.grad

tensor([[3.4000, 3.4000, 3.4000],
        [4.0000, 4.0000, 4.0000],
        [4.4000, 4.4000, 4.4000]], device='cuda:0')

In [178]:
M

tensor([[0.1000, 0.3000, 0.5000],
        [0.7000, 0.9000, 0.2000],
        [0.4000, 0.6000, 0.8000]], device='cuda:0', requires_grad=True)

## Multi-GPU Toy A

In [159]:
x = torch.tensor([[1,2,5],
                  [2,3,2],
                  [3,4,3],
                  [4,4,4],
                  [5,5,5],
                  [9,5,2],
                  [1,3,5],
                  [2,5,7],
                  [6,7,8],
                  [1,2,3]], device=gpu0).float()
M = torch.tensor([[0.1, 0.3, 0.5],
                  [0.7, 0.9, 0.2],
                  [0.4, 0.6, 0.8]], requires_grad=True, device=gpu0).float()

4.

In [160]:
x0 = x[:5].to(gpu0)
x1 = x[5:].to(gpu1)
M1 = M.to(gpu1)

5.

In [161]:
output0 = torch.matmul(x0, M)
output1 = torch.matmul(x1, M1)

In [162]:
output0, output1

(tensor([[3.5000, 5.1000, 4.9000],
         [3.1000, 4.5000, 3.2000],
         [4.3000, 6.3000, 4.7000],
         [4.8000, 7.2000, 6.0000],
         [6.0000, 9.0000, 7.5000]], device='cuda:0', grad_fn=<MmBackward>),
 tensor([[ 5.2000,  8.4000,  7.1000],
         [ 4.2000,  6.0000,  5.1000],
         [ 6.5000,  9.3000,  7.6000],
         [ 8.7000, 12.9000, 10.8000],
         [ 2.7000,  3.9000,  3.3000]], device='cuda:1', grad_fn=<MmBackward>))

In [163]:
loss0 = output0.sum(axis=1).mean()
loss1 = output1.sum(axis=1).mean()

In [164]:
loss0, loss1

(tensor(16.0200, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(20.3400, device='cuda:1', grad_fn=<MeanBackward0>))

In [157]:
loss0.backward()

In [158]:
M.grad

tensor([[3.0000, 3.0000, 3.0000],
        [3.6000, 3.6000, 3.6000],
        [3.8000, 3.8000, 3.8000]], device='cuda:0')

6. 

In [165]:
loss1 = loss1.to(gpu0)

7.

In [166]:
loss_total = torch.cat([loss0.unsqueeze(0), loss1.unsqueeze(0)]).mean()

In [167]:
loss_total

tensor(18.1800, device='cuda:0', grad_fn=<MeanBackward0>)

8.

In [168]:
loss_total.backward()

In [169]:
M.grad

tensor([[3.4000, 3.4000, 3.4000],
        [4.0000, 4.0000, 4.0000],
        [4.4000, 4.4000, 4.4000]], device='cuda:0')

In [170]:
M

tensor([[0.1000, 0.3000, 0.5000],
        [0.7000, 0.9000, 0.2000],
        [0.4000, 0.6000, 0.8000]], device='cuda:0', requires_grad=True)

In [79]:
loss.backward()

In [126]:
M1.grad

  """Entry point for launching an IPython kernel.


In [82]:
M

tensor([[0.1000, 0.3000, 0.5000],
        [0.7000, 0.9000, 0.2000],
        [0.4000, 0.6000, 0.8000]], device='cuda:0', requires_grad=True)

## Single-GPU Toy B

1, 2.

In [213]:
# x = torch.tensor([[1,2,5],
#                   [2,3,2],
#                   [3,4,3],
#                   [4,4,4],
#                   [5,5,5],
#                   [9,5,2],
#                   [1,3,5],
#                   [2,5,7],
#                   [6,7,8],
#                   [1,2,3]], device=gpu0).float()
torch.random.manual_seed(0)
x = torch.randint(0, 5, (20, 3), device=gpu0).float()
model = nn.Sequential()
model.add_module('W0', nn.Linear(3, 16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16, 1))
model = model.to(gpu0)

In [214]:
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

3.

In [215]:
output = model(x)

In [216]:
# The "loss" is the average of the size of each vector
loss = output.mean()

In [217]:
loss.backward()

In [218]:
dict(model.named_parameters())["W0.weight"].grad

tensor([[ 0.1914,  0.1616,  0.1033],
        [-0.0220, -0.0157, -0.0333],
        [ 0.0085,  0.0041,  0.0098],
        [ 0.1084,  0.0936,  0.1036],
        [ 0.0247,  0.0124,  0.0441],
        [ 0.2035,  0.1286,  0.1052],
        [-0.2375, -0.1711, -0.2726],
        [-0.0571, -0.0499, -0.0299],
        [ 0.1281,  0.0958,  0.0452],
        [ 0.0136,  0.0068,  0.0327],
        [ 0.2939,  0.2770,  0.1683],
        [ 0.0220,  0.0107,  0.0102],
        [ 0.0668,  0.0362,  0.0384],
        [-0.3333, -0.2625, -0.3525],
        [ 0.0095,  0.0038,  0.0047],
        [-0.2360, -0.1438, -0.1185]], device='cuda:0')

In [219]:
optimizer.step()

In [220]:
single_gpu_model = deepcopy(model)

In [221]:
optimizer.zero_grad()

In [222]:
dict(model.named_parameters())["W0.weight"]

Parameter containing:
tensor([[-0.0062,  0.3081, -0.4762],
        [-0.4247, -0.2222,  0.1552],
        [-0.0115,  0.4577, -0.0513],
        [ 0.1517, -0.1754, -0.1145],
        [-0.5518, -0.3825, -0.2384],
        [ 0.0194,  0.2270,  0.3454],
        [-0.3890, -0.2497,  0.2124],
        [ 0.4800, -0.1183,  0.4323],
        [-0.0943,  0.0601,  0.5223],
        [-0.5357, -0.3635, -0.1465],
        [-0.2280,  0.4961, -0.3759],
        [-0.2660, -0.4035, -0.5408],
        [-0.3377,  0.4959,  0.2572],
        [ 0.2832,  0.0330, -0.2925],
        [ 0.0976, -0.5391, -0.4172],
        [-0.2953,  0.3657,  0.3397]], device='cuda:0', requires_grad=True)

## Multi-GPU Toy B

In [37]:
x = torch.tensor([[1,2,5],
                  [2,3,2],
                  [3,4,3],
                  [4,4,4],
                  [5,5,5],
                  [9,5,2],
                  [1,3,5],
                  [2,5,7],
                  [6,7,8],
                  [1,2,3]], device=gpu0).float()
torch.random.manual_seed(0)
model = nn.Sequential()
model.add_module('W0', nn.Linear(3, 16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16, 1))
model = model.to(gpu0)

In [315]:
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [316]:
dict(model.named_parameters())

{'W0.weight': Parameter containing:
 tensor([[-0.0043,  0.3097, -0.4752],
         [-0.4249, -0.2224,  0.1548],
         [-0.0114,  0.4578, -0.0512],
         [ 0.1528, -0.1745, -0.1135],
         [-0.5516, -0.3824, -0.2380],
         [ 0.0214,  0.2282,  0.3464],
         [-0.3914, -0.2514,  0.2097],
         [ 0.4794, -0.1188,  0.4320],
         [-0.0931,  0.0611,  0.5228],
         [-0.5356, -0.3635, -0.1462],
         [-0.2251,  0.4988, -0.3742],
         [-0.2658, -0.4034, -0.5407],
         [-0.3370,  0.4963,  0.2576],
         [ 0.2798,  0.0304, -0.2960],
         [ 0.0977, -0.5391, -0.4172],
         [-0.2976,  0.3643,  0.3385]], device='cuda:0', requires_grad=True),
 'W0.bias': Parameter containing:
 tensor([-0.2561, -0.0208,  0.3693,  0.5740,  0.2291,  0.0780,  0.3871, -0.3399,
          0.1076, -0.4476, -0.4002, -0.2982,  0.2612,  0.2322, -0.3420,  0.1744],
        device='cuda:0', requires_grad=True),
 'W1.weight': Parameter containing:
 tensor([[ 0.1372, -0.0316,  0.0095,  

4.

In [317]:
x0 = x[:5].to(gpu0)
x1 = x[5:].to(gpu1)
model1 = deepcopy(model).to(gpu1)

5.

In [318]:
output0 = model(x0)
output1 = model1(x1)

In [319]:
output0, output1

(tensor([[-0.9610],
         [-0.6608],
         [-0.6709],
         [-0.7950],
         [-0.8115]], device='cuda:0', grad_fn=<AddmmBackward>),
 tensor([[-0.6590],
         [-0.8913],
         [-0.7894],
         [-0.7682],
         [-0.8659]], device='cuda:1', grad_fn=<AddmmBackward>))

In [320]:
loss0 = output0.mean()
loss1 = output1.mean()

In [321]:
loss0, loss1

(tensor(-0.7798, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(-0.7948, device='cuda:1', grad_fn=<MeanBackward0>))

6. 

In [322]:
loss1 = loss1.to(gpu0)

7.

In [323]:
loss_total = torch.cat([loss0.unsqueeze(0), loss1.unsqueeze(0)]).mean()

In [324]:
loss_total

tensor(-0.7873, device='cuda:0', grad_fn=<MeanBackward0>)

8.

In [325]:
loss_total.backward()

In [326]:
dict(model.named_parameters())["W0.weight"].grad, dict(model1.named_parameters())["W0.weight"].grad

(tensor([[ 1.0979e-01,  1.3462e-01,  1.1355e-01],
         [-7.4301e-03, -1.1841e-02, -1.9882e-02],
         [ 1.3359e-03,  1.9314e-03,  2.8978e-03],
         [ 8.6169e-02,  1.0325e-01,  1.0853e-01],
         [ 1.6269e-03,  2.7213e-03,  4.1838e-03],
         [ 1.7714e-02,  2.4446e-02,  2.1588e-02],
         [-8.5569e-02, -1.1815e-01, -1.4382e-01],
         [-9.7857e-03, -1.4269e-02, -1.2082e-02],
         [ 1.4356e-02,  1.9139e-02,  1.4992e-02],
         [ 1.2452e-03,  2.1435e-03,  3.7204e-03],
         [ 2.1096e-01,  2.5732e-01,  2.2673e-01],
         [ 4.2664e-04,  6.3832e-04,  5.2862e-04],
         [ 5.3049e-03,  6.6481e-03,  6.2496e-03],
         [-2.8935e-01, -3.3966e-01, -3.3125e-01],
         [ 1.6181e-04,  2.3137e-04,  2.0203e-04],
         [-2.3601e-02, -2.9684e-02, -2.6077e-02]], device='cuda:0'),
 tensor([[ 1.2834e-01,  9.0607e-02,  6.4513e-02],
         [-8.7872e-03, -2.1724e-02, -3.3122e-02],
         [ 8.8898e-04,  1.5966e-03,  2.2927e-03],
         [ 6.8412e-02,  8.6089e

In [328]:
dict(model.named_parameters())["W0.weight"].grad = dict(model.named_parameters())["W0.weight"].grad + dict(model1.named_parameters())["W0.weight"].grad.to(gpu0)

In [307]:
dict(model.named_parameters())["W0.weight"], dict(model1.named_parameters())["W0.weight"]

(Parameter containing:
 tensor([[-0.0043,  0.3097, -0.4752],
         [-0.4249, -0.2224,  0.1548],
         [-0.0114,  0.4578, -0.0512],
         [ 0.1528, -0.1745, -0.1135],
         [-0.5516, -0.3824, -0.2380],
         [ 0.0214,  0.2282,  0.3464],
         [-0.3914, -0.2514,  0.2097],
         [ 0.4794, -0.1188,  0.4320],
         [-0.0931,  0.0611,  0.5228],
         [-0.5356, -0.3635, -0.1462],
         [-0.2251,  0.4988, -0.3742],
         [-0.2658, -0.4034, -0.5407],
         [-0.3370,  0.4963,  0.2576],
         [ 0.2798,  0.0304, -0.2960],
         [ 0.0977, -0.5391, -0.4172],
         [-0.2976,  0.3643,  0.3385]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([[-0.0043,  0.3097, -0.4752],
         [-0.4249, -0.2224,  0.1548],
         [-0.0114,  0.4578, -0.0512],
         [ 0.1528, -0.1745, -0.1135],
         [-0.5516, -0.3824, -0.2380],
         [ 0.0214,  0.2282,  0.3464],
         [-0.3914, -0.2514,  0.2097],
         [ 0.4794, -0.1188,  0.4320],
   

In [329]:
optimizer.step()

In [330]:
optimizer.zero_grad()

In [331]:
dict(model.named_parameters())["W0.weight"]

Parameter containing:
tensor([[-0.0067,  0.3075, -0.4770],
        [-0.4247, -0.2220,  0.1554],
        [-0.0115,  0.4577, -0.0513],
        [ 0.1512, -0.1764, -0.1156],
        [-0.5516, -0.3824, -0.2381],
        [ 0.0210,  0.2278,  0.3460],
        [-0.3897, -0.2483,  0.2140],
        [ 0.4796, -0.1186,  0.4323],
        [-0.0938,  0.0605,  0.5224],
        [-0.5356, -0.3635, -0.1463],
        [-0.2292,  0.4941, -0.3788],
        [-0.2658, -0.4034, -0.5407],
        [-0.3375,  0.4960,  0.2574],
        [ 0.2845,  0.0363, -0.2894],
        [ 0.0977, -0.5391, -0.4172],
        [-0.2960,  0.3654,  0.3392]], device='cuda:0', requires_grad=True)

## Multi-GPU Toy B (Alternate)

In [223]:
# x = torch.tensor([[1,2,5],
#                   [2,3,2],
#                   [3,4,3],
#                   [4,4,4],
#                   [5,5,5],
#                   [9,5,2],
#                   [1,3,5],
#                   [2,5,7],
#                   [6,7,8],
#                   [1,2,3]], device=gpu0).float()
torch.random.manual_seed(0)
x = torch.randint(0, 5, (20, 3), device=gpu0).float()
model = nn.Sequential()
model.add_module('W0', nn.Linear(3, 16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16, 1))
model = model.to(gpu0)

In [224]:
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

4.

In [226]:
x0 = x[:len(x)//2].to(gpu0)
x1 = x[len(x)//2:].to(gpu1)
model1 = deepcopy(model).to(gpu1)

5.

In [227]:
output0 = model(x0)
output1 = model1(x1)

In [228]:
output0, output1

(tensor([[-0.6488],
         [-0.8170],
         [-0.7162],
         [-0.9621],
         [-0.6709],
         [-0.4777],
         [-0.9215],
         [-0.7507],
         [-0.7782],
         [-0.9010]], device='cuda:0', grad_fn=<AddmmBackward>),
 tensor([[-0.9491],
         [-0.9512],
         [-0.9010],
         [-0.9053],
         [-0.9539],
         [-0.9902],
         [-0.5192],
         [-0.6586],
         [-0.8127],
         [-0.4590]], device='cuda:1', grad_fn=<AddmmBackward>))

In [229]:
loss0 = output0.mean()
loss1 = output1.mean()

In [230]:
loss0, loss1

(tensor(-0.7644, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(-0.8100, device='cuda:1', grad_fn=<MeanBackward0>))

In [231]:
loss0.backward()
loss1.backward()

8.

In [232]:
mean_grads(model, model1)

In [233]:
list(model.parameters())[0].grad

tensor([[ 0.1914,  0.1616,  0.1033],
        [-0.0220, -0.0157, -0.0333],
        [ 0.0085,  0.0041,  0.0098],
        [ 0.1084,  0.0936,  0.1036],
        [ 0.0247,  0.0124,  0.0441],
        [ 0.2035,  0.1286,  0.1052],
        [-0.2375, -0.1711, -0.2726],
        [-0.0571, -0.0499, -0.0299],
        [ 0.1281,  0.0958,  0.0452],
        [ 0.0136,  0.0068,  0.0327],
        [ 0.2939,  0.2770,  0.1683],
        [ 0.0220,  0.0107,  0.0102],
        [ 0.0668,  0.0362,  0.0384],
        [-0.3333, -0.2625, -0.3525],
        [ 0.0095,  0.0038,  0.0047],
        [-0.2360, -0.1438, -0.1185]], device='cuda:0')

In [234]:
optimizer.step()

In [235]:
multi_gpu_model = deepcopy(model)

In [236]:
optimizer.zero_grad()

In [237]:
compare_models(single_gpu_model, multi_gpu_model)

(tensor(True, device='cuda:0'),
 [tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0')])

## Single-GPU Toy C

In [379]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (20, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 10), device=gpu0)

In [380]:
encoder_model = nn.Sequential()
encoder_model.add_module('W0', nn.Linear(3, 16))
encoder_model.add_module('tanh', nn.Tanh())
encoder_model.add_module('W1', nn.Linear(16, 3))
encoder_model = encoder_model.to(gpu0)

class_model = nn.Sequential()
class_model.add_module('W0', nn.Linear(3, 16))
class_model.add_module('tanh', nn.Tanh())
class_model.add_module('W1', nn.Linear(16, 1))
class_model = class_model.to(gpu0)

In [381]:
optimizer = optim.SGD(list(encoder_model.parameters()) + list(class_model.parameters()), lr=0.01, momentum=0.9)

3.

In [382]:
encoded_nodes = encoder_model(x)

In [383]:
start, end = e

src = encoded_nodes[end]
index = start.unsqueeze(-1)
in_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

src = encoded_nodes[start]
index = end.unsqueeze(-1)
out_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

In [384]:
aggr_nodes = in_messages + out_messages

In [385]:
output = class_model(aggr_nodes)

In [386]:
output

tensor([[-0.0448],
        [ 0.3763],
        [-0.3279],
        [ 0.1582],
        [ 0.2266],
        [ 0.2254],
        [-0.3279],
        [-0.3279],
        [-0.3279],
        [ 0.1102],
        [ 0.3028],
        [-0.3279],
        [-0.0107],
        [-0.0013],
        [-0.1709],
        [-0.0309],
        [-0.3279],
        [-0.3279],
        [-0.3279],
        [-0.3279]], device='cuda:0', grad_fn=<AddmmBackward>)

In [387]:
# The "loss" is the average of the size of each vector
loss = output.mean()

In [388]:
loss

tensor(-0.0905, device='cuda:0', grad_fn=<MeanBackward0>)

In [389]:
loss.backward()

In [390]:
dict(encoder_model.named_parameters())["W0.weight"].grad

tensor([[-3.7717e-02, -4.0987e-02, -3.3916e-02],
        [ 4.3297e-03,  1.3792e-03,  9.3630e-03],
        [ 1.1613e-02,  3.9398e-03,  2.1406e-02],
        [-1.4218e-02, -1.0726e-02, -1.5382e-02],
        [-4.3082e-03, -2.2294e-03, -2.5488e-02],
        [-2.7659e-02, -1.5260e-02, -3.3299e-02],
        [ 6.4564e-02,  5.3853e-02,  1.0259e-01],
        [ 6.5279e-03,  7.9564e-03,  9.4455e-03],
        [-3.9221e-03, -3.7380e-03, -2.5951e-03],
        [-2.1158e-03, -8.6539e-04, -1.5249e-02],
        [-3.3486e-02, -4.2239e-02, -3.6859e-02],
        [-9.4915e-04, -3.4031e-04, -1.6120e-03],
        [-4.0669e-03, -2.0772e-03, -5.5126e-03],
        [ 7.7644e-02,  6.7564e-02,  1.1117e-01],
        [-1.4008e-03, -4.8877e-05, -1.2799e-03],
        [ 1.9433e-02,  9.7203e-03,  1.9479e-02]], device='cuda:0')

In [391]:
optimizer.step()

In [392]:
single_gpu_encoder_model = deepcopy(encoder_model)
single_gpu_class_model = deepcopy(class_model)

In [393]:
optimizer.zero_grad()

In [394]:
dict(model.named_parameters())["W0.weight"]

Parameter containing:
tensor([[ 0.2097,  0.4794, -0.1188],
        [ 0.4320, -0.0931,  0.0611],
        [ 0.5228, -0.5356, -0.3635],
        [-0.1462, -0.2251,  0.4988],
        [-0.3742, -0.2658, -0.4034],
        [-0.5407, -0.3370,  0.4963],
        [ 0.2576,  0.2798,  0.0304],
        [-0.2960,  0.0977, -0.5391],
        [-0.4172, -0.2976,  0.3643],
        [ 0.3385, -0.2561, -0.0208],
        [ 0.3693,  0.5740,  0.2291],
        [ 0.0780,  0.3871, -0.3399],
        [ 0.1076, -0.4476, -0.4002],
        [-0.2982,  0.2612,  0.2322],
        [-0.3420,  0.1744,  0.3169],
        [-0.0729,  0.0220,  0.1338]], device='cuda:0', requires_grad=True)

## Multi-GPU Toy C (Alternate)

In [440]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (20, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 10), device=gpu0)

In [441]:
encoder_model = nn.Sequential()
encoder_model.add_module('W0', nn.Linear(3, 16))
encoder_model.add_module('tanh', nn.Tanh())
encoder_model.add_module('W1', nn.Linear(16, 3))
encoder_model = encoder_model.to(gpu0)

class_model = nn.Sequential()
class_model.add_module('W0', nn.Linear(3, 16))
class_model.add_module('tanh', nn.Tanh())
class_model.add_module('W1', nn.Linear(16, 1))
class_model = class_model.to(gpu0)

In [442]:
optimizer = optim.SGD(list(encoder_model.parameters()) + list(class_model.parameters()), lr=0.01, momentum=0.9)

4.

In [443]:
e0 = e[:, :e.shape[1]//2].to(gpu0)
e1 = e[:, e.shape[1]//2:].to(gpu1)
x1 = x.to(gpu1)
encoder_model1 = deepcopy(encoder_model).to(gpu1)
class_model1 = deepcopy(class_model).to(gpu1)

5.

In [444]:
encoded_nodes0 = encoder_model(x)
encoded_nodes1 = encoder_model1(x1)

In [445]:
(encoded_nodes0 == encoded_nodes1.to(gpu0)).all()

tensor(True, device='cuda:0')

In [446]:
start, end = e0

src = encoded_nodes0[end]
index = start.unsqueeze(-1)
in_messages = torch.zeros(encoded_nodes0.shape, dtype=src.dtype, device=encoded_nodes0.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

src = encoded_nodes0[start]
index = end.unsqueeze(-1)
out_messages = torch.zeros(encoded_nodes0.shape, dtype=src.dtype, device=encoded_nodes0.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

In [447]:
aggr_nodes0 = in_messages + out_messages

In [448]:
start, end = e1

src = encoded_nodes1[end]
index = start.unsqueeze(-1)
in_messages = torch.zeros(encoded_nodes1.shape, dtype=src.dtype, device=encoded_nodes1.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

src = encoded_nodes1[start]
index = end.unsqueeze(-1)
out_messages = torch.zeros(encoded_nodes1.shape, dtype=src.dtype, device=encoded_nodes1.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

In [449]:
aggr_nodes1 = in_messages + out_messages

In [450]:
(aggr_nodes0 == aggr_nodes).all()

tensor(False, device='cuda:0')

In [451]:
aggr_nodes_total0 = aggr_nodes0 + aggr_nodes1.to(aggr_nodes0.device)

In [452]:
aggr_nodes_total1 = aggr_nodes_total0.to(gpu1)

In [453]:
(aggr_nodes_total0 == aggr_nodes).all()

tensor(True, device='cuda:0')

In [454]:
output0 = class_model(aggr_nodes_total0)
output1 = class_model1(aggr_nodes_total1)

In [455]:
loss0 = output0.mean()
loss1 = output1.mean()

In [456]:
loss0, loss1

(tensor(-0.0905, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(-0.0905, device='cuda:1', grad_fn=<MeanBackward0>))

In [457]:
loss0.backward(retain_graph=True)
loss1.backward()

8.

In [466]:
mean_grads(encoder_model, encoder_model1)

In [467]:
mean_grads(class_model, class_model1)

In [468]:
list(encoder_model.parameters())[0].grad

tensor([[-3.7717e-02, -4.0987e-02, -3.3916e-02],
        [ 4.3297e-03,  1.3792e-03,  9.3630e-03],
        [ 1.1613e-02,  3.9398e-03,  2.1406e-02],
        [-1.4218e-02, -1.0726e-02, -1.5382e-02],
        [-4.3082e-03, -2.2294e-03, -2.5488e-02],
        [-2.7659e-02, -1.5260e-02, -3.3299e-02],
        [ 6.4564e-02,  5.3853e-02,  1.0259e-01],
        [ 6.5279e-03,  7.9564e-03,  9.4455e-03],
        [-3.9221e-03, -3.7380e-03, -2.5951e-03],
        [-2.1158e-03, -8.6539e-04, -1.5249e-02],
        [-3.3486e-02, -4.2239e-02, -3.6859e-02],
        [-9.4915e-04, -3.4031e-04, -1.6120e-03],
        [-4.0669e-03, -2.0772e-03, -5.5126e-03],
        [ 7.7644e-02,  6.7564e-02,  1.1117e-01],
        [-1.4008e-03, -4.8877e-05, -1.2799e-03],
        [ 1.9433e-02,  9.7203e-03,  1.9479e-02]], device='cuda:0')

In [469]:
optimizer.step()

In [470]:
multi_gpu_encoder_model = deepcopy(encoder_model)
multi_gpu_class_model = deepcopy(class_model)

In [471]:
optimizer.zero_grad()

In [472]:
compare_models(single_gpu_encoder_model, multi_gpu_encoder_model)

(tensor(True, device='cuda:0'),
 [tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0')])

In [473]:
compare_models(single_gpu_class_model, multi_gpu_class_model)

(tensor(True, device='cuda:0'),
 [tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0')])

## Single-GPU Toy D

In [714]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (20, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 10), device=gpu0)

In [715]:
encoder_model = nn.Sequential()
encoder_model.add_module('W0', nn.Linear(3, 16))
encoder_model.add_module('tanh', nn.Tanh())
encoder_model.add_module('W1', nn.Linear(16, 3))
encoder_model = encoder_model.to(gpu0)

class_model = nn.Sequential()
class_model.add_module('W0', nn.Linear(6, 16))
class_model.add_module('tanh', nn.Tanh())
class_model.add_module('W1', nn.Linear(16, 1))
class_model = class_model.to(gpu0)

In [716]:
optimizer = optim.SGD(list(encoder_model.parameters()) + list(class_model.parameters()), lr=0.01, momentum=0.9)

3.

In [717]:
encoded_nodes = encoder_model(x)

In [718]:
start, end = e

src = encoded_nodes[end]
index = start.unsqueeze(-1)
in_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

src = encoded_nodes[start]
index = end.unsqueeze(-1)
out_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

In [719]:
aggr_nodes = in_messages + out_messages

In [720]:
input_edges = torch.cat([aggr_nodes[start], aggr_nodes[end]], axis=-1)

In [721]:
output = class_model(input_edges)

In [722]:
# The "loss" is the average of the size of each vector
loss = output.mean()

In [723]:
loss

tensor(-0.0477, device='cuda:0', grad_fn=<MeanBackward0>)

In [724]:
loss.backward()

In [725]:
list(encoder_model.parameters())[0].grad

tensor([[ 0.0017, -0.0006,  0.0031],
        [-0.0089, -0.0035, -0.0386],
        [-0.0014,  0.0016,  0.0043],
        [-0.0072, -0.0163,  0.0477],
        [-0.0017, -0.0010, -0.0084],
        [ 0.0053,  0.0023,  0.0062],
        [-0.0121, -0.0122, -0.0169],
        [-0.0004, -0.0010,  0.0021],
        [ 0.0018,  0.0012,  0.0045],
        [ 0.0016,  0.0008,  0.0152],
        [ 0.0010,  0.0017,  0.0098],
        [ 0.0007,  0.0004,  0.0017],
        [-0.0008,  0.0012, -0.0046],
        [-0.0114, -0.0063, -0.0419],
        [ 0.0051,  0.0003,  0.0076],
        [-0.0022, -0.0021, -0.0072]], device='cuda:0')

In [726]:
optimizer.step()

In [727]:
single_gpu_encoder_model = deepcopy(encoder_model)
single_gpu_class_model = deepcopy(class_model)

In [728]:
optimizer.zero_grad()

In [730]:
dict(class_model.named_parameters())["W0.weight"]

Parameter containing:
tensor([[-0.3452, -0.1010,  0.0186,  0.0587,  0.0974,  0.1603],
        [ 0.0230, -0.1982,  0.1934, -0.3929, -0.2412, -0.1020],
        [-0.1995, -0.1424, -0.3345, -0.0874,  0.0876, -0.2659],
        [-0.0209,  0.2922, -0.0420,  0.0114, -0.0352,  0.0826],
        [ 0.2621,  0.3851,  0.2589,  0.3896, -0.0307, -0.3670],
        [-0.1931,  0.2777, -0.0027, -0.2025, -0.3131, -0.3821],
        [-0.3413, -0.0850,  0.2234,  0.2235, -0.3954,  0.2543],
        [-0.3179, -0.0873, -0.1658, -0.0771, -0.0811, -0.3666],
        [-0.3543, -0.0627,  0.0055, -0.1871,  0.1547, -0.3672],
        [-0.0257,  0.3578, -0.1668,  0.3703,  0.1468, -0.3687],
        [ 0.2570, -0.0463, -0.1821,  0.3253, -0.3293,  0.0440],
        [-0.0868,  0.2924,  0.1142,  0.1948,  0.1450, -0.0980],
        [-0.0849, -0.3371,  0.2211,  0.3250,  0.2788, -0.2881],
        [ 0.0194, -0.2886, -0.2249, -0.2367,  0.1388, -0.2435],
        [-0.0062,  0.0154,  0.2628, -0.3062, -0.2818, -0.2374],
        [ 0.2853, 

## Multi-GPU Toy D (Alternate)

In [368]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (20, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 10), device=gpu0)

In [369]:
encoder_model0 = nn.Sequential()
encoder_model0.add_module('W0', nn.Linear(3, 16))
encoder_model0.add_module('tanh', nn.Tanh())
encoder_model0.add_module('W1', nn.Linear(16, 3))
encoder_model0 = encoder_model0.to(gpu0)

class_model0 = nn.Sequential()
class_model0.add_module('W0', nn.Linear(6, 16))
class_model0.add_module('tanh', nn.Tanh())
class_model0.add_module('W1', nn.Linear(16, 1))
class_model0 = class_model0.to(gpu0)

In [370]:
optimizer = optim.SGD(list(encoder_model0.parameters()) + list(class_model0.parameters()), lr=0.01, momentum=0.9)

4.

In [371]:
e0 = e[:, :e.shape[1]//2].to(gpu0)
e1 = e[:, e.shape[1]//2:].to(gpu1)
x1 = x.to(gpu1)
encoder_model1 = deepcopy(encoder_model0).to(gpu1)
class_model1 = deepcopy(class_model0).to(gpu1)

5.

In [372]:
encoded_nodes0 = encoder_model0(x)
encoded_nodes1 = encoder_model1(x1)

In [610]:
(encoded_nodes0 == encoded_nodes1.to(gpu0)).all()

tensor(True, device='cuda:0')

In [611]:
start, end = e0

src = encoded_nodes0[end]
index = start.unsqueeze(-1)
in_messages = torch.zeros(encoded_nodes0.shape, dtype=src.dtype, device=encoded_nodes0.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

src = encoded_nodes0[start]
index = end.unsqueeze(-1)
out_messages = torch.zeros(encoded_nodes0.shape, dtype=src.dtype, device=encoded_nodes0.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

In [612]:
aggr_nodes0 = in_messages + out_messages

In [613]:
start, end = e1

src = encoded_nodes1[end]
index = start.unsqueeze(-1)
in_messages = torch.zeros(encoded_nodes1.shape, dtype=src.dtype, device=encoded_nodes1.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

src = encoded_nodes1[start]
index = end.unsqueeze(-1)
out_messages = torch.zeros(encoded_nodes1.shape, dtype=src.dtype, device=encoded_nodes1.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

In [614]:
aggr_nodes1 = in_messages + out_messages

In [615]:
aggr_nodes_total0 = aggr_nodes0 + aggr_nodes1.to(aggr_nodes0.device)

In [616]:
aggr_nodes_total1 = aggr_nodes_total0.to(gpu1)

In [617]:
(aggr_nodes_total0 == aggr_nodes).all()

tensor(True, device='cuda:0')

In [618]:
input_edges0 = torch.cat([aggr_nodes_total0[e0[0]], aggr_nodes_total0[e0[1]]], axis=-1)
input_edges1 = torch.cat([aggr_nodes_total1[e1[0]], aggr_nodes_total1[e1[1]]], axis=-1)

In [619]:
output0 = class_model0(input_edges0)
output1 = class_model1(input_edges1)

In [620]:
loss0 = output0.mean()
loss1 = output1.mean()

In [621]:
loss0, loss1

(tensor(-0.0223, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(-0.0731, device='cuda:1', grad_fn=<MeanBackward0>))

In [622]:
loss

tensor(-0.0477, device='cuda:0', grad_fn=<MeanBackward0>)

In [623]:
loss0.backward(retain_graph=True)
loss1.backward()

In [624]:
mean_grads(encoder_model0, encoder_model1)

In [625]:
mean_grads(class_model0, class_model1)

In [627]:
list(encoder_model0.parameters())[0].grad

tensor([[ 0.0017, -0.0006,  0.0031],
        [-0.0089, -0.0035, -0.0386],
        [-0.0014,  0.0016,  0.0043],
        [-0.0072, -0.0163,  0.0477],
        [-0.0017, -0.0010, -0.0084],
        [ 0.0053,  0.0023,  0.0062],
        [-0.0121, -0.0122, -0.0169],
        [-0.0004, -0.0010,  0.0021],
        [ 0.0018,  0.0012,  0.0045],
        [ 0.0016,  0.0008,  0.0152],
        [ 0.0010,  0.0017,  0.0098],
        [ 0.0007,  0.0004,  0.0017],
        [-0.0008,  0.0012, -0.0046],
        [-0.0114, -0.0063, -0.0419],
        [ 0.0051,  0.0003,  0.0076],
        [-0.0022, -0.0021, -0.0072]], device='cuda:0')

In [628]:
optimizer.step()

In [629]:
multi_gpu_encoder_model = deepcopy(encoder_model0)
multi_gpu_class_model = deepcopy(class_model0)

In [630]:
optimizer.zero_grad()

In [631]:
compare_models(single_gpu_encoder_model, multi_gpu_encoder_model)

(tensor(True, device='cuda:0'),
 [tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0')])

In [632]:
compare_models(single_gpu_class_model, multi_gpu_class_model)

(tensor(True, device='cuda:0'),
 [tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0'),
  tensor(True, device='cuda:0')])

# Realistic Multi-GPU Training

In [5]:
def scatter_add_nodes(encoded_nodes, edge_list):
    start, end = edge_list

    src = encoded_nodes[end]
    index = start.unsqueeze(-1)
    in_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 

    src = encoded_nodes[start]
    index = end.unsqueeze(-1)
    out_messages = torch.zeros(encoded_nodes.shape, dtype=src.dtype, device=encoded_nodes.device).scatter_add(0, index.repeat((1,src.shape[1])), src) 
    
    aggr_nodes = in_messages + out_messages
    
    return aggr_nodes

## Single-GPU Case

In [449]:
n_graph_iters = 1

In [450]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (10, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 100), device=gpu0)

In [451]:
encoder_model = nn.Sequential()
encoder_model.add_module('W0', nn.Linear(3, 16))
encoder_model.add_module('tanh', nn.Tanh())
encoder_model.add_module('W1', nn.Linear(16, 3))
encoder_model = encoder_model.to(gpu0)

node_model = nn.Sequential()
node_model.add_module('W0', nn.Linear(3, 16))
node_model.add_module('tanh', nn.Tanh())
node_model.add_module('W1', nn.Linear(16, 3))
node_model = node_model.to(gpu0)

edge_model = nn.Sequential()
edge_model.add_module('W0', nn.Linear(6, 16))
edge_model.add_module('tanh', nn.Tanh())
edge_model.add_module('W1', nn.Linear(16, 1))
edge_model = edge_model.to(gpu0)

In [452]:
optimizer = optim.SGD(list(encoder_model.parameters()) + list(node_model.parameters()) + list(edge_model.parameters()), lr=0.01, momentum=0.9)

In [453]:
encoded_nodes = encoder_model(x)

In [454]:
for i in range(n_graph_iters):
    scattered_nodes = scatter_add_nodes(encoded_nodes, e)
    encoded_nodes = node_model(scattered_nodes)

In [455]:
input_edges = torch.cat([encoded_nodes[e[0]], encoded_nodes[e[1]]], axis=-1)

In [456]:
output = edge_model(input_edges)

In [457]:
# The "loss" is the average of the size of each vector
loss = output.mean()

In [458]:
loss

tensor(-0.0632, device='cuda:0', grad_fn=<MeanBackward0>)

In [459]:
loss.backward()

In [460]:
optimizer.step()

In [461]:
single_gpu_encoder_model = deepcopy(encoder_model)
single_gpu_node_model = deepcopy(node_model)
single_gpu_edge_model = deepcopy(edge_model)

## Multi-GPU Case

In [464]:
n_graph_iters = 1

In [465]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (10, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 100), device=gpu0)

In [466]:
encoder_model0 = nn.Sequential()
encoder_model0.add_module('W0', nn.Linear(3, 16))
encoder_model0.add_module('tanh', nn.Tanh())
encoder_model0.add_module('W1', nn.Linear(16, 3))
encoder_model0 = encoder_model0.to(gpu0)

node_model0 = nn.Sequential()
node_model0.add_module('W0', nn.Linear(3, 16))
node_model0.add_module('tanh', nn.Tanh())
node_model0.add_module('W1', nn.Linear(16, 3))
node_model0 = node_model0.to(gpu0)

edge_model0 = nn.Sequential()
edge_model0.add_module('W0', nn.Linear(6, 16))
edge_model0.add_module('tanh', nn.Tanh())
edge_model0.add_module('W1', nn.Linear(16, 1))
edge_model0 = edge_model0.to(gpu0)

In [467]:
optimizer = optim.SGD(list(encoder_model0.parameters()) + list(node_model0.parameters()) + list(edge_model0.parameters()), lr=0.01, momentum=0.9)

In [468]:
e0 = e[:, :e.shape[1]//2].to(gpu0)
e1 = e[:, e.shape[1]//2:].to(gpu1)
x1 = x.to(gpu1)

In [469]:
encoder_model1 = deepcopy(encoder_model0).to(gpu1)
node_model1 = deepcopy(node_model0).to(gpu1)
edge_model1 = deepcopy(edge_model0).to(gpu1)

5.

In [470]:
encoded_nodes0 = encoder_model0(x)
encoded_nodes1 = encoder_model1(x1)

In [471]:
(encoded_nodes0 == encoded_nodes1.to(gpu0)).all()

tensor(True, device='cuda:0')

In [472]:
for i in range(n_graph_iters):
    scattered_nodes0 = scatter_add_nodes(encoded_nodes0, e0)
    scattered_nodes1 = scatter_add_nodes(encoded_nodes1, e1)

    scattered_nodes_total0 = scattered_nodes0 + scattered_nodes1.to(scattered_nodes0.device)
    encoded_nodes0 = node_model0(scattered_nodes_total0)
    encoded_nodes1 = encoded_nodes0.to(gpu1)
#     encoded_nodes1 = node_model1(scattered_nodes_total0.to(gpu1))

In [473]:
input_edges0 = torch.cat([encoded_nodes0[e0[0]], encoded_nodes0[e0[1]]], axis=-1)
input_edges1 = torch.cat([encoded_nodes1[e1[0]], encoded_nodes1[e1[1]]], axis=-1)

In [474]:
output0 = edge_model0(input_edges0)
output1 = edge_model1(input_edges1)

In [475]:
loss0 = output0.mean()
loss1 = output1.mean()

In [476]:
loss0, loss1

(tensor(-0.0635, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(-0.0629, device='cuda:1', grad_fn=<MeanBackward0>))

In [477]:
loss

tensor(-0.0632, device='cuda:0', grad_fn=<MeanBackward0>)

In [478]:
loss0.backward(retain_graph=True)
loss1.backward()

In [486]:
compare_models(encoder_model0, encoder_model)

(tensor(False, device='cuda:0'), tensor(True, device='cuda:0'))

In [485]:
mean_grads(encoder_model0, encoder_model1)

In [495]:
compare_models(node_model0, node_model)

(tensor(False, device='cuda:0'), tensor(True, device='cuda:0'))

In [492]:
mean_grads(node_model0, node_model1)

In [499]:
mean_grads(edge_model0, edge_model1)

In [502]:
compare_models(edge_model0, edge_model)

(tensor(True, device='cuda:0'), tensor(True, device='cuda:0'))

In [501]:
optimizer.step()

In [503]:
multi_gpu_encoder_model = deepcopy(encoder_model0)
multi_gpu_node_model = deepcopy(node_model0)
multi_gpu_edge_model = deepcopy(edge_model0)

In [504]:
compare_models(single_gpu_encoder_model, multi_gpu_encoder_model)

(tensor(True, device='cuda:0'), True)

In [505]:
compare_models(single_gpu_node_model, multi_gpu_node_model)

(tensor(True, device='cuda:0'), True)

In [506]:
compare_models(single_gpu_edge_model, multi_gpu_edge_model)

(tensor(True, device='cuda:0'), True)

# Attention Multi-GPU Training

## Single-GPU Case

In [6]:
n_graph_iters = 8

In [7]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (100000, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 4000000), device=gpu0)

In [8]:
encoder_model = nn.Sequential()
encoder_model.add_module('W0', nn.Linear(3, 64))
encoder_model.add_module('tanh', nn.Tanh())
encoder_model.add_module('W1', nn.Linear(64, 3))
encoder_model = encoder_model.to(gpu0)

node_model = nn.Sequential()
node_model.add_module('W0', nn.Linear(3, 64))
node_model.add_module('tanh', nn.Tanh())
node_model.add_module('W1', nn.Linear(64, 3))
node_model = node_model.to(gpu0)

edge_model = nn.Sequential()
edge_model.add_module('W0', nn.Linear(6, 64))
edge_model.add_module('tanh', nn.Tanh())
edge_model.add_module('W1', nn.Linear(64, 1))
edge_model = edge_model.to(gpu0)

In [9]:
optimizer = optim.SGD(list(encoder_model.parameters()) + list(node_model.parameters()) + list(edge_model.parameters()), lr=0.01, momentum=0.9)

In [10]:
encoded_nodes = encoder_model(x)

In [11]:
for i in range(n_graph_iters):
    encoded_edges = edge_model(torch.cat([encoded_nodes[e[0]], encoded_nodes[e[1]]], axis=-1))    
    scattered_nodes = scatter_add_attention(encoded_nodes, encoded_edges, e)
    encoded_nodes = node_model(scattered_nodes)

In [12]:
input_edges = torch.cat([encoded_nodes[e[0]], encoded_nodes[e[1]]], axis=-1)
output = edge_model(input_edges)

In [13]:
# The "loss" is the average of the size of each vector
loss = output.mean()

In [14]:
loss

tensor(0.3759, device='cuda:0', grad_fn=<MeanBackward0>)

In [15]:
loss.backward()

In [16]:
optimizer.step()

In [17]:
single_gpu_encoder_model = deepcopy(encoder_model)
single_gpu_node_model = deepcopy(node_model)
single_gpu_edge_model = deepcopy(edge_model)

In [18]:
print("Device 0 using: {:.2f}Gb".format(torch.cuda.memory_stats(device=0)["active_bytes.all.peak"] / 1024**3))

Device 0 using: 13.86Gb


In [19]:
print("Device 1 using: {:.2f}Gb".format(torch.cuda.memory_stats(device=1)["active_bytes.all.peak"] / 1024**3))

Device 1 using: 0.00Gb


## Multi-GPU Case

In [6]:
n_graph_iters = 8

In [7]:
torch.random.manual_seed(0)
x = torch.randint(0, 5, (100000, 3), device=gpu0).float()
e = torch.randint(0, len(x), (2, 4000000), device=gpu0)

In [8]:
encoder_model0 = nn.Sequential()
encoder_model0.add_module('W0', nn.Linear(3, 64))
encoder_model0.add_module('tanh', nn.Tanh())
encoder_model0.add_module('W1', nn.Linear(64, 3))
encoder_model0 = encoder_model0.to(gpu0)

node_model0 = nn.Sequential()
node_model0.add_module('W0', nn.Linear(3, 64))
node_model0.add_module('tanh', nn.Tanh())
node_model0.add_module('W1', nn.Linear(64, 3))
node_model0 = node_model0.to(gpu0)

edge_model0 = nn.Sequential()
edge_model0.add_module('W0', nn.Linear(6, 64))
edge_model0.add_module('tanh', nn.Tanh())
edge_model0.add_module('W1', nn.Linear(64, 1))
edge_model0 = edge_model0.to(gpu0)

In [9]:
optimizer = optim.SGD(list(encoder_model0.parameters()) + list(node_model0.parameters()) + list(edge_model0.parameters()), lr=0.01, momentum=0.9)

In [10]:
%%time
e0 = e[:, :e.shape[1]//2].to(gpu0)
e1 = e[:, e.shape[1]//2:].to(gpu1)
x1 = x.to(gpu1)

CPU times: user 1.88 s, sys: 499 ms, total: 2.38 s
Wall time: 2.43 s


In [11]:
encoder_model1 = deepcopy(encoder_model0).to(gpu1)
node_model1 = deepcopy(node_model0).to(gpu1)
edge_model1 = deepcopy(edge_model0).to(gpu1)

5.

In [12]:
encoded_nodes0 = encoder_model0(x)
encoded_nodes1 = encoder_model1(x1)

In [13]:
(encoded_nodes0 == encoded_nodes1.to(gpu0)).all()

tensor(True, device='cuda:0')

In [14]:
for i in range(n_graph_iters):
    
    encoded_edges0 = edge_model0(torch.cat([encoded_nodes0[e0[0]], encoded_nodes0[e0[1]]], axis=-1))    
    scattered_nodes0 = scatter_add_attention(encoded_nodes0, encoded_edges0, e0)
    
    encoded_edges1 = edge_model1(torch.cat([encoded_nodes1[e1[0]], encoded_nodes1[e1[1]]], axis=-1))    
    scattered_nodes1 = scatter_add_attention(encoded_nodes1, encoded_edges1, e1)

    scattered_nodes_total0 = scattered_nodes0 + scattered_nodes1.to(scattered_nodes0.device)
    encoded_nodes0 = node_model0(scattered_nodes_total0)
    encoded_nodes1 = encoded_nodes0.to(gpu1)
#     encoded_nodes1 = node_model1(scattered_nodes_total0.to(gpu1))

In [15]:
input_edges0 = torch.cat([encoded_nodes0[e0[0]], encoded_nodes0[e0[1]]], axis=-1)
input_edges1 = torch.cat([encoded_nodes1[e1[0]], encoded_nodes1[e1[1]]], axis=-1)

In [16]:
output0 = edge_model0(input_edges0)
output1 = edge_model1(input_edges1)

In [17]:
loss0 = output0.mean()
loss1 = output1.mean()

In [18]:
loss0, loss1

(tensor(0.3760, device='cuda:0', grad_fn=<MeanBackward0>),
 tensor(0.3759, device='cuda:1', grad_fn=<MeanBackward0>))

In [35]:
loss

tensor(0.0995, device='cuda:0', grad_fn=<MeanBackward0>)

In [19]:
loss0.backward(retain_graph=True)
loss1.backward()

In [37]:
mean_grads(encoder_model0, encoder_model1)

In [38]:
compare_models(encoder_model0, encoder_model)

(tensor(False, device='cuda:0'), tensor(True, device='cuda:0'))

In [39]:
mean_grads(node_model0, node_model1)

In [40]:
compare_models(node_model0, node_model)

(tensor(False, device='cuda:0'), tensor(True, device='cuda:0'))

In [41]:
mean_grads(edge_model0, edge_model1)

In [42]:
compare_models(edge_model0, edge_model)

(tensor(False, device='cuda:0'), tensor(True, device='cuda:0'))

In [20]:
optimizer.step()

In [21]:
multi_gpu_encoder_model = deepcopy(encoder_model0)
multi_gpu_node_model = deepcopy(node_model0)
multi_gpu_edge_model = deepcopy(edge_model0)

In [45]:
compare_models(single_gpu_encoder_model, multi_gpu_encoder_model)

(tensor(True, device='cuda:0'), True)

In [46]:
compare_models(single_gpu_node_model, multi_gpu_node_model)

(tensor(True, device='cuda:0'), True)

In [47]:
compare_models(single_gpu_edge_model, multi_gpu_edge_model)

(tensor(True, device='cuda:0'), True)

## Memory Compare

In [58]:
print("Device 0 using: {:.2f}Mb".format(torch.cuda.memory_stats(device=0)["active_bytes.all.peak"] / 1024**2))

Device 0 using: 590.15Mb


In [59]:
print("Device 1 using: {:.2f}Mb".format(torch.cuda.memory_stats(device=1)["active_bytes.all.peak"] / 1024**2))

Device 1 using: 183.02Mb


In [22]:
print("Device 0 using: {:.2f}Gb".format(torch.cuda.memory_stats(device=0)["active_bytes.all.peak"] / 1024**3))

Device 0 using: 7.08Gb


In [23]:
print("Device 1 using: {:.2f}Gb".format(torch.cuda.memory_stats(device=1)["active_bytes.all.peak"] / 1024**3))

Device 1 using: 6.85Gb


# Filter Gradient Play

## Dataset

In [3]:
input_dir = "/global/cscratch1/sd/danieltm/ExaTrkX/trackml-codalab/embedding_processed/1_pt_cut_endcaps_unweighted/train/"
num_events = 20

In [5]:
all_events = os.listdir(input_dir)
all_events = sorted([os.path.join(input_dir, event) for event in all_events])
loaded_events = []
for event in all_events[:num_events]:
    try:
        loaded_event = torch.load(event, map_location=torch.device('cpu'))
        loaded_events.append(loaded_event)
    except:
        pass

## Model

In [9]:
class VanillaFilter(nn.module):

    def __init__(self, hparams):
        super().__init__(hparams)
        '''
        Initialise the Lightning Module that can scan over different filter training regimes
        '''

        # Construct the MLP architecture
        self.input_layer = Linear(hparams["in_channels"]*2 + hparams["emb_channels"]*2, hparams["hidden"])
        layers = [Linear(hparams["hidden"], hparams["hidden"]) for _ in range(hparams["nb_layer"]-1)]
        self.layers = nn.ModuleList(layers)
        self.output_layer = nn.Linear(hparams["hidden"], 1)
        self.layernorm = nn.LayerNorm(hparams["hidden"])
        self.batchnorm = nn.BatchNorm1d(num_features=hparams["hidden"], track_running_stats=False)
        self.act = nn.Tanh()

    def forward(self, x, e, emb=None):
        if emb is not None:
            x = self.input_layer(torch.cat([x[e[0]], emb[e[0]], x[e[1]], emb[e[1]]], dim=-1))
        else:
            x = self.input_layer(torch.cat([x[e[0]], x[e[1]]], dim=-1))
        for l in self.layers:
            x = l(x)
            x = self.act(x)
            if self.hparams["layernorm"]: x = self.layernorm(x) #Option of LayerNorm
            if self.hparams["batchnorm"]: x = self.batchnorm(x) #Option of Batch
        x = self.output_layer(x)
        return x

AttributeError: module 'torch.nn' has no attribute 'module'

## Training Step

In [None]:
for batch in loaded_events:
    