# Gradient Exploration

The purpose of this notebook is to explore how gradients can be preserved on and off a GPU, accumulated *within* a batch, and be handled by multiple GPUs.

In [1]:
import os

import torch
import torch.nn as nn
from torchviz import make_dot

### TODO

- [ ] Load a test dataset
- [ ] Set up a reproducible training step
- [ ] Ensure that I understand how the backwards step works!

# Most Minimal Play

- `retain_graph` allows multiple backward passes
- The grad's are accumulated *on the leaf variables*
- `backward` is run on an output variable

## Basics

In [74]:
x = torch.rand(4, requires_grad=True)
y = torch.rand(4, requires_grad=True)

In [75]:
x, y

(tensor([0.4820, 0.1776, 0.9892, 0.9979], requires_grad=True),
 tensor([0.5668, 0.8687, 0.2601, 0.8752], requires_grad=True))

In [82]:
z = x*y

In [83]:
z.backward(torch.FloatTensor([1.0, 1.0, 1.0, 1.0]))

In [84]:
x.grad.data

tensor([1.1336, 1.7374, 0.5203, 1.7504])

In [85]:
y.grad.data

tensor([0.9639, 0.3551, 1.9784, 1.9958])

## GPU on-and-off

In [131]:
x1_cpu = torch.tensor([1., 1., 1., 1.], requires_grad=True)
y1_cpu = torch.tensor([4., 4., 4., 4.], requires_grad=True)
x2_cpu = torch.tensor([1., 1., 1., 1.], requires_grad=True)
y2_cpu = torch.tensor([3., 3., 3., 3.], requires_grad=True)

In [132]:
x1_cuda, y1_cuda = x1_cpu.to("cuda"), y1_cpu.to("cuda")

In [133]:
z1_cuda = x1_cuda * y1_cuda

In [134]:
z1_cpu = z1_cuda.to("cpu")
x1_cpu2, y1_cpu2 = x1_cuda.to("cpu"), y1_cuda.to("cpu")

In [135]:
x2_cuda, y2_cuda = x2_cpu.to("cuda"), y2_cpu.to("cuda")

In [136]:
z2_cuda = x2_cuda * y2_cuda

In [137]:
z1_cuda2 = z1_cpu.to("cuda")

In [138]:
z_total = z1_cuda2 + z2_cuda

In [139]:
del x1_cuda

In [140]:
z_total.sum().backward()

In [141]:
x1_cpu.grad

tensor([4., 4., 4., 4.])

In [142]:
y1_cpu.grad

tensor([1., 1., 1., 1.])

In [143]:
x2_cpu.grad

tensor([3., 3., 3., 3.])

In [144]:
y2_cpu.grad

tensor([1., 1., 1., 1.])

## Memory Test

In [2]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()



In [3]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [67]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [68]:
x1_size = (100000,1000)
y1_size = (100000,1000)
x1 = torch.rand(x1_size, requires_grad=True, device="cuda")
y1 = torch.rand(y1_size, requires_grad=True, device="cuda")
x2 = torch.rand(x1_size, requires_grad=True, device="cuda")
y2 = torch.rand(y1_size, requires_grad=True, device="cuda")
# x1 = torch.rand(x1_size, device="cuda")
# y1 = torch.rand(y1_size, device="cuda")

In [69]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [70]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [71]:
z = x1*y1

In [72]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [73]:
x1 = x1.to("cpu")
y1 = y1.to("cpu")

In [75]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [74]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [76]:
z.sum().backward()

In [77]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

2.6113290786743164


In [78]:
del z

In [79]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [80]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

2.23828125


In [29]:
x1.grad=None
y1.grad=None

In [30]:
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_max_memory_cached()

In [31]:
print(torch.cuda.max_memory_allocated()/ 1024**3) 

1.865234375


In [132]:
x1_cuda, y1_cuda = x1_cpu.to("cuda"), y1_cpu.to("cuda")

In [133]:
z1_cuda = x1_cuda * y1_cuda

In [134]:
z1_cpu = z1_cuda.to("cpu")
x1_cpu2, y1_cpu2 = x1_cuda.to("cpu"), y1_cuda.to("cpu")

In [135]:
x2_cuda, y2_cuda = x2_cpu.to("cuda"), y2_cpu.to("cuda")

In [136]:
z2_cuda = x2_cuda * y2_cuda

In [137]:
z1_cuda2 = z1_cpu.to("cuda")

In [138]:
z_total = z1_cuda2 + z2_cuda

# Sequential Model Play

We create a sequential NN, then feed a variable through it

In [11]:
model = nn.Sequential()

In [12]:
model.add_module('W0', nn.Linear(3, 16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16, 1))

In [13]:
x = torch.randn((1,3), requires_grad=True)

In [14]:
x

tensor([[ 1.0646,  0.7194, -0.4448]], requires_grad=True)

In [15]:
y = model(x)

In [16]:
y

tensor([[-0.1016]], grad_fn=<AddmmBackward>)

In [17]:
dict(model.named_parameters())

{'W0.weight': Parameter containing:
 tensor([[ 0.2392,  0.3530, -0.0977],
         [ 0.1958, -0.0073,  0.2408],
         [ 0.0658,  0.4778,  0.1292],
         [ 0.1938, -0.3796,  0.1065],
         [-0.1575,  0.2916,  0.3924],
         [-0.2167, -0.0704, -0.5197],
         [ 0.4216,  0.1047,  0.5400],
         [-0.4827,  0.4868, -0.5655],
         [ 0.3096, -0.3725, -0.3382],
         [ 0.0190, -0.3260, -0.0828],
         [-0.1678,  0.4821,  0.1528],
         [ 0.3671,  0.4760, -0.5092],
         [ 0.2448,  0.2161, -0.1691],
         [-0.4507,  0.2656,  0.4777],
         [-0.1823, -0.2169,  0.4688],
         [-0.2667,  0.5446,  0.2151]], requires_grad=True),
 'W0.bias': Parameter containing:
 tensor([ 0.2466, -0.5018,  0.0103,  0.1711,  0.4140, -0.0998,  0.3709, -0.3463,
          0.4880,  0.1452,  0.3080,  0.0793, -0.5400,  0.1486,  0.2751,  0.0492],
        requires_grad=True),
 'W1.weight': Parameter containing:
 tensor([[-0.1382,  0.0121,  0.1774,  0.1123, -0.0276,  0.1617,  0.0063,

Now we can backprop with a scalar (e.g. mean of y) and check that each variable accumulated a gradient!

In [18]:
y.mean().backward()

In [21]:
x.grad.data

tensor([[-0.0893,  0.0204, -0.1982]])

In [37]:
dict(model.named_parameters()).items()

dict_items([('W0.weight', Parameter containing:
tensor([[ 0.2392,  0.3530, -0.0977],
        [ 0.1958, -0.0073,  0.2408],
        [ 0.0658,  0.4778,  0.1292],
        [ 0.1938, -0.3796,  0.1065],
        [-0.1575,  0.2916,  0.3924],
        [-0.2167, -0.0704, -0.5197],
        [ 0.4216,  0.1047,  0.5400],
        [-0.4827,  0.4868, -0.5655],
        [ 0.3096, -0.3725, -0.3382],
        [ 0.0190, -0.3260, -0.0828],
        [-0.1678,  0.4821,  0.1528],
        [ 0.3671,  0.4760, -0.5092],
        [ 0.2448,  0.2161, -0.1691],
        [-0.4507,  0.2656,  0.4777],
        [-0.1823, -0.2169,  0.4688],
        [-0.2667,  0.5446,  0.2151]], requires_grad=True)), ('W0.bias', Parameter containing:
tensor([ 0.2466, -0.5018,  0.0103,  0.1711,  0.4140, -0.0998,  0.3709, -0.3463,
         0.4880,  0.1452,  0.3080,  0.0793, -0.5400,  0.1486,  0.2751,  0.0492],
       requires_grad=True)), ('W1.weight', Parameter containing:
tensor([[-0.1382,  0.0121,  0.1774,  0.1123, -0.0276,  0.1617,  0.0063,  0.20

In [38]:
for (k, v) in dict(model.named_parameters()).items():
    print(k,v.grad)

W0.weight tensor([[-0.0824, -0.0557,  0.0344],
        [ 0.0110,  0.0074, -0.0046],
        [ 0.1656,  0.1119, -0.0692],
        [ 0.1192,  0.0806, -0.0498],
        [-0.0272, -0.0183,  0.0113],
        [ 0.1683,  0.1137, -0.0703],
        [ 0.0045,  0.0030, -0.0019],
        [ 0.2013,  0.1360, -0.0841],
        [ 0.0669,  0.0452, -0.0279],
        [-0.1071, -0.0724,  0.0447],
        [-0.1608, -0.1086,  0.0672],
        [-0.0410, -0.0277,  0.0171],
        [-0.2067, -0.1397,  0.0864],
        [-0.1796, -0.1213,  0.0750],
        [-0.0441, -0.0298,  0.0184],
        [ 0.1986,  0.1342, -0.0830]])
W0.bias tensor([-0.0774,  0.0103,  0.1556,  0.1120, -0.0255,  0.1581,  0.0042,  0.1891,
         0.0628, -0.1006, -0.1510, -0.0385, -0.1942, -0.1687, -0.0414,  0.1866])
W1.weight tensor([[ 0.6633, -0.3848,  0.3510,  0.0568,  0.2743, -0.1488,  0.5750, -0.2529,
          0.6044, -0.0322,  0.3869,  0.7775, -0.0487, -0.3388, -0.2762,  0.0613]])
W1.bias tensor([1.])


In [39]:
for (k, v) in dict(model.named_parameters()).items():
    print(k,v)

W0.weight Parameter containing:
tensor([[ 0.2392,  0.3530, -0.0977],
        [ 0.1958, -0.0073,  0.2408],
        [ 0.0658,  0.4778,  0.1292],
        [ 0.1938, -0.3796,  0.1065],
        [-0.1575,  0.2916,  0.3924],
        [-0.2167, -0.0704, -0.5197],
        [ 0.4216,  0.1047,  0.5400],
        [-0.4827,  0.4868, -0.5655],
        [ 0.3096, -0.3725, -0.3382],
        [ 0.0190, -0.3260, -0.0828],
        [-0.1678,  0.4821,  0.1528],
        [ 0.3671,  0.4760, -0.5092],
        [ 0.2448,  0.2161, -0.1691],
        [-0.4507,  0.2656,  0.4777],
        [-0.1823, -0.2169,  0.4688],
        [-0.2667,  0.5446,  0.2151]], requires_grad=True)
W0.bias Parameter containing:
tensor([ 0.2466, -0.5018,  0.0103,  0.1711,  0.4140, -0.0998,  0.3709, -0.3463,
         0.4880,  0.1452,  0.3080,  0.0793, -0.5400,  0.1486,  0.2751,  0.0492],
       requires_grad=True)
W1.weight Parameter containing:
tensor([[-0.1382,  0.0121,  0.1774,  0.1123, -0.0276,  0.1617,  0.0063,  0.2020,
          0.0990, -0.100

# Filter Gradient Play

## Dataset

In [3]:
input_dir = "/global/cscratch1/sd/danieltm/ExaTrkX/trackml-codalab/embedding_processed/1_pt_cut_endcaps_unweighted/train/"
num_events = 20

In [5]:
all_events = os.listdir(input_dir)
all_events = sorted([os.path.join(input_dir, event) for event in all_events])
loaded_events = []
for event in all_events[:num_events]:
    try:
        loaded_event = torch.load(event, map_location=torch.device('cpu'))
        loaded_events.append(loaded_event)
    except:
        pass

## Model

In [9]:
class VanillaFilter(nn.module):

    def __init__(self, hparams):
        super().__init__(hparams)
        '''
        Initialise the Lightning Module that can scan over different filter training regimes
        '''

        # Construct the MLP architecture
        self.input_layer = Linear(hparams["in_channels"]*2 + hparams["emb_channels"]*2, hparams["hidden"])
        layers = [Linear(hparams["hidden"], hparams["hidden"]) for _ in range(hparams["nb_layer"]-1)]
        self.layers = nn.ModuleList(layers)
        self.output_layer = nn.Linear(hparams["hidden"], 1)
        self.layernorm = nn.LayerNorm(hparams["hidden"])
        self.batchnorm = nn.BatchNorm1d(num_features=hparams["hidden"], track_running_stats=False)
        self.act = nn.Tanh()

    def forward(self, x, e, emb=None):
        if emb is not None:
            x = self.input_layer(torch.cat([x[e[0]], emb[e[0]], x[e[1]], emb[e[1]]], dim=-1))
        else:
            x = self.input_layer(torch.cat([x[e[0]], x[e[1]]], dim=-1))
        for l in self.layers:
            x = l(x)
            x = self.act(x)
            if self.hparams["layernorm"]: x = self.layernorm(x) #Option of LayerNorm
            if self.hparams["batchnorm"]: x = self.batchnorm(x) #Option of Batch
        x = self.output_layer(x)
        return x

AttributeError: module 'torch.nn' has no attribute 'module'

## Training Step

In [None]:
for batch in loaded_events:
    