<a href="https://colab.research.google.com/github/lognat0704/TopGun/blob/main/7_Pytorch_tips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
import torch 
from torch import nn

### 1. Create Directly on the Target Device

In [2]:
start_time = time.time()
for _ in range(100):
  cpu_tensor = torch.ones((1000,64,64))
  gpu_tensor = cpu_tensor.cuda()

print('Total time: {:.3f}s'.format(time.time()-start_time))

Total time: 12.935s


In [3]:
start_time = time.time()
for _ in range(100):
  gpu_tensor = torch.ones((1000,64,64), device='cuda')

print('Total time: {:.3f}s'.format(time.time()-start_time))

Total time: 0.011s


### 2. Use Sequential Layers When Possible

In [10]:
class ExampleModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.input_layer = nn.Linear(input_size, hidden_size)
    self.input_activation = nn.ReLU()

    self.mid_layer = nn.Linear(hidden_size, hidden_size)
    self.mid_activation = nn.ReLU()

    self.output_layer = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    z = self.input_layer(x)
    z = self.input_activation(z)
    
    z = self.mid_layer(z)
    z = self.mid_activation(z)
    
    out = self.output_layer(z)

    return out

In [11]:
example_model = ExampleModel()
print(example_model)
print('Output shape:', example_model(torch.ones([100, 2])).shape)

ExampleModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layer): Linear(in_features=16, out_features=16, bias=True)
  (mid_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)
Output shape: torch.Size([100, 3])


In [12]:
class ExampleSequentialModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.layers = nn.Sequential(
      nn.Linear(input_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, hidden_size),
      nn.ReLU(),
      nn.Linear(hidden_size, output_size))

  def forward(self, x):
    out = self.layers(x)
    return out

In [13]:
example_seq_model = ExampleSequentialModel()
print(example_seq_model)
print('Output shape:', example_seq_model(torch.ones([100, 2])).shape)

ExampleSequentialModel(
  (layers): Sequential(
    (0): Linear(in_features=2, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=3, bias=True)
  )
)
Output shape: torch.Size([100, 3])


### 3. Don't Make Lists of Layers


In [14]:
class BadListModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.input_layer = nn.Linear(input_size, hidden_size)
    self.input_activation = nn.ReLU()

    # Fairly common when using residual layers
    self.mid_layers = []
    for _ in range(5):
      self.mid_layers.append(nn.Linear(hidden_size, hidden_size))
      self.mid_layers.append(nn.ReLU())

    self.output_layer = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    z = self.input_layer(x)
    z = self.input_activation(z)
    
    for layer in self.mid_layers:
      z = layer(z)
    
    out = self.output_layer(z)

    return out

In [15]:
bad_list_model = BadListModel()
print('Output shape:', bad_list_model(torch.ones([100, 2])).shape)

Output shape: torch.Size([100, 3])


In [16]:
bad_list_model

BadListModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)

In [17]:
gpu_input = torch.ones([100, 2], device='cuda')
gpu_bad_list_model = bad_list_model.cuda()
print('Output shape:', bad_list_model(gpu_input).shape)

RuntimeError: ignored

In [18]:
class CorrectListModel(nn.Module):
  def __init__(self):
    super().__init__()

    input_size = 2
    output_size = 3
    hidden_size = 16

    self.input_layer = nn.Linear(input_size, hidden_size)
    self.input_activation = nn.ReLU()

    # Fairly common when using residual layers
    self.mid_layers = []
    for _ in range(5):
      self.mid_layers.append(nn.Linear(hidden_size, hidden_size))
      self.mid_layers.append(nn.ReLU())
    self.mid_layers = nn.Sequential(*self.mid_layers)

    self.output_layer = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    z = self.input_layer(x)
    z = self.input_activation(z)
    z = self.mid_layers(z)
    out = self.output_layer(z)

    return out

In [20]:
correct_list_model = CorrectListModel()
correct_list_model

CorrectListModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layers): Sequential(
    (0): Linear(in_features=16, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=16, bias=True)
    (7): ReLU()
    (8): Linear(in_features=16, out_features=16, bias=True)
    (9): ReLU()
  )
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)

In [21]:
gpu_input = torch.ones([100, 2], device='cuda')
gpu_correct_list_model = correct_list_model.cuda()
print('Output shape:', correct_list_model(gpu_input).shape)

Output shape: torch.Size([100, 3])


### 4. Make Use of Distributions


In [22]:
# Setup
example_model = ExampleModel()
input_tensor = torch.rand(5, 2)
output = example_model(input_tensor)
print(output)

tensor([[ 0.2366, -0.0852, -0.2771],
        [ 0.2565, -0.0390, -0.2301],
        [ 0.2452, -0.0751, -0.2673],
        [ 0.2604, -0.0490, -0.2469],
        [ 0.2258, -0.1094, -0.3023]], grad_fn=<AddmmBackward0>)


In [23]:
from torch.distributions import Categorical
from torch.distributions.kl import kl_divergence

In [24]:
dist = Categorical(logits=output)
dist

Categorical(logits: torch.Size([5, 3]))

In [26]:
# Get probabilities
dist.probs

tensor([[0.4305, 0.3120, 0.2575],
        [0.4239, 0.3155, 0.2606],
        [0.4301, 0.3122, 0.2576],
        [0.4281, 0.3142, 0.2577],
        [0.4339, 0.3103, 0.2559]], grad_fn=<SoftmaxBackward0>)

In [27]:
# Take samples
dist.sample()

tensor([1, 2, 2, 0, 2])

In [28]:
# Calculate the KL-Divergence
dist_1 = Categorical(logits=output[0])
dist_2 = Categorical(logits=output[1])
kl_divergence(dist_1, dist_2)

tensor(8.7176e-05, grad_fn=<SumBackward1>)

### 5. Use detach() On Long-Term Metrics (detach from gradient)

In [29]:
# Setup
example_model = ExampleModel()
data_batches = [torch.rand((10, 2)) for _ in range(5)]
criterion = nn.MSELoss(reduce='mean')



In [30]:
data_batches

[tensor([[0.6664, 0.8480],
         [0.5517, 0.7500],
         [0.3905, 0.4795],
         [0.0203, 0.0076],
         [0.0342, 0.3239],
         [0.0498, 0.4571],
         [0.9083, 0.8778],
         [0.8400, 0.3951],
         [0.0853, 0.3669],
         [0.6047, 0.5367]]), tensor([[0.5449, 0.4341],
         [0.5255, 0.4515],
         [0.8894, 0.5661],
         [0.1432, 0.8134],
         [0.8033, 0.2864],
         [0.4758, 0.6736],
         [0.8030, 0.8155],
         [0.6492, 0.4487],
         [0.6512, 0.3577],
         [0.3095, 0.5880]]), tensor([[0.3538, 0.3726],
         [0.2964, 0.4321],
         [0.4899, 0.4250],
         [0.7985, 0.5057],
         [0.5507, 0.0346],
         [0.0793, 0.7434],
         [0.7791, 0.1451],
         [0.5251, 0.0108],
         [0.0390, 0.6371],
         [0.6535, 0.2493]]), tensor([[0.4968, 0.8196],
         [0.1775, 0.9922],
         [0.9447, 0.2378],
         [0.9809, 0.2310],
         [0.9258, 0.9089],
         [0.7418, 0.6368],
         [0.4436, 0.6105]

### Bad Example

In [31]:
losses = []

# Training loop
for batch in data_batches:
  output = example_model(batch)

  target = torch.rand((10, 3))
  loss = criterion(output, target)
  losses.append(loss)

  # Optimization happens here

print(losses)

[tensor(0.4776, grad_fn=<MseLossBackward0>), tensor(0.5256, grad_fn=<MseLossBackward0>), tensor(0.4602, grad_fn=<MseLossBackward0>), tensor(0.6642, grad_fn=<MseLossBackward0>), tensor(0.5915, grad_fn=<MseLossBackward0>)]


### Better Example

In [34]:
losses = []

# Training loop
for batch in data_batches:
  output = example_model(batch)

  target = torch.rand((10, 3))
  loss = criterion(output, target)
  losses.append(loss.item()) # Or `loss.detach()`

  # Optimization happens here

print(losses)

[0.649771511554718, 0.5891002416610718, 0.47328534722328186, 0.43510550260543823, 0.42641639709472656]


### 6. Trick to Delete a Model from GPU

In [35]:
import gc # garbage collection

In [36]:
example_model = ExampleModel().cuda()

del example_model

gc.collect()
# The model will normally stay on the cache until something takes it's place
torch.cuda.empty_cache()

In [37]:
example_model = ExampleModel()

# Do training

example_model.eval()

# Do testing

example_model.train()

# Do training again

ExampleModel(
  (input_layer): Linear(in_features=2, out_features=16, bias=True)
  (input_activation): ReLU()
  (mid_layer): Linear(in_features=16, out_features=16, bias=True)
  (mid_activation): ReLU()
  (output_layer): Linear(in_features=16, out_features=3, bias=True)
)

### Affects
- Dropout
- Batch Normalization
- RNNs
- Lazy Variants
