In [1]:
import torch

In [2]:
# Show the installed PyTorch version. A bare expression is only echoed when it
# is the last statement of a cell, so it has to be printed explicitly here.
print(torch.__version__)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# List every CUDA device visible to PyTorch (the loop body never runs when
# device_count() is 0).
n_cudas = torch.cuda.device_count()
for i in range(n_cudas):
    print(torch.cuda.get_device_name(i))

NVIDIA GeForce RTX 3060
NVIDIA GeForce RTX 3060


## Tensor Operations

#### Create Tensor

In [3]:
# Build tensors of increasing rank (0-D up to 3-D), all placed on `device`.
scalar = torch.tensor(3.14, device=device)
vector = torch.tensor([1, 2, 3], device=device)
matrix = torch.tensor(
    [[1, 2, 3],
     [4, 5, 6]],
    device=device,
)
tensor = torch.tensor(
    [
        [[1, 2, 3], [4, 5, 6]],
        [[7, 8, 9], [0, 1, 2]],
    ],
    device=device,
)

# One print call, one tensor per line.
print(scalar, vector, matrix, tensor, sep='\n')

tensor(3.1400, device='cuda:0')
tensor([1, 2, 3], device='cuda:0')
tensor([[1, 2, 3],
        [4, 5, 6]], device='cuda:0')
tensor([[[1, 2, 3],
         [4, 5, 6]],

        [[7, 8, 9],
         [0, 1, 2]]], device='cuda:0')


In [4]:
# The size() method and the shape attribute report the same torch.Size.
(tensor.size(), tensor.shape)

(torch.Size([2, 2, 3]), torch.Size([2, 2, 3]))

#### Reshape Tensor

In [7]:
# Reshape with view().
# Beware: view() only returns a tensor with the desired shape that shares the
# same underlying data with the original tensor - the data is NOT copied!
# Passing -1 lets PyTorch infer that dimension from the number of elements.
same_matrix = matrix.view(1, -1)
print(same_matrix, matrix, sep='\n')

tensor([[1, 2, 3, 4, 5, 6]], device='cuda:0')
tensor([[1, 2, 3],
        [4, 5, 6]], device='cuda:0')


In [8]:
# Write through the view, then print both tensors: `matrix` shows the new
# value too, because the view and the original share the same data.
same_matrix[0, 0] = 100
print(same_matrix, matrix, sep='\n')

tensor([[100,   2,   3,   4,   5,   6]], device='cuda:0')
tensor([[100,   2,   3],
        [  4,   5,   6]], device='cuda:0')


#### Copy Tensor

In [9]:
# For an independent copy (instead of a view), chain clone() and detach().
other_tensor = same_matrix.clone().detach()
other_tensor[0, 0] = 1234  # only the copy is modified
print(same_matrix, other_tensor, sep='\n')
# Look up detach() and clone() in the PyTorch documentation for more details.

tensor([[100,   2,   3,   4,   5,   6]], device='cuda:0')
tensor([[1234,    2,    3,    4,    5,    6]], device='cuda:0')


In [12]:
# A tensor that lives on the GPU must first be moved to the CPU;
# only then can it be converted to a NumPy array.
print(other_tensor.device)
other_tensor.cpu().numpy()

cuda:0


array([[1234,    2,    3,    4,    5,    6]])

## Normal Tensor versus Trainable Tensor

- Normal Tensor: Doesn't require gradient computation.
- Trainable Tensor (Parameter/weight): Requires gradient computation.

In [14]:
# Standard way to create a trainable tensor (a parameter):
#   - requires_grad=True makes autograd track the computations it takes part in;
#   - device=... creates it directly on the target device (the GPU when available),
#     which also accelerates the computation (see "7 PyTorch Tips" on GitHub).
# The seed is set first so the random initial values are reproducible;
# b is drawn before w, and that order determines which value each one gets.
torch.manual_seed(7)
b = torch.randn(1, device=device, requires_grad=True)
w = torch.randn(1, device=device, requires_grad=True)
print(b, w)

tensor([-0.3048], device='cuda:0', requires_grad=True) tensor([-1.2870], device='cuda:0', requires_grad=True)


## Autograd

Autograd is PyTorch's *automatic differentiation package*: it automatically computes derivatives, applying the chain rule for us.

#### backward

The `backward()` method computes the gradients of a given variable (e.g. the loss) with respect to all tensors with `requires_grad=True` that were involved in its computation.

In [21]:
# Toy training data: float32 inputs and targets, created on `device`.
x_train = torch.tensor(
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    dtype=torch.float32,
    device=device,
)
y_train = torch.tensor(
    [11, 22, 33, 44, 53, 66, 77, 87, 95],
    dtype=torch.float32,
    device=device,
)

In [22]:
# Autograd in action
# Step 1 - forward pass: the model's prediction for every training input
yhat = w * x_train + b

# Step 2 - loss: mean of the squared errors between prediction and target
error = yhat - y_train
loss = error.pow(2).mean()

# Step 3 - backward pass: compute the gradients for every parameter
# with requires_grad=True
loss.backward()

In [26]:
# First line: the parameters and the values computed from them.
print(*(t.requires_grad for t in (b, w, loss, error)))
# Second line: the plain data tensors.
print(*(t.requires_grad for t in (x_train, y_train)))

True True True True
False False


#### grad

Use the `grad` attribute to check the actual values of the gradients.

In [29]:
# Inspect the gradients after backward().
# loss and error report None because they are not leaf nodes of the graph;
# only the leaf tensors b and w hold a gradient value here.
(b.grad, w.grad, loss.grad, error.grad)

  b.grad, w.grad, loss.grad, error.grad


(tensor([-121.9243], device='cuda:0'),
 tensor([-769.2258], device='cuda:0'),
 None,
 None)

* NOTE: By default, PyTorch accumulates gradients. We need to clear them out before each new gradient computation.

#### zero_

In [30]:
# Each time the gradients have been used to update the parameters,
# they need to be zeroed afterwards; zero_() resets them in place.
tuple(param.grad.zero_() for param in (b, w))

(tensor([0.], device='cuda:0'), tensor([0.], device='cuda:0'))

#### Updating Parameters