## PyTorch Basics

### Different types of tensors

In [8]:
import torch

# uninitialized
# scalar (held in a 1-D tensor with a single element)
x = torch.empty(1)
print("empty(1): ", x)

# vector
x = torch.empty(3)
print("empty(3): ", x)

# matrix
x = torch.empty(2, 3)
print("empty(2,3): ", x)

# 3 dimensional tensor
x = torch.empty(2, 2, 3)
print("empty(2, 2, 3): ", x)

# 4 dimensional tensor
x = torch.empty(2, 2, 3, 4)
print("empty(2, 2, 3, 4): ", x)

# initialise randomly
r = torch.rand(5, 3)
print("rand(5,3): ", r)

# zero filled
z = torch.zeros(3, 4)
print("zeros(3, 4): ", z)

# one filled
z = torch.ones(3, 4)
print("ones(3, 4): ", z)

empty(1):  tensor([5.1611e-06])
empty(3):  tensor([2.9112e-20, 3.2111e-41, 0.0000e+00])
empty(2,3):  tensor([[5.1611e-06, 4.4074e-41, 5.1611e-06],
        [4.4074e-41, 4.4842e-44, 0.0000e+00]])
empty(2, 2, 3):  tensor([[[-2.2829e-34,  3.2115e-41, -2.7622e-34],
         [ 3.2115e-41,  4.4842e-44,  0.0000e+00]],

        [[ 8.9683e-44,  0.0000e+00,  8.2908e-20],
         [ 3.2111e-41,  0.0000e+00,  2.3510e-38]]])
empty(2, 2, 3, 4):  tensor([[[[ 5.1612e-06,  4.4074e-41,  5.1612e-06,  4.4074e-41],
          [ 3.2618e-17,  4.4074e-41,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],

         [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]]],


        [[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  2.7972e-17,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+0

In [9]:
# check size
print("size(): ", z.size())  # function
print("size(0): ", z.size(0))

print("shape: ", z.shape)  # attribute
print("shape[0]: ", z.shape[0])

size():  torch.Size([3, 4])
size(0):  3
shape:  torch.Size([3, 4])
shape[0]:  3


In [10]:
# check data type
print(x.dtype)

# specify types, default float32
x = torch.rand(4, 3, dtype=torch.float16)
print(x)

# check
print(x.dtype)

torch.float32
tensor([[0.1470, 0.6235, 0.5991],
        [0.4111, 0.1787, 0.8418],
        [0.1372, 0.4185, 0.1890],
        [0.1650, 0.4082, 0.9995]], dtype=torch.float16)
torch.float16


In [11]:
# construct tensor from data
x = torch.tensor([5.5, 3, 1])
print(x, x.dtype)

tensor([5.5000, 3.0000, 1.0000]) torch.float32


In [12]:
# requires grad arg
# False by default; setting it to True tells PyTorch that gradients will need to be computed for this tensor later,
# i.e. this is a variable in the model that will be optimized
x = torch.tensor([5, 4, 5.0, 8], requires_grad = True)
print(x)

tensor([5., 4., 5., 8.], requires_grad=True)


### Arithmetic operation of tensors

In [13]:
x = torch.ones(2, 3)
y = torch.rand(2, 3)

# elementwise
s = x+y
print(s)

s = torch.add(x, y)
print(s)

tensor([[1.1778, 1.9540, 1.3053],
        [1.0840, 1.3519, 1.0847]])
tensor([[1.1778, 1.9540, 1.3053],
        [1.0840, 1.3519, 1.0847]])


In [14]:
# in-place addition: every method with a trailing underscore is an in-place operation,
# i.e. it modifies the tensor it is called on
print(y)

s = y + x
print(s)

y.add_(x)
print(y)

tensor([[0.1778, 0.9540, 0.3053],
        [0.0840, 0.3519, 0.0847]])
tensor([[1.1778, 1.9540, 1.3053],
        [1.0840, 1.3519, 1.0847]])
tensor([[1.1778, 1.9540, 1.3053],
        [1.0840, 1.3519, 1.0847]])


In [15]:
# subtraction
z = x - y

# multiplication
m = x * y
mm = torch.mul(x,y)

# division
d = x / y
dd = torch.div(x, y)

In [16]:
# slicing
x = torch.rand(5, 3)
print(x)
print("x[:, 0]", x[:, 0])
print("x[:, 1]", x[:, 1])
print("x[:, 2]", x[:, 2])

print("x[0, :]", x[0, :])

print("x[0, 0].item()", x[0, 0].item())  # only for scalars (single-element tensors)

tensor([[0.3319, 0.7285, 0.8806],
        [0.8677, 0.6126, 0.7850],
        [0.9919, 0.5969, 0.1650],
        [0.3013, 0.3135, 0.1380],
        [0.8149, 0.5411, 0.4005]])
x[:, 0] tensor([0.3319, 0.8677, 0.9919, 0.3013, 0.8149])
x[:, 1] tensor([0.7285, 0.6126, 0.5969, 0.3135, 0.5411])
x[:, 2] tensor([0.8806, 0.7850, 0.1650, 0.1380, 0.4005])
x[0, :] tensor([0.3319, 0.7285, 0.8806])
x[0, 0].item() 0.3319409489631653


In [17]:
x = torch.randn(4, 4)  # normal distribution
print(x)
y = x.view(16) # reshape
print(y)

# a dimension of -1 means: infer its size from the input to get the required shape
z= x.view(-1, 8)
print(x.shape, y.shape, z.shape)

tensor([[ 0.4251,  0.0549, -1.4495, -0.6712],
        [ 0.4987,  1.1041, -0.0975, -1.1447],
        [-0.1572,  1.3294, -2.1893, -0.8384],
        [ 0.1376,  0.1588, -0.5424, -2.2434]])
tensor([ 0.4251,  0.0549, -1.4495, -0.6712,  0.4987,  1.1041, -0.0975, -1.1447,
        -0.1572,  1.3294, -2.1893, -0.8384,  0.1376,  0.1588, -0.5424, -2.2434])
torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


In [18]:
# Converting a torch tensor to numpy array and vice versa
a = torch.ones(5)
print(a)
print(a.dtype)

# torch to numpy
b = a.numpy()
print(b)
print(b.dtype)

tensor([1., 1., 1., 1., 1.])
torch.float32
[1. 1. 1. 1. 1.]
float32


In [19]:
# Caution: If the tensor is on CPU, both will share same memory location
# i.e. changing one will change other
a.add_(3)
print(a)
print(b)

tensor([4., 4., 4., 4., 4.])
[4. 4. 4. 4. 4.]


In [20]:
# numpy to torch
import numpy as np
a = np.ones(5)

# same memory location
b = torch.from_numpy(a)

# new memory
c = torch.tensor(a)
print(a)
print(b)
print(c)

a += 1
print(a)
print(b)
print(c)

[1. 1. 1. 1. 1.]
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)


### GPU support
- By default all tensors are created on the CPU. But we can also move them to the GPU (if available), or create them directly on the GPU.

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [24]:
# move tensors to device
x = torch.rand(2, 2).to(device)

x = x.to('cpu')
# x = x.to('cuda:0')

# create directly on device, more efficient
x = torch.rand(2, 2, device=device)

### Autograd
- Autograd package provides automatic differentiation for all operations on Tensors. Generally speaking torch.autograd is an engine for computing the vector-Jacobian product. It computes partial derivatives while applying the chain rule.

- Set `requires_grad = True`. This tracks all the ops on the tensor


In [25]:
import torch

# requires_grad = True -> tracks all operations on the tensor
x = torch.randn(3, requires_grad=True)
y = x + 2

# y was created as a result of an operation, so it has a grad_fn attribute.
# grad_fn: references a Function that has created the Tensor
print(x)
print(y)
print(y.grad_fn)

tensor([-1.4129, -0.0946, -0.5641], requires_grad=True)
tensor([0.5871, 1.9054, 1.4359], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x7adb3c21d540>


In [26]:
# do more ops
z = y * y * 3
print(z)

z = z.mean()
print(z)

tensor([ 1.0340, 10.8917,  6.1855], grad_fn=<MulBackward0>)
tensor(6.0371, grad_fn=<MeanBackward0>)


In [27]:
# compute gradient with backprop
# when we finish our computation, we can call .backward() and have all gradients computed automatically
# Gradient for this tensor will be accumulated into .grad attribute.
# It is the partial derivative of the function w.r.t. the tensor

print(x.grad)
z.backward()
print(x.grad)  # dz/dx

# Caution: backward() accumulates the gradient for this tensor into the .grad attribute,
# so in a training loop reset the gradients before the next backward pass, e.g. with optimizer.zero_grad()

None
tensor([1.1742, 3.8108, 2.8718])


#### Stop tensor from tracking history
For example during training loop when we want to update our weights, or after training during evaluation. These ops shouldn't be part of the gradient computation, to prevent this we can use:
- `x.requires_grad_(False)`
- `x.detach()`
- wrap in `with torch.no_grad():`

In [28]:
# example
a = torch.randn(2, 3)
b = (a * a).sum()

print(a.requires_grad)
print(b.grad_fn)

a = torch.randn(2, 3, requires_grad=True)
b = (a * a).sum()

print(a.requires_grad)
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x7adb3c21e6e0>


In [29]:
a = torch.randn(2, 3, requires_grad=True)
b = a.detach()

print(a.requires_grad)
print(b.grad_fn)

True
None


In [30]:
a = torch.randn(2, 3, requires_grad=True)
with torch.no_grad():
  b = a**2
  print(a.requires_grad)
  print(b.requires_grad)

True
False


### Gradient Descent

In [31]:
# Linear regression
# f = w * x (no bias term in this example)
# here f = 2 * x

x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float32)
y = torch.tensor([2, 4, 6, 8, 10, 12, 14, 16], dtype=torch.float32)

w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model output
def forward(x):
  """Return the model prediction for ``x``: the input scaled by the global weight ``w``."""
  prediction = w * x
  return prediction

# loss = MSE
def loss(y, y_pred):
  """Return the mean squared error between targets ``y`` and predictions ``y_pred``."""
  residual = y_pred - y
  squared_error = residual ** 2
  return squared_error.mean()

x_test = 5.0

print(f"Pred before training: f({x_test}) = {forward(x_test).item():.3f}")

Pred before training: f(5.0) = 0.000


In [32]:
# Training
# Training: manual gradient descent on the single weight w
learning_rate = 0.01
n_epochs = 100

for epoch in range(n_epochs):
  # forward pass: predictions for the training inputs
  y_pred = forward(x)

  # loss (MSE between targets and predictions)
  l = loss(y, y_pred)

  # backward pass: computes dl/dw and accumulates it into w.grad
  l.backward()

  # update weights: w <- w - learning_rate * w.grad
  # done under no_grad so the update itself is not tracked by autograd
  with torch.no_grad():
    w -= learning_rate * w.grad


  # reset the gradient in place; otherwise the next backward() would add to it
  w.grad.zero_()

  # report progress every 10th epoch
  if (epoch+1) % 10 == 0:
    print(f"epoch {epoch+1}: w = {w.item():.3f}, loss = {l.item():.3f}")

print(f"Pred after training: f({x_test}) = {forward(x_test).item():.3f}")

epoch 10: w = 1.998, loss = 0.000
epoch 20: w = 2.000, loss = 0.000
epoch 30: w = 2.000, loss = 0.000
epoch 40: w = 2.000, loss = 0.000
epoch 50: w = 2.000, loss = 0.000
epoch 60: w = 2.000, loss = 0.000
epoch 70: w = 2.000, loss = 0.000
epoch 80: w = 2.000, loss = 0.000
epoch 90: w = 2.000, loss = 0.000
epoch 100: w = 2.000, loss = 0.000
Pred after training: f(5.0) = 10.000


### Model, Loss and Optimizer
A typical PyTorch pipeline looks like:
1. Design model (input, output, forward pass with different layers)
2. Construct loss and optimizer
3. Training loop:
  - Forward : compute prediction and loss
  - Backward: compute gradient
  - Update weights
  

In [33]:
import torch
import torch.nn as nn

# Linear Regression
# f = w * x + b (nn.Linear also learns a bias b)
# here: f = 2 * x

# 0. Data preparation, Training sample
X = torch.tensor([[1], [2], [3], [4], [5], [6], [7], [8]], dtype = torch.float32)
Y = torch.tensor([[2], [4], [6], [8], [10], [12], [14], [16]], dtype = torch.float32)

n_samples, n_features = X.shape
print(f'n_samples = {n_samples}, n_features = {n_features}')

# create a test sample
X_test = torch.tensor([5], dtype = torch.float32)

n_samples = 8, n_features = 1


In [34]:
# Design model, the model has to implement forward pass

# 1. Define Model
class LinearRegression(nn.Module):
  """Linear regression model made of a single ``nn.Linear`` layer.

  Args:
    input_dim: number of features per input sample.
    output_dim: number of values predicted per sample.
  """

  def __init__(self, input_dim, output_dim):
    super().__init__()

    # the model's only layer, stored as ``lin``
    self.lin = nn.Linear(input_dim, output_dim)

  def forward(self, x):
    """Apply the linear layer to ``x`` and return its output."""
    prediction = self.lin(x)
    return prediction

# both sizes are taken from n_features, so the model maps n_features inputs to n_features outputs
input_size, output_size = n_features, n_features

model = LinearRegression(input_size, output_size)

# prediction with the freshly initialised (untrained) parameters
print(f"Pred before training: f({X_test}) = {model(X_test).item():.3f}")

# 2. Define loss and optimizer
learning_rate = 0.01
n_epochs= 100

# mean squared error; called below as loss(prediction, target)
loss = nn.MSELoss()
# SGD optimizer over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

# 3. Training Loop
for epoch in range(n_epochs):
  # forward pass: calling the model invokes its forward() method
  y_pred = model(X)

  # loss between predictions and targets
  l = loss(y_pred, Y)

  # backward pass: computes gradients of the loss w.r.t. the model parameters
  l.backward()

  # update weights using the gradients computed by backward()
  optimizer.step()

  # zero the gradients after updating, so the next backward() does not accumulate onto them
  optimizer.zero_grad()

  # report progress every 10th epoch
  if (epoch + 1) % 10 == 0:
    # unpack the model's two parameters; w is indexed as 2-D below to read its single entry
    w, b = model.parameters()   # unpack params
    print(f'epoch {epoch+1} : w = {w[0][0].item()}, loss = {l.item()}')


print(f"Pred after training: f({X_test}) = {model(X_test).item():.3f}")

Pred before training: f(tensor([5.])) = -1.563
epoch 10 : w = 1.8202273845672607, loss = 0.20923356711864471
epoch 20 : w = 1.8285167217254639, loss = 0.19295983016490936
epoch 30 : w = 1.8352419137954712, loss = 0.17812317609786987
epoch 40 : w = 1.8417026996612549, loss = 0.16442739963531494
epoch 50 : w = 1.8479101657867432, loss = 0.15178456902503967
epoch 60 : w = 1.8538740873336792, loss = 0.1401139795780182
epoch 70 : w = 1.8596042394638062, loss = 0.12934064865112305
epoch 80 : w = 1.8651096820831299, loss = 0.1193956732749939
epoch 90 : w = 1.8703992366790771, loss = 0.11021538823843002
epoch 100 : w = 1.875481367111206, loss = 0.10174088925123215
Pred after training: f(tensor([5.])) = 10.077
