In [52]:
import torch
import torch.nn as nn
import numpy as np

# 1 - Tensor Basics

In [23]:
# Everything in pytorch is based on Tensor operations.
# A tensor can have different dimensions so it can be 1d, 2d, or even 3d and higher (scalar, vector, matrix, tensor)

# torch.empty(size): uninitiallized
x = torch.empty(1) # scalar
print(x)
x = torch.empty(3) # vector, 1D
print(x)
x = torch.empty(2,3) # matrix, 2D
print(x)
x = torch.empty(2,2,3) # tensor, 3 dimensions
print(x)
x = torch.empty(2,2,2,3) # tensor, 4 dimensions
print(x)

tensor([6.7062e+22])
tensor([0., 0., 0.])
tensor([[9.2755e-39, 1.0561e-38, 7.1633e-39],
        [9.0919e-39, 8.4490e-39, 9.6429e-39]])
tensor([[[0., 0., 0.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [0., 0., 0.]]])
tensor([[[[9.2755e-39, 1.0561e-38, 7.1633e-39],
          [9.0919e-39, 8.4490e-39, 9.6429e-39]],

         [[9.6429e-39, 1.0194e-38, 9.1837e-39],
          [4.6837e-39, 9.9184e-39, 9.0000e-39]]],


        [[[1.0561e-38, 1.0653e-38, 4.1327e-39],
          [8.9082e-39, 9.8265e-39, 9.4592e-39]],

         [[1.0561e-38, 1.0286e-38, 1.0929e-38],
          [1.0102e-38, 4.5918e-39, 4.6837e-39]]]])


In [24]:
# torch.rand(size): random numbers [0, 1]
x = torch.rand(5, 3)
print(x)

tensor([[0.3448, 0.5563, 0.6852],
        [0.4843, 0.1852, 0.4757],
        [0.2946, 0.2348, 0.9744],
        [0.7820, 0.5614, 0.6313],
        [0.2734, 0.9907, 0.0563]])


In [25]:
# torch.zeros(size), fill with 0
# torch.ones(size), fill with 1
x, y = torch.zeros(5, 3), torch.ones(5, 2)
print(x, y)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]) tensor([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]])


In [26]:
# check size
print(x.size())

# check data type
print(x.dtype)

torch.Size([5, 3])
torch.float32


In [27]:
# specify types, torch.float32 default, can also do torch.int, torch.double
x = torch.zeros(5, 3, dtype=torch.float16)
print(x)

# check type
print(x.dtype)


tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]], dtype=torch.float16)
torch.float16


In [29]:
# construct from data
x = torch.tensor([5.5, 3])
print(x, x.size())

tensor([5.5000, 3.0000]) torch.Size([2])


In [30]:
# requires_grad argument
# This will tell pytorch that it will need to calculate the gradients for this tensor
# later in your optimization steps
# i.e. this is a variable in your model that you want to optimize
x = torch.tensor([5.5, 3], requires_grad=True)

In [31]:
# Operations
y = torch.rand(2, 2)
x = torch.rand(2, 2)

# elementwise addition
z = x + y
torch.add(x,y) # does same as above

# in-place addition, everything with a trailing underscore is an in-place operation
# i.e. it will modify the variable
y.add_(x)

# subtraction
z = x - y
z = torch.sub(x, y)

# multiplication
z = x * y
z = torch.mul(x,y)
y.mul_(x)

# division
z = x / y
z = torch.div(x,y)


In [32]:
# Slicing
x = torch.rand(5,3)
print(x)
print(x[:, 0]) # all rows, column 0
print(x[1, :]) # row 1, all columns
print(x[1,1]) # element at 1, 1

# Get the actual value if only 1 element in your tensor
print(x[1,1].item())

tensor([[0.4553, 0.9628, 0.3184],
        [0.1778, 0.8979, 0.0676],
        [0.7388, 0.0814, 0.9793],
        [0.9709, 0.9634, 0.7002],
        [0.3897, 0.5502, 0.5409]])
tensor([0.4553, 0.1778, 0.7388, 0.9709, 0.3897])
tensor([0.1778, 0.8979, 0.0676])
tensor(0.8979)
0.8978679776191711


In [33]:
# Reshape with torch.view()
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
# if -1 it pytorch will automatically determine the necessary size
print(x.size(), y.size(), z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


In [35]:
# Numpy
# Converting a Torch Tensor to a NumPy array and vice versa is very easy
a = torch.ones(5)
print(a)

# torch to numpy with .numpy()
b = a.numpy()
print(b)
print(type(b))
# Carful: If the Tensor is on the CPU (not the GPU),
# both objects will share the same memory location, so changing one
# will also change the other
a.add_(1)
print(a)
print(b)

# numpy to torch with .from_numpy(x)
a = np.ones(5)
b = torch.from_numpy(a)
print(a)
print(b)
# again be careful when modifying
a += 1
print(a)
print(b)

tensor([1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1.]
<class 'numpy.ndarray'>
tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]
[1. 1. 1. 1. 1.]
tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


In [None]:
# by default all tensors are created on the CPU,
# but you can also move them to the GPU (only if it's available )
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    # z = z.numpy() # not possible because numpy cannot handle GPU tenors
    # move to CPU again
    z.to("cpu")       # ``.to`` can also change dtype together!
    # z = z.numpy()

# 2 - Gradient Calculation With Autograd

In [42]:
# The autograd package provides automatic differentiation 
# for all operations on Tensors

# requires_grad = True -> tracks all operations on the tensor. 
x = torch.randn(3, requires_grad=True)

In [43]:
# whenever operations are done with tensor, a computational graph is created (each operation is a node with inputs and output)
y = x + 2

# y was created as a result of an operation, so it has a grad_fn attribute.
# grad_fn: references a Function that has created the Tensor used to calculate gradients in backpropagation
print(x) # created by the user -> grad_fn is None
print(y)
print(y.grad_fn)

# image below is one node of the graph after doing the first operation above - gradient function depends on the type of operation done

tensor([-1.2056, -0.4835, -1.1400], requires_grad=True)
tensor([0.7944, 1.5165, 0.8600], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x0000021CB0C3D8C8>


<img src="autograd.png">

In [44]:
# Do more operations on y
z = y * y * 2
print(z)
z = z.mean()
print(z)

tensor([1.2621, 4.5994, 1.4793], grad_fn=<MulBackward0>)
tensor(2.4469, grad_fn=<MeanBackward0>)


In [45]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient for this tensor will be accumulated into .grad attribute.
# It is the partial derivate of the function w.r.t. the tensor

z.backward() # calculates dz/dx - vector-Jacobian product
print(x.grad) # where gradients are stored

tensor([1.0592, 2.0220, 1.1467])


In [46]:
# Generally speaking, torch.autograd is an engine for computing vector-Jacobian product
# It computes partial derivates while applying the chain rule

# -------------
# Model with non-scalar output:
# If a Tensor is non-scalar (more than 1 elements), we need to specify arguments for backward() 
# specify a gradient argument that is a tensor of matching shape.
# needed for vector-Jacobian product

x = torch.randn(3, requires_grad=True)

y = x * 2
for _ in range(10):
    y = y * 2

print(y)
print(y.shape)

v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(v)
print(x.grad)

tensor([ 3143.2620, -2902.8093,   472.6248], grad_fn=<MulBackward0>)
torch.Size([3])
tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


In [47]:
# -------------
# Stop a tensor from tracking history - 3 options show below:
# For example during our training loop when we want to update our weights
# then this update operation should not be part of the gradient computation
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

# .requires_grad_(...) changes an existing flag in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = ((a * 3) / (a - 1))
print(b.grad_fn)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

# .detach(): get a new Tensor with the same content but no gradient computation:
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

# wrap in 'with torch.no_grad():'
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

False
None
True
<SumBackward0 object at 0x0000021CB0C3DA48>
True
False
True
False


In [48]:
# -------------
# backward() accumulates the gradient for this tensor into .grad attribute.
# !!! We need to be careful during optimization !!!
# Use .zero_() to empty the gradients before a new optimization step!
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # just a dummy example
    model_output = (weights*3).sum()
    model_output.backward()
    
    print(weights.grad)

    # optimize model, i.e. adjust weights...
    with torch.no_grad():
        weights -= 0.1 * weights.grad

    # this is important! It affects the final weights & output
    weights.grad.zero_()

print(weights)
print(model_output)

# Optimizer has zero_grad() method
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)


# 3 - Backpropagation Theory

See Backpropagation.pdf

In [49]:
x = torch.tensor(1.0)
y = torch.tensor(2.0)

# This is the parameter we want to optimize -> requires_grad=True
w = torch.tensor(1.0, requires_grad=True)

# forward pass to compute loss
y_predicted = w * x
loss = (y_predicted - y)**2
print(loss)

# backward pass to compute gradient dLoss/dw
loss.backward()
print(w.grad)

# update weights
# next forward and backward pass...

# continue optimizing:
# update weights, this operation should not be part of the computational graph
with torch.no_grad():
    w -= 0.01 * w.grad
# don't forget to zero the gradients
w.grad.zero_()

# next forward and backward pass...

tensor(1., grad_fn=<PowBackward0>)
tensor(-2.)


tensor(0.)

# 4 - Gradient Descent With Autograd and Backpropagation

In [50]:
# Compute every step manually first

'''
General Training Pipeline Steps:
- Prediction
- Gradient Computation
- Loss Computation
- Parameter updates
'''

# Linear regression
# f = w * x ... here, f = 2 * x
X = np.array([1, 2, 3, 4], dtype=np.float32)
Y = np.array([2, 4, 6, 8], dtype=np.float32)

w = 0.0 # same w as in f = w * x that we want to optimise

# model output
def forward(x):
    return w * x

# loss = MSE
def loss(y, y_pred):
    return ((y_pred - y)**2).mean()

# gradient
# J = MSE = 1/N * (w*x - y)**2
# dJ/dw = 1/N * 2x(w*x - y)
def gradient(x, y, y_pred):
    return np.dot(2*x, y_pred - y).mean()

print(f'Prediction before training: f(5) = {forward(5):.3f}') # 

# Training
learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # prediction (forward pass)
    y_pred = forward(X)
    # loss
    l = loss(Y, y_pred)
    # calculate gradients
    dw = gradient(X, Y, y_pred)
    # update weights
    w -= learning_rate * dw

    if epoch % 2 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
     
print(f'Prediction after training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000
epoch 1: w = 1.200, loss = 30.00000000
epoch 3: w = 1.872, loss = 0.76800019
epoch 5: w = 1.980, loss = 0.01966083
epoch 7: w = 1.997, loss = 0.00050332
epoch 9: w = 1.999, loss = 0.00001288
epoch 11: w = 2.000, loss = 0.00000033
epoch 13: w = 2.000, loss = 0.00000001
epoch 15: w = 2.000, loss = 0.00000000
epoch 17: w = 2.000, loss = 0.00000000
epoch 19: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000


In [51]:
# Now compute every step using the tool specified

'''
General Training Pipeline Steps:
- Prediction: PyTorch Model (needs to be designed) - done in section 5
- Gradient Computation: Autograd (automatic backpropagation algorithm) - done below
- Loss Computation: PyTorch Loss class (needs to be selected) - done in section 5
- Parameter updates: PyTorch Optimiser class (needs to be selected) - done in section 5
'''

# Here we replace the manually computed gradient with autograd

# Linear regression
# f = w * x 

# here : f = 2 * x
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model output
def forward(x):
    return w * x

# loss = MSE
def loss(y, y_pred):
    return ((y_pred - y)**2).mean()

print(f'Prediction before training: f(5) = {forward(5).item():.3f}')

# Training
learning_rate = 0.01
n_iters = 100

for epoch in range(n_iters):
    # prediction (forward pass)
    y_pred = forward(X)
    # loss
    l = loss(Y, y_pred)
    # calculate gradients = backward pass
    l.backward()
    # update weights: w.data = w.data - learning_rate * w.grad
    with torch.no_grad():
        w -= learning_rate * w.grad
    # zero the gradients after updating
    w.grad.zero_()

    if epoch % 10 == 0:
        print(f'epoch {epoch+1}: w = {w.item():.3f}, loss = {l.item():.8f}')

print(f'Prediction after training: f(5) = {forward(5).item():.3f}')


Prediction before training: f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 11: w = 1.665, loss = 1.16278565
epoch 21: w = 1.934, loss = 0.04506890
epoch 31: w = 1.987, loss = 0.00174685
epoch 41: w = 1.997, loss = 0.00006770
epoch 51: w = 1.999, loss = 0.00000262
epoch 61: w = 2.000, loss = 0.00000010
epoch 71: w = 2.000, loss = 0.00000000
epoch 81: w = 2.000, loss = 0.00000000
epoch 91: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000


# 7 - Training Pipeline: Model, Loss, and Optimizer

In [53]:
'''
General Training Pipeline Steps:
- Prediction: PyTorch Model (needs to be designed) - done below
- Gradient Computation: Autograd (automatic backpropagation algorithm)
- Loss Computation: PyTorch Loss class (needs to be selected) - done here
- Parameter updates: PyTorch Optimiser class (needs to be selected) - done here
'''

# 1) Design model (input, output, forward pass with different layers)
# 2) Construct loss and optimizer
# 3) Training loop
#       - Forward = compute prediction and loss
#       - Backward = compute gradients
#       - Update weights

# Linear regression
# f = w * x 

# here : f = 2 * x

# 0) Training samples
X = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
Y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# 1) Design Model: Weights to optimize and forward function
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

def forward(x):
    return w * x

print(f'Prediction before training: f(5) = {forward(5).item():.3f}')

# 2) Define loss and optimizer
learning_rate = 0.01
n_iters = 100

# callable function
loss = nn.MSELoss() # mean squared error

optimizer = torch.optim.SGD([w], lr=learning_rate)

# 3) Training loop
for epoch in range(n_iters):
    # predict = forward pass
    y_predicted = forward(X)
    # loss
    l = loss(Y, y_predicted)
    # calculate gradients = backward pass
    l.backward()
    # update weights
    optimizer.step()
    # zero the gradients after updating
    optimizer.zero_grad()

    if epoch % 10 == 0:
        print('epoch ', epoch+1, ': w = ', w, ' loss = ', l)

print(f'Prediction after training: f(5) = {forward(5).item():.3f}')

Prediction before training: f(5) = 0.000
epoch  1 : w =  tensor(0.3000, requires_grad=True)  loss =  tensor(30., grad_fn=<MseLossBackward0>)
epoch  11 : w =  tensor(1.6653, requires_grad=True)  loss =  tensor(1.1628, grad_fn=<MseLossBackward0>)
epoch  21 : w =  tensor(1.9341, requires_grad=True)  loss =  tensor(0.0451, grad_fn=<MseLossBackward0>)
epoch  31 : w =  tensor(1.9870, requires_grad=True)  loss =  tensor(0.0017, grad_fn=<MseLossBackward0>)
epoch  41 : w =  tensor(1.9974, requires_grad=True)  loss =  tensor(6.7705e-05, grad_fn=<MseLossBackward0>)
epoch  51 : w =  tensor(1.9995, requires_grad=True)  loss =  tensor(2.6244e-06, grad_fn=<MseLossBackward0>)
epoch  61 : w =  tensor(1.9999, requires_grad=True)  loss =  tensor(1.0176e-07, grad_fn=<MseLossBackward0>)
epoch  71 : w =  tensor(2.0000, requires_grad=True)  loss =  tensor(3.9742e-09, grad_fn=<MseLossBackward0>)
epoch  81 : w =  tensor(2.0000, requires_grad=True)  loss =  tensor(1.4670e-10, grad_fn=<MseLossBackward0>)
epoch  

In [55]:
# 0) Training samples, watch the shape!
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32) # both are 2D arrays
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)

n_samples, n_features = X.shape
print(f'#samples: {n_samples}, #features: {n_features}')
# 0) create a test sample
X_test = torch.tensor([5], dtype=torch.float32)

# 1) Design Model, the model has to implement the forward pass!
# Here we can use a built-in model from PyTorch
input_size = n_features
output_size = n_features

# we can call this model with samples X
model = nn.Linear(input_size, output_size) # linear model knows we want to optimise 'w' in f=wx

# to write custom model, need to extend nn.Module and define __init__, forward()... below does same as above
'''
class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        # define diferent layers
        self.lin = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.lin(x)

model = LinearRegression(input_size, output_size)
'''

print(f'Prediction before training: f(5) = {model(X_test).item():.3f}')

# 2) Define loss and optimizer
learning_rate = 0.01
n_iters = 100

loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# 3) Training loop
for epoch in range(n_iters):
    # predict = forward pass with our model
    y_predicted = model(X)

    # loss
    l = loss(Y, y_predicted)

    # calculate gradients = backward pass
    l.backward()

    # update weights
    optimizer.step()

    # zero the gradients after updating
    optimizer.zero_grad()

    if epoch % 10 == 0:
        [w, b] = model.parameters() # unpack parameters (weight and bias)
        print('epoch ', epoch+1, ': w = ', w[0][0].item(), ' loss = ', l)

print(f'Prediction after training: f(5) = {model(X_test).item():.3f}')


#samples: 4, #features: 1
Prediction before training: f(5) = -3.231
epoch  1 : w =  -0.28043633699417114  loss =  tensor(51.7713, grad_fn=<MseLossBackward0>)
epoch  11 : w =  1.3744432926177979  loss =  tensor(1.4753, grad_fn=<MseLossBackward0>)
epoch  21 : w =  1.648272156715393  loss =  tensor(0.1661, grad_fn=<MseLossBackward0>)
epoch  31 : w =  1.6997346878051758  loss =  tensor(0.1248, grad_fn=<MseLossBackward0>)
epoch  41 : w =  1.7152106761932373  loss =  tensor(0.1167, grad_fn=<MseLossBackward0>)
epoch  51 : w =  1.724685788154602  loss =  tensor(0.1099, grad_fn=<MseLossBackward0>)
epoch  61 : w =  1.7329891920089722  loss =  tensor(0.1035, grad_fn=<MseLossBackward0>)
epoch  71 : w =  1.7409038543701172  loss =  tensor(0.0975, grad_fn=<MseLossBackward0>)
epoch  81 : w =  1.7485616207122803  loss =  tensor(0.0918, grad_fn=<MseLossBackward0>)
epoch  91 : w =  1.7559895515441895  loss =  tensor(0.0865, grad_fn=<MseLossBackward0>)
Prediction after training: f(5) = 9.511
