<a href="https://colab.research.google.com/github/WeizmannML/course2020/blob/master/Tutorial2/Pytorch_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### <font color=blue>PyTorch lets us use automatic differentiation and the power of GPUs</font>
#### All the code snippets in the following example are taken from https://pytorch.org/tutorials/beginner/pytorch_with_examples.html 
#### Other libraries such as TensorFlow or JAX allow us to do the same. 

In [1]:
from __future__ import print_function
import torch

In [2]:
torch.__version__

'1.4.0'

In [3]:
import os, sys
import numpy as np
# Restrict CUDA to the first physical GPU (index 0) for this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [4]:
# Pick the compute device once: the first CUDA GPU when PyTorch can use one,
# otherwise the CPU. Later cells pass this object as device=...
if torch.cuda.is_available():
    cuda_device = torch.device('cuda:0')
else:
    cuda_device = torch.device('cpu')

In [5]:
cuda_device

device(type='cpu')

#### The basic working objects are torch.Tensors 

In [6]:
# A 2x2 floating-point tensor built directly from a nested Python list.
a = torch.tensor([[1.0, -1.0], [1.0, -1.0]])

In [7]:
# torch.tensor also accepts a NumPy array; the data is copied and the
# integer dtype of the array is carried over.
np_values = np.array([[1, 2, 3], [4, 5, 6]])
b = torch.tensor(np_values)

In [8]:
# Both construction routes (nested Python list, NumPy array) give a torch.Tensor.
type(a), type(b)

(torch.Tensor, torch.Tensor)

In [9]:
# A 5x3 tensor of 64-bit integer zeros (torch.long is an alias of torch.int64).
x = torch.zeros((5, 3), dtype=torch.int64)
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [10]:
# new_* factory methods are called on an existing tensor and take the desired
# size; dtype and device are given explicitly here, so the result is a
# float64 tensor of ones on the selected device.
x = x.new_ones((5, 3), dtype=torch.float64, device=cuda_device)
print(x)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)


In [11]:
# Inspect where x lives; this only reads the attribute, it does not move x.
# Passing device=cuda_device when x was created has the same effect as moving
# it afterwards with x = x.to(cuda_device).
x.device

device(type='cpu')

In [12]:
# *_like factories reuse the shape of the given tensor. The dtype argument can
# override the source dtype; x is already float64 at this point, so here it
# simply keeps the type unchanged.
x = torch.randn_like(x, dtype=torch.double)
print(x)

tensor([[ 0.9469, -0.3019,  1.1910],
        [ 0.3945, -1.4922, -0.4882],
        [ 0.2685, -0.6423,  0.4618],
        [-2.2543,  0.5878, -0.9547],
        [ 0.7147, -0.5957, -0.9562]], dtype=torch.float64)


In [13]:
# .shape and .size() report the same torch.Size. Print them with two separate
# statements: writing `print(...), print(...)` builds a tuple of the two None
# return values, which the notebook then echoes as a meaningless (None, None).
print(x.shape)
print(x.size())

torch.Size([5, 3])
torch.Size([5, 3])


(None, None)

#### Conversion to numpy arrays

In [14]:
# NumPy can only wrap host memory, so bring the tensor to the CPU first
# and then expose it as an ndarray.
y = x.to('cpu').numpy()

In [15]:
y

array([[ 0.94689429, -0.30188594,  1.19095648],
       [ 0.39449512, -1.49220777, -0.48823568],
       [ 0.26850162, -0.64230773,  0.46181024],
       [-2.25426328,  0.5877575 , -0.95472652],
       [ 0.71467772, -0.59566856, -0.95616511]])

In [16]:
# The NumPy array keeps the float64 dtype of the tensor it was created from.
y.dtype

dtype('float64')

## The automatic differentiation of tensors through autograd

In [18]:
# Sample a 5x3 float64 tensor on the selected device and ask autograd to track
# operations on it (requires_grad=True).
# It is created directly with torch.randn: the previous new_ones + randn_like
# pair allocated a tensor of ones only to use it as a shape template, and made
# this cell depend on an `x` left over from an earlier cell.
x = torch.randn(5, 3, dtype=torch.double, device=cuda_device, requires_grad=True)
print(x)

tensor([[ 9.2821e-01,  9.1555e-01,  8.0839e-01],
        [ 2.6810e-01, -3.1011e-01, -1.0702e+00],
        [-5.1151e-04, -5.7802e-01,  1.1385e+00],
        [-1.2129e+00,  8.7644e-01,  9.9323e-01],
        [ 2.1423e+00, -1.1426e+00, -1.1780e+00]], dtype=torch.float64,
       requires_grad=True)


In [19]:
# Build a tracked tensor from NumPy data. The integer array is converted to
# float64 through the explicit dtype argument, and requires_grad=True turns on
# gradient tracking for it.
b = torch.tensor(
    np.array([[1, 2, 3], [4, 5, 6]]),
    dtype=torch.float64,
    device=cuda_device,
    requires_grad=True,
)
print(b)

tensor([[1., 2., 3.],
        [4., 5., 6.]], dtype=torch.float64, requires_grad=True)


In [20]:
# An operation on a tracked tensor produces a tracked result: the printed
# tensor carries a grad_fn describing how it was computed.
y = torch.add(x, 2)
print(y)

tensor([[2.9282, 2.9156, 2.8084],
        [2.2681, 1.6899, 0.9298],
        [1.9995, 1.4220, 3.1385],
        [0.7871, 2.8764, 2.9932],
        [4.1423, 0.8574, 0.8220]], dtype=torch.float64, grad_fn=<AddBackward0>)


In [21]:
# y was produced by an addition on a tensor that requires grad, so it holds a
# reference to the backward node for that operation (AddBackward0 in the
# recorded output).
y.grad_fn

<AddBackward0 at 0x270f64283c8>

In [22]:
# Switching gradient tracking on for an existing tensor with requires_grad_().
a = torch.randn(2, 2)
a = torch.div(a * 3, a - 1)

# Tensors are untracked by default ...
print(a.requires_grad)

# ... until tracking is enabled in place (note the trailing underscore).
a.requires_grad_(True)
print(a.requires_grad)

# Results computed from a tracked tensor record the op that produced them.
b = torch.sum(a * a)
print(b.grad_fn)

False
True
<SumBackward0 object at 0x00000270F6428148>


In [23]:
# Show the tracked tensor and the scalar computed from it. Two separate
# statements are used because `print(a), print(b)` evaluates to the tuple
# (None, None), which the notebook would echo as an extra, meaningless output.
print(a)
print(b)

tensor([[-1.8174,  0.3028],
        [ 5.9903, -2.1315]], requires_grad=True)
tensor(43.8222, grad_fn=<SumBackward0>)


(None, None)

In [31]:
# Start from a random 3-vector whose operations autograd records.
x = torch.randn(3, requires_grad=True)

# Keep doubling y until its Euclidean norm reaches 1000. Every doubling is
# recorded in the graph, so y ends up as x scaled by a power of two.
# The norm used for the loop test is taken on y.detach(): it is only a stopping
# criterion and must not become part of the graph. detach() replaces the
# legacy `.data` attribute, which bypasses autograd's safety checks.
y = x * 2
while y.detach().norm() < 1000:
    y = y * 2

print(y)

tensor([-449.6989,  447.2596, -902.5790], grad_fn=<MulBackward0>)


In [32]:
x

tensor([-0.4392,  0.4368, -0.8814], requires_grad=True)

In [33]:
# y is a vector, not a scalar, so backward() needs a tensor of the same shape
# to weight each component of y; the weighted gradient with respect to x is
# accumulated into x.grad.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)

print(x.grad)

tensor([1.0240e+02, 1.0240e+03, 1.0240e-01])


### Doing a backpropagation in the numpy way with torch tensors

In [34]:

# Two-layer ReLU network trained with hand-written gradients: forward pass,
# manually derived backward pass and plain gradient descent, using only tensor
# operations (autograd is not involved in this cell).
dtype = torch.float
device = cuda_device  # chosen earlier: GPU when available, otherwise CPU

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random starting weights for the two layers
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # ---- forward ----
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)  # ReLU: negative activations become 0
    y_pred = torch.mm(h_relu, w2)

    # Sum of squared errors, pulled out as a Python float; report every 50 steps
    loss = (y_pred - y).pow(2).sum().item()
    if (t + 1) % 50 == 0:
        print(t, loss)

    # ---- backward (chain rule, layer by layer) ----
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # The ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)  # .t() plays the role of NumPy's .T

    # ---- gradient-descent step ----
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

49 14688.6044921875
99 690.2163696289062
149 49.527565002441406
199 4.0405402183532715
249 0.3492967188358307
299 0.03148128092288971
349 0.0031730837654322386
399 0.0005099220434203744
449 0.00014596105029340833
499 6.138013850431889e-05


## Using the autograd function of pytorch to compute the derivatives & do backpropagation

In [35]:
# The same two-layer ReLU network, but the gradients now come from autograd
# instead of a hand-written backward pass.
dtype = torch.float
device = cuda_device  # chosen earlier: GPU when available, otherwise CPU

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets: requires_grad is left at its default (False) because no
# gradient with respect to the data is needed.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights: requires_grad=True asks autograd to compute d(loss)/d(weight)
# during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass through both layers. Intermediate activations need not be
    # kept by hand: autograd records what it needs for the backward pass.
    y_pred = torch.clamp(x.mm(w1), min=0).mm(w2)

    # Sum of squared errors. `loss` is a 0-dimensional (scalar) Tensor;
    # loss.item() extracts its value as a Python number.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 50 == 0:
        print(t, loss.item())

    # Backward pass: fills w1.grad and w2.grad with the gradient of the loss
    # with respect to each weight tensor.
    loss.backward()

    # Gradient-descent step. The update itself must not be recorded by
    # autograd, hence torch.no_grad(). (torch.optim.SGD would do the same.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so clear them
            # once the update has been applied.
            weight.grad.zero_()

49 12349.078125
99 369.4203796386719
149 19.788862228393555
199 1.3998385667800903
249 0.11729955673217773
299 0.011038672178983688
349 0.0013435232685878873
399 0.00027956144185736775
449 9.277441131416708e-05
499 4.3606847611954436e-05


### The PyTorch nn module contains the basic NN layers

In [36]:
# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets on the selected device
x = torch.randn(N, D_in, device=cuda_device)
y = torch.randn(N, D_out, device=cuda_device)

# nn.Sequential is a Module that holds other Modules and applies them in
# order. Each nn.Linear computes a linear function of its input and owns its
# weight and bias tensors.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# Put the parameters on the same device as the data
model.to(cuda_device)

# Ready-made loss from the nn package; reduction='sum' adds up the squared
# errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Modules are callable: passing the input tensor runs the forward pass
    y_pred = model(x)

    # The loss function returns a Tensor holding the loss; report every 50 steps
    loss = loss_fn(y_pred, y)
    if (t + 1) % 50 == 0:
        print(t, loss.item())

    # Clear the gradients left over from the previous iteration
    model.zero_grad()

    # Backward pass: computes gradients for every learnable parameter of the
    # model (they are stored as Tensors with requires_grad=True).
    loss.backward()

    # Plain gradient-descent step on each parameter, outside autograd tracking
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

49 35.213050842285156
99 3.1963510513305664
149 0.4574349820613861
199 0.0778004378080368
249 0.014499150216579437
299 0.002867157105356455
349 0.0005926850717514753
399 0.00012656324543058872
449 2.7710391805157997e-05
499 6.194936759129632e-06


### Updating the weights through PyTorch optim 

In [37]:

# Sizes: N = batch, D_in = input features, H = hidden units, D_out = outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets on the selected device
x = torch.randn(N, D_in, device=cuda_device)
y = torch.randn(N, D_out, device=cuda_device)

# Model and loss come from the nn package
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Put the parameters on the same device as the data
model.to(cuda_device)

# An Optimizer from the optim package applies the weight updates for us.
# Adam is used here; the package offers many other optimization algorithms.
# Its first argument is the collection of Tensors it is allowed to update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass
    y_pred = model(x)

    # Loss, reported every 50 steps
    loss = loss_fn(y_pred, y)
    if (t + 1) % 50 == 0:
        print(t, loss.item())

    # Gradients are accumulated (not overwritten) by each .backward() call,
    # so reset those of the optimizer's parameters before computing new ones.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model parameters
    loss.backward()

    # Let the optimizer update the parameters from their gradients
    optimizer.step()

49 191.3822479248047
99 44.102664947509766
149 6.343743801116943
199 0.6481114625930786
249 0.05472734197974205
299 0.003978633787482977
349 0.0002989043714478612
399 2.1841035049874336e-05
449 1.3204685274104122e-06
499 5.961729243608715e-08
