# What is Pytorch?

A replacement for NumPy that can use the power of GPUs.


In [1]:
from __future__ import print_function
import torch

In [14]:
# torch.empty allocates a 5x3 tensor without initializing it, so its values are arbitrary.
x=torch.empty(5,3)

In [3]:
x

tensor([[ 1.4552e-21,  3.0751e-41,  0.0000e+00],
        [ 0.0000e+00, -4.3328e-35,  4.5677e-41],
        [-4.5296e+36,  4.5677e-41, -4.4019e-35],
        [ 4.5677e-41, -4.2490e-35,  4.5677e-41],
        [ 7.0584e-22,  3.0751e-41,  0.0000e+00]])

In [4]:
x=torch.rand(5,3)

In [5]:
x

tensor([[0.1777, 0.5446, 0.0469],
        [0.5779, 0.5202, 0.9370],
        [0.9177, 0.8162, 0.7820],
        [0.8408, 0.4028, 0.2253],
        [0.0675, 0.9523, 0.7218]])

In [6]:
torch.zeros(5,3,dtype=torch.long)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])

In [7]:
torch.tensor([5.5,3])

tensor([5.5000, 3.0000])

In [9]:
# new_* methods take the size of the tensor to create; the dtype is set to double here.
x= x.new_ones(5,3,dtype=torch.double) #new_* methods take in sizes
print(x)

# randn_like produces a tensor with the same size as x; the result is stored in s,
# so x itself stays the float64 tensor of ones created above.
s=torch.randn_like(x,dtype=torch.float) #override the dtype (float instead of double)
print(s)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)
tensor([[ 0.8694, -0.4120, -0.1938],
        [-0.3051, -0.6027, -0.5006],
        [ 1.3078, -0.2332,  0.5984],
        [ 0.8961, -0.5389, -1.5267],
        [ 0.0659,  2.3417, -0.1924]])


In [10]:
print(x.size())

torch.Size([5, 3])


torch.Size is in fact a tuple, so it supports all tuple operations.

In [15]:
# Match x's dtype explicitly: on a fresh top-to-bottom run x is float64 at this
# point, and adding it to a default float32 tensor raises an error on PyTorch
# versions without type promotion.
y=torch.rand(5,3,dtype=x.dtype)

In [16]:
print(x+y)

tensor([[ 3.2954e-01,  2.8301e-01,  8.3632e-01],
        [ 9.2236e-01,  5.9283e-01,  6.3654e-01],
        [-4.5296e+36,  5.7739e-01, -4.4936e+36],
        [ 7.4549e-01,  2.7822e-01,  4.7793e-01],
        [ 2.4357e-01,  7.7802e-01,  6.7406e-01]])


In [18]:
# view returns the same 16 elements under a different shape.
x=torch.randn(4,4)
y=x.view(16)  # all 16 elements in a single dimension
z=x.view(-1,8)  # the size given as -1 is inferred from the other dimensions (2 here)
print(x.size() ,y.size(),z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


In [20]:
print(x)

tensor([[-2.1746,  0.1789,  1.0476,  0.1406],
        [ 0.2584, -0.1482, -0.8971,  1.3815],
        [-1.7731, -0.2380, -1.0885, -1.5369],
        [ 0.2517, -0.5297,  1.0092, -1.0603]])


In [22]:
x=torch.randn(1) # Only one element tensors can be converted to python scalars
print(x.item())

2.25146412849


# Numpy Bridge

Converting a Torch Tensor to a NumPy array and vice versa.
Both will share their underlying memory locations, so changing one will change the other.

In [23]:
a=torch.ones(5)
print(a)

tensor([1., 1., 1., 1., 1.])


In [24]:
b=a.numpy()
print(b)

[1. 1. 1. 1. 1.]


In [25]:
a.add_(1)  # trailing underscore: the addition is done in place on a
print(a)
print(b)  # b came from a.numpy(); it shows the same updated values as a

tensor([2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2.]


Converting numpy to torch

In [26]:
import numpy as np
a=np.ones(5)
b=torch.from_numpy(a)  # build tensor b from the NumPy array a
np.add(a,1,out=a)  # add 1 and write the result back into a (out=a)
print(a)
print(b)  # b shows the updated values as well

[2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2.], dtype=torch.float64)


# Cuda Tensors

Tensors can be moved from one device to another

In [27]:
# The body of this cell runs only if CUDA is available.
# torch.device objects are used to move tensors in and out of the GPU.
if torch.cuda.is_available():
    device=torch.device("cuda")  # a CUDA device object
    y=torch.ones_like(x,device=device)  # create a tensor directly on the device
    x=x.to(device)  # or move an existing tensor with .to(device)
    z=x+y
    print(z)
    print(z.to("cpu",torch.double))  # .to can change the dtype in the same call

tensor([3.2515], device='cuda:0')
tensor([3.2515], dtype=torch.float64)


# Autograd: Automatic Differentiation

Central to all neural networks in PyTorch is the autograd package.

The autograd package provides automatic differentiation for all operations on tensors.

# Tensor

torch.Tensor is the central class of the package. If you set its attribute .requires_grad as True, it starts to track all operations on it. When you finish your computation you can call .backward() and have all the gradients computed automatically. The gradient for this tensor will be accumulated into .grad attribute.

To stop a tensor from tracking history, you can call .detach() to detach it from the computation history, and to prevent future computation from being tracked.

To prevent tracking history (and using memory), you can also wrap the code block in with torch.no_grad():. This can be particularly helpful when evaluating a model because the model may have trainable parameters with requires_grad=True, but for which we don’t need the gradients.

There’s one more class which is very important for autograd implementation - a Function.

Tensor and Function are interconnected and build up an acyclic graph, that encodes a complete history of computation. Each tensor has a .grad_fn attribute that references a Function that has created the Tensor (except for Tensors created by the user - their grad_fn is None).

If you want to compute the derivatives, you can call .backward() on a Tensor. If Tensor is a scalar (i.e. it holds a one element data), you don’t need to specify any arguments to backward(), however if it has more elements, you need to specify a gradient argument that is a tensor of matching shape.

In [28]:
import torch

Create a tensor and set requires_grad=True to track computation with it.

In [29]:
x=torch.ones(2,2,requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)


In [31]:
y=x+2
print(y)

tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)


In [32]:
print(y.grad_fn)

<AddBackward0 object at 0x7f54865f79d0>


In [33]:
z = y * y * 3
out = z.mean()

print(z, out)

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward1>)


In [35]:
a=torch.randn(2,2)
a=((a*3)/(a-1))
print(a.requires_grad)  # False: requires_grad was not requested when a was created
a.requires_grad_(True)  # trailing underscore: changes the flag on a in place
print(a.requires_grad)
b=(a*a).sum()
print(b.grad_fn)  # b was computed from a tensor that requires grad, so it has a grad_fn

False
True
<SumBackward0 object at 0x7f54865f95d0>


# Gradients
Let's backprop now. Because out contains a single scalar, out.backward() is equivalent to out.backward(torch.tensor(1.)).

In [36]:
out.backward()

In [37]:
print(x.grad)

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


In [39]:
x=torch.randn(3,requires_grad=True)

y=x*2
# Keep doubling y until its norm reaches 1000.
# detach() returns a tensor that is cut off from the autograd graph, so the
# loop condition is evaluated without being tracked; it is the recommended
# replacement for reading the legacy `.data` attribute.
while y.detach().norm() < 1000:
    y=y*2
print(y)

tensor([  243.7509, -1229.7775,   532.8825], grad_fn=<MulBackward0>)


In [40]:
v=torch.tensor([0.1,1.0,0.0001],dtype=torch.float)

In [41]:
# y holds more than one element, so backward() needs a gradient argument of matching shape.
y.backward(v)
print(x.grad)

tensor([1.0240e+02, 1.0240e+03, 1.0240e-01])


In [44]:
# Stop autograd from tracking history

print(x.requires_grad)
print((x**2).requires_grad)

# Inside torch.no_grad() the result of an operation does not require grad,
# even though its input x does.
with torch.no_grad():
    print((x**2).requires_grad)

True
True
False


# Neural Networks
Using package torch.nn

An nn.Module contains layers, and a method forward(input) that returns the output.

# A typical training procedure for a neural network is as follows:

    -> Define the neural network that has some learnable parameters (or weights)
    -> Iterate over a dataset of inputs
    -> Process input through the network
    -> Compute the loss (how far is the output from being correct)
    -> Propagate gradients back into the network’s parameters
    -> Update the weights of the network, typically using a simple update rule:
    
    weight = weight - learning_rate * gradient


# Define a Simple NN

In [46]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [48]:
class Net(nn.Module):
    """Small convolutional network: two conv layers followed by three linear layers.

    The first linear layer expects 16*5*5 features, which is what the two
    conv + pool stages produce for a single-channel 32x32 input.
    """

    def __init__(self):
        super(Net,self).__init__()
        # Convolutions with 5x5 square kernels:
        # 1 input image channel -> 6 output channels -> 16 output channels.
        self.conv1=nn.Conv2d(1,6,5)
        self.conv2=nn.Conv2d(6,16,5)
        # Affine operations: y = Wx + b
        self.fc1 = nn.Linear(16*5*5,120)
        self.fc2 = nn.Linear(120,84)
        self.fc3 = nn.Linear(84,10)

    def forward(self,x):
        """Run a batch through both conv stages and the linear head."""
        # Stage 1: conv -> ReLU -> max pooling over a (2,2) window
        pooled = F.max_pool2d(F.relu(self.conv1(x)),(2,2))
        # Stage 2: same pattern; a single number is enough for a square window
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)),2)
        # Collapse every non-batch dimension into one feature axis
        flat = pooled.view(-1,self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self,x):
        """Return the product of all dimensions of x except the first (batch) one."""
        total = 1
        for dim in x.size()[1:]:
            total = total * dim
        return total
    

net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


You just have to define the forward function, and the backward function (where gradients are computed) is automatically defined for you using autograd. You can use any of the Tensor operations in the forward function.

The learnable parameters of a model are returned by net.parameters()

In [49]:
para =list(net.parameters())
print(len(para))
print(para[0].size()) # conv1 weight

10
torch.Size([6, 1, 5, 5])


In [50]:
# Random tensor of size (1,1,32,32) fed to the network.
# NOTE(review): the name `input` shadows Python's built-in input(); later cells reuse it.
input = torch.randn(1,1,32,32)
out = net(input)
print(out)

tensor([[ 0.1646, -0.0014, -0.0773, -0.0785, -0.0183,  0.0714,  0.0277, -0.0177,
          0.1291, -0.0978]], grad_fn=<AddmmBackward>)


Recap:

       -> torch.Tensor - A multi-dimensional array with support for autograd operations like backward(). Also holds the gradient w.r.t. the tensor.
        
        -> nn.Module - Neural network module. Convenient way of encapsulating parameters, with helpers for moving them to GPU, exporting, loading, etc.
        
        -> nn.Parameter - A kind of Tensor, that is automatically registered as a parameter when assigned as an attribute to a Module.
        
        -> autograd.Function - Implements forward and backward definitions of an autograd operation. Every Tensor operation creates at least a single Function node that connects to functions that created a Tensor and encodes its history.



# Loss Function
A loss function takes the (output, target) pair of inputs and computes a value that estimates how far away the output is from the target.

In [51]:
output=net(input)
target = torch.randn(10) #A dummy target, for example
target = target.view(1,-1) #reshape to one row so it has the same shape as output
criterion = nn.MSELoss()

loss = criterion(output,target)
print(loss)

tensor(1.1589, grad_fn=<MseLossBackward>)


# Follow the loss in backward direction, using .grad_fn
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d

      -> view -> linear -> relu -> linear -> relu -> linear
      
      -> MSELoss
      
      -> loss

In [52]:
print(loss.grad_fn) #MSELoss
print(loss.grad_fn.next_functions[0][0]) #Linear (recorded as an Addmm node)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) #first input of the Addmm node; recorded as AccumulateGrad, not ReLU

<MseLossBackward object at 0x7f54865f73d0>
<AddmmBackward object at 0x7f54865f7490>
<AccumulateGrad object at 0x7f54865f73d0>


# Backprop
To backpropagate the error all we have to do is call loss.backward(). You need to clear the existing gradients first, though, otherwise the new gradients will be accumulated into the existing ones.

In [53]:
net.zero_grad() #zeros the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad) #recorded output is None: no gradient had been stored for this parameter yet

loss.backward() #backpropagate the loss through the network

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad) #now holds one gradient value per conv1 bias element

conv1.bias.grad before backward
None
conv1.bias.grad after backward
tensor([-0.0025,  0.0155, -0.0027, -0.0059, -0.0037, -0.0029])


# Update the weights
weight = weight - learning_rate * gradient

In [54]:
learning_rate=0.01
# Plain SGD step: weight = weight - learning_rate * gradient.
# The in-place update runs under torch.no_grad() so autograd does not record it;
# this replaces direct mutation through the legacy `.data` attribute.
with torch.no_grad():
    for f in net.parameters():
        if f.grad is None:
            # No gradient stored for this parameter; nothing to apply.
            continue
        f.sub_(f.grad * learning_rate)
    

However, as you use neural networks, you want to use various different update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc. To enable this, we built a small package: torch.optim that implements all these methods. Using it is very simple:

In [55]:
import torch.optim as optim

#create your optimizer: SGD over all parameters of net
optimizer = optim.SGD(net.parameters(),lr=0.01)

#in your training loop:
optimizer.zero_grad() #zero the gradient buffers
output = net(input) #forward pass
loss = criterion(output,target) #compare the output with the target
loss.backward() #backpropagate the loss
optimizer.step() #Does the update