In [1]:
import torch

import numpy as np

In [14]:
# Select the compute device: the first CUDA GPU when one is available, the CPU otherwise.
dev = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Echo the selected device (the recorded output was produced on a machine with CUDA).
dev

device(type='cuda', index=0)

#### Torch "variable"


##### up to version 0.3.1
from torch.autograd import Variable  
Variable(tr)  
In older versions, any tensor involved in a computation that needed to be tracked by autograd had to be wrapped in a `Variable`.  

##### newer than 0.3.1
does not actually require the use of "Variable"  

In 0.4 the functionality of Variable was merged into the Tensor class. In modern PyTorch, you simply have to set the requires_grad attribute of the tensor to achieve the same behavior.  

My convention in these notes: a torch **variable** means a tensor that is tracked in the computational graph (i.e. one with `requires_grad=True`).

## Autograd  

automatic computation of gradients  

Specify the `requires_grad` parameter in the tensor definition.  

This tells PyTorch that we will build functions from those variables and differentiate the functions **w.r.t.** those variables.

In [4]:
# Create a tensor that autograd tracks by passing `requires_grad=True`
# at construction time.
X2 = torch.ones(3, 2, requires_grad=True)
print(X2)

# The printed repr carries `requires_grad=True`.

# Alternative: switch tracking on for an existing tensor, in place:
# var.requires_grad_()

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


When a new tensor is computed from a variable that has `requires_grad=True`, PyTorch knows we may want derivatives, so it records the operation that produced the new tensor (its `grad_fn`); this record is what the chain rule uses later.

PyTorch builds the **computation graph** this way, extending it every time we create a new tensor from a tracked one.

In [5]:
# Every operation on a tracked tensor is recorded by autograd.
Y2 = torch.add(X2, 5)
Z2 = torch.mul(Y2, Y2) + 1
t = Z2.sum()  # reduce all elements to a single number

print(Y2, Z2, t, sep="\n")

# The bookkeeping of the relations is visible in the printed reprs:
# Y2 and Z2 show an 'AddBackward' grad_fn, t shows a 'SumBackward' grad_fn.

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)
tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)
tensor(222., grad_fn=<SumBackward0>)


**fn.backward()**    
computes the partial derivatives of `fn` **w.r.t.** every leaf tensor created with `requires_grad=True`  
and stores them in `var.grad`

In [6]:
# The operations in the previous cell acted as the forward pass.

t.backward() # backpropagate starting from t and store the result in the `.grad` attribute
# After this call the derivative can be read from `X2.grad`.

X2.grad # dt/dX2 evaluated at the current values: 2 * (X2 + 5) = 12 for every element

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])

objective_fn_var.backward()   
runs backprop (i.e. computes the gradients) and stores each one in the **`.grad` attribute of the corresponding torch variable** (a torch variable being a tensor with `requires_grad=True`).   

(each stored gradient is the derivative **w.r.t.** that variable)

In [64]:
# The same mechanism works for any cascading (composed) chain of functions.

In [7]:
R2 = 1/(1+torch.exp(-Y2)) # sigmoid of Y2
# note: `torch.exp` is the torch function, so the operation is recorded by autograd
s = torch.sum(R2)

s.backward()
# NOTE: backward() adds to the value already in X2.grad instead of overwriting it.
# The recorded output is therefore the earlier 12 plus ds/dX2 = sigmoid'(6) ~ 0.0025.
X2.grad


tensor([[12.0025, 12.0025],
        [12.0025, 12.0025],
        [12.0025, 12.0025]])

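Above, backward was called on a single-valued (scalar) variable.  

### backward on a tensor-variable (not scalar):  

An additional argument (a tensor with the same shape as the output) must be passed to backward.  
It computes the partial derivatives and weights each output element by the matching element of that argument (a vector-Jacobian product); for the element-wise function used here this amounts to an element-wise multiplication with the argument.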

In [71]:
# R2 is not a scalar, so backward() needs a tensor argument of the same shape as R2.
R2 = 1/(1+torch.exp(-Y2))

# Weight for each element of R2 (all ones here).
A2 = torch.ones(R2.shape)

R2.backward(A2) # must be called with a tensor that has the same shape as R2

# This finds the derivative of 'R2' w.r.t. X2 with every element weighted by the
# matching element of 'A2'; the result is added to X2.grad.

# NOTE: X2.grad accumulates across backward() calls, so the recorded value also
# contains gradients from earlier backward() runs, not only from this cell.
X2.grad

tensor([[36.0148, 36.0148],
        [36.0148, 36.0148],
        [36.0148, 36.0148]])

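It is designed this way so that the chain rule can be cascaded through multiple functions.

x -> r -> s (say)

Schematically `s.backward(r.backward)`: the tensor passed to `r.backward(...)` plays the role of the upstream gradient $\partial s/\partial r$, and autograd multiplies it by $\partial r/\partial x$:

$$\frac{\partial s}{\partial x} = \frac{\partial s}{\partial r}\cdot\frac{\partial r}{\partial x}$$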

The .grad attribute of a Tensor that is not a "leaf" Tensor is being accessed. Its .grad attribute **won't** be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead.  

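leaf Tensor - a tensor we create directly with `requires_grad=True` (not the result of an operation). We want derivatives w.r.t. these.  
non-leaf Tensor - the result of an operation on tracked tensors (it has a `grad_fn`); its `.grad` is not populated by backward() unless `.retain_grad()` was called on it.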



Derivatives can only be found w.r.t. variables that were defined with `requires_grad=True`.

### Autodiff eg that looks like what we have been doing.

In [8]:
# data: inputs and targets are constants, so they do not need gradient tracking
X = torch.randn([20, 1])
Y = 3*X - 2  # ground-truth relation: slope 3, intercept -2

# parameters: the only tensors we differentiate with respect to
w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

# model (w and b broadcast over the 20 samples)
Y_hat = w*X + b

# loss: sum of squared errors, reduced to a single scalar
loss = torch.sum((Y_hat - Y)**2)

print(loss)

tensor(222.0467, grad_fn=<SumBackward0>)


Every tensor we want a gradient for (here `w` and `b`) must be created with `requires_grad=True`; the data tensors do not need it.

In [9]:
# Backpropagate from the scalar loss.
loss.backward()

# The gradients are now stored in each parameter's `.grad` attribute.
print(w.grad, b.grad)

tensor([-37.9472]) tensor([122.7330])


### Train w, b in a loop

In [10]:
learning_rate = 0.01

# Start both parameters from 1.0 again.
w = torch.tensor([1.0], requires_grad=True)
b = torch.tensor([1.0], requires_grad=True)

print(w.item(), b.item())

for step in range(10):
    # Draw a different X on every iteration to simulate something like batch inputs.
    X = torch.randn([20, 1])
    Y = 3 * X - 2

    # Forward pass followed by the sum-of-squared-errors loss.
    Y_hat = w * X + b
    loss = (Y_hat - Y).pow(2).sum()

    # Backward pass: the gradients are computed from the forward-pass values.
    loss.backward()

    # The bookkeeping below must not be recorded by autograd.
    with torch.no_grad():
        # Gradient-descent step, applied in place.
        w.sub_(learning_rate * w.grad)
        b.sub_(learning_rate * b.grad)

        # Reset the stored gradients in place before the next iteration.
        w.grad.zero_()
        b.grad.zero_()

    print(w.item(), b.item())

1.0 1.0
1.5878593921661377 0.10683983564376831
2.037818670272827 -0.6759647130966187
2.7846431732177734 -1.275583028793335
2.810288429260254 -1.54954993724823
2.8761017322540283 -1.7338801622390747
2.920194625854492 -1.8425291776657104
2.9628212451934814 -1.9117941856384277
2.9826712608337402 -1.9483814239501953
2.9959774017333984 -1.9701703786849976
2.997032642364502 -1.9820444583892822


**forward pass is only to get the values - which are needed in gradient calculation**  

- forward with the current parameters (updates intermediate values)
- compute gradients (uses the intermediate values)
- update parameters


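The update line looks like just another relation between tensors.  
So if we write it without `torch.no_grad()`, PyTorch treats it as another tracked operation and tries to extend the computation graph; for an in-place update of a leaf tensor that requires grad it raises a RuntimeError.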

**with torch.no_grad():** -> tells autograd not to do the bookkeeping.  
Operations inside the block are not recorded in the computation graph.  
The computations are still carried out, but they are not part of the forward pass that backward() differentiates.   


var.grad.zero_() -> make var.grad=0 (inplace)  

Why set it to 0 explicitly? Won't backward() overwrite it?  
No: by default backward() computes the gradient and **adds** it to the current value of `.grad`.   
(This makes it possible, for example, to compute the gradient for each data point separately and sum them up.)  

### speed comparison for autodiff,..  :

In [11]:
# true system : y= (3x1 + 3x2 + ....) -2 


# model : y_hat = W.X + b
# (vector input, scalar output model. parameters:W,b )

#### torch, CPU

In [12]:
%%time

learning_rate = 0.001
N = 1000000 #large number of parameters
epochs = 2000

W = torch.rand([N], requires_grad=True)
b = torch.ones([1], requires_grad=True)

for i in range(epochs):
      
    # data
    X = torch.randn([N])
    y = torch.dot( 3*torch.ones([N]), X ) - 2 
  
    # forward pass (prediction)
    y_hat = torch.dot(W, X) + b
    
    # loss
    loss = torch.sum((y_hat - y)**2)
      
    # compute gradient
    loss.backward()
  
    with torch.no_grad():
        
        # update parameters
        W -= learning_rate * W.grad
        b -= learning_rate * b.grad
    
        W.grad.zero_()
        b.grad.zero_()

print(torch.mean(W).item(), b.item()) 

nan nan
CPU times: user 1min 41s, sys: 955 ms, total: 1min 42s
Wall time: 17.1 s


#### torch, GPU

In [15]:
%%time

learning_rate = 0.001
N = 1000000 #large number of parameters
epochs = 200

# parameters in GPU
W_gpu = torch.rand([N], requires_grad=True, device=dev)
b_gpu = torch.ones([1], requires_grad=True, device=dev)

for i in range(epochs):
      
    # data in GPU    
    X_gpu = torch.randn([N], device=dev)
    # operation on gpu-variable. return will also be a gpu-variable
    y_gpu = torch.dot( 3*torch.ones([N], device=dev), X_gpu ) - 2 # returns gpu variable
  
    # forward pass
    y_hat_gpu = torch.dot(W_gpu, X_gpu) + b_gpu
    
    # loss
    loss = torch.sum((y_hat_gpu - y_gpu)**2)
  
    # compute gradient
    loss.backward()
  
    with torch.no_grad():
        # update parameters
        W_gpu -= learning_rate * W_gpu.grad
        b_gpu -= learning_rate * b_gpu.grad
    
        W_gpu.grad.zero_()
        b_gpu.grad.zero_()

print(torch.mean(W_gpu).item(), b_gpu.item()) 

nan nan
CPU times: user 304 ms, sys: 100 ms, total: 405 ms
Wall time: 403 ms


In [None]:
# Expected result once training has converged:
# every weight is learned to 3, since that is the true relation,
# and the bias is learned to -2 (its true value).

# Because all the true weights are identical,
# the mean of W is printed instead of every individual weight.

both **data and parameters** in GPU  

operation on gpu-variable. return will also be a gpu-variable.  

**all tensors taking part in an operation should be on the same device**