In [1]:
import torch
import numpy as np
import time

## First, let's create the NN using numpy

N = batch size, D is input dimension
H is hidden dimension, D_out is output dimension

In [16]:
N, D_in, H, D_out = 1000, 4000, 100, 10

Let's create some input and output data.

In [17]:
# 10 labels for each 64x1000 instances
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

Randomly initialize the weights.

In [18]:
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

In [5]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    # Compute the loss using RSS
    loss = np.square(y_pred-y).sum()
    if (t%100 == 0):
        print("loss at iteration ", t, ": ", loss)
    
    # Back propogate to compute the gradients with resepct to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

loss at iteration  0 :  30875644.820042342
loss at iteration  100 :  186.27551934551707
loss at iteration  200 :  0.2406082901832312
loss at iteration  300 :  0.0006639500610564422
loss at iteration  400 :  2.4600319804153245e-06


## Now we can do the same thing but using torch.

In [42]:
torch.cuda.is_available()

True

In [59]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda")

# Random numbers.
N, D_in, H, D_out = 1000, 4000, 100, 10

Create random input and output data.

In [60]:
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

Randomly init the weights

In [61]:
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate=1e-8

In [62]:
first = time.time()
for t in range(1000):
    # Forward pass 
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
#     print(y_pred.size())
#     print(y.size())
    
    # Compute loss
    loss = (y_pred - y).pow(2).sum()
    if t % 200 == 0:
        print("loss at iteration ", t , " : " , loss.item())
    
    # Backprop to compute gradients - autograd will compute the gradient of loss
    # with resepct to all tensors with requires_grad=True.
    # w1.grad and w2.grad will be tensors holding the gradient of the loss wrt to w1 w2
    loss.backward()
    
    # Update the weights using gradient descent. wrap in torch.no_grad() because 
    # weights have requires_grad=True; but we don't need to track this. 
    
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        # set the gradients back to zero 
        w1.grad.zero_()
        w2.grad.zero_()
end = time.time()
print("CPU time took was: ", end-first)

loss at iteration  0  :  2542859520.0
loss at iteration  200  :  862070.75
loss at iteration  400  :  13489.583984375
loss at iteration  600  :  9117.712890625
loss at iteration  800  :  9086.10546875
CPU time took was:  8.837102174758911


## GPU

https://stackoverflow.com/questions/53325418/pytorch-speed-comparison-gpu-slower-than-cpu

In the low dimensional problem, I was trying to solve the NN on the GPU: the GMC data was 200x13--relatively small. I was also trying to iterate over more loops because I thought this would make the problem longer and I would be able to better see the effects of GPU vs CPU. I did, but not in the way that I thought I would.

The GPU is opitmized for doing small, parallelized problems, i.e solving the gradients in a HIGH dimensional problem. In these, some of the partial derivatives are not dependent on each other so we can paralellize them. However, in a small network architecture, running ```loss.backward()``` would parallelize it, but it was not paralellizing a lot because there was not much to parallelize.

I came back to this toy data set and increased the number of dimensons. Increasing the number of dimensions increases the number of parallelization operations we need to compute. Increasing the number of iterations does not really mean anything in this problem, so iterating it over 100000 operations or whatever I did just made the problem in general slower, not how many operations I was sending to the GPU. As we can see, the CPU took much longer this time (with fewer iterations) and then GPU took much faster. 

In [57]:
dtype = torch.float
# device = torch.device("cpu")
device = torch.device("cuda")

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate=2e-8

In [58]:
first = time.time()
for t in range(1000):
    # Forward pass 
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
#     print(y_pred.size())
#     print(y.size())
    
    # Compute loss
    loss = (y_pred - y).pow(2).sum()
    if t % 200 == 0:
        print("loss at iteration ", t , " : " , loss.item())
    
    # Backprop to compute gradients - autograd will compute the gradient of loss
    # with resepct to all tensors with requires_grad=True.
    # w1.grad and w2.grad will be tensors holding the gradient of the loss wrt to w1 w2
    loss.backward()
    
    # Update the weights using gradient descent. wrap in torch.no_grad() because 
    # weights have requires_grad=True; but we don't need to track this. 
    
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        
        # set the gradients back to zero 
        w1.grad.zero_()
        w2.grad.zero_()
end = time.time()
print("GPU time took was: ", end-first)

loss at iteration  0  :  1491390592.0
loss at iteration  200  :  67001.0625
loss at iteration  400  :  8984.865234375
loss at iteration  600  :  8931.345703125
loss at iteration  800  :  8931.2744140625
GPU time took was:  0.927548885345459
