Pytorch test

In [1]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.9941, 0.5941, 0.7430],
        [0.8553, 0.8699, 0.6855],
        [0.3183, 0.7150, 0.6173],
        [0.0220, 0.3078, 0.2753],
        [0.1454, 0.3344, 0.9285]])


getting started with the MNIST warmup using numpy

In [53]:
import numpy as np

# batchsize, input dim, hidden dim, output dim, 
N, D_in, H, D_out = 64, 1000, 100, 10
learning_rate = 1e-6
# creating training pairs
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# init weigths randomly
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)
print('init done')

init done


In [55]:
from tqdm import trange
for t in trange(500):
    #forward pass
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    #compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 20 == 0: print(t, loss)
    
    #backprob
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    #updating weights
    w1 -= learning_rate*grad_w1
    w2 -= learning_rate*grad_w2


  0%|          | 0/500 [00:00<?, ?it/s] 19%|█▉        | 94/500 [00:00<00:00, 939.25it/s] 36%|███▋      | 182/500 [00:00<00:00, 920.42it/s] 55%|█████▌    | 276/500 [00:00<00:00, 925.99it/s] 74%|███████▍  | 372/500 [00:00<00:00, 935.71it/s] 93%|█████████▎| 465/500 [00:00<00:00, 933.77it/s]100%|██████████| 500/500 [00:00<00:00, 937.33it/s]


0 34694364.61272139
20 278369.4902211348
40 44916.02652077355
60 11977.517762145373
80 4069.2827308097926
100 1562.3358607308571
120 644.4821039807108
140 278.5290334372529
160 124.09501950302062
180 56.53141281285318
200 26.19597977248481
220 12.305245863285485
240 5.848934493876751
260 2.8052119874610164
280 1.3549968233014769
300 0.6583601856355871
320 0.3214395814924452
340 0.1575734675475658
360 0.07749649415900485
380 0.03822246450648908
400 0.01889454885971909
420 0.009357864349720482
440 0.004642344997606677
460 0.0023059241102501276
480 0.0011467117614947237


Now implement the same using torch

In [57]:
import torch

dtype = torch.float
device = torch.device("cpu")
#device = torch.device("cuda:0")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

lr = 1e-6
for t in trange(500):
    #forward pass
    h = x.mm(w1)
    h_relu = h.clamp_min(0)
    y_pred = h_relu.mm(w2)
    
    #loss 
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)
    
    #backprob
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # update
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

  0%|          | 0/500 [00:00<?, ?it/s]  6%|▌         | 28/500 [00:00<00:01, 274.29it/s] 11%|█         | 55/500 [00:00<00:01, 271.27it/s] 17%|█▋        | 83/500 [00:00<00:01, 272.97it/s] 22%|██▏       | 111/500 [00:00<00:01, 274.17it/s] 28%|██▊       | 140/500 [00:00<00:01, 277.87it/s] 34%|███▎      | 168/500 [00:00<00:01, 278.44it/s] 39%|███▉      | 194/500 [00:00<00:01, 272.57it/s] 45%|████▍     | 224/500 [00:00<00:00, 278.63it/s] 51%|█████     | 253/500 [00:00<00:00, 281.88it/s] 57%|█████▋    | 283/500 [00:01<00:00, 287.02it/s] 62%|██████▏   | 311/500 [00:01<00:00, 282.22it/s] 68%|██████▊   | 339/500 [00:01<00:00, 277.30it/s] 73%|███████▎  | 367/500 [00:01<00:00, 273.15it/s] 79%|███████▉  | 397/500 [00:01<00:00, 279.84it/s] 85%|████████▌ | 427/500 [00:01<00:00, 283.91it/s] 91%|█████████ | 456/500 [00:01<00:00, 280.66it/s] 97%|█████████▋| 487/500 [00:01<00:00, 287.99it/s]100%|██████████| 500/500 [00:01<00:00, 281.30it/s]


0 32298590.0
1 28500458.0
2 27334844.0
3 24498364.0
4 19153478.0
5 12758679.0
6 7635847.0
7 4388805.5
8 2627796.0
9 1706109.875
10 1212900.75
11 927122.5
12 745037.0
13 617477.4375
14 521709.4375
15 446017.03125
16 384505.625
17 333537.0
18 290743.21875
19 254495.484375
20 223532.828125
21 196977.828125
22 174105.609375
23 154329.078125
24 137184.171875
25 122274.953125
26 109259.2421875
27 97854.0546875
28 87848.1875
29 79061.0625
30 71297.015625
31 64415.875
32 58306.56640625
33 52872.1875
34 48025.6796875
35 43693.4765625
36 39812.375
37 36328.53515625
38 33195.8125
39 30374.41015625
40 27828.140625
41 25527.990234375
42 23445.341796875
43 21557.919921875
44 19845.345703125
45 18290.51953125
46 16874.6015625
47 15584.19140625
48 14405.8759765625
49 13329.2705078125
50 12344.4296875
51 11442.4375
52 10616.1708984375
53 9857.501953125
54 9160.1640625
55 8518.6083984375
56 7927.7958984375
57 7383.18994140625
58 6880.59375
59 6416.5009765625
60 5987.41064453125
61 5590.5419921875
62 522

now using autograd and pytorch

In [66]:
import torch

dtype= torch.float
device = torch.device("cuda:0")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, dtype=dtype, device=device)
y = torch.randn(N, D_out, dtype=dtype, device=device)

w1 = torch.randn(D_in, H, dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=dtype, device=device, requires_grad=True)

lr = 1e-6
for t in range(500):
    y_pred = x.mm(w1).clamp_min(0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    if t % 20 == 0: print(t, loss.item())
    loss.backward()
    
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()
    

0 30814096.0
20 217258.96875
40 26810.88671875
60 5692.32861328125
80 1525.727294921875
100 457.0524597167969
120 145.91287231445312
140 48.53709411621094
160 16.615447998046875
180 5.812753677368164
200 2.0694146156311035
220 0.7475102543830872
240 0.2733345031738281
260 0.10107274353504181
280 0.03775857388973236
300 0.01431548036634922
320 0.005589090287685394
340 0.002326078712940216
360 0.0010775100672617555
380 0.0005572912632487714
400 0.0003171424032188952
420 0.00019519819761626422
440 0.00012903513561468571
460 9.0166118752677e-05
480 6.618486804654822e-05


now using torch and the nn module

In [73]:
import torch
from torch import nn

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out)
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    if t % 20 == 0: print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 683.8037719726562
20 224.12423706054688
40 66.06622314453125
60 18.799837112426758
80 6.059969902038574
100 2.1966168880462646
120 0.874790608882904
140 0.3727380931377411
160 0.1673925817012787
180 0.07821992039680481
200 0.03771229088306427
220 0.01858675666153431
240 0.009320400655269623
260 0.0047404225915670395
280 0.0024383801501244307
300 0.0012655062600970268
320 0.0006614806479774415
340 0.0003477782302070409
360 0.00018373440252617002
380 9.746326395543292e-05
400 5.187429633224383e-05
420 2.7686492103384808e-05
440 1.4817692317592446e-05
460 7.951502993819304e-06
480 4.2754272726597264e-06


now using the nn and optim modules

In [3]:
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
gpu=torch.device("cuda:0")

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in).cuda(gpu) #creates data on cpu and transfers to gpu memory
y = torch.randn(N, D_out).cuda(gpu)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

model.cuda(device=gpu)

loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = loss_fn(y_pred, y)
    if t % 50 == 0: print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    


0 659.57666015625
50 194.81143188476562
100 45.64204025268555
150 7.791614532470703
200 1.133540391921997
250 0.11608036607503891
300 0.007920362055301666
350 0.000422261276980862
400 1.7613432646612637e-05
450 5.171129942027619e-07
