---
### numpy 기반의 net
---

In [1]:
import numpy as np

N, D_in, D_out, H = 64, 1000, 10, 100 # batch size, input_dim, output_dim, hidden_dim

# random input and output
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

print(x.shape)
print(y.shape)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

print(w1.shape)
print(w2.shape)

lr = 1e-6

for i in range(500):
    # forward
    h = x.dot(w1)
    print(h.shape)
    h_relu = np.maximum(0, h) # relu 구현
    print(h_relu.shape)
    y_pred = h_relu.dot(w2)
    print(y_pred.shape)
    
    # compute loss
    loss = np.square(y_pred - y).sum()
    
    # backprop
    grad_y_pred = 2.0*(y_pred-y)
    grad_w2 = h_relu
    break

(64, 1000)
(64, 10)
(1000, 100)
(100, 10)
(64, 100)
(64, 100)
(64, 10)


---
### torch 기반의 net (without autograd)
---

In [13]:
# -*- coding : utf-8 -*-

import torch

dtype = torch.float
device = torch.device('cpu')
# device = torch.device('cuda:0') # for gpu

# N is batch size, D_in in input dimension;
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting require_grad=False indicate that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device = device, dtype = dtype)
y = torch.randn(N, D_out, device = device, dtype = dtype)

# Create random Tensors for weights
# Setting require_grad=True indicates that we want to comput gradients with
# respect to these Tensors during the backward pass
w1 = torch.randn(D_in, H, device = device, dtype = dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device = device, dtype = dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass : compute predicted y using operations on Tensors; these
    # are exactly the sample operation we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand
    y_pred = x.mm(w1).clamp(min=0).mm(w2) # torch.dot의 경우는 1d tensor에 대해 계산. 2d 이상일 경우 torch.mm 이용
    
    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() get the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item()) # torch.Tensor.item() -> return to single scalar value
    
    
    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 repectivly.
    loss.backward()
    
    
print(w1.grad, w1.grad.size())
print(w2.grad, w2.grad.size())

99 30940392.0
199 30940392.0
299 30940392.0
399 30940392.0
499 30940392.0
tensor([[   886490.8125,    804029.2500,    883086.2500,  ...,
          -1395443.3750, -10250666.0000,  -6932557.0000],
        [ -2658750.2500,   1195687.0000,  -1717418.5000,  ...,
          -1660828.5000,   4111274.5000,   7753501.0000],
        [  -345470.1562,  -2935206.0000,  -2717213.7500,  ...,
          -5053170.5000,  -6309516.0000,  -1979812.6250],
        ...,
        [   203193.4062,   -699593.8125,   3743133.7500,  ...,
          -3153885.5000,  -1285123.7500,  -1207944.2500],
        [  -698031.1875,   2668137.2500,  -5690574.5000,  ...,
           3658498.0000,    844813.0000,   3158208.5000],
        [ -1506566.8750,   1194563.8750,  -6570561.0000,  ...,
           8516812.0000,   2967491.0000,   2627953.2500]]) torch.Size([1000, 100])
tensor([[ 9.8808e+07,  8.7400e+07, -3.7476e+07,  2.8995e+07,  1.1030e+08,
          1.0523e+08,  1.1442e+08,  6.0485e+07,  5.3675e+06, -5.1627e+06],
        [ 1.0

In [14]:
# The backward function receives the gradient of the output Tensors with respect to some scalar value, 
# and computes the gradient of the input Tensors with respect to that same scalar value

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new w1 and new w2 when executing the graph. Note that
# in TensorFlow the the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph

# nn.Module (hi-level abstraction)
## input -> output computation, hold .learnable param
## functions such as loss

---
### High-level abstraction
---

In [58]:
x = torch.randn(N, D_in, dtype = dtype, requires_grad=True)

with torch.no_grad():
    print(x.requires_grad) # no_grad()를 해도 requires_grad는 변하지 않음
    print((x**2).requires_grad) # 원 tensor에 operation을 적용하는 경우에 대해서만 grad 계산여부 설정 가능

True
False


In [59]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import copy

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, dtype = dtype)
y = torch.randn(N, D_out, dtype = dtype)


model = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
)
print(model)


loss_fn = nn.MSELoss()
lr = 0.0001

for i in range(500):
    # forward
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    
    if i% 100 == 99:
        print(i+1, loss.item())
        
    # backward
    model.zero_grad() # zero-grad before backward
    loss.backward() # 이걸 거쳐야 grad 값 계산이 됨
    
    # grad of first layer
    ## list(model.parameters()).grad
    
    # Manually update weigths using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history
    # You can also use torch.optim.SGD to achieve this.
    
    with torch.no_grad(): # don't want to compute the gradients of new weights. just update
        for param in model.parameters():
            param -= lr*param.grad
            
    ## torch.no_grad를 하는 이유 -> autograd시에 weight이외 다른 grad를 고려하지 않기 위해
            
# https://datascience.stackexchange.com/questions/32651/what-is-the-use-of-torch-no-grad-in-pytorch

Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)
99 0.9792891144752502
199 0.9679204225540161
299 0.9568312764167786
399 0.9459875822067261
499 0.9353870749473572


---
### optimizer function을 이용한 network
---

In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, dtype = dtype)
y = torch.randn(N, D_out, dtype = dtype)


model = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
)
print(model)


loss_fn = nn.MSELoss()
lr = 0.0001

opt_fn = optim.SGD(model.parameters(), lr = lr)

for i in range(1500):
    # forward
    y_pred = model(x)
    loss = loss_fn(y_pred, y)        

    # backward
    
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers(i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    
    opt_fn.zero_grad() # zero-grad before backward
    loss.backward() # 이걸 거쳐야 grad 값 계산이 됨
    
    opt_fn.step()
    
    if i %100 == 99:
        print(i+1, loss.item())

Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)
99 0.9720346927642822
199 0.9610562324523926
299 0.9502997994422913
399 0.9397655725479126
499 0.929477870464325
599 0.9194071292877197
699 0.9095223546028137
799 0.8998265266418457
899 0.8903145790100098
999 0.8809998035430908
1099 0.8718146681785583
1199 0.8627730011940002
1299 0.8539113402366638
1399 0.8452159762382507
1499 0.8366585969924927


---
### zero grad 보기
---

In [61]:
list(model.parameters())[0].grad

tensor([[ 0.0025,  0.0038,  0.0052,  ..., -0.0042, -0.0003,  0.0085],
        [-0.0042,  0.0019, -0.0017,  ...,  0.0005, -0.0038,  0.0040],
        [-0.0080,  0.0076, -0.0032,  ..., -0.0002, -0.0032,  0.0051],
        ...,
        [ 0.0053, -0.0004,  0.0047,  ..., -0.0007,  0.0027, -0.0033],
        [-0.0029,  0.0042,  0.0007,  ..., -0.0032, -0.0002,  0.0009],
        [ 0.0001, -0.0050, -0.0031,  ..., -0.0007, -0.0013, -0.0018]])

In [62]:
opt_fn.zero_grad()
list(model.parameters())[0].grad

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

---
### custom model
---

In [67]:
class customNN(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(customNN, self).__init__()
        self.ly1 = nn.Linear(D_in, H)
        self.ly2 = nn.Linear(H, D_out)
        
    def forward(self, x):
        h_relu = self.ly1(x).clamp(min=0)
        y_pred = self.ly2(h_relu)
        
        return y_pred
    
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, dtype = dtype)
y = torch.randn(N, D_out, dtype = dtype)


model = customNN(D_in, H, D_out)
print(model)

loss_fn = nn.MSELoss()
lr = 0.0001

opt_fn = optim.SGD(model.parameters(), lr = lr)

for i in range(1500):
    # forward
    y_pred = model(x)
    loss = loss_fn(y_pred, y)        

    # backward
    
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers(i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    
    opt_fn.zero_grad() # zero-grad before backward
    loss.backward() # 이걸 거쳐야 grad 값 계산이 됨
    
    opt_fn.step()
    
    if i %100 == 99:
        print(i+1, loss.item())    

customNN(
  (ly1): Linear(in_features=1000, out_features=100, bias=True)
  (ly2): Linear(in_features=100, out_features=10, bias=True)
)
99 0.962489664554596
199 0.9513875246047974
299 0.9405459761619568
399 0.9299857020378113
499 0.9196702241897583
599 0.9096024036407471
699 0.8997359871864319
799 0.8900806307792664
899 0.8805966973304749
999 0.8712757229804993
1099 0.86211097240448
1199 0.8531219363212585
1299 0.844300389289856
1399 0.8356345295906067
1499 0.8271398544311523


---
## dynamic network
---

In [75]:
import random

class dynamicNN(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(dynamicNN, self).__init__()
        
        self.input_ly = nn.Linear(D_in, H)
        self.middel_ly = nn.Linear(H, H)
        self.output_ly = nn.Linear(H, D_out)
        
    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0,1,2, or 3
        and resue the middel_linear Module that many times to compute hidden layer
        representations.
        
        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.
        
        Here we also see that it is perfectly safe to resue the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, when each Module could be used only once.
        """
        
        h_relu = self.input_ly(x).clamp(min=0)
        for i in range(random.randint(0,3)):
            print(i)
            h_relu = self.middel_ly(h_relu).clamp(min=0)  # model(x) 할 때 마다 middle_ly 개수가 달라짐 (처음 객체선언 한번이 아니라)
        y_pred = self.output_ly(h_relu)
        
        return y_pred
    
model = dynamicNN(D_in, H, D_out)
print(model)

loss_fn = nn.MSELoss()
lr = 1e-3

opt_fn = optim.SGD(model.parameters(), lr = lr)

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)


for i in range(1000):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    
    opt_fn.zero_grad()
    loss.backward()
    opt_fn.step()
    
    if i % 100 == 99:
        print(i + 1, loss.item())

dynamicNN(
  (input_ly): Linear(in_features=1000, out_features=100, bias=True)
  (middel_ly): Linear(in_features=100, out_features=100, bias=True)
  (output_ly): Linear(in_features=100, out_features=10, bias=True)
)
0
1
0
1
0
1
0
1
0
1
0
0
0
0
0
0
1
2
0
1
2
0
1
2
0
1
2
0
0
0
0
0
1
2
0
1
0
0
0
1
2
0
1
0
1
2
0
1
2
0
0
0
1
0
0
0
1
2
0
1
2
0
1
2
0
1
0
0
1
0
0
0
1
2
0
1
2
0
0
1
2
0
0
1
0
1
0
0
1
0
0
0
1
2
0
0
1
2
0
1
0
0
1
2
0
0
0
1
0
0
0
1
0
0
0
1
0
1
0
0
1
2
0
1
2
0
1
0
1
0
0
1
0
0
1
2
100 1.0636993646621704
0
1
2
0
0
0
0
0
1
2
0
1
0
0
1
0
1
2
0
1
2
0
1
0
1
2
0
1
0
1
2
0
1
2
0
1
2
0
1
0
1
0
1
0
0
0
0
0
1
0
1
2
0
0
1
2
0
0
0
1
0
1
0
1
0
1
0
1
2
0
0
1
2
0
0
0
0
1
2
0
1
2
0
1
0
1
0
0
1
2
0
0
1
2
0
0
0
1
0
1
0
0
1
0
0
0
0
1
0
1
0
1
0
0
1
0
1
0
1
0
0
1
2
0
1
0
1
0
0
0
1
2
0
0
1
200 1.0625920295715332
0
1
0
1
2
0
1
0
1
0
1
2
0
0
1
0
1
0
1
0
1
0
1
0
1
2
0
1
2
0
0
0
1
2
0
1
0
0
0
1
2
0
1
2
0
0
1
0
0
1
0
1
2
0
0
1
2
0
1
2
0
1
0
1
2
0
0
0
0
1
2
0
0
1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
0
1
0
1
2
0
1
0
0
1
