## 패키지

In [1]:
import numpy as np
import torch

## Computational Graph

### NumPy 

In [2]:
np.random.seed(0)

N, D = 3, 4
x = np.random.randn(N, D)
y = np.random.randn(N, D)
z = np.random.randn(N, D)

a = x * y
b = a + z
c = np.sum(b)

grad_c = 1.0
grad_b = grad_c * np.ones((N, D))
grad_a = grad_b.copy()
grad_z = grad_b.copy()
grad_x = grad_a * y
grad_y = grad_a * x

In [3]:
grad_x

array([[ 0.76103773,  0.12167502,  0.44386323,  0.33367433],
       [ 1.49407907, -0.20515826,  0.3130677 , -0.85409574],
       [-2.55298982,  0.6536186 ,  0.8644362 , -0.74216502]])

### PyTorch

In [5]:
is_cuda = torch.cuda.is_available()

device = 'cuda' if is_cuda else 'cpu'

N, D = 3, 4
x = torch.randn(N, D, requires_grad=True, device=device)
y = torch.randn(N, D, device=device)
z = torch.randn(N, D, device=device)

a = x * y
b = a + z
c = torch.sum(b)

c.backward()
print(x.grad)

tensor([[-0.2375, -0.4777, -0.3602,  0.2041],
        [ 0.5741,  0.7695,  0.0246,  1.3220],
        [-0.4703,  0.5159, -0.3740,  0.8713]])


## PyTorch

### Autograd

#### 직접 계산

In [6]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500) :
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

#### Autograd 적용

In [9]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

for t in range(500) :
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    loss.backward()
    
    with torch.no_grad() :
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()
        
w1, w2

(tensor([[-0.5696,  0.4357,  0.8706,  ..., -0.1720,  2.0008,  1.8849],
         [-1.7577, -0.5078,  0.1283,  ..., -0.0257,  0.5639, -0.4977],
         [ 1.0603, -0.6733,  0.4327,  ...,  1.7146, -0.0470,  0.0271],
         ...,
         [-0.2729, -1.0909, -0.1624,  ...,  0.0769,  0.9066, -0.4561],
         [-0.8741, -0.3212, -0.3699,  ...,  1.9232, -0.2111, -0.6045],
         [-0.3825, -0.7857,  0.4174,  ..., -0.1628, -0.7185, -0.7904]],
        requires_grad=True),
 tensor([[ 8.9947e-01,  1.2585e+00, -1.0133e+00,  2.2679e-01,  1.9468e-01,
           6.6002e-01,  2.3521e+00,  1.1097e+00, -8.9030e-01, -1.5406e+00],
         [ 1.7537e-01, -4.3232e-02,  6.4440e-01,  1.2474e+00,  8.6388e-01,
          -2.5684e-01, -1.0790e+00,  5.6755e-01, -4.7203e-01, -4.7542e-01],
         [ 3.6652e-02, -5.8028e-01, -5.9648e-01,  9.4549e-01,  7.7446e-01,
          -2.9398e-01,  3.8954e-01,  1.2850e+00,  1.1444e+00, -6.2753e-01],
         [ 4.8031e-01, -2.7198e-01,  1.0482e+00,  1.2393e+00, -6.0090e-02,
  

### New Autograd Function

In [10]:
class MyReLU(torch.autograd.Function) :
    @staticmethod
    def forward(ctx, x) :
        ctx.save_for_backward(x)
        return x.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_y) :
        x, = ctx.saved_tensors
        grad_input = grad_y.clone()
        grad_input[x < 0] = 0
        return grad_input
    
def my_relu(x) :
    return MyReLU.apply(x)

In [17]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

for t in range(500) :
    y_pred = my_relu(x.mm(w1)).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    loss.backward()
    
    with torch.no_grad() :
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

### nn (Neural Network)

In [18]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out))

learning_rate = 1e-2
for t in range(500) :
    y_pred = model(x)
    
    loss = torch.nn.functional.mse_loss(y_pred, y)
    loss.backward()
    
    with torch.no_grad() :
        for param in model.parameters() :
            param -= learning_rate * param.grad
    
    model.zero_grad()

### optim

In [19]:
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out))

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500) :
    y_pred = model(x)
    
    loss = torch.nn.functional.mse_loss(y_pred, y)
    loss.backward()
    
    optimizer.step()
    optimizer.zero_grad()

### 새로운 모델 정의

In [20]:
class TwoLayerNet(torch.nn.Module) :
    def __init__(self, D_in, H, D_out) :
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x) :
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        
        return y_pred
    
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

model = TwoLayerNet(D_in, H, D_out)

learning_rate = 1e-4
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for t in range(500) :
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

### DataLoader

In [23]:
from torch.utils.data import TensorDataset, DataLoader

EPOCH = 20
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

loader = DataLoader(TensorDataset(x, y), batch_size=8)
model = TwoLayerNet(D_in, H, D_out)

learning_rate = 1e-2
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(EPOCH) :
    for x_batch, y_batch in loader :
        y_pred = model(x_batch)
        loss = torch.nn.functional.mse_loss(y_pred, y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

### Pre-Trained Model

In [25]:
import torchvision

mobileV3 = torchvision.models.mobilenet_v3_small(pretrained=True)
mobileV3

Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to C:\Users\sosim/.cache\torch\hub\checkpoints\mobilenet_v3_small-047dcff4.pth


  0%|          | 0.00/9.83M [00:00<?, ?B/s]

MobileNetV3(
  (features): Sequential(
    (0): ConvBNActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (fc1): Conv2d(16, 8, kernel_size=(1, 1), stride=(1, 1))
          (relu): ReLU(inplace=True)
          (fc2): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
        )
        (2): ConvBNActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_r

## TensorFlow

In [26]:
import numpy as np
import tensorflow as tf

### Dynamic, Static Graph

#### v2 이전 (실행 안됨)
  - Static Graph 사용

In [27]:
N, D, H = 64, 1000, 100
x = tf.placeholder(tf.float32, shape=(N, D))
y = tf.placeholder(tf.float32, shape=(N, D))
w1 = tf.placeholder(tf.float32, shape=(D, H))
w2 = tf.placeholder(tf.float32, shape=(H, D))

h = tf.maximum(tf.matmul(x, w1), 0)
y_pred = tf.matmul(h, w2)
diff = y_pred - y
loss = tf.reduce_mean(tf.reduce_sum(diff ** 2, axis=1))

grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])
# ↑ 그래프를 쌓고
# ↓ 이후에 연산
with tf.Session() as sess :
    values = {x : np.random.randn(N, D),
              w1: np.random.randn(D, H),
              w2: np.random.randn(H, D),
              y : np.random.randn(N, D),}
    
    out = sess.run([loss, grad_w1, grad_w2], feed_dict=value)
    loss_val, graw_w1_val, grad_w2_val = out

AttributeError: module 'tensorflow' has no attribute 'placeholder'

#### v2 이후
  - Dynaminc Graph 사용

In [30]:
N, D, H = 64, 1000, 100
x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
w1 = tf.Variable(tf.random.uniform((D, H)))
w2 = tf.Variable(tf.random.uniform((H, D)))

with tf.GradientTape() as tape :
    h = tf.maximum(tf.matmul(x, w1), 0)
    y_pred = tf.matmul(h, w2)
    diff = y_pred - y
    loss = tf.reduce_mean(tf.reduce_sum(diff ** 2, axis=1))
    
gradients = tape.gradient(loss, [w1, w2])

### Optimizer

In [31]:
N, D, H = 64, 1000, 100
x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
w1 = tf.Variable(tf.random.uniform((D, H)))
w2 = tf.Variable(tf.random.uniform((H, D)))

learning_rate = 1e-6
optimizer = tf.optimizers.SGD(learning_rate)

with tf.GradientTape() as tape :
    h = tf.maximum(tf.matmul(x, w1), 0)
    y_pred = tf.matmul(h, w2)
    diff = y_pred - y
    loss = tf.reduce_mean(tf.reduce_sum(diff ** 2, axis=1))
    
gradients = tape.gradient(loss, [w1, w2])
optimizer.apply_gradients(zip(gradients, [w1, w2]))

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

### Loss

In [34]:
EPOCH = 50
N, D, H = 64, 1000, 100
x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
w1 = tf.Variable(tf.random.uniform((D, H)))
w2 = tf.Variable(tf.random.uniform((H, D)))

learning_rate = 1e-6
optimizer = tf.optimizers.SGD(learning_rate)

for t in range(EPOCH) :
    with tf.GradientTape() as tape :
        h = tf.maximum(tf.matmul(x, w1), 0)
        y_pred = tf.matmul(h, w2)
        loss = tf.losses.MeanSquaredError()(y_pred, y)

    gradients = tape.gradient(loss, [w1, w2])
    optimizer.apply_gradients(zip(gradients, [w1, w2]))

### Keras

In [35]:
EPOCH = 50
N, D, H = 64, 1000, 100
x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(H, input_shape=(D, ), activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(D))

learning_rate = 1e-6
optimizer = tf.optimizers.SGD(learning_rate)

for t in range(EPOCH) :
    with tf.GradientTape() as tape :
        y_pred = model(x)
        loss = tf.losses.MeanSquaredError()(y_pred, y)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [36]:
EPOCH = 50
N, D, H = 64, 1000, 100
x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(H, input_shape=(D, ), activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(D))

learning_rate = 1e-6
optimizer = tf.optimizers.SGD(learning_rate)

model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=optimizer)
history = model.fit(x, y, epochs=EPOCH, batch_size=N)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### `@tf.function`


In [39]:
import timeit

EPOCH = 50
N, D, H = 64, 1000, 100
x = tf.convert_to_tensor(np.random.randn(N, D), np.float32)
y = tf.convert_to_tensor(np.random.randn(N, D), np.float32)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(H, input_shape=(D, ), activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(D))

learning_rate = 1e-6
optimizer = tf.optimizers.SGD(learning_rate)

@tf.function
def model_static(x, y) :
    y_pred = model(x)
    loss = tf.losses.MeanSquaredError()(y_pred, y)
    return y_pred, loss

def model_dynamic(x, y) :
    y_pred = model(x)
    loss = tf.losses.MeanSquaredError()(y_pred, y)
    return y_pred, loss

print('Static Graph :', timeit.timeit(lambda : model_static(x, y), number=10))
print('Dynamic Graph :', timeit.timeit(lambda : model_dynamic(x, y), number=10))

Static Graph : 0.1000317999996696
Dynamic Graph : 0.018434499999784748
