In [24]:
import sys, os
import numpy as np
from collections import OrderedDict

sys.path.append(os.pardir)
from dataset.mnist import load_mnist

def softmax(x):
  """Softmax function.

  Converts raw scores into positive values that sum to 1.

  Args:
    x: NumPy array of scores. A 2-D array is treated as a batch and is
      normalized independently along each row; any other array is
      normalized over all of its elements.

  Returns:
    Array with the same shape as ``x`` holding the softmax values.
  """
  if x.ndim == 2:
    # Subtracting each row's maximum before exponentiating prevents overflow;
    # softmax is unchanged by a constant shift of its inputs.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(shifted)  # evaluated once and reused for the normalizer
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

  shifted = x - np.max(x)  # overflow guard
  exp_x = np.exp(shifted)
  return exp_x / np.sum(exp_x)

def cross_entropy_error(y, t):
  """Mean cross-entropy error over a batch.

  Args:
    y: Predicted probabilities, either 1-D (one sample) or 2-D with one
      row per sample.
    t: Targets, either in the same layout as ``y`` (treated as one-hot
      rows) or as an array of class indices.

  Returns:
    The negative log of the probability assigned to the target class,
    averaged over the batch.
  """
  # A single sample is promoted to a batch of one.
  if y.ndim == 1:
    t = t.reshape(1, t.size)
    y = y.reshape(1, y.size)

  # When t has as many elements as y it is taken to be one-hot, so each
  # row is collapsed to the index of its largest entry.
  labels = t.argmax(axis=1) if t.size == y.size else t

  n_samples = y.shape[0]
  picked = y[np.arange(n_samples), labels]
  # The small constant keeps log() away from log(0).
  return -np.sum(np.log(picked + 1e-7)) / n_samples

def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Each element of ``x`` is temporarily overwritten in place with
    ``value + h`` and ``value - h``, ``f(x)`` is evaluated for both, and
    the original value is written back before moving on.

    Args:
        f: Callable that takes ``x`` and returns a scalar.
        x: NumPy array at which to differentiate (modified temporarily).

    Returns:
        Array shaped like ``x`` containing the estimated partial derivatives.
    """
    step = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    # The multi_index flag exposes the full (i, j, ...) index of the current
    # element, so arrays of any dimensionality are handled by one loop.
    cursor = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    for _ in cursor:
        pos = cursor.multi_index
        saved = x[pos]

        x[pos] = float(saved) + step
        f_plus = f(x)  # f(x + h)

        x[pos] = saved - step
        f_minus = f(x)  # f(x - h)

        grad[pos] = (f_plus - f_minus) / (2 * step)
        x[pos] = saved  # restore the original value
    return grad

In [25]:
class MulLayer:
  """Multiplication node of a computational graph.

  forward() multiplies its two inputs and remembers them; backward()
  uses the remembered inputs to produce the gradient for each one.
  """

  def __init__(self):
    # Operands of the most recent forward() call.
    self.x = self.y = None

  def forward(self, x, y):
    """Return ``x * y`` and cache both operands for backward()."""
    self.x, self.y = x, y
    return x * y

  def backward(self, dout):
    """Return ``(dx, dy)`` for upstream gradient ``dout``.

    The gradient with respect to each input is ``dout`` times the other
    input.
    """
    return dout * self.y, dout * self.x

# Demo: price of two apples including tax, computed with two chained MulLayers.
apple = 100
apple_num = 2
tax_rate = 1.1
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()
# Forward pass: (apple * apple_num) * tax_rate.
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax_rate)
print(price)
# Backward pass: start from d(price)/d(price) = 1 and walk the layers in
# reverse order of the forward pass.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dprice, dapple_price, dtax, dapple, dapple_num)

220.00000000000003
1 1.1 200 2.2 110.00000000000001


In [26]:
class AddLayer:
  """Addition node of a computational graph."""

  def __init__(self):
    """Create the layer; addition needs no cached state."""

  def forward(self, x, y):
    """Return the sum of the two inputs."""
    total = x + y
    return total

  def backward(self, dout):
    """Return ``(dx, dy)``: the upstream gradient passed to both inputs.

    The local derivative of ``x + y`` with respect to either input is 1.
    """
    return dout * 1, dout * 1

# Demo: total price of apples and oranges including tax, built from
# MulLayer and AddLayer nodes.
apple_price = 100
apple_num = 2
orange_price = 150
orange_num = 3
tax_rate = 1.1

# Layers
mul_apple_price = MulLayer()
mul_orange_price = MulLayer()
add_apple_orange_price = AddLayer()
mul_tax = MulLayer()

# Forward pass
apple_total_price = mul_apple_price.forward(apple_price, apple_num)
orange_total_price = mul_orange_price.forward(orange_price, orange_num)
total_price = add_apple_orange_price.forward(apple_total_price, orange_total_price)
final_price = mul_tax.forward(total_price, tax_rate)
print("apple_price:", apple_price)
print("apple_num:", apple_num)
print("orange_price:", orange_price)
print("orange_num:", orange_num)
print("tax_rate:", tax_rate)
print("apple_total_price:", apple_total_price)
print("orange_total_price:", orange_total_price)
print("total_price:", total_price)
print("final_price:", final_price)

# Backward pass: layers are visited in the reverse order of the forward pass,
# starting from d(final_price)/d(final_price) = 1.
d_final_price = 1
d_total_price, d_tax = mul_tax.backward(d_final_price)
d_apple_total_price, d_orange_total_price = add_apple_orange_price.backward(d_total_price)
d_apple_price, d_apple_num = mul_apple_price.backward(d_apple_total_price)
d_orange_price, d_orange_num = mul_orange_price.backward(d_orange_total_price)
print("d_final_price:", d_final_price)
print("d_total_price:", d_total_price)
print("d_tax:", d_tax)
print("d_apple_total_price:", d_apple_total_price)
print("d_orange_total_price:", d_orange_total_price)
print("d_apple_price:", d_apple_price)
print("d_apple_num:", d_apple_num)
print("d_orange_price:", d_orange_price)
print("d_orange_num:", d_orange_num)


apple_price: 100
apple_num: 2
orange_price: 150
orange_num: 3
tax_rate: 1.1
apple_total_price: 200
orange_total_price: 450
total_price: 650
final_price: 715.0000000000001
d_final_price: 1
d_total_price: 1.1
d_tax: 650
d_apple_total_price: 1.1
d_orange_total_price: 1.1
d_apple_price: 2.2
d_apple_num: 110.00000000000001
d_orange_price: 3.3000000000000003
d_orange_num: 165.0


In [27]:
class ReluLayer:
  """ReLU activation layer.

  forward() zeroes every element that is less than or equal to 0 and
  remembers where that happened; backward() blocks the gradient at those
  same positions.
  """

  def __init__(self):
    # Boolean array, True where the last forward() input was <= 0.
    self.mask = None

  def forward(self, x):
    """Return a copy of ``x`` with all non-positive elements set to 0.

    Args:
      x: NumPy array of inputs (left unmodified).
    """
    self.mask = (x <= 0)
    out = x.copy()
    out[self.mask] = 0
    return out

  def backward(self, dout):
    """Return the gradient with respect to the forward() input.

    Args:
      dout: NumPy array with the upstream gradient, shaped like the
        forward() input.

    Returns:
      A new array equal to ``dout`` except that positions masked during
      forward() are 0.
    """
    # Work on a copy so the caller's gradient array is not overwritten.
    dx = dout.copy()
    dx[self.mask] = 0
    return dx

# Demo: ReLU forward and backward on a small vector around zero.
a = np.array([-0.2, -0.1, 0.0, 0.1, 0.2])
dout = np.array([1.0, 1.0, 1.0, 1.0, 1.0])  # upstream gradient of all ones
rl = ReluLayer()
print("a:", a)
print("rl.forward(a):", rl.forward(a))
# NOTE(review): the label below reads "r1.backward(a)" although the call is
# rl.backward(dout).
print("r1.backward(a):", rl.backward(dout))

a: [-0.2 -0.1  0.   0.1  0.2]
rl.forward(a): [0.  0.  0.  0.1 0.2]
r1.backward(a): [0. 0. 0. 1. 1.]


In [28]:
class SigmoidLayer:
  """Sigmoid activation layer."""

  def __init__(self):
    # Output of the most recent forward() call; backward() needs it.
    self.out = None

  def forward(self, x):
    """Apply the sigmoid ``1 / (1 + exp(-x))`` element-wise.

    The result is stored on the layer so that backward() can reuse it.
    """
    denominator = 1 + np.exp(-x)
    self.out = 1 / denominator
    return self.out

  def backward(self, dout):
    """Return ``dout * out * (1 - out)``.

    ``out`` is the cached forward output, so the derivative is obtained
    without evaluating the exponential again.
    """
    scaled = dout * self.out
    return scaled * (1 - self.out)

# Demo: sigmoid forward and backward on a small symmetric vector.
a = np.array([-10.0, -5.0, 0.0, 5.0, 10.0])
dout = np.array([1.0, 1.0, 1.0, 1.0, 1.0])  # upstream gradient of all ones
sl = SigmoidLayer()
print("a:", a)
print("sl.forward(a):", sl.forward(a))
# NOTE(review): the label below reads "s1.backward(a)" although the call is
# sl.backward(dout).
print("s1.backward(a):", sl.backward(dout))

a: [-10.  -5.   0.   5.  10.]
sl.forward(a): [4.53978687e-05 6.69285092e-03 5.00000000e-01 9.93307149e-01
 9.99954602e-01]
s1.backward(a): [4.53958077e-05 6.64805667e-03 2.50000000e-01 6.64805667e-03
 4.53958077e-05]


In [29]:
class AffineLayer:
  """Affine (fully connected) layer computing ``X . W + b``."""

  def __init__(self, W, b):
    # Parameters are stored by reference, not copied.
    self.W, self.b = W, b
    # Input of the most recent forward() call, needed for dW.
    self.X = None
    # Parameter gradients, filled in by backward().
    self.dW = self.db = None

  def forward(self, X):
    """Return ``np.dot(X, W) + b`` and cache ``X`` for backward()."""
    self.X = X
    projected = np.dot(X, self.W)
    return projected + self.b

  def backward(self, dout):
    """Store the parameter gradients and return the input gradient.

    Sets ``self.dW`` to ``X.T . dout`` and ``self.db`` to ``dout`` summed
    over axis 0, then returns ``dout . W.T``.
    """
    self.dW = np.dot(self.X.T, dout)
    self.db = np.sum(dout, axis=0)
    return np.dot(dout, self.W.T)

# Demo: show what a transpose looks like, then run AffineLayer forward and
# backward on a batch of two samples.
A = np.array([[1,2,3], [4,5,6]])
A.T
print("A:", A)
print("A.T:" , A.T)
X = np.array([[1,2,3], [4,5,6]])
W = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
b = np.array([0.1, 0.2, 0.3])
D = np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])  # upstream gradient of all ones
al = AffineLayer(W, b)
al_forward_out = al.forward(X)
al_backward_out = al.backward(D)
print("X:", X)
print("W:", W)
print("B:", b)
print("D:", D)
print("al_forward_out:", al_forward_out)
print("al_backward_out:", al_backward_out)

A: [[1 2 3]
 [4 5 6]]
A.T: [[1 4]
 [2 5]
 [3 6]]
X: [[1 2 3]
 [4 5 6]]
W: [[0.1 0.2 0.3]
 [0.4 0.5 0.6]
 [0.7 0.8 0.9]]
B: [0.1 0.2 0.3]
D: [[1. 1. 1.]
 [1. 1. 1.]]
al_forward_out: [[3.1 3.8 4.5]
 [6.7 8.3 9.9]]
al_backward_out: [[0.6 1.5 2.4]
 [0.6 1.5 2.4]]


In [30]:
class SoftmaxWithLoss:
  """Softmax activation combined with cross-entropy loss."""

  def __init__(self):
    self.y = None     # softmax output of the last forward() call
    self.t = None     # targets of the last forward() call
    self.loss = None  # loss value of the last forward() call

  def forward(self, x, t):
    """Compute the loss for scores ``x`` against targets ``t``.

    Args:
      x: Raw scores fed to softmax().
      t: Targets, passed unchanged to cross_entropy_error() (one-hot rows
        or class indices).

    Returns:
      The value returned by cross_entropy_error(), also stored in
      ``self.loss``.
    """
    self.t = t
    self.y = softmax(x)
    self.loss = cross_entropy_error(self.y, self.t)
    return self.loss

  def backward(self, dout=1):
    """Return the gradient of the loss with respect to the forward() scores.

    Args:
      dout: Upstream gradient of the loss value; the result is scaled by it.

    Returns:
      Array shaped like the softmax output: ``(y - onehot(t)) / batch_size``
      multiplied by ``dout``.
    """
    y = self.y
    t = np.asarray(self.t)
    # A 1-D output is a single sample, i.e. a batch of one, which is how
    # cross_entropy_error() treats it in forward().
    probs = y.reshape(1, -1) if y.ndim == 1 else y
    batch_size = probs.shape[0]

    if t.size == y.size:
      # One-hot targets: subtract them directly.
      dx = (probs - t.reshape(probs.shape)) / batch_size
    else:
      # Class-index targets: subtract 1 at each sample's target column,
      # which equals subtracting the one-hot encoding.
      dx = probs.copy()
      dx[np.arange(batch_size), t.reshape(batch_size)] -= 1
      dx = dx / batch_size

    return (dx * dout).reshape(y.shape)

# Demo: SoftmaxWithLoss forward and backward on two samples with one-hot targets.
X = np.array([[1,2,3], [4,5,6]])
T = np.array([[0,1,0], [1,0,0]])
sl = SoftmaxWithLoss()
loss = sl.forward(X, T)
dx = sl.backward()
print("X:", X)
print("T:", T)
print("Y:", sl.y)
print("loss:", loss)
print("dx:", dx)

X: [[1 2 3]
 [4 5 6]]
T: [[0 1 0]
 [1 0 0]]
Y: [[0.09003057 0.24472847 0.66524096]
 [0.09003057 0.24472847 0.66524096]]
loss: 1.9076052047697707
dx: [[ 0.04501529 -0.37763576  0.33262048]
 [-0.45498471  0.12236424  0.33262048]]


In [31]:
class TwoLayerNet:
  """Two-layer network: Affine -> ReLU -> Affine, followed by a softmax loss.

  Used in this file for training and inference on MNIST.
  """

  def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
    # Weights start as scaled Gaussian noise, biases as zeros.
    self.params = {
      "W1": weight_init_std * np.random.randn(input_size, hidden_size),
      "b1": np.zeros(hidden_size),
      "W2": weight_init_std * np.random.randn(hidden_size, output_size),
      "b2": np.zeros(output_size),
    }

    # Each Affine layer receives the very arrays stored in self.params.
    self.layers = OrderedDict([
      ("Affine1", AffineLayer(self.params["W1"], self.params["b1"])),
      ("Relu1", ReluLayer()),
      ("Affine2", AffineLayer(self.params["W2"], self.params["b2"])),
    ])
    self.last_layer = SoftmaxWithLoss()

  def predict(self, x):
    """Run ``x`` forward through every layer except the loss layer."""
    out = x
    for layer in self.layers.values():
      out = layer.forward(out)
    return out

  def loss(self, x, t):
    """Return the loss-layer value for inputs ``x`` and targets ``t``."""
    scores = self.predict(x)
    return self.last_layer.forward(scores, t)

  def accuracy(self, x, t):
    """Return the fraction of rows of ``x`` whose predicted class matches ``t``."""
    scores = self.predict(x)
    predicted = np.argmax(scores, axis=1)  # highest-scoring class per row
    if t.ndim != 1:
      # Targets that are not 1-D are reduced to class indices the same way.
      t = np.argmax(t, axis=1)
    n_correct = np.sum(predicted == t)
    return n_correct / float(x.shape[0])

  def numerical_gradient(self, x, t):
    """Compute the parameter gradients by numerical differentiation."""
    def loss_of(_):
      # The argument is ignored: the module-level numerical_gradient()
      # perturbs the parameter array in place and self.loss() reads it
      # through the layers.
      return self.loss(x, t)

    return {
      name: numerical_gradient(loss_of, self.params[name])
      for name in ("W1", "b1", "W2", "b2")
    }

  def gradient(self, x, t):
    """Compute the parameter gradients by backpropagation."""
    # The forward pass fills the caches that each layer's backward() uses.
    self.loss(x, t)

    # Backward pass: loss layer first, then the layers in reverse order.
    dout = self.last_layer.backward(1)
    for layer in reversed(self.layers.values()):
      dout = layer.backward(dout)

    # Collect the gradients the Affine layers stored during backward().
    first, second = self.layers["Affine1"], self.layers["Affine2"]
    return {
      "W1": first.dW,
      "b1": first.db,
      "W2": second.dW,
      "b2": second.db,
    }


In [32]:
"""수치미분과 역전파방식 미분의 결과 차이를 통해
역전파를 통해 구한 결과의 정확도를 확인하기.
"""

# Gradient check: compute the gradients for a three-sample batch both
# numerically and by backpropagation, then print the mean absolute
# difference for each parameter.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=28*28, hidden_size=50, output_size=10)
x_batch = x_train[:3]
t_batch = t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop  = network.gradient(x_batch, t_batch)

for key in grad_numerical.keys():
  diff = np.average(np.abs(
    grad_backprop[key] - grad_numerical[key]
  ))
  print(key,":",str(diff))

W1 : 4.545661952811881e-10
b1 : 3.0766799402777692e-09
W2 : 6.468307528790065e-09
b2 : 1.4066052205774505e-07


In [33]:
# Train TwoLayerNet on MNIST with mini-batch gradient descent and report the
# accuracy once per epoch.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
train_loss_list = []
train_accuracy_list = []
test_accuracy_list = []
iters_num = 100_000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.01
# Whole number of iterations per epoch. Floor division keeps this an integer,
# so the `i % iter_per_epoch == 0` check below fires once per epoch even when
# train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)
network = TwoLayerNet(input_size=28*28, hidden_size=50, output_size=10)

for i in range(iters_num):
  # Draw a random mini-batch.
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  t_batch = t_train[batch_mask]

  # Gradients via backpropagation. network.numerical_gradient(x_batch, t_batch)
  # could be used in its place.
  grad = network.gradient(x_batch, t_batch)

  # Gradient descent step: move every parameter against its gradient. The
  # update is in place, so the layers that hold the same arrays see it.
  for key in ("W1", "b1", "W2", "b2"):
    network.params[key] -= learning_rate * grad[key]

  # Record the loss of the updated parameters on this batch to track progress.
  loss = network.loss(x_batch, t_batch)
  train_loss_list.append(loss)

  # Once per epoch, measure accuracy on the full training and test sets.
  if i % iter_per_epoch == 0:
    train_accuracy = network.accuracy(x_train, t_train)
    test_accuracy  = network.accuracy(x_test, t_test)
    train_accuracy_list.append(train_accuracy)
    test_accuracy_list.append(test_accuracy)
    print(f"iter {i} / train_accuracy: {train_accuracy}, test_accuracy: {test_accuracy}")
  


iter 0 / train_accuracy: 0.09973333333333333, test_accuracy: 0.104
iter 600 / train_accuracy: 0.5220833333333333, test_accuracy: 0.5241
iter 1200 / train_accuracy: 0.7736166666666666, test_accuracy: 0.7807
iter 1800 / train_accuracy: 0.84505, test_accuracy: 0.8499
iter 2400 / train_accuracy: 0.8714333333333333, test_accuracy: 0.8743
iter 3000 / train_accuracy: 0.8839333333333333, test_accuracy: 0.8876
iter 3600 / train_accuracy: 0.8923333333333333, test_accuracy: 0.8967
iter 4200 / train_accuracy: 0.8969166666666667, test_accuracy: 0.9002
iter 4800 / train_accuracy: 0.9007666666666667, test_accuracy: 0.9044
iter 5400 / train_accuracy: 0.9042833333333333, test_accuracy: 0.9082
iter 6000 / train_accuracy: 0.90705, test_accuracy: 0.9096
iter 6600 / train_accuracy: 0.90935, test_accuracy: 0.9126
iter 7200 / train_accuracy: 0.91145, test_accuracy: 0.9145
iter 7800 / train_accuracy: 0.9139833333333334, test_accuracy: 0.9169
iter 8400 / train_accuracy: 0.9158833333333334, test_accuracy: 0.918