<a href="https://colab.research.google.com/github/ownit4137/TIL/blob/main/DL%20from%20Scratch/1/SGD_2layer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 확률적 경사 하강법

## 개념

### 교차 엔트로피 오차, CEE

$E = - \sum_i {t}_i \log({y}_i) $

- 정답 레이블에 해당하는 출력이 클수록 오차가 작아지고, 작을수록 오차가 커짐

### 배치 학습

- 계산 라이브러리 대부분이 큰 배열을 효율적으로 처리할 수 있게 최적화됨
- I/O를 통해 데이터를 읽는 횟수를 줄여 순수 계산 수행 비율을 높임

### 미분

- 해석적 미분 : 수식을 전개해 미분하는 것
- 수치 미분 : 함수 f의 어떤 x를 중심으로 그 전후의 차분을 계산 (중심 차분: $\frac{f(x+h) - f(x-h)}{2h}$, h는 아주 작은 값)



In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive/.
from google.colab import drive 
drive.mount('/content/gdrive/')

# IPython magic: change the working directory to the Drive notebooks folder
# (presumably so the `dataset` package imported in a later cell resolves — confirm).
%cd /content/gdrive/MyDrive/'Colab Notebooks'/

Mounted at /content/gdrive/
/content/gdrive/MyDrive/Colab Notebooks


In [6]:
from dataset.mnist import load_mnist    # 책 코드
from PIL import Image
import numpy as np

In [3]:
def cross_entropy_error(y, t):
  """Return the mean cross-entropy error of predictions `y` against targets `t`.

  `y` holds the network outputs, either a single sample (1-D) or a batch
  (2-D, one row per sample). `t` is either one-hot encoded (same number of
  elements as `y`) or an array of class indices.
  """
  # A single sample is promoted to a batch of one: shape (1, n).
  if y.ndim == 1:
    t = t.reshape(1, t.size)
    y = y.reshape(1, y.size)

  # One-hot targets have as many elements as y; reduce them to class indices.
  labels = t.argmax(axis=1) if t.size == y.size else t

  n_samples = y.shape[0]
  # Pick, for every sample, the output assigned to its correct class.
  picked = y[np.arange(n_samples), labels]
  # 1e-7 keeps log() away from log(0) = -inf.
  total = np.sum(np.log(picked + 1e-7))
  return -total / n_samples


def softmax(x):
  """Return the softmax of `x`.

  A 2-D input is treated as a batch and normalised row by row; any other
  input is normalised over all of its elements at once.
  """
  if x.ndim != 2:
    # Subtracting the maximum avoids overflow in exp() without changing the result.
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

  # Work on the transpose so each sample is a column and the per-sample
  # max/sum broadcast along axis 0, then transpose back.
  cols = x.T
  cols = cols - np.max(cols, axis=0)
  exps = np.exp(cols)
  return (exps / np.sum(exps, axis=0)).T


def numerical_gradient(f, x):
  """Estimate the gradient of `f` at `x` with central differences.

  Each element of `x` is perturbed in place by +h and -h, `f(x)` is evaluated
  for both, and the element is restored before moving on. The returned array
  has the same shape and dtype as `x`.
  """
  step = 1e-4  # 0.0001
  gradient = np.zeros_like(x)

  cursor = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
  while not cursor.finished:
    pos = cursor.multi_index
    original = x[pos]

    x[pos] = float(original) + step
    forward = f(x)  # f(x + h)

    x[pos] = original - step
    backward = f(x)  # f(x - h)

    # Put the original value back before visiting the next element.
    x[pos] = original
    gradient[pos] = (forward - backward) / (2 * step)
    cursor.iternext()

  return gradient

def sigmoid(x):
  """Return the element-wise logistic sigmoid 1 / (1 + exp(-x))."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

In [4]:
class SGD_2layer:
  """Two-layer fully connected network trained with numerical gradients.

  Architecture: input -> affine(w1, b1) -> sigmoid -> affine(w2, b2) -> softmax.
  All parameters live in `self.params` under the keys 'w1', 'b1', 'w2', 'b2'.
  """

  def __init__(self, input_size, hidden_size, output_size):
    """Initialise weights with small Gaussian noise and biases with zeros."""
    self.init_div = 0.01  # standard deviation of the initial weights
    self.params = {}
    self.params['w1'] = self.init_div * np.random.randn(input_size, hidden_size)
    self.params['w2'] = self.init_div * np.random.randn(hidden_size, output_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['b2'] = np.zeros(output_size)

  def predict(self, x):
    """Run the forward pass and return the softmax output for `x`."""
    w1, w2 = self.params['w1'], self.params['w2']
    b1, b2 = self.params['b1'], self.params['b2']

    a1 = np.dot(x, w1) + b1
    z1 = sigmoid(a1)
    # The output layer must include its bias; without it b2 never affects
    # the loss and its gradient is always zero.
    a2 = np.dot(z1, w2) + b2
    y = softmax(a2)

    return y

  def loss(self, x, t):
    """Return the cross-entropy error of the prediction for `x` against `t`."""
    y = self.predict(x)
    return cross_entropy_error(y, t)

  def accuracy(self, x, t):
    """Return the fraction of rows in `x` whose predicted class matches `t`.

    `t` is reduced with argmax along axis 1, so it is expected to be one-hot.
    """
    y = self.predict(x)
    y = np.argmax(y, axis=1)  # index of the largest output in each row
    t = np.argmax(t, axis=1)

    accuracy = np.sum(y == t) / float(x.shape[0])
    return accuracy

  def getgrad(self, x, t):
    """Return numerical gradients of the loss for every parameter.

    The result is a dict with the same keys as `self.params`.
    """
    # The lambda ignores its argument: numerical_gradient perturbs the
    # parameter array in place, and self.loss reads that same array back
    # through self.params.
    loss_w = lambda w : self.loss(x, t)

    grads = {}
    grads['w1'] = numerical_gradient(loss_w, self.params['w1'])
    grads['b1'] = numerical_gradient(loss_w, self.params['b1'])
    grads['w2'] = numerical_gradient(loss_w, self.params['w2'])
    grads['b2'] = numerical_gradient(loss_w, self.params['b2'])
    return grads
    

In [15]:
# Load MNIST through the book-provided loader. accuracy() takes argmax over
# axis 1 of the labels, so they are requested in one-hot form.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
# Numerical gradients are slow, so train on the first 1000 samples only.
x_train = x_train[:1000]
t_train = t_train[:1000]


net = SGD_2layer(input_size=784, hidden_size=10, output_size=10)

iter_total = 100
train_size = x_train.shape[0]
batch_size = 50
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Number of iterations between accuracy reports. Floor division keeps this an
# integer, so `i % iter_count == 0` still fires periodically when train_size
# is not a multiple of batch_size.
iter_count = max(train_size // batch_size, 1)

for i in range(iter_total):
  # Draw a random mini-batch (indices are sampled with replacement).
  batch_mask = np.random.choice(train_size, batch_size)
  x_batch = x_train[batch_mask]
  t_batch = t_train[batch_mask]

  # One gradient-descent step on every parameter.
  grad = net.getgrad(x_batch, t_batch)
  for key in ('w1', 'b1', 'w2', 'b2'):
    net.params[key] -= learning_rate * grad[key]

  # Record the loss on the same mini-batch after the update.
  loss = net.loss(x_batch, t_batch)
  train_loss_list.append(loss)

  if i % iter_count == 0:
    train_acc = net.accuracy(x_train, t_train)
    test_acc = net.accuracy(x_test, t_test)
    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    print("iter-", i, " train acc : ", train_acc, " test_acc : ", test_acc)

iter- 0  train acc :  0.094  test_acc :  0.0958
iter- 20  train acc :  0.116  test_acc :  0.1135
iter- 40  train acc :  0.117  test_acc :  0.1028
iter- 60  train acc :  0.117  test_acc :  0.1028
iter- 80  train acc :  0.117  test_acc :  0.1028


속도 문제로 훈련 데이터 수, 은닉층(hidden layer) 크기, 배치 크기, 반복(iter) 횟수를 줄여서 학습 => 정확도가 낮음