# chapter4 신경망 학습

## 4.1 데이터에서 학습한다!

### 4.1.1 데이터 주도 학습

deep learning : end to end machine learning  

## 4.2 손실 함수

손실 함수(loss function): 신경망 학습에 사용하는 지표로, 일반적으로 오차제곱합과 교차 엔트로피 오차를 사용한다.  
(낮을 수록 좋음)

### 4.2.1 오차제곱합

SSE(sum of squares for error) = E = 1/2(sum((y_pred - y) ** 2))

In [1]:
import numpy as np

def sum_squares_error(y_pred, y):
    return 0.5 * np.sum((y_pred-y)**2)

### 4.2.2 교차 엔트로피 오차

CEE(cross entropy error) = E = -sum(y * log(y_pred))

In [2]:
def cross_entropy_error(y_pred, y):
    delta = 1e-7 # np.log(0) == -inf -> prevent
    return -np.sum(y * np.log(y_pred + delta))

### 4.2.3 미니배치 학습

책 참고

### 4.2.4 (배치용) 교차 엔트로피 오차 구현하기

In [3]:
# for batch(when y is one_hot encoded)
def cross_entropy_error1(y_pred, y):
    if y_pred.ndim == 1:
        y = y.reshape(1, y.size)
        y_pred = y.reshape(1, y_pred.size)
    
    batch_size = y_pred.shape[0]
    return -np.sum(y * np.log(y_pred + 1e-7)) / batch_size

In [4]:
# for batch(when y is not one_hot encoded)
def cross_entropy_error2(y_pred, y):
    if y_pred.ndim == 1:
        y = y.reshape(1, y.size)
        y_pred = y.reshape(1, y_pred.size)
    
    batch_size = y_pred.shape[0]
    return -np.sum(np.log(y_pred[np.arange(batch_size), y]+ 1e-7)) / batch_size

### 4.2.5 왜 손실 함수를 설정하는가?

신경망 학습에서는 parameter를 optimize 할 때 미분을 사용한다.

## 4.3 수치 미분

### 4.3.1 미분

In [5]:
def numerical_diff(f, x):
    h = 1e-4
    return (f(x+h) - f(x-h) / (2 * h))

### 4.3.2 수치 미분의 예

책 참고

### 4.3.3 편미분

책 참고

## 4.4 기울기

In [6]:
def _numerical_gradient_no_batch(f, x):
    h = 1e-4
    grad = np.zeros_like(x)
    for idx in range(x.size):
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x)
        x[idx] = tmp_val - h 
        fxh2 = f(x) 
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val        
    return grad


def numerical_gradient(f, X):
    if X.ndim == 1:
        return _numerical_gradient_no_batch(f, X)
    else:
        grad = np.zeros_like(X)
        
        for idx, x in enumerate(X):
            grad[idx] = _numerical_gradient_no_batch(f, x)
        
        return grad

In [7]:
def function_2(x):
    return np.sum(x ** 2)

numerical_gradient(function_2, np.array([3.0, 4.0]))

array([6., 8.])

### 4.4.1 경사법(경사 하강법)

x0, x1 = x - eta * numerical_gradient  
eta: learning rate

In [8]:
def gradient_descent(f, init_x, lr=0.01, step_num=100):
    x = init_x

    for i in range(step_num):
        grad = numerical_gradient(f, x)
        x -= lr * grad
    return x

## 4.4.2 신경망에서의 기울기

In [9]:
def softmax(x):
    c = np.max(x)
    exp_x = np.exp(x-c)
    sum_exp_x = np.sum(exp_x-c)
    y = exp_x / sum_exp_x
    return y

In [10]:
class simpleNet:
    def __init__(self):
        self.W = np.random.randn(2, 3)
    
    def predict(self, x):
        return np.dot(x, self.W)
    
    def loss(self, x, y):
        z = self.predict(x)
        y_pred = softmax(z)
        loss = cross_entropy_error1(y_pred, y)
        return loss

In [11]:
net = simpleNet()
print(net.W)
x = np.array([0.6, 0.9])
y_pred = net.predict(x)
print(np.argmax(y_pred))
y = np.array([0, 0, 1])
net.loss(x, y)

[[-2.50093784  0.60734283 -0.40662738]
 [ 0.86203722  1.5465298  -1.42010576]]
1


-9.999999505838704e-08

## 4.5 학습 알고리즘 구현하기

1단계: 미니배치  
-> 훈련 데이터 중 일부를 무작위로 가져온후(확률적 경사 하강법) 그 미니배치의 손실 함수 값을 줄여야함  
2단계: 기울기 산출  
-> 각 가중치 매개변수의 기울기를 구한다.  
3단계: 매개변수 갱신  
-> 가중치 매개변수를 갱신한다.  
4단계: 반복  
-> 적절히 학습될 때 까지 반복

### 4.5.1 2층 신경망 클래스 구현하기

In [12]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

class TwoLayerNet:
    def __init__(self, input_size, hidden_size, ouput_size, weight_init_std=0.01):
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, ouput_size)
        self.params['b2'] = np.zeros(ouput_size)

    def predict(self, X):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(X, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        return y
    
    def loss(self, X, y):
        y_pred = self.predict(X)

        return cross_entropy_error1(y_pred, y)

    def accuracy(self, X, y):
        y_pred = self.predict(X)
        y_pred = np.argmax(y_pred, axis=1)
        y = np.argmax(y, axis=1)

        accuracy = np.sum(y_pred == t) / float(X.shape[0])
        return accuracy

    def numerical_gradient(self, X, y):
        loss_W = lambda W: self.loss(X, y)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

In [13]:
net = TwoLayerNet(input_size=784, hidden_size=100, ouput_size=10)
X = np.random.rand(100, 784)
y_pred = net.predict(X)

In [14]:
y = np.random.rand(100, 10)

grads = net.numerical_gradient(X, y)

In [15]:
grads

{'W1': array([[-0.0057976 , -0.00986031,  0.00412719, ...,  0.00393074,
         -0.01166395,  0.0075011 ],
        [-0.00849637, -0.01447001,  0.00631011, ...,  0.00583378,
         -0.01737105,  0.01101892],
        [-0.00336681, -0.00565316,  0.00207201, ...,  0.00230151,
         -0.00659406,  0.00435532],
        ...,
        [-0.00610132, -0.01036529,  0.00422813, ...,  0.00419283,
         -0.01233546,  0.00785279],
        [-0.00476939, -0.00794245,  0.00319993, ...,  0.00322592,
         -0.00955559,  0.00603055],
        [-0.00935923, -0.0161094 ,  0.00696939, ...,  0.00647335,
         -0.01910267,  0.01230543]]),
 'b1': array([-0.01123123, -0.01912047,  0.00797039,  0.01853413,  0.01062306,
        -0.00817326, -0.02147835, -0.00794657, -0.00855942, -0.02051595,
         0.01213878,  0.0256261 , -0.01254031,  0.01082492,  0.01056655,
        -0.02082665, -0.0097519 ,  0.01112733, -0.02648476,  0.01835583,
        -0.01702786,  0.02168456,  0.00104009,  0.00646827,  0.012409