# DeepLearning 第7回

## 今回の目標
前回学んだ誤差逆伝播を用いて高速化した学習を実装する

## Affineレイヤ

入力値と重みをかけて、バイアスを足すレイヤ  
NNで扱うために行列計算を行う

行列の形状を合わせることが大切。

![](https://tanaka-tom.github.io/backpropagation2/affine-graph.svg)

### ミニバッチ版Affineレイヤ
上記のAffineレイヤをさらにミニバッチ処理できるように修正する

![](https://tanaka-tom.github.io/backpropagation2/affine-graph2.svg)

### ミニバッチAffineレイヤの実装

In [21]:
import numpy as np

class Affine:
    """Fully connected layer that computes ``x . W + b``.

    ``W`` and ``b`` are kept by reference (no copy is made), so in-place
    updates performed by the caller are seen by later forward passes.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Input remembered by forward() so backward() can build dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Remember ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store ``dW`` and ``db`` for the upstream gradient and return ``dx``.

        ``db`` is the sum of ``dout`` over axis 0; ``dW`` is the product of
        the transposed cached input with ``dout``.
        """
        grad_input = np.dot(dout, self.W.T)
        cached_input = self.x
        self.dW = np.dot(cached_input.T, dout)
        self.db = np.sum(dout, axis=0)
        return grad_input

## 今回実装するNNの全貌

２層NNを実装する。  
以下の図のようにAffineレイヤとReLUレイヤ（活性化関数）を組み合わせ、  
最後の出力でSoftmaxによる正規化を行う

![](https://tanaka-tom.github.io/backpropagation2/output-neuron-backpropagation.svg)

## Softmax with Lossレイヤ

今回は学習を実装するため、  
最後の出力ではSoftmaxで得られた確率から誤差を導き出してやる必要がある。  
その為Softmax with Lossレイヤとして実装し、損失関数にはCross Entropy Errorを用いる。

![](https://tanaka-tom.github.io/backpropagation2/softmax-with-loss-graph.svg)

### Softmax-with-Lossの実装

In [22]:
class SoftmaxWithLoss:
    """Softmax output layer combined with a cross-entropy loss.

    ``forward`` delegates to the module-level ``softmax`` and
    ``cross_entropy_error`` helpers; ``backward`` returns the gradient of
    the loss with respect to the scores that were passed to ``forward``.
    """

    def __init__(self):
        self.loss = None  # loss value returned by the last forward()
        self.y = None     # softmax output of the last forward()
        self.t = None     # targets given to the last forward()

    def forward(self, x, t):
        """Return the cross-entropy loss of ``softmax(x)`` against ``t``.

        Args:
            x: Scores fed to ``softmax``.
            t: Targets, in whichever form ``cross_entropy_error`` accepts
                (one-hot rows or integer class labels).
        """
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return ``dout * (y - t) / batch_size`` in the shape of ``y``.

        Targets may be one-hot (same number of elements as ``y``) or integer
        class labels (one per sample). A 1-D ``y`` is treated as a batch of
        one sample, matching how ``cross_entropy_error`` averages the loss.

        Args:
            dout: Upstream gradient; a scalar or anything that broadcasts
                against ``y``. Defaults to 1.
        """
        # View a single 1-D sample as a batch of one so the batch size is 1
        # rather than the number of classes.
        y = self.y if self.y.ndim > 1 else self.y.reshape(1, -1)
        t = np.asarray(self.t)
        batch_size = y.shape[0]

        if t.size == y.size:
            # One-hot targets.
            dx = (y - t.reshape(y.shape)) / batch_size
        else:
            # Integer labels: subtract 1 at each sample's correct class.
            dx = y.copy()
            dx[np.arange(batch_size), t.reshape(-1)] -= 1
            dx = dx / batch_size

        return dout * dx.reshape(self.y.shape)

## 2層ニューラルネットワークの実装

In [23]:
import sys, os
sys.path.append(os.pardir)
from collections import OrderedDict
from dataset.mnist import load_mnist

In [24]:
class Relu:
    """ReLU activation layer: passes positive inputs, zeroes the rest."""

    def __init__(self):
        # Boolean array marking where the last forward() input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element <= 0 replaced by 0.

        The positions that were zeroed are remembered in ``self.mask`` for
        use by ``backward``.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with the positions zeroed in ``forward`` set to 0.

        The result is a new array; ``dout`` itself is left untouched so the
        caller's upstream gradient is not silently modified.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [25]:
def softmax(x):
    """Return the softmax of ``x``.

    A 2-D array is normalised row by row (each row sums to 1). Any other
    array is normalised over all of its elements at once.

    Args:
        x: NumPy array of scores.

    Returns:
        Array of the same shape as ``x`` holding the softmax values.
    """
    if x.ndim == 2:
        # Subtracting the row maximum keeps exp() from overflowing.
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    shifted = x - np.max(x)  # overflow guard
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x)

In [26]:
def cross_entropy_error(y, t):
    """Return the mean cross-entropy error of predictions ``y`` against ``t``.

    Args:
        y: Predicted probabilities; one row per sample, or a single 1-D
            sample.
        t: Targets, either one-hot (same number of elements as ``y``) or
            integer class labels.

    Returns:
        The negative log of the probability picked out by each target,
        averaged over the batch.
    """
    if y.ndim == 1:
        # Treat a single sample as a batch of one.
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # One-hot targets are converted to the index of the correct label.
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    picked = y[np.arange(batch_size), t]

    # Floor the picked probabilities at the smallest positive normal float
    # so an exact 0 gives a large finite loss instead of log(0) = -inf.
    # Every other input is left unchanged.
    if np.issubdtype(picked.dtype, np.floating):
        float_dtype = picked.dtype
    else:
        float_dtype = np.float64
    floor = np.finfo(float_dtype).tiny

    return -np.sum(np.log(np.maximum(picked, floor))) / batch_size

In [27]:
def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` by central differences.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h``, ``f``
    is evaluated on the perturbed array, and the element is then restored.

    Args:
        f: Callable taking ``x`` and returning a scalar.
        x: NumPy array, modified in place during evaluation and restored
            before returning (also when ``f`` raises).

    Returns:
        Array shaped like ``x`` holding ``(f(x+h) - f(x-h)) / (2*h)`` for
        each element. An empty ``x`` yields an empty gradient.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    # np.ndindex yields every index tuple, and nothing for empty arrays.
    for idx in np.ndindex(np.shape(x)):
        tmp_val = x[idx]
        try:
            x[idx] = float(tmp_val) + h
            fxh1 = f(x)  # f(x+h)

            x[idx] = tmp_val - h
            fxh2 = f(x)  # f(x-h)
        finally:
            # Put the original value back even if f raised, so the
            # caller's array is never left perturbed.
            x[idx] = tmp_val

        grad[idx] = (fxh1 - fxh2) / (2 * h)

    return grad

In [28]:
class TwoLayerNet:
    """Two-layer network: Affine -> ReLU -> Affine, scored by SoftmaxWithLoss.

    ``params`` holds the arrays ``W1``, ``b1``, ``W2`` and ``b2``; the very
    same array objects are handed to the Affine layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights are Gaussian noise scaled by weight_init_std (W1 is drawn
        # before W2); biases start at zero.
        first_weights = weight_init_std * np.random.randn(input_size, hidden_size)
        second_weights = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': first_weights,
            'b1': np.zeros(hidden_size),
            'W2': second_weights,
            'b2': np.zeros(output_size),
        }

        # Layers in forward order.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the last layer's loss for input ``x`` and targets ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax matches ``t``.

        ``t`` may be 1-D labels; any other rank is reduced with argmax over
        axis 1 first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return the gradients of the loss obtained by numerical differentiation.

        The module-level ``numerical_gradient`` perturbs each parameter
        array in place; the loss callback ignores its argument and simply
        re-evaluates the network.
        """
        loss_W = lambda W: self.loss(x, t)
        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return the gradients of the loss obtained by backpropagation."""
        # Forward pass.
        self.loss(x, t)

        # Backward pass: last layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients stored on the two Affine layers.
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

## 勾配確認

勾配を求める方法は２通りある

- 数値微分（実装簡単）
- 解析的に数式を解いて求める（より複雑）

後者だと実装にミスがあるかもしれないので両者の計算結果を比較することで正しさを担保する

In [29]:
def gradient_check():
    """Print, per parameter, the mean absolute gap between both gradients.

    The numerical gradient and the backpropagation gradient are computed by
    a freshly created TwoLayerNet on the first three training samples.
    """
    # Only the training split is used here.
    (x_train, t_train), _ = load_mnist(normalize=True, one_hot_label=True)

    network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

    sample_count = 3
    x_batch, t_batch = x_train[:sample_count], t_train[:sample_count]

    grad_numerical = network.numerical_gradient(x_batch, t_batch)
    grad_backprop = network.gradient(x_batch, t_batch)

    for key, numeric in grad_numerical.items():
        gap = np.abs(grad_backprop[key] - numeric)
        print(key + ":" + str(np.average(gap)))

In [30]:
# Run the check: prints one mean absolute difference per parameter.
gradient_check()

W1:2.58522139062e-13
b1:1.17827803178e-12
b2:1.1990408666e-10
W2:8.68944111977e-13


## 誤差逆伝播を用いた学習の実装

In [31]:
def backprop_learn(iters_num=10000, batch_size=100, learning_rate=0.1, hidden_size=50):
    """Train a TwoLayerNet on MNIST with mini-batch gradient descent.

    Prints the training-set size once, then the train and test accuracy
    once per epoch's worth of iterations (starting with iteration 0).

    Args:
        iters_num: Number of parameter updates to perform.
        batch_size: Number of samples drawn (with ``np.random.choice``) for
            each update.
        learning_rate: Step size applied to every gradient.
        hidden_size: Width of the hidden layer.
    """
    (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

    network = TwoLayerNet(input_size=784, hidden_size=hidden_size, output_size=10)

    train_size = x_train.shape[0]
    train_loss_list = []
    train_acc_list = []
    test_acc_list = []
    print(train_size)

    # Integer division: with true division the value is a float, and
    # `i % iter_per_epoch == 0` would fire only at i == 0 whenever
    # train_size is not a multiple of batch_size.
    iter_per_epoch = max(train_size // batch_size, 1)

    for i in range(iters_num):
        batch_mask = np.random.choice(train_size, batch_size)
        x_batch = x_train[batch_mask]
        t_batch = t_train[batch_mask]

        grad = network.gradient(x_batch, t_batch)

        # In-place update: the layers hold the same array objects as
        # network.params, so they see the new values.
        for key in ('W1', 'b1', 'W2', 'b2'):
            network.params[key] -= learning_rate * grad[key]

        # Loss on the same batch, measured after the update.
        loss = network.loss(x_batch, t_batch)
        train_loss_list.append(loss)

        if i % iter_per_epoch == 0:
            train_acc = network.accuracy(x_train, t_train)
            test_acc = network.accuracy(x_test, t_test)
            train_acc_list.append(train_acc)
            test_acc_list.append(test_acc)
            print(train_acc, test_acc)

In [32]:
# Run training: prints the training-set size, then train/test accuracy pairs.
backprop_learn()

60000
0.128333333333 0.1262
0.901366666667 0.9031
0.925283333333 0.9273
0.9358 0.9364
0.946233333333 0.9432
0.952616666667 0.9487
0.958883333333 0.9528
0.962316666667 0.958
0.966566666667 0.9609
0.968516666667 0.9622
0.9703 0.9621
0.973766666667 0.966
0.9733 0.9649
0.976033333333 0.9664
0.977683333333 0.9674
0.97795 0.9664
0.9802 0.9678
