# 简单层的实现（MulLayer、AddLayer）

#### 1. MulLayer的实现

In [1]:
class MulLayer:
    """Multiplication node of a computational graph.

    ``forward`` caches both operands because ``backward`` needs them: the
    gradient with respect to one input is the upstream gradient multiplied
    by the *other* input.
    """

    def __init__(self):
        # Operands of the most recent forward pass (None until forward runs).
        self.x = None
        self.y = None

    def forward(self, x, y):
        """Remember the operands and return their product ``x * y``."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        The cached operands are swapped: d(x*y)/dx = y and d(x*y)/dy = x.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y

#### 购买两个苹果

In [2]:
# Inputs of the apple-only example: price of one apple, number of apples,
# and the tax factor that the total is multiplied by.
apple = 100
apple_num = 2
tax = 1.1

In [3]:
# Layers: one multiplication node per product in the graph
# (apple * apple_num, then apple_price * tax).
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

In [4]:
# Forward pass: each layer caches its operands for the later backward pass.
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

In [5]:
# Show the pre-tax subtotal and the final price.
print(apple_price, price)

200 220.00000000000003


In [6]:
# Backward pass: start from d(price)/d(price) = 1 and visit the layers in the
# reverse order of the forward pass. Each MulLayer uses the operands it cached
# in forward(), so this cell must run after the forward cell.
dprice = 1

dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

In [7]:
# Show the gradient of the final price with respect to each quantity.
print(dapple, dapple_num, dapple_price, dtax)

2.2 110.00000000000001 1.1 200


#### 2. AddLayer的实现

In [8]:
# The addition layer keeps no state, so it needs no real initialisation.
class AddLayer:
    """Addition node of a computational graph.

    Nothing is cached in ``forward`` because the backward pass of an
    addition does not depend on the input values.
    """

    def __init__(self):
        pass

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``: the upstream gradient passed to both inputs.

        The explicit ``* 1`` mirrors d(x+y)/dx = d(x+y)/dy = 1.
        """
        return dout * 1, dout * 1

#### 购买两个苹果和三个橘子

In [9]:
# Inputs of the apple + orange example: price and count per fruit, plus the
# tax factor that the combined total is multiplied by.
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

In [10]:
# Layers: one multiplication node per fruit, an addition node that combines
# the two subtotals, and a final multiplication node for the tax.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

In [11]:
# Forward pass: per-fruit subtotals -> combined subtotal -> price after tax.
apple_price = mul_apple_layer.forward(apple, apple_num)
orange_price = mul_orange_layer.forward(orange, orange_num)
all_price = add_apple_orange_layer.forward(apple_price, orange_price)
price = mul_tax_layer.forward(all_price, tax)

In [12]:
# Show every intermediate value of the forward pass.
print(apple_price, orange_price, all_price, price)

200 450 650 715.0000000000001


In [13]:
# Backward pass: start from d(price)/d(price) = 1 and call backward() on the
# layers in the reverse order of the forward pass. The MulLayer instances use
# the operands they cached in forward(), so the forward cell must run first.
dprice = 1

dall_price, dtax = mul_tax_layer.backward(dprice)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)
dorange, dorange_num = mul_orange_layer.backward(dorange_price)

In [14]:
# Show the gradient of the final price with respect to every quantity.
print(dall_price, dtax, dapple_price, dorange_price, dapple, dapple_num, dorange, dorange_num)

1.1 650 1.1 1.1 2.2 110.00000000000001 3.3000000000000003 165.0


# 激活函数层的实现

#### 1. ReLU层

In [15]:
class ReLU:
    """ReLU activation layer: passes positive inputs, zeroes the rest.

    ``forward`` records which elements were non-positive so that
    ``backward`` can block the gradient at exactly those positions.
    Inputs are expected to support ``<=``, ``.copy()`` and boolean-mask
    assignment (e.g. NumPy arrays).
    """

    def __init__(self):
        # Boolean mask of the elements that were <= 0 in the last forward pass.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` set to 0."""
        self.mask = (x <= 0)
        out = x.copy()    # copy so the caller's input is left untouched
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Return ``dout`` with the gradient zeroed where the input was ``<= 0``.

        Works on a copy: the caller's ``dout`` array is not modified.
        """
        dx = dout.copy()
        dx[self.mask] = 0

        return dx

In [16]:
import numpy as np

In [17]:
# Sample 2x2 input containing both positive and negative values.
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
print(x)

[[ 1.  -0.5]
 [-2.   3. ]]


In [18]:
# Same comparison the ReLU layer uses in forward(): True where x is <= 0.
mask = (x <= 0)
print(mask)

[[False  True]
 [ True False]]


#### 2. Sigmoid层

In [19]:
class Sigmoid:
    """Sigmoid activation layer, ``y = 1 / (1 + exp(-x))``.

    The forward output is cached because the derivative can be written
    purely in terms of it: dy/dx = y * (1 - y).
    """

    def __init__(self):
        # Output of the most recent forward pass (None until forward runs).
        self.out = None

    def forward(self, x):
        """Compute the sigmoid of ``x``, cache it, and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * y * (1 - y)`` using the cached forward output ``y``."""
        one_minus_out = 1.0 - self.out
        return dout * self.out * one_minus_out

# Affine/Softmax层的实现

#### 1. 反向传播时，各数据的反向传播的值需要汇总为偏置的元素

In [20]:
# Example upstream gradient with two rows and three columns.
dY = np.array([[1, 2, 3], [4, 5, 6]])
dY

array([[1, 2, 3],
       [4, 5, 6]])

In [21]:
# Sum over axis 0 (down the rows) to get one bias-gradient value per column.
db = np.sum(dY, axis = 0)
db

array([5, 7, 9])

#### 2. Affine的实现

In [22]:
class Affine:
    """Affine (fully connected) layer computing ``x . W + b``.

    ``backward`` stores the parameter gradients on the instance as ``dW``
    and ``db`` so that a caller can read them after back-propagation.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        # Input of the most recent forward pass; needed to compute dW.
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        out = np.dot(self.x, self.W) + self.b

        return out

    def backward(self, dout):
        """Back-propagate ``dout``; store ``dW``/``db`` and return ``dx``."""
        dx = np.dot(dout, self.W.T)
        # Keep the parameter gradients on the instance instead of discarding
        # them: they are what a training step needs to update W and b.
        self.dW = np.dot(self.x.T, dout)
        # The bias is added to every row, so its gradient sums over axis 0.
        self.db = np.sum(dout, axis=0)

        return dx

#### 3. Softmax-with-Loss层

In [23]:
import sys, os
sys.path.append(os.pardir)
from common.functions import softmax, cross_entropy_error

In [24]:
class SoftmaxWithLoss:
    """Softmax activation combined with a cross-entropy loss.

    ``forward`` delegates to the imported ``softmax`` and
    ``cross_entropy_error`` helpers and caches their results for
    ``backward``.
    """

    def __init__(self):
        self.loss = None    # loss value of the last forward pass
        self.y = None       # softmax output of the last forward pass
        self.t = None       # supervision data of the last forward pass

    def forward(self, x, t):
        """Cache ``t``, the softmax of ``x`` and the loss; return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)

        return self.loss

    def backward(self, dout=1):
        """Return the gradient with respect to the input of ``forward``.

        The result is ``(y - t) / batch_size`` scaled by the upstream
        gradient ``dout``. ``t`` may be one-hot (same size as ``y``) or an
        integer array of class indices (one per row of ``y``).
        NOTE(review): class-index labels also require ``cross_entropy_error``
        to accept them in ``forward`` -- confirm against common.functions.
        """
        batch_size = self.t.shape[0]    # divide so the gradient is per sample
        if self.t.size == self.y.size:
            # One-hot supervision data: the gradient is simply y - t.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index labels: subtract 1 at the correct class of each row.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size

        return dx * dout

# 误差反向传播法的实现

#### 1. TwoLayerNet

In [37]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

In [48]:
class TwoLayerNet:
    """Two-layer network: Affine1 -> Relu1 -> Affine2, then SoftmaxWithLoss.

    Parameters live in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'`` and ``'b2'``; the same arrays are handed to the Affine layers.
    """

    _PARAM_KEYS = ('W1', 'b1', 'W2', 'b2')

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Initialise the weights. W1 is drawn before W2 so the random
        # stream is consumed in a fixed order; biases start at zero.
        first_weights = weight_init_std * np.random.randn(input_size, hidden_size)
        second_weights = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': first_weights,
            'b1': np.zeros(hidden_size),
            'W2': second_weights,
            'b2': np.zeros(output_size),
        }

        # Build the layers; the insertion order is the forward order.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in ``self.layers`` and return the result."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss for input data ``x`` and supervision data ``t``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax prediction equals ``t``.

        If ``t`` is not 1-D it is reduced with ``argmax`` along axis 1 first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients of the loss obtained from ``numerical_gradient``.

        ``x`` is the input data and ``t`` the supervision data.
        """
        def loss_W(W):
            # The argument is ignored: the loss is re-evaluated on the
            # network's current parameter arrays.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in self._PARAM_KEYS
        }

    def gradient(self, x, t):
        """Return gradients of the loss computed by back-propagation."""
        # Forward pass, so that every layer holds the state it needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the parameter gradients stored on the Affine layers.
        first_affine = self.layers['Affine1']
        second_affine = self.layers['Affine2']
        return {
            'W1': first_affine.dW,
            'b1': first_affine.db,
            'W2': second_affine.dW,
            'b2': second_affine.db,
        }

#### 2. 误差反向传播的梯度确认

##### Gradient Check: 确认数值微分求出的梯度结果和误差反向传播法求出的结果是否一致

In [49]:
# Gradient check: compare the gradients from numerical differentiation with
# those from back-propagation on a small batch.
from dataset.mnist import load_mnist

(x_train, t_train), (x_test, t_test) = load_mnist(normalize = True, one_hot_label = True)

network = TwoLayerNet(input_size = 784, hidden_size = 50, output_size = 10)

# Only the first three samples are used for the check.
x_batch = x_train[ : 3]
t_batch = t_train[ : 3]

gradient_numerical = network.numerical_gradient(x_batch, t_batch)
gradient_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference per parameter. The loop reads the two dictionaries
# assigned just above, so the cell does not rely on names left over from an
# earlier kernel session.
for key in gradient_numerical:
    diff = np.average(np.abs(gradient_backprop[key] - gradient_numerical[key]))
    print(key + " : " + str(diff))

W1 : 3.776877569853691e-10
b1 : 2.109840013656862e-09
W2 : 5.491770239851316e-09
b2 : 1.401629800876214e-07


#### 3. 使用误差反向传播法的学习

In [50]:
# Train TwoLayerNet with mini-batch gradient descent, using back-propagation
# to obtain the gradients.
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Number of iterations per epoch. Floor division keeps this an integer, so
# the `i % iter_per_epoch == 0` test below also fires when train_size is not
# an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of sample indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradients via back-propagation (network.numerical_gradient is the
    # numerical-differentiation alternative).
    grad = network.gradient(x_batch, t_batch)

    # In-place update, so the layers that share these arrays see new values.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on this batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.14491666666666667 0.1441
0.9024833333333333 0.9054
0.92505 0.9275
0.9350833333333334 0.9348
0.9448 0.9437
0.9508666666666666 0.9497
0.9560833333333333 0.9529
0.9580666666666666 0.9544
0.9616666666666667 0.9573
0.9654166666666667 0.9601
0.968 0.9618
0.9704666666666667 0.9639
0.9703166666666667 0.9644
0.9729333333333333 0.9663
0.9740333333333333 0.9668
0.9755666666666667 0.9676
0.9762333333333333 0.9676
