# ゼロから作るDeep Learning

## 4章 ニューラルネットワークの学習

### 4.5 学習アルゴリズムの実装

#### 4.5.1 2層ニューラルネットワークのクラス

2層のニューラルネットワーク（隠れ層が1層のニューラルネットワーク）を対象に、MNISTデータセットを使って学習を行う

In [1]:

%cd ../deep-learning-from-scratch/ch04

/mnt/wd500/gotowork/workspace/study-dl-from-scratch/deep-learning-from-scratch/ch04


In [2]:
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.functions import *
from common.gradient import numerical_gradient

In [3]:
class TwoLayerNet:
    """Fully connected network with one hidden layer (sigmoid) and a softmax output.

    Attributes:
        params: dict holding the network parameters.
            params['W1'] / params['b1'] are the first-layer weights and biases,
            params['W2'] / params['b2'] are the second-layer weights and biases.

    The gradients returned by ``numerical_gradient()`` use the same four keys.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Initialize weights with scaled Gaussian noise and biases with zeros.

        Args:
            input_size: number of input units (rows of W1).
            hidden_size: number of hidden units (columns of W1, rows of W2).
            output_size: number of output units (columns of W2).
            weight_init_std: factor applied to the ``np.random.randn`` samples.
        """
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def predict(self, x):
        """Run the forward pass and return the softmax output for input ``x``."""
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']

        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)

        return y

    def loss(self, x, t):
        """Return the cross-entropy error between ``predict(x)`` and ``t``.

        Args:
            x: input data.
            t: teacher (target) data.
        """
        y = self.predict(x)
        return cross_entropy_error(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose predicted class matches ``t``.

        Args:
            x: input data; ``x.shape[0]`` is used as the number of samples.
            t: targets, either one-hot encoded (2-D) or already given as
                1-D class indices.
        """
        y = np.argmax(self.predict(x), axis=1)
        t = np.asarray(t)
        # Only collapse one-hot rows; 1-D targets are already class indices
        # and np.argmax(..., axis=1) would raise on them.
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        """Return the gradients of the loss with respect to every parameter.

        Args:
            x: input data.
            t: teacher (target) data.

        Returns:
            dict with keys 'W1', 'b1', 'W2', 'b2' holding the gradient for the
            parameter of the same name.
        """
        # The closure ignores its argument and always evaluates the loss from
        # self.params. NOTE(review): this presumably relies on the imported
        # numerical_gradient helper perturbing the passed array in place --
        # confirm against common.gradient.
        loss_W = lambda W: self.loss(x, t)

        # Inside a method the bare name resolves to the module-level helper,
        # not to this method.
        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }
    

#### 一つ例を見てみる

In [4]:
net = TwoLayerNet(input_size=784, hidden_size=100, output_size=10)
# Print the shape of each parameter, one per line:
#   W1: (input units, hidden units)   b1: (hidden units,)
#   W2: (hidden units, output units)  b2: (output units,)
print(*(net.params[name].shape for name in ('W1', 'b1', 'W2', 'b2')), sep="\n")

(784, 100)
(100,)
(100, 10)
(10,)


#### 推論処理の例

In [5]:
x = np.random.rand(100, 784) # dummy input data (100 images' worth)
y = net.predict(x)

In [6]:
x

array([[0.30061326, 0.62200328, 0.06475344, ..., 0.90300161, 0.5716309 ,
        0.22417583],
       [0.09182456, 0.05423592, 0.56426271, ..., 0.85992835, 0.41810751,
        0.67244417],
       [0.55489101, 0.06015968, 0.28058285, ..., 0.16870425, 0.99879233,
        0.22004696],
       ...,
       [0.59097383, 0.50058699, 0.64294482, ..., 0.97936782, 0.25013807,
        0.23901405],
       [0.48856935, 0.97073131, 0.45933961, ..., 0.70106356, 0.6889186 ,
        0.37665326],
       [0.08056828, 0.24073048, 0.71498597, ..., 0.3738099 , 0.94836747,
        0.95779669]])

In [7]:
y

array([[0.10570701, 0.09614525, 0.10467622, 0.10313973, 0.10076265,
        0.09742207, 0.09344511, 0.09944404, 0.0983814 , 0.10087652],
       [0.10617315, 0.09628343, 0.10417493, 0.10325622, 0.10109316,
        0.09711629, 0.09352671, 0.09963159, 0.09820417, 0.10054034],
       [0.10548503, 0.09624465, 0.10464122, 0.10342485, 0.10137393,
        0.09720527, 0.09342952, 0.0994257 , 0.09805932, 0.1007105 ],
       [0.10578654, 0.09642624, 0.10427677, 0.10291702, 0.10118839,
        0.09722556, 0.09333388, 0.09956796, 0.09846446, 0.10081319],
       [0.10596829, 0.09611379, 0.104182  , 0.10311817, 0.10121774,
        0.09725571, 0.09367103, 0.09946348, 0.0981176 , 0.10089219],
       [0.10585183, 0.0964614 , 0.10435046, 0.10324876, 0.10130059,
        0.09741796, 0.09306886, 0.09936615, 0.09801256, 0.10092143],
       [0.10553975, 0.09634161, 0.10409341, 0.10279944, 0.10075795,
        0.09774254, 0.0936028 , 0.09991489, 0.09832449, 0.10088311],
       [0.1057161 , 0.09631046, 0.1045587

In [8]:
x = np.random.rand(100, 784) # dummy input data (100 images' worth)
t = np.random.rand(100, 10)  # dummy target labels (100 images' worth)

grads = net.numerical_gradient(x, t) # compute the gradients; this took about 5 minutes

In [9]:
grads['W1'].shape 

(784, 100)

In [10]:
grads['b1'].shape

(100,)

In [11]:
grads['W2'].shape 

(100, 10)

In [12]:
grads['b2'].shape

(10,)

#### 4.5.2 ミニバッチ学習の実装

In [13]:
import numpy as np
from dataset.mnist import load_mnist

In [14]:
# Load the MNIST train/test split with normalize=True and one-hot labels.
(x_train, t_train), (x_test, t_test) = \
    load_mnist(normalize=True, one_hot_label=True)

# Loss value recorded after every training iteration.
train_loss_list = []

In [17]:
# Hyperparameters
# iters_num = 10000  (full-length run, disabled in favor of the short run below)
iters_num = 10
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

In [18]:
# Network for the mini-batch training run: 784 inputs, 50 hidden units, 10 outputs.
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

In [19]:
from datetime import datetime
# Mini-batch gradient descent; a timestamp is printed before each phase so the
# cost of every step is visible in the output.
for i in range(iters_num):
    print(datetime.now(),i,"start")
    # Draw a mini-batch: batch_size random indices into the training set
    # (np.random.choice samples with replacement by default)
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Compute the gradients
    print(datetime.now(),i, "calculating gradient..")
    grad = network.numerical_gradient(x_batch, t_batch)
    # grad = network.gradient(..)  # faster alternative (no such method on TwoLayerNet in this file)
    
    # Update the parameters
    print(datetime.now(),i, "updating params..")
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    # Record training progress
    print(datetime.now(),i,"recording loss..")
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
# Did not finish even after more than 10 hours (presumably with iters_num = 10000).
# With 10 iterations it takes about 11.5 minutes.


2021-11-16 22:47:51.941923 0 start
2021-11-16 22:47:51.944228 0 calculating gradient..
2021-11-16 22:49:00.604404 0 updating params..
2021-11-16 22:49:00.605454 0 recording loss..
2021-11-16 22:49:00.607454 1 start
2021-11-16 22:49:00.608317 1 calculating gradient..
2021-11-16 22:50:13.435413 1 updating params..
2021-11-16 22:50:13.436402 1 recording loss..
2021-11-16 22:50:13.437770 2 start
2021-11-16 22:50:13.438645 2 calculating gradient..
2021-11-16 22:51:22.218434 2 updating params..
2021-11-16 22:51:22.220093 2 recording loss..
2021-11-16 22:51:22.221467 3 start
2021-11-16 22:51:22.222178 3 calculating gradient..
2021-11-16 22:52:30.279966 3 updating params..
2021-11-16 22:52:30.280915 3 recording loss..
2021-11-16 22:52:30.282519 4 start
2021-11-16 22:52:30.283709 4 calculating gradient..
2021-11-16 22:53:38.526185 4 updating params..
2021-11-16 22:53:38.527551 4 recording loss..
2021-11-16 22:53:38.529349 5 start
2021-11-16 22:53:38.530301 5 calculating gradient..
2021-11-16 22

- 0:01:15 / loop
- 75sec / loop
- 10,000 loopだと
  - 750,000 sec
  - 12,500 min
  - 208.3 hour


#### 4.5.3 テストデータで評価

In [60]:
import numpy as np
from dataset.mnist import load_mnist

In [61]:
# Load the MNIST train/test split with normalize=True and one-hot labels.
(x_train, t_train), (x_test, t_test) = \
    load_mnist(normalize=True, one_hot_label=True)

train_size = x_train.shape[0]
train_loss_list = []  # loss after every iteration
train_acc_list = []   # training accuracy, one entry per epoch
test_acc__list = []   # test accuracy, one entry per epoch
# Number of iterations per epoch. Floor division keeps this an integer, so the
# `i % iter_per_epoch == 0` check still fires once per epoch when train_size
# is not an exact multiple of batch_size.
# NOTE(review): batch_size is read here before the hyperparameter cell that
# follows assigns it; this relies on a value left over from an earlier cell.
iter_per_epoch = max(train_size // batch_size, 1)


In [62]:
# Hyperparameters
iters_num = 10000
batch_size = 100
learning_rate = 0.1


In [63]:
# Fresh network for the run with per-epoch evaluation: 784 inputs, 50 hidden units, 10 outputs.
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

In [64]:
from datetime import datetime
# Mini-batch gradient descent with accuracy evaluation once per epoch; a
# timestamp is printed before each phase.
for i in range(iters_num):
    print(datetime.now(),"start",i)
    # Draw a mini-batch: batch_size random indices into the training set
    # (np.random.choice samples with replacement by default)
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Compute the gradients
    print(datetime.now(),"calculating gradient..",i)
    grad = network.numerical_gradient(x_batch, t_batch)
    # grad = network.gradient(..)  # faster alternative (no such method on TwoLayerNet in this file)
    
    # Update the parameters
    print(datetime.now(),"updating params..",i)
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    
    # Record training progress
    print(datetime.now(),"recording loss..",i)
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    
    # Evaluate accuracy once per epoch (every iter_per_epoch iterations,
    # including i == 0) on the full training and test sets
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc__list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + "," +str(test_acc))


2018-05-30 19:05:41.326270 start 0
2018-05-30 19:05:41.327250 calculating gradient.. 0
2018-05-30 19:06:59.391969 updating params.. 0
2018-05-30 19:06:59.392604 recording loss.. 0
train acc, test acc | 0.100983333333,0.1003
2018-05-30 19:07:00.169775 start 1
2018-05-30 19:07:00.170555 calculating gradient.. 1
2018-05-30 19:08:18.287752 updating params.. 1
2018-05-30 19:08:18.288498 recording loss.. 1
2018-05-30 19:08:18.290601 start 2
2018-05-30 19:08:18.291282 calculating gradient.. 2
2018-05-30 19:09:35.520129 updating params.. 2
2018-05-30 19:09:35.521195 recording loss.. 2
2018-05-30 19:09:35.522597 start 3
2018-05-30 19:09:35.523180 calculating gradient.. 3
2018-05-30 19:10:53.301385 updating params.. 3
2018-05-30 19:10:53.302177 recording loss.. 3
2018-05-30 19:10:53.303760 start 4
2018-05-30 19:10:53.304647 calculating gradient.. 4
2018-05-30 19:12:10.802603 updating params.. 4
2018-05-30 19:12:10.803771 recording loss.. 4
2018-05-30 19:12:10.805571 start 5
2018-05-30 19:12:10.8

KeyboardInterrupt: 