In [1]:
# 実装の見直し

import numpy as np

class Relu:
    """ReLU activation layer: passes positive inputs through and zeroes the rest.

    The boolean mask computed in ``forward`` is cached on the instance and
    reused by ``backward`` to block gradients where the input was <= 0.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element <= 0 replaced by 0.

        Args:
            x: NumPy array of pre-activations.

        Returns:
            A new array of the same shape as ``x``; ``x`` itself is not modified.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Propagate the upstream gradient through the ReLU.

        Args:
            dout: Upstream gradient, same shape as the last forward input.

        Returns:
            A new array equal to ``dout`` with entries zeroed where the
            forward input was <= 0.
        """
        # Work on a copy: writing into ``dout`` directly would silently
        # corrupt the caller's gradient buffer.
        dx = dout.copy()
        dx[self.mask] = 0

        return dx
    
class Sigmoid:
    """Logistic sigmoid layer; caches its output for the backward pass."""

    def __init__(self):
        # Result of the most recent forward call.
        self.out = None

    def forward(self, x):
        """Compute 1 / (1 + exp(-x)) element-wise and remember the result."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Scale the upstream gradient by the sigmoid derivative y * (1 - y)."""
        y = self.out
        return dout * (1.0 - y) * y
    
class Affine:
    """Fully connected layer computing ``x . W + b``.

    Holds references to the weight and bias arrays it is given, caches the
    last input, and stores the parameter gradients in ``dW`` / ``db`` after
    each backward pass.
    """

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Input of the most recent forward call (needed for dW).
        self.x = None
        # Parameter gradients, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store dW and db for the cached input and return the input gradient."""
        self.dW = np.dot(self.x.T, dout)
        # The bias is added to every row, so its gradient sums over axis 0.
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T)
    
class SoftmaxWithLoss:
    """Softmax activation fused with cross-entropy loss.

    ``forward`` caches the softmax output ``y`` and the one-hot targets ``t``
    so that ``backward`` can return the combined gradient ``(y - t) / batch``.
    """

    def __init__(self):
        self.loss = None  # scalar loss from the last forward call
        self.y = None     # softmax output from the last forward call
        self.t = None     # one-hot targets from the last forward call

    def cross_entropy_error(self, y, t):
        """Mean cross-entropy between probabilities ``y`` and one-hot ``t``.

        Args:
            y: Probabilities, shape (batch, classes) or (classes,).
            t: One-hot targets with the same shape as ``y``.

        Returns:
            Sum of ``-t * log(y)`` divided by the batch size.
        """
        if y.ndim == 1:
            # Treat a single sample as a batch of one.
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
        batch_size = y.shape[0]

        # A saturated softmax yields exact zeros; log(0) = -inf and
        # 0 * -inf = nan would poison the loss, so offset by a tiny constant.
        delta = 1e-7
        return -np.sum(t * np.log(y + delta)) / batch_size

    def softmax(self, x):
        """Numerically stable softmax.

        A 2-D input is treated as a batch and normalised independently per
        row; any other input is normalised over all of its elements.
        """
        if x.ndim == 2:
            # Subtract each row's maximum to avoid overflow in exp.
            shifted = x - np.max(x, axis=1, keepdims=True)
            exp_x = np.exp(shifted)
            return exp_x / np.sum(exp_x, axis=1, keepdims=True)

        shifted = x - np.max(x)  # overflow guard
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x)

    def forward(self, x, t):
        """Compute and cache the softmax output and the loss; return the loss."""
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Gradient of the loss with respect to the logits passed to forward.

        Args:
            dout: Upstream gradient of the loss (defaults to 1).

        Returns:
            ``(y - t) * dout / batch_size``.
        """
        # Use the same batch-size convention as cross_entropy_error: a 1-D
        # output is one sample, not ``len(t)`` samples.
        batch_size = self.y.shape[0] if self.y.ndim == 2 else 1
        dx = (self.y - self.t) * dout / batch_size

        return dx

In [2]:
import numpy as np
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer fully connected network: Affine -> ReLU -> Affine -> softmax loss.

    Parameters live in ``self.params`` under the keys 'W1', 'b1', 'W2', 'b2'.
    The Affine layers are constructed with those same array objects, so
    in-place updates to ``params`` are seen by the layers.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and build the layer stack.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            weight_init_std: Scale applied to the standard-normal initial weights.
        """
        # Weight initialisation.
        self.params = {}
        # W1: (input_size, hidden_size) weight matrix.
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        # W2: (hidden_size, output_size) weight matrix.
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        # b1: one bias per hidden unit.
        self.params['b1'] = np.zeros(hidden_size)
        # b2: one bias per output unit.
        self.params['b2'] = np.zeros(output_size)

        # Insertion order is the forward order; gradient() walks it in reverse.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the loss layer and return the result."""
        for layer in self.layers.values():
            x = layer.forward(x)

        return x

    def loss(self, x, t):
        """Return the loss of the network on inputs ``x`` with targets ``t``."""
        y = self.predict(x)

        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        """Fraction of rows in ``x`` whose arg-max prediction matches ``t``.

        ``t`` may be one-hot (2-D) or a 1-D array of class indices.
        """
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, f, x):
        """Central-difference gradient of ``f`` with respect to the array ``x``.

        ``x`` is perturbed in place one element at a time and restored
        afterwards; ``f`` is called with ``x`` after each perturbation.

        Args:
            f: Callable taking ``x`` and returning a scalar.
            x: Float NumPy array to differentiate with respect to.

        Returns:
            Array shaped like ``x`` holding the estimated partial derivatives.
        """
        h = 1e-4  # 0.0001
        grad = np.zeros_like(x)

        it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            idx = it.multi_index
            tmp_val = x[idx]
            x[idx] = float(tmp_val) + h
            fxh1 = f(x)  # f(x+h)

            x[idx] = tmp_val - h
            fxh2 = f(x)  # f(x-h)
            grad[idx] = (fxh1 - fxh2) / (2 * h)

            x[idx] = tmp_val  # restore the original value
            it.iternext()

        return grad

    # Gradient computation by numerical differentiation.
    def nu_gradient(self, x, t):
        """Estimate the gradients of the loss for every parameter numerically.

        Returns:
            Dict with keys 'W1', 'W2', 'b1', 'b2'.
        """
        # The argument is ignored: the loss reads the parameters through
        # self.params, which numerical_gradient perturbs in place.
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        # numerical_gradient is a method, so it must be reached through self;
        # the bare name is undefined at call time.
        grads['W1'] = self.numerical_gradient(loss_W, self.params['W1'])
        grads['W2'] = self.numerical_gradient(loss_W, self.params['W2'])
        grads['b1'] = self.numerical_gradient(loss_W, self.params['b1'])
        grads['b2'] = self.numerical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, t):
        """Compute the gradients of the loss for every parameter by backpropagation.

        Returns:
            Dict with keys 'W1', 'b1', 'W2', 'b2'.
        """
        # forward
        self.loss(x, t)

        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)

        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored during backward.
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db

        return grads

In [5]:
#coding: utf-8
import numpy as np
# fetch_mldata is no longer available in scikit-learn (its backing service is
# offline); fetch_openml is the supported way to obtain MNIST.
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

# Download MNIST (cached under the current directory).
mnist = fetch_openml('mnist_784', version=1, data_home=".")

# np.asarray accepts either an ndarray or a DataFrame/Series return value.
# OpenML delivers the labels as strings, so convert them to integers.
mnist_images = np.asarray(mnist['data'], dtype=np.float64)
mnist_labels = np.asarray(mnist['target']).astype(np.int64)

# Training data: the first 60000 samples.
x_train = mnist_images[:60000]
# OneHotEncoder expects a 2-D column of shape (n_samples, 1).
t_train = mnist_labels[:60000].reshape(-1, 1)

# Encode the labels as one-hot vectors. The categories are inferred from the
# training labels (the removed n_values argument is no longer needed).
encoder = OneHotEncoder(categories='auto')
t_train = encoder.fit_transform(t_train).toarray()

# Test data: the remaining samples.
x_test = mnist_images[60000:]
t_test = mnist_labels[60000:].reshape(-1, 1)

# Reuse the encoder fitted on the training labels.
t_test = encoder.transform(t_test).toarray()

# Scale pixel values to [0, 1] by dividing by the maximum value in each split.
x_train = x_train / x_train.max()
x_test = x_test / x_test.max()

train_loss_list = []
train_acc_list  = []
test_acc_list   = []

# Hyperparameters.
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# Evaluation interval in iterations. A true epoch would be
# max(train_size / batch_size, 1) iterations; accuracy is reported every
# 100 iterations instead.
iter_per_epoch = 100

network = TwoLayerNet(input_size=784,hidden_size=50,output_size=10)

for i in range(iters_num):

    # Draw batch_size sample indices (with replacement) from the training set.
    batch_mask = np.random.choice(train_size,batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradients by backpropagation.
    grad = network.gradient(x_batch,t_batch)

    # SGD step: param <- param - learning_rate * grad. The update is in place
    # so the layers, which hold the same arrays, see the new values.
    for key in ('W1','W2','b1','b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the mini-batch loss after the update.
    loss = network.loss(x_batch,t_batch)
    train_loss_list.append(loss)

    # Evaluate accuracy on the full train and test sets at each interval.
    if i % iter_per_epoch == 0:

        # acc
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)

        # Store the accuracies.
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))

train acc, test acc | 0.0952833333333, 0.1006
train acc, test acc | 0.69415, 0.6995
train acc, test acc | 0.8448, 0.852
train acc, test acc | 0.871416666667, 0.8792
train acc, test acc | 0.888816666667, 0.8952
train acc, test acc | 0.897533333333, 0.9021
train acc, test acc | 0.902033333333, 0.9071
train acc, test acc | 0.90755, 0.9125
train acc, test acc | 0.909683333333, 0.9131
train acc, test acc | 0.915833333333, 0.9178
train acc, test acc | 0.91595, 0.9174
train acc, test acc | 0.920166666667, 0.922
train acc, test acc | 0.924016666667, 0.9239
train acc, test acc | 0.926116666667, 0.9281
train acc, test acc | 0.9283, 0.9287
train acc, test acc | 0.931216666667, 0.9312
train acc, test acc | 0.933516666667, 0.9337
train acc, test acc | 0.934566666667, 0.9305
train acc, test acc | 0.937966666667, 0.9377
train acc, test acc | 0.939016666667, 0.9351
train acc, test acc | 0.940666666667, 0.9376
train acc, test acc | 0.941016666667, 0.9384
train acc, test acc | 0.944383333333, 0.9428
tra