In [1]:
# 実装の見直し

import numpy as np

class Relu:
    """Rectified linear unit layer: keeps positive inputs and zeroes the rest."""

    def __init__(self):
        # Boolean array marking the inputs that were <= 0 in the most recent
        # forward pass; None until forward() has been called.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` in which every element <= 0 is set to 0.

        Args:
            x: NumPy array of pre-activations. It is not modified.

        Returns:
            A new array with the same shape as ``x``.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, dout):
        """Propagate the upstream gradient through the activation.

        Args:
            dout: NumPy array holding the upstream gradient, indexable by the
                mask recorded in forward(). It is not modified.

        Returns:
            A new array equal to ``dout`` except that positions where the
            forward input was <= 0 are set to 0.
        """
        # Work on a copy: writing the zeros into `dout` itself would silently
        # corrupt the caller's array if it is reused after this call.
        dx = dout.copy()
        dx[self.mask] = 0

        return dx
    
class Sigmoid:
    """Logistic sigmoid activation layer, y = 1 / (1 + exp(-x))."""

    def __init__(self):
        # Output of the most recent forward pass, reused by backward();
        # None until forward() has been called.
        self.out = None

    def forward(self, x):
        """Apply the sigmoid element-wise, cache the result and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Scale the upstream gradient by the local derivative y * (1 - y),
        where y is the cached forward output."""
        y = self.out
        return dout * (1.0 - y) * y
    
class Affine:
    """Fully connected layer computing ``x . W + b``."""

    def __init__(self, W, b):
        """Store the weight matrix ``W`` and bias ``b``.

        The arrays are kept by reference (not copied), so in-place updates
        made to them by the caller are visible to this layer.
        """
        self.W = W
        self.b = b
        # Input of the most recent forward pass; needed to compute dW.
        self.x = None
        # Gradients with respect to W and b, filled in by backward().
        self.dW = None
        self.db = None

    def forward(self, x):
        """Return ``np.dot(x, W) + b`` and remember ``x`` for backward().

        Args:
            x: either a batch (2-D, one sample per row) or a single 1-D
                sample.
        """
        self.x = x
        out = np.dot(x, self.W) + self.b

        return out

    def backward(self, dout):
        """Compute gradients for the most recent forward pass.

        Args:
            dout: upstream gradient with the same shape as forward()'s output.

        Returns:
            Gradient with respect to the forward input, ``np.dot(dout, W.T)``.
            As a side effect ``self.dW`` and ``self.db`` are set.
        """
        dx = np.dot(dout, self.W.T)

        # forward() accepts a single 1-D sample, so treat that as a batch of
        # one here. Without the promotion, x.T is a no-op on a 1-D array and
        # np.dot of two vectors cannot produce a W-shaped gradient, while the
        # axis-0 sum would collapse db to a scalar.
        x_2d = np.atleast_2d(self.x)
        dout_2d = np.atleast_2d(dout)
        self.dW = np.dot(x_2d.T, dout_2d)
        self.db = np.sum(dout_2d, axis=0)

        return dx
    
class SoftmaxWithLoss:
    """Softmax activation fused with a cross-entropy loss.

    Targets are expected to have the same shape as the scores (e.g. one-hot
    rows), because the loss multiplies ``t`` element-wise with ``log(y)``.
    """

    def __init__(self):
        # Loss value, softmax output and targets of the most recent forward().
        self.loss = None
        self.y = None
        self.t = None

    def cross_entropy_error(self, y, t):
        """Return the cross-entropy between ``y`` and ``t`` averaged per sample.

        Args:
            y: probabilities, either 1-D (one sample) or 2-D (one row per
                sample).
            t: targets with the same number of elements as ``y``.
        """
        if y.ndim == 1:
            # Treat a single sample as a batch of size one.
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
        batch_size = y.shape[0]

        # Clamp before taking the log: a probability that underflowed to 0
        # would give log(0) = -inf, and 0 * -inf (for a zero target entry)
        # turns the whole sum into NaN.
        safe_y = np.maximum(y, 1e-12)

        return -np.sum(t * np.log(safe_y)) / batch_size

    def softmax(self, x):
        """Return the softmax of ``x``.

        A 2-D input is normalised row by row (each row is one sample); any
        other input is normalised over all of its elements. That is why the
        two cases are handled separately.
        """
        if x.ndim == 2:
            # Subtract each row's maximum so exp() cannot overflow.
            shifted = x - np.max(x, axis=1, keepdims=True)
            exps = np.exp(shifted)
            return exps / np.sum(exps, axis=1, keepdims=True)

        shifted = x - np.max(x)  # guard against overflow in exp()
        exps = np.exp(shifted)
        return exps / np.sum(exps)

    def forward(self, x, t):
        """Compute, cache and return the loss of scores ``x`` against ``t``."""
        self.t = t
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss with respect to the scores.

        Args:
            dout: upstream gradient of the loss value (default 1).

        Returns:
            ``dout * (y - t) / batch_size`` with the shape of the scores.
        """
        # Use the same batch-size convention as the forward pass: a 1-D
        # sample counts as one sample, not as `number of classes` samples.
        batch_size = 1 if self.y.ndim == 1 else self.y.shape[0]
        dx = dout * (self.y - self.t) / batch_size

        return dx

In [2]:
import numpy as np
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer fully connected classifier.

    The forward path is Affine1 -> Relu1 -> Affine2, followed by a
    SoftmaxWithLoss layer that is only used when computing the loss.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and wire up the layers.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden units.
            output_size: number of output classes.
            weight_init_std: accepted for interface compatibility but not
                used; the weights are always scaled by sqrt(2 / fan_in)
                (He initialisation).
        """
        # The two random draws happen in this order (W1, then W2).
        w1 = np.sqrt(2/input_size) * np.random.randn(input_size, hidden_size)
        w2 = np.sqrt(2/hidden_size) * np.random.randn(hidden_size, output_size)

        # Biases start at zero: one per hidden unit and one per output unit.
        self.params = {
            'W1': w1,
            'W2': w2,
            'b1': np.zeros(hidden_size),
            'b2': np.zeros(output_size),
        }

        # The Affine layers receive the very same array objects that live in
        # self.params, so in-place parameter updates are seen by the layers.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer in order and return the raw scores."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the softmax cross-entropy loss of the scores for ``x`` against ``t``."""
        return self.lastLayer.forward(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose arg-max score matches ``t``.

        ``t`` may be a 1-D array of class indices; any other shape is reduced
        with arg-max along axis 1 first.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def gradient(self, x, t):
        """Return the parameter gradients obtained by backpropagation.

        Returns:
            dict with keys 'W1', 'b1', 'W2', 'b2'.
        """
        # Forward pass: populates the per-layer caches used below.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored on themselves.
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

In [4]:
class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        # Step size applied to every gradient.
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to every entry of ``params``.

        ``params`` and ``grads`` are dicts sharing the same keys; the entries
        of ``params`` are updated with ``-=``.
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step
            
class Momentum:
    """SGD with momentum: ``v = momentum * v - lr * grad``, then ``param += v``."""

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        # Per-parameter velocity, created lazily on the first update() call.
        self.v = None

    def update(self, params, grads):
        """Advance every entry of ``params`` by its updated velocity."""
        if self.v is None:
            # Start every velocity at zero, shaped like its parameter.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum*self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity
            
class AdaGrad:
    """AdaGrad: scales each step by the root of the accumulated squared gradients."""

    def __init__(self, lr=0.01):
        self.lr = lr
        # Per-parameter running sum of squared gradients, created lazily on
        # the first update() call.
        self.h = None

    def update(self, params, grads):
        """Accumulate squared gradients and apply the scaled descent step."""
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad
            # The 1e-7 keeps the division defined while the accumulator is zero.
            params[name] -= self.lr * grad / (np.sqrt(self.h[name]) + 1e-7)
            
class Adam:
    """Adam optimizer (http://arxiv.org/abs/1412.6980v8).

    Keeps exponential moving averages of the gradient (``m``) and of the
    squared gradient (``v``) for every parameter, and folds the bias
    correction of both averages into a per-step learning rate.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        # Number of update() calls made so far.
        self.iter = 0
        # Moving averages, created lazily on the first update() call.
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to every entry of ``params``."""
        if self.m is None:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iter += 1
        # Learning rate with the bias correction for both averages folded in.
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for name in params:
            grad = grads[name]
            # Incremental form of m = beta1*m + (1-beta1)*g and
            # v = beta2*v + (1-beta2)*g**2.
            self.m[name] += (1 - self.beta1) * (grad - self.m[name])
            self.v[name] += (1 - self.beta2) * (grad**2 - self.v[name])

            params[name] -= lr_t * self.m[name] / (np.sqrt(self.v[name]) + 1e-7)


In [5]:
#coding: utf-8
import numpy as np
from sklearn.datasets import fetch_mldata
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

# Download MNIST (cached under the current directory via data_home=".").
# NOTE(review): fetch_mldata and OneHotEncoder's n_values argument appear to
# have been removed from later scikit-learn releases -- confirm against the
# installed version before re-running this cell.
mnist = fetch_mldata('MNIST original', data_home=".")

# Training data: the first 60000 samples.
x_train = mnist['data'][:60000]
t_train = mnist['target'][:60000]

# Turn the label vector into a single column (-1 lets NumPy infer the length),
# which is the 2-D layout the encoder below is given.
# (Original author's note: this step is redundant.)
t_train = t_train.reshape(1, -1).transpose()

# Encode labels:
# represent every label as a one-hot row.
encoder = OneHotEncoder(n_values=max(t_train)+1)
t_train = encoder.fit_transform(t_train).toarray()

# Test data: everything after the first 60000 samples.
x_test = mnist['data'][60000:]
t_test = mnist['target'][60000:]

t_test = t_test.reshape(1, -1).transpose()

# Encode labels:
# reuse the encoder fitted on the training labels (transform only, no re-fit).
t_test = encoder.transform(t_test).toarray()

# Convert the pixel data to float64 and scale it into [0, 1] by dividing by
# the array's own maximum (the original author notes the data is integer-typed
# with max = 255). Train and test are each scaled by their own maximum.
x_train  = x_train.astype(np.float64)
x_train /= x_train.max()
x_test   = x_test.astype(np.float64)
x_test  /= x_test.max()

# Histories filled in by the training loop below.
train_loss_list = []
train_acc_list  = []
test_acc_list   = []

# Hyperparameters.
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
# NOTE(review): learning_rate is not passed to any optimizer in this cell;
# Adam() below is constructed with its own default lr.
learning_rate = 0.1

# Number of iterations between accuracy evaluations. The commented-out line
# would use a true epoch (train_size / batch_size); the active value is a
# fixed 100 iterations instead.
#iter_per_epoch = max(train_size / batch_size, 1)
iter_per_epoch = 100

network = TwoLayerNet(input_size=784,hidden_size=50,output_size=10)
# Alternative optimizers, kept for quick switching.
#opt = SGD()
#opt = Momentum()
#opt = AdaGrad()
opt = Adam()

for i in range(iters_num):
    
    # Draw batch_size random sample indices from range(train_size) to form the
    # mini-batch (np.random.choice samples with replacement unless told otherwise).
    batch_mask = np.random.choice(train_size,batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    
    # Compute the gradients by backpropagation and apply one optimizer step.
    # The optimizer receives the network's own params dict.
    grads = network.gradient(x_batch,t_batch)
    params= network.params
    opt.update(params, grads)
    
    # Record the loss on this mini-batch, evaluated after the parameter update.
    loss = network.loss(x_batch,t_batch)
    train_loss_list.append(loss)
    
    # Every iter_per_epoch iterations, measure accuracy on the full train and
    # test sets.
    if i % iter_per_epoch == 0:
    
        # Accuracy on the training and test sets.
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)

        # Store the accuracies and report them.
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc))
    

train acc, test acc | 0.110283333333, 0.1083
train acc, test acc | 0.87665, 0.8831
train acc, test acc | 0.904533333333, 0.9061
train acc, test acc | 0.912716666667, 0.9149
train acc, test acc | 0.921566666667, 0.9204
train acc, test acc | 0.926133333333, 0.9268
train acc, test acc | 0.933083333333, 0.9341
train acc, test acc | 0.937, 0.936
train acc, test acc | 0.94025, 0.9394
train acc, test acc | 0.941566666667, 0.9408
train acc, test acc | 0.94125, 0.9394
train acc, test acc | 0.9484, 0.9448
train acc, test acc | 0.9506, 0.9496
train acc, test acc | 0.953916666667, 0.9535
train acc, test acc | 0.9552, 0.9531
train acc, test acc | 0.956616666667, 0.9537
train acc, test acc | 0.957016666667, 0.9528
train acc, test acc | 0.9592, 0.9574
train acc, test acc | 0.9599, 0.9555
train acc, test acc | 0.9608, 0.9558
train acc, test acc | 0.961833333333, 0.9586
train acc, test acc | 0.962583333333, 0.9578
train acc, test acc | 0.96275, 0.9565
train acc, test acc | 0.96565, 0.9609
train acc, te