## 准备数据

In [62]:
import numpy as np
import torch
import torchvision
from torch.utils.data import DataLoader

def mnist_dataset():
    train_data = torchvision.datasets.MNIST('./data/', train=True, download=True)
    test_data = torchvision.datasets.MNIST('./data/', train=False, download=True)
    train_data = [train_data.data.detach().numpy()/255.0, train_data.targets.detach().numpy()]
    test_data = [test_data.data.detach().numpy()/255.0, test_data.targets.detach().numpy()]
    return train_data, test_data

## Demo numpy based auto differentiation

In [63]:
import numpy as np

class Matmul:
    def __init__(self):
        self.mem = {}
        
    def forward(self, x, W):
        h = np.matmul(x, W)
        self.mem={'x': x, 'W':W}
        return h
    
    def backward(self, grad_y):
        '''
        x: shape(N, d)
        w: shape(d, d')
        grad_y: shape(N, d')
        '''
        x = self.mem['x']
        W = self.mem['W']
        # FILL IN HERE
        grad_x = grad_y.dot(W.T)
        grad_W = x.T.dot(grad_y)
        return grad_x, grad_W


class Relu:
    def __init__(self):
        self.mem = {}
        
    def forward(self, x):
        self.mem['x']=x
        return np.where(x > 0, x, np.zeros_like(x))
    
    def backward(self, grad_y):
        '''
        grad_y: same shape as x
        '''
        # FILL IN HERE
        x = self.mem['x']
        grad_x = np.where(x > 0, grad_y, np.zeros_like(x))
        return grad_x
    


class Softmax:
    '''
    softmax over last dimention
    '''
    def __init__(self):
        self.epsilon = 1e-12
        self.mem = {}
        
    def forward(self, x):
        '''
        x: shape(N, c)
        '''
        x_exp = np.exp(x)
        partition = np.sum(x_exp, axis=1, keepdims=True)
        out = x_exp/(partition+self.epsilon)
        self.mem['out'] = out
        self.mem['x_exp'] = x_exp
        return out
    
    def backward(self, grad_y):
        '''
        grad_y: same shape as x
        '''
        s = self.mem['out']
        sisj = np.matmul(np.expand_dims(s,axis=2), np.expand_dims(s, axis=1)) # (N, c, c)
        g_y_exp = np.expand_dims(grad_y, axis=1)
        tmp = np.matmul(g_y_exp, sisj) #(N, 1, c)
        tmp = np.squeeze(tmp, axis=1)
        tmp = -tmp+grad_y*s 
        return tmp
    
class Log:
    '''
    softmax over last dimention
    '''
    def __init__(self):
        self.epsilon = 1e-12
        self.mem = {}
        
    def forward(self, x):
        '''
        x: shape(N, c)
        '''
        out = np.log(x+self.epsilon)
        
        self.mem['x'] = x
        return out
    
    def backward(self, grad_y):
        '''
        grad_y: same shape as x
        '''
        x = self.mem['x']
        
        return 1./(x+1e-12) * grad_y
    


## Gradient check

In [64]:
# Check Matmul()
import torch.nn as nn

x = np.random.normal(size=[5, 6])
W = np.random.normal(size=[6, 4])
aa = Matmul()
out = aa.forward(x, W) # shape(5, 4)
grad = aa.backward(np.ones_like(out))

x, W = torch.tensor(x, requires_grad=True), torch.tensor(W, requires_grad=True)
y = x.mm(W)
loss = y.sum()
loss.backward()
print((x.grad - torch.tensor(grad[0]))**2<1e-3, (W.grad - torch.tensor(grad[1]))**2<1e-3, sep='\n')

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])
tensor([[True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True],
        [True, True, True, True]])


In [65]:
# Check ReLU()
x = np.random.normal(size=[5, 6])
aa = Relu()
out = aa.forward(x) # shape(5, 4)
grad = aa.backward(np.ones_like(out))
print (grad)

relu = nn.ReLU()

x = torch.tensor(x, requires_grad=True)
loss = relu(x).sum()
loss.backward()
print((x.grad - torch.tensor(grad))**2<1e-3)

[[1. 0. 1. 0. 1. 0.]
 [0. 0. 1. 0. 0. 1.]
 [1. 1. 1. 1. 1. 1.]
 [0. 0. 1. 1. 0. 1.]
 [1. 1. 0. 0. 1. 0.]]
tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])


In [66]:
# Check Softmax
x = np.random.normal(size=[5, 6], scale=5.0, loc=1)
label = np.zeros_like(x)
label[0, 1]=1.
label[1, 0]=1
label[1, 1]=1
label[2, 3]=1
label[3, 5]=1
label[4, 0]=1
print(label)
aa = Softmax()
out = aa.forward(x) # shape(5, 6)
grad = aa.backward(label)
print (grad)

softmax = nn.Softmax(dim=-1)

x = torch.tensor(x, requires_grad=True)
y = softmax(x)
loss = (torch.tensor(label) * y).sum()
loss.backward()
print((x.grad - torch.tensor(grad))**2<1e-3)

[[0. 1. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]]
[[-2.87401404e-04  2.32522338e-02 -4.15491845e-05 -2.22738597e-02
  -2.28699139e-04 -4.20724324e-04]
 [ 3.17338384e-03  5.98030893e-08 -1.99841628e-04 -1.91661442e-03
  -1.04547486e-03 -1.15127382e-05]
 [-2.81105445e-08 -8.35164746e-07 -1.07322766e-08  1.02315159e-05
  -3.10737589e-06 -6.25013242e-06]
 [-4.56302065e-02 -1.08284833e-06 -1.15368347e-02 -2.49641161e-04
  -1.47851800e-05  5.74325504e-02]
 [ 1.54333069e-01 -7.43609129e-04 -1.42308577e-01 -5.41134827e-03
  -2.55296606e-04 -5.61423846e-03]]
tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])


In [67]:
# Check Log
x = np.random.normal(size=[5, 6])
aa = Log()
out = aa.forward(x) # shape(5, 4)
grad = aa.backward(label)
print (grad)

x = torch.tensor(x, requires_grad=True)
y = torch.log_(x + 1e-10)
loss = (torch.tensor(label) * y).sum()
loss.backward()
print((x.grad - torch.tensor(grad))**2<1e-3)

[[ 0.         -1.90761041  0.         -0.         -0.         -0.        ]
 [-1.04898531 -2.87090454 -0.          0.          0.          0.        ]
 [-0.          0.         -0.          0.51745855  0.          0.        ]
 [ 0.          0.         -0.          0.          0.          1.53038509]
 [-0.97192884  0.          0.         -0.         -0.          0.        ]]
tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True]])


  out = np.log(x+self.epsilon)


# Final Gradient Check

In [68]:
label = np.zeros_like(x.detach().numpy())
label[0, 1]=1.
label[1, 0]=1
label[2, 3]=1
label[3, 5]=1
label[4, 0]=1

x = np.random.normal(size=[5, 6])
W1 = np.random.normal(size=[6, 5])
W2 = np.random.normal(size=[5, 6])

mul_h1 = Matmul()
mul_h2 = Matmul()
relu = Relu()
softmax = Softmax()
log = Log()

h1 = mul_h1.forward(x, W1) # shape(5, 4)
h1_relu = relu.forward(h1)
h2 = mul_h2.forward(h1_relu, W2)
h2_soft = softmax.forward(h2)
h2_log = log.forward(h2_soft)


h2_log_grad = log.backward(label)
h2_soft_grad = softmax.backward(h2_log_grad)
h2_grad, W2_grad = mul_h2.backward(h2_soft_grad)
h1_relu_grad = relu.backward(h2_grad)
h1_grad, W1_grad = mul_h1.backward(h1_relu_grad)

print(W1_grad)
print('--'*20)

x = torch.tensor(x)
W1 = torch.tensor(W1, requires_grad=True)
W2 = torch.tensor(W2, requires_grad=True)
label =  torch.tensor(label)

relu1 = nn.ReLU()
softmax = nn.Softmax(dim=-1)

h1 = x.mm(W1)
h1_relu = relu1(h1)
h2 = h1_relu.mm(W2)
prob = softmax(h2)
log_prob = torch.log(prob)
loss = (log_prob * label).sum()
loss.backward()
print(W1.grad)
# Pytorch无法直接获取中间结果的梯度，因此选用最开始的W1梯度作比较，相同则证明反向传递链条无误。

[[ 0.00475074 -0.29924499 -0.13586521 -1.11082878  0.94324042]
 [ 0.87672573  0.53554158 -0.26349355 -0.68065352 -1.35321778]
 [ 0.67019987 -1.45276741  0.2873168   1.85739209 -1.87460072]
 [-1.14652132  1.04477673  0.19311893  1.02762317 -0.34878546]
 [ 0.14224971 -0.75947043  0.12755101 -0.01343931  0.42039601]
 [ 1.06268463  0.54506015  0.10691023  1.02909316 -1.64630779]]
----------------------------------------
tensor([[ 0.0048, -0.2992, -0.1359, -1.1108,  0.9432],
        [ 0.8767,  0.5355, -0.2635, -0.6807, -1.3532],
        [ 0.6702, -1.4528,  0.2873,  1.8574, -1.8746],
        [-1.1465,  1.0448,  0.1931,  1.0276, -0.3488],
        [ 0.1422, -0.7595,  0.1276, -0.0134,  0.4204],
        [ 1.0627,  0.5451,  0.1069,  1.0291, -1.6463]], dtype=torch.float64)


## 建立模型

In [69]:
class myModel:
    def __init__(self):
        
        self.W1 = np.random.normal(size=[28*28+1, 100])
        self.W2 = np.random.normal(size=[100, 10])
        
        self.mul_h1 = Matmul()
        self.mul_h2 = Matmul()
        self.relu = Relu()
        self.softmax = Softmax()
        self.log = Log()
        
        
    def forward(self, x):
        x = x.reshape(-1, 28*28)
        bias = np.ones(shape=[x.shape[0], 1])
        x = np.concatenate([x, bias], axis=1)
        
        self.h1 = self.mul_h1.forward(x, self.W1) # shape(5, 4)
        self.h1_relu = self.relu.forward(self.h1)
        self.h2 = self.mul_h2.forward(self.h1_relu, self.W2)
        self.h2_soft = self.softmax.forward(self.h2)
        self.h2_log = self.log.forward(self.h2_soft)
            
    def backward(self, label):
        self.h2_log_grad = self.log.backward(-label)
        self.h2_soft_grad = self.softmax.backward(self.h2_log_grad)
        self.h2_grad, self.W2_grad = self.mul_h2.backward(self.h2_soft_grad)
        self.h1_relu_grad = self.relu.backward(self.h2_grad)
        self.h1_grad, self.W1_grad = self.mul_h1.backward(self.h1_relu_grad)
        
model = myModel()


## 计算 loss

In [70]:
def compute_loss(log_prob, labels):
     return np.mean(np.sum(-log_prob*labels, axis=1))
    

def compute_accuracy(log_prob, labels):
    predictions = np.argmax(log_prob, axis=1)
    truth = np.argmax(labels, axis=1)
    return np.mean(predictions==truth)

def train_one_step(model, x, y):
    lr = 1e-5
    model.forward(x)
    model.backward(y)
    model.W1 -= lr* model.W1_grad
    model.W2 -= lr* model.W2_grad
    loss = compute_loss(model.h2_log, y)
    accuracy = compute_accuracy(model.h2_log, y)
    return loss, accuracy

def test(model, x, y):
    model.forward(x)
    loss = compute_loss(model.h2_log, y)
    accuracy = compute_accuracy(model.h2_log, y)
    return loss, accuracy

## 实际训练

In [72]:
train_data, test_data = mnist_dataset()
train_label = np.zeros(shape=[train_data[0].shape[0], 10])
test_label = np.zeros(shape=[test_data[0].shape[0], 10])
train_label[np.arange(train_data[0].shape[0]), np.array(train_data[1])] = 1.
test_label[np.arange(test_data[0].shape[0]), np.array(test_data[1])] = 1.

for epoch in range(50):
    loss, accuracy = train_one_step(model, train_data[0], train_label)
    print('epoch', epoch, ': loss', loss, '; accuracy', accuracy)
loss, accuracy = test(model, test_data[0], test_label)

print('test loss', loss, '; accuracy', accuracy)

epoch 0 : loss 16.42098832465115 ; accuracy 0.3424833333333333
epoch 1 : loss 14.742062529605798 ; accuracy 0.4037
epoch 2 : loss 13.954191072458672 ; accuracy 0.4263166666666667
epoch 3 : loss 13.701146372425942 ; accuracy 0.4439166666666667
epoch 4 : loss 12.741220838790587 ; accuracy 0.467
epoch 5 : loss 12.668149346197305 ; accuracy 0.47868333333333335
epoch 6 : loss 11.770831172213583 ; accuracy 0.5042666666666666
epoch 7 : loss 11.873352036788196 ; accuracy 0.5107166666666667
epoch 8 : loss 11.030306485745625 ; accuracy 0.5381
epoch 9 : loss 11.008182763373659 ; accuracy 0.54085
epoch 10 : loss 10.54640524708077 ; accuracy 0.5608
epoch 11 : loss 10.353594095679497 ; accuracy 0.5675333333333333
epoch 12 : loss 9.975575656166471 ; accuracy 0.5850833333333333
epoch 13 : loss 9.825150933861048 ; accuracy 0.589
epoch 14 : loss 9.707710833554012 ; accuracy 0.5965666666666667
epoch 15 : loss 9.949793606936263 ; accuracy 0.5849333333333333
epoch 16 : loss 10.023168622384894 ; accuracy 0.