In [3]:
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import d2lzh_pytorch as d2l

def dropout(X, drop_prob):
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    
    if keep_prob == 0:  # 全都丢弃呗
        return torch.zeros_like(X)
    elif drop_prob == 0: # 不丢弃
        return X
    mask = (torch.randn(X.shape) > drop_prob).float() # 这个是小于保存概率的
    
    return mask * X / keep_prob

# 定义模型参数

In [4]:
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

W1 = torch.tensor(np.random.normal(0, 0.01, size=(num_inputs, num_hiddens1)), dtype=torch.float, requires_grad=True)
b1 = torch.zeros(num_hiddens1, requires_grad=True)
W2 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens1, num_hiddens2)), dtype=torch.float, requires_grad=True)
b2 = torch.zeros(num_hiddens2, requires_grad=True)
W3 = torch.tensor(np.random.normal(0, 0.01, size=(num_hiddens2, num_outputs)), dtype=torch.float, requires_grad=True)
b3 = torch.zeros(num_outputs, requires_grad=True)

params = [W1, b1, W2, b2, W3, b3]

# 定义模型

In [5]:
drop_prob1, drop_prob2 = 0.2, 0.5

def net(X, is_training = True):
    X = X.view(-1, num_inputs)
    H1 = (torch.matmul(X, W1) + b1).relu()
    if is_training: # 只在训练模型的时候用丢弃法
        H1 = dropout(H1, drop_prob1)
    H2 = (torch.matmul(H1, W2) + b2).relu()
    if is_training:
        H2 = dropout(H2, drop_prob2)
    return torch.matmul(H2, W3) + b3

# 训练模型

In [6]:
num_epochs, lr, batch_size = 5, 100.0, 256
loss = torch.nn.CrossEntropyLoss()
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params, lr)

epoch 1,loss 0.0067, train acc 0.333, test acc 0.581
epoch 2,loss 0.0034, train acc 0.674, test acc 0.704
epoch 3,loss 0.0026, train acc 0.759, test acc 0.773
epoch 4,loss 0.0023, train acc 0.793, test acc 0.796
epoch 5,loss 0.0021, train acc 0.808, test acc 0.780


# 简洁实现

In [7]:
net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),
    nn.Linear(num_hiddens1, num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),
    nn.Linear(num_hiddens2, 10)
)

for param in net.parameters():
    nn.init.normal_(param, mean=0, std=0.01)

In [8]:
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

epoch 1,loss 0.0047, train acc 0.535, test acc 0.731
epoch 2,loss 0.0023, train acc 0.781, test acc 0.763
epoch 3,loss 0.0019, train acc 0.818, test acc 0.797
epoch 4,loss 0.0017, train acc 0.838, test acc 0.831
epoch 5,loss 0.0016, train acc 0.846, test acc 0.839
