丢弃法即 dropout，这里实现的是倒置丢弃法（inverted dropout）：训练时把保留下来的元素除以保留概率，使输出的期望与输入一致，因此测试时不需要再做任何缩放。

In [7]:
#从零开始实现
%matplotlib inline
import torch
import torch.nn as nn
import numpy as np
import sys
import d2lzh_pytorch as d2l
import torchvision
import torchvision.transforms as transforms

In [8]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element is zeroed independently with probability ``drop_prob``.
    The surviving elements are divided by ``1 - drop_prob`` so that the
    expected value of the output matches the input.

    Args:
        X: Input tensor, typically the output of a layer such as a fully
            connected layer. It is cast with ``.float()`` before masking.
        drop_prob: Probability of dropping an element; must lie in [0, 1].

    Returns:
        A float tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` is outside [0, 1].
    """
    # Work in float so the mask multiplication and rescaling share one dtype.
    X = X.float()
    assert 0 <= drop_prob <= 1, 'drop_prob must be in [0, 1], got %r' % (drop_prob,)
    keep_prob = 1 - drop_prob
    if keep_prob == 0:
        # Everything is dropped; returning early also avoids dividing by zero.
        return torch.zeros_like(X)
    # rand_like draws U[0, 1) samples on the same device as X. The previous
    # torch.rand(X.shape) always allocated on the default device, which breaks
    # with a device-mismatch error when X lives on a GPU.
    mask = (torch.rand_like(X) < keep_prob).float()
    # Dropped positions become 0; kept positions are scaled by 1 / keep_prob.
    return mask * X / keep_prob

In [9]:
# Quick sanity check of dropout on a 2x8 integer tensor holding 0..15.
x =torch.arange(16).view(2,8)
# Drop probability 0.2: surviving entries are scaled by 1 / 0.8 = 1.25.
dropout(x,0.2)

tensor([[ 0.0000,  1.2500,  2.5000,  3.7500,  5.0000,  0.0000,  7.5000,  8.7500],
        [10.0000,  0.0000, 12.5000, 13.7500,  0.0000, 16.2500, 17.5000, 18.7500]])

In [10]:
# Drop probability 0.5: surviving entries are doubled (scaled by 1 / 0.5).
dropout(x,0.5)

tensor([[ 0.,  2.,  4.,  6.,  8., 10.,  0., 14.],
        [ 0.,  0.,  0., 22., 24., 26.,  0., 30.]])

In [11]:
# Drop probability 1.0: keep_prob is 0, so dropout returns an all-zero tensor.
dropout(x,1.0)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:
# Model parameters for a three-layer MLP: num_inputs -> num_hiddens1 ->
# num_hiddens2 -> num_outputs.
num_inputs, num_outputs, num_hiddens1, num_hiddens2 = 784, 10, 256, 256

# Weights are drawn from N(0, 0.01^2) with NumPy and converted to float
# tensors that track gradients; biases start at zero.
w1 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_inputs, num_hiddens1)),
    dtype=torch.float,
    requires_grad=True,
)
b1 = torch.zeros(num_hiddens1, requires_grad=True)

w2 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_hiddens1, num_hiddens2)),
    dtype=torch.float,
    requires_grad=True,
)
b2 = torch.zeros(num_hiddens2, requires_grad=True)

w3 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_hiddens2, num_outputs)),
    dtype=torch.float,
    requires_grad=True,
)
b3 = torch.zeros(num_outputs, requires_grad=True)

# Flat list of all trainable tensors, in layer order.
params = [w1, b1, w2, b2, w3, b3]

                  

In [13]:
# Define the model.
# Drop probabilities applied after the first and second hidden layers.
drop_prob1, drop_prob2 = 0.2, 0.5


def net(X, is_training=True):
    """Forward pass of the from-scratch MLP with dropout.

    Args:
        X: Input batch; it is reshaped to ``(-1, num_inputs)``.
        is_training: When true, dropout is applied to both hidden layers;
            when false, the hidden activations pass through unchanged.

    Returns:
        The output-layer scores, one row per reshaped input row.
    """
    flat = X.view(-1, num_inputs)

    hidden1 = torch.relu(torch.matmul(flat, w1) + b1)
    if is_training:
        hidden1 = dropout(hidden1, drop_prob1)

    hidden2 = torch.relu(torch.matmul(hidden1, w2) + b2)
    if is_training:
        hidden2 = dropout(hidden2, drop_prob2)

    # No activation on the output layer; raw scores are returned.
    return torch.matmul(hidden2, w3) + b3
    

In [14]:
# Accuracy evaluation adjusted so that dropout is switched off while scoring
# (per the original note, the same change is made inside the D2L package).
def evaluate_accuracy(data_iter, net):
    """Return the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches, where ``y`` holds the
            class index of each sample.
        net: Either a ``torch.nn.Module`` or a plain callable. A module is
            put into eval mode for each forward pass and then returned to the
            mode it was in before. A plain callable that declares an
            ``is_training`` name is called with ``is_training=False``.

    Returns:
        The fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    is_module = isinstance(net, torch.nn.Module)
    # Decide once, outside the loop, whether a hand-written model accepts the
    # is_training switch. Callables without __code__ are simply called as-is.
    code = getattr(net, '__code__', None)
    takes_flag = (not is_module) and code is not None and 'is_training' in code.co_varnames

    acc_sum, n = 0.0, 0
    # Scoring needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        for X, y in data_iter:
            if is_module:
                was_training = net.training
                net.eval()  # evaluation mode disables dropout
                try:
                    y_hat = net(X)
                finally:
                    # Restore whichever mode the caller had set.
                    net.train(was_training)
            elif takes_flag:
                y_hat = net(X, is_training=False)
            else:
                y_hat = net(X)
            acc_sum += (y_hat.argmax(dim=1) == y).float().sum().item()
            n += y.shape[0]
    return acc_sum / n
                

In [15]:
# Training setup: hyperparameters, loss, datasets and data loaders.
# NOTE(review): lr = 100.0 is unusually large; presumably the SGD step inside
# d2l.train_ch3 rescales the update (e.g. by batch_size) -- confirm in
# d2lzh_pytorch before reusing this value with another optimizer.
num_epochs, lr, batch_size = 5, 100.0, 256
loss = torch.nn.CrossEntropyLoss()

# Both splits are downloaded on demand and converted to tensors.
train_data = torchvision.datasets.FashionMNIST(
    root='~/DataSets/FashionMNIST',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)
test_data = torchvision.datasets.FashionMNIST(
    root='~/DataSets/FashionMNIST',
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

# Number of worker processes used by each DataLoader.
num_workers = 4
train_iter = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)
# The test loader is shuffled as well; this does not affect the accuracy,
# which is computed over the whole split.
test_iter = torch.utils.data.DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)

In [10]:
# Train the from-scratch model: params and lr are handed to d2l.train_ch3
# directly and no optimizer argument is passed.
# NOTE(review): presumably train_ch3 then applies its own SGD update to
# params -- confirm in d2lzh_pytorch.
d2l.train_ch3(net,train_iter,test_iter,loss,num_epochs,batch_size,params,lr)

Consider using tensor.detach() first. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\generated\python_variable_methods.cpp:836.)
  print('epoch %d,loss%.4f,train acc %.3f,test acc %.3f'


epoch 1,loss0.0047,train acc 0.539,test acc 0.715
epoch 2,loss0.0023,train acc 0.780,test acc 0.796
epoch 3,loss0.0019,train acc 0.823,test acc 0.826
epoch 4,loss0.0017,train acc 0.839,test acc 0.846
epoch 5,loss0.0016,train acc 0.849,test acc 0.787


In [16]:
# Concise implementation using built-in layers.
# Note: this rebinds `net`, replacing the from-scratch function defined earlier.
net = nn.Sequential(
    d2l.FlattenLayer(),  # NOTE(review): presumably flattens each sample to a vector -- confirm in d2lzh_pytorch
    nn.Linear(num_inputs,num_hiddens1),
    nn.ReLU(),
    nn.Dropout(drop_prob1),  # dropout after the first hidden layer
    nn.Linear(num_hiddens1,num_hiddens2),
    nn.ReLU(),
    nn.Dropout(drop_prob2),  # dropout after the second hidden layer
    nn.Linear(num_hiddens2,num_outputs)
)


In [17]:
# Re-initialise every parameter returned by net.parameters() in place,
# drawing from a normal distribution with mean 0 and std 0.01.
for param in net.parameters():
    nn.init.normal_(param,mean=0,std=0.01)

In [18]:
# SGD over all parameters of the Sequential model with learning rate 0.5.
optimizer = torch.optim.SGD(net.parameters(),lr=0.5)
# params and lr are passed as None here; the optimizer is supplied instead.
d2l.train_ch3(net,train_iter,test_iter,loss,num_epochs,batch_size,None,None,optimizer)

Consider using tensor.detach() first. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\torch\csrc\autograd\generated\python_variable_methods.cpp:836.)
  print('epoch %d,loss%.4f,train acc %.3f,test acc %.3f'


epoch 1,loss0.0046,train acc 0.542,test acc 0.748
epoch 2,loss0.0023,train acc 0.783,test acc 0.746
epoch 3,loss0.0019,train acc 0.819,test acc 0.782
epoch 4,loss0.0018,train acc 0.834,test acc 0.823
epoch 5,loss0.0017,train acc 0.845,test acc 0.843


dropout 只应在训练模式下生效，所以上面的 evaluate_accuracy（这个函数只用来计算测试集的 accuracy）  
在前向计算之前先调用 net.eval() 关闭 dropout，计算完成后再用 net.train() 切回训练模式。