<a href="https://colab.research.google.com/github/mengcius/pytorch-learn/blob/master/11_%E8%BF%87%E6%8B%9F%E5%90%88%E4%B8%8E%E5%AD%A6%E4%B9%A0%E7%8E%87%E8%A1%B0%E5%87%8F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 11_过拟合与学习率衰减

### 划分train-val-test集来训练
 可将原来的train集划分为train集和val集，test集不变。
 
 train集用来训练，val集用来挑选模型参数，test仅仅用来评价。

In [0]:
import  torch
import  torch.nn as nn
import  torch.nn.functional as F
import  torch.optim as optim
from    torchvision import datasets, transforms

batch_size=200
learning_rate=0.01
epochs=10

# 加载数据，划分train、val、test集
train_db = datasets.MNIST('mnist_data', train=True, download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))]))     
# train_loader = torch.utils.data.DataLoader(train_db, batch_size=batch_size, shuffle=True)

test_db = datasets.MNIST('mnist_data', train=False, download=True, #train=False，不训练
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,))]))
test_loader = torch.utils.data.DataLoader(test_db, batch_size=batch_size, shuffle=True)

print('train:', len(train_db), 'test:', len(test_db))
train_db, val_db = torch.utils.data.random_split(train_db, [50000, 10000]) #将原train集随机划分为train和val集
print('train_db:', len(train_db), 'val_db:', len(val_db))

train_loader = torch.utils.data.DataLoader(train_db, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_db, batch_size=batch_size, shuffle=True)


# 模型构建
class MLP(nn.Module):

    def __init__(self):
        super(MLP, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(784, 200),
            nn.LeakyReLU(inplace=True),
            nn.Linear(200, 200),
            nn.LeakyReLU(inplace=True),
            nn.Linear(200, 10),
            nn.LeakyReLU(inplace=True),
        )

    def forward(self, x):
        x = self.model(x)

        return x


# train集训练
device = torch.device('cuda:0')
net = MLP().to(device)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
# optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=0.01) #weight_decay就是开L2正则化，设置lambda参数
criteon = nn.CrossEntropyLoss().to(device)

for epoch in range(epochs):

    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28*28)
        data, target = data.to(device), target.cuda()

        logits = net(data)
        loss = criteon(logits, target)

        optimizer.zero_grad()
        loss.backward()
        # print(w1.grad.norm(), w2.grad.norm())
        optimizer.step()

        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.item()))


    # val集验证，以挑选出最优的模型
    test_loss = 0
    correct = 0
    for data, target in val_loader:
        data = data.view(-1, 28 * 28)
        data, target = data.to(device), target.cuda()
        logits = net(data)
        test_loss += criteon(logits, target).item()

        pred = logits.data.max(1)[1]
        correct += pred.eq(target.data).sum()

    test_loss /= len(val_loader.dataset)
    print('\nVAL set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(val_loader.dataset),
        100. * correct / len(val_loader.dataset)))


# test集测试评价
#...#这里可加载从val中找到的最佳的模型，这里默认用最后一个
test_loss = 0
correct = 0
for data, target in test_loader:
    data = data.view(-1, 28 * 28)
    data, target = data.to(device), target.cuda()
    logits = net(data)
    test_loss += criteon(logits, target).item()

    pred = logits.data.max(1)[1]
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(test_loader.dataset),
    100. * correct / len(test_loader.dataset)))

train: 60000 test: 10000
train_db: 50000 val_db: 10000

VAL set: Average loss: 0.0079, Accuracy: 6807/10000 (68%)


VAL set: Average loss: 0.0049, Accuracy: 7105/10000 (71%)


VAL set: Average loss: 0.0033, Accuracy: 7978/10000 (79%)


VAL set: Average loss: 0.0019, Accuracy: 8927/10000 (89%)


VAL set: Average loss: 0.0017, Accuracy: 9044/10000 (90%)


VAL set: Average loss: 0.0015, Accuracy: 9124/10000 (91%)


VAL set: Average loss: 0.0014, Accuracy: 9169/10000 (91%)


VAL set: Average loss: 0.0014, Accuracy: 9216/10000 (92%)


VAL set: Average loss: 0.0013, Accuracy: 9243/10000 (92%)


VAL set: Average loss: 0.0012, Accuracy: 9291/10000 (92%)


Test set: Average loss: 0.0011, Accuracy: 9347/10000 (93%)



### 正则化
如果没有过拟合的话，设置正则化会使性能急剧下降，因为表达能力还不够。如果过拟合后使用了它，则train不变test性能会上升一些。

不加正则化：Test set: Average loss: 0.0011, Accuracy: 9347/10000 (93%)

加正则化：Test set: Average loss: 0.0013, Accuracy: 9268/10000 (92%)



In [0]:
optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=0.01) #weight_decay就是开L2正则化，设置lambda参数

### Dropout

In [0]:
net_dropped = torch.nn.Sequential(
    nn.Linear(784, 200),
    nn.Dropout(0.5), #Dropout层：在两层之间丢弃断掉50%的连接，不断掉之前是直连。这不是在层里面断
    nn.LeakyReLU(inplace=True),
    nn.Linear(200, 200),
    nn.Dropout(1), #断掉100%。但在TF相反是保留100%
    nn.LeakyReLU(inplace=True),
    nn.Linear(200, 10),
    # nn.LeakyReLU(inplace=True),
)

In [0]:
for epoch in range(epochs):
    # train
    net_dropped.train()
    for batch_idx,(data,target) in enumerate(train_loader):
        ...
        net_dropped.eval() #在test/val时所有连接都要连上，不能Dropout，切换到eval里去
        test_loss=0
        correct=0
        for data,target in test_loader
            ...

### 动量与学习率衰减


动量

学习率衰减监听法

In [0]:
optimizer=torch.optim.SGD(model.parameters(),args.lr,
             momentum=arg.momentum, #加入动量，传入动量的belta参数
             weight_decay=args.weight_decay) #L2正则化
scheduler=ReduceLROnPlateau(optim,'min') #监听法：min是要监听连续几次loss如果没有下降就减少学习率

for epoch in xrange(args.start_epoch,arg.epochs):
    train(train_loader,model,criterion,optimizer,epoch)
    result_avg,loss_val=validate(val_loader,model,criterion,epoch)
    scheduler.step(loss_val) #每调用一次就监听一次loss

学习率衰减步进法

In [0]:
scheduler=StepLR(optimizer,step_size=30,gamma=0.1) #步进法：每30个epoch学习率降低0.1倍
for epoch in range(100):
    scheduler.step() #调度学习率衰减方法
    train(...)
    validate(...)