### 优化器相关例子
https://pytorch.org/docs/stable/optim.html#stochastic-weight-averaging

1、优化器
指定一种优化算法（可以为不同的层分别指定不同的初始学习率，后续结合lr_scheduler可以实现各层学习率的动态调整）

    optim.SGD([
                    {'params': model.base.parameters()},
                    {'params': model.classifier.parameters(), 'lr': 1e-3}
                ], lr=1e-2, momentum=0.9)

然后loss的backward计算梯度，optimizer的step更新参数值。

2、torch.optim.lr_scheduler提供了几种方式，根据epoch来调整learning rate。（注意，scheduler的step应该在optimizer的step之后调用，否则会跳过学习率调度表中的第一个值）。
pytorch1.6提供了不限于以下几种学习率调整方式：

    1、StepLR 每隔step_size个epoch将学习率乘以gamma
    2、MultiStepLR 在milestones指定的各个epoch处将学习率乘以gamma
    3、ExponentialLR 每个epoch都将学习率乘以gamma
    4、CyclicLR 在每个batch之后调整学习率，因此该类的step()函数应在每个batch训练完成后调用。

In [None]:
def get_scheduler(optimizer, opt):
    """Return a learning-rate scheduler for ``optimizer``.

    Parameters:
        optimizer          -- the optimizer used by the network
        opt (option class) -- options object; ``opt.lr_policy`` is the name of
                              the policy: linear | step | plateau | cosine

    Policies:
        linear  -- ``LambdaLR`` whose multiplier stays at 1.0 while
                   ``epoch + opt.epoch_count <= opt.niter`` and then drops by
                   ``1 / (opt.niter_decay + 1)`` for every further epoch.
        step    -- ``StepLR`` with ``step_size=opt.lr_decay_iters``, ``gamma=0.1``.
        plateau -- ``ReduceLROnPlateau`` (mode='min', factor=0.2,
                   threshold=0.01, patience=5).
        cosine  -- ``CosineAnnealingLR`` with ``T_max=opt.niter``, ``eta_min=0``.

    Returns:
        The scheduler instance built for the requested policy.

    Raises:
        NotImplementedError: if ``opt.lr_policy`` is not one of the names above.
    """
    if opt.lr_policy == 'linear':
        def lambda_rule(epoch):
            # opt.epoch_count is the value the epoch counter starts from.
            lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
    elif opt.lr_policy == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
    elif opt.lr_policy == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.niter, eta_min=0)
    else:
        # Raise (not return) so an unknown policy fails here instead of handing
        # the caller an exception object in place of a scheduler; the policy
        # name is interpolated into the message with %.
        raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler

In [2]:
import torchvision
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision.models import AlexNet
import matplotlib.pyplot as plt
# Epoch-based learning-rate scheduling demo. Schedulers can be combined with
# per-parameter-group learning rates.
model = AlexNet(num_classes=2)
optimizer = optim.SGD(params=model.parameters(), lr=0.05)

# lr_scheduler.StepLR()
# Assuming optimizer uses lr = 0.05 for all groups
# lr = 0.05     if epoch < 30
# lr = 0.005    if 30 <= epoch < 60
# lr = 0.0005   if 60 <= epoch < 90

scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
plt.figure()
x = list(range(100))
y = []
for epoch in range(100):
    # optimizer.step() must come before scheduler.step(); the scheduler then
    # updates the learning rate stored in the optimizer for the next epoch.
    optimizer.step()
    scheduler.step()
    # get_last_lr() is the public accessor for the most recently computed
    # learning rates (one per param group). It is read after scheduler.step(),
    # so the value recorded for `epoch` is the one that the next epoch will use.
    lr = scheduler.get_last_lr()[0]
    print(epoch, lr)
    y.append(lr)

plt.plot(x, y)
# Train different parameters of the network with different learning rates.
model = torchvision.models.resnet18()
paras = dict(model.named_parameters())

# List every named parameter with its shape and whether it requires gradients.
for k, v in paras.items():
    print(k.ljust(30), str(v.shape).ljust(30), 'requires_grad:', v.requires_grad)

# One param group per tensor: parameters whose name contains 'bias' get
# lr=0.02 and no weight decay; all others get lr=0.01 with weight decay 4e-5.
paras_new = []
for k, v in paras.items():
    if 'bias' in k:
        paras_new.append({'params': [v], 'lr': 0.02, 'weight_decay': 0})
    else:
        paras_new.append({'params': [v], 'lr': 0.01, 'weight_decay': 0.00004})
optimizer = torch.optim.SGD(paras_new, momentum=0.9)

# Print each param group's settings on one line.
for p in optimizer.param_groups:
    outputs = ''
    for k, v in p.items():
        # Compare strings by value; `is` tests object identity and only
        # happened to work for interned literals.
        if k == 'params':
            # Each group holds exactly one tensor here, so show its shape.
            outputs += (k + ': ' + str(v[0].shape).ljust(30) + ' ')
        else:
            outputs += (k + ': ' + str(v).ljust(10) + ' ')
    print(outputs)

0 0.05
1 0.05
2 0.05
3 0.05
4 0.05
5 0.05
6 0.05
7 0.05
8 0.05
9 0.05
10 0.05
11 0.05
12 0.05
13 0.05
14 0.05
15 0.05
16 0.05
17 0.05
18 0.05
19 0.05
20 0.05
21 0.05
22 0.05
23 0.05
24 0.05
25 0.05
26 0.05
27 0.05
28 0.05
29 0.005000000000000001
30 0.005000000000000001
31 0.005000000000000001
32 0.005000000000000001
33 0.005000000000000001
34 0.005000000000000001
35 0.005000000000000001
36 0.005000000000000001
37 0.005000000000000001
38 0.005000000000000001
39 0.005000000000000001
40 0.005000000000000001
41 0.005000000000000001
42 0.005000000000000001
43 0.005000000000000001
44 0.005000000000000001
45 0.005000000000000001
46 0.005000000000000001
47 0.005000000000000001
48 0.005000000000000001
49 0.005000000000000001
50 0.005000000000000001
51 0.005000000000000001
52 0.005000000000000001
53 0.005000000000000001
54 0.005000000000000001
55 0.005000000000000001
56 0.005000000000000001
57 0.005000000000000001
58 0.005000000000000001
59 0.0005000000000000001
60 0.0005000000000000001
61 0.000