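权重衰退(weight decay)通过限制参数值的选择范围来控制模型容量。
硬性限制：最小化损失函数 $\ell(\mathbf{w}, b)$ 时，要求 $\mathbf{w}$ 的 L2 范数的平方
($\mathbf{w}$ 每一项的平方和)满足 $\|\mathbf{w}\|^2 \le \theta$；偏置 $b$ 不做限制，$\theta$ 越小正则项越强。
一般使用均方范数作为柔性限制：最小化 $\ell(\mathbf{w}, b) + \frac{\lambda}{2}\|\mathbf{w}\|^2$，$\lambda$ 越大正则项越强。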

权重衰退会使 w 的各项在每次更新时向 0 收缩(L2 惩罚让权重变小，但通常不会恰好等于 0；产生恰好为 0 的稀疏解的是 L1 惩罚)。这跟缩小学习率不一样(缩小学习率只是放慢更新速度)。
https://www.cnblogs.com/zzk0/p/15056312.html

In [None]:
import torch
from torch import nn
from d2l import torch as d2l

# Make overfitting easy to provoke: a very small training set combined with
# a high-dimensional input.
n_train = 20
n_test = 100
num_inputs = 200
batch_size = 5

# Ground-truth parameters: every weight is 0.01 and the bias is 0.05.
true_w = 0.01 * torch.ones((num_inputs, 1))
true_b = 0.05

# Synthetic train/test sets drawn from the same ground truth, wrapped in
# minibatch iterators (the test iterator is built with is_train=False).
train_data = d2l.synthetic_data(true_w, true_b, n_train)
train_iter = d2l.load_array(train_data, batch_size)
test_data = d2l.synthetic_data(true_w, true_b, n_test)
test_iter = d2l.load_array(test_data, batch_size, is_train=False)


def init_params():
    """Create fresh trainable parameters for the linear model.

    Returns:
        A list ``[w, b]``: ``w`` has shape ``(num_inputs, 1)`` and is sampled
        from a standard normal distribution; ``b`` is a single zero. Both
        tensors track gradients.
    """
    weight = torch.normal(mean=0.0, std=1.0, size=(num_inputs, 1), requires_grad=True)
    bias = torch.zeros(1, requires_grad=True)
    return [weight, bias]


def l2_penalty(w):
    """Return the L2 penalty of ``w``: half the sum of its squared entries."""
    squared = w ** 2
    return squared.sum() / 2


def train(lambd):
    """Train the from-scratch linear model with an explicit L2 penalty.

    Args:
        lambd: Weight of the L2 penalty added to the loss; 0 disables it.

    Plots train/test loss every 5 epochs and prints the L2 norm of the
    learned weight vector when training finishes.
    """
    w, b = init_params()

    def net(X):
        # Linear model closed over the parameters being trained.
        return d2l.linreg(X, w, b)

    loss = d2l.squared_loss
    num_epochs = 100
    lr = 0.003
    animator = d2l.Animator(
        xlabel="epochs",
        ylabel="loss",
        yscale="log",
        xlim=[5, num_epochs],
        legend=["train", "test"],
    )
    for epoch in range(1, num_epochs + 1):
        for X, y in train_iter:
            # The scalar penalty is added to the loss tensor; per the original
            # note, broadcasting spreads it across the batch entries.
            penalized = loss(net(X), y) + lambd * l2_penalty(w)
            penalized.sum().backward()
            d2l.sgd([w, b], lr, batch_size)
        if epoch % 5 != 0:
            continue
        # Record both curves every fifth epoch.
        train_loss = d2l.evaluate_loss(net, train_iter, loss)
        test_loss = d2l.evaluate_loss(net, test_iter, loss)
        animator.add(epoch, (train_loss, test_loss))
    print("w的L2范数是：", torch.norm(w).item())



In [None]:
# Baseline: train with the regularization term switched off (lambd=0).

train(lambd=0)


In [None]:
# Train with weight decay (lambd=3); the L2 penalty is meant to counter the
# severe overfitting that the tiny, high-dimensional training set provokes.
train(lambd=3)

简洁实现

In [None]:
def train_concise(wd):
    """Train the linear model using the optimizer's built-in weight decay.

    Args:
        wd: Weight-decay coefficient applied to the weight matrix only; the
            bias is placed in a separate parameter group with no decay.

    Plots train/test loss every 5 epochs and prints the L2 norm of the
    learned weight vector when training finishes.
    """
    net = nn.Sequential(nn.Linear(num_inputs, 1))
    for param in net.parameters():
        param.data.normal_()
    loss = nn.MSELoss(reduction="none")
    num_epochs, lr = 100, 0.003
    # The L2 penalty can live either in the objective (as in train()) or in
    # the update rule. Here it is in the update rule: the optimizer adds
    # wd * w to the weight's gradient inside step(), which amounts to
    # shrinking w by a factor (1 - lr * wd) before the usual gradient step.
    # Autograd is not skipped; the penalty's gradient is simply known in
    # closed form, so it never has to be part of the loss graph.
    trainer = torch.optim.SGD(
        [{"params": net[0].weight, "weight_decay": wd}, {"params": net[0].bias}], lr=lr
    )
    animator = d2l.Animator(
        xlabel="epochs",
        ylabel="loss",
        yscale="log",
        xlim=[5, num_epochs],
        legend=["train", "test"],
    )
    for epoch in range(num_epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            batch_loss = loss(net(X), y)
            # Average over the minibatch so the data gradient is scaled like
            # in train(), where d2l.sgd is handed batch_size to normalize by.
            # Summing instead would inflate the gradient by the batch size
            # and make lr and wd incomparable between the two versions.
            batch_loss.mean().backward()
            trainer.step()
        if (epoch + 1) % 5 == 0:
            animator.add(
                epoch + 1,
                (
                    d2l.evaluate_loss(net, train_iter, loss),
                    d2l.evaluate_loss(net, test_iter, loss),
                ),
            )
    print("w的L2范数：", net[0].weight.norm().item())


# Run the concise implementation with a weight-decay coefficient of 3.
train_concise(3)
