In [6]:
import torch
from torch import nn
from d2l import torch as d2l

# Mini-batch size used when building the data iterators below.
batch_size = 256
# d2l helper returning the Fashion-MNIST training and test iterators.
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)

# Layer sizes: 784 input features, 10 output classes, 256 hidden units.
num_inputs, num_outputs, num_hiddens = 784, 10, 256

# Initialise the model parameters.
# Both weight matrices are drawn from a standard normal and scaled by 0.01.
# Leaving the first layer at unit variance makes the hidden pre-activations
# (a sum over 784 inputs) very large, which destabilises the softmax loss and
# the SGD updates; small initial weights keep the logits near zero.
# nn.Parameter already enables gradient tracking, so the inner tensors do not
# need requires_grad=True.
W1 = nn.Parameter(torch.randn(num_inputs, num_hiddens) * 0.01)  # shape: (784, 256)
b1 = nn.Parameter(torch.zeros(num_hiddens))  # shape: (256,)

W2 = nn.Parameter(torch.randn(num_hiddens, num_outputs) * 0.01)  # shape: (256, 10)
b2 = nn.Parameter(torch.zeros(num_outputs))  # shape: (10,)

# All trainable tensors, in the order the optimiser walks them.
params = [W1, b1, W2, b2]


def relu(X):
    """Activation function: element-wise maximum of X and 0."""
    # Compare against an all-zero tensor with the same shape/dtype as X.
    return torch.maximum(X, torch.zeros_like(X))


def net(X):
    """Model definition: one ReLU hidden layer followed by a linear output layer.

    X is reshaped to rows of ``num_inputs`` features; the result holds one
    row of ``num_outputs`` raw scores (logits) per input row. Uses the
    module-level parameters W1, b1, W2 and b2.
    """
    flat = X.reshape(-1, num_inputs)
    # Affine map wrapped in the ReLU activation.
    hidden = relu(torch.matmul(flat, W1) + b1)
    # The hidden activations get their own weights and bias.
    return torch.matmul(hidden, W2) + b2


def CrossEntropyLoss(X, y):
    """Softmax followed by cross-entropy, returned per sample.

    Args:
        X: raw scores (logits), indexed as (sample, class); the softmax is
            taken over dim 1.
        y: integer class labels, one per row of X.

    Returns:
        1-D tensor whose i-th entry is ``-log(softmax(X)[i, y[i]])``. The
        larger the predicted probability of the correct class, the smaller
        the loss; a probability of 1 gives a loss of 0.
    """
    # Compute log-softmax as X - logsumexp(X) instead of log(exp(X) / sum).
    # Exponentiating the raw logits overflows to inf for large values
    # (inf / inf -> NaN) and underflows to 0 for very negative ones
    # (log(0) -> inf); logsumexp subtracts the row maximum internally.
    log_probs = X - torch.logsumexp(X, dim=1, keepdim=True)
    # Pair every row index with its label to pick the log-probability of the
    # correct class for each sample.
    rows = torch.arange(X.shape[0], device=X.device)
    return -log_probs[rows, y]


def sgd(params, lr, batch_size):
    """Mini-batch stochastic gradient descent step.

    Each tensor in ``params`` is updated in place by
    ``lr * grad / batch_size`` and its gradient is then reset to zero.
    The division by ``batch_size`` averages the gradient: the training loop
    in this file backpropagates the *summed* per-sample loss.
    """
    # The update itself must not be tracked by autograd; it only consumes
    # gradients produced by an earlier backward() call.
    with torch.no_grad():
        for p in params:
            p.sub_(lr * p.grad / batch_size)
            p.grad.zero_()


def train_epoch(net, train_iter, loss, updater, lr):
    """Train the model for one epoch.

    For every mini-batch: run the forward pass, backpropagate the summed
    loss, then call ``updater(params, lr, n)`` where ``params`` is the
    module-level parameter list and ``n`` is the number of labels in the
    batch.

    Returns:
        Tuple ``(accumulated loss / sample count,
        accumulated d2l.accuracy / sample count)``.
    """
    # Running totals: summed loss, accumulated d2l.accuracy values, samples.
    totals = d2l.Accumulator(3)
    for features, labels in train_iter:
        n_samples = labels.numel()
        predictions = net(features)
        per_sample_loss = loss(predictions, labels)
        # Gradients of the summed loss; the updater rescales by n_samples.
        per_sample_loss.sum().backward()
        updater(params, lr, n_samples)
        totals.add(
            float(per_sample_loss.sum()),
            d2l.accuracy(predictions, labels),
            n_samples,
        )
    # Training loss and training accuracy, normalised by the sample count.
    loss_sum, acc_sum, seen = totals[0], totals[1], totals[2]
    return loss_sum / seen, acc_sum / seen


def train(net, train_iter, test_iter, loss, num_epochs, updater, lr):
    """Train the model for ``num_epochs`` epochs and plot the progress.

    After each epoch the training loss, training accuracy and the test
    accuracy from ``d2l.evaluate_accuracy`` are added to a d2l animator.
    """
    # Live plot of the three curves.
    plot = d2l.Animator(
        xlabel="epoch",
        xlim=[1, num_epochs],
        ylim=[0.3, 0.9],
        legend=["train loss", "train acc", "test acc"],
    )
    for epoch_no in range(1, num_epochs + 1):
        train_loss, train_acc = train_epoch(net, train_iter, loss, updater, lr)
        test_acc = d2l.evaluate_accuracy(net, test_iter)
        plot.add(epoch_no, (train_loss, train_acc, test_acc))


# Training run.
num_epochs = 1  # number of epochs
lr = 0.1  # learning rate

train(
    net,
    train_iter,
    test_iter,
    loss=CrossEntropyLoss,
    num_epochs=num_epochs,
    updater=sgd,
    lr=lr,
)


In [None]:


def relu(X):
    """Activation function: element-wise maximum of X and 0."""
    # Compare against an all-zero tensor with the same shape/dtype as X.
    return torch.maximum(X, torch.zeros_like(X))


def net(X):
    """Model definition: one ReLU hidden layer followed by a linear output layer.

    X is reshaped to rows of ``num_inputs`` features; the result holds one
    row of ``num_outputs`` raw scores (logits) per input row. Uses the
    module-level parameters W1, b1, W2 and b2.
    """
    flat = X.reshape(-1, num_inputs)
    # Affine map wrapped in the ReLU activation.
    hidden = relu(torch.matmul(flat, W1) + b1)
    # The hidden activations get their own weights and bias.
    return torch.matmul(hidden, W2) + b2


def CrossEntropyLoss(X, y):
    """Softmax followed by cross-entropy, returned per sample.

    Args:
        X: raw scores (logits), indexed as (sample, class); the softmax is
            taken over dim 1.
        y: integer class labels, one per row of X.

    Returns:
        1-D tensor whose i-th entry is ``-log(softmax(X)[i, y[i]])``. The
        larger the predicted probability of the correct class, the smaller
        the loss; a probability of 1 gives a loss of 0.
    """
    # Compute log-softmax as X - logsumexp(X) instead of log(exp(X) / sum).
    # Exponentiating the raw logits overflows to inf for large values
    # (inf / inf -> NaN) and underflows to 0 for very negative ones
    # (log(0) -> inf); logsumexp subtracts the row maximum internally.
    log_probs = X - torch.logsumexp(X, dim=1, keepdim=True)
    # Pair every row index with its label to pick the log-probability of the
    # correct class for each sample.
    rows = torch.arange(X.shape[0], device=X.device)
    return -log_probs[rows, y]


def sgd(params, lr, batch_size):
    """Mini-batch stochastic gradient descent step.

    Each tensor in ``params`` is updated in place by
    ``lr * grad / batch_size`` and its gradient is then reset to zero.
    The division by ``batch_size`` averages the gradient: the training loop
    in this file backpropagates the *summed* per-sample loss.
    """
    # The update itself must not be tracked by autograd; it only consumes
    # gradients produced by an earlier backward() call.
    with torch.no_grad():
        for p in params:
            p.sub_(lr * p.grad / batch_size)
            p.grad.zero_()


def train_epoch(net, train_iter, loss, updater, lr):
    """Train the model for one epoch.

    For every mini-batch: run the forward pass, backpropagate the summed
    loss, then call ``updater(params, lr, n)`` where ``params`` is the
    module-level parameter list and ``n`` is the number of labels in the
    batch.

    Returns:
        Tuple ``(accumulated loss / sample count,
        accumulated d2l.accuracy / sample count)``.
    """
    # Running totals: summed loss, accumulated d2l.accuracy values, samples.
    totals = d2l.Accumulator(3)
    for features, labels in train_iter:
        n_samples = labels.numel()
        predictions = net(features)
        per_sample_loss = loss(predictions, labels)
        # Gradients of the summed loss; the updater rescales by n_samples.
        per_sample_loss.sum().backward()
        updater(params, lr, n_samples)
        totals.add(
            float(per_sample_loss.sum()),
            d2l.accuracy(predictions, labels),
            n_samples,
        )
    # Training loss and training accuracy, normalised by the sample count.
    loss_sum, acc_sum, seen = totals[0], totals[1], totals[2]
    return loss_sum / seen, acc_sum / seen


def train(net, train_iter, test_iter, loss, num_epochs, updater, lr):
    """Train the model for ``num_epochs`` epochs and plot the progress.

    After each epoch the training loss, training accuracy and the test
    accuracy from ``d2l.evaluate_accuracy`` are added to a d2l animator.
    """
    # Live plot of the three curves.
    plot = d2l.Animator(
        xlabel="epoch",
        xlim=[1, num_epochs],
        ylim=[0.3, 0.9],
        legend=["train loss", "train acc", "test acc"],
    )
    for epoch_no in range(1, num_epochs + 1):
        train_loss, train_acc = train_epoch(net, train_iter, loss, updater, lr)
        test_acc = d2l.evaluate_accuracy(net, test_iter)
        plot.add(epoch_no, (train_loss, train_acc, test_acc))


# Training run.
num_epochs = 1  # number of epochs
lr = 0.1  # learning rate

train(
    net,
    train_iter,
    test_iter,
    loss=CrossEntropyLoss,
    num_epochs=num_epochs,
    updater=sgd,
    lr=lr,
)
