In [16]:
# https://github.com/rasbt/dora-from-scratch
# https://magazine.sebastianraschka.com/p/lora-and-dora-from-scratch

from utils import EarlyStopping, train_loop_with_resume, train_epoch, validate_epoch
from models import LinearWithLoRA, LoRALayer
import time, os
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split
import torch.nn.functional as F
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import datasets, transforms

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.backends.cudnn.deterministic = True

In [2]:
###########################
#### Settings
###########################
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128
TRAIN_RATIO = 0.8

###########################
#### mnist dataset
###########################
labelled_set = datasets.MNIST(root='../../data', train=True, transform=transforms.ToTensor(), download=False)
test_set = datasets.MNIST(root='../../data', train=False, transform=transforms.ToTensor(), download=False)

# 划分训练集和验证集
train_size = int(TRAIN_RATIO * len(labelled_set))
valid_size = len(labelled_set) - train_size
train_set, valid_set = random_split(labelled_set, [train_size, valid_size])
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

# 检查数据集张量维度
check_batch = next(iter(train_loader))
print(f"批次数据维度: {check_batch[0].shape}, 标签维度: {check_batch[1].shape}")

批次数据维度: torch.Size([128, 1, 28, 28]), 标签维度: torch.Size([128])


In [3]:
#############################
#### Hyperparameters
#############################
SEED = 123
torch.manual_seed(SEED)
LEARNING_RATE = 5e-3
NUM_EPOCHS = 50

#############################
#### Architecture
#############################
n_features = 28 * 28  # MNIST images are 28x28 pixels
n_classes = 10
n_hidden1 = 256
n_hidden2 = 64


#############################
#### Perceptron Model
#############################
class MultiLayerPerceptron(nn.Module):
    def __init__(self, in_features, hidden1, hidden2, out_features, rank=4, alpha=1.0):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, out_features)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # 展平输入
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x


model_pretrained = MultiLayerPerceptron(n_features, n_hidden1, n_hidden2, n_classes).to(DEVICE)
optimizer_pretrained = optim.Adam(model_pretrained.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
scheduler = CosineAnnealingLR(optimizer_pretrained, T_max=NUM_EPOCHS, eta_min=1e-6)
early_stopping = EarlyStopping(patience=5, delta=1e-4, mode="max")
best_ckpt_file_path = "best_pretrained_20250707.pth"
# 训练
accuracy, discord = train_loop_with_resume(
    model_pretrained,
    train_loader,
    valid_loader,
    criterion,
    optimizer_pretrained,
    scheduler,
    early_stopping,
    best_ckpt_file_path,
    NUM_EPOCHS,
    DEVICE,
)
print(f"训练过程损失: {discord['train_loss']}")
print(f"训练过程准确率: {discord['train_acc']}")
print(f"验证过程损失: {discord['valid_loss']}")
print(f"验证过程准确率: {discord['valid_acc']}")

Checkpoint loaded from best_pretrained_20250707.pth, resume from epoch 6
加载训练点模型成功，当前准确率为0.9754，从第6个epoch开始训练...

Epoch 7/50 - --------------------------------------------------


                                                                                                    

当前验证准确率: 0.9867
当前学习率: 4.76e-03
更新最佳验证准确率: 0.9867
Checkpoint saved to best_pretrained_20250707.pth

Epoch 8/50 - --------------------------------------------------


                                                                                                    

当前验证准确率: 0.9843
当前学习率: 4.69e-03

Epoch 9/50 - --------------------------------------------------


                                                                                                    

当前验证准确率: 0.9804
当前学习率: 4.61e-03

Epoch 10/50 - --------------------------------------------------


                                                                                                    

当前验证准确率: 0.9799
当前学习率: 4.52e-03

Epoch 11/50 - --------------------------------------------------


                                                                                                    

当前验证准确率: 0.9799
当前学习率: 4.43e-03

Epoch 12/50 - --------------------------------------------------


                                                                                                    

当前验证准确率: 0.9801
早停触发!
训练过程损失: [0.04945656222539643, 0.038173826846294105, 0.028851708532776684, 0.02816136488694853, 0.02662047257569308, 0.027006679687959452]
训练过程准确率: [0.9852083333333334, 0.9877291666666667, 0.9906666666666667, 0.9913125, 0.9912083333333334, 0.9914791666666667]
验证过程损失: [0.044369788820544875, 0.05006299970547358, 0.07011902468154828, 0.07342269016553958, 0.07831427867741635, 0.08914228246423106]
验证过程准确率: [0.9866666666666667, 0.9843333333333333, 0.9804166666666667, 0.9799166666666667, 0.9799166666666667, 0.9800833333333333]




In [10]:
import copy

print("改造前的模型结构:")
print(model_pretrained)
lora_rank = 4
lora_alpha = 1.5
model_lora = copy.deepcopy(model_pretrained)


def add_lora_to_linear(net: nn.Module, rank=4, alpha=1.0) -> None:
    """递归替换模型中的线性层为带LoRA的线性层"""
    for name, module in net.named_children():
        if isinstance(module, nn.Linear):
            print(f"替换层: {name} -> 带LoRA的线性层")
            lora_layer = LoRALayer(module.in_features, module.out_features, rank, alpha)
            setattr(net, name, LinearWithLoRA(module, lora_layer))
        elif len(list(module.children())) > 0:
            # 如果模块还有子模块，递归替换
            add_lora_to_linear(module, rank, alpha)


add_lora_to_linear(model_lora, rank=lora_rank, alpha=lora_alpha)
print("改造后的模型结构:")
print(model_lora)

# --- 步骤 3：验证LoRA模型的可训练参数 ---
print("\n验证LoRA模型的可训练参数:")
for name, param in model_lora.named_parameters():
    if param.requires_grad:
        print(f"可训练 (Trainable): {name}: 形状: {param.shape}")
    else:
        print(f"已冻结 (Frozen): {name}: 形状: {param.shape}")

改造前的模型结构:
MultiLayerPerceptron(
  (fc1): Linear(in_features=784, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=10, bias=True)
)
替换层: fc1 -> 带LoRA的线性层
替换层: fc2 -> 带LoRA的线性层
替换层: fc3 -> 带LoRA的线性层
改造后的模型结构:
MultiLayerPerceptron(
  (fc1): LinearWithLoRA(
    (linear): Linear(in_features=784, out_features=256, bias=True)
    (lora): LoRALayer()
  )
  (fc2): LinearWithLoRA(
    (linear): Linear(in_features=256, out_features=64, bias=True)
    (lora): LoRALayer()
  )
  (fc3): LinearWithLoRA(
    (linear): Linear(in_features=64, out_features=10, bias=True)
    (lora): LoRALayer()
  )
)

验证LoRA模型的可训练参数:
已冻结 (Frozen): fc1.linear.weight: 形状: torch.Size([256, 784])
已冻结 (Frozen): fc1.linear.bias: 形状: torch.Size([256])
可训练 (Trainable): fc1.lora.A: 形状: torch.Size([784, 4])
可训练 (Trainable): fc1.lora.B: 形状: torch.Size([4, 256])
已冻结 (Frozen): fc2.linear.weight: 形状: torch.Size([64, 256])
已冻结 (Frozen): fc2.linear.bias

In [21]:
# --- 步骤 3：获取并保存LoRA模型参数 ---
def get_lora_state_dict_explicit(net):
    """显式地遍历模块"""
    lora_state_dict = {}
    for name, module in net.named_modules():
        # 我们要找的是 LoRALayer, 而不是 LinearWithLoRA
        if isinstance(module, LoRALayer):
            # 'name' 已经是正确的层级名称，例如 'fc1.lora'
            lora_state_dict[name + ".A"] = module.A
            lora_state_dict[name + ".B"] = module.B
    return lora_state_dict


def get_lora_state_dict(net):
    return {k: v for k, v in net.named_parameters() if 'lora' in k and v.requires_grad}


def save_lora_checkpoint(lora_model, optimizer, scheduler, epoch, best_acc, ckpt_path):
    """保存LoRA模型的checkpoint，注意，不保存完整的 model.state_dict()"""
    lora_state_dict = get_lora_state_dict(lora_model)
    checkpoint = {
        'epoch': epoch,
        'best_acc': best_acc,
        'lora_state_dict': lora_state_dict,
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }
    torch.save(checkpoint, ckpt_path)
    print(f"LoRA模型参数已保存到 {ckpt_path}")


def load_lora_checkpoint(ckpt_path, lora_model, optimizer, scheduler=None, device="cpu"):
    """
    加载LoRA检查点以恢复训练或进行推理

    Args:
        ckpt_path (str): 检查点文件路径。
        lora_model (nn.Module): LoRA模型实例 (预训练模型 + LoRA结构)。
        optimizer (torch.optim.Optimizer, optional): 优化器实例。
        scheduler (torch.optim.lr_scheduler._LRScheduler, optional): 调度器实例。
        device (str, optional): 设备类型，默认为 "cpu"。

    Returns:
        tuple: (start_epoch, best_acc) 重开的轮次和最佳准确率。
    """
    checkpoint = torch.load(ckpt_path, map_location=device)
    lora_model.load_state_dict(checkpoint['lora_state_dict'], strict=False)  # strict=False 是必须的，因为只加载模型参数的一个子集
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    start_epoch = checkpoint['epoch']
    best_acc = checkpoint['best_acc']
    print(f"LoRA模型参数已从 {ckpt_path} 加载")
    return start_epoch, best_acc


dict1 = get_lora_state_dict_explicit(model_lora)

dict2 = get_lora_state_dict_by_grad(model_lora)

print(dict2 == dict1)  # 应该为 True

True


In [22]:
# --- 步骤 4：训练LoRA模型 ---
def merge_all_lora_weights(net):
    """递归合并所有LoRA层的权重"""
    for module in net.modules():
        if isinstance(module, LinearWithLoRA):
            module.merge_weights()


def unmerge_all_lora_weights(net):
    """递归取消合并所有LoRA层的权重"""
    for module in net.modules():
        if isinstance(module, LinearWithLoRA):
            module.unmerge_weights()


model_lora.to(DEVICE)
lora_params_path = "best_lora_parameters.pth"
optimizer_lora = optim.Adam(filter(lambda p: p.requires_grad, model_lora.parameters()), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
scheduler = CosineAnnealingLR(optimizer_pretrained, T_max=NUM_EPOCHS, eta_min=1e-6)
early_stopping = EarlyStopping(patience=7, delta=1e-4, mode="max")
if os.path.exists(lora_params_path):
    start_epoch, best_acc = load_lora_checkpoint(lora_params_path, model_lora, optimizer_lora, scheduler, DEVICE)
    print(f"加载训练点LoRA参数成功，当前准确率为{best_acc:.4f}，从第{start_epoch}个epoch开始训练...")
else:
    start_epoch = 0
    best_acc = 0.0
    print("未找到LoRA参数文件，开始从头训练LoRA层...")

unmerge_all_lora_weights(model_lora)  # 确保在训练前权重是 *未合并* 的状态
model_lora.train()  # 设置模型为训练模式
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    print("-" * 50)
    train_loss, train_acc = train_epoch(model_lora, train_loader, optimizer_lora, criterion, DEVICE)
    valid_loss, valid_acc = validate_epoch(model_lora, valid_loader, criterion, DEVICE)
    print(f"当前验证准确率: {valid_acc:.4f}")

    early_stopping(valid_acc)
    if early_stopping.early_stop:
        print("验证准确率未提升，提前停止训练")
        break

    scheduler.step()
    print(f"当前学习率: {scheduler.get_last_lr()[0]:.3e}")

    if valid_acc > best_acc:
        best_acc = valid_acc
        save_lora_checkpoint(model_lora, optimizer_lora, scheduler, epoch, best_acc, lora_params_path)
        print(f"保存新的最佳LoRA参数，当前最佳验证准确率: {best_acc:.4f}")

LoRA模型参数已从 best_lora_parameters.pth 加载
加载训练点LoRA参数成功，当前准确率为0.9882，从第2个epoch开始训练...

Epoch 1/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9870
当前学习率: 4.187e-03

Epoch 2/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9872
当前学习率: 4.149e-03

Epoch 3/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9873
当前学习率: 4.104e-03

Epoch 4/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9864
当前学习率: 4.051e-03

Epoch 5/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9888
当前学习率: 3.990e-03
LoRA模型参数已保存到 best_lora_parameters.pth
保存新的最佳LoRA参数，当前最佳验证准确率: 0.9888

Epoch 6/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9884
当前学习率: 3.922e-03

Epoch 7/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9867
当前学习率: 3.847e-03

Epoch 8/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9892
当前学习率: 3.766e-03
LoRA模型参数已保存到 best_lora_parameters.pth
保存新的最佳LoRA参数，当前最佳验证准确率: 0.9892

Epoch 9/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9872
当前学习率: 3.677e-03

Epoch 10/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9879
当前学习率: 3.583e-03

Epoch 11/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9878
当前学习率: 3.483e-03

Epoch 12/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9860
当前学习率: 3.377e-03

Epoch 13/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9878
当前学习率: 3.267e-03

Epoch 14/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9883
当前学习率: 3.152e-03

Epoch 15/50
--------------------------------------------------


                                                                                                    

当前验证准确率: 0.9884
验证准确率未提升，提前停止训练




对于输入向量矩阵 $X \in \mathbb{R}^{T \times d}$（$T$ 为序列长度，$d$ 为维度）：
1. 计算 Query、Key、Value：
2. 计算注意力分数：
结果仍然是 $(T, d)$，但每个 token 的输出向量都变成了融合其他 token 信息后的新向量。

In [5]:
import torch
import torch.nn as nn
import math

# =============================================================================
# 1. 定义我们自己的、结构清晰的 CustomMHA
# =============================================================================
class CustomMHA(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        q = self.q_proj(x).reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        attn_output = attn_weights @ v
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, self.embed_dim)

        return self.out_proj(attn_output)

# =============================================================================
# 2. 验证脚本主流程
# =============================================================================
if __name__ == "__main__":
    # --- 参数设置 ---
    embed_dim = 128
    num_heads = 8
    batch_size = 4
    seq_len = 10

    # --- 实例化两个模块 ---
    # 使用 batch_first=True 以匹配现代的常用做法
    torch_mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
    custom_mha = CustomMHA(embed_dim, num_heads)

    print("--- 步骤1: 实例化两个MHA模块 ---")
    print("PyTorch MHA 实例已创建。")
    print("Custom MHA 实例已创建。")

    # --- 关键：权重迁移 ---
    # 将 PyTorch MHA 的权重复制到我们的 CustomMHA 中
    with torch.no_grad():
        # PyTorch 将 Q, K, V 的权重合并在一个 `in_proj_weight` 中，我们需要拆分它
        # 形状: (3 * embed_dim, embed_dim) -> 分成三块 (embed_dim, embed_dim)
        custom_mha.q_proj.weight.data = torch_mha.in_proj_weight.data[:embed_dim, :]
        custom_mha.k_proj.weight.data = torch_mha.in_proj_weight.data[embed_dim:2*embed_dim, :]
        custom_mha.v_proj.weight.data = torch_mha.in_proj_weight.data[2*embed_dim:, :]

        # 同样地处理偏置项 (bias)
        custom_mha.q_proj.bias.data = torch_mha.in_proj_bias.data[:embed_dim]
        custom_mha.k_proj.bias.data = torch_mha.in_proj_bias.data[embed_dim:2*embed_dim]
        custom_mha.v_proj.bias.data = torch_mha.in_proj_bias.data[2*embed_dim:]

        # 输出投影层的权重是分开的，可以直接复制
        custom_mha.out_proj.weight.data = torch_mha.out_proj.weight.data
        custom_mha.out_proj.bias.data = torch_mha.out_proj.bias.data

    print("\n--- 步骤2: 权重迁移 ---")
    print("已将 torch_mha 的权重成功复制到 custom_mha。")

    # --- 准备输入数据 ---
    input_tensor = torch.randn(batch_size, seq_len, embed_dim)

    # --- 执行并比较输出 ---
    # 设置为评估模式以确保行为确定性 (例如，关闭dropout)
    torch_mha.eval()
    custom_mha.eval()

    # 获取输出
    # nn.MultiheadAttention 的 forward 需要 query, key, value
    # 在自注意力中，它们是同一个张量
    # 它返回一个元组 (attn_output, attn_output_weights)
    output_torch, _ = torch_mha(input_tensor, input_tensor, input_tensor)
    output_custom = custom_mha(input_tensor)

    print("\n--- 步骤3: 执行前向传播 ---")
    print(f"输入张量形状: {input_tensor.shape}")
    print(f"torch_mha 输出形状: {output_torch.shape}")
    print(f"custom_mha 输出形状: {output_custom.shape}")

    # --- 最终验证 ---
    # 使用 torch.allclose 来比较两个张量是否在数值上非常接近
    are_outputs_equal = torch.allclose(output_torch, output_custom)

    print("\n--- 步骤4: 最终验证 ---")
    print(f"两个模块的输出是否一致? -> {are_outputs_equal}")

    if are_outputs_equal:
        print("\n[结论] 验证成功！✅ CustomMHA 完美复刻了 nn.MultiheadAttention 的功能。")
    else:
        print("\n[结论] 验证失败！❌ 输出不一致，请检查权重迁移逻辑。")
        # 打印差值的绝对和，以供调试
        print(f"差值总和: {torch.sum(torch.abs(output_torch - output_custom))}")


--- 步骤1: 实例化两个MHA模块 ---
PyTorch MHA 实例已创建。
Custom MHA 实例已创建。

--- 步骤2: 权重迁移 ---
已将 torch_mha 的权重成功复制到 custom_mha。

--- 步骤3: 执行前向传播 ---
输入张量形状: torch.Size([4, 10, 128])
torch_mha 输出形状: torch.Size([4, 10, 128])
custom_mha 输出形状: torch.Size([4, 10, 128])

--- 步骤4: 最终验证 ---
两个模块的输出是否一致? -> True

[结论] 验证成功！✅ CustomMHA 完美复刻了 nn.MultiheadAttention 的功能。


In [8]:
batch_size, seq_len, d_model = 300, 50, 128
X = torch.randn(batch_size, seq_len, d_model)
h = 8  # 注意力头数
d_k = d_model // h  # 每个头的维度
d_v = d_model // h
q_proj = nn.Linear(d_model, d_k)
k_proj = nn.Linear(d_model, d_k)
v_proj = nn.Linear(d_model, d_v)
Q = q_proj(X)
print(Q.shape)  # (batch_size, seq_len, d_k)

torch.Size([300, 50, 16])


In [None]:
class SelfAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        d_k = d_model // 1  # 1个注意力头
        d_v = d_model // 1
        self.q_proj = nn.Linear(d_model, d_k)
        self.k_proj = nn.Linear(d_model, d_k)
        self.v_proj = nn.Linear(d_model, d_v)
        self.out_proj = nn.Linear(d_v, d_model)

    def forward(self, x):
        Q = self.q_proj(x) # (batch_size, seq_len, d_k)
        K = self.k_proj(x) # (batch_size, seq_len, d_k)
        V = self.v_proj(x) # (batch_size, seq_len, d_v)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / (K.size(-1) ** 0.5)  # (batch_size, seq_len, seq_len)

In [9]:
class TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(10, 20, 3, padding=1),
            nn.ReLU(),
            nn.Linear(20, 10)
        )
        self.fc = nn.Linear(10, 1)

model = TestModel()
model._modules

{'layer1': Sequential(
   (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   (1): ReLU()
   (2): Linear(in_features=20, out_features=10, bias=True)
 ),
 'fc': Linear(in_features=10, out_features=1, bias=True)}

In [11]:
for name, module in model.named_modules():
    print(f"层名称: {name}, 模块: {module}")

层名称: , 模块: TestModel(
  (layer1): Sequential(
    (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Linear(in_features=20, out_features=10, bias=True)
  )
  (fc): Linear(in_features=10, out_features=1, bias=True)
)
层名称: layer1, 模块: Sequential(
  (0): Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU()
  (2): Linear(in_features=20, out_features=10, bias=True)
)
层名称: layer1.0, 模块: Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
层名称: layer1.1, 模块: ReLU()
层名称: layer1.2, 模块: Linear(in_features=20, out_features=10, bias=True)
层名称: fc, 模块: Linear(in_features=10, out_features=1, bias=True)
