### 实现《自然语言处理：基于预训练模型的方法》第四章代码（上，4.1~4.5）

1. MLP实现

In [1]:
# 导入所需库
import torch
from torch import nn
from torch.nn import functional as F

In [4]:
# 创建一个MLP类
class MLP(nn.Module):
    """Multi-layer perceptron with one ReLU hidden layer and a softmax output.

    The forward pass prints every intermediate tensor so that each stage of
    the computation can be inspected in the notebook.
    """

    def __init__(self, input_dim, hidden_dim, num_class):
        """Create the two linear layers.

        Args:
            input_dim: size of each input feature vector.
            hidden_dim: number of units in the hidden layer.
            num_class: number of output classes.
        """
        super(MLP, self).__init__()

        # Hidden layer: linear transformation
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        # ReLU activation: values below 0 are mapped to 0
        self.activate = F.relu
        # Output layer: linear transformation
        self.linear2 = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs):
        """Run the forward pass and return class probabilities.

        Args:
            inputs: tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor of the same leading shape with a last dimension of size
            ``num_class``; the values along that last dimension sum to 1.
        """
        print(f"输入为：{inputs}")
        hidden = self.linear1(inputs)
        print(f"经过隐含层变换为：{hidden}")
        activation = self.activate(hidden)
        print(f"经过激活后为：{activation}")
        outputs = self.linear2(activation)
        print(f"输出层输出为：{outputs}")
        # Normalise over the class axis. nn.Linear always puts the classes on
        # the last axis, so dim = -1 is correct for a single sample, a 2-D
        # batch, and higher-rank inputs alike (dim = 1 only worked for 2-D).
        probs = F.softmax(outputs, dim = -1)
        print(f"输出概率值为：{probs}")
        return probs


In [5]:
# Instantiate the model and call it on a random batch
mlp = MLP(input_dim = 4, hidden_dim = 5, num_class = 2)
# torch.rand(3, 4): 3 samples with 4 features each, matching input_dim = 4
inputs = torch.rand(3, 4)
# MLP.forward prints each intermediate tensor and returns the softmax probabilities
probs = mlp(inputs)
print(probs)

输入为：tensor([[0.2840, 0.3345, 0.5389, 0.4109],
        [0.1151, 0.6950, 0.7657, 0.8501],
        [0.1254, 0.6561, 0.3595, 0.6281]])
经过隐含层变换为：tensor([[ 0.0918, -0.3648,  0.0221, -0.0366,  0.2924],
        [ 0.0231, -0.3179, -0.2034,  0.1455,  0.2739],
        [-0.0328, -0.2742,  0.0010,  0.0370,  0.2395]],
       grad_fn=<AddmmBackward0>)
经过激活后为：tensor([[0.0918, 0.0000, 0.0221, 0.0000, 0.2924],
        [0.0231, 0.0000, 0.0000, 0.1455, 0.2739],
        [0.0000, 0.0000, 0.0010, 0.0370, 0.2395]], grad_fn=<ReluBackward0>)
输出层输出为：tensor([[0.4405, 0.1669],
        [0.5124, 0.1831],
        [0.4834, 0.1595]], grad_fn=<AddmmBackward0>)
输出概率值为：tensor([[0.5680, 0.4320],
        [0.5816, 0.4184],
        [0.5803, 0.4197]], grad_fn=<SoftmaxBackward0>)
tensor([[0.5680, 0.4320],
        [0.5816, 0.4184],
        [0.5803, 0.4197]], grad_fn=<SoftmaxBackward0>)


2. 卷积神经网络实现

In [6]:
# 导入所需库
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d
from torch.nn import MaxPool1d

In [27]:
# 创建一个CNN类
# 注：该类为笔者自行实现
class CNN(nn.Module):
    """1-D convolutional classifier: convolution, max pooling over the whole
    convolved sequence, then a linear output layer.

    The forward pass prints the tensor size after every stage.
    """

    def __init__(self, input_dim, output_dim, num_class, kernel_size):
        """Create the layers.

        Args:
            input_dim: number of input channels fed to the convolution.
            output_dim: number of channels produced by the convolution.
            num_class: number of output classes.
            kernel_size: width of the convolution kernel.
        """
        super().__init__()

        # Convolution layer
        self.conv = Conv1d(
            in_channels=input_dim,
            out_channels=output_dim,
            kernel_size=kernel_size,
        )
        # Pooling is done with the functional max-pooling op
        self.pool = F.max_pool1d
        # Output layer: linear transformation
        self.linear = nn.Linear(in_features=output_dim, out_features=num_class)

    def forward(self, inputs):
        """Run the forward pass.

        Args:
            inputs: 3-D tensor; dimension 1 must have size ``input_dim``.

        Returns:
            2-D tensor of unnormalised class scores with ``num_class`` columns.
        """
        print(f"输入size为：{inputs.size()}")
        feature_map = self.conv(inputs)
        print(f"经过卷积层变换size为：{feature_map.size()}")
        # The pooling window covers the full convolved length, so exactly one
        # maximum per channel is left.
        pooled = self.pool(feature_map, kernel_size = feature_map.size(2))
        print(f"经过池化后size为：{pooled.size()}")
        # Drop the now length-1 sequence axis before the linear layer
        scores = self.linear(pooled.squeeze(dim=2))
        print(f"输出层输出size为：{scores.size()}")
        return scores


In [26]:
# Instantiate the model and call it on a random batch
# Positional arguments: input_dim=5, output_dim=2, num_class=2, kernel_size=4
cnn = CNN(5, 2, 2, 4)
# Shape (2, 5, 6): batch of 2, 5 input channels, sequence length 6
inputs = torch.rand(2,5,6)
# NOTE: CNN.forward applies no softmax, so despite its name `probs` holds raw scores
probs = cnn(inputs)
print(probs)

输入size为：torch.Size([2, 5, 6])
经过卷积层变换size为：torch.Size([2, 2, 3])
经过池化后size为：torch.Size([2, 2, 1])
输出层输出size为：torch.Size([2, 2])
tensor([[0.1097, 0.4246],
        [0.0275, 0.3587]], grad_fn=<AddmmBackward0>)


3. 循环神经网络实现

In [16]:
# 导入所需库
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import RNN
from torch.nn import LSTM

In [2]:
# An RNN can be built directly from the class provided by torch.nn
rnn = RNN(input_size = 4, hidden_size = 5, batch_first = True)
# An RNN with 4-dimensional inputs and 5-dimensional outputs;
# batch_first = True means tensors are laid out as (batch, seq_len, feature)

In [3]:
inputs = torch.rand(2,3,4)
# Batch size 2, sequence length 3, input dimension 4
# The call returns the outputs of every time step plus the final hidden state
outputs, hn = rnn(inputs)

In [4]:
outputs
# outputs holds the output at every time step

tensor([[[ 0.2286,  0.8542,  0.4634, -0.4545,  0.8325],
         [ 0.5414,  0.8282,  0.7302, -0.6027,  0.8845],
         [ 0.5205,  0.9020,  0.7551, -0.6717,  0.8920]],

        [[ 0.3798,  0.7895,  0.4614, -0.3672,  0.7947],
         [ 0.2242,  0.8136,  0.7505, -0.6971,  0.8387],
         [ 0.7363,  0.9302,  0.7960, -0.6228,  0.9013]]],
       grad_fn=<TransposeBackward1>)

In [5]:
hn
# hn is the hidden-layer output at the last time step

tensor([[[ 0.5205,  0.9020,  0.7551, -0.6717,  0.8920],
         [ 0.7363,  0.9302,  0.7960, -0.6228,  0.9013]]],
       grad_fn=<StackBackward0>)

In [11]:
# Build another RNN with different constructor arguments

new_rnn = RNN(input_size = 4, hidden_size = 5, batch_first = True, bidirectional = False, num_layers = 3)
# bidirectional: whether the recurrence runs in both directions (disabled here)
# num_layers: number of stacked hidden (recurrent) layers
outputs, hn = new_rnn(inputs)

In [12]:
outputs
# Output at every time step; the recorded result is still of shape (2, 3, 5) with num_layers = 3

tensor([[[ 0.4804, -0.3505, -0.1715,  0.4986,  0.1992],
         [ 0.1356, -0.6146, -0.1079,  0.5493, -0.1920],
         [ 0.0440, -0.5984,  0.0594,  0.6036, -0.3099]],

        [[ 0.4907, -0.4173, -0.1886,  0.4708,  0.1714],
         [ 0.0991, -0.6351, -0.1140,  0.5796, -0.2207],
         [ 0.0855, -0.5836,  0.0795,  0.5775, -0.3130]]],
       grad_fn=<TransposeBackward1>)

In [15]:
hn
# One final hidden state per layer: the recorded result has a leading dimension of 3 (num_layers = 3)

tensor([[[ 0.0713, -0.6564, -0.5520, -0.2442,  0.0394],
         [ 0.0119, -0.4096, -0.6544, -0.2719,  0.0766]],

        [[-0.4639, -0.4273,  0.4186, -0.8699, -0.4914],
         [-0.5272, -0.3858,  0.3944, -0.8775, -0.4276]],

        [[ 0.0440, -0.5984,  0.0594,  0.6036, -0.3099],
         [ 0.0855, -0.5836,  0.0795,  0.5775, -0.3130]]],
       grad_fn=<StackBackward0>)

In [17]:
# An LSTM can likewise be built directly from the class provided by torch.nn
lstm = LSTM(input_size = 4, hidden_size = 5, batch_first = True)

In [19]:
# Besides the per-step outputs, the LSTM returns a (hidden state, cell state) pair
outputs, (hn, cn) = lstm(inputs)

In [20]:
outputs
# Same meaning as for the RNN: the output at every time step

tensor([[[ 0.0135,  0.1853,  0.1818, -0.0742, -0.0463],
         [ 0.0095,  0.2240,  0.3504,  0.0161, -0.1934],
         [-0.0084,  0.2773,  0.3812,  0.0006, -0.2226]],

        [[ 0.0207,  0.1564,  0.2159, -0.0242, -0.0964],
         [ 0.0205,  0.2207,  0.2431, -0.1132, -0.0822],
         [ 0.0554,  0.3030,  0.3500, -0.1545, -0.0900]]],
       grad_fn=<TransposeBackward0>)

In [21]:
hn
# Same meaning as for the RNN: the hidden-layer output at the last time step

tensor([[[-0.0084,  0.2773,  0.3812,  0.0006, -0.2226],
         [ 0.0554,  0.3030,  0.3500, -0.1545, -0.0900]]],
       grad_fn=<StackBackward0>)

In [22]:
cn
# cn is the value of the memory cell at the last time step

tensor([[[-0.0177,  0.5129,  0.8251,  0.0011, -0.4486],
         [ 0.0943,  0.5583,  0.8232, -0.2650, -0.1795]]],
       grad_fn=<StackBackward0>)

4. 注意力网络实现

In [24]:
# 导入所需库
import torch
from torch import nn

In [25]:
# Build a Transformer encoder layer
# Input/output dimension is 4 and there are 2 attention heads.
# batch_first=True makes the layer read tensors as (batch, seq_len, feature),
# the layout this notebook uses for `inputs` (batch 2, sequence length 3,
# dimension 4). With the default batch_first=False the same tensor would be
# interpreted as seq_len=2, batch=3 and attention would mix different samples.
encoder_layer = nn.TransformerEncoderLayer(d_model=4, nhead=2, batch_first=True)
# Stack 5 copies of the layer to form the full encoder
encoder = nn.TransformerEncoder(encoder_layer, num_layers=5)
# `inputs` is the (2, 3, 4) tensor created in the RNN section of this notebook
outputs = encoder(inputs)

In [26]:
outputs
# Encoder output; the recorded result has the same shape as the input, (2, 3, 4)

tensor([[[-0.1555,  1.6035, -1.1460, -0.3020],
         [-0.7393,  1.5755, -0.9756,  0.1395],
         [ 0.1729,  0.6933,  0.8147, -1.6809]],

        [[-0.8905,  1.2357, -1.0727,  0.7276],
         [ 1.6451, -0.4516, -0.1625, -1.0311],
         [-0.8592,  0.5700, -1.0586,  1.3477]]],
       grad_fn=<NativeLayerNormBackward0>)

In [27]:
# Build a Transformer decoder layer
# batch_first=True makes the layer read both the target and the memory as
# (batch, seq_len, feature), the layout this notebook uses for its
# (2, 3, 4) tensors; the default batch_first=False would treat the first
# axis as the sequence axis instead.
decoder_layer = nn.TransformerDecoderLayer(d_model=4, nhead=2, batch_first=True)
# Stack 5 copies of the layer to form the full decoder
decoder = nn.TransformerDecoder(decoder_layer, num_layers=5)

In [28]:
inputs_another = torch.rand(2,3,4)
final_outputs = decoder(inputs_another, outputs)
# inputs_another is the current (target) input; outputs is the encoder's output, passed in as the memory

In [29]:
final_outputs
# Decoder output; the recorded result has the same shape as inputs_another, (2, 3, 4)

tensor([[[ 0.1466,  1.5527, -0.6007, -1.0985],
         [ 0.4964, -1.6062,  0.0267,  1.0831],
         [ 0.0523,  1.4473, -0.1259, -1.3736]],

        [[ 0.0510,  1.5564, -0.4276, -1.1798],
         [ 0.0870, -1.5946,  0.3580,  1.1496],
         [-1.1281,  0.9141,  1.0737, -0.8596]]],
       grad_fn=<NativeLayerNormBackward0>)

5. 梯度下降法训练

In [30]:
# 导入所需库
import torch
from torch import nn, optim
from torch.nn import functional as F

In [31]:
# 创建一个使用logSoftmax操作的MLP类
class MLP(nn.Module):
    """Multi-layer perceptron with one ReLU hidden layer and a log-softmax output.

    The output is log-probabilities, which is the input format expected by
    ``nn.NLLLoss``.
    """

    def __init__(self, input_dim, hidden_dim, num_class):
        """Create the two linear layers.

        Args:
            input_dim: size of each input feature vector.
            hidden_dim: number of units in the hidden layer.
            num_class: number of output classes.
        """
        super(MLP, self).__init__()

        # Hidden layer: linear transformation
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        # ReLU activation: values below 0 are mapped to 0
        self.activate = F.relu
        # Output layer: linear transformation
        self.linear2 = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs):
        """Run the forward pass and return class log-probabilities.

        Args:
            inputs: tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor of the same leading shape with a last dimension of size
            ``num_class`` holding log-probabilities (their exponentials sum
            to 1 along the last dimension).
        """
        hidden = self.linear1(inputs)
        activation = self.activate(hidden)
        outputs = self.linear2(activation)
        # Normalise over the class axis. nn.Linear always puts the classes on
        # the last axis, so dim = -1 is correct for a single sample, a 2-D
        # batch, and higher-rank inputs alike (dim = 1 only worked for 2-D).
        log_probs = F.log_softmax(outputs, dim = -1)
        return log_probs

In [49]:
# Build the inputs and labels of the XOR problem
train_data = torch.tensor([[0.0,0.0], [0.0,1.0], [1.0,0.0], [1.0,1.0]])
# The label is 1 exactly when the two input values differ
train_label = torch.tensor([0,1,1,0])

In [50]:
# Create the MLP model: 2 input features, 5 hidden units, 2 classes
mlp = MLP(input_dim=2, hidden_dim=5, num_class=2)

In [51]:
# Inspect the predictions of the untrained model
y_pre = mlp(train_data)
y_pre.argmax(dim = 1)
# The index of the largest value in each row is taken as the predicted class

tensor([1, 1, 1, 1])

In [52]:
# Negative log-likelihood loss
criterion = nn.NLLLoss()
# Gradient descent over all model parameters
optimizer = optim.SGD(mlp.parameters(), lr=0.05)
for epoch in range(500):
    # Reset the gradients held by the optimizer before computing new ones
    optimizer.zero_grad()
    # Forward pass over the whole training set, then the loss against the labels
    y = mlp(train_data)
    loss = criterion(y, train_label)
    # Back-propagate the gradients
    loss.backward()
    # Update the parameters
    optimizer.step()

In [53]:
# Print the parameters after training
for name, param in mlp.named_parameters():
    # param.data is the parameter's underlying tensor
    print(name, param.data)

linear1.weight tensor([[ 0.7744,  0.6532],
        [-0.0648, -0.3256],
        [-0.1493, -0.2856],
        [ 0.4949, -0.1721],
        [-1.2461, -1.2449]])
linear1.bias tensor([-0.2786,  0.7907,  0.4438, -0.4979,  1.2473])
linear2.weight tensor([[ 0.5600, -0.2013, -0.5235,  0.3350,  1.5162],
        [-0.6237,  0.6042,  0.1783,  0.0218, -1.3404]])
linear2.bias tensor([-0.4435,  0.2404])


In [54]:
# Predictions of the trained model on the training inputs
y_pre = mlp(train_data)
y_pre.argmax(dim = 1)
# The index of the largest value in each row is taken as the predicted class

tensor([0, 1, 1, 0])