### 实现《自然语言处理——预训练模型方法》第四章代码下（4.6~4.7）

1. 情感分类词表准备

In [1]:
from collections import defaultdict

In [2]:
# 定义一个词表类型
# 该类用于实现token到索引的映射
class Vocab:

    def __init__(self, tokens = None) -> None:
        # 构造函数
        # tokens：全部的token列表

        self.idx_to_token = list()
        # 将token存成列表，索引直接查找对应的token即可
        self.token_to_idx = dict()
        # 将索引到token的映射关系存成字典，键为索引，值为对应的token

        if tokens is not None:
            # 构造时输入了token的列表
            if "<unk>" not in tokens:
                # 不存在标记
                tokens = tokens + "<unk>"
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
                # 当前该token对应的索引是当下列表的最后一个
            self.unk = self.token_to_idx["<unk>"]

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        # 构建词表
        # cls：该类本身
        # text：输入的文本
        # min_freq：列入token的最小频率
        # reserved_tokens：额外的标记token
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        # 统计各个token的频率
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        # 加入额外的token
        uniq_tokens += [token for token, freq in token_freqs.items() \
            if freq >= min_freq and token != "<unk>"]
        # 全部的token列表
        return cls(uniq_tokens)

    def __len__(self):
        # 返回词表的大小
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # 查找输入token对应的索引，不存在则返回<unk>返回的索引
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        # 查找一系列输入标签对应的索引值
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, ids):
        # 查找一系列索引值对应的标记
        return [self.idx_to_token[index] for index in ids]


2. 多层感知机

In [3]:
import torch
from torch import nn
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [71]:
# 创建一个MLP类
class MLP(nn.Module):
    # 基类为nn.Module
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        # 构造函数
        # vocab_size:词表大小
        # embedding_dim：词向量维度
        # hidden_dim:隐藏层维度
        # num_class:多分类个数
        super(MLP, self).__init__()

        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim)
        # 词向量层
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        # 隐含层，线性变换
        self.activate = F.relu
        # 使用relu函数作为激活函数：小于0的值输出为0
        self.linear2 = nn.Linear(hidden_dim, num_class)
        # 输出层，线性变换

    def forward(self, inputs, offsets):
        # 前向计算函数
        # inputs:输入
        # print(f"输入为：{inputs.size()}")
        embeds = self.embedding(inputs, offsets)
        # 对词向量层取袋模型
        hidden = self.linear1(embeds)
        # print(f"经过隐含层变换为：{hidden}")
        activation = self.activate(hidden)
        # print(f"经过激活后为：{activation}")
        outputs = self.linear2(activation)
        # print(f"输出层输出为：{outputs}")
        probs = F.log_softmax(outputs, dim = 1)
        # print(f"输出概率值为：{probs}")
        # 归一化为概率值
        return probs


3. 数据处理

In [4]:
from nltk.corpus import sentence_polarity
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [5]:
def load_sentence_polarity():

    vocab = Vocab.build(sentence_polarity.sents())
    # 使用nltk的情感倾向数据作为示例

    train_data = [(vocab.convert_tokens_to_ids(sentence), 0) for sentence in sentence_polarity.sents(categories="pos")[:4000]]\
        +[(vocab.convert_tokens_to_ids(sentence), 1) for sentence in sentence_polarity.sents(categories='neg')[:4000]]
    # 分别取褒贬各4000句作为训练数据，将token映射为对应的索引值

    test_data = [(vocab.convert_tokens_to_ids(sentence), 0) for sentence in sentence_polarity.sents(categories="pos")[4000:]]\
        +[(vocab.convert_tokens_to_ids(sentence), 1) for sentence in sentence_polarity.sents(categories="neg")[4000:]]
    # 其余数据作为测试数据

    return train_data, test_data, vocab


In [6]:
# 声明一个DataSet类
class BowDataset(Dataset):

    def __init__(self, data) -> None:
        # data：使用load_sentence_polarity获得的数据
        self.data = data

    def __len__(self):
        # 返回样例的数目
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [7]:
# collate_fn函数用于对一个批次的样本进行整理
def collate_fn(examples):
    # 从独立样本集合中构建各批次的输入输出
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # 将输入inputs定义为一个张量的列表，每一个张量为句子对应的索引值序列
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    # 目标targets为该批次所有样例输出结果构成的张量
    offsets = [0] + [i.shape[0] for i in inputs]
    # 一个批次中每个样例的序列长度
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    # 根据序列长度计算每个序列起始位置的偏移量
    inputs = torch.cat(inputs)
    # 将inputs列表中的张量拼接成一个大的张量
    return inputs, offsets, targets

4. 训练多层感知机模型

In [8]:
from tqdm import tqdm
from collections import defaultdict
from torch import optim

In [79]:
# 超参数设置
embedding_dim = 128
hidden_dim = 256
num_class = 2
batch_size = 16
num_epoch = 10

train_data, test_data, vocab = load_sentence_polarity()
# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MLP(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)
# 加载模型

MLP(
  (embedding): EmbeddingBag(21402, 128, mode=mean)
  (linear1): Linear(in_features=128, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=2, bias=True)
)

In [78]:
nll_loss = nn.NLLLoss()
# 负对数似然损失
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Adam优化器


In [80]:
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, offsets, targets = [x.to(device) for x in batch]
        log_probs = model(inputs, offsets)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

    # 测试
    acc = 0
    for batch in tqdm(test_data_loader, desc=f"Testing"):
        inputs, offsets, targets = [x.to(device) for x in batch]
        with torch.no_grad():
            output = model(inputs, offsets)
            acc += (output.argmax(dim=1) == targets).sum().item()
    print(f"ACC:{acc / len(test_data_loader):.2f}")

Training Epoch 0: 100%|██████████| 500/500 [00:01<00:00, 442.29it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1547.68it/s]


ACC:0.52


Training Epoch 1: 100%|██████████| 500/500 [00:00<00:00, 502.54it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1506.08it/s]


ACC:0.52


Training Epoch 2: 100%|██████████| 500/500 [00:01<00:00, 466.28it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1561.90it/s]


ACC:0.52


Training Epoch 3: 100%|██████████| 500/500 [00:01<00:00, 453.93it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1655.60it/s]


ACC:0.52


Training Epoch 4: 100%|██████████| 500/500 [00:01<00:00, 495.87it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1755.88it/s]


ACC:0.52


Training Epoch 5: 100%|██████████| 500/500 [00:00<00:00, 522.02it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1746.85it/s]


ACC:0.52


Training Epoch 6: 100%|██████████| 500/500 [00:00<00:00, 500.12it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1689.89it/s]


ACC:0.52


Training Epoch 7: 100%|██████████| 500/500 [00:01<00:00, 496.73it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1777.25it/s]


ACC:0.52


Training Epoch 8: 100%|██████████| 500/500 [00:00<00:00, 551.42it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1837.65it/s]


ACC:0.52


Training Epoch 9: 100%|██████████| 500/500 [00:00<00:00, 525.89it/s]


Loss:345.66


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1829.48it/s]

ACC:0.52





5. 训练卷积神经网络模型

In [9]:
from torch.nn.utils.rnn import pad_sequence

In [84]:
# 创建一个CNN类
class CNN(nn.Module):
    # 基类为nn.Module
    def __init__(self, vocab_size, embedding_dim, filter_size, num_filter, num_class):
        # 构造函数
        # vocab_size:词表大小
        # embedding_dim：词向量维度
        # filter_size：卷积核大小
        # num_filter: 卷积核个数
        # num_class:多分类个数
        super(CNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.conv1d = nn.Conv1d(embedding_dim, num_filter, filter_size, padding=1)
        # 卷积层，使用1作为padding
        self.activate = F.relu
        # 使用relu函数作为激活函数：小于0的值输出为0
        self.linear = nn.Linear(num_filter, num_class)
        # 输出层，线性变换

    def forward(self, inputs):
        # 前向计算函数
        # inputs:输入
        # print(f"输入为：{inputs.size()}")
        embeds = self.embedding(inputs).permute(0, 2, 1)
        # 注意这儿是词向量层，不是词袋词向量层
        # 卷积层的输入两个维度与词向量层输出相反，需要使用permute转换一下
        # print(f"词向量层输出为：{embeds.size()}")
        convolution = self.conv1d(embeds)
        # print(f"经过卷积层变换为：{convolution.size()}")
        activation = self.activate(convolution)
        # print(f"经过激活后为：{activation.size()}")
        pooling = F.max_pool1d(activation, kernel_size=activation.shape[-1])
        # print(f"池化后为：{pooling.size()}")
        # print(f"池化后结果为：{pooling}")
        outputs = self.linear(pooling.squeeze(dim=2))
        # 池化后的输出是二维的，需要使用squeeze降维到一维
        # print(f"输出层输出为：{outputs.size()}")
        log_probs = F.log_softmax(outputs, dim = 1)
        # print(f"输出概率值为：{probs}")
        # 归一化为概率值
        return log_probs


In [85]:
# 修改collate_fn函数
def collate_fn(examples):
    # 从独立样本集合中构建各批次的输入输出
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # 将输入inputs定义为一个张量的列表，每一个张量为句子对应的索引值序列
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    # 目标targets为该批次所有样例输出结果构成的张量
    inputs = pad_sequence(inputs, batch_first=True)
    # 将用pad_sequence对批次类的样本进行补齐
    return inputs, targets

In [86]:
# 训练
# 超参数设置
embedding_dim = 128
filter_size = 3
num_filter = 150
num_class = 2
batch_size = 16
num_epoch = 10

train_data, test_data, vocab = load_sentence_polarity()
# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN(len(vocab), embedding_dim, filter_size, num_filter, num_class)
model.to(device)
# 加载模型

CNN(
  (embedding): Embedding(21402, 128)
  (conv1d): Conv1d(128, 150, kernel_size=(3,), stride=(1,), padding=(1,))
  (linear): Linear(in_features=150, out_features=2, bias=True)
)

In [87]:
nll_loss = nn.NLLLoss()
# 负对数似然损失
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        # print(inputs.size())
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

    # 测试
    acc = 0
    for batch in tqdm(test_data_loader, desc=f"Testing"):
        inputs, targets = [x.to(device) for x in batch]
        with torch.no_grad():
            output = model(inputs)
            acc += (output.argmax(dim=1) == targets).sum().item()
    print(f"ACC:{acc / len(test_data_loader):.2f}")

Training Epoch 0: 100%|██████████| 500/500 [00:04<00:00, 110.67it/s]


Loss:325.78


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1486.70it/s]


ACC:0.66


Training Epoch 1: 100%|██████████| 500/500 [00:02<00:00, 190.38it/s]


Loss:226.80


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1484.76it/s]


ACC:0.69


Training Epoch 2: 100%|██████████| 500/500 [00:02<00:00, 216.43it/s]


Loss:103.71


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1518.83it/s]


ACC:0.69


Training Epoch 3: 100%|██████████| 500/500 [00:02<00:00, 218.49it/s]


Loss:28.00


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1491.96it/s]


ACC:0.71


Training Epoch 4: 100%|██████████| 500/500 [00:02<00:00, 209.01it/s]


Loss:6.69


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1438.43it/s]


ACC:0.71


Training Epoch 5: 100%|██████████| 500/500 [00:02<00:00, 214.44it/s]


Loss:2.45


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1454.87it/s]


ACC:0.72


Training Epoch 6: 100%|██████████| 500/500 [00:02<00:00, 207.29it/s]


Loss:1.41


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1386.58it/s]


ACC:0.72


Training Epoch 7: 100%|██████████| 500/500 [00:02<00:00, 206.65it/s]


Loss:0.90


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1471.20it/s]


ACC:0.72


Training Epoch 8: 100%|██████████| 500/500 [00:02<00:00, 215.41it/s]


Loss:0.61


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1529.55it/s]


ACC:0.72


Training Epoch 9: 100%|██████████| 500/500 [00:02<00:00, 217.27it/s]


Loss:0.43


Testing: 100%|██████████| 2662/2662 [00:01<00:00, 1402.13it/s]

ACC:0.72





6. 训练循环神经网络模型

In [10]:
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_sequence

In [89]:
# 创建一个LSTM类
class LSTM(nn.Module):
    # 基类为nn.Module
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        # 构造函数
        # vocab_size:词表大小
        # embedding_dim：词向量维度
        # hidden_dim：隐藏层维度
        # num_class:多分类个数
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)
        # lstm层
        self.output = nn.Linear(hidden_dim, num_class)
        # 输出层，线性变换

    def forward(self, inputs, lengths):
        # 前向计算函数
        # inputs:输入
        # lengths:打包的序列长度
        # print(f"输入为：{inputs.size()}")
        embeds = self.embedding(inputs)
        # 注意这儿是词向量层，不是词袋词向量层
        # print(f"词向量层输出为：{embeds.size()}")
        x_pack = pack_padded_sequence(embeds, lengths.to('cpu'), batch_first=True, enforce_sorted=False)
        # LSTM需要定长序列，使用该函数将变长序列打包
        # print(f"经过打包为：{x_pack.size()}")
        hidden, (hn, cn) = self.lstm(x_pack)
        # print(f"经过lstm计算后为：{hn.size()}")
        outputs = self.output(hn[-1])
        # print(f"输出层输出为：{outputs.size()}")
        log_probs = F.log_softmax(outputs, dim = -1)
        # print(f"输出概率值为：{probs}")
        # 归一化为概率值
        return log_probs


In [90]:
# 修改collate_fn函数
def collate_fn(examples):
    # 从独立样本集合中构建各批次的输入输出
    lengths = torch.tensor([len(ex[0]) for ex in examples])
    # 获取每个序列的长度
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # 将输入inputs定义为一个张量的列表，每一个张量为句子对应的索引值序列
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    # 目标targets为该批次所有样例输出结果构成的张量
    inputs = pad_sequence(inputs, batch_first=True)
    # 将用pad_sequence对批次类的样本进行补齐
    return inputs, lengths, targets

In [91]:
# 训练
# 超参数设置
embedding_dim = 128
hidden_dim = 24
batch_size = 16
num_epoch = 10

train_data, test_data, vocab = load_sentence_polarity()
# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(vocab), embedding_dim, hidden_dim, num_class)
model.to(device)
# 加载模型

LSTM(
  (embedding): Embedding(21402, 128)
  (lstm): LSTM(128, 24, batch_first=True)
  (output): Linear(in_features=24, out_features=2, bias=True)
)

In [92]:
nll_loss = nn.NLLLoss()
# 负对数似然损失
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        # print(inputs.size())
        log_probs = model(inputs, lengths)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

    # 测试
    acc = 0
    for batch in tqdm(test_data_loader, desc=f"Testing"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        with torch.no_grad():
            output = model(inputs, lengths)
            acc += (output.argmax(dim=1) == targets).sum().item()
    print(f"ACC:{acc / len(test_data_loader):.2f}")

Training Epoch 0: 100%|██████████| 500/500 [00:05<00:00, 85.99it/s] 


Loss:343.55


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 646.27it/s]


ACC:0.58


Training Epoch 1: 100%|██████████| 500/500 [00:04<00:00, 106.40it/s]


Loss:288.63


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 656.96it/s]


ACC:0.68


Training Epoch 2: 100%|██████████| 500/500 [00:04<00:00, 108.57it/s]


Loss:178.49


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 653.37it/s]


ACC:0.71


Training Epoch 3: 100%|██████████| 500/500 [00:04<00:00, 112.18it/s]


Loss:93.57


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 655.99it/s]


ACC:0.71


Training Epoch 4: 100%|██████████| 500/500 [00:04<00:00, 107.58it/s]


Loss:47.02


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 664.92it/s]


ACC:0.71


Training Epoch 5: 100%|██████████| 500/500 [00:04<00:00, 108.07it/s]


Loss:23.27


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 661.75it/s]


ACC:0.70


Training Epoch 6: 100%|██████████| 500/500 [00:04<00:00, 107.98it/s]


Loss:12.83


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 623.25it/s]


ACC:0.71


Training Epoch 7: 100%|██████████| 500/500 [00:04<00:00, 109.21it/s]


Loss:10.88


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 662.88it/s]


ACC:0.71


Training Epoch 8: 100%|██████████| 500/500 [00:04<00:00, 110.45it/s]


Loss:7.61


Testing: 100%|██████████| 2662/2662 [00:03<00:00, 688.42it/s]


ACC:0.71


Training Epoch 9: 100%|██████████| 500/500 [00:04<00:00, 112.34it/s]


Loss:2.80


Testing: 100%|██████████| 2662/2662 [00:04<00:00, 662.64it/s]

ACC:0.71





7. 训练Transformer网络

In [11]:
import math

In [80]:
# 首先实现一个位置编码层
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=512) -> None:
        # d_model：模型计算公式中的参数
        # dropout：辍学率
        # max_len: 事先准备好的序列长度
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        # 生成全零的矩阵,5000*512的矩阵，5000个位置，每个位置用一个512维度向量来表示位置编码
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # 生成位置序列，unsqueeze用于升一个维度，(5000,) -> (5000,1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # 公式中的w_k
        pe[:, 0::2] = torch.sin(position * div_term)
        # 偶数位置编码
        pe[:, 1::2] = torch.cos(position * div_term)
        # 奇数位置编码
        pe = pe.unsqueeze(0).transpose(0, 1)    
        # 升维，为batch_size留出位置
        self.register_buffer('pe', pe)
        # 在内存中定一个常量，即将位置编码存进去

    def forward(self, x):
        # 前向计算
        # x:输入词向量序列
        # print(x.size())
        # print(self.pe.size())
        try:
            x = x + self.pe[:x.size(0), :]
        except:
            # print(x.size())
            # print(self.pe.size())
            pass
            # 针对有一个输入会出现问题，将其跳过，不进行位置编码
        # 输入词向量与位置编码相加
        return x

In [13]:
# 定义一个根据序列长度生成Mask矩阵的函数
def length_to_mask(lengths):
    # lengths:给定序列长度
    max_len = torch.max(lengths)
    # print("maxlen", max_len.is_cuda)
    # print(torch.arange(max_len).is_cuda)
    mask = torch.arange(max_len).to("cuda").expand(lengths.shape[0], max_len) < lengths.unsqueeze(1)
    return mask

In [96]:
# 创建一个Transformer类
# 此处书中代码有误，不需要hidden_dim，注意力层输入维度应该直接是词向量维度
class Transformer(nn.Module):
    # 基类为nn.Module
    def __init__(self, vocab_size, embedding_dim, num_class,
    dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=128, activation: str = "relu"):
        # 构造函数
        # vocab_size:词表大小
        # embedding_dim：词向量维度
        # hidden_dim：隐藏层维度
        # num_class:多分类个数
        # dim_feedforward：前馈网络模型的维度
        # num_head:头数
        # num_layers：注意力层数
        # dropout:辍学比例
        # max_len:序列最大长度
        # activation:激活函数
        super(Transformer, self).__init__()

        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)
        # 位置编码层
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, num_head, dim_feedforward, dropout, activation)
        # 一个encoder
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # 注意力编码层
        self.output = nn.Linear(embedding_dim, num_class)
        # 输出层，线性变换

    def forward(self, inputs, lengths):
        # 前向计算函数
        # inputs:输入
        # lengths:打包的序列长度
        # print(f"输入为：{inputs.size()}")
        inputs = torch.transpose(inputs, 0, 1)
        # 首先需要将输入第一维与第二维互换，适应transformer的输入
        embeds = self.embedding(inputs)
        # 注意这儿是词向量层，不是词袋词向量层
        # print(f"词向量层输出为：{embeds.size()}")
        embeds = self.position_embedding(embeds)
        # 加入位置编码
        # print(f"位置编码层输出为：{embeds.size()}")
        attention_mask = length_to_mask(lengths) == False
        # 生成mask掩码
        # print(f"生成mask为：{attention_mask.size()}")
        hidden_states = self.transformer(embeds, src_key_padding_mask = attention_mask)
        # 用来遮蔽<PAD>以避免pad token的embedding输入
        # print(f"经过transformer计算后为：{hidden_states.size()}")
        hidden_states = hidden_states[0, :, :]
        # 取第一个标记的输出结果作为分类层的输入
        outputs = self.output(hidden_states)
        # print(f"输出层输出为：{outputs.size()}")
        log_probs = F.log_softmax(outputs, dim = -1)
        # print(f"输出概率值为：{probs}")
        # 归一化为概率值
        return log_probs


In [97]:
# 训练
# 超参数设置
embedding_dim = 128
batch_size = 16
num_epoch = 10
num_class = 2

train_data, test_data, vocab = load_sentence_polarity()
# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(len(vocab), embedding_dim, num_class)
model.to(device)
# 加载模型

Transformer(
  (embedding): Embedding(21402, 128)
  (position_embedding): PositionalEncoding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear

In [98]:
nll_loss = nn.NLLLoss()
# 负对数似然损失
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        # print(inputs.size())
        # print("inputs", inputs.is_cuda)
        # print("lengths", lengths.is_cuda)
        log_probs = model(inputs, lengths)
        loss = nll_loss(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

    # 测试
    acc = 0
    for batch in tqdm(test_data_loader, desc=f"Testing"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        with torch.no_grad():
            output = model(inputs, lengths)
            acc += (output.argmax(dim=1) == targets).sum().item()
    print(f"ACC:{acc / len(test_data_loader):.2f}")

Training Epoch 0: 100%|██████████| 500/500 [00:07<00:00, 69.86it/s]


Loss:348.95


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 361.52it/s]


ACC:0.53


Training Epoch 1: 100%|██████████| 500/500 [00:06<00:00, 77.91it/s]


Loss:328.55


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 352.41it/s]


ACC:0.58


Training Epoch 2: 100%|██████████| 500/500 [00:06<00:00, 74.75it/s]


Loss:315.31


Testing: 100%|██████████| 2662/2662 [00:08<00:00, 322.31it/s]


ACC:0.59


Training Epoch 3: 100%|██████████| 500/500 [00:06<00:00, 74.14it/s]


Loss:298.82


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 334.62it/s]


ACC:0.61


Training Epoch 4: 100%|██████████| 500/500 [00:06<00:00, 72.45it/s]


Loss:284.84


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 339.52it/s]


ACC:0.62


Training Epoch 5: 100%|██████████| 500/500 [00:06<00:00, 72.42it/s]


Loss:271.36


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 337.96it/s]


ACC:0.63


Training Epoch 6: 100%|██████████| 500/500 [00:06<00:00, 72.30it/s]


Loss:253.22


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 337.04it/s]


ACC:0.63


Training Epoch 7: 100%|██████████| 500/500 [00:06<00:00, 73.58it/s]


Loss:240.02


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 333.35it/s]


ACC:0.63


Training Epoch 8: 100%|██████████| 500/500 [00:06<00:00, 73.68it/s]


Loss:223.48


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 340.57it/s]


ACC:0.62


Training Epoch 9: 100%|██████████| 500/500 [00:06<00:00, 74.79it/s]


Loss:269.09


Testing: 100%|██████████| 2662/2662 [00:07<00:00, 339.74it/s]

ACC:0.59





8. 基于循环神经网络的词性标注实现

In [14]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [15]:
# 使用宾州树库词性标注数据库
from nltk.corpus import treebank
from torch.nn.utils.rnn import pad_packed_sequence

In [16]:
sents, postags = zip(*(zip(*sent) for sent in treebank.tagged_sents()))

In [17]:
def load_treebank():
    # 定义数据加载函数
    sents, postags = zip(*(zip(*sent) for sent in treebank.tagged_sents()))
    # sents存储全部经过标记化的句子
    # postags存储每个标记对应的词性标注结果
    vocab= Vocab.build(sents, reserved_tokens=["<pad>"])
    # 使用pad标记来补齐序列长度
    tag_vocab = Vocab.build(postags)
    # 将词性标注标签也映射为索引值
    train_data = [(vocab.convert_tokens_to_ids(sentence), tag_vocab.convert_tokens_to_ids(tags))\
         for sentence, tags in zip(sents[:3000], postags[:3000])]
    # 取前3000句作为训练数据，将token映射为对应的索引值
    test_data = [(vocab.convert_tokens_to_ids(sentence), tag_vocab.convert_tokens_to_ids(tags))\
         for sentence, tags in zip(sents[3000:], postags[3000:])]
    # 其余的作为测试数据

    return train_data, test_data, vocab, tag_vocab
    

In [56]:
# 修改collate_fn函数
def collate_fn(examples):
    # 从独立样本集合中构建各批次的输入输出
    lengths = torch.tensor([len(ex[0]) for ex in examples])
    # 获取每个序列的长度
    inputs = [torch.tensor(ex[0]) for ex in examples]
    # 将输入inputs定义为一个张量的列表，每一个张量为句子对应的索引值序列
    targets = [torch.tensor(ex[1]) for ex in examples]
    # 目标targets为该批次所有样例输出结果构成的张量，同文本分类任务不同
    inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab["<pad>"])
    targets = pad_sequence(targets, batch_first=True, padding_value=vocab["<pad>"])
    # 将用pad_sequence对批次类的样本进行补，标签也需要补齐
    return inputs, lengths, targets, inputs != vocab["<pad>"]

In [59]:
# 创建一个LSTM类
class LSTM(nn.Module):
    # 基类为nn.Module
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        # 构造函数
        # vocab_size:词表大小
        # embedding_dim：词向量维度
        # hidden_dim：隐藏层维度
        # num_class:多分类个数
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)
        # lstm层
        self.output = nn.Linear(hidden_dim, num_class)
        # 输出层，线性变换

    def forward(self, inputs, lengths):
        # 前向计算函数
        # inputs:输入
        # lengths:打包的序列长度
        # print(f"输入为：{inputs.size()}")
        embeds = self.embedding(inputs)
        # 注意这儿是词向量层，不是词袋词向量层
        # print(f"词向量层输出为：{embeds.size()}")
        x_pack = pack_padded_sequence(embeds, lengths.to('cpu'), batch_first=True, enforce_sorted=False)
        # LSTM需要定长序列，使用该函数将变长序列打包
        # print(f"经过打包为：{x_pack.size()}")
        hidden, (hn, cn) = self.lstm(x_pack)
        # print(f"经过lstm计算后为：{hn.size()}")
        hidden, _ = pad_packed_sequence(hidden, batch_first = True)
        # 词性标注需要再进行解包，还原成经过补齐的序列
        # print(f"解包之后输出为：{hidden.size()}")
        outputs = self.output(hidden)
        # 在词性标注中需要使用全部的隐藏层状态
        # print(f"输出层输出为：{outputs.size()}")
        log_probs = F.log_softmax(outputs, dim = -1)
        # print(f"输出概率值为：{log_probs}")
        # 归一化为概率值
        return log_probs


In [63]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# 训练
# 超参数设置
embedding_dim = 128
batch_size = 16
num_epoch = 10
hidden_dim = 64

train_data, test_data, vocab, tag_vocab = load_treebank()
# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)

num_class = len(tag_vocab)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTM(len(vocab), embedding_dim, hidden_dim, num_class)
device = 'cpu'
model.to(device)
# 加载模型

LSTM(
  (embedding): Embedding(12410, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (output): Linear(in_features=64, out_features=47, bias=True)
)

In [64]:
nll_loss = nn.NLLLoss()
# 负对数似然损失
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        # print(inputs.sizbe())
        # print("inputs", inputs.is_cuda)
        # print("lengths", lengths.is_cuda)
        log_probs = model(inputs, lengths)
        loss = nll_loss(log_probs[mask], targets[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

    # 测试
    acc = 0
    for batch in tqdm(test_data_loader, desc=f"Testing"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        with torch.no_grad():
            output = model(inputs, lengths)
            acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
    print(f"ACC:{acc / len(test_data_loader):.2f}")

Training Epoch 0: 100%|██████████| 188/188 [00:08<00:00, 21.61it/s]


Loss:429.64


Testing: 100%|██████████| 914/914 [00:01<00:00, 511.98it/s]


ACC:16.44


Training Epoch 1: 100%|██████████| 188/188 [00:08<00:00, 21.17it/s]


Loss:212.27


Testing: 100%|██████████| 914/914 [00:01<00:00, 476.32it/s]


ACC:18.99


Training Epoch 2: 100%|██████████| 188/188 [00:08<00:00, 21.55it/s]


Loss:145.63


Testing: 100%|██████████| 914/914 [00:01<00:00, 494.04it/s]


ACC:20.32


Training Epoch 3: 100%|██████████| 188/188 [00:08<00:00, 21.09it/s]


Loss:107.94


Testing: 100%|██████████| 914/914 [00:02<00:00, 447.89it/s]


ACC:21.06


Training Epoch 4: 100%|██████████| 188/188 [00:09<00:00, 20.65it/s]


Loss:83.16


Testing: 100%|██████████| 914/914 [00:01<00:00, 495.91it/s]


ACC:21.56


Training Epoch 5: 100%|██████████| 188/188 [00:08<00:00, 21.43it/s]


Loss:65.19


Testing: 100%|██████████| 914/914 [00:01<00:00, 517.18it/s]


ACC:21.83


Training Epoch 6: 100%|██████████| 188/188 [00:08<00:00, 21.90it/s]


Loss:51.78


Testing: 100%|██████████| 914/914 [00:01<00:00, 506.04it/s]


ACC:22.04


Training Epoch 7: 100%|██████████| 188/188 [00:08<00:00, 21.94it/s]


Loss:41.47


Testing: 100%|██████████| 914/914 [00:01<00:00, 509.99it/s]


ACC:22.18


Training Epoch 8: 100%|██████████| 188/188 [00:08<00:00, 21.99it/s]


Loss:33.47


Testing: 100%|██████████| 914/914 [00:01<00:00, 507.16it/s]


ACC:22.28


Training Epoch 9: 100%|██████████| 188/188 [00:08<00:00, 22.08it/s]


Loss:27.21


Testing: 100%|██████████| 914/914 [00:01<00:00, 506.88it/s]

ACC:22.34





9. 基于Transformer网络的词性标注实现

In [66]:
# 创建一个Transformer类
# 此处书中代码有误，不需要hidden_dim，注意力层输入维度应该直接是词向量维度
class Transformer(nn.Module):
    # 基类为nn.Module
    def __init__(self, vocab_size, embedding_dim, num_class,
    dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=128, activation: str = "relu"):
        # 构造函数
        # vocab_size:词表大小
        # embedding_dim：词向量维度
        # hidden_dim：隐藏层维度
        # num_class:多分类个数
        # dim_feedforward：前馈网络模型的维度
        # num_head:头数
        # num_layers：注意力层数
        # dropout:辍学比例
        # max_len:序列最大长度
        # activation:激活函数
        super(Transformer, self).__init__()

        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)
        # 位置编码层
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, num_head, dim_feedforward, dropout, activation)
        # 一个encoder
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # 注意力编码层
        self.output = nn.Linear(embedding_dim, num_class)
        # 输出层，线性变换

    def forward(self, inputs, lengths):
        # 前向计算函数
        # inputs:输入
        # lengths:打包的序列长度
        # print(f"输入为：{inputs.size()}")
        inputs = torch.transpose(inputs, 0, 1)
        # 首先需要将输入第一维与第二维互换，适应transformer的输入
        embeds = self.embedding(inputs)
        # 注意这儿是词向量层，不是词袋词向量层
        # print(f"词向量层输出为：{embeds.size()}")
        embeds = self.position_embedding(embeds)
        # 加入位置编码
        # print(f"位置编码层输出为：{embeds.size()}")
        attention_mask = length_to_mask(lengths) == False
        # 生成mask掩码
        # print(f"生成mask为：{attention_mask.size()}")
        hidden_states = self.transformer(embeds, src_key_padding_mask = attention_mask).transpose(0, 1)
        # 用来遮蔽<PAD>以避免pad token的embedding输入
        # print(f"经过transformer计算后为：{hidden_states.size()}")
        # hidden_states = hidden_states[0, :, :]
        # 取序列中每个输入的隐藏层作为分类层的输入
        outputs = self.output(hidden_states)
        # print(f"输出层输出为：{outputs.size()}")
        log_probs = F.log_softmax(outputs, dim = -1)
        # print(f"输出概率值为：{probs}")
        # 归一化为概率值
        return log_probs


In [81]:
# 训练
# 超参数设置
embedding_dim = 128
batch_size = 16
num_epoch = 10

train_data, test_data, vocab, tag_vocab = load_treebank()
# 加载数据
# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False)
num_class = len(tag_vocab)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(len(vocab), embedding_dim, num_class)
model.to(device)
# 加载模型

Transformer(
  (embedding): Embedding(12410, 128)
  (position_embedding): PositionalEncoding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear

In [82]:
nll_loss = nn.NLLLoss()
# 负对数似然损失
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Adam优化器

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        # print(inputs.sizbe())
        # print("inputs", inputs.is_cuda)
        # print("lengths", lengths.is_cuda)
        log_probs = model(inputs, lengths)
        loss = nll_loss(log_probs[mask], targets[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

    # 测试
    acc = 0
    for batch in tqdm(test_data_loader, desc=f"Testing"):
        inputs, lengths, targets, mask = [x.to(device) for x in batch]
        with torch.no_grad():
            output = model(inputs, lengths)
            acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
    print(f"ACC:{acc / len(test_data_loader):.2f}")

Training Epoch 0: 100%|██████████| 188/188 [00:05<00:00, 35.27it/s]


Loss:269.89


Testing: 100%|██████████| 914/914 [00:04<00:00, 185.11it/s]


ACC:18.38


Training Epoch 1: 100%|██████████| 188/188 [00:05<00:00, 33.62it/s]


Loss:143.50


Testing: 100%|██████████| 914/914 [00:05<00:00, 182.48it/s]


ACC:20.28


Training Epoch 2: 100%|██████████| 188/188 [00:05<00:00, 34.68it/s]


Loss:96.47


Testing: 100%|██████████| 914/914 [00:04<00:00, 185.56it/s]


ACC:21.13


Training Epoch 3: 100%|██████████| 188/188 [00:05<00:00, 34.42it/s]


Loss:68.14


Testing: 100%|██████████| 914/914 [00:04<00:00, 188.42it/s]


ACC:21.40


Training Epoch 4: 100%|██████████| 188/188 [00:05<00:00, 36.05it/s]


Loss:50.45


Testing: 100%|██████████| 914/914 [00:04<00:00, 191.58it/s]


ACC:21.82


Training Epoch 5: 100%|██████████| 188/188 [00:05<00:00, 33.38it/s]


Loss:37.56


Testing: 100%|██████████| 914/914 [00:04<00:00, 187.92it/s]


ACC:21.76


Training Epoch 6: 100%|██████████| 188/188 [00:05<00:00, 35.09it/s]


Loss:28.68


Testing: 100%|██████████| 914/914 [00:05<00:00, 171.35it/s]


ACC:21.94


Training Epoch 7: 100%|██████████| 188/188 [00:05<00:00, 34.34it/s]


Loss:23.09


Testing: 100%|██████████| 914/914 [00:05<00:00, 180.46it/s]


ACC:21.95


Training Epoch 8: 100%|██████████| 188/188 [00:05<00:00, 32.18it/s]


Loss:19.21


Testing: 100%|██████████| 914/914 [00:05<00:00, 176.66it/s]


ACC:22.01


Training Epoch 9: 100%|██████████| 188/188 [00:05<00:00, 32.86it/s]


Loss:16.77


Testing: 100%|██████████| 914/914 [00:05<00:00, 179.07it/s]

ACC:21.95



