<a href="https://colab.research.google.com/github/JimmyXiaodong/pytorch-study/blob/master/model_lstm_pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


def prepare_sequence(seq, to_ix):
    """Convert a sequence of tokens into a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: Iterable of tokens (words, characters or tags).
        to_ix: Mapping from token to integer index.

    Returns:
        A tensor holding ``to_ix[token]`` for every token, in input order.

    Raises:
        KeyError: If a token is missing from a plain-``dict`` ``to_ix``.
    """
    return torch.tensor([to_ix[token] for token in seq], dtype=torch.long)


# Toy corpus: each entry pairs a tokenized sentence with its per-token tags.
training_data = [
    ("The dog ate the apple".split(), "DET NN V DET NN".split()),
    ("Everybody read that book".split(), "NN V DET NN".split()),
]

# word -> word index, character -> character index, and
# word index -> list of that word's characters.
word_to_ix = {}
character_to_ix = {}
words_characters = {}

for sentence_words, _sentence_tags in training_data:
    for token in sentence_words:
        if token in word_to_ix:
            continue  # every distinct word is indexed exactly once

        token_index = len(word_to_ix)
        word_to_ix[token] = token_index
        words_characters[token_index] = list(token)

        # Characters are numbered in order of first appearance.
        for ch in token:
            character_to_ix.setdefault(ch, len(character_to_ix))

print(character_to_ix)
tag_to_ix = {tag: index for index, tag in enumerate(("DET", "NN", "V"))}

{'T': 0, 'h': 1, 'e': 2, 'd': 3, 'o': 4, 'g': 5, 'a': 6, 't': 7, 'p': 8, 'l': 9, 'E': 10, 'v': 11, 'r': 12, 'y': 13, 'b': 14, 'k': 15}


In [3]:
# In practice larger dimensions such as 32 or 64 are typically used.
# Small dimensions are used here so that the weight changes during training
# are easy to inspect.
# Size of each word-embedding vector.
WORD_EMBEDDING_DIM = 6
# Size of each character-embedding vector.
CHARACTER_EMBEDDING_DIM = 3
# Hidden-state size of the word-level (tagging) LSTM.
HIDDEN_DIM = 6
# Hidden-state size of the character-level LSTM.
CHARACTER_HIDDEN_DIM = 3


class LSTMTagger(nn.Module):
    """LSTM tagger that augments word embeddings with character-level features.

    For every word, a character-level LSTM is run over the word's characters
    and its final hidden state is used as the word's character feature. That
    feature is concatenated with the word embedding and the resulting sequence
    is fed to a second LSTM, whose outputs are projected onto the tag space.

    Every LSTM run starts from a zero state: the character LSTM once per word
    and the tagging LSTM once per ``forward`` call. A word's character feature
    therefore depends only on its own characters, and repeated calls with the
    same input give the same result.

    ``hidden_tag`` and ``hidden_character`` are kept as attributes for
    backward compatibility. ``forward`` overwrites them with the final states
    of the most recent run but never reads them.
    """

    def __init__(self, embedding_dim, character_embedding_dim, hidden_dim, character_hidden_dim, vocab_size,
                 character_size, tagset_size):
        """Create the embedding tables, both LSTMs and the output projection.

        Args:
            embedding_dim: Size of a word-embedding vector.
            character_embedding_dim: Size of a character-embedding vector.
            hidden_dim: Hidden-state size of the tagging LSTM.
            character_hidden_dim: Hidden-state size of the character LSTM.
            vocab_size: Number of rows in the word-embedding table.
            character_size: Number of rows in the character-embedding table.
            tagset_size: Number of output tags.
        """
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.character_hidden_dim = character_hidden_dim

        # Word embeddings.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Character embeddings.
        self.character_embeddings = nn.Embedding(character_size, character_embedding_dim)

        # Consumes the character embeddings of one word; its final hidden state
        # (size character_hidden_dim) is that word's character-level feature.
        self.lstm_character = nn.LSTM(character_embedding_dim, character_hidden_dim)

        # Consumes, per word, the word embedding concatenated with the
        # character-level feature; emits hidden states of size hidden_dim.
        self.tag_lstm = nn.LSTM(embedding_dim + character_hidden_dim, hidden_dim)

        # Linear layer mapping the hidden-state space onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # Retained for callers that read or assign these attributes.
        self.hidden_tag = self.init_hidden(hidden_dim)
        self.hidden_character = self.init_hidden(character_hidden_dim)

    def init_hidden(self, hidden_dim):
        """Return a zero ``(h_0, c_0)`` pair for a single-layer LSTM, batch size 1.

        The axes are (num_layers, minibatch_size, hidden_dim).
        """
        return (torch.zeros(1, 1, hidden_dim),
                torch.zeros(1, 1, hidden_dim))

    def forward(self, sentence, words_characters, char_to_ix=None):
        """Compute per-word log-probabilities over the tag set.

        Args:
            sentence: 1-D tensor of word indices.
            words_characters: Mapping from word index to that word's list of
                characters.
            char_to_ix: Optional mapping from character to character index.
                When omitted, the module-level ``character_to_ix`` is used.

        Returns:
            Tensor of shape ``(len(sentence), tagset_size)`` holding the
            log-softmax tag scores of each word.
        """
        if char_to_ix is None:
            # Backward-compatible fallback to the notebook-level vocabulary.
            char_to_ix = character_to_ix

        word_features = []
        for word_index in sentence:
            # Word embedding; a 0-dim index yields a 1-D vector.
            word_embed = self.word_embeddings(word_index)

            # Character-level feature of this word.
            characters = words_characters[word_index.item()]
            character_ids = torch.tensor([char_to_ix[ch] for ch in characters],
                                         dtype=torch.long, device=sentence.device)
            character_embeds = self.character_embeddings(character_ids)
            # No initial state is passed, so the LSTM starts from zeros and
            # the feature does not depend on previously processed words.
            _, self.hidden_character = self.lstm_character(
                character_embeds.view(len(characters), 1, -1))

            # Concatenate the word vector with the character-level feature.
            word_features.append(
                torch.cat((word_embed, self.hidden_character[0].view(-1))))

        # Stack the per-word vectors into (seq_len, batch=1, feature) for tag_lstm.
        embeds = torch.stack(word_features).view(len(sentence), 1, -1)
        # Likewise starts from a zero state on every call.
        lstm_out, self.hidden_tag = self.tag_lstm(embeds)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


# Build the tagger from the dimension constants and the sizes of the word,
# character and tag vocabularies.
model = LSTMTagger(
    embedding_dim=WORD_EMBEDDING_DIM,
    character_embedding_dim=CHARACTER_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    character_hidden_dim=CHARACTER_HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    character_size=len(character_to_ix),
    tagset_size=len(tag_to_ix),
)
# Negative log-likelihood loss, applied to the model's log-softmax scores.
loss_function = nn.NLLLoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.1)

In [4]:
def reset_hidden_states(tagger):
    """Replace both LSTM states stored on ``tagger`` with fresh zero tensors."""
    tagger.hidden_tag = tagger.init_hidden(tagger.hidden_dim)
    tagger.hidden_character = tagger.init_hidden(tagger.character_hidden_dim)


# Inspect the scores before training.
# Note: element (i, j) of the output is the score of tag j for word i.
# Neither training nor gradients are needed here, so run under torch.no_grad().
with torch.no_grad():
    # Start from a zero state so that re-running this cell does not pick up
    # whatever state an earlier run left on the model.
    reset_hidden_states(model)
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs, words_characters)
    print("训练前的分数：\n")
    print(tag_scores)

# 300 epochs is an arbitrary value for this toy example; real training would
# not normally use this many.
for epoch in range(300):
    for sentence, tags in training_data:
        # Step 1: PyTorch accumulates gradients, so clear them before each
        # training instance.
        model.zero_grad()

        # Also clear the LSTM hidden states stored on the model, detaching
        # them from the history of the previous instance.
        reset_hidden_states(model)

        # Step 2: prepare the network inputs as tensors of word / tag indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3: forward pass.
        tag_scores = model(sentence_in, words_characters)

        # Step 4: compute the loss and its gradients, then update the
        # parameters by calling optimizer.step().
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Inspect the scores after training.
with torch.no_grad():
    # Without this reset the evaluation would start from the state left
    # behind by the last training sentence.
    reset_hidden_states(model)
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs, words_characters)

    # The sentence is "the dog ate the apple"; element (i, j) is the score of
    # tag j for word i, and the predicted tag of a word is the one with the
    # highest score in its row. For a correctly trained model the row-wise
    # argmax is 0 1 2 0 1 (indices start at 0), i.e. DET NN V DET NN.
    print("训练后的分数：\n")
    print(tag_scores)

训练前的分数：

tensor([[-1.0777, -0.8799, -1.4074],
        [-1.1135, -0.8977, -1.3315],
        [-1.1135, -0.9231, -1.2935],
        [-1.4582, -0.8372, -1.0953],
        [-1.2561, -0.8412, -1.2586]])
训练后的分数：

tensor([[-0.0962, -2.4535, -5.1684],
        [-5.9003, -0.0162, -4.3165],
        [-4.7111, -3.3033, -0.0468],
        [-0.0109, -5.0973, -5.3566],
        [-4.1152, -0.0198, -5.7050]])
