In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence, pad_sequence)
from datetime import datetime, timedelta

In [14]:
class config:
    """Hyper-parameters and file locations shared by the notebook cells.

    Every setting is a plain class attribute, so it can be read either from
    the class itself or from an instance.
    """

    # --- data files -------------------------------------------------------
    ftrain = 'data/ctb5/train.conll'   # training split
    fdev = 'data/ctb5/dev.conll'       # development split
    ftest = 'data/ctb5/test.conll'     # test split
    fembed = 'data/sgns.renmin.char'   # pre-trained embedding file

    # --- model ------------------------------------------------------------
    n_context = 1    # context window size handed to Corpus.load
    n_embed = 300    # embedding width handed to LSTM_CRF
    n_hidden = 150   # LSTM hidden size (per direction)
    drop = 0.5       # dropout probability

    # --- training ---------------------------------------------------------
    batch_size = 50
    epochs = 100     # upper bound on the number of training epochs
    interval = 10    # epochs without a dev improvement before stopping
    eta = 0.001      # learning rate given to the optimizer
    file = 'network.pt'  # checkpoint path used by fit() / torch.load
    use_char = False     # whether Corpus.load also returns char tensors

数据预处理步骤：
1. 获取词典：（训练集句子中的词去重）∪（Embedding预训练词）+ 'PAD' + 'UNK' + 'SOS' + 'EOS'
2. 获取目标词典：有多少种词性就有多少个目标
3. 重构词嵌入矩阵：词嵌入矩阵 = 原词嵌入矩阵 + 对出现在训练集中但不在Embedding预训练词集中的词按正态分布初始化得到的向量
4. 构建训练数据：按词典索引将句子转换为训练集

In [3]:
def init_embedding(tensor):
    """Fill ``tensor`` in place with zero-mean Gaussian noise.

    The standard deviation is ``sqrt(1 / tensor.size(1))``, i.e. it shrinks
    with the width of the second dimension. Nothing is returned.
    """
    n_cols = tensor.size(1)
    nn.init.normal_(tensor, mean=0, std=(1. / n_cols) ** 0.5)


class Corpus(object):
    """Vocabulary, tag set and tensor builder for a CoNLL-style tagged corpus.

    The corpus file is read as whitespace-separated columns, one token per
    line, with blank lines between sentences. Column 1 is taken as the word
    and column 3 as its tag (0-based).

    Attributes set by the constructor:
        sents:  list of ``(wordseq, tagseq)`` tuples read from ``fdata``.
        words / chars / tags: sorted vocabularies (special symbols first).
        wdict / cdict / tdict: symbol -> index maps for the lists above.
        pad_wi, unk_wi, sos_wi, eos_wi: indices of the special words.
        pad_ci, unk_ci: indices of the special characters.
        n_sents, n_words, n_tags, n_chars: sizes of the above.
        embed:  pre-trained embedding matrix, or ``None`` without ``fembed``.
    """

    PAD = '<PAD>'
    UNK = '<UNK>'
    SOS = '<SOS>'
    EOS = '<EOS>'

    def __init__(self, fdata, fembed=None):
        """Build the vocabularies from ``fdata``; optionally load ``fembed``.

        Args:
            fdata: path of the training corpus.
            fembed: optional path of a pre-trained embedding text file; when
                given, its words are merged into the vocabulary.
        """
        # Sentences of the corpus.
        self.sents = self.preprocess(fdata)
        # Distinct words, tags and characters of the corpus.
        self.words, self.tags, self.chars = self.parse(self.sents)
        # Special symbols go first so that PAD gets index 0.
        self.words = [self.PAD, self.UNK, self.SOS, self.EOS] + self.words
        self.chars = [self.PAD, self.UNK] + self.chars

        # Tag dictionary (never changes after construction).
        self.tdict = {t: i for i, t in enumerate(self.tags)}
        # Word / char dictionaries, special indices and vocabulary sizes.
        self._reindex()

        self.n_sents = len(self.sents)
        self.n_tags = len(self.tags)

        # Pre-trained embeddings (this may extend the vocabulary).
        self.embed = self.get_embed(fembed) if fembed is not None else None

    def _reindex(self):
        """Rebuild the word/char dictionaries and everything derived from them."""
        self.wdict = {w: i for i, w in enumerate(self.words)}
        self.cdict = {c: i for i, c in enumerate(self.chars)}

        self.pad_wi = self.wdict[self.PAD]
        self.unk_wi = self.wdict[self.UNK]
        self.sos_wi = self.wdict[self.SOS]
        # The original code assigned the EOS index to ``sos_wi`` here, which
        # overwrote the SOS index and left ``eos_wi`` undefined.
        self.eos_wi = self.wdict[self.EOS]
        self.pad_ci = self.cdict[self.PAD]
        self.unk_ci = self.cdict[self.UNK]

        self.n_words = len(self.words)
        self.n_chars = len(self.chars)

    def extend(self, words):
        """Merge ``words`` (and their characters) into the vocabulary.

        The vocabularies are re-sorted with PAD kept at index 0, so word and
        character indices obtained before this call are no longer valid.
        """
        unk_words = [w for w in words if w not in self.wdict]
        unk_chars = [c for c in ''.join(unk_words) if c not in self.cdict]
        # PAD is removed before sorting and put back in front.
        self.words = sorted(set(self.words + unk_words) - {self.PAD})
        self.chars = sorted(set(self.chars + unk_chars) - {self.PAD})
        self.words = [self.PAD] + self.words
        self.chars = [self.PAD] + self.chars
        self._reindex()

    def load(self, fdata, use_char=False, n_context=1, max_len=10):
        """Convert the corpus file ``fdata`` into a ``TensorDataset``.

        Args:
            fdata: path of a corpus file in the same format as the training
                file.
            use_char: also return per-word character indices.
            n_context: when > 1, every word is replaced by a window of
                ``n_context`` word indices (see ``get_context``).
            max_len: number of characters kept per word; shorter words are
                padded with the PAD character index.

        Returns:
            ``TensorDataset(x, y, char_x, lens)`` when ``use_char`` is true,
            otherwise ``TensorDataset(x, y, lens)``. Sentences are padded
            with 0 up to the longest sentence of the file.

        Raises:
            KeyError: if a tag of ``fdata`` is not in the tag dictionary.
        """
        sentences = self.preprocess(fdata)
        x, y, char_x, lens = [], [], [], []

        for wordseq, tagseq in sentences:
            # Words missing from the dictionary map to the UNK index.
            wiseq = [self.wdict.get(w, self.unk_wi) for w in wordseq]
            tiseq = [self.tdict[t] for t in tagseq]
            if n_context > 1:
                x.append(self.get_context(wiseq, n_context))
            else:
                x.append(torch.tensor(wiseq, dtype=torch.long))
            y.append(torch.tensor(tiseq, dtype=torch.long))
            # Character tensors are only built when they are returned.
            if use_char:
                char_x.append(torch.tensor([
                    [self.cdict.get(c, self.unk_ci) for c in w[:max_len]]
                    + [self.pad_ci] * (max_len - len(w))
                    for w in wordseq
                ], dtype=torch.long))
            lens.append(len(tiseq))

        x = pad_sequence(x, True)
        y = pad_sequence(y, True)
        lens = torch.tensor(lens)

        if use_char:
            char_x = pad_sequence(char_x, True)
            return TensorDataset(x, y, char_x, lens)
        return TensorDataset(x, y, lens)

    def get_context(self, wiseq, n_context):
        """Return a ``[len(wiseq), n_context]`` tensor of word windows.

        The sequence is padded with ``n_context // 2`` SOS indices on the
        left and the same number of EOS indices on the right before the
        windows are cut.
        """
        half = n_context // 2
        length = len(wiseq)
        padded = [self.sos_wi] * half + list(wiseq) + [self.eos_wi] * half
        context = [padded[i:i + n_context] for i in range(length)]

        return torch.tensor(context, dtype=torch.long)

    def get_embed(self, fembed):
        """Load pre-trained vectors and return the full embedding matrix.

        The first line of ``fembed`` is skipped; every other non-blank line
        is read as ``word v1 v2 ...``. The vocabulary is extended with the
        pre-trained words, rows of words without a pre-trained vector are
        filled by ``init_embedding``.
        """
        with open(fembed, 'r', encoding='utf-8') as f:
            next(f, None)  # first line is skipped, as in the original code
            splits = [line.split() for line in f if line.strip()]
        # Words and vectors of the pre-trained file.
        words, embed = zip(*[
            (split[0], list(map(float, split[1:]))) for split in splits
        ])
        # Extend the vocabulary with the pre-trained words.
        self.extend(words)
        embed = torch.tensor(embed, dtype=torch.float)
        embed_indices = [self.wdict[w] for w in words]
        extended_embed = torch.empty(self.n_words, embed.size(1))
        init_embedding(extended_embed)
        extended_embed[embed_indices] = embed

        return extended_embed

    def __repr__(self):
        info = f"{self.__class__.__name__}(\n"
        info += f"{'':2}num of sentences: {self.n_sents}\n"
        info += f"{'':2}num of words: {self.n_words}\n"
        info += f"{'':2}num of tags: {self.n_tags}\n"
        info += f"{'':2}num of chars: {self.n_chars}\n"
        info += ")\n"

        return info

    @staticmethod
    def preprocess(fdata):
        """Read ``fdata`` and return a list of ``(wordseq, tagseq)`` tuples.

        Sentences are separated by one or more blank lines. Leading and
        repeated blank lines are ignored, and a final sentence that is not
        followed by a blank line is kept.

        Raises:
            ValueError: if a non-blank line has fewer than four columns.
        """
        sentences, rows = [], []

        def flush():
            if rows:
                wordseq, tagseq = zip(*rows)
                sentences.append((wordseq, tagseq))
                del rows[:]

        with open(fdata, 'r', encoding='utf-8') as f:
            for lineno, line in enumerate(f, 1):
                fields = line.split()
                if not fields:
                    flush()
                    continue
                if len(fields) < 4:
                    raise ValueError(
                        f"{fdata}:{lineno}: expected at least 4 columns, "
                        f"got {len(fields)}"
                    )
                # Column 1 is the word, column 3 the tag.
                rows.append((fields[1], fields[3]))
        flush()

        return sentences

    @staticmethod
    def parse(sentences):
        """Return the sorted distinct words, tags and characters."""
        if not sentences:
            return [], [], []
        wordseqs, tagseqs = zip(*sentences)
        words = sorted(set(w for wordseq in wordseqs for w in wordseq))
        tags = sorted(set(t for tagseq in tagseqs for t in tagseq))
        chars = sorted(set(''.join(words)))

        return words, tags, chars


In [11]:
# -*- coding: utf-8 -*-
class LSTM_CRF(nn.Module):
    """Sequence tagger: embedding -> dropout -> BiLSTM -> dropout -> linear -> CRF.

    Word index 0 is treated as padding: ``update`` and ``evaluate`` build
    their masks with ``x.gt(0)``.
    """

    def __init__(self, n_vocab, n_embed, n_hidden, n_out, drop=0.5):
        """
        Args:
            n_vocab: number of rows of the embedding table.
            n_embed: embedding width (also the LSTM input size).
            n_hidden: LSTM hidden size per direction.
            n_out: number of tags.
            drop: dropout probability.
        """
        super(LSTM_CRF, self).__init__()

        self.embed = nn.Embedding(n_vocab, n_embed)
        # Bidirectional LSTM over the word embeddings.
        self.lstm = nn.LSTM(input_size=n_embed,
                            hidden_size=n_hidden,
                            batch_first=True,
                            bidirectional=True)

        # Output layer producing the emission scores.
        self.out = nn.Linear(n_hidden * 2, n_out)
        # CRF layer.
        self.crf = CRF(n_out)

        self.drop = nn.Dropout(drop)

    def load_pretrained(self, embed):
        """Replace the embedding table by ``embed`` (kept trainable).

        Raises:
            ValueError: if the width of ``embed`` differs from the LSTM
                input size, which would otherwise only fail in ``forward``.
        """
        if embed.size(-1) != self.lstm.input_size:
            raise ValueError(
                f"pretrained embedding width {embed.size(-1)} does not match "
                f"the LSTM input size {self.lstm.input_size}"
            )
        self.embed = nn.Embedding.from_pretrained(embed, False)

    def forward(self, x, lens):
        """Return the emission scores, shape ``[B, T, n_out]``.

        Args:
            x: ``[B, T]`` word indices, rows ordered by decreasing length
                (required by ``pack_padded_sequence``).
            lens: ``[B]`` sentence lengths.
        """
        B, T = x.shape
        # Word embeddings.
        x = self.embed(x)
        x = self.drop(x)

        x = pack_padded_sequence(x, lens, batch_first=True)
        x, _ = self.lstm(x)
        # total_length keeps the time dimension equal to T even when x is
        # wider than the longest sentence, so it always matches masks built
        # from the input.
        x, _ = pad_packed_sequence(x, batch_first=True, total_length=T)
        x = self.drop(x)

        return self.out(x)

    def fit(self, train_loader, dev_loader, test_loader,
            epochs, interval, eta, file):
        """Train with Adam, checkpointing on the best dev accuracy.

        Training stops after ``epochs`` epochs, or earlier once ``interval``
        epochs have passed without a new best dev accuracy. The whole module
        is saved to ``file`` with ``torch.save`` each time dev improves.
        """
        # Accumulated training time.
        total_time = timedelta()
        # Best dev accuracy and the epoch it was reached.
        max_e, max_acc = 0, 0.0
        # Defined up front so the summary below also works for epochs < 1.
        epoch = 0
        self.optimizer = optim.Adam(params=self.parameters(), lr=eta)

        for epoch in range(1, epochs + 1):
            start = datetime.now()
            # One pass over the training data.
            self.update(train_loader)

            print(f"Epoch: {epoch} / {epochs}:")
            loss, train_acc = self.evaluate(train_loader)
            print(f"{'train:':<6} Loss: {loss:.4f} Accuracy: {train_acc:.2%}")
            loss, dev_acc = self.evaluate(dev_loader)
            print(f"{'dev:':<6} Loss: {loss:.4f} Accuracy: {dev_acc:.2%}")
            loss, test_acc = self.evaluate(test_loader)
            print(f"{'test:':<6} Loss: {loss:.4f} Accuracy: {test_acc:.2%}")
            t = datetime.now() - start
            print(f"{t}s elapsed\n")
            total_time += t

            # Keep the model with the best dev accuracy.
            if dev_acc > max_acc:
                torch.save(self, file)
                max_e, max_acc = epoch, dev_acc
            elif epoch - max_e >= interval:
                break
        print(f"max accuracy of dev is {max_acc:.2%} at epoch {max_e}")
        print(f"mean time of each epoch is {total_time / max(epoch, 1)}s\n")

    def update(self, loader):
        """Run one training epoch over ``loader`` using ``self.optimizer``."""
        self.train()

        for x, y, lens in loader:
            # x and y are [B, T]: B sentences of (padded) length T.
            self.optimizer.zero_grad()
            # Padding positions hold word index 0.
            mask = x.gt(0)  # [B, T]

            out = self(x, lens)  # [B, T, N]
            # The CRF expects time-major tensors.
            out = out.transpose(0, 1)  # [T, B, N]
            y, mask = y.t(), mask.t()  # [T, B]
            # `out` holds the emission scores.
            loss = self.crf(out, y, mask)
            loss.backward()
            self.optimizer.step()

    @torch.no_grad()
    def evaluate(self, loader):
        """Return ``(mean batch loss, token accuracy)`` over ``loader``.

        The loss is returned as a Python float.
        """
        self.eval()

        loss, tp, total = 0.0, 0, 0
        for x, y, lens in loader:
            mask = x.gt(0)
            # Gold tags of the real tokens, sentence after sentence.
            target = y[mask]

            out = self.forward(x, lens)
            out = out.transpose(0, 1)  # [T, B, N]
            y, mask = y.t(), mask.t()  # [T, B]
            predict = self.crf.viterbi(out, mask)
            loss += self.crf(out, y, mask).item()
            tp += torch.sum(predict == target).item()
            total += lens.sum().item()
        loss /= len(loader)

        return loss, tp / total

    def collate_fn(self, data):
        """Batch ``(x, y, len)`` samples, longest sentence first.

        The padded columns beyond the longest sentence of the batch are cut
        off.
        """
        batch = sorted(data, key=lambda sample: int(sample[-1]), reverse=True)
        x, y, lens = zip(*batch)
        max_len = int(lens[0])
        x = torch.stack(x)[:, :max_len]
        y = torch.stack(y)[:, :max_len]
        lens = torch.tensor([int(n) for n in lens])

        return x, y, lens
                

class CRF(nn.Module):
    """Linear-chain CRF layer working on time-major tensors.

    Shapes used below: ``T`` time steps, ``B`` sequences, ``N`` tags.
    ``emit`` is ``[T, B, N]``; ``target`` and ``mask`` are ``[T, B]``. The
    mask is expected to be true for the first ``len`` steps of every
    sequence and false afterwards.
    """

    def __init__(self, n_tags):
        super(CRF, self).__init__()

        # Number of distinct tags.
        self.n_tags = n_tags
        # Transition scores, indexed [from, to].
        self.trans = nn.Parameter(torch.Tensor(n_tags, n_tags))
        # Scores for starting a sequence with each tag.
        self.strans = nn.Parameter(torch.Tensor(n_tags))
        # Scores for ending a sequence with each tag.
        self.etrans = nn.Parameter(torch.Tensor(n_tags))

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise all scores from N(0, 1 / n_tags)."""
        std = (1 / self.n_tags) ** 0.5
        nn.init.normal_(self.trans, mean=0, std=std)
        nn.init.normal_(self.strans, mean=0, std=std)
        nn.init.normal_(self.etrans, mean=0, std=std)

    def forward(self, emit, target, mask):
        """Return the negative log-likelihood of ``target``, averaged over B."""
        B = emit.size(1)

        logZ = self.get_logZ(emit, mask)
        score = self.get_score(emit, target, mask)

        return (logZ - score) / B

    def get_logZ(self, emit, mask):
        """Return the summed log-partition of all sequences (forward algorithm)."""
        T = emit.size(0)
        # Scores of every tag at the first step.
        alpha = self.strans + emit[0]  # [B, N]
        trans = self.trans.unsqueeze(0)  # [1, N, N], loop invariant

        for i in range(1, T):
            emit_i = emit[i].unsqueeze(1)  # [B, 1, N]
            mask_i = mask[i].unsqueeze(1).expand_as(alpha)  # [B, N]
            scores = trans + emit_i + alpha.unsqueeze(2)  # [B, N, N]
            scores = torch.logsumexp(scores, dim=1)  # [B, N]
            # Only sequences that are still running at step i are advanced.
            alpha[mask_i] = scores[mask_i]

        return torch.logsumexp(alpha + self.etrans, dim=1).sum()

    def get_score(self, emit, target, mask):
        """Return the summed score of the gold paths ``target``."""
        # Emission score of the gold tag at every step.  [T, B]
        emit_scores = emit.gather(dim=2, index=target.unsqueeze(2)).squeeze(2)
        # Transition score from step i-1 to step i, for i >= 1.  [T - 1, B]
        trans_scores = self.trans[target[:-1], target[1:]]
        # No scratch buffer is allocated, so the result follows the device
        # and dtype of the inputs.
        score = emit_scores.masked_select(mask).sum()
        score = score + trans_scores.masked_select(mask[1:]).sum()

        # Index of the last real step of every sequence.
        ends = mask.sum(dim=0).view(1, -1) - 1
        # Start transitions.
        score = score + self.strans[target[0]].sum()
        # End transitions.
        score = score + self.etrans[target.gather(dim=0, index=ends)].sum()

        return score

    def viterbi(self, emit, mask):
        """Return the best tag of every unmasked step as one 1-D tensor.

        The predictions of the sequences are concatenated in batch order, so
        the result lines up with ``target.t()[mask.t()]``. The result lives
        on the device of ``emit``.
        """
        T, B, N = emit.shape
        lens = mask.sum(dim=0).tolist()
        # Buffers follow the device/dtype of the emissions.
        delta = emit.new_zeros(T, B, N)
        paths = torch.zeros(T, B, N, dtype=torch.long, device=emit.device)

        delta[0] = self.strans + emit[0]  # [B, N]
        trans = self.trans.unsqueeze(0)  # [1, N, N], loop invariant

        for i in range(1, T):
            emit_i = emit[i].unsqueeze(1)  # [B, 1, N]
            scores = trans + emit_i + delta[i - 1].unsqueeze(2)  # [B, N, N]
            delta[i], paths[i] = torch.max(scores, dim=1)

        predicts = []
        for i, length in enumerate(lens):
            if length == 0:
                # Fully masked column: nothing to predict.
                continue
            prev = torch.argmax(delta[length - 1, i] + self.etrans)

            predict = [prev]
            for j in reversed(range(1, length)):
                prev = paths[j, i, prev]
                predict.append(prev)
            # The path was collected backwards.
            predicts.append(torch.stack(predict).flip(0))

        if not predicts:
            return torch.zeros(0, dtype=torch.long, device=emit.device)
        return torch.cat(predicts)


In [None]:
# Fix the thread count and the random seed before anything random happens.
print(f"Set the max num of threads to 4\n"
      f"Set the seed for generating random numbers to 1\n")
torch.set_num_threads(4)
torch.manual_seed(1)

# `config` starts out as the settings class and is rebound to an instance of
# it. The guard keeps this cell re-runnable: calling the instance a second
# time would raise a TypeError.
if isinstance(config, type):
    config = config()
# Build the vocabularies (and the pre-trained embedding matrix) from the
# training split.
corpus = Corpus(config.ftrain, config.fembed)
print(corpus)

print("Load the dataset")
# All three splits are indexed with the vocabulary of the training corpus.
trainset = corpus.load(config.ftrain, config.use_char, config.n_context)
devset = corpus.load(config.fdev, config.use_char, config.n_context)
testset = corpus.load(config.ftest, config.use_char, config.n_context)
print(f"size of trainset: {len(trainset)}\n"
      f"size of devset: {len(devset)}\n"
      f"size of testset: {len(testset)}\n")

In [13]:
print("Create Neural Network")

print(f"n_vocab: {corpus.n_words}\n"
      f"n_embed: {config.n_embed}\n"
      f"n_hidden: {config.n_hidden}\n"
      f"n_out: {corpus.n_tags}\n")
# Arguments follow the constructor order: vocabulary size, embedding width,
# hidden size, number of tags.
network = LSTM_CRF(corpus.n_words,
                   config.n_embed,
                   config.n_hidden,
                   corpus.n_tags,
                   drop=config.drop)

# Start from the pre-trained vectors collected by the corpus.
network.load_pretrained(corpus.embed)

# Only the training data is shuffled; every loader batches with the
# network's own collate function.
train_loader = DataLoader(trainset, config.batch_size, shuffle=True,
                          collate_fn=network.collate_fn)
dev_loader = DataLoader(devset, config.batch_size,
                        collate_fn=network.collate_fn)
test_loader = DataLoader(testset, config.batch_size,
                         collate_fn=network.collate_fn)

print(f"{network}\n")

Create Neural Network
  n_vocab: 54304
  n_embed: 100
  n_hidden: 150
  n_out: 32

LSTM_CRF(
  (embed): Embedding(54304, 100)
  (lstm): LSTM(100, 150, batch_first=True, bidirectional=True)
  (out): Linear(in_features=300, out_features=32, bias=True)
  (crf): CRF()
  (drop): Dropout(p=0.5)
)



In [16]:
print("Use Adam optimizer to train the network")
print(f"{'':2}epochs: {config.epochs}\n"
      f"{'':2}batch_size: {config.batch_size}\n"
      f"{'':2}interval: {config.interval}\n"
      f"{'':2}eta: {config.eta}\n")
# Loaders are passed positionally (train, dev, test); the training settings
# stay keyword arguments so each value is labelled at the call site.
network.fit(train_loader, dev_loader, test_loader,
            epochs=config.epochs,
            interval=config.interval,
            eta=config.eta,
            file=config.file)

Use Adam optimizer to train the network
  epochs: 100
  batch_size: 50
  interval: 10
  eta: 0.001

Epoch: 1 / 100:
train: Loss: 7.6402 Accuracy: 90.73%
dev:   Loss: 7.4370 Accuracy: 90.19%
test:  Loss: 8.2437 Accuracy: 89.46%
0:02:38.095340s elapsed



  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Epoch: 2 / 100:
train: Loss: 5.0652 Accuracy: 93.69%
dev:   Loss: 5.4976 Accuracy: 92.46%
test:  Loss: 6.1008 Accuracy: 91.82%
0:02:33.396411s elapsed

Epoch: 3 / 100:
train: Loss: 4.0994 Accuracy: 94.81%
dev:   Loss: 4.9968 Accuracy: 93.16%
test:  Loss: 5.5575 Accuracy: 92.47%
0:02:33.330230s elapsed

Epoch: 4 / 100:
train: Loss: 3.5094 Accuracy: 95.53%
dev:   Loss: 4.6367 Accuracy: 93.75%
test:  Loss: 5.1702 Accuracy: 92.83%
0:02:33.349064s elapsed

Epoch: 5 / 100:
train: Loss: 3.0532 Accuracy: 96.15%
dev:   Loss: 4.4679 Accuracy: 94.06%
test:  Loss: 4.7913 Accuracy: 93.31%
0:02:34.500253s elapsed

Epoch: 6 / 100:
train: Loss: 2.7347 Accuracy: 96.51%
dev:   Loss: 4.3056 Accuracy: 94.33%
test:  Loss: 4.5222 Accuracy: 93.57%
0:02:34.068111s elapsed

Epoch: 7 / 100:
train: Loss: 2.4764 Accuracy: 96.85%
dev:   Loss: 4.2183 Accuracy: 94.51%
test:  Loss: 4.5454 Accuracy: 93.63%
0:02:37.228383s elapsed

Epoch: 8 / 100:
train: Loss: 2.2742 Accuracy: 97.07%
dev:   Loss: 4.1889 Accuracy: 94.53

In [19]:
# Reload the checkpoint written by LSTM_CRF.fit (saved with
# torch.save(self, file)), i.e. a pickled module object rather than a
# state dict, so the LSTM_CRF and CRF classes must already be defined in the
# kernel. Only load checkpoint files you created yourself.
# NOTE(review): recent PyTorch releases are documented to default
# torch.load to weights_only=True, which would reject a pickled module -
# confirm against the installed version.
network = torch.load(config.file)
# Score the reloaded model on the test split.
loss, accuracy = network.evaluate(test_loader)
print(f"{'test:':<6} Loss: {loss:.4f} Accuracy: {accuracy:.2%}")

test:  Loss: 5.2197 Accuracy: 94.40%


NameError: name 'start' is not defined