ELMo词向量模型

1. 数据读取

In [1]:
from tqdm import tqdm
from torch.utils.data import Dataset
import torch
from torch.nn.utils.rnn import pad_sequence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Predefined marker tokens used by the corpus loader.
# Sentence-level markers: prepended / appended to every token sequence.
BOS_TOKEN, EOS_TOKEN = "<bos>", "<eos>"
# Padding marker, used to fill batches up to a common length.
PAD_TOKEN = "<pad>"
# Word-level markers: wrapped around the character ids of every token.
BOW_TOKEN, EOW_TOKEN = "<bow>", "<eow>"

In [3]:
# Build the vocabularies and the id-encoded corpora from a text file.
def load_corpus(path, max_tok_len=None, max_seq_len=None):
    """Read a tokenised corpus and encode it at word level and character level.

    Each line of the file is one sequence whose tokens are separated by single
    spaces. Every sequence is wrapped in BOS_TOKEN / EOS_TOKEN, and the character
    ids of every token are wrapped in BOW_TOKEN / EOW_TOKEN.

    Args:
        path: Path of the local text file (read as UTF-8).
        max_tok_len: Optional upper bound on token length in characters, counting
            the BOW/EOW markers; longer tokens are truncated.
        max_seq_len: Optional upper bound on sequence length in tokens, counting
            the BOS/EOS markers; longer sequences are truncated.

    Returns:
        A tuple ``(corpus_w, corpus_c, vocab_w, vocab_c)``:
        corpus_w is a list of sentences, each a list of word ids;
        corpus_c is a list of sentences, each a list of per-token character-id lists;
        vocab_w / vocab_c are the word-level and character-level ``Vocab`` objects.
    """
    text = []
    # The character set starts with the marker tokens so that each of them gets
    # its own id in the character vocabulary.
    charset = {BOS_TOKEN, EOS_TOKEN, PAD_TOKEN, BOW_TOKEN, EOW_TOKEN}
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # rstrip() drops the trailing newline / whitespace before splitting.
            tokens = line.rstrip().split(" ")
            if max_seq_len is not None and len(tokens) + 2 > max_seq_len:
                # Leave room for BOS_TOKEN and EOS_TOKEN. Truncate the token list,
                # not the raw line: slicing the string would turn the sequence
                # into single characters.
                tokens = tokens[:max_seq_len - 2]
            sent = [BOS_TOKEN]
            for token in tokens:
                if max_tok_len is not None and len(token) + 2 > max_tok_len:
                    # Leave room for BOW_TOKEN and EOW_TOKEN in the character-level input.
                    token = token[:max_tok_len - 2]
                sent.append(token)
                # Collect every character for the character-level vocabulary.
                charset.update(token)
            sent.append(EOS_TOKEN)
            text.append(sent)
    # At this point `text` is a list of sentences and each sentence is a list of tokens.

    # The word vocabulary needs frequency counts, hence Vocab.build.
    vocab_w = Vocab.build(text, min_freq=2, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    # The character set is already unique, so it is passed to the constructor directly.
    vocab_c = Vocab(tokens=list(charset))

    # Word-level corpus: one list of word ids per sentence.
    corpus_w = [vocab_w.convert_tokens_to_ids(sent) for sent in text]

    # Character-level corpus: one list of character ids per token.
    corpus_c = []
    bow = vocab_c[BOW_TOKEN]
    eow = vocab_c[EOW_TOKEN]
    for sent in text:
        sent_c = []
        for token in sent:
            if token == BOS_TOKEN or token == EOS_TOKEN:
                # Sentence markers are looked up as a single symbol, not spelled out.
                token_c = [bow, vocab_c[token], eow]
            else:
                # convert_tokens_to_ids iterates over its argument, so passing a
                # string maps each character of the token to its id.
                token_c = [bow] + vocab_c.convert_tokens_to_ids(token) + [eow]
            sent_c.append(token_c)
        corpus_c.append(sent_c)

    return corpus_w, corpus_c, vocab_w, vocab_c

In [4]:
# 词表构建方法同之前一样
# 构建Vocab类
from collections import defaultdict

class Vocab:
    """Bidirectional mapping between tokens and integer ids with an ``<unk>`` fallback.

    Attributes:
        idx_to_token: list mapping id -> token.
        token_to_idx: dict mapping token -> id.
        unk: id of ``"<unk>"``, or None when the vocabulary was created empty.
    """

    def __init__(self, tokens = None) -> None:
        """Create a vocabulary from an iterable of tokens.

        ``"<unk>"`` is appended when missing. Duplicate tokens are stored once
        (first occurrence wins) so that ids and tokens stay in one-to-one
        correspondence. The caller's sequence is never modified.
        """
        self.idx_to_token = list()
        self.token_to_idx = dict()
        self.unk = None

        if tokens is not None:
            # Work on a copy: appending "<unk>" must not leak into the caller's list.
            tokens = list(tokens)
            if "<unk>" not in tokens:
                tokens.append("<unk>")
            for token in tokens:
                if token in self.token_to_idx:
                    # A repeated token would otherwise occupy a second, dead id.
                    continue
                self.token_to_idx[token] = len(self.idx_to_token)
                self.idx_to_token.append(token)
            self.unk = self.token_to_idx["<unk>"]

    @classmethod
    def build(cls, text, min_freq = 1, reserved_tokens = None):
        """Build a vocabulary from tokenised text.

        Args:
            text: iterable of sentences, each an iterable of tokens.
            min_freq: tokens seen fewer than this many times are left out.
            reserved_tokens: tokens placed right after ``"<unk>"`` regardless of
                their frequency.

        Returns:
            A new vocabulary ordered as ``<unk>``, reserved tokens, then the
            frequent tokens in order of first appearance.
        """
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (list(reserved_tokens) if reserved_tokens else [])
        # Reserved tokens that also occur in the text must not be added twice.
        already_present = set(uniq_tokens)
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token not in already_present]
        return cls(uniq_tokens)

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, token):
        """Return the id of ``token``, falling back to the ``<unk>`` id.

        Raises:
            KeyError: if the token is unknown and the vocabulary was created
                empty, so there is no ``<unk>`` entry to fall back to.
        """
        idx = self.token_to_idx.get(token, self.unk)
        if idx is None:
            raise KeyError(token)
        return idx

    def convert_tokens_to_ids(self, tokens):
        """Map every element of ``tokens`` to its id (a string is iterated per character)."""
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        """Map every id in ``indices`` back to its token."""
        return [self.idx_to_token[index] for index in indices]
        


In [49]:
# Run the loader once on the corpus file.
# NOTE(review): the path is absolute and machine-specific; it is the same file as
# config['train_file'], which is defined further down in this notebook.
corpus_w, corpus_c, vocab_w, vocab_c = load_corpus("/home/zouyuheng/data/English/travel_comment.txt")

145321it [00:05, 27047.84it/s]


In [5]:
# Dataset used to train the bidirectional language model.
class BiLMDataset(Dataset):
    """Pairs every sentence's word ids with its per-token character ids."""

    def __init__(self, corpus_w, corpus_c, vocab_w, vocab_c) -> None:
        """Store the padding ids and zip the two parallel corpora into examples.

        Args:
            corpus_w: list of sentences as word ids.
            corpus_c: list of sentences as lists of character ids, parallel to corpus_w.
            vocab_w: word-level vocabulary (queried for the padding id).
            vocab_c: character-level vocabulary (queried for the padding id).
        """
        super().__init__()
        self.pad_w = vocab_w[PAD_TOKEN]
        self.pad_c = vocab_c[PAD_TOKEN]
        # One example is the tuple (word ids, character ids) of a sentence.
        self.data = list(zip(corpus_w, corpus_c))

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the (word ids, character ids) pair stored at position ``i``."""
        return self.data[i]

In [55]:
# Argument order follows BiLMDataset.__init__(corpus_w, corpus_c, vocab_w, vocab_c);
# passing the vocabularies swapped would look up the padding ids in the wrong vocabulary.
data = BiLMDataset(corpus_w, corpus_c, vocab_w, vocab_c)

In [6]:
def collate_fn(examples, pad_w, pad_c):
    """Turn a list of (word ids, character ids) examples into padded batch tensors.

    Args:
        examples: list of pairs; element 0 is the sentence as word ids, element 1
            is the same sentence as a list of per-token character-id lists.
        pad_w: padding id for word-level tensors.
        pad_c: padding id for the character-level tensor.

    Returns:
        inputs_w:   (batch, max_seq_len) word ids, padded with pad_w.
        inputs_c:   (batch, max_seq_len, max_tok_len) character ids, padded with pad_c.
        seq_lens:   (batch,) sentence lengths in tokens.
        targets_fw: (batch, max_seq_len) inputs shifted left by one (forward LM targets).
        targets_bw: (batch, max_seq_len) inputs shifted right by one (backward LM targets).
    """
    word_seqs = [ex[0] for ex in examples]
    char_seqs = [ex[1] for ex in examples]

    seq_lens = torch.LongTensor([len(words) for words in word_seqs])

    # Word-level input, right-padded to the longest sentence of the batch.
    inputs_w = pad_sequence(
        [torch.tensor(words) for words in word_seqs],
        batch_first=True,
        padding_value=pad_w,
    )
    batch_size, max_seq_len = inputs_w.shape

    # Longest token (in characters) of the whole batch.
    max_tok_len = max(max(len(tok) for tok in chars) for chars in char_seqs)

    # Character-level input: start fully padded, then copy each token's ids into its prefix.
    inputs_c = torch.full((batch_size, max_seq_len, max_tok_len), pad_c, dtype=torch.long)
    for row, chars in enumerate(char_seqs):
        for col, tok in enumerate(chars):
            inputs_c[row, col, :len(tok)] = torch.LongTensor(tok)

    # Targets start fully padded as well.
    targets_fw = torch.full(inputs_w.shape, pad_w, dtype=torch.long)
    targets_bw = torch.full(inputs_w.shape, pad_w, dtype=torch.long)
    for row, words in enumerate(word_seqs):
        n = len(words)
        # Forward LM:  input <bos> w1 ... wn <eos>  ->  target w1 ... wn <eos> <pad>
        targets_fw[row, :n - 1] = torch.LongTensor(words[1:])
        # Backward LM: input <bos> w1 ... wn <eos>  ->  target <pad> <bos> w1 ... wn
        targets_bw[row, 1:n] = torch.LongTensor(words[:n - 1])

    return inputs_w, inputs_c, seq_lens, targets_fw, targets_bw

2. 建立模型

In [7]:
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import os

In [8]:
class Highway(nn.Module):
    """Highway network applied on top of the character-based token representation.

    Every layer is one linear map from input_dim to 2 * input_dim: the first half
    of its output is the candidate hidden state, the second half the gate logits.
    """

    def __init__(self, input_dim, num_layers, activation = F.relu) -> None:
        """
        Args:
            input_dim: width of the input, kept unchanged by every layer.
            num_layers: number of highway layers.
            activation: non-linearity applied to the candidate half.
        """
        super().__init__()
        self.input_dim = input_dim
        self.activation = activation
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            layer = nn.Linear(input_dim, input_dim * 2)
            # The gate half of the bias starts at 1.
            with torch.no_grad():
                layer.bias[input_dim:].fill_(1)
            self.layers.append(layer)

    def forward(self, inputs):
        """Apply all highway layers to a 2-D tensor whose second dimension is input_dim."""
        width = self.input_dim
        carried = inputs
        for layer in self.layers:
            projected = layer(carried)
            candidate = self.activation(projected[:, :width])
            gate = torch.sigmoid(projected[:, width:])
            # The gate mixes the layer input with the freshly computed candidate.
            carried = gate * carried + (1 - gate) * candidate
        return carried

In [21]:
class ConvTokenEmbedder(nn.Module):
    """Token representation built from characters.

    Pipeline: character embeddings -> several 1-D convolutions with max-pooling
    over the character positions -> highway network -> linear projection.
    """

    def __init__(self, vocab_c, char_embedding_dim, char_conv_filters, num_highways, output_dim, pad = "<pad>") -> None:
        """
        Args:
            vocab_c: character-level vocabulary.
            char_embedding_dim: size of a character embedding.
            char_conv_filters: list of ``[kernel_size, out_channels]`` pairs, one per convolution.
            num_highways: number of highway layers.
            output_dim: size of the produced token representation.
            pad: padding token; its embedding is fixed through ``padding_idx``.
        """
        super(ConvTokenEmbedder, self).__init__()
        self.vocab_c = vocab_c
        # len(vocab_c) is the number of embedding rows (vocabulary size), not a feature width.
        self.char_embeddings = nn.Embedding(len(vocab_c), char_embedding_dim, padding_idx=vocab_c[pad])

        # One Conv1d per configured filter.
        self.convolutions = nn.ModuleList()
        for kernel_size, out_channels in char_conv_filters:
            conv = nn.Conv1d(in_channels=char_embedding_dim, out_channels=out_channels, kernel_size=kernel_size, bias=True)
            self.convolutions.append(conv)
        # Widest kernel: forward() pads shorter inputs up to this length.
        self.max_kernel_size = max((f[0] for f in char_conv_filters), default=1)

        # The pooled outputs of all convolutions are concatenated, so the highway
        # input width is the sum of their output channels.
        self.num_filters = sum(f[1] for f in char_conv_filters)
        self.num_highways = num_highways
        self.highways = Highway(self.num_filters, self.num_highways, activation = F.relu)
        # Final linear projection to the requested output size.
        self.projection = nn.Linear(self.num_filters, output_dim, bias = True)

    def forward(self, inputs):
        """Embed a batch of character ids.

        Args:
            inputs: LongTensor of shape (batch_size, seq_len, token_len).

        Returns:
            Tensor of shape (batch_size, seq_len, output_dim).
        """
        batch_size, seq_len, token_len = inputs.shape
        # Process every token independently: merge batch and sequence dimensions.
        inputs = inputs.reshape(batch_size * seq_len, token_len)

        if token_len < self.max_kernel_size:
            # Conv1d rejects inputs shorter than its kernel. When every token of
            # the batch is short, right-pad the character axis with the padding id.
            padded = inputs.new_full(
                (inputs.shape[0], self.max_kernel_size), self.char_embeddings.padding_idx
            )
            padded[:, :token_len] = inputs
            inputs = padded

        char_embeds = self.char_embeddings(inputs)
        # Embedding output is (N, length, channels); Conv1d expects (N, channels, length).
        char_embeds = char_embeds.transpose(1, 2)

        conv_hiddens = []
        for conv in self.convolutions:
            conv_hidden = conv(char_embeds)
            # Max-pool over the character positions.
            conv_hidden, _ = torch.max(conv_hidden, dim = -1)
            conv_hidden = F.relu(conv_hidden)
            conv_hiddens.append(conv_hidden)

        # Concatenate the pooled features of all convolutions.
        token_embeds = torch.cat(conv_hiddens, dim=-1)
        token_embeds = self.highways(token_embeds)
        token_embeds = self.projection(token_embeds)
        # Restore the (batch, sequence) layout.
        token_embeds = token_embeds.view(batch_size, seq_len, -1)
        return token_embeds


In [28]:
class ELMoLSTMEncoder(nn.Module):
    """Stack of independent forward and backward LSTM layers.

    Every LSTM layer is followed by a linear projection from hidden_dim back to
    input_dim, so that all layers share the same width.
    """

    def __init__(self, input_dim, hidden_dim, num_layers) -> None:
        """
        Args:
            input_dim: width of the input and of every projected layer output.
            hidden_dim: hidden size of each LSTM.
            num_layers: number of stacked layers per direction.
        """
        super(ELMoLSTMEncoder, self).__init__()
        # Projection width: keeps every layer at the same dimensionality.
        self.projection_dim = input_dim
        self.num_layers = num_layers

        self.forward_layers = nn.ModuleList()
        self.forward_projections = nn.ModuleList()   # hidden_dim -> projection_dim
        self.backward_layers= nn.ModuleList()
        self.backward_projections = nn.ModuleList()  # hidden_dim -> projection_dim

        lstm_input_dim = input_dim
        for _ in range(num_layers):
            # Single-layer LSTM plus projection for each direction.
            forward_layer = nn.LSTM(lstm_input_dim, hidden_dim, num_layers=1, batch_first=True)
            forward_projection = nn.Linear(hidden_dim, self.projection_dim, bias=True)
            backward_layer = nn.LSTM(lstm_input_dim, hidden_dim, num_layers=1, batch_first=True)
            backward_projection = nn.Linear(hidden_dim, self.projection_dim, bias=True)
            # From the second layer on, the input is the projected output of the previous layer.
            lstm_input_dim = self.projection_dim
            self.forward_layers.append(forward_layer)
            self.forward_projections.append(forward_projection)
            self.backward_layers.append(backward_layer)
            self.backward_projections.append(backward_projection)

    def forward(self, inputs, lengths):
        """Encode a padded batch in both directions.

        Args:
            inputs: tensor of shape (batch_size, seq_len, input_dim).
            lengths: 1-D integer tensor with the true length of every sequence.

        Returns:
            (stacked_forward_states, stacked_backward_states): two lists with one
            tensor of shape (batch_size, seq_len, input_dim) per layer. Backward
            states are returned in the original (un-reversed) time order.
        """
        batch_size, seq_len, input_dim = inputs.shape
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = torch.as_tensor(lengths).cpu()

        # rev_idx[i] reverses the first lengths[i] positions and leaves padding in
        # place: for length 3 and seq_len 5 it is [2, 1, 0, 3, 4]. Gathering with
        # it reverses only the real tokens, and gathering twice restores the order.
        rev_idx = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1)
        for i, length in enumerate(lengths.tolist()):
            rev_idx[i, :length] = torch.arange(length - 1, -1, -1)
        rev_idx = rev_idx.unsqueeze(2).expand_as(inputs)
        rev_idx = rev_idx.to(inputs.device)
        rev_inputs = inputs.gather(1, rev_idx)

        forward_inputs, backward_inputs = inputs, rev_inputs
        stacked_forward_states, stacked_backward_states = [], []

        for layer_index in range(self.num_layers):
            packed_forward_inputs = pack_padded_sequence(forward_inputs, lengths, batch_first=True, enforce_sorted=False)
            packed_backward_inputs = pack_padded_sequence(backward_inputs, lengths, batch_first=True, enforce_sorted=False)

            # Forward direction.
            forward_layer = self.forward_layers[layer_index]
            packed_forward, _ = forward_layer(packed_forward_inputs)
            # total_length keeps the output at seq_len even when the longest
            # sequence of the batch is shorter than the padded width.
            forward = pad_packed_sequence(packed_forward, batch_first=True, total_length=seq_len)[0]
            forward = self.forward_projections[layer_index](forward)
            stacked_forward_states.append(forward)

            # Backward direction (runs on the reversed sequences).
            backward_layer = self.backward_layers[layer_index]
            packed_backward, _ = backward_layer(packed_backward_inputs)
            backward = pad_packed_sequence(packed_backward, batch_first=True, total_length=seq_len)[0]
            backward = self.backward_projections[layer_index](backward)
            # Restore the original time order before storing.
            stacked_backward_states.append(backward.gather(1, rev_idx))

            # Feed this layer's output into the next one. Without this every
            # layer would read the raw token embeddings and the stack would
            # not be stacked at all. `backward` stays in reversed order.
            forward_inputs, backward_inputs = forward, backward

        return stacked_forward_states, stacked_backward_states





In [11]:
# Hyperparameters and paths (the whole dict is saved as config.json after training).
config = {
    # --- data ---
    # Cap on characters per token. NOTE(review): matches load_corpus's
    # max_tok_len parameter — confirm it is actually passed at the call site.
    "max_tok_len": 50,
    # Training corpus, one sentence per line.
    "train_file": "/home/zouyuheng/data/English/travel_comment.txt",
    # Output directory for the saved model and config.
    "model_path": "./elmo_bilm",

    # --- character CNN token embedder ---
    "char_embedding_dim": 50,
    # [kernel_size, out_channels] for each convolution.
    "char_conv_filters": [
        [1, 32],
        [2, 32],
        [3, 64],
        [4, 128],
        [5, 256],
        [6, 512],
        [7, 1024],
    ],
    "num_highways": 2,

    # --- bidirectional LSTM encoder ---
    # Token embedding size; also the input width of the encoder and classifier.
    "projection_dim": 512,
    # Hidden size of each LSTM.
    "hidden_dim": 4096,
    # Stacked LSTM layers per direction.
    "num_layers": 2,

    # --- optimisation ---
    "batch_size": 4,
    # Dropout probability applied to the token embeddings.
    "dropout": 0.1,
    "learning_rate": 0.0004,
    # Maximum gradient norm used for clipping.
    "clip_grad": 5,
    "num_epoch": 10,
}

In [12]:
# Bidirectional language model
class BiLM(nn.Module):
    """Character-CNN token embedder + stacked forward/backward LSTMs + shared softmax layer."""

    def __init__(self, configs, vocab_w, vocab_c) -> None:
        """
        Args:
            configs: dict providing 'dropout', 'char_embedding_dim', 'char_conv_filters',
                'num_highways', 'projection_dim', 'hidden_dim' and 'num_layers'.
            vocab_w: word-level vocabulary; its size is the number of output classes.
            vocab_c: character-level vocabulary used by the token embedder.
        """
        super(BiLM, self).__init__()
        self.dropout_prob = configs["dropout"]
        # One output score per word of the vocabulary.
        self.num_classes = len(vocab_w)

        # Character-based token representation.
        self.token_embedder = ConvTokenEmbedder(vocab_c, configs['char_embedding_dim'], configs['char_conv_filters'], configs['num_highways'], configs['projection_dim'])
        # Stacked forward / backward LSTM encoder.
        self.encoder = ELMoLSTMEncoder(configs['projection_dim'], configs['hidden_dim'], configs['num_layers'])
        # Classifier shared by both directions.
        self.classifier = nn.Linear(configs['projection_dim'], self.num_classes)

    def forward(self, inputs, lengths):
        """Return the (forward, backward) vocabulary logits for a batch of character ids.

        Args:
            inputs: character ids, shape (batch_size, seq_len, token_len).
            lengths: sequence lengths, shape (batch_size,).
        """
        token_embeds = self.token_embedder(inputs)
        # F.dropout defaults to training=True, so tie it to the module's mode;
        # otherwise dropout would stay active after model.eval().
        token_embeds = F.dropout(token_embeds, self.dropout_prob, training=self.training)
        forward, backward = self.encoder(token_embeds, lengths.to('cpu'))
        # Only the top layer's states feed the classifier.
        return self.classifier(forward[-1]), self.classifier(backward[-1])

    def save_pretrained(self, path):
        """Save the token embedder and encoder weights inside directory ``path``.

        The classifier is not saved.
        """
        os.makedirs(path, exist_ok=True)
        # Join with `path`; otherwise the files land in the current working directory.
        torch.save(self.token_embedder.state_dict(), os.path.join(path, "token_embedder.pth"))
        torch.save(self.encoder.state_dict(), os.path.join(path, "encoder.pth"))

3. 训练


In [13]:
from torch.utils.data import DataLoader
from torch import optim
from tqdm import tqdm
import numpy as np
import json

In [14]:
# Load the training corpus. The token-length cap from the config is passed on so
# that a single very long token cannot blow up the character dimension of a batch.
corpus_w, corpus_c, vocab_w, vocab_c = load_corpus(config['train_file'], max_tok_len=config['max_tok_len'])
train_data = BiLMDataset(corpus_w, corpus_c, vocab_w, vocab_c)
# collate_fn needs the padding ids of both vocabularies; bind them through a lambda.
train_loader = DataLoader(train_data, config['batch_size'], collate_fn = lambda x : collate_fn(x, vocab_w[PAD_TOKEN], vocab_c[PAD_TOKEN]))

145321it [00:07, 19774.60it/s]


In [15]:
# Cross-entropy loss summed over positions; targets equal to the word-level
# padding id are ignored.
criterion = nn.CrossEntropyLoss(ignore_index = vocab_w[PAD_TOKEN], reduction='sum')


In [30]:
# Build the model.
model = BiLM(config, vocab_w, vocab_c)
# Fall back to the CPU when no GPU is available instead of failing on model.to().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Optimise only the parameters that require gradients.
optimizer = optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), lr = config['learning_rate'])

In [31]:
model.train()
for epoch in range(config['num_epoch']):
    total_loss = 0
    # Number of positions used as the perplexity denominator.
    total_tags = 0
    for batch in tqdm(train_loader, desc = f"Training Epoch {epoch}"):
        batch = [x.to(device) for x in batch]
        inputs_w, inputs_c, seq_lens, targets_fw, targets_bw = batch
        optimizer.zero_grad()
        # The model reads the character-level input only.
        outputs_fw, outputs_bw = model(inputs_c, seq_lens)
        # Forward and backward language-model losses (summed over positions).
        loss_fw = criterion(outputs_fw.view(-1, outputs_fw.shape[-1]), targets_fw.view(-1))
        loss_bw = criterion(outputs_bw.view(-1, outputs_bw.shape[-1]), targets_bw.view(-1))
        loss = (loss_fw + loss_bw) / 2.0
        loss.backward()
        # Gradient clipping guards against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), config['clip_grad'])
        optimizer.step()

        # Accumulate per batch, inside the loop; otherwise only the last batch
        # of the epoch would contribute to the reported perplexity.
        total_loss += loss_fw.item()
        # NOTE: every sentence has len - 1 forward targets, so counting seq_lens
        # slightly over-counts the denominator.
        total_tags += seq_lens.sum().item()

    # Perplexity of the forward language model is used as the training metric.
    train_ppl = np.exp(total_loss / total_tags)
    print(f"Train PPL: {train_ppl:.2f}")

# Save the model parameters.
model.save_pretrained(config['model_path'])
# Save the hyperparameters next to them; the context manager closes the file.
with open(os.path.join(config['model_path'], 'config.json'), "w") as config_file:
    json.dump(config, config_file)

Training Epoch 0:   0%|          | 23/36331 [00:11<5:07:33,  1.97it/s] 


KeyboardInterrupt: 