In [1]:
import torch
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


数据准备

In [2]:
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"

In [3]:
# 构建Vocab类
from collections import defaultdict

class Vocab:

    def __init__(self, tokens = None) -> None:
        self.idx_to_token = list()
        self.token_to_idx = dict()

        if tokens is not None:
            if "<unk>" not in tokens:
                tokens += ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx["<unk>"] 

    @classmethod
    def build(cls, text, min_freq = 1, reserved_tokens = None):
        # cls 为类本身，相当于Vocab()
        token_freqs = defaultdict(int) # 统计token的频率
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()  
                       if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)
        
    def __len__(self):
        # 返回词表的大小
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # 查找输入token对应的索引值，如果不存在返回<unk>对应的索引0
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_do_ids(self, tokens):
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]
        


In [4]:
import nltk
# nltk.download("reuters")

In [5]:
nltk.download("reuters")

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\24825\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [5]:
from nltk.corpus import reuters
text = reuters.sents()
text = [[word.lower() for word in sentence] for sentence in text]
text

[['asian',
  'exporters',
  'fear',
  'damage',
  'from',
  'u',
  '.',
  's',
  '.-',
  'japan',
  'rift',
  'mounting',
  'trade',
  'friction',
  'between',
  'the',
  'u',
  '.',
  's',
  '.',
  'and',
  'japan',
  'has',
  'raised',
  'fears',
  'among',
  'many',
  'of',
  'asia',
  "'",
  's',
  'exporting',
  'nations',
  'that',
  'the',
  'row',
  'could',
  'inflict',
  'far',
  '-',
  'reaching',
  'economic',
  'damage',
  ',',
  'businessmen',
  'and',
  'officials',
  'said',
  '.'],
 ['they',
  'told',
  'reuter',
  'correspondents',
  'in',
  'asian',
  'capitals',
  'a',
  'u',
  '.',
  's',
  '.',
  'move',
  'against',
  'japan',
  'might',
  'boost',
  'protectionist',
  'sentiment',
  'in',
  'the',
  'u',
  '.',
  's',
  '.',
  'and',
  'lead',
  'to',
  'curbs',
  'on',
  'american',
  'imports',
  'of',
  'their',
  'products',
  '.'],
 ['but',
  'some',
  'exporters',
  'said',
  'that',
  'while',
  'the',
  'conflict',
  'would',
  'hurt',
  'them',
  'in',


In [6]:
def load_reuters():
    # 从nltk中导入reuters数据
    from nltk.corpus import reuters
    # 获取reutuers数据
    text = reuters.sents()
    # 将字母都转化为小写
    text = [[word.lower() for word in sentence] for sentence in text]
    # 构建词表
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    # 将文本数据转换为id表示
    corpus = [vocab.convert_tokens_do_ids(sentence) for sentence in text]
    return vocab, corpus

In [7]:
vocab, corpus = load_reuters()

In [8]:
vocab[BOS_TOKEN]

2

前馈神经网络

In [9]:
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
from torch import nn
import torch.functional as F
from torch import optim

In [10]:
# 从torch.utils.Dataset继承
class NGramDataset(Dataset):
    
    def __init__(self, corpus, vocab, context_size = 2):

        self.data = []
        self.bos = vocab[BOS_TOKEN]# 句首标记
        self.eos = vocab[EOS_TOKEN]# 句尾标记

        for sentence in tqdm(corpus, desc = "Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos] # 插入句首句尾标记符
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence)):
                # 模型输入：长度为context_size的上下文
                context = sentence[i-context_size:i]
                # 模型输出：当前词
                target = sentence[i]
                # 每个训练样本由(context, target)组成
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]




In [11]:
def collate_fn(examples):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    return (inputs, targets)

In [17]:
class FeedForwordNNLM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwordNNLM, self).__init__()
        # 词向量层，每一个输入都是一个词表维度的one-hot向量；输入是每个单词的索引
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 线性变换：词向量层——隐含层
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # 线性变换：隐含层——输出层, 输出维度将通过softmax归一化取为词表概率
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # 使用Relu激活函数：小于0的输出置为0
        self.activate = nn.ReLU()
        self.output = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        # 将输入词序列映射为词向量，通过view函数对映射后的词向量序列组成的三维张量
        # 进行重构，以完成词向量的拼接
        # print("输入向量的维度为", inputs.shape)
        # print("词向量层输出向量的维度为", self.embeddings(inputs).shape)
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        # print("使用view拼接之后的向量维度为", embeds.shape)
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # 根据输出层计算概率分布并取对数，计算对数似然
        log_probs = self.output(output)
        return log_probs
    

In [23]:
embedding_dim = 128 # 词向量维度
hidden_dim = 256 # 隐藏层维度
batch_size = 1024 # 批次大小
context_size = 3 # 上下文长度
num_epoch = 10 # 迭代次数

In [14]:
# 设置超参
from torch.utils.data import DataLoader

# 读取文本数据，构建训练数据集
vocab, corpus = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = DataLoader(dataset, batch_size, collate_fn=collate_fn)

Dataset Construction: 100%|██████████| 54711/54711 [00:02<00:00, 20992.25it/s]


In [16]:
torch.cuda.current_device()

0

In [19]:
nll_loss = nn.NLLLoss()
model = FeedForwordNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device=device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [65]:
for i in data_loader:
    for j in i:
        print(j)
    print()

tensor([[  2,   4,   5],
        [  4,   5,   6],
        [  5,   6,   7],
        ...,
        [432, 433, 429],
        [433, 429, 434],
        [429, 434, 118]])
tensor([  6,   7,   8,  ..., 434, 118, 435])

tensor([[434, 118, 435],
        [118, 435,  57],
        [435,  57, 374],
        ...,
        [ 31,  19, 196],
        [ 19, 196, 284],
        [196, 284, 750]])
tensor([ 57, 374,  49,  ..., 284, 750,  57])

tensor([[ 284,  750,   57],
        [ 750,   57,   53],
        [  57,   53,  751],
        ...,
        [1025,   47,  309],
        [  47,  309,  397],
        [ 309,  397,  901]])
tensor([ 53, 751, 298,  ..., 397, 901, 902])

tensor([[ 397,  901,  902],
        [ 901,  902,  128],
        [ 902,  128,  283],
        ...,
        [  11,   10,   68],
        [  10,   68, 1228],
        [  68, 1228,  373]])
tensor([ 128,  283,   42,  ..., 1228,  373,   57])

tensor([[1228,  373,   57],
        [ 373,   57, 1229],
        [  57, 1229,  244],
        ...,
        [  16,  185, 

In [24]:
model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Trainging Epoch {epoch}"):
        # for x in batch:
        #     for y in x:
        #         print(y)
        # inputs = torch.tensor(batch[0]).to(device)
        # targets = batch[1]
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")
    total_losses.append(total_loss)

save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Trainging Epoch 0: 100%|██████████| 1628/1628 [01:20<00:00, 20.30it/s]


Loss:7131.51


Trainging Epoch 1: 100%|██████████| 1628/1628 [01:17<00:00, 20.87it/s]


Loss:6534.39


Trainging Epoch 2: 100%|██████████| 1628/1628 [01:18<00:00, 20.74it/s]


Loss:6106.60


Trainging Epoch 3: 100%|██████████| 1628/1628 [01:16<00:00, 21.35it/s]


Loss:5807.73


Trainging Epoch 4: 100%|██████████| 1628/1628 [01:18<00:00, 20.87it/s]


Loss:5605.81


Trainging Epoch 5: 100%|██████████| 1628/1628 [01:18<00:00, 20.77it/s]


Loss:5455.97


Trainging Epoch 6: 100%|██████████| 1628/1628 [01:19<00:00, 20.54it/s]


Loss:5332.46


Trainging Epoch 7: 100%|██████████| 1628/1628 [01:20<00:00, 20.27it/s]


Loss:5226.96


Trainging Epoch 8: 100%|██████████| 1628/1628 [01:21<00:00, 19.89it/s]


Loss:5133.85


Trainging Epoch 9: 100%|██████████| 1628/1628 [01:19<00:00, 20.44it/s]


Loss:5050.79


In [21]:
def save_pretrained(vocab, embeds, save_path):
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join([f"{x}" for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")

循环神经网络

In [32]:
class RnnlmDataset(Dataset):
    
    def __init__(self, vocab, corpus):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc = "Dataset Construction"):
            # 输入序列：BOS_TOKEN，w1，w2......
            input = [self.bos] + sentence
            # 输出序列：w1，w2，EOS_TOKEN
            target = sentence + [self.eos]
            self.data.append((input, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [28]:

from torch.nn.utils.rnn import pad_sequence

def collate_fn(examples, pad):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    inputs = pad_sequence(inputs, batch_first=True, padding_value=pad)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad)
    return (inputs, targets)

In [39]:
class RNNLM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # 词向量层
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)
        # 输出层
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, inputs):
        embeds = self.embedding(inputs)
        hidden, _ = self.rnn(embeds)
        output = self.output(hidden)
        log_probs = self.softmax(output)
        return log_probs

In [33]:
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
data_loader = DataLoader(dataset, batch_size, collate_fn=lambda x : collate_fn(x, vocab[PAD_TOKEN]))

Dataset Construction: 100%|██████████| 54711/54711 [00:00<00:00, 62176.77it/s]


In [40]:
# 设置ignore_index参数，以忽略PAD_TOKEN处的损失
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RNNLM(
  (embedding): Embedding(54711, 128)
  (rnn): LSTM(128, 256, batch_first=True)
  (output): Linear(in_features=256, out_features=54711, bias=True)
  (softmax): LogSoftmax(dim=2)
)

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]