数据准备

In [44]:
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"

In [45]:
# 构建Vocab类
from collections import defaultdict

class Vocab:

    def __init__(self, tokens = None) -> None:
        self.idx_to_token = list()
        self.token_to_idx = dict()

        if tokens is not None:
            if "<unk>" not in tokens:
                tokens += ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx["<unk>"] 

    @classmethod
    def build(cls, text, min_freq = 1, reserved_tokens = None):
        # cls 为类本身，相当于Vocab()
        token_freqs = defaultdict(int) # 统计token的频率
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()  
                       if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)
        
    def __len__(self):
        # 返回词表的大小
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # 查找输入token对应的索引值，如果不存在返回<unk>对应的索引0
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_do_ids(self, tokens):
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]
        


In [46]:
import nltk
# nltk.download("reuters")

In [5]:
from nltk.corpus import reuters
text = reuters.sents()
text = [[word.lower() for word in sentence] for sentence in text]
text

[['asian',
  'exporters',
  'fear',
  'damage',
  'from',
  'u',
  '.',
  's',
  '.-',
  'japan',
  'rift',
  'mounting',
  'trade',
  'friction',
  'between',
  'the',
  'u',
  '.',
  's',
  '.',
  'and',
  'japan',
  'has',
  'raised',
  'fears',
  'among',
  'many',
  'of',
  'asia',
  "'",
  's',
  'exporting',
  'nations',
  'that',
  'the',
  'row',
  'could',
  'inflict',
  'far',
  '-',
  'reaching',
  'economic',
  'damage',
  ',',
  'businessmen',
  'and',
  'officials',
  'said',
  '.'],
 ['they',
  'told',
  'reuter',
  'correspondents',
  'in',
  'asian',
  'capitals',
  'a',
  'u',
  '.',
  's',
  '.',
  'move',
  'against',
  'japan',
  'might',
  'boost',
  'protectionist',
  'sentiment',
  'in',
  'the',
  'u',
  '.',
  's',
  '.',
  'and',
  'lead',
  'to',
  'curbs',
  'on',
  'american',
  'imports',
  'of',
  'their',
  'products',
  '.'],
 ['but',
  'some',
  'exporters',
  'said',
  'that',
  'while',
  'the',
  'conflict',
  'would',
  'hurt',
  'them',
  'in',


In [47]:
def load_reuters():
    # 从nltk中导入reuters数据
    from nltk.corpus import reuters
    # 获取reutuers数据
    text = reuters.sents()
    # 将字母都转化为小写
    text = [[word.lower() for word in sentence] for sentence in text]
    # 构建词表
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    # 将文本数据转换为id表示
    corpus = [vocab.convert_tokens_do_ids(sentence) for sentence in text]
    return vocab, corpus

In [7]:
vocab, corpus = load_reuters()

基于前馈神经网络生成词向量

In [48]:
from torch.utils.data import Dataset
from tqdm import tqdm
import torch
from torch import nn
import torch.functional as F
from torch import optim
from torch.utils.data import DataLoader

In [10]:
# 创建一个对应数据集
# 从torch.utils.Dataset继承
class NGramDataset(Dataset):
    
    def __init__(self, corpus, vocab, context_size = 2):

        self.data = []
        self.bos = vocab[BOS_TOKEN]# 句首标记
        self.eos = vocab[EOS_TOKEN]# 句尾标记

        for sentence in tqdm(corpus, desc = "Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos] # 插入句首句尾标记符
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence)):
                # 模型输入：长度为context_size的上下文
                context = sentence[i-context_size:i]
                # 模型输出：当前词
                target = sentence[i]
                # 每个训练样本由(context, target)组成
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]




In [11]:
def collate_fn(examples):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    return (inputs, targets)

In [12]:
class FeedForwordNNLM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwordNNLM, self).__init__()
        # 词向量层，每一个输入都是一个词表维度的one-hot向量；输入是每个单词的索引
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 线性变换：词向量层——隐含层
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # 线性变换：隐含层——输出层, 输出维度将通过softmax归一化取为词表概率
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # 使用Relu激活函数：小于0的输出置为0
        self.activate = nn.ReLU()
        self.output = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        # 将输入词序列映射为词向量，通过view函数对映射后的词向量序列组成的三维张量
        # 进行重构，以完成词向量的拼接
        # print("输入向量的维度为", inputs.shape)
        # print("词向量层输出向量的维度为", self.embeddings(inputs).shape)
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        # print("使用view拼接之后的向量维度为", embeds.shape)
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # 根据输出层计算概率分布并取对数，计算对数似然
        log_probs = self.output(output)
        return log_probs
    

In [8]:
embedding_dim = 128 # 词向量维度
hidden_dim = 256 # 隐藏层维度
batch_size = 64 # 批次大小
context_size = 3 # 上下文长度
num_epoch = 10 # 迭代次数

In [15]:
# 设置超参
from torch.utils.data import DataLoader

# 读取文本数据，构建训练数据集
vocab, corpus = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = DataLoader(dataset, batch_size, collate_fn=collate_fn)

Dataset Construction: 100%|██████████| 54711/54711 [00:03<00:00, 17347.15it/s]


In [16]:
nll_loss = nn.NLLLoss()
model = FeedForwordNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
device = torch.device("cuda")# if torch.cuda.is_available() else "cpu")
model.to(device=device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [17]:
model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Trainging Epoch {epoch}"):
        # for x in batch:
        #     for y in x:
        #         print(y)
        # inputs = torch.tensor(batch[0]).to(device)
        # targets = batch[1]
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")
    total_losses.append(total_loss)

# save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Trainging Epoch 0: 100%|██████████| 163/163 [00:09<00:00, 17.14it/s]


Loss:1018.01


Trainging Epoch 1: 100%|██████████| 163/163 [00:08<00:00, 18.62it/s]


Loss:821.15


Trainging Epoch 2: 100%|██████████| 163/163 [00:08<00:00, 18.65it/s]


Loss:760.71


Trainging Epoch 3: 100%|██████████| 163/163 [00:08<00:00, 18.55it/s]


Loss:719.21


Trainging Epoch 4: 100%|██████████| 163/163 [00:08<00:00, 18.52it/s]


Loss:686.53


Trainging Epoch 5: 100%|██████████| 163/163 [00:08<00:00, 18.54it/s]


Loss:659.33


Trainging Epoch 6: 100%|██████████| 163/163 [00:08<00:00, 18.72it/s]


Loss:636.52


Trainging Epoch 7: 100%|██████████| 163/163 [00:08<00:00, 18.65it/s]


Loss:617.73


Trainging Epoch 8: 100%|██████████| 163/163 [00:08<00:00, 18.64it/s]


Loss:602.54


Trainging Epoch 9: 100%|██████████| 163/163 [00:08<00:00, 19.28it/s]

Loss:590.00





In [18]:
def save_pretrained(vocab, embeds, save_path):
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join([f"{x}" for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")

基于循环神经网络生成词向量

In [9]:
class RnnlmDataset(Dataset):
    
    def __init__(self, vocab, corpus):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc = "Dataset Construction"):
            # 输入序列：BOS_TOKEN，w1，w2......
            input = [self.bos] + sentence
            # 输出序列：w1，w2，EOS_TOKEN
            target = sentence + [self.eos]
            self.data.append((input, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [10]:

from torch.nn.utils.rnn import pad_sequence

def collate_fn(examples, pad):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    # for i in range(len(examples)):
    #     if len(examples[i][0]) != 50:
    #         print(i)
    inputs = [torch.tensor(ex[0]) for ex in examples]
    targets = [torch.tensor(ex[1]) for ex in examples]
    # 注意此处先生成了列表，而不是如同前文一样生成tensor，因为tensor需要补齐
    inputs = pad_sequence(inputs, batch_first=True, padding_value=pad)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad)
    return (inputs, targets)

In [11]:
class RNNLM(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # 词向量层
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)
        # 输出层
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, inputs):
        embeds = self.embedding(inputs)
        hidden, _ = self.rnn(embeds)
        output = self.output(hidden)
        log_probs = self.softmax(output)
        return log_probs

In [12]:
from torch.utils.data import DataLoader
vocab, corpus = load_reuters()
dataset = RnnlmDataset(vocab, corpus)
data_loader = DataLoader(dataset, batch_size, collate_fn=lambda x : collate_fn(x, vocab[PAD_TOKEN]))

Dataset Construction: 100%|██████████| 54711/54711 [00:00<00:00, 199319.17it/s]


In [13]:
# 设置ignore_index参数，以忽略PAD_TOKEN处的损失
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RNNLM(
  (embedding): Embedding(31081, 128)
  (rnn): LSTM(128, 256, batch_first=True)
  (output): Linear(in_features=256, out_features=31081, bias=True)
  (softmax): LogSoftmax(dim=2)
)

In [14]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

Training Epoch 0: 100%|██████████| 855/855 [00:32<00:00, 26.00it/s]


Loss:4960.68


Training Epoch 1: 100%|██████████| 855/855 [00:32<00:00, 26.13it/s]


Loss:4112.05


Training Epoch 2: 100%|██████████| 855/855 [00:32<00:00, 25.99it/s]


Loss:3795.99


Training Epoch 3: 100%|██████████| 855/855 [00:31<00:00, 27.45it/s]


Loss:3585.39


Training Epoch 4: 100%|██████████| 855/855 [00:32<00:00, 26.19it/s]


Loss:3427.35


Training Epoch 5: 100%|██████████| 855/855 [00:33<00:00, 25.87it/s]


Loss:3297.07


Training Epoch 6: 100%|██████████| 855/855 [00:33<00:00, 25.86it/s]


Loss:3185.90


Training Epoch 7: 100%|██████████| 855/855 [00:33<00:00, 25.89it/s]


Loss:3090.05


Training Epoch 8: 100%|██████████| 855/855 [00:32<00:00, 26.04it/s]


Loss:3005.44


Training Epoch 9: 100%|██████████| 855/855 [00:32<00:00, 26.03it/s]

Loss:2929.91





Word2Vec词向量

CBOW

In [18]:
class CbowDataset(Dataset):
    # 用于CBOW模型的数据集
    def __init__(self, vocab, corpus, context_size = 2) -> None:
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size * 2 + 1:
                # 此处的context_size是单向上下文长度，因此，如果小于上文长度，无法构建该任务
                continue
            for i in range(context_size, len(sentence) - context_size):
                context = sentence[i-context_size:i] + sentence[i+1:i+context_size]
                # 模型输入：左右各取context_size的上下文
                target = sentence[i]
                # 模型输出：中间的单词
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [24]:
# CBOW模型
class CbowModel(nn.Module):

    def __init__(self, vocab_size, embedding_dim) -> None:
        super(CbowModel, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.output = nn.Linear(embedding_dim, vocab_size, bias=False)
        # 输出层
        self.log = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        hidden = embeds.mean(dim=1)
        # 对词向量取平均
        output = self.output(hidden)
        log_probs = self.log(output)
        return log_probs


In [11]:
def collate_fn(examples):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    return (inputs, targets)

In [30]:
from torch.utils.data import DataLoader
vocab, corpus = load_reuters()
dataset = CbowDataset(vocab, corpus)
data_loader = DataLoader(dataset, batch_size, collate_fn=collate_fn)

Dataset Construction: 100%|██████████| 54711/54711 [00:03<00:00, 16829.04it/s]


In [31]:
# 设置ignore_index参数，以忽略PAD_TOKEN处的损失
nll_loss = nn.NLLLoss()
model = CbowModel(len(vocab), embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [29]:
embedding_dim = 128 # 词向量维度
hidden_dim = 256 # 隐藏层维度
batch_size = 10240 # 批次大小
context_size = 3 # 上下文长度
num_epoch = 10 # 迭代次数

In [32]:
model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Trainging Epoch {epoch}"):
        # for x in batch:
        #     for y in x:
        #         print(y)
        # inputs = torch.tensor(batch[0]).to(device)
        # targets = batch[1]
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")
    total_losses.append(total_loss)

# save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Trainging Epoch 0: 100%|██████████| 158/158 [00:05<00:00, 28.69it/s]


Loss:1399.43


Trainging Epoch 1: 100%|██████████| 158/158 [00:05<00:00, 29.30it/s]


Loss:1002.09


Trainging Epoch 2: 100%|██████████| 158/158 [00:05<00:00, 27.62it/s]


Loss:891.18


Trainging Epoch 3: 100%|██████████| 158/158 [00:05<00:00, 27.61it/s]


Loss:835.33


Trainging Epoch 4: 100%|██████████| 158/158 [00:05<00:00, 27.35it/s]


Loss:797.48


Trainging Epoch 5: 100%|██████████| 158/158 [00:05<00:00, 27.49it/s]


Loss:768.96


Trainging Epoch 6: 100%|██████████| 158/158 [00:05<00:00, 27.53it/s]


Loss:746.11


Trainging Epoch 7: 100%|██████████| 158/158 [00:05<00:00, 27.37it/s]


Loss:727.10


Trainging Epoch 8: 100%|██████████| 158/158 [00:05<00:00, 27.50it/s]


Loss:710.85


Trainging Epoch 9: 100%|██████████| 158/158 [00:05<00:00, 27.58it/s]

Loss:696.69





Skip-Gram

In [14]:
class SkipGramDataset(Dataset):
    # 定义用于Skip-gram模型的数据集
    def __init__(self, vocab, corpus, context_size=2) -> None:
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence) - 1):
                # 从第二个单词开始，到倒数第二个词为之
                w = sentence[i]
                # 模型输入：当前词
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index]
                # 模型输出：上下文窗口内的共现词，如果窗口边缘超出了字符串左右，则截取到字符串尽头
                self.data.extend([(w, c) for c in context])
                # 此处使用extend，因为该列表里面是多个词对，每一个是一个输出

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

In [11]:
class SkipGramModule(nn.Module):
    # 定义一个Skip-gram的模型
    def __init__(self, vocab_size, embedding_dim) -> None:
        super(SkipGramModule, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.output = nn.Linear(embedding_dim, vocab_size, bias=False)
        # 线性层
        self.log = nn.LogSoftmax(dim=1)
        # 分类层

    def forward(self, inputs):
        embeds = self.embedding(inputs)
        output = self.output(embeds)
        log_probs = self.log(output)
        return log_probs

In [13]:
def collate_fn(examples):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
    targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    return (inputs, targets)

In [21]:
embedding_dim = 128 # 词向量维度
hidden_dim = 256 # 隐藏层维度
batch_size = 40960 # 批次大小
context_size = 3 # 上下文长度
num_epoch = 10 # 迭代次数

In [23]:
from torch.utils.data import DataLoader
vocab, corpus = load_reuters()
dataset = SkipGramDataset(vocab, corpus)
data_loader = DataLoader(dataset, batch_size, collate_fn=collate_fn)

Dataset Construction: 100%|██████████| 54711/54711 [00:02<00:00, 27211.21it/s]


In [24]:
# 设置ignore_index参数，以忽略PAD_TOKEN处的损失
nll_loss = nn.NLLLoss()
model = SkipGramModule(len(vocab), embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [25]:
model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc = f"Trainging Epoch {epoch}"):
        # for x in batch:
        #     for y in x:
        #         print(y)
        # inputs = torch.tensor(batch[0]).to(device)
        # targets = batch[1]
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")
    total_losses.append(total_loss)

# save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Trainging Epoch 0: 100%|██████████| 125/125 [00:15<00:00,  8.01it/s]


Loss:1168.62


Trainging Epoch 1: 100%|██████████| 125/125 [00:15<00:00,  8.04it/s]


Loss:931.68


Trainging Epoch 2: 100%|██████████| 125/125 [00:15<00:00,  8.02it/s]


Loss:824.61


Trainging Epoch 3: 100%|██████████| 125/125 [00:15<00:00,  7.97it/s]


Loss:787.20


Trainging Epoch 4: 100%|██████████| 125/125 [00:15<00:00,  7.99it/s]


Loss:768.44


Trainging Epoch 5: 100%|██████████| 125/125 [00:15<00:00,  7.94it/s]


Loss:756.21


Trainging Epoch 6: 100%|██████████| 125/125 [00:15<00:00,  7.96it/s]


Loss:747.16


Trainging Epoch 7: 100%|██████████| 125/125 [00:15<00:00,  7.95it/s]


Loss:740.00


Trainging Epoch 8: 100%|██████████| 125/125 [00:15<00:00,  7.93it/s]


Loss:734.09


Trainging Epoch 9: 100%|██████████| 125/125 [00:15<00:00,  7.90it/s]

Loss:729.07





使用负采样技术的Skip-Gram


负采样理论介绍：https://zhuanlan.zhihu.com/p/39684349

In [8]:
class SGNSDataset(Dataset):
# 用于负采样的Skip-Gram的数据集
# 我们在数据集构建时生成负样本
    def __init__(self, vocab, corpus, context_size=2, n_negatives=5, ns_dist = None) -> None:
        # n_negative指生成负样本个数
        # ns_dist指生成负样本分布，None为均匀分布
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc = "Dataset Construction"):
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence) - 1):
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index]
                # 模型输入为当前词和上下文的词对
                context += [self.pad] * (2 * context_size - len(context))
                # 需要对上下文进行补齐
                self.data.append((w, context))
                # 输出为0/1，标志是否为负样本

        self.n_negatives = n_negatives
        self.ns_dist = ns_dist if ns_dist == None else torch.ones(len(vocab))
        # None为均匀分布

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]


In [9]:
def collate_fn(examples, ns_dist, n_negatives):
    # 从独立样本集合中构建批次的输入输出，并转换为PyTorch张量
    words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
    contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
    batch_size, context_size = contexts.shape
    neg_contexts = []
    # 对批次内样本分别进行负采样
    for i in range(batch_size):
        ns_dist = ns_dist.index_fill(0, contexts[i], .0)
        # index_fill根据给定索引填充张量的值，此处用于保持context不变
        neg_contexts.append(torch.multinomial(ns_dist, n_negatives * context_size, replacement=True))
        # 对input的每一行做n_samples次取值，输出的张量是每一次取值时input张量对应行的下标
    neg_contexts = torch.stack(neg_contexts, dim=0)
    # stack用于拼接
    return words, contexts, neg_contexts

In [10]:
class SGNSModel(nn.Module):
    # 用于负采样技术的Skip-Gram模型
    # 该模型分别维护并训练词和上下文的向量层
    def __init__(self, vocab_size, embedding_dim) -> None:
        super(SGNSModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # 词向量层
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim)
        # 上下文词向量层
        self.output = nn.Linear(embedding_dim, vocab_size, bias=False)
        # 线性层
        self.log = nn.LogSoftmax(dim=1)
        # 分类层

    def forward_w(self, words):
        words_embeds = self.embedding(words)
        return words_embeds

    def forward_c(self, contexts):
        contexts_embeds = self.embedding(contexts)
        return contexts_embeds

In [11]:
# 计算训练预料中Unigram概率分布
def get_unigram_distribution(corpus, vocab_size):
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count)
    # 做除法，即将频次转化为频率
    return unigram_dist

In [39]:
embedding_dim = 128 # 词向量维度
batch_size = 10240 # 批次大小
context_size = 3 # 上下文长度
n_negatives = 5 # 负样本数量
num_epoch = 10 # 迭代次数

In [40]:
vocab, corpus = load_reuters()
unigram_dist = get_unigram_distribution(corpus, len(vocab))
# 计算unigram概率分布
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()
# 计算负采样分布
dataset = SGNSDataset(vocab, corpus, context_size, n_negatives, negative_sampling_dist)
data_loader = DataLoader(dataset, batch_size, collate_fn=lambda x : collate_fn(x, negative_sampling_dist, n_negatives))

Dataset Construction: 100%|██████████| 54711/54711 [00:06<00:00, 8933.66it/s] 


In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [42]:
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        words_embeds = model.forward_w(words).unsqueeze(dim=2)
        contexts_embeds = model.forward_c(contexts)
        neg_contexts_embeds = model.forward_c(neg_contexts)
        # 分别提取其向量表示
        context_loss = torch.nn.functional.logsigmoid(torch.bmm(contexts_embeds, words_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # bmm函数：矩阵乘法
        # 正样本的对数似然
        neg_context_loss = torch.nn.functional.logsigmoid(torch.bmm(neg_contexts_embeds, words_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # neg函数：返回负数
        # 负样本的对数似然
        loss = -(context_loss + neg_context_loss).mean()
        # 总损失
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss:{total_loss:.2f}")

combined_embeds = model.embedding.weight + model.context_embedding.weight
# 拼接两个词向量得到最终词向量

Training Epoch 0: 100%|██████████| 169/169 [02:27<00:00,  1.15it/s]


Loss:4128.38


Training Epoch 1: 100%|██████████| 169/169 [02:13<00:00,  1.27it/s]


Loss:3404.62


Training Epoch 2: 100%|██████████| 169/169 [02:13<00:00,  1.27it/s]


Loss:2866.49


Training Epoch 3: 100%|██████████| 169/169 [02:12<00:00,  1.28it/s]


Loss:2452.41


Training Epoch 4: 100%|██████████| 169/169 [02:11<00:00,  1.28it/s]


Loss:2134.62


Training Epoch 5: 100%|██████████| 169/169 [02:12<00:00,  1.28it/s]


Loss:1886.13


Training Epoch 6: 100%|██████████| 169/169 [02:12<00:00,  1.28it/s]


Loss:1688.63


Training Epoch 7: 100%|██████████| 169/169 [02:13<00:00,  1.26it/s]


Loss:1526.40


Training Epoch 8: 100%|██████████| 169/169 [02:15<00:00,  1.25it/s]


Loss:1390.79


Training Epoch 9: 100%|██████████| 169/169 [02:12<00:00,  1.28it/s]

Loss:1275.35



