In [1]:
import collections
import math
import torch
from torch import nn
import torch.utils.data as Data
import random
import sys
import time
import numpy as np

## 1.处理数据集

In [2]:
# Read the PTB training split: one sentence per line, tokens separated by whitespace.
with open('../SGNS/data/ptb/ptb.train.txt', 'r') as f:
    lines = list(f)
print(lines[:3])  # peek at the first three raw lines
raw_dataset = list(map(str.split, lines))

'# sentences: %d' % len(raw_dataset)

[' aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter \n', ' pierre <unk> N years old will join the board as a nonexecutive director nov. N \n', ' mr. <unk> is chairman of <unk> n.v. the dutch publishing group \n']


'# sentences: 42068'

In [3]:
# Keep only the tokens that occur at least 5 times in the corpus.
counter = collections.Counter(token for sentence in raw_dataset for token in sentence)
counter = {token: count for token, count in counter.items() if count >= 5}

# Map every kept token to an integer index (in the counter's iteration order).
idx_to_token = list(counter)
print('# 词典大小： %d' % len(idx_to_token))
token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
# Re-encode each sentence as indices; tokens dropped from the vocabulary are skipped.
dataset = [
    [token_to_idx[token] for token in sentence if token in token_to_idx]
    for sentence in raw_dataset
]
print(dataset[:3])
num_tokens = sum(len(sentence) for sentence in dataset)
'# tokens: %d' % num_tokens

# 词典大小： 9858
[[], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2], [14, 1, 15, 16, 17, 1, 18, 7, 19, 20, 21]]


'# tokens: 887100'

In [4]:
# Subsampling of frequent words, with threshold t = 1e-4.
def discard(idx):
    """Randomly decide whether one occurrence of token index ``idx`` is dropped.

    Reads the module-level ``counter``, ``idx_to_token`` and ``num_tokens``.
    With f = count / num_tokens, the token is discarded when a uniform draw
    from [0, 1] is below ``1 - sqrt(1e-4 / f)``; when that bound is not
    positive the token is always kept.

    Returns:
        bool: True if the occurrence should be discarded.
    """
    draw = random.uniform(0, 1)
    count = counter[idx_to_token[idx]]
    keep_bound = math.sqrt(1e-4 / count * num_tokens)
    return draw < 1 - keep_bound

# Apply the random discard rule to every token occurrence, sentence by sentence.
subsampled_dataset = []
for sentence in dataset:
    subsampled_dataset.append([kept for kept in sentence if not discard(kept)])
'# 二次采样后 tokens: %d' % sum(len(sentence) for sentence in subsampled_dataset)

'# 二次采样后 tokens: 375406'

In [5]:
# Extract center words and their context words.
# The window-size scheme follows section 3.2 (Continuous Skip-gram Model) of the paper
# "Efficient Estimation of Word Representations in Vector Space".
def get_centers_and_contexts(dataset, max_window_size):
    """Build parallel lists of center words and their context-word lists.

    Args:
        dataset: iterable of sentences, each an indexable sequence of tokens.
        max_window_size: upper bound (inclusive) of the window radius; for every
            center word a radius is drawn uniformly from 1..max_window_size.

    Returns:
        (centers, contexts): ``centers`` is the flat list of tokens from every
        sentence with at least two tokens; ``contexts[i]`` lists the tokens
        within the drawn radius of ``centers[i]``, the center itself excluded.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        # A sentence needs at least two tokens to form a (center, context) pair.
        if length < 2:
            continue
        centers += sentence
        for pos in range(length):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(length, pos + 1 + radius)
            # Every position in [lo, hi) except the center position itself.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts

# Build the training pairs from the subsampled corpus with a maximum window radius of 10.
all_centers, all_contexts = get_centers_and_contexts(dataset=subsampled_dataset,
                                                     max_window_size=10)

## 2.Negative Sampling

In [6]:
# Negative sampling: draw K noise words for every context word of each center word.
def get_negatives(all_contexts, sampling_weights, K):
    """Sample noise-word indices for each list of context words.

    Args:
        all_contexts: list of context-word index lists, one per center word.
        sampling_weights: relative (unnormalised) sampling weight of each word
            index; index ``i`` is drawn proportionally to ``sampling_weights[i]``.
        K: number of noise words to draw per context word.

    Returns:
        A list parallel to ``all_contexts``; entry ``i`` holds
        ``len(all_contexts[i]) * K`` indices, none of which appear in
        ``all_contexts[i]``.

    Note:
        The loop only ends once enough non-context words have been drawn, so a
        context list that covers every index with non-zero weight never
        terminates.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Built once per center word instead of once per candidate draw.
        excluded = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # Candidate pool exhausted: draw a fresh batch of 1e5 weighted
                # samples at once, which is cheaper than sampling one by one.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Noise distribution: each word's unnormalised weight is its count raised to the 3/4 power.
sampling_weights = [counter[token] ** 0.75 for token in idx_to_token]
# K = 5 noise words per context word.
all_negatives = get_negatives(all_contexts, sampling_weights, K=5)

## 3.读取数据

In [7]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel center / context / negative sequences.

    Item ``i`` is the tuple ``(centers[i], contexts[i], negatives[i])``.
    The three sequences must have equal length (checked with ``assert``).
    """

    def __init__(self, centers, contexts, negatives):
        sizes = (len(centers), len(contexts), len(negatives))
        assert sizes[0] == sizes[1] == sizes[2]
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        # Number of samples equals the number of center words.
        return len(self.centers)

    def __getitem__(self, index):
        sample = (self.centers[index], self.contexts[index], self.negatives[index])
        return sample

In [8]:
# Mini-batch collate function.
# Concatenates each sample's context words and noise words, then pads with 0
# so that every row in the batch has the same length.
def batchify(data):
    """Collate a list of ``(center, context, negative)`` samples into tensors.

    Intended as the ``collate_fn`` of a DataLoader: ``data`` is a list of
    batch-size length whose elements are what the dataset's ``__getitem__``
    returns.

    Returns:
        A 4-tuple of tensors ``(centers, contexts_negatives, masks, labels)``:
        - centers: shape (batch, 1), the center word of each sample.
        - contexts_negatives: shape (batch, max_len), context words followed by
          noise words followed by 0-padding.
        - masks: shape (batch, max_len), 1 for real entries, 0 for padding.
        - labels: shape (batch, max_len), 1 for context words, 0 otherwise.
    """
    max_len = max(len(context) + len(negative) for _, context, negative in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        num_pos = len(context)
        num_real = num_pos + len(negative)
        num_pad = max_len - num_real
        centers.append(center)
        contexts_negatives.append(context + negative + [0] * num_pad)
        masks.append([1] * num_real + [0] * num_pad)
        # Only the leading context words are positives; noise and padding are 0.
        labels.append([1] * num_pos + [0] * (max_len - num_pos))
    return (
        torch.tensor(centers).view(-1, 1),
        torch.tensor(contexts_negatives),
        torch.tensor(masks),
        torch.tensor(labels),
    )

batch_size = 512

# Bound to `train_dataset` rather than `dataset`: the name `dataset` already holds
# the token-index sentences built during preprocessing, and overwriting it would
# break re-running the subsampling cell without a kernel restart.
train_dataset = MyDataset(all_centers,
                          all_contexts,
                          all_negatives)
# Shuffled mini-batches; `batchify` pads and stacks the variable-length samples.
data_iter = Data.DataLoader(train_dataset, batch_size, shuffle=True,
                            collate_fn=batchify,
                            num_workers=0)

## 4.Skip-gram Model
- 参考链接：[Word2Vec Tutorial - The Skip-Gram Model](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/)

In [9]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Score every (center, candidate) pair by the dot product of their embeddings.

    Args:
        center: integer index tensor of shape (batch, m); ``batchify`` produces m = 1.
        contexts_and_negatives: integer index tensor of shape (batch, n).
        embed_v: embedding layer applied to the center indices.
        embed_u: embedding layer applied to the context/noise indices.

    Returns:
        Tensor of shape (batch, m, n) holding the raw dot products (logits).
    """
    center_vecs = embed_v(center)                    # (batch, m, dim)
    candidate_vecs = embed_u(contexts_and_negatives)  # (batch, n, dim)
    # Batched matrix product against the transposed candidates gives all dot products.
    return torch.bmm(center_vecs, candidate_vecs.transpose(1, 2))

## 5.训练模型

In [10]:
# Build two separate embedding layers: net[0] for center words, net[1] for context words.
embed_size = 100  # the paper uses 300; reduced to 100 because training is slow
# nn.Sequential serves only as an indexable container here; the two layers are
# looked up as net[0] / net[1] and are never chained.
net = nn.Sequential(*(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)
    for _ in range(2)
))

In [11]:
def train(net, lr, num_epochs):
    """Train the two embedding layers in ``net`` with Adam on ``data_iter``.

    Reads the module-level ``data_iter`` for mini-batches and calls the
    module-level ``skip_gram`` with ``net[0]`` (center embeddings) and
    ``net[1]`` (context embeddings). Prints the device once, then the mean
    batch loss and wall-clock time after each epoch.

    Args:
        net: indexable module holding the two embedding layers.
        lr: learning rate passed to Adam.
        num_epochs: number of passes over ``data_iter``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    bce_with_logits = nn.functional.binary_cross_entropy_with_logits
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = (t.to(device) for t in batch)

            pred = skip_gram(center, context_negative, net[0], net[1])

            # The mask zeroes the loss of padded positions; each row's mean is then
            # rescaled by (row width / number of real entries) so that it averages
            # over the real entries only.
            mask_f = mask.float()
            elementwise = bce_with_logits(pred.view(label.shape).float(), label.float(),
                                          reduction="none", weight=mask_f)
            per_sample = elementwise.mean(dim=1) * mask.shape[1] / mask_f.sum(dim=1)
            loss = per_sample.mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            l_sum += loss.cpu().item()
            n += 1
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, l_sum / n, time.time() - start))

In [12]:
# Learning rate 0.01 for 10 epochs.
train(net, lr=0.01, num_epochs=10)

train on cpu
epoch 1, loss 1.83, time 294.99s
epoch 2, loss 0.59, time 297.45s
epoch 3, loss 0.45, time 313.11s
epoch 4, loss 0.40, time 318.58s
epoch 5, loss 0.38, time 329.73s
epoch 6, loss 0.37, time 342.81s
epoch 7, loss 0.36, time 351.97s
epoch 8, loss 0.35, time 310.55s
epoch 9, loss 0.35, time 308.59s
epoch 10, loss 0.34, time 319.78s


## 6.应用训练好的词嵌入模型

In [13]:
# Print the k words whose embeddings are most cosine-similar to a query word.
def get_similar_tokens(query_token, k, embed):
    """Print the ``k`` nearest neighbours of ``query_token`` by cosine similarity.

    Reads the module-level ``token_to_idx`` and ``idx_to_token`` mappings.

    Args:
        query_token: word to look up; must be a key of ``token_to_idx``.
        k: number of neighbours to print (fewer are printed when the
            vocabulary has fewer than ``k + 1`` entries).
        embed: embedding layer whose ``weight`` rows are the word vectors.

    Raises:
        KeyError: if ``query_token`` is not in ``token_to_idx``.
    """
    print('和%s最接近的词：'%query_token)
    W = embed.weight.data
    query_idx = token_to_idx[query_token]
    x = W[query_idx]
    # 1e-9 keeps the denominator non-zero for all-zero vectors.
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    # Ask for one extra candidate so that k remain once the query is removed;
    # clamp to the vocabulary size so topk never requests more rows than exist.
    num_candidates = min(k + 1, cos.shape[0])
    _, topk = torch.topk(cos, k=num_candidates)
    # Drop the query by index rather than by rank: because of the epsilon above,
    # a parallel vector with a larger norm can score higher than the query itself.
    neighbours = [i for i in topk.cpu().tolist() if i != query_idx][:k]
    for i in neighbours:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))
      
# Show the 3 nearest neighbours of two sample words using the center-word embeddings.
for query in ('chip', 'company'):
    get_similar_tokens(query, 3, net[0])

和chip最接近的词：
cosine sim=0.493: intel
cosine sim=0.489: supercomputer
cosine sim=0.444: technology
和company最接近的词：
cosine sim=0.552: purchase
cosine sim=0.528: stake
cosine sim=0.509: entertainment
