In [1]:
import collections
import math
import random
import time
import Functions as d2l
import numpy as np
import os
import torch
from torch import nn
import torch.utils.data as Data


In [2]:
# The PTB training split must already be present in the local datasets folder.
assert 'ptb.train.txt' in os.listdir('./datasets/ptb')

# Each line of the file is one sentence; splitting on whitespace yields its tokens.
with open('./datasets/ptb/ptb.train.txt') as f:
    lines = f.readlines()
raw_dataset = list(map(str.split, lines))

# Number of sentences read.
len(raw_dataset)

42068

In [3]:
# Peek at the first three sentences: token count followed by the first five tokens.
for sentence in raw_dataset[:3]:
    print('# tokens:', len(sentence), sentence[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [4]:
# Keep only the words that occur at least 5 times in the corpus.
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {token: freq for token, freq in counter.items() if freq >= 5}

In [5]:
# Vocabulary: a word's position in idx_to_token is its integer id.
idx_to_token = list(counter)
token_to_idx = {token: position for position, token in enumerate(idx_to_token)}

# Re-encode every sentence as a list of word ids, dropping tokens that are not
# in the vocabulary. This is the comprehension form of:
#     dataset = []
#     for st in raw_dataset:
#         inside_list = []
#         for tk in st:
#             if tk in token_to_idx:
#                 inside_list.append(token_to_idx[tk])
#         dataset.append(inside_list)
dataset = [
    [token_to_idx[tk] for tk in st if tk in token_to_idx]
    for st in raw_dataset
]
# Total number of in-vocabulary tokens across all sentences.
num_tokens = sum(len(st) for st in dataset)

In [6]:
# Subsampling of frequent words.

def discard(idx):
    """Randomly decide whether one occurrence of the word with id ``idx`` is dropped.

    The word is dropped when a uniform draw from [0, 1] falls below
    ``1 - sqrt(1e-4 / count * num_tokens)``, where ``count`` is the word's
    corpus count taken from the module-level ``counter`` and ``num_tokens`` is
    the module-level total token count. When that threshold is not positive the
    function always returns False.

    Args:
        idx: integer word id (an index into ``idx_to_token``).

    Returns:
        True if this occurrence should be discarded, False to keep it.
    """
    word_count = counter[idx_to_token[idx]]
    drop_threshold = 1 - math.sqrt(1e-4 / word_count * num_tokens)
    return random.uniform(0, 1) < drop_threshold

# A token stays in the corpus only when discard() does not ask to drop it.
subsampled_dataset = [
    [tk for tk in st if not discard(tk)]
    for st in dataset
]
# The count varies from run to run because discard() is random, e.g. '# tokens: 375875'.
'# tokens: %d' % sum(len(st) for st in subsampled_dataset)


'# tokens: 375722'

In [7]:
# Compare how often a word occurs before and after subsampling.
def compare_counts(token):
    """Report the occurrence count of ``token`` before and after subsampling.

    Args:
        token: a word that is a key of the module-level ``token_to_idx``.

    Returns:
        A string of the form ``'# <token>: before=<n>, after=<m>'`` where the
        counts are taken over ``dataset`` and ``subsampled_dataset``.
    """
    idx = token_to_idx[token]
    before = sum(st.count(idx) for st in dataset)
    after = sum(st.count(idx) for st in subsampled_dataset)
    return '# {}: before={}, after={}'.format(token, before, after)

# One very frequent word and one rare word, to show the effect of subsampling.
for word in ('the', 'join'):
    print(compare_counts(word))

# the: before=50770, after=2210
# join: before=45, after=45


In [8]:
# Extract centre words and their context words.
def get_centers_and_contexts(dataset, max_window_size):
    """Pair every word with the words inside a randomly sized window around it.

    Args:
        dataset: iterable of sentences, each a list of word ids.
        max_window_size: upper bound (inclusive) of the window radius; the
            radius for each centre word is drawn uniformly from
            ``1..max_window_size``.

    Returns:
        ``(centers, contexts)`` where ``centers`` is a flat list of word ids and
        ``contexts[i]`` is the list of context word ids for ``centers[i]``.
        Sentences with fewer than two words are skipped, because they cannot
        form a centre/context pair.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        if length < 2:
            continue
        centers.extend(sentence)
        for pos in range(length):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(length, pos + 1 + radius)
            # Every position in the window except the centre word itself.
            contexts.append([sentence[j] for j in range(lo, hi) if j != pos])
    return centers, contexts
    

In [9]:
# Build a small artificial dataset to check the extraction by eye.
tiny_dataset = [list(range(7)), list(range(7, 10))]
print('dataset', tiny_dataset)
tiny_centers, tiny_contexts = get_centers_and_contexts(tiny_dataset, 2)
for center, context in zip(tiny_centers, tiny_contexts):
    print('center: {}, has contexts: {}'.format(center, context))

dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
center: 0, has contexts: [1, 2]
center: 1, has contexts: [0, 2, 3]
center: 2, has contexts: [1, 3]
center: 3, has contexts: [1, 2, 4, 5]
center: 4, has contexts: [3, 5]
center: 5, has contexts: [3, 4, 6]
center: 6, has contexts: [4, 5]
center: 7, has contexts: [8, 9]
center: 8, has contexts: [7, 9]
center: 9, has contexts: [8]


In [21]:
# Centre/context pairs for the whole subsampled corpus, window radius at most 5.
all_centers, all_contexts = get_centers_and_contexts(
    subsampled_dataset, max_window_size=5)

### 负采样

In [11]:
def get_negatives(all_contexts, sampling_weights, K):
    """Draw ``K`` noise words per context word for every centre word.

    Args:
        all_contexts: list of context lists (word ids), one per centre word.
        sampling_weights: relative sampling weight of each word id; the word id
            is the position in this sequence.
        K: number of noise words to draw for each context word.

    Returns:
        A list parallel to ``all_contexts``; entry ``i`` holds
        ``len(all_contexts[i]) * K`` word ids, none of which appears in
        ``all_contexts[i]``.

    Note:
        The loop never ends if every word with a non-zero weight is one of the
        context words of some non-empty context list.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set once per centre word rather than once per
        # candidate draw; it does not change inside the while loop.
        excluded = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # Candidate cache exhausted: draw a large batch of word ids at
                # once according to sampling_weights, which is much cheaper
                # than calling random.choices for each single noise word.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Noise-word sampling weight of each word: its corpus count raised to the power 0.75.
sampling_weights = [counter[token] ** 0.75 for token in idx_to_token]
# Five noise words are drawn for every context word.
all_negatives = get_negatives(all_contexts, sampling_weights, K=5)
            

### 读取数据

In [63]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel lists of centres, contexts and negatives.

    Item ``i`` is the tuple ``(centers[i], contexts[i], negatives[i])``.
    """

    def __init__(self, centers, contexts, negatives):
        """Store the three sequences; they must all have the same length."""
        assert len(centers) == len(contexts) == len(negatives)
        self.centers, self.contexts, self.negatives = centers, contexts, negatives

    def __len__(self):
        """Number of centre words, i.e. number of examples."""
        return len(self.centers)

    def __getitem__(self, index):
        """Return the (centre, contexts, negatives) triple at ``index``."""
        center = self.centers[index]
        context = self.contexts[index]
        negative = self.negatives[index]
        return (center, context, negative)
    
    
# NOTE: this rebinds `dataset`, which until now held the list of id-encoded sentences.
dataset = MyDataset(centers=all_centers, contexts=all_contexts,
                    negatives=all_negatives)

In [67]:
# Scratch cell: `+` on lists concatenates them into a new list, here [5, 4, 6].
a = []+[5,4]+[6]
# extend() appends each element of its argument to `a` in place.
a.extend([4,45,4])
# Concatenating a list that holds one list adds that inner list as a single element.
[] + [[1,1,1]]

[[1, 1, 1]]

In [68]:
# Mini-batch collation function.
def batchify(data):
    """Collate (centre, contexts, negatives) triples into padded tensors.

    Intended as the ``collate_fn`` of a DataLoader: ``data`` is a list with one
    element per example, each being what the dataset's ``__getitem__`` returned.

    Args:
        data: list of ``(center, context, negative)`` where ``context`` and
            ``negative`` are lists of word ids.

    Returns:
        A 4-tuple of tensors:
        - centres, viewed as shape (len(data), 1);
        - contexts followed by negatives, right-padded with 0 to the longest
          combined length in the batch;
        - masks, 1 on real entries and 0 on padding;
        - labels, 1 on context positions and 0 elsewhere.
    """
    # Examples differ in how many context and noise words they carry, so
    # everything is padded to the longest combined length in this batch.
    max_len = max(len(ctx) + len(neg) for _, ctx, neg in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, ctx, neg in data:
        num_ctx = len(ctx)
        num_real = num_ctx + len(neg)
        num_pad = max_len - num_real
        centers.append(center)
        contexts_negatives.append(ctx + neg + [0] * num_pad)
        # Mask is 0 exactly where contexts_negatives holds padding.
        masks.append([1] * num_real + [0] * num_pad)
        labels.append([1] * num_ctx + [0] * (max_len - num_ctx))
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))


In [70]:
batch_size = 512
num_workers = 4
# Wrap the (centre, contexts, negatives) lists and batch them with batchify,
# which pads every example in a batch to a common length.
dataset = MyDataset(all_centers, all_contexts, all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify, num_workers=num_workers)
# Print the tensor shapes of the first batch only, as a sanity check.
for batch in data_iter:
    # Label spelling fixed: the first tensor holds the centre words.
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'], batch):
        print(name, 'shape:', data.shape)
    break

conters shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


### skip-gram

In [75]:
# A small embedding layer for illustration: 20 rows (one per index), 4 columns each.
embed = nn.Embedding(20, 4)
# The weight matrix is randomly initialised, so the displayed values differ between runs.
embed.weight

Parameter containing:
tensor([[-0.7230,  0.3931, -0.6425,  0.2207],
        [ 0.1613, -0.3123,  1.3286,  0.2694],
        [ 0.9413, -0.8451, -0.4251,  2.0284],
        [-0.1609,  0.0389, -0.0814, -1.4092],
        [ 0.5249, -0.6877,  1.2749, -0.0268],
        [-0.7252,  1.6458,  2.9459,  0.2996],
        [-0.2320,  0.0997, -1.0951, -1.5036],
        [-0.3462, -0.0782,  1.2879,  0.3700],
        [ 0.3870, -1.5230, -2.1753, -0.5353],
        [ 0.1551,  2.1947, -0.1158, -0.2787],
        [-1.1749, -0.7988, -2.6129,  0.0148],
        [-1.7559, -0.2019,  0.6404, -0.5579],
        [ 0.1842,  0.2501,  1.3361,  1.3462],
        [-0.7281,  1.1711,  0.0310,  1.0496],
        [ 0.3765,  1.2168, -0.5250, -0.0271],
        [-0.4480,  1.0740,  0.2213, -1.5208],
        [ 1.7914, -0.2365, -0.0797, -1.3140],
        [ 1.6254, -1.0398, -0.7819, -0.2055],
        [-0.9918,  0.5988,  1.1078,  0.4426],
        [ 2.9781,  0.7062, -2.2642, -1.4162]], requires_grad=True)

In [77]:
# Index tensor of shape (2, 3); each entry selects one row of embed.weight.
x = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
# The layer must be called on the indices; calling embed() with no argument raises TypeError.
# The result has the shape of x with one trailing dimension of size embedding_dim.
embed(x)

tensor([[[ 0.1613, -0.3123,  1.3286,  0.2694],
         [ 0.9413, -0.8451, -0.4251,  2.0284],
         [-0.1609,  0.0389, -0.0814, -1.4092]],

        [[ 0.5249, -0.6877,  1.2749, -0.0268],
         [-0.7252,  1.6458,  2.9459,  0.2996],
         [-0.2320,  0.0997, -1.0951, -1.5036]]], grad_fn=<EmbeddingBackward>)

In [78]:
# Batched matrix multiply: (2, 1, 4) times (2, 4, 6) gives (2, 1, 6).
X = torch.ones(2, 1, 4)
Y = torch.ones(2, 4, 6)
torch.bmm(X, Y).shape

torch.Size([2, 1, 6])

In [80]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Skip-gram forward pass: dot product of each centre with its candidate words.

    Args:
        center: index tensor of centre words, (batch_size, 1).
        contexts_and_negatives: index tensor of context and noise words,
            (batch_size, max_len).
        embed_v: embedding layer applied to the centre words.
        embed_u: embedding layer applied to the context and noise words.

    Returns:
        Tensor of shape (batch_size, 1, max_len) holding the inner product of
        each centre vector with each of its context/noise vectors.
    """
    center_vecs = embed_v(center)                    # (batch_size, 1, embedding_dim)
    other_vecs = embed_u(contexts_and_negatives)     # (batch_size, max_len, embedding_dim)
    # Swap the last two axes so bmm contracts over embedding_dim.
    return torch.bmm(center_vecs, other_vecs.transpose(1, 2))

### 训练模型

In [81]:
# Binary cross-entropy loss on logits, with an optional per-element mask.
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    """Per-row mean of the element-wise sigmoid binary cross-entropy."""

    def __init__(self):
        super().__init__()

    def forward(self, inputs, targets, mask=None):
        """Compute the masked loss for each row.

        Args:
            inputs: logits, shape (batch_size, len).
            targets: labels with the same shape as ``inputs``.
            mask: optional tensor with the same shape as ``inputs``, used as a
                per-element weight (0 removes an element's contribution).
                When omitted, every element has weight 1.

        Returns:
            Tensor of shape (batch_size,): the weighted element losses averaged
            over dim 1. The average divides by ``len``, not by the number of
            unmasked elements, so callers rescale if they need that.
        """
        inputs, targets = inputs.float(), targets.float()
        # The signature allows mask=None, so it must not be converted blindly.
        weight = None if mask is None else mask.float()
        result = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none', weight=weight)
        return result.mean(dim=1)
loss = SigmoidBinaryCrossEntropyLoss()


In [82]:
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# In `label`, 1 marks a context word and 0 marks a noise word.
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]])
# Mask: 0 marks a padded position that must not contribute to the loss.
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]])
# loss() averages over the full row length, so rescale each row to an average
# over its unmasked entries only.
row_width = mask.shape[1]
valid_per_row = mask.float().sum(dim=1)
loss(pred, label, mask) * row_width / valid_per_row


tensor([0.8740, 1.2100])

In [84]:
embed_size = 100
vocab_size = len(idx_to_token)
# Two embedding tables of identical shape; the training loop passes net[0] to
# skip_gram for centre words and net[1] for context/noise words.
net = nn.Sequential(
    nn.Embedding(vocab_size, embed_size),
    nn.Embedding(vocab_size, embed_size),
)


In [85]:
def train(net, lr, num_epochs, device=None):
    """Train the skip-gram embeddings with Adam on the module-level ``data_iter``.

    Args:
        net: indexable module; ``net[0]`` embeds centre words and ``net[1]``
            embeds context/noise words.
        lr: learning rate for Adam.
        num_epochs: number of passes over ``data_iter``.
        device: optional ``torch.device`` or device string. When omitted,
            ``cuda:3`` is used if that GPU exists (the notebook's original
            choice), otherwise the first GPU, otherwise the CPU.

    Prints the chosen device once, then the mean batch loss and wall-clock time
    after every epoch.
    """
    if device is None:
        # Do not assume a four-GPU machine: fall back instead of crashing.
        if torch.cuda.is_available():
            device = 'cuda:3' if torch.cuda.device_count() > 3 else 'cuda:0'
        else:
            device = 'cpu'
    device = torch.device(device)
    print(device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, ls_sum, n = time.time(), 0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]
            pred = skip_gram(center, context_negative, net[0], net[1])

            # The mask keeps padded positions out of the loss. loss() averages
            # over the padded row length, so rescale each row to an average
            # over its real entries before averaging across the batch.
            ls = (loss(pred.view(label.shape), label, mask)
                  * mask.shape[1] / mask.float().sum(dim=1)).mean()
            optimizer.zero_grad()
            ls.backward()
            optimizer.step()
            ls_sum += ls.cpu().item()
            n += 1
        print('epoch {}, loss {:.3f}, time {:.3f}s'.format(epoch + 1, ls_sum/n, time.time()-start))

In [86]:
# Ten epochs of Adam with learning rate 0.01.
train(net, lr=0.01, num_epochs=10)

cuda:3
epoch 1, loss 1.991, time 7.748s
epoch 2, loss 0.654, time 7.189s
epoch 3, loss 0.478, time 8.164s
epoch 4, loss 0.423, time 8.465s
epoch 5, loss 0.397, time 8.162s
epoch 6, loss 0.381, time 8.215s
epoch 7, loss 0.368, time 8.098s
epoch 8, loss 0.359, time 8.617s
epoch 9, loss 0.350, time 8.442s
epoch 10, loss 0.342, time 8.560s


In [88]:
def get_similar_tokens(query_token, k, embed):
    """Print the ``k`` words whose embeddings are most cosine-similar to a query word.

    Args:
        query_token: a word that is a key of the module-level ``token_to_idx``.
        k: number of neighbours to print.
        embed: embedding layer whose ``weight`` rows are the word vectors.

    The top ``k + 1`` matches are computed and the first one is skipped, on the
    expectation that the best match is the query word itself.
    """
    vectors = embed.weight.data
    query_vec = vectors[token_to_idx[query_token]]
    # Product of squared norms; 1e-9 guards the square root against a zero vector.
    sq_norm_product = torch.sum(vectors * vectors, dim=1) * torch.sum(query_vec * query_vec)
    cos = torch.matmul(vectors, query_vec) / (sq_norm_product + 1e-9).sqrt()
    _, best = torch.topk(cos, k=k+1)
    neighbours = best.cpu().numpy()[1:]
    for i in neighbours:
        print('cosine sim={:.3f}:{}'.format(cos[i], (idx_to_token[i])))
        
# Three nearest neighbours of 'chip' in the centre-word embedding table.
get_similar_tokens(query_token='chip', k=3, embed=net[0])

cosine sim=0.482:chips
cosine sim=0.428:ibm
cosine sim=0.428:memory
