In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [53]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import random
import math
import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

C = 3 #context window
K = 15 #number of negative samples
epochs = 2
MAX_VOCAB_SIZE = 10000 #选择语料库中出现次数最多的9999个词，还有一个词是<UNK>
EMBEDDING_SIZE = 100
batch_size = 32
lr = 0.2

# 读取文本数据并处理

In [54]:
with open("/content/drive/MyDrive/Pytorch实现Word2Vec/text8/text8.train.txt") as f:
    text = f.read()
text = text.lower().split() #分割成单词列表
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))#得到出现频次最高的9999个单词
vocab_dict["UNK"] = len(text) - np.sum(list(vocab_dict.values())) #把不常用的单词都编码为<UNK>
idx2word = [word for word in vocab_dict.keys()] #列表，词频最高的最前面
word2idx = {word:i for i, word in enumerate(idx2word)} #字典 {“word”:"num"}
word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32) #词出现的次数
word_freqs = word_counts / np.sum(word_counts) #计算每个词的词频，应该是一个一维向量
word_freqs = word_freqs ** (3./4.) #根据论文公式而来

# 实现DataLoader
1.把所有的word编码成数字

2.保存vocabulary,单词count，normalized word frequency

3.每个iteration sample一个中心词

4.根据当前的中心词返回context单词

5.根据中心词sample一些negative单词

6.返回单词的counts

In [55]:
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word2idx, idx2word, word_freqs, word_counts):
        super(WordEmbeddingDataset, self).__init__()
        #把单词数字化表示，如果不在词典中，表示为unk即9999， 如果在词典中根据频次数字表示“the”表示为0
        self.text_encoded = [word2idx.get(word, word2idx["UNK"]) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded) #nn.Embedding需要LongTensor格式
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
    
    def __len__(self):
        return len(self.text_encoded) #返回单词的总数
    
    def __getitem__(self, idx):
        """
        return:
            中心词
            这个单词附近的positive word
            随机采样的K个单词作为negative word
        """
        center_words = self.text_encoded[idx] #取得中心词
        #取中心词左右个C个单词的索引,C=3, e.g. idx=5 pos_indices=[2,3,4,6,7,8]
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        #避免索引越界，对索引进行取余处理
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        #根据索引得到单词索引(数字化的单词)- (Tensor(list)) - Tensor
        pos_words = self.text_encoded[pos_indices]
        
        
        #torch.multinomial(input, num_samples,replacement=False, out=None) → LongTensor
        """
        对input的每一行做n_samples次取值，输出的张量是每一次取值是input张量对应行的下标。
        input张量可以看成一个权重张量，每一个元素代表其在该行中的权重。如果有元素为0，那么在其他不为0的元素，被取干净之前，这个元素是不会被取到的。
        n_samples是每一行的取值次数，该值不能大于每一样的元素数，否则会报错。
        replacement指的是取样时是否是有放回的取样，True是有放回，False无放回。
        
        对self.word_freqs做K*pos_words.shape[0]次取值，输出是self.word_freqs对应的下标，
        取样方式采样有放回的采样，并且self.word_freqs的数值越大采样概率越大，
        每采样一个正确的单词(positive word)，就采样K个错误的单词(negative word),pos_word.shape[0]是正确单词的数量
        """
        neg_words = torch.multinomial(self.word_freqs, K*pos_words.shape[0], True)
        return center_words, pos_words, neg_words
    
dataset = WordEmbeddingDataset(text, word2idx, idx2word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size, shuffle=True)

In [56]:
"""打印dataloader"""
next(iter(dataset))#分别是center word， pos_words, neg_words

(tensor(4813),
 tensor([  50, 9999,  393, 3139,   11,    5]),
 tensor([   5,   75,   87,   35, 1426,   13,  800, 3077, 1624, 1006, 1094,  278,
         3221,  456,    8,   15, 2219,   49, 1107, 3127, 8119, 9999, 7043, 6564,
         5709,   26,   32, 9999, 4951,  672,  575,  638, 3867, 8034, 9676,   35,
         9235,  235,  765,  194,   42,  100,  336,  713,  544,  174, 4754,  257,
          383,  200, 2487,  118,  575, 5574, 3594,  175, 4531, 1028, 7201,   12,
         1634, 9999,   19,   94,  214, 1130,  750,  983, 1229,  134,   31, 1364,
           92, 1282,  703, 2276, 1284,   75, 2483, 5681,    0,  163, 8309, 1330,
         4943, 4181, 3415, 9999, 3374,   86]))

# 定义PyTorch模型
定义两个embedding层来训练，对于一个任意词，既可以作为中心词，又可以作为背景词，所以每个词需要两个词向量表示，
in_embed训练出来的权重就是每个IC作为中心词的权重，out_embed训练出来的权重就是作为背景词的权重，根据论文推荐使用
中心词向量，所以最后返回in_embed.weight

In [57]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)
    
    def forward(self, input_labels, pos_labels, neg_labels):
        """
        input_labels:center words, [batch_size]
        pos_labels:positive words, [batch_size, (window_size * 2)]
        neg_labels:negative words, [batch_size, (window_size * 2 * K)]
        """
        input_embedding = self.in_embed(input_labels) #[batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels) #[batch_size, (window_size*2), embed_size]
        neg_embedding = self.out_embed(neg_labels) #[batch_size, (window_size*2*K), embed_size]
        
        input_embedding = input_embedding.unsqueeze(2) #[batch_size, embed_size, 1]
        
        pos_dot = torch.bmm(pos_embedding, input_embedding) #[batch_size, (window_size*2), 1]
        pos_dot = pos_dot.squeeze(2) #[batch_size, (window_size*2)]
        
        neg_dot = torch.bmm(neg_embedding, input_embedding) #[batch_size, (window_size*2*K), 1]
        neg_dot = neg_dot.squeeze(2) #[batch_size, (window_size*2*K)]
        
        log_pos = F.logsigmoid(pos_dot).sum(1) # .sum()结果只为一个数，.sum(1)结果是一维的张量
        log_neg = F.logsigmoid(neg_dot).sum(1)
        
        loss = log_pos + log_neg
        return -loss
    
    def input_embedding(self):
        return self.in_embed.weight.data.cpu().numpy()
    

# 训练模型

In [58]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr = lr)
model

EmbeddingModel(
  (in_embed): Embedding(10000, 100)
  (out_embed): Embedding(10000, 100)
)

In [59]:
for e in range(1):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long().to(device)
        pos_labels = pos_labels.long().to(device)
        neg_labels = neg_labels.long().to(device)

        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()

        optimizer.step()

        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item())




epoch 0 iteration 0 372.61968994140625
epoch 0 iteration 100 224.0306854248047
epoch 0 iteration 200 174.24728393554688
epoch 0 iteration 300 188.07968139648438
epoch 0 iteration 400 126.84024047851562
epoch 0 iteration 500 152.59014892578125
epoch 0 iteration 600 157.2384490966797
epoch 0 iteration 700 113.7626953125
epoch 0 iteration 800 110.3435287475586
epoch 0 iteration 900 96.40852355957031
epoch 0 iteration 1000 107.94926452636719
epoch 0 iteration 1100 115.32440185546875
epoch 0 iteration 1200 82.22579956054688
epoch 0 iteration 1300 93.9762954711914
epoch 0 iteration 1400 53.61088562011719
epoch 0 iteration 1500 88.42274475097656
epoch 0 iteration 1600 74.04023742675781
epoch 0 iteration 1700 35.101097106933594
epoch 0 iteration 1800 60.540199279785156
epoch 0 iteration 1900 83.75331115722656
epoch 0 iteration 2000 50.66320037841797
epoch 0 iteration 2100 46.033905029296875
epoch 0 iteration 2200 55.27130126953125
epoch 0 iteration 2300 95.88961791992188
epoch 0 iteration 2400

In [63]:
embedding_weights = model.input_embedding()
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

In [61]:
def find_nearest(word):
  index = word2idx[word]
  embedding = embedding_weights[index]
  cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
  return [idx_to_word[i] for i in cos_dis.argsort()[:10]]


In [62]:
for word in ["two", "america", "computer"]:
  print(word, find_nearest(word))

two ['two', 'zero', 'five', 'three', 'four', 'six', 'one', 'seven', 'eight', 'nine']
america ['america', 'europe', 'east', 'western', 'south', 'north', 'city', 'central', 'west', 'africa']
computer ['computer', 'games', 'based', 'software', 'free', 'game', 'personal', 'systems', 'research', 'information']
