In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
from collections import Counter

from sklearn.datasets import fetch_20newsgroups

%matplotlib inline

# 数据集1 (来源 宋艳青 博士处理后数据)
## 原始数据读取

In [None]:
data1 = open(r'/mnt/d/Dataset/20newgroup/20ng-train-all-terms.txt', encoding='utf-8').readlines()
data2 = open(r'/mnt/d/Dataset/20newgroup/20ng-test-all-terms.txt', encoding='utf-8').readlines()

data = []
[data.append(d.split('\t')[1]) for d in data1]
[data.append(d.split('\t')[1]) for d in data2]

vectorizer = CountVectorizer()
vector = vectorizer.fit_transform(data)
print("训练数据共有{0}篇, 词汇计数为{1}个".format(vector.shape[0], vector.shape[1]))

vocabulary = vectorizer.vocabulary_
VOCAB_SIZE = len(vocabulary)

reverse_vocab = {v:k for k,v in vocabulary.items()}

## 生成训练数据

In [None]:
def generate_samples(data):
    corpus = ''.join(data).split()
    LEN = len(corpus)
    
    samples = []
    for i,center_word in enumerate(corpus):
        if i-2>=0:
            samples.append((corpus[i-1],center_word))
            samples.append((corpus[i-2],center_word))
        if i+2<LEN:
            samples.append((corpus[i+1],center_word))
            samples.append((corpus[i+2],center_word))
    return samples

def trans_samples(data, vocab):
    [(vocab.get(context_word), vocab.get(center_word)) for (context_word,center_word) in data]
    
    
inputs = generate_samples(data)

features = [(vocabulary.get(context_word), vocabulary.get(center_word)) for (context_word,center_word) in inputs]

print(features[0])
random.shuffle(features)
print(features[0])


samples_train = features[:19900000]
samples_test = features[19900000:]

x_train,y_train,x_test,y_test = [],[],[],[]

for (x,y) in samples_train:
    if x != None:
        if y != None:
            x_train.append(x)
            y_train.append(y)

for (x,y) in samples_test:
    if x != None:
        if y != None:
            x_test.append(x)
            y_test.append(y)

# word2vec 模型1  (自己实现)
模型后使用数据2训练见最后
[参考](https://blog.csdn.net/qq1483661204/article/details/78975847)

In [None]:
VOCAB_SIZE = len(vocabulary)
EMBED_SIZE = 100
BATCH_SIZE = 64
NUM_SAMPLE = 5
BATCH_SIZE = 64

In [None]:
tf.reset_default_graph()
center_words = tf.placeholder(tf.int32, shape=[None])
target_words = tf.placeholder(tf.int32, shape=[None, 1])

encoder_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))
embeddings = tf.nn.embedding_lookup(encoder_matrix, center_words)

decoder_matrix = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0 / math.sqrt(EMBED_SIZE)))
decoder_bias = tf.Variable(tf.zeros(VOCAB_SIZE))


loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=decoder_matrix,
                                    biases=decoder_bias,
                                    labels=target_words,
                                    inputs=embeddings,
                                    num_sampled=5,
                                    num_classes=VOCAB_SIZE))

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)

In [None]:
# 为了展示输出的效果，我们在训练的时候打印一些信息
# 以下是求weight_emb的每个行的模长，但我们知道，其实他的每一行就对应一个词，我们把这些词对应的向量的模长求出来，
# 然后将每个词对应的词向量变为单位向量,这样我们使用embedding_lookup取出词也是单位向量，那么计算余弦距离就可以
# 直接矩阵相乘，得到我们所要计算词的余弦距离，然后我们在排序就可以取前几个最相似的词
norm = tf.sqrt(tf.reduce_sum(tf.square(encoder_matrix),axis=1,keep_dims=True))
norm_embedd = encoder_matrix / norm
# 随机选择8个词作为我们计算最相近的词
# val_data = random.choices(list(vocabulary.keys()),k=5)
val_data = ['geometric', 'monitor', 'mouse', 'linux', 'microsoft', 'engine', 'factory', 'storage', 'billion', 'article']
val_int_data = tf.constant([vocabulary[i] for i in val_data],dtype=tf.int32)
val_int_data_embed = tf.nn.embedding_lookup(encoder_matrix, val_int_data)
similarity = tf.matmul(val_int_data_embed,tf.transpose(norm_embedd))

In [None]:
with tf.Session() as session:
    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./model/word2vec',session.graph)
    
    batch_size_all = len(x_train)//BATCH_SIZE
    for i in range(batch_size_all+1):
        x_batch = x_train[BATCH_SIZE*i:BATCH_SIZE*(i+1)]
        y_batch = y_train[BATCH_SIZE*i:BATCH_SIZE*(i+1)]
        y_batch = np.reshape(y_batch,[-1,1])

        feed_dict = {center_words: x_batch, target_words: y_batch}
        _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict)
        
        if i % 200 == 0:
            print(cur_loss)
        if i % 2000 == 0:
            sim = similarity.eval()
            for i,j in enumerate(val_data):
                # 之所以sim取负号是因为为了从余弦距离最大到最小排列，因为argsort返回的是最小到达排列的索引
                nearest_n = (-sim[i,:]).argsort()[1:5+1]
                logg = 'Nearest to %s is :' % j
                for ind,ner_int_word in enumerate(nearest_n):
                    nearest_word = reverse_vocab[ner_int_word]
                    logg = '%s  %s'%(logg,nearest_word)
                print(logg) 
    save_path = saver.save(sess, "./checkpoints/wrod2vec.ckpt")
    embed_mat = sess.run(norm_embedd)
    writer.close()

# 模型2 word2vec Tensorflow_model_github

In [22]:
emb_dim = 300
vocab_size = len(vocabulary)
batch_size = 128
num_samples = 5
epoches = 1000
top_n = 10
val_data = ['geometric', 'monitor', 'mouse', 'linux', 'microsoft', 
                           'engine', 'factory', 'storage', 'billion', 'article']


def forward(vocab, counts):
    """Build the graph for the forward pass."""
    
    examples = tf.placeholder(tf.int32, shape=[batch_size], name='input')
    labels = tf.placeholder(tf.int32, shape=[batch_size, 1], name='label')
    
    # Embedding Weight [vocab_size, emb_dim]
    init_width = 0.5 / emb_dim
    emb = tf.Variable(
        tf.random_uniform(
            [vocab_size, emb_dim], -init_width),
        name='emb')
    
    # Softmax Weight [vocab_size, emb_dim].Transposed.
    sm_w_t = tf.Variable(
        tf.zeros([vocab_size, emb_dim]),
        name='sm_w_t')
    
    # Softmax bias [vocab_size]
    sm_b = tf.Variable(tf.zeros([vocab_size]), name='sm_b')
    
    # NCE loss
    labels_matrix = tf.reshape(
        tf.cast(labels, dtype=tf.int64),
        [batch_size, 1])
    
    # Negative sampling
    sampled_ids, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
        true_classes=labels_matrix,
        num_true=1,
        num_sampled=num_samples,
        unique=True,
        range_max=vocab_size,
        distortion=0.75,
        unigrams=counts))
    
    # Embeddings for examples: [batch_size, emb_dim]
    example_emb = tf.nn.embedding_lookup(emb, examples)
    
    # Weights for labels: [batch_size, emb_dim]
    # 不需要计算所有的logistic 仅计算正确单词的logistic进行更新
    true_w = tf.nn.embedding_lookup(sm_w_t, labels)
    true_b = tf.nn.embedding_lookup(sm_b, labels)
    
    # Weights for sampled ids: [num_sampled, emb_dim]
    sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
    # Biases for sampled ids: [num_sampled, 1]
    sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)
    
    # True logits: [batch_size, 1]
    true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w), 1) + true_b
    
    # Sampled logits: [batch_size, numsampled]
    sampled_b_vec = tf.reshape(sampled_b, [num_samples])
    sampled_logits = tf.matmul(example_emb, 
                               sampled_w, 
                               transpose_b=True) + sampled_b_vec
    
    # Similarity option
    norm = tf.sqrt(tf.reduce_sum(tf.square(emb), axis=1, keepdims=True))
    norm_emb = emb / norm
    
    
    val_int = tf.constant([vocab[word] for word in val_data], dtype=tf.int32)
    val_emb = tf.nn.embedding_lookup(norm_emb, val_int)
    similarity = tf.matmul(val_emb, tf.transpose(norm_emb))
    
    return examples, labels, true_logits, sampled_logits, similarity

def nec_loss(true_logits, sampled_logits):
    """Build the graph for the NCE loss."""

    # cross-entropy (logits, labels)
    true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(true_logits), logits=true_logits)
    
    sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(sampled_logits), logits=sampled_logits)
    
    # NCE-loss is sum of the true and noise (sampled words)
    # contributions, average over the batch.
    nce_loss_tensor = (tf.reduce_sum(true_xent) + 
                       tf.reduce_sum(sampled_xent)) / batch_size
    
    return nce_loss_tensor

def optimize(loss):
    """Build the graph to optimize the loss function."""
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    return optimizer

def nearby():
    pass

def train(center_words, target_words, vocab, reverse_vocab, counts):
    """Build the graph for the full model."""
    
    tf.reset_default_graph()
    
    examples, labels, true_logits, sampled_logits, similarity = forward(vocab, counts)
    loss = nec_loss(true_logits, sampled_logits)
    optimizer = optimize(loss)
    
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        batch_all = len(center_words) // batch_size
        print('Batch_all:', batch_all, 'Batch_size:', batch_size, 'Samples:', len(center_words))
        for epoch in range(epoches):
            for num in range(batch_all):
                x_batch = center_words[num*batch_size: batch_size*(num+1)]
                y_batch = target_words[num*batch_size: batch_size*(num+1)]

                y_batch = np.array(y_batch).reshape(-1, 1)

                _, l = sess.run([optimizer, loss], feed_dict={
                    examples : x_batch,
                    labels : y_batch
                })
                if num % 100 == 0:
                    print('Epoch:',epoch,' Iter', num, 'loss:', l)
                if num % 1000 == 0:
                    sim = similarity.eval()
                    for i,j in enumerate(val_data):
                        nearest_n = (-sim[i, :]).argsort()[1:top_n+1]
                        logg = 'Nearest to %s is :' % j
                        for ind,ner_int_word in enumerate(nearest_n):
                            nearest_word = reverse_vocab[ner_int_word]
                            logg = '%s  %s'%(logg,nearest_word)
                        print(logg)
        
        save_path = saver.save(session, "./checkpoints/word2vec_model_20news.ckpt")


# 数据集2 
数据处理自`sklearn 20newsgroups`

In [1]:
"""skip-gram模型数据"""


def generate_samples(corpus, vocab, vocab_freq):
    """使用子采样生成数据"""
    LEN = len(corpus)
    rate = 0.001
    samples = []
    
    for i,center_word in enumerate(corpus):
        # 非词汇表词过滤
        if i-2<0 or i+2>LEN-1 \
            or center_word is None \
            or center_word == vocab['.']:
            continue
        else:
            condedate_words = [center_word, corpus[i-1], corpus[i-2], corpus[i+1], corpus[i+2]]
            condedate_words = [word for word in condedate_words if word is not None]
            freqs = np.array([vocab_freq[word] for word in condedate_words])
            p_keeps = (np.sqrt(freqs/rate) + 1) * rate / freqs
            p_keeps[p_keeps>1] = 1
            
            if  random.random() > p_keeps[0]: 
                # center_word 子采样
                # print('center_word %d 舍弃' % center_word)
                continue
            else:
                # target_word 子采样
                sampled_words = [(center_word, condedate_words[i+1]) for i,p in enumerate(p_keeps[1:]) if random.random()<p]
                samples.extend(sampled_words)
                
    return samples

def load_data():
    """加载数据"""
    
    newsgroup = fetch_20newsgroups(data_home='/mnt/d/Dataset/20newgroups/',
                          subset='all',
                          remove=('headers', 'footers', 'quotes'))
    
    corpus = ''.join(newsgroup.data).lower().split()
    counter_corpus = Counter(corpus)

    words_count = sorted(counter_corpus.items(), key=lambda kv:kv[1], reverse=True)
    stop_words = '? ! @ # $ % ^ & * ( ) [ ] { } > < = - + ~ ` --- (i (or / ; ;\' $1 |> \
                    --------- -------------------------------------------------------------------------- \
                    ========================= \
                    0 1 2 3 4 5 6 7 8 9 13 15 30 24 20 "a" tk> 95 45'
    
    index = 0
    vocab_words, vocab, reverse_vocab, vocab_count, vocab_freq,= [],{},{},{},{}
    for (k,v) in words_count:
        if k in stop_words.split() or v < 15: continue
        # 单词列表
        vocab_words.append(k)
        # 单词:id
        vocab[k] = index
        # id:单词
        reverse_vocab[index] = k
        # 单词:频次
        vocab_count[k] = v
        # 单词:频率
        vocab_freq[index] = v/len(corpus)
        index += 1

    print('字典长度:', len(vocab.keys()))
    print(vocab_words[:10])
    
    corpus_int = [vocab.get(word) for word in corpus]
    # 非字典词语占比
    print('非字典词语占比%.2f' % (corpus_int.count(None)/len(corpus_int)*100))
    
    samples = generate_samples(corpus_int, vocab, vocab_freq) # version2 实现子采样 传入原始语料(word list)而不是int
    print('样本数量:',len(samples))
    
    return samples, list(vocab_count.values()), vocab

In [19]:
samples,counts,vocabulary = load_data()
reverse_vocab = {v:k for k,v in vocabulary.items()}
center_words = [x for (x,y) in samples]
target_words = [y for (x,y) in samples]


字典长度: 15282
['the', 'to', 'of', 'a', 'and', 'in', 'is', 'i', 'that', 'for']
非字典词语占比16.26
样本数量: 5885881


In [3]:
vocabulary['max>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'ax>\'']

NameError: name 'vocabulary' is not defined

In [20]:

"""
['the', 'to', 'of', 'a', 'and', 'in', 'is', 'i', 'that', 'for']
Counter({0: 76539,
         1: 61104,
         2: 52556,
         3: 53414,
         4: 50200,
         5: 45036,
         6: 43316,
         7: 46876,
         8: 45292,
         9: 38525,
         10: 36548,
"""

# 手动对高频词限制
limit_samples = []
for x,y in samples:
    if x == 0 and random.random() < 0.65: continue
    if x == 1 and random.random() < 0.3: continue
    if x == 2 and random.random() < 0.3: continue
    if x == 3 and random.random() < 0.3: continue
    if x == 3 and random.random() < 0.1: continue
    limit_samples.append((x,y)) 
    
center_words = [x for (x,y) in limit_samples]
target_words = [y for (x,y) in limit_samples]
Counter(center_words)


Counter({0: 76539,
         1: 61104,
         2: 52556,
         3: 53414,
         4: 50200,
         5: 45036,
         6: 43316,
         7: 46876,
         8: 45292,
         9: 38525,
         10: 36548,
         11: 38383,
         12: 33423,
         13: 34288,
         14: 32579,
         15: 33519,
         16: 32385,
         17: 33411,
         18: 32387,
         19: 31051,
         20: 28554,
         21: 28587,
         22: 30011,
         23: 28757,
         24: 30463,
         25: 26727,
         26: 26809,
         27: 26936,
         28: 26933,
         29: 27191,
         30: 26220,
         31: 25810,
         32: 25531,
         33: 25215,
         34: 24119,
         35: 25418,
         36: 24319,
         37: 25515,
         38: 26356,
         39: 23541,
         40: 23309,
         41: 24404,
         42: 24018,
         43: 22070,
         44: 21242,
         45: 19556,
         46: 19258,
         47: 20183,
         48: 18508,
         49: 18075,
         5

In [2]:
train(center_words, target_words, vocabulary, reverse_vocab, list(counts))

NameError: name 'train' is not defined

# 3 第二个数据 第一个模型
模型保存于`./checkpoints/wrod2vec.ckpt` (笔误)

In [None]:
x_train = center_words 
y_train = target_words



VOCAB_SIZE = 14525
EMBED_SIZE = 100
BATCH_SIZE = 128
NUM_SAMPLE = 5
BATCH_SIZE = 64

tf.reset_default_graph()
center_words = tf.placeholder(tf.int32, shape=[None])
target_words = tf.placeholder(tf.int32, shape=[None, 1])

encoder_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))
embeddings = tf.nn.embedding_lookup(encoder_matrix, center_words)

decoder_matrix = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0 / math.sqrt(EMBED_SIZE)))
decoder_bias = tf.Variable(tf.zeros(VOCAB_SIZE))


loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=decoder_matrix,
                                    biases=decoder_bias,
                                    labels=target_words,
                                    inputs=embeddings,
                                    num_sampled=5,
                                    num_classes=VOCAB_SIZE))

optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(loss)



# 为了展示输出的效果，我们在训练的时候打印一些信息
# 以下是求weight_emb的每个行的模长，但我们知道，其实他的每一行就对应一个词，我们把这些词对应的向量的模长求出来，
# 然后将每个词对应的词向量变为单位向量,这样我们使用embedding_lookup取出词也是单位向量，那么计算余弦距离就可以
# 直接矩阵相乘，得到我们所要计算词的余弦距离，然后我们在排序就可以取前几个最相似的词
norm = tf.sqrt(tf.reduce_sum(tf.square(encoder_matrix),axis=1,keep_dims=True))
norm_embedd = encoder_matrix / norm
# 随机选择8个词作为我们计算最相近的词
val_data = random.choices(list(vocabulary.keys()),k=5)
val_int_data = tf.constant([vocabulary[i] for i in val_data],dtype=tf.int32)
val_int_data_embed = tf.nn.embedding_lookup(encoder_matrix, val_int_data)
similarity = tf.matmul(val_int_data_embed,tf.transpose(norm_embedd))


with tf.Session() as session:
    saver = tf.train.Saver()
    session.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./model/word2vec',session.graph)
    
    batch_size_all = len(x_train)//BATCH_SIZE
    
    for e in range(10):
        for i in range(batch_size_all+1):
            x_batch = x_train[BATCH_SIZE*i:BATCH_SIZE*(i+1)]
            y_batch = y_train[BATCH_SIZE*i:BATCH_SIZE*(i+1)]
            y_batch = np.reshape(y_batch,[-1,1])

            feed_dict = {center_words: x_batch, target_words: y_batch}
            _, cur_loss = session.run([optimizer, loss], feed_dict=feed_dict)

            if i % 2000 == 0:
                print('Epoch:',e,' index:', i, ' loss:', cur_loss)
            if i % 10000 == 0:
                sim = similarity.eval()
                for i,j in enumerate(val_data):
                    # 之所以sim取负号是因为为了从余弦距离最大到最小排列，因为argsort返回的是最小到达排列的索引
                    nearest_n = (-sim[i,:]).argsort()[1:5+1]
                    logg = 'Nearest to %s is :' % j
                    for ind,ner_int_word in enumerate(nearest_n):
                        nearest_word = reverse_vocab[ner_int_word]
                        logg = '%s  %s'%(logg,nearest_word)
                    print(logg) 
    save_path = saver.save(session, "./checkpoints/wrod2vec.ckpt")
    embed_mat = session.run(norm_embedd)
    writer.close()