In [None]:
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cosine, cdist
from collections import Counter

In [None]:
# Read the whole corpus and flatten it onto one line: every newline becomes
# a single space so the text can later be tokenised with str.split().
with open('raw_sentences.txt') as f:
    sentences = ' '.join(f.read().split('\n'))

In [None]:
# Replace rare words (those seen at most twice in the corpus) with '<unk>'.
tokens = sentences.split()
c = Counter(tokens)  # word -> number of occurrences in the corpus
words = ['<unk>' if c[w] <= 2 else w for w in tokens]

In [None]:
# inv_vocab maps id -> word: id 0 is '<unk>', followed by every word that
# occurs more than twice, in the order Counter.most_common() returns them.
inv_vocab = ['<unk>']
inv_vocab.extend(w for w, cnt in c.most_common() if cnt > 2)
# vocab is the reverse mapping, word -> id.
vocab = {w: i for i, w in enumerate(inv_vocab)}
# The corpus re-encoded as a flat list of integer ids.
word_ids = [vocab[w] for w in words]

In [None]:
print u'全部的字數:', len(words)
print u'字典的字數:', len(vocab)
print u'最常見的單字:', c.most_common()[:5]

## CBOW (continuous bag-of-words)

![CBOW](http://sebastianruder.com/content/images/2016/02/cbow.png)

In [None]:
cbow = []
win = 2
for i in range(win, len(word_ids) - win):
    x = tuple([word_ids[j] for j in range(i - win, i + win + 1) if j != i])
    cbow.append((x, word_ids[i]))
print cbow[0], len(cbow)

## Skip-gram

![Skip-gram](http://sebastianruder.com/content/images/2016/02/skip-gram.png)

In [None]:
skipgram = []
win = 3
for i in range(win, len(word_ids) - win):
    skipgram.extend([(word_ids[i], word_ids[j])
                     for j in range(i - win, i + win + 1) if j != i])
print skipgram[0], len(skipgram)

## Build Model

In [None]:
def most_sim(word):
    """Return the five vocabulary words most cosine-similar to ``word``.

    Reads the module-level ``word_vector`` matrix (one row per word id) and
    the ``vocab`` / ``inv_vocab`` lookups. The query word is compared against
    every row, itself included, so it normally appears in the result.

    Raises ``KeyError`` if ``word`` is not in ``vocab``.
    """
    # cdist needs a 2-D array, so turn the query row into a (1, dim) matrix.
    query = np.expand_dims(word_vector[vocab[word]], 0)
    # cdist returns cosine *distance*; convert it to similarity.
    similarity = 1 - cdist(word_vector, query, 'cosine')
    # Ascending argsort reversed -> ids ordered from most to least similar.
    ranked = np.argsort(similarity.flatten())[::-1]
    return [inv_vocab[idx] for idx in ranked[:5]]

In [None]:
def skipgram_batch(size, pairs=None):
    """Yield shuffled, fixed-size mini-batches of skip-gram training pairs.

    Parameters
    ----------
    size : int
        Number of (center, context) pairs in every batch.
    pairs : sequence of (int, int), optional
        Training pairs to draw from. Defaults to the module-level
        ``skipgram`` list.

    Yields
    ------
    (x_data, y_data)
        ``x_data`` is a list of ``size`` center ids; ``y_data`` is a list of
        ``size`` single-element lists ``[context_id]``, i.e. a column of
        labels with shape (size, 1).

    Only full batches are produced: leftover pairs that do not fill a whole
    batch are dropped. The order is reshuffled on every call.
    """
    if pairs is None:
        pairs = skipgram
    # permutation() builds its own index array, so this does not rely on
    # range() returning a mutable list.
    order = np.random.permutation(len(pairs))
    # The `+ 1` keeps the last batch when len(pairs) is an exact multiple of
    # `size`; range()'s exclusive end would otherwise drop it.
    for end in range(size, len(pairs) + 1, size):
        chunk = [pairs[i] for i in order[end - size:end]]
        x_data = [center for center, _ in chunk]
        y_data = [[context] for _, context in chunk]
        yield x_data, y_data

In [None]:
batch_size = 128   # pairs per training step; the placeholders below are fixed to this size
vector_size = 10   # dimensionality of every word embedding

# One batch of center-word ids and the matching context-word ids. Y has a
# trailing dimension of 1: one true label per example.
X = tf.placeholder(tf.int32, shape=[batch_size])
Y = tf.placeholder(tf.int32, shape=[batch_size, 1])

with tf.device('/cpu:0'):
    # Embedding matrix: one randomly initialised row per vocabulary entry.
    wordvec = tf.Variable(tf.random_normal([len(vocab), vector_size]))
    vec_X = tf.nn.embedding_lookup(wordvec, X)

    # Output-side weights and biases used by the NCE loss.
    nce_weights = tf.Variable(tf.random_normal([len(vocab), vector_size]))
    nce_biases = tf.Variable(tf.zeros([len(vocab)]))

# Keyword arguments on purpose: the positional order of `labels` and `inputs`
# differs between TensorFlow releases, so a positional call can silently
# swap the two tensors.
nce_loss = tf.nn.nce_loss(weights=nce_weights,
                          biases=nce_biases,
                          labels=Y,
                          inputs=vec_X,
                          num_sampled=10,
                          num_classes=len(vocab))
loss = tf.reduce_mean(nce_loss)

# Learning rate starts at 0.1 and is multiplied by 0.9 after every 100000
# optimizer steps (staircase, i.e. in discrete jumps).
global_step = tf.Variable(0, trainable=False)
rate = tf.train.exponential_decay(0.1, global_step, 100000, 0.9, staircase=True)
train_op = tf.train.AdadeltaOptimizer(rate).minimize(loss, global_step=global_step)

In [None]:
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for epoch in range(50):
        losses = []
        for x, y in skipgram_batch(batch_size):
            _, loss_val = sess.run([train_op, loss], feed_dict={X: x, Y: y})
            losses.append(loss_val)
        word_vector = sess.run(wordvec)
        print epoch, np.mean(losses)
        print '    she --> ', most_sim('she')
        print '    office --> ', most_sim('office')

## Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

# Project the learned embeddings down to two dimensions for plotting.
# t-SNE is stochastic; fixing random_state makes the layout reproducible
# when the notebook is re-run on the same embeddings.
tsne = TSNE(n_components=2, random_state=0)
vec_2d = tsne.fit_transform(word_vector)

In [None]:
# Draw one point per vocabulary word at its 2-D coordinates and label it
# with the word, nudged slightly off the marker so the text stays readable.
plt.figure(figsize=(18, 18))
label_style = dict(xytext=(5, 2), textcoords='offset points')
for i, word in enumerate(inv_vocab):
    x, y = vec_2d[i][0], vec_2d[i][1]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), **label_style)