In [None]:
import numpy as np
import collections, random

In [None]:
# Number of slots allocated in the `batch` and `labels` arrays below.
batch_size = 40

In [None]:
# Pre-allocate the batch arrays. np.zeros is used instead of the raw
# np.ndarray constructor, which returns uninitialised memory: the values shown
# when these arrays are displayed are then deterministic, and any slot the
# fill loop never writes reads as 0 instead of garbage.
batch = np.zeros(shape=(batch_size,), dtype=np.int32)
labels = np.zeros(shape=(batch_size, 1), dtype=np.int32)

In [None]:
# Inspect `batch` before the fill loop has written to it.
batch

In [None]:
# Inspect `labels` before the fill loop has written to it.
labels

In [None]:
skip_window = 2  # positions on each side of the window centre (span = 2 * skip_window + 1)
num_skips = 3  # labels drawn per window position by the fill loop

In [None]:
# Width of the sliding window: skip_window items on each side plus the centre.
span = 2 * skip_window + 1  # [ skip_window target skip_window ]
# Bounded deque: once it holds `span` items, each append drops the oldest one.
buffer = collections.deque(maxlen=span)

In [None]:
# Seed the window with the integers 0 .. span-1. `range` is used instead of
# the Python-2-only `xrange`; for this small span it behaves the same on both.
buffer.extend(range(span))

In [None]:
# Show the initial window contents.
buffer

In [None]:
# Next value to push into the window; the fill loop appends it and increments it.
data_index = span

In [None]:
# Fill `batch` with the centre item of the window and `labels` with num_skips
# distinct non-centre items sampled from the same window, then slide the
# window forward by one. Only (batch_size // num_skips) * num_skips slots are
# written; any remaining slots of `batch` / `labels` are left untouched.
# NOTE(review): the inner `while` can only terminate when
# num_skips <= 2 * skip_window, otherwise it runs out of unused positions.
for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [skip_window]
    for j in range(num_skips):
        # `target` is always an already-used position on entry, so this
        # resamples until an unused window position is drawn
        # (random.randint includes both end points).
        while target in targets_to_avoid:
            target = random.randint(0, span - 1)
        targets_to_avoid.append(target)
        # Input is always the centre item; the label is the sampled neighbour.
        batch[i * num_skips + j] = buffer[skip_window]
        labels[i * num_skips + j, 0] = buffer[target]
   
    # Slide the window: the bounded deque drops its oldest item on append.
    buffer.append(data_index)
    data_index += 1

In [None]:
# Window contents after the fill loop has slid it forward.
buffer

In [None]:
# Centre items written by the fill loop (each repeated num_skips times).
batch

In [None]:
# Neighbouring items sampled for each entry of `batch`.
labels

In [None]:
# Scratch check: draw 20 random integers from [0, 1000).
np.random.randint(1000, size=20)

### The word2vec model

In [13]:
import tensorflow as tf
import numpy as np
import sys, math, collections

In [20]:

class word2vec(object):
    """Minimal word2vec graph (CBOW or skip-gram inputs) trained with NCE loss.

    `config` must expose the attributes read below: batch_size,
    context_window, vocab_size, embedding_size, neg_sample_size and
    learning_rate. Training currently feeds randomly generated token ids
    (see `cbow_batch_content`), so it exercises the graph rather than
    learning from real text.
    """

    def __init__(self, config, model_type='CBOW'):
        """Store the configuration and the model type ('CBOW' or 'SKIP_GRAM')."""
        self.model_type = model_type
        self.config = config

    def _init_placeholders(self):
        """Create the input placeholders `self.X` and `self.y`.

        CBOW takes 2 * context_window ids per example, skip-gram a single id.

        Raises:
            ValueError: if `model_type` is neither 'CBOW' nor 'SKIP_GRAM'.
        """
        batch_size = self.config.batch_size
        if self.model_type == 'CBOW':
            self.X = tf.placeholder(
                tf.int32, shape=[batch_size, self.config.context_window * 2], name="input_X")
        elif self.model_type == 'SKIP_GRAM':
            self.X = tf.placeholder(tf.int32, shape=[batch_size], name="input_X")
        else:
            raise ValueError('unknown model type {} is found...'.format(self.model_type))
        self.y = tf.placeholder(tf.int32, shape=[batch_size, 1], name="input_y")

    def _init_variables(self):
        """Create the embedding matrix and the NCE weight/bias variables."""
        vocab_size = self.config.vocab_size
        embedding_size = self.config.embedding_size
        # Embeddings start uniformly in [-0.5 / embedding_size, 0.5 / embedding_size].
        init_width = 0.5 / embedding_size
        self.embedding = tf.Variable(
            tf.random_uniform([vocab_size, embedding_size], -init_width, init_width),
            name='embedding')
        self.weight = tf.Variable(
            tf.truncated_normal([vocab_size, embedding_size],
                                stddev=1. / math.sqrt(embedding_size)),
            name='weight')
        self.bias = tf.Variable(tf.zeros([vocab_size]), name='bias')

    def cbow_batch_content(self):
        """Generate one CBOW batch of random token ids.

        A window of 2 * context_window + 1 random ids slides forward by one id
        per example; the centre id becomes the label and the remaining ids
        become the context.

        Returns:
            (X, y): int32 arrays of shape (batch_size, 2 * context_window)
            and (batch_size, 1).
        """
        span = 2 * self.config.context_window + 1
        X = np.zeros(shape=(self.config.batch_size, span - 1), dtype=np.int32)
        y = np.zeros(shape=(self.config.batch_size, 1), dtype=np.int32)
        window = collections.deque(maxlen=span)
        window.extend(np.random.randint(self.config.vocab_size, size=span))
        for i in range(self.config.batch_size):
            context = list(window)
            y[i, 0] = context.pop(self.config.context_window)
            X[i] = context
            # Draw a scalar (not a 1-element array) so the window stays a flat
            # sequence of ids and `X[i] = context` never sees a ragged list.
            window.append(np.random.randint(self.config.vocab_size))
        return X, y

    def _build_graph(self):
        """Build the embedding lookup, the NCE loss and the Adam training op."""
        X_embedded = tf.nn.embedding_lookup(self.embedding, self.X)
        if self.model_type == 'CBOW':
            # Collapse the context axis by summing the context embeddings.
            X_embedded = tf.reduce_sum(X_embedded, 1)
        print('shape:  {}'.format(X_embedded))
        self.loss = tf.reduce_mean(tf.nn.nce_loss(self.weight,
                                                  self.bias,
                                                  inputs=X_embedded,
                                                  labels=self.y,
                                                  num_sampled=self.config.neg_sample_size,
                                                  num_classes=self.config.vocab_size))
        # Stored as `train_op`: assigning it to `self.train` would shadow the
        # `train` method on this instance.
        self.train_op = tf.train.AdamOptimizer(
            learning_rate=self.config.learning_rate).minimize(self.loss)

    def train(self, num_steps=50000):
        """Build the graph (once) and run `num_steps` optimisation steps.

        The loss is printed after every step.

        Raises:
            ValueError: for 'SKIP_GRAM' models, because only CBOW batches are
                generated here and they do not match the skip-gram input
                placeholder; also for unknown model types.
        """
        if self.model_type == 'SKIP_GRAM':
            raise ValueError(
                'train() only generates CBOW batches; model type {} is not supported'.format(
                    self.model_type))
        if not hasattr(self, 'train_op'):
            self._init_placeholders()
            self._init_variables()
            self._build_graph()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for step in range(1, num_steps + 1):
                X, y = self.cbow_batch_content()
                _, loss = sess.run([self.train_op, self.loss],
                                   feed_dict={self.X: X, self.y: y})
                print('the loss: {} at step {}'.format(loss, step))
                
        
        
        

In [21]:
def main():
    """Assemble a small CBOW configuration and run the training loop."""
    class ModelConfig():
        pass

    # Hyper-parameters for this run, applied to the config object in order.
    settings = (
        ('batch_size', 32),
        ('context_window', 2),
        ('vocab_size', 2000),
        ('embedding_size', 512),
        ('neg_sample_size', 2),
        ('learning_rate', 0.0001),
    )
    model_config = ModelConfig()
    for attr_name, attr_value in settings:
        setattr(model_config, attr_name, attr_value)

    model = word2vec(model_config)
    model.train()
    

In [22]:
# Run the training loop (the recorded run below ends with a KeyboardInterrupt).
main()

shape:  Tensor("Sum_6:0", shape=(32, 512), dtype=float32)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
the loss: 16.3976764679 at step 1
the loss: 16.2923965454 at step 2
the loss: 5.6710486412 at step 3
the loss: 14.4978618622 at step 4
the loss: 13.0691719055 at step 5
the loss: 13.4176864624 at step 6
the loss: 10.8247394562 at step 7
the loss: 10.1176319122 at step 8
the loss: 9.24762153625 at step 9
the loss: 6.77455329895 at step 10
the loss: 11.7266397476 at step 11
the loss: 5.11419296265 at step 12
the loss: 9.13751125336 at step 13
the loss: 11.7613449097 at step 14
the loss: 11.8041944504 at step 15
the loss: 6.88624000549 at step 16
the loss: 7.78710317612 at step 17
the loss: 9.7070941925 at step 18
the loss: 4.62327003479 at step 19
the loss: 12.3694953918 at step 20
the loss: 5.74902439117 at step 21
the loss: 5.90933036804 at step 22
the loss: 14.9038801193 at step 23
the loss: 5.89936447144 at step 24
the loss: 10.1395282745 at step 25
the 

KeyboardInterrupt: 

In [26]:
import cPickle as pickle
import os

In [27]:
# Directory holding the pickled title data. The default is the original local
# path; set the TITLES_DATA_DIR environment variable to point elsewhere
# without editing the notebook.
data_path = os.environ.get('TITLES_DATA_DIR', '/Users/matt.meng')
pickle_file = 'processed_titles_data.pkl'

In [28]:
# Pickles are binary data, so the file is opened in 'rb' mode (text mode can
# corrupt the stream on some platforms and fails outright on Python 3).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(os.path.join(data_path, pickle_file), 'rb') as input_file:
    data = pickle.load(input_file)

In [29]:
# List the top-level entries stored in the loaded pickle.
data.keys()

['url', 'reverse_token_dict', 'pageViw', 'titles', 'token_dict']

In [30]:
# Keep only the 'titles' entry; the sample shown below is a list of integer-id lists.
titles = data['titles']

In [54]:
# Peek at the first five titles.
titles[:5]

[[163, 2969, 20, 2805, 168],
 [27, 186, 3, 156, 210],
 [322, 1523, 16262, 31, 493],
 [54, 6, 181, 2939, 713],
 [49, 64, 635, 9, 1425]]

In [47]:
def create_cbow_data(titles, context_window):
    """Build CBOW (context, target) pairs from a collection of token sequences.

    A window of 2 * context_window + 1 tokens slides over every title one
    position at a time. The centre token of each window becomes the target and
    the remaining tokens, in order, become the context.

    Args:
        titles: iterable of token sequences (each must support len() and slicing).
        context_window: number of context tokens taken on each side of the target.

    Returns:
        (training_list, target_list): `training_list[k]` is a list of
        2 * context_window context tokens and `target_list[k]` is the
        matching one-element list holding the target token.

    Raises:
        ValueError: if `context_window` is negative.

    Titles shorter than the window are skipped; how many were skipped is
    printed once all titles have been processed.
    """
    if context_window < 0:
        raise ValueError('context_window must be non-negative')
    span = 2 * context_window + 1
    missing_count = 0
    training_list = []
    target_list = []
    for title in titles:
        title_len = len(title)
        if title_len < span:
            missing_count += 1
            continue
        # Every start position whose full window still fits inside the title.
        for start in range(title_len - span + 1):
            window = list(title[start:start + span])
            target_list.append([window.pop(context_window)])
            training_list.append(window)
    print('{} short titles are passed by context_window {}'.format(missing_count, context_window))
    return training_list, target_list
            

In [51]:
# Smoke-test on the first five titles with two context tokens on each side.
training_list_, target_list_ = create_cbow_data(titles[:5], 2)

0 short titles are passed by context_window 2


In [52]:
# Context tokens for each window (the centre token has been removed).
training_list_

[[163, 2969, 2805, 168],
 [27, 186, 156, 210],
 [322, 1523, 31, 493],
 [54, 6, 2939, 713],
 [49, 64, 9, 1425]]

In [53]:
# Centre (target) token for each window, one single-element list per example.
target_list_

[[20], [3], [16262], [181], [635]]

In [59]:
# One synthetic title 0..9. Materialised as a list so the value is the same on
# Python 2 and Python 3 (where a bare range() is not a list).
title = [list(range(10))]

In [60]:
# Check the sliding window on the synthetic title, which is longer than one window.
training_list_, target_list_ = create_cbow_data(title, 2)

0 short titles are passed by context_window 2


In [61]:
# Context tokens for each window position of the synthetic title.
training_list_

[[0, 1, 3, 4],
 [1, 2, 4, 5],
 [2, 3, 5, 6],
 [3, 4, 6, 7],
 [4, 5, 7, 8],
 [5, 6, 8, 9]]

In [62]:
# Matching centre tokens for the synthetic title.
target_list_

[[2], [3], [4], [5], [6], [7]]

In [63]:
# Build the CBOW examples for every title.
training_list_, target_list_ = create_cbow_data(titles, 2)

0 short titles are passed by context_window 2


In [64]:
# Total number of CBOW examples produced from all titles.
len(training_list_)

742529