# !!UNDER CONSTRUCTION!!

Implementation of word2vec (skip-gram).

Tutorial: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

# Load Libraries

In [49]:
import numpy as np
import _pickle as cPickle
import theano
import theano.tensor as T

# Declare varibles

In [50]:
corpus_path = '../datasets/ptwiki-20170820-sentences.pickle'
word_embedding_path = '../datasets/ptwiki-20170820-embedding.pickle'

# word2vec

## Hyperparameters

In [80]:
CONTEXT_SIZE = 10
LEARNING_RATE = 0.01
NEGATIVE_SAMPLE_SIZE = 20
EMBEDDING_SIZE = 10
BATCH_SIZE = 50

## Auxiliar class

Corpus dataset must be preprocessed by preprocessing.ipynb before being load

In [52]:
class Corpus(object):
    
    def __init__(self, corpus_path):
        with open(corpus_path, 'rb') as fp:
            self._indices = cPickle.load(fp) 
            self._indice_freq = cPickle.load(fp)
            self._sentences = cPickle.load(fp)

    def sentences_size(self):
        return len(self._sentences)
            
    def tokens_size(self):
        return len(self._indices)
    
    def contexts(self, C=5):
        for sentence in self._sentences:
            for idx, center_word in enumerate(sentence):
                # Get previous words
                context = sentence[max(0, idx - C):idx]
                
                # Get future words
                if idx + 1 < len(sentence):
                    context += sentence[idx+1:min(len(sentence), idx + C + 1)]

                # Remove duplicate center word
                context = [word for word in context if word is not center_word]

                # Return current context
                yield center_word, context

In [56]:
corpus = Corpus(corpus_path)

## Algorithm

## More Theano Friendly approach (TEMP)

In [81]:
# Initializing network parameters
W_in_values = np.asarray((np.random.rand(corpus.tokens_size(), EMBEDDING_SIZE) - 0.5) / EMBEDDING_SIZE,
                              dtype=theano.config.floatX)

W_out_values = np.asarray(np.zeros((corpus.tokens_size(), EMBEDDING_SIZE)),
                              dtype=theano.config.floatX)

# Declaring theano parameters
W_in = theano.shared(
    value=W_in_values,
    name='W_in',
    borrow=True
)

W_out = theano.shared(
    value=W_out_values,
    name='W_out',
    borrow=True
)

context = T.ivector('context')
target = T.lscalar('target')

# Change context
target_embedding = W_in[target]
#context_embedding = W_out[context]

# Apply Softmax in the output layer
estimated = T.nnet.softmax(T.dot(target_embedding, W_out.T)).dimshuffle(1, 0)

# Compute cost - Ignore for now
cost = T.mean(-T.log(estimated[context]))

# Expected window answer
expected = T.zeros_like(estimated)
expected = T.set_subtensor(expected[context], 1)

# Compute window error
z = context.size * estimated - expected

# Compute gradient descent
grad_in = T.dot(W_out.T, z)
grad_out = T.outer(target_embedding, z)

# Zip updates
updates = [(W_in, T.inc_subtensor(target_embedding, - LEARNING_RATE * grad_in.flatten())),
           (W_out, W_out - LEARNING_RATE * grad_out.T)]

# Create theano training function
train_model = theano.function(
    inputs=[context, target],
    outputs=cost,
    updates=updates
)

def train(window_size=5,
          iterations=3,
          anneal_every=20000):

    print('Start Training')

    for it in range(1, iterations + 1):
        for context_it, (center_word, context) in enumerate(corpus.contexts(window_size)):
            # Compute cost and gradient
            cost = train_model(context, center_word)

            # Print temp results
            if context_it % 5000 == 0:
                print('Iteration:{}, Cost {}'.format(context_it, cost))
    return cost

In [82]:
%time train(iterations=1)

Start Training
Iteration:0, Cost 10.333320017092587
Iteration:5000, Cost 10.329690444191804
Iteration:10000, Cost 8.834912284257166
Iteration:15000, Cost 9.675441497020781
Iteration:20000, Cost 10.2529534691915
Iteration:25000, Cost 7.0330072736699325
Iteration:30000, Cost 6.401999696874472
Iteration:35000, Cost 9.320783825020383
Iteration:40000, Cost 9.803124201354489
Iteration:45000, Cost 9.133121886791253
Iteration:50000, Cost 8.844775216589117
Iteration:55000, Cost 5.719740202477536
Iteration:60000, Cost 7.123045748699175
Iteration:65000, Cost 7.727326183021674
Iteration:70000, Cost 9.830344356836468
Iteration:75000, Cost 7.717209140316397
Iteration:80000, Cost 10.332459648376098
Iteration:85000, Cost 6.728496978470164
Iteration:90000, Cost 6.176272435459113
Iteration:95000, Cost 7.323777016915327
Iteration:100000, Cost 9.272557835081898
Iteration:105000, Cost 9.516425312467353
Iteration:110000, Cost 9.64576114389155
Iteration:115000, Cost 6.240535981340069
Iteration:120000, Cost 1

array(4.832946471379733)

In [5]:
class Word2Vec(object):
    
        def __init__(self, corpus, embedding_size=300):
            self.corpus = corpus
            
            # Initializing network parameters
            self.W_in_values = np.asarray((np.random.rand(corpus.tokens_size(), embedding_size) - 0.5) / embedding_size,
                                          dtype=theano.config.floatX)

            self.W_out_values = np.asarray(np.zeros((corpus.tokens_size(), embedding_size)),
                                          dtype=theano.config.floatX)     

            # Declaring theano parameters
            self.W_in = theano.shared(
                value=self.W_in_values,
                name='W_in',
                borrow=True
            )

            self.W_out = theano.shared(
                value=self.W_out_values,
                name='W_out',
                borrow=True
            )
            
            context = T.dvector('context')
            target = T.lscalar('target')
        
            # Building training function
            self.train_model = theano.function([context, target], 
                                               self.__cost_and_grad(context, target))
            
            """
            updates = [(classifier.W, classifier.W - learning_rate * g_W),
                           (classifier.b, classifier.b - learning_rate * g_b)]            
            train_model = theano.function(
                inputs=[index],
                outputs=cost,
                updates=updates,
                givens={
                    x: train_set_x[index * batch_size: (index + 1) * batch_size],
                    y: train_set_y[index * batch_size: (index + 1) * batch_size]
                }
            )
            """
        
        def __cost_and_grad(self, win, wout):
            # Apply Softmax in the output layer
            p_wout_given_win = T.nnet.softmax(T.dot(win, self.W_out.T)).dimshuffle(1, 0)

            # Compute cost 
            cost = -T.log(p_wout_given_win[wout])

            # Expected answer
            foo = T.zeros_like(p_wout_given_win)
            foo = T.set_subtensor(foo[wout], 1)

            # Compute error
            z = p_wout_given_win - foo

            # Compute gradient descent
            grad_in = T.dot(self.W_out.T, z)
            grad_out = T.outer(z, win)

            return cost, grad_in, grad_out
        
        def __train_one_step(self, center_word, context):
            cost = 0.

            # Gradient descent for each layer
            grad_in = np.zeros_like(self.W_in_values)
            grad_out = np.zeros_like(self.W_out_values)

            # Get center word embedding vector
            r = self.W_in_values[center_word]

            # Compute probability between center and each context word
            for word in context:
                # Train pair
                c_cost, c_grad_in, c_grad_out = self.train_model(r, word)
                
                # Accumulate cost and gradient
                cost += c_cost
                grad_out += c_grad_out 
                grad_in[word,:] += c_grad_in.flatten()

            return cost, grad_in, grad_out

        def __train_one_batch(self, batch):
            cost = 0.0
            batch_size = len(batch)
            
            # Batch gradient accumulator for each layer
            grad_in = np.zeros_like(self.W_in_values)
            grad_out = np.zeros_like(self.W_out_values)
                
            for center_word, context in batch:
                # Train one context
                c_cost, gin, gout = self.__train_one_step(center_word, context)
                
                # Accumulate cost and gradient
                cost += c_cost / batch_size 
                grad_in += gin / batch_size
                grad_out += gout / batch_size

            return cost, grad_in, grad_out
                        
        def train(self,
                  window_size=5,
                  learning_rate=0.3,
                  iterations=3,
                  batch_size=50,
                  anneal_every=20000,
                  print_every=10):
            
            print('Start Training')

            batch = []
            for it in range(1, iterations + 1):
                for context_it, (center_word, context) in enumerate(self.corpus.contexts(window_size)):
                    # Accumulate contexts
                    if context_it % batch_size != 0:
                        batch += [(center_word, context)]
                        continue
                    
                    # Compute cost and gradient
                    cost, grad_in, grad_out = self.__train_one_batch(batch)

                    # Update weights
                    self.W_in_values -= learning_rate * grad_in
                    self.W_out_values -= learning_rate * grad_out
                    
                    # Print temp results
                    if context_it % (batch_size * 100) == 0:
                        print('Batch Iteration:{}, Cost {}'.format(context_it/batch_size, cost))
                
                    # Decreases learning rate
                    if context_it % anneal_every == 0:
                        learning_rate *= 0.5
                    
                    # Restart contexts
                    if context_it % batch_size == 0:
                        batch = []
                        
                # Print temp results                
                print('Iteration:{}, Cost {}'.format(it, cost))

            return cost, grad_in, grad_out
            
        def save(self, file_path):
            with open(file_path, "wb") as fp:
                cPickle.dump(self.W_in_values.shape, fp)
                cPickle.dump(self.W_in_values, fp)       
                cPickle.dump(self.W_out_values, fp)


        def load(self, file_path):
            with open(file_path, "rb") as fp:
                _ = pickle.load(fp)
                self.W_in_values[:] = cPickle.load(fp)
                self.W_out_values[:] = cPickle.load(fp)

In [6]:
corpus = Corpus(corpus_path)
w2v = Word2Vec(corpus, 10)

In [7]:
%time w2v.train(learning_rate=0.3, iterations=1, batch_size=50)

Start Training
Batch Iteration:0.0, Cost 0.0
Batch Iteration:100.0, Cost [ 84.56218947]
Batch Iteration:200.0, Cost [ 96.03659419]
Batch Iteration:300.0, Cost [ 88.33899623]
Batch Iteration:400.0, Cost [ 89.85975526]
Batch Iteration:500.0, Cost [ 87.93987684]
Batch Iteration:600.0, Cost [ 91.10169745]
Batch Iteration:700.0, Cost [ 91.3427836]
Batch Iteration:800.0, Cost [ 82.23530392]
Batch Iteration:900.0, Cost [ 87.51466282]
Batch Iteration:1000.0, Cost [ 90.90872749]
Batch Iteration:1100.0, Cost [ 91.97445738]
Batch Iteration:1200.0, Cost [ 88.54378626]
Batch Iteration:1300.0, Cost [ 86.75933324]
Batch Iteration:1400.0, Cost [ 91.83995348]
Batch Iteration:1500.0, Cost [ 68.10437674]
Batch Iteration:1600.0, Cost [ 86.01489242]
Batch Iteration:1700.0, Cost [ 75.82691304]
Batch Iteration:1800.0, Cost [ 85.07698565]
Batch Iteration:1900.0, Cost [ 83.41413975]
Batch Iteration:2000.0, Cost [ 87.21685181]
Batch Iteration:2100.0, Cost [ 91.34458181]
Batch Iteration:2200.0, Cost [ 90.5983307

(array([ 78.48136031]),
 array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [-0.00354483,  0.0063573 ,  0.0102293 , ...,  0.00311063,
         -0.00483038, -0.00019464],
        [-0.00209988,  0.00392989,  0.00630265, ...,  0.00209935,
         -0.00311302, -0.00026058],
        [-0.00450718,  0.0085184 ,  0.0136881 , ...,  0.00465696,
         -0.00682179, -0.0006087 ]]),
 array([[ -7.22303011e-05,   1.25136028e-04,   1.95571640e-04, ...,
           4.96994331e-05,  -8.77295002e-05,  -4.61085371e-06],
        [ -9.50589097e-02,   1.65034548e-01,   2.55285209e-01, ...,
           6.60394913e-02,  -1.14043225e-01,  -9.82538425e-03],
        [ -1.04207316e-01,   1.80940903e-01,   2.79882144e-01, ...,
           7.240

In [13]:
w2v.save(word_embedding_path)