# !!UNDER CONSTRUCTION!!

Implementation of word2vec (skip-gram).

Tutorial: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

# Load Libraries

In [1]:
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
#from theano.tensor.shared_randomstreams import RandomStreams

import numpy as np
import _pickle as cPickle
import random
import theano
import theano.tensor as T

# Declare varibles

In [2]:
corpus_path = '../datasets/ptwiki-20170820-sentences.pickle'
word_embedding_path = '../datasets/ptwiki-20170820-embedding.pickle'

# Corpus

Corpus dataset must be preprocessed by preprocessing.ipynb before being load

In [3]:
class Corpus(object):
    
    def __init__(self, corpus_path, sampling_rate):
        # Rate for decrease words
        self.sampling_rate = sampling_rate
        
        # Load Corpus
        with open(corpus_path, 'rb') as fp:
            self.__indices = cPickle.load(fp) 
            self.__indice_freq = cPickle.load(fp)
            self.__raw_sentences = cPickle.load(fp)
            
    def tokens_size(self):
        if hasattr(self, "__tokens_size") and self.__tokens_size:
            return self.__tokens_size
        
        self.__tokens_size = len(self.__indices)
        return self.__tokens_size
    
    def words_size(self):
        if hasattr(self, "__words_size") and self.__words_size:
            return self.__words_size
        
        self.__word_size = sum(self.__indice_freq.values())
        return self.__word_size

    def indices(self):
        return self.__indices.copy()
    
    def frequencies(self):
        return self.__indice_freq.copy()
    
    def rejection_probability(self):
        if hasattr(self, '__rejection_probability') and self.__rejection_probability:
            return self.__reject_prob

        n_words = self.words_size()
        n_tokens = self.tokens_size()
        rejection_probability = np.zeros(n_tokens)
        for i in range(n_tokens):
            density = self.__indice_freq[i]/(1.0 * n_words)
            
            # Calculate rejection probability
            rejection_probability[i] = 1 - (np.sqrt(density/self.sampling_rate) + 1) * (self.sampling_rate/density)

        self.__rejection_probability = rejection_probability
        return self.__rejection_probability

    def sentences(self):
        if hasattr(self, "__sentences") and self.__sentences:
            return self.__sentences
        
        rejection_probability = self.rejection_probability()
        sentences = [[word for word in sentence
                      if 0 >= rejection_probability[word]
                      or random.random() >= rejection_probability[word]]
                     for sentence in self.__raw_sentences]

        sentences = [sentence for sentence in sentences
                     if len(sentence) > 1]
        
        self.__sentences = sentences        
        return self.__sentences

    def contexts(self, C=5):
        for sentence in self.sentences():
            for center_idx, center_word in enumerate(sentence):
                # Get current context
                context = self.__get_context(sentence, center_idx, center_word, C)

                # Return current context
                yield center_word, context
                
    def random_contexts(self, size, C=5):
        sentences = self.sentences()
        for _ in range(size):
            # Get random sentence
            sentence_idx = random.randint(0, len(sentences) - 1)
            sentence = sentences[sentence_idx]

            # Get random center word
            center_idx = random.randint(0, len(sentence) - 1)
            center_word = sentence[center_idx]

            # Get current context
            context = self.__get_context(sentence, center_idx, center_word, C)
            
            # Return current context
            yield center_word, context
            
    def __get_context(self, sentence, center_idx, center_word, C=5):
        # Get previous words
        context = sentence[max(0, center_idx - C):center_idx]

        # Get future words
        if center_idx + 1 < len(sentence):
            context += sentence[center_idx+1:min(len(sentence), center_idx + C + 1)]

        # Remove duplicate center word
        context = [word for word in context if word is not center_word]
        
        return context

# Unigram table

In [4]:
class UnigramTable(object):
    
    def __init__(self, counts):
        power = 0.75
        
        # Normalizing constants
        norm = sum([np.power(count, power) for count in counts.values()])
        
        # table_size should be big enough so that the minimum probability for a word * table_size >= 1.
        # Also,  table_size must be hardcoded as a counter-measure for the case of the minimum probability
        # be a extremely low value, what would burst our memory
        table_size = int(1e8)
        table = np.zeros(table_size, dtype=np.int32)

        # Cumulative probability
        cum_probability = 0
        
        i = 0
        for word, count in counts.items():
            cum_probability += np.power(count, power)/norm
            # fill the table until reach the cumulative probability
            while i < table_size and i / table_size < cum_probability:
                table[i] = word
                i += 1

        self.__table = table         
        self.__table_size = table_size

    def sample(self, k):        
        indices = np.random.randint(low=0, high=self.__table_size, size=k)
        return self.__table[indices]

# Word2vec

## Skip-gram

In [13]:
class Word2Vec(object):
    
    def __init__(self,
                 corpus,
                 embedding_size=10,
                 unigram_table=None):
        
        self.corpus = corpus
        self.embedding_size = embedding_size
        self.unigram_table = unigram_table

        # Initializing network parameters
        self.W_in_values = np.asarray((np.random.rand(corpus.tokens_size(), embedding_size) - 0.5) / embedding_size,
                                      dtype=theano.config.floatX)

        self.W_out_values = np.asarray(np.zeros((corpus.tokens_size(), embedding_size)),
                                      dtype=theano.config.floatX)

        # Declaring theano parameters
        # Embedding variables
        self.W_in = theano.shared(
            value=self.W_in_values,
            name='W_in',
            borrow=True
        )

        self.W_out = theano.shared(
            value=self.W_out_values,
            name='W_out',
            borrow=True
        )
        
        # Get training model
        self.train_model = self.__train_model()
        #self.train_model.trust_input = True

    def __train_model(self):
        # Input variables
        context = T.ivector('context')
        target = T.lscalar('target')
        learning_rate = T.scalar('learning_rate')
        negative_samples = T.ivector('negative_samples')
        
        # Change context
        target_embedding = self.W_in[target]
        
        # Negative Sampling
        context_size = context.size
        all_samples = T.concatenate([context, negative_samples], axis=0)
        W_out_sampled = self.W_out[all_samples]
        #estimated = T.nnet.softmax(T.dot(target_embedding, W_out_sampled.T)).dimshuffle(1, 0)
        #cost = -T.log(T.prod(estimated[negative_word_size:]))
        
        # Compute cost
        context_embedding = W_out_sampled[:context_size]
        positive_cost = T.log(T.nnet.sigmoid(T.dot(target_embedding, context_embedding.T)))
        
        negative_embedding = W_out_sampled[context_size:]
        negative_cost = T.log(T.nnet.sigmoid(-T.dot(target_embedding, negative_embedding.T)))
        
        cost = -T.sum(positive_cost) - context.size * T.sum(negative_cost)
        
        # Compute gradient        
        grad_in, grad_out = T.grad(cost, [target_embedding, W_out_sampled])
        
        # Zip updates
        updates = [(self.W_in, T.inc_subtensor(target_embedding, - learning_rate * grad_in)),
                   (self.W_out, T.inc_subtensor(W_out_sampled, - learning_rate * grad_out))]
        
        # Create theano training function
        train_model = theano.function(
            inputs=[context,
                    target,
                    learning_rate,
                    negative_samples],
            outputs=cost,
            updates=updates,
            profile=True
        )
        
        return train_model

    def train(self,
              window_size=5,
              learning_rate=0.3,
              negative_word_size=5,
              iterations=3,
              anneal_every=100000,
              print_every=5000):
        
        print('Start Training')

        batch_cost = 0
        for it in range(1, iterations + 1):
            for context_it, (center_word, context) in enumerate(self.corpus.contexts(window_size)):
                # Get negative samples
                negative_samples = self.unigram_table.sample(len(context) * negative_word_size)
                
                # Train for one context
                batch_cost += self.train_model(context, 
                                               center_word,
                                               learning_rate,
                                               negative_samples)

                # Update learning rate
                if context_it % anneal_every == 0:
                    learning_rate *= 0.5 
                
                # Print temp results
                if context_it % print_every == 0:
                    print('Iteration:{}, Batch Cost {}'.format(context_it, batch_cost/print_every))
                    batch_cost = 0
        self.train_model.profile.summary()
        return batch_cost

# Experiments

## Hyperparameters

In [6]:
# Corpus
SAMPLING_RATE = 1e-3

# Word2Vec
EMBEDDING_SIZE = 10

# Training
CONTEXT_SIZE = 5
LEARNING_RATE = 0.3
ITERATIONS = 1

## Load corpus

In [7]:
corpus = Corpus(corpus_path, SAMPLING_RATE)

## Process unigram table

In [8]:
unigram_table = UnigramTable(corpus.frequencies())

## Run

In [15]:
word2vec = Word2Vec(corpus, EMBEDDING_SIZE, unigram_table)
%time word2vec.train(window_size=CONTEXT_SIZE, \
                     learning_rate=0.01, \
                     iterations=ITERATIONS, \
                     anneal_every=500000, \
                     print_every=20000)

Start Training
Iteration:0, Batch Cost 0.004505456673639643
Iteration:20000, Batch Cost 229.2260423119504
Iteration:40000, Batch Cost 157.80215013877395
Iteration:60000, Batch Cost 129.01639070558028
Iteration:80000, Batch Cost 112.42254127719025
Iteration:100000, Batch Cost 100.23887824936595
Iteration:120000, Batch Cost 92.04496733918913
Iteration:140000, Batch Cost 86.1580467416261
Iteration:160000, Batch Cost 84.69190342980157
Iteration:180000, Batch Cost 87.91489210842165
Iteration:200000, Batch Cost 79.55074129875067
Iteration:220000, Batch Cost 77.20759713950791
Iteration:240000, Batch Cost 71.16693509685437
Iteration:260000, Batch Cost 65.93990245746542
Iteration:280000, Batch Cost 69.9293936171564
Iteration:300000, Batch Cost 69.57799003345406
Iteration:320000, Batch Cost 69.84036817605131
Iteration:340000, Batch Cost 62.848156999011046
Iteration:360000, Batch Cost 59.21070974791085
Iteration:380000, Batch Cost 61.80317290532567
Iteration:400000, Batch Cost 59.43252686706911
I

Function profiling
  Message: <ipython-input-13-b94fd323cc7e>:78
  Time in 1717728 calls to Function.__call__: 3.063980e+02s
  Time in Function.fn.__call__: 1.268399e+02s (41.397%)
  Time in thunks: 9.483382e+01s (30.951%)
  Total compile time: 3.314476e-01s
    Number of Apply nodes: 43
    Theano Optimizer time: 2.723794e-01s
       Theano validate time: 1.350689e-02s
    Theano Linker time (includes C, CUDA code generation/compiling): 3.077126e-02s
       Import time 0.000000e+00s
       Node make_thunk time 2.707410e-02s
           Node Elemwise{Composite{(i0 - (i1 * i2 * i3))}}[(0, 0)](Sum{acc_dtype=float64}.0, TensorConstant{-1.0}, Shape_i{0}.0, Sum{acc_dtype=float64}.0) time 1.025438e-03s
           Node Elemwise{Composite{(i0 * (i1 - scalar_sigmoid((-i2))))}}[(0, 2)](InplaceDimShuffle{x}.0, TensorConstant{(1,) of 1.0}, CGemv{inplace}.0) time 9.849072e-04s
           Node Elemwise{Composite{(Switch(LT(i0, i1), i0, i1) - i2)}}(Shape_i{0}.0, Elemwise{Add}[(0, 1)].0, TensorConstant

912143.56106027972