# !!UNDER CONSTRUCTION!!

Implementation of word2vec (skip-gram).

Tutorial: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

# Load Libraries

In [1]:
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
#from theano.tensor.shared_randomstreams import RandomStreams

import numpy as np
import _pickle as cPickle
import random
import theano
import theano.tensor as T

# Declare variables

In [2]:
# Both files share the same dump-specific stem inside the datasets folder.
_dataset_stem = '../datasets/ptwiki-20170820'

# Input: pickled corpus read by Corpus.
corpus_path = _dataset_stem + '-sentences.pickle'
# Output: destination passed to Word2Vec.save.
word_embedding_path = _dataset_stem + '-embedding.pickle'

# Corpus

The corpus dataset must be preprocessed by preprocessing.ipynb before being loaded.

In [3]:
class Corpus(object):
    """Pickled corpus with frequent-word subsampling and context generation.

    The pickle file is expected to hold three consecutive objects, read in
    this order: the token indices container, a mapping from word index to
    its frequency, and the raw sentences (lists of word indices).

    Derived values (sizes, rejection probabilities, subsampled sentences)
    are computed lazily on first use and then cached. In particular the
    random subsampling performed by ``sentences()`` is drawn once and the
    same subsampled corpus is reused by every later call.
    """

    def __init__(self, corpus_path, sampling_rate):
        """Load the corpus stored at ``corpus_path``.

        Args:
            corpus_path: path of the pickle file to read.
            sampling_rate: subsampling threshold; words whose relative
                frequency is well above it are randomly dropped.
        """
        # Rate used to decrease the number of frequent words
        self.sampling_rate = sampling_rate

        # Load corpus.
        # NOTE(review): unpickling executes arbitrary code; only load
        # files produced by a trusted preprocessing step.
        with open(corpus_path, 'rb') as fp:
            self.__indices = cPickle.load(fp)
            self.__indice_freq = cPickle.load(fp)
            self.__raw_sentences = cPickle.load(fp)

        # Lazily filled caches. ``None`` means "not computed yet"; testing
        # with hasattr() on the unmangled name can never succeed for
        # double-underscore attributes, so explicit sentinels are used.
        self.__tokens_size = None
        self.__words_size = None
        self.__rejection_probability = None
        self.__sentences = None

    def tokens_size(self):
        """Return the number of distinct tokens (size of the indices container)."""
        if self.__tokens_size is None:
            self.__tokens_size = len(self.__indices)
        return self.__tokens_size

    def words_size(self):
        """Return the total number of word occurrences (sum of all frequencies)."""
        if self.__words_size is None:
            self.__words_size = sum(self.__indice_freq.values())
        return self.__words_size

    def indices(self):
        """Return a shallow copy of the token indices container."""
        return self.__indices.copy()

    def frequencies(self):
        """Return a shallow copy of the word index -> frequency mapping."""
        return self.__indice_freq.copy()

    def rejection_probability(self):
        """Return an array with the probability of dropping each word index.

        For a word with relative frequency ``z`` and sampling rate ``s`` the
        value is ``1 - (sqrt(z / s) + 1) * (s / z)``. Values <= 0 mean the
        word is always kept. Words with a non-positive frequency get 0
        instead of raising a division-by-zero error.
        """
        if self.__rejection_probability is None:
            n_words = self.words_size()
            n_tokens = self.tokens_size()
            rejection_probability = np.zeros(n_tokens)
            for i in range(n_tokens):
                frequency = self.__indice_freq[i]
                if frequency <= 0:
                    # Word never occurs: nothing to subsample.
                    continue

                density = frequency / (1.0 * n_words)
                keep_probability = ((np.sqrt(density / self.sampling_rate) + 1)
                                    * (self.sampling_rate / density))
                rejection_probability[i] = 1 - keep_probability

            self.__rejection_probability = rejection_probability
        return self.__rejection_probability

    def sentences(self):
        """Return the subsampled sentences, keeping only those with > 1 word.

        The subsampling is random; it is performed on the first call and
        the result is cached.
        """
        if self.__sentences is None:
            rejection_probability = self.rejection_probability()
            subsampled = [[word for word in sentence
                           if 0 >= rejection_probability[word]
                           or random.random() >= rejection_probability[word]]
                          for sentence in self.__raw_sentences]

            # A sentence needs at least two words to provide a context
            self.__sentences = [sentence for sentence in subsampled
                                if len(sentence) > 1]
        return self.__sentences

    def contexts(self, C=5):
        """Yield ``(center_word, context)`` for every word of every sentence.

        Args:
            C: maximum number of words taken on each side of the center.
        """
        for sentence in self.sentences():
            for center_idx, center_word in enumerate(sentence):
                yield center_word, self.__get_context(sentence, center_idx, center_word, C)

    def random_contexts(self, size, C=5):
        """Yield ``size`` pairs ``(center_word, context)`` drawn at random.

        Args:
            size: number of pairs to generate.
            C: maximum number of words taken on each side of the center.
        """
        sentences = self.sentences()
        for _ in range(size):
            # Get random sentence
            sentence = sentences[random.randint(0, len(sentences) - 1)]

            # Get random center word
            center_idx = random.randint(0, len(sentence) - 1)
            center_word = sentence[center_idx]

            yield center_word, self.__get_context(sentence, center_idx, center_word, C)

    def __get_context(self, sentence, center_idx, center_word, C=5):
        """Return up to ``C`` words on each side of ``center_idx``.

        Occurrences of the center word itself are removed from the window.
        """
        # Slices clamp to the sentence bounds, so only the left edge needs care
        window = (sentence[max(0, center_idx - C):center_idx]
                  + sentence[center_idx + 1:center_idx + C + 1])

        # Compare by value: identity ("is") is unreliable for integers
        return [word for word in window if word != center_word]

# Unigram table

In [4]:
class UnigramTable(object):
    """Lookup table for drawing words proportionally to ``count ** power``.

    Each word receives a contiguous run of table slots whose length is
    proportional to its smoothed probability, so picking a uniformly random
    slot samples a word from the smoothed unigram distribution.
    """

    def __init__(self, counts, table_size=int(1e8), power=0.75):
        """Build the table.

        Args:
            counts: mapping from integer word index to its count.
            table_size: number of slots. It should be big enough so that
                (minimum word probability * table_size) >= 1, but is kept
                explicit because deriving it from an extremely small
                probability could exhaust memory.
            power: exponent applied to the counts before normalizing.

        Raises:
            ValueError: if ``table_size`` is not positive or ``counts``
                does not contain any positive total weight.
        """
        table_size = int(table_size)
        if table_size <= 0:
            raise ValueError('table_size must be a positive integer')

        # Keep words and weights aligned by position, so the result does not
        # depend on the mapping being ordered by word index.
        words = np.array(list(counts.keys()), dtype=np.int32)
        weights = np.power(np.array(list(counts.values()), dtype=np.float64), power)

        total = weights.sum() if weights.size else 0.0
        if not total > 0:
            raise ValueError('counts must contain at least one word with a positive count')

        # Cumulative probability reached after each word
        cumulative = np.cumsum(weights / total)

        # A word owns every slot i with i / table_size < cumulative probability,
        # i.e. slots up to ceil(cumulative * table_size).
        boundaries = np.minimum(np.ceil(cumulative * table_size), table_size).astype(np.int64)
        # Rounding can leave the final cumulative sum slightly below 1; give the
        # remaining slots to the last word instead of leaving them at word 0.
        boundaries[-1] = table_size

        slots_per_word = np.diff(np.concatenate(([0], boundaries)))

        self.__table = np.repeat(words, slots_per_word)
        self.__table_size = table_size

    def sample(self, k):
        """Return an array with ``k`` word indices drawn from the table."""
        indices = np.random.randint(low=0, high=self.__table_size, size=k)
        return self.__table[indices]

# Word2vec

## Skip-gram

In [18]:
class Word2Vec(object):
    """Skip-gram word2vec trained with negative sampling on Theano.

    Two embedding matrices of shape (corpus.tokens_size(), embedding_size)
    are kept as Theano shared variables: ``W_in`` (center words) and
    ``W_out`` (context / negative words).
    """

    def __init__(self,
                 corpus,
                 embedding_size=10,
                 unigram_table=None):
        """Create the parameters and compile the training function.

        Args:
            corpus: object providing ``tokens_size()`` and ``contexts(C)``.
            embedding_size: dimensionality of the word vectors.
            unigram_table: object providing ``sample(k)``; required by
                ``train`` to draw negative samples.
        """
        self.corpus = corpus
        self.embedding_size = embedding_size
        self.unigram_table = unigram_table

        # Initializing network parameters: small uniform values for the
        # input embeddings, zeros for the output embeddings
        self.W_in_values = np.asarray((np.random.rand(corpus.tokens_size(), embedding_size) - 0.5) / embedding_size,
                                      dtype=theano.config.floatX)

        self.W_out_values = np.asarray(np.zeros((corpus.tokens_size(), embedding_size)),
                                       dtype=theano.config.floatX)

        # Declaring theano parameters
        # Embedding variables
        self.W_in = theano.shared(
            value=self.W_in_values,
            name='W_in',
            borrow=True
        )

        self.W_out = theano.shared(
            value=self.W_out_values,
            name='W_out',
            borrow=True
        )

        # Get training model
        self.train_model = self.__train_model()

    def __train_model(self):
        """Compile and return the Theano function performing one SGD step.

        The function takes (target_words, context_words, in_corpus,
        learning_rate), where ``in_corpus`` is 1 for a real (target, context)
        pair and 0 for a negative sample, and returns the summed cost.
        """
        # Input variables
        target_words = T.ivector('target_words')
        context_words = T.ivector('context_words')
        in_corpus = T.ivector('in_corpus')
        learning_rate = T.scalar('learning_rate')

        # Prepare word embeddings
        target_embedding = self.W_in[target_words]
        context_embedding = self.W_out[context_words]

        # Compute cost: log-sigmoid of the dot product for real pairs and
        # of the negated dot product for negative samples
        positive_cost = in_corpus * T.log(T.nnet.sigmoid(T.sum(target_embedding * context_embedding, axis=1)))
        negative_cost = (1 - in_corpus) * T.log(T.nnet.sigmoid(-T.sum(target_embedding * context_embedding, axis=1)))
        cost = -T.sum(positive_cost + negative_cost)

        # Compute gradient only with respect to the rows actually used
        grad_in, grad_out = T.grad(cost, [target_embedding, context_embedding])

        # Zip updates
        updates = [(self.W_in, T.inc_subtensor(target_embedding, - learning_rate * grad_in)),
                   (self.W_out, T.inc_subtensor(context_embedding, - learning_rate * grad_out))]

        # Create theano training function
        train_model = theano.function(
            inputs=[target_words,
                    context_words,
                    in_corpus,
                    learning_rate],
            outputs=cost,
            updates=updates,
            profile=True
        )

        return train_model

    def train(self,
              window_size=5,
              negative_sample_size=5,
              learning_rate=0.3,
              batch_size=100,
              anneal_every=100000,
              print_every=5000):
        """Run one pass over ``corpus.contexts(window_size)``.

        Args:
            window_size: context window radius passed to the corpus.
            negative_sample_size: negative samples drawn per context word.
            learning_rate: initial SGD step size.
            batch_size: number of contexts gathered before each update.
            anneal_every: the learning rate is halved when the number of
                processed contexts is a multiple of this value (checked
                only at batch boundaries).
            print_every: the mean cost is printed when the number of
                processed contexts is a multiple of this value (checked
                only at batch boundaries).

        Returns:
            The cost accumulated since the last printed report.

        Raises:
            ValueError: if the model was created without a unigram table.
        """
        if self.unigram_table is None:
            raise ValueError('a unigram_table is required to draw negative samples')

        print('Start Training')

        # Batch variables
        center_words = []
        contexts = []
        in_corpus = []

        batch_cost = 0
        for it, (center_word, context) in enumerate(self.corpus.contexts(window_size)):
            # Define constants
            context_size = len(context)
            total_negative_sample_size = context_size * negative_sample_size
            center_word_size = context_size + total_negative_sample_size

            # Generate negative sample
            negative_samples = self.unigram_table.sample(total_negative_sample_size)

            # Increment batch: real pairs first (label 1), negatives after (label 0)
            center_words += center_word_size * [center_word]
            contexts += (context + negative_samples.tolist())
            in_corpus += (context_size * [1] + total_negative_sample_size * [0])

            # Gathered contexts until batch size
            if (it + 1) % batch_size != 0:
                continue

            # Train for many contexts
            batch_cost += self.train_model(center_words,
                                           contexts,
                                           in_corpus,
                                           learning_rate)

            # Update learning rate
            if (it + 1) % anneal_every == 0:
                learning_rate *= 0.5

            # Print temp results
            if (it + 1) % print_every == 0:
                print('Iteration:{}, Batch Cost {}'.format(it + 1, batch_cost/print_every))
                batch_cost = 0

            # Empty batch
            center_words = []
            contexts = []
            in_corpus = []

        # Train on the trailing contexts that did not fill a whole batch
        # instead of silently discarding them
        if center_words:
            batch_cost += self.train_model(center_words,
                                           contexts,
                                           in_corpus,
                                           learning_rate)

        self.train_model.profile.summary()
        return batch_cost

    def save(self, output_path):
        """Pickle the embedding shape, ``W_in`` and ``W_out`` to ``output_path``.

        The values are read from the shared variables: the NumPy arrays
        handed to ``theano.shared(..., borrow=True)`` are not guaranteed to
        reflect the updates applied during training.
        """
        w_in = self.W_in.get_value()
        w_out = self.W_out.get_value()

        with open(output_path, 'wb') as fp:
            cPickle.dump(w_in.shape, fp)
            cPickle.dump(w_in, fp)
            cPickle.dump(w_out, fp)

# Experiments

## Hyperparameters

In [19]:
# Corpus
# --- Corpus ---
# Subsampling threshold handed to Corpus as sampling_rate
SAMPLING_RATE = 0.001

# --- Word2Vec ---
# Dimensionality of each word vector
EMBEDDING_SIZE = 15

# --- Training ---
# Window radius (passed to train as window_size)
CONTEXT_SIZE = 5
# Negative samples drawn for every context word
NEGATIVE_SAMPLE_SIZE = 20
# Contexts accumulated before each parameter update
BATCH_SIZE = 50
# Initial step size
LEARNING_RATE = 0.02
# Both intervals are counted in contexts and kept as multiples of the batch
# size, because train only checks them at batch boundaries
ANNEAL_EVERY = BATCH_SIZE * 10000
PRINT_EVERY = BATCH_SIZE * 1000

## Load corpus

In [7]:
# Read the pickled corpus and configure its subsampling threshold
corpus = Corpus(corpus_path=corpus_path, sampling_rate=SAMPLING_RATE)

## Process unigram table

In [8]:
# Build the negative-sampling table from a copy of the corpus frequencies
unigram_table = UnigramTable(counts=corpus.frequencies())

## Run

In [20]:
# Create the model (its constructor compiles the Theano training function),
# then time a single training pass over the corpus contexts with IPython's
# %time magic. The value echoed after the profile summary is what train()
# returns: the cost accumulated since the last printed report.
word2vec = Word2Vec(corpus, EMBEDDING_SIZE, unigram_table)
%time word2vec.train(window_size=CONTEXT_SIZE, \
                     negative_sample_size=NEGATIVE_SAMPLE_SIZE, \
                     batch_size=BATCH_SIZE, \
                     learning_rate=LEARNING_RATE, \
                     anneal_every=ANNEAL_EVERY, \
                     print_every=PRINT_EVERY)

Start Training
Iteration:50000, Batch Cost 77.6953120421538
Iteration:100000, Batch Cost 53.27822380097913
Iteration:150000, Batch Cost 47.090573322226945
Iteration:200000, Batch Cost 46.59455988296572
Iteration:250000, Batch Cost 42.69373597040367
Iteration:300000, Batch Cost 41.518315497228755
Iteration:350000, Batch Cost 40.598496300099775
Iteration:400000, Batch Cost 38.757498005362244
Iteration:450000, Batch Cost 38.25215613804655
Iteration:500000, Batch Cost 35.89313674360301
Iteration:550000, Batch Cost 38.36256330877373
Iteration:600000, Batch Cost 36.11787550695698
Iteration:650000, Batch Cost 36.03674661096861
Iteration:700000, Batch Cost 36.04938126064476
Iteration:750000, Batch Cost 37.18721846668412
Iteration:800000, Batch Cost 37.42902019671328
Iteration:850000, Batch Cost 37.285558788323605
Iteration:900000, Batch Cost 35.64154266347849
Iteration:950000, Batch Cost 35.1225463117843
Iteration:1000000, Batch Cost 34.753572642281014
Iteration:1050000, Batch Cost 37.30459916

Function profiling
  Message: <ipython-input-18-a34994113131>:67
  Time in 34357 calls to Function.__call__: 4.045235e+02s
  Time in Function.fn.__call__: 2.414073e+02s (59.677%)
  Time in thunks: 2.396748e+02s (59.249%)
  Total compile time: 2.480121e-01s
    Number of Apply nodes: 18
    Theano Optimizer time: 2.108345e-01s
       Theano validate time: 3.458500e-03s
    Theano Linker time (includes C, CUDA code generation/compiling): 1.510596e-02s
       Import time 0.000000e+00s
       Node make_thunk time 1.332426e-02s
           Node Elemwise{Composite{(i0 * i1 * ((i2 * i3 * i4 * i5) + (i6 * i7 * i5)))}}[(0, 5)](TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{x,x}.0, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, Elemwise{Composite{(i0 - scalar_sigmoid(i1))}}.0, AdvancedSubtensor1.0, InplaceDimShuffle{0,x}.0, Elemwise{Composite{(i0 - scalar_sigmoid((-i1)))}}.0) time 1.446486e-03s
           Node Elemwise{Composite{(i0 * i1 * ((i2 * i3 * i4 * i5) + (i6 * i7 * i5)))}}[(

630849.13161690452

In [21]:
# Persist the embedding matrices as a pickle file
word2vec.save(output_path=word_embedding_path)