# !!UNDER CONSTRUCTION!!

Implementation of word2vec (skip-gram).

Tutorial: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

# Load Libraries

In [1]:
import numpy as np
import _pickle as cPickle
import random
import theano
import theano.tensor as T

# Declare varibles

In [2]:
corpus_path = '../datasets/ptwiki-20170820-sentences.pickle'
word_embedding_path = '../datasets/ptwiki-20170820-embedding.pickle'

# word2vec

## Hyperparameters

In [3]:
# Corpus
SAMPLING_RATE = 1e-3

# Word2Vec
EMBEDDING_SIZE = 10

# Training
CONTEXT_SIZE = 5
LEARNING_RATE = 0.3
ITERATIONS = 1

## Corpus class

Corpus dataset must be preprocessed by preprocessing.ipynb before being load

In [4]:
class Corpus(object):
    
    def __init__(self, corpus_path, sampling_rate):
        # Rate for decrease words
        self.sampling_rate = sampling_rate
        
        # Load Corpus
        with open(corpus_path, 'rb') as fp:
            self.__indices = cPickle.load(fp) 
            self.__indice_freq = cPickle.load(fp)
            self.__raw_sentences = cPickle.load(fp)
            
    def tokens_size(self):
        if hasattr(self, "__tokens_size") and self.__tokens_size:
            return self.__tokens_size
        
        self.__tokens_size = len(self.__indices)
        return self.__tokens_size
    
    def words_size(self):
        if hasattr(self, "__words_size") and self.__words_size:
            return self.__words_size
        
        self.__word_size = sum(self.__indice_freq.values())
        return self.__word_size

    def rejection_probability(self):
        if hasattr(self, '__rejection_probability') and self.__rejection_probability:
            return self.__reject_prob

        n_words = self.words_size()
        n_tokens = self.tokens_size()
        rejection_probability = np.zeros(n_tokens)
        for i in range(n_tokens):
            density = self.__indice_freq[i]/(1.0 * n_words)
            
            # Calculate rejection probability
            rejection_probability[i] = 1 - (np.sqrt(density/self.sampling_rate) + 1) * (self.sampling_rate/density)

        self.__rejection_probability = rejection_probability
        return self.__rejection_probability

    def sentences(self):
        if hasattr(self, "__sentences") and self.__sentences:
            return self.__sentences
        
        rejection_probability = self.rejection_probability()
        sentences = [[word for word in sentence
                      if 0 >= rejection_probability[word]
                      or random.random() >= rejection_probability[word]]
                     for sentence in self.__raw_sentences]

        sentences = [sentence for sentence in sentences
                     if len(sentence) > 1]
        
        self.__sentences = sentences        
        return self.__sentences

    def contexts(self, C=5):
        for sentence in self.sentences():
            for center_idx, center_word in enumerate(sentence):
                # Get current context
                context = self.__get_context(sentence, center_idx, center_word, C)

                # Return current context
                yield center_word, context
                
    def random_contexts(self, size, C=5):
        sentences = self.sentences()
        for _ in range(size):
            # Get random sentence
            sentence_idx = random.randint(0, len(sentences) - 1)
            sentence = sentences[sentence_idx]

            # Get random center word
            center_idx = random.randint(0, len(sentence) - 1)
            center_word = sentence[center_idx]

            # Get current context
            context = self.__get_context(sentence, center_idx, center_word, C)
            
            # Return current context
            yield center_word, context
            
    def __get_context(self, sentence, center_idx, center_word, C=5):
        # Get previous words
        context = sentence[max(0, center_idx - C):center_idx]

        # Get future words
        if center_idx + 1 < len(sentence):
            context += sentence[center_idx+1:min(len(sentence), center_idx + C + 1)]

        # Remove duplicate center word
        context = [word for word in context if word is not center_word]
        
        return context

## Word2Vec Skip-gram class

In [5]:
class Word2Vec(object):
    
    def __init__(self,
                 corpus,
                 embedding_size=10):
        
        self.corpus = corpus
        self.embedding_size = embedding_size

        # Initializing network parameters
        self.W_in_values = np.asarray((np.random.rand(corpus.tokens_size(), embedding_size) - 0.5) / embedding_size,
                                      dtype=theano.config.floatX)

        self.W_out_values = np.asarray(np.zeros((corpus.tokens_size(), embedding_size)),
                                      dtype=theano.config.floatX)

        # Declaring theano parameters
        # Embedding variables
        self.W_in = theano.shared(
            value=self.W_in_values,
            name='W_in',
            borrow=True
        )

        self.W_out = theano.shared(
            value=self.W_out_values,
            name='W_out',
            borrow=True
        )

        # Context variables
        self.context = T.ivector('context')
        self.target = T.lscalar('target')
    
        # Learning variables
        self.learning_rate = T.scalar('learning_rate')
        
        # Get training model
        self.train_model = self.__train_one_context()

    def __train_one_context(self):     
        # Change context
        target_embedding = self.W_in[self.target]

        # Apply Softmax in the output layer
        estimated = T.nnet.softmax(T.dot(target_embedding, self.W_out.T)).dimshuffle(1, 0)

        # Compute cost
        cost = -T.log(T.prod(estimated[self.context]))
        
        # Compute gradient
        grad_in, grad_out = T.grad(cost, [target_embedding, self.W_out])

        # Zip updates
        updates = [(self.W_in, T.inc_subtensor(target_embedding, - self.learning_rate * grad_in)),
                   (self.W_out, self.W_out - self.learning_rate * grad_out)]
        
        # Create theano training function
        train_model = theano.function(
            inputs=[self.context,
                    self.target,
                    self.learning_rate],
            outputs=cost,
            updates=updates
        )
        
        return train_model

    def train(self,
              window_size=5,
              learning_rate=0.3,
              iterations=3,
              anneal_every=100000,
              print_every=5000):
        
        print('Start Training')

        batch_cost = 0
        for it in range(1, iterations + 1):
            for context_it, (center_word, context) in enumerate(self.corpus.contexts(window_size)):
                # Train for one context
                batch_cost += self.train_model(context, center_word, learning_rate)

                # Update learning rate
                if context_it % anneal_every == 0:
                    learning_rate *= 0.5 
                
                # Print temp results
                if context_it % print_every == 0:
                    print('Iteration:{}, Batch Cost {}'.format(context_it, batch_cost/print_every))
                    batch_cost = 0
        return batch_cost

# Experiments

## Load corpus

In [6]:
corpus = Corpus(corpus_path, SAMPLING_RATE)

## Run

In [7]:
word2vec = Word2Vec(corpus, EMBEDDING_SIZE)
%time word2vec.train(window_size=CONTEXT_SIZE, \
                     learning_rate=LEARNING_RATE, \
                     iterations=ITERATIONS, \
                     anneal_every=300000, \
                     print_every=5000)

Start Training
Iteration:0, Batch Cost 0.010333320017092586
Iteration:5000, Batch Cost 72.35096421260462
Iteration:10000, Batch Cost 66.14743424628156
Iteration:15000, Batch Cost 67.40848338842814
Iteration:20000, Batch Cost 65.8312200755106
Iteration:25000, Batch Cost 60.87439005756332
Iteration:30000, Batch Cost 57.90421018154122
Iteration:35000, Batch Cost 63.09532227564259
Iteration:40000, Batch Cost 61.58319123818359
Iteration:45000, Batch Cost 59.63815635451173
Iteration:50000, Batch Cost 60.09757443171851
Iteration:55000, Batch Cost 59.588023432863295
Iteration:60000, Batch Cost 62.342149746551286
Iteration:65000, Batch Cost 59.619117512962035
Iteration:70000, Batch Cost 59.109150877062405
Iteration:75000, Batch Cost 61.152419667359084
Iteration:80000, Batch Cost 58.52519684211259
Iteration:85000, Batch Cost 59.73553496750774
Iteration:90000, Batch Cost 56.96452115615496
Iteration:95000, Batch Cost 60.236553756923904
Iteration:100000, Batch Cost 57.86302179397791
Iteration:10500

36081.497490077258