# !!UNDER CONSTRUCTION!!

Implementation of word2vec (skip-gram).

Tutorial: http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/

# Load Libraries

In [1]:
import numpy as np
import _pickle as cPickle
import random
import theano
import theano.tensor as T

# Declare varibles

In [2]:
corpus_path = '../datasets/ptwiki-20170801-sentences.pickle'
word_embedding_path = '../datasets/ptwiki-20170801-embedding.pickle'

# word2vec

## Hyperparameters

In [3]:
CONTEXT_SIZE = 10
LEARNING_RATE = 0.01
SAMPLING_RATE = 1e-3
NEGATIVE_SAMPLE_SIZE = 20
EMBEDDING_SIZE = 10
BATCH_SIZE = 50

## Auxiliar class

Corpus dataset must be preprocessed by preprocessing.ipynb before being load

In [4]:
class Corpus(object):
    
    def __init__(self, corpus_path):
        with open(corpus_path, 'rb') as fp:
            self.__indices = cPickle.load(fp) 
            self.__indice_freq = cPickle.load(fp)
            self.__raw_sentences = cPickle.load(fp)
            
    def tokens_size(self):
        if hasattr(self, "__tokens_size") and self.__tokens_size:
            return self.__tokens_size
        
        self.__tokens_size = len(self.__indices)
        return self.__tokens_size
    
    def words_size(self):
        if hasattr(self, "__words_size") and self.__words_size:
            return self.__words_size
        
        self.__word_size = sum(self.__indice_freq.values())
        return self.__word_size

    def rejection_probability(self):
        if hasattr(self, '__rejection_probability') and self.__rejection_probability:
            return self.__reject_prob

        n_words = self.words_size()
        n_tokens = self.tokens_size()
        rejection_probability = np.zeros(n_tokens)
        for i in range(n_tokens):
            density = self.__indice_freq[i]/(1.0 * n_words)
            
            # Calculate rejection probability
            rejection_probability[i] = 1 - (np.sqrt(density/SAMPLING_RATE) + 1) * (SAMPLING_RATE/density)

        self.__rejection_probability = rejection_probability
        return self.__rejection_probability

    def sentences(self):
        if hasattr(self, "__sentences") and self.__sentences:
            return self.__sentences
        
        rejection_probability = self.rejection_probability()
        sentences = [[word for word in sentence
                      if 0 >= rejection_probability[word]
                      or random.random() >= rejection_probability[word]]
                     for sentence in self.__raw_sentences]

        sentences = [sentence for sentence in sentences
                     if len(sentence) > 1]
        
        self.__sentences = sentences        
        return self.__sentences

    def contexts(self, C=5):
        for sentence in self.sentences():
            for center_idx, center_word in enumerate(sentence):
                # Get current context
                context = self.__get_context(sentence, center_idx, center_word, C)

                # Return current context
                yield center_word, context
                
    def random_contexts(self, size, C=5):
        sentences = self.sentences()
        for _ in range(size):
            # Get random sentence
            sentence_idx = random.randint(0, len(sentences) - 1)
            sentence = sentences[sentence_idx]

            # Get random center word
            center_idx = random.randint(0, len(sentence) - 1)
            center_word = sentence[center_idx]

            # Get current context
            context = self.__get_context(sentence, center_idx, center_word, C)
            
            # Return current context
            yield center_word, context
            
    def __get_context(self, sentence, center_idx, center_word, C=5):
        # Get previous words
        context = sentence[max(0, center_idx - C):center_idx]

        # Get future words
        if center_idx + 1 < len(sentence):
            context += sentence[center_idx+1:min(len(sentence), center_idx + C + 1)]

        # Remove duplicate center word
        context = [word for word in context if word is not center_word]
        
        return context

In [5]:
corpus = Corpus(corpus_path)

## Algorithm

## More Theano Friendly approach (TEMP)

In [6]:
# Initializing network parameters
W_in_values = np.asarray((np.random.rand(corpus.tokens_size(), EMBEDDING_SIZE) - 0.5) / EMBEDDING_SIZE,
                              dtype=theano.config.floatX)

W_out_values = np.asarray(np.zeros((corpus.tokens_size(), EMBEDDING_SIZE)),
                              dtype=theano.config.floatX)

# Declaring theano parameters
W_in = theano.shared(
    value=W_in_values,
    name='W_in',
    borrow=True
)

W_out = theano.shared(
    value=W_out_values,
    name='W_out',
    borrow=True
)

context = T.ivector('context')
target = T.lscalar('target')

# Change context
target_embedding = W_in[target]
#context_embedding = W_out[context]

# Apply Softmax in the output layer
estimated = T.nnet.softmax(T.dot(target_embedding, W_out.T)).dimshuffle(1, 0)

# Compute cost - Ignore for now
cost = T.mean(-T.log(estimated[context]))

# Expected window answer
expected = T.zeros_like(estimated)
expected = T.set_subtensor(expected[context], 1)

# Compute window error
z = context.size * estimated - expected

# Compute gradient descent
grad_in = T.dot(W_out.T, z)
grad_out = T.outer(target_embedding, z)

# Zip updates
updates = [(W_in, T.inc_subtensor(target_embedding, - LEARNING_RATE * grad_in.flatten())),
           (W_out, W_out - LEARNING_RATE * grad_out.T)]

# Create theano training function
train_model = theano.function(
    inputs=[context, target],
    outputs=cost,
    updates=updates
)

def train(window_size=5,
          iterations=3,
          anneal_every=20000):

    print('Start Training')

    batch_cost = 0
    for it in range(1, iterations + 1):
        for context_it, (center_word, context) in enumerate(corpus.contexts(window_size)):
            # Compute cost and gradient
            batch_cost += train_model(context, center_word)

            # Print temp results
            if context_it % 5000 == 0:
                print('Iteration:{}, Batch Cost {}'.format(context_it, batch_cost/5000))
                batch_cost = 0
    return batch_cost

In [7]:
%time train(iterations=1)

Start Training
Iteration:0, Batch Cost 0.002066631469868222
Iteration:5000, Batch Cost 10.333147289538777
Iteration:10000, Batch Cost 10.333115980573407
Iteration:15000, Batch Cost 10.333128568849316
Iteration:20000, Batch Cost 10.333066871533237
Iteration:25000, Batch Cost 10.33253621218946
Iteration:30000, Batch Cost 10.327257717084972
Iteration:35000, Batch Cost 10.324505932685131
Iteration:40000, Batch Cost 10.300455036111854
Iteration:45000, Batch Cost 10.217913381157622
Iteration:50000, Batch Cost 10.017710954185812
Iteration:55000, Batch Cost 9.715153071232994
Iteration:60000, Batch Cost 9.908575562966258
Iteration:65000, Batch Cost 9.825084713119887
Iteration:70000, Batch Cost 9.695588510434115
Iteration:75000, Batch Cost 9.625679916139834
Iteration:80000, Batch Cost 9.46121361417896
Iteration:85000, Batch Cost 9.412031387162484
Iteration:90000, Batch Cost 9.15905895273066
Iteration:95000, Batch Cost 9.571417364535451
Iteration:100000, Batch Cost 9.42297241775321
Iteration:1050

6018.4366419778607