<a href="https://colab.research.google.com/github/onlyabhilash/NLP-Projects/blob/main/0_Word2vec/tf_2_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [None]:
# revised version from https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/examples/04_word2vec_visualize.py
# [1] change tf.nn.nce_loss to tf.nn.sampled_softmax_loss
# [2] mkdir function
# [3] AdagradOptimizer
# [4] modify `generate_sample` to loop infinitely over the training articles

import codecs
import itertools
from collections import Counter
import random
import numpy as np
import tensorflow as tf
import os
import shutil

## Skip-Gram model definition

In [None]:
class SkipGramModel:
    """Build the TensorFlow (1.x graph-mode) graph for a skip-gram word2vec model.

    Call :meth:`build_graph` once after construction; it creates, in order,
    the input placeholders, the embedding matrix, the sampled-softmax loss,
    the Adagrad training op and the TensorBoard summaries.

    Args:
        vocab_size: number of rows of the embedding and softmax weight matrices.
        embed_size: dimension of each embedding vector.
        batch_size: fixed batch size baked into the placeholder shapes.
        num_sampled: number of classes sampled per batch by the sampled softmax.
        learning_rate: learning rate handed to the Adagrad optimizer.
    """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        # self.global_step is a counter, so it should not be trained (i.e., trainable=False)
        self.global_step = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name='global_step')

    def _create_placeholders(self):
        """ Step 1: define the placeholders for input and output """
        with tf.name_scope('input_data'):
            # center words are a flat vector, targets a column vector of labels
            self.center_words = tf.placeholder(dtype=tf.int32, shape=[self.batch_size], name='center_words')
            self.target_words = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, 1], name='target_words')

    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        with tf.name_scope('embedding'):
            initial_embeddings = tf.random_uniform(shape=[self.vocab_size, self.embed_size], minval=-1.0, maxval=1.0)
            self.embed_matrix = tf.Variable(initial_value=initial_embeddings, name='embed_matrix')

    def _create_loss(self):
        """ Step 3 + 4: define the model + the loss function """
        with tf.name_scope('loss'):
            # Step 3: define the inference
            # directly get the embedding of 'ids'
            # see https://stackoverflow.com/questions/34870614/what-does-tf-nn-embedding-lookup-function-do/41922877
            embed = tf.nn.embedding_lookup(params=self.embed_matrix, ids=self.center_words, name='embed')

            # Step 4: define loss function
            # the results showed that sampled_softmax_loss is a little better than nce_loss
            softmax_weights = tf.Variable(initial_value=tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1.0 / self.embed_size ** 0.5), name='sampled_softmax_weight')
            softmax_biases = tf.Variable(initial_value=tf.zeros([self.vocab_size]), name='sampled_softmax_bias')
            # use the instance's num_sampled (not a module-level global) so the
            # constructor argument is honoured
            per_example_loss = tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                                          labels=self.target_words, num_sampled=self.num_sampled,
                                                          num_classes=self.vocab_size, name='sampled_softmax_loss')
            self.loss = tf.reduce_mean(per_example_loss)

            # Alternative kept for reference: NCE loss instead of sampled softmax.
            # nce_weight = tf.Variable(initial_value=tf.truncated_normal(shape=[self.vocab_size, self.embed_size], stddev=1.0 / (self.embed_size ** 0.5)), name='nce_weight')
            # nce_bias = tf.Variable(initial_value=tf.zeros([self.vocab_size]), name='nce_bias')
            # self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
            #                                           biases=nce_bias,
            #                                           labels=self.target_words,
            #                                           inputs=embed,
            #                                           num_sampled=self.num_sampled,
            #                                           num_classes=self.vocab_size), name='nce_loss')

    def _create_optimizer(self):
        """ Step 5: define optimizer """
        # do not forget global_step parameter: minimize() increments it on every run
        self.optimizer = tf.train.AdagradOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)
        #self.optimizer = tf.train.GradientDescentOptimizer(self.lr).minimize(self.loss, global_step=self.global_step)

    def _create_summaries(self):
        """ Step 6: define the TensorBoard summaries of the loss """
        with tf.name_scope('summaries'):
            tf.summary.scalar(name='loss', tensor=self.loss)
            tf.summary.histogram(name='histogram_loss', values=self.loss)
            # merge all summaries into one op to make it easier to manage
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """ Build the graph for our model """
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()

## Process input and generate batch data

In [None]:
def build_vocab(all_articles, vocab_size):
    """Build the vocabulary of the `vocab_size` most frequent words.

    Index 0 is reserved for the out-of-vocabulary token 'UNK'; the remaining
    `vocab_size - 1` indices go to the most frequent words in descending
    frequency order. The vocabulary is also written, one word per line, to
    'processed/vocab_<vocab_size>.tsv' for TensorBoard visualization (the
    'processed' directory must already exist).

    Args:
        all_articles: iterable of articles, each a list of word strings.
        vocab_size: total vocabulary size, including the 'UNK' entry.

    Returns:
        A `(word2id, id2word)` pair of dictionaries.
    """
    print('Corpus size: {}'.format(len(all_articles)))
    # count once over the flattened corpus instead of building the Counter twice
    counts = Counter(itertools.chain.from_iterable(all_articles))
    print('Total number of unique words: {}'.format(len(counts)))

    vocab = ['UNK']  # the word index of 'UNK' is 0
    vocab.extend(word for word, _ in counts.most_common(vocab_size - 1))

    word2id = dict()
    # save most frequent words for TensorBoard visualization; the context
    # manager guarantees the file is flushed and closed
    with codecs.open(filename='processed/vocab_' + str(vocab_size) + '.tsv', mode='w', encoding='utf-8') as most_freq_words:
        for idx, word in enumerate(vocab):
            word2id[word] = idx
            if idx < vocab_size:
                most_freq_words.write(word + '\n')
    id2word = dict(zip(word2id.values(), word2id.keys()))  # convenient convert word2id to id2word
    return word2id, id2word


def convert_words_to_index(all_articles, dictionary):
    """ Map every word of every article to its id in `dictionary` (0 when the word is absent) """
    unknown_id = 0  # words outside the vocabulary collapse onto index 0
    return [
        [dictionary.get(token, unknown_id) for token in article]
        for article in all_articles
    ]

def generate_sample(index_words_list, context_window_size, index_dictionary):
    """ Endlessly yield (center, target) skip-gram pairs, cycling over the articles.

    For each center word a window radius is drawn uniformly from
    [1, context_window_size]; pairs are yielded first for the words before the
    center, then for the words after it. When the module-level DEBUG flag is
    set, progress is printed and the pairs of the first article are traced
    using `index_dictionary` (id -> word).
    """
    for article_no, article in enumerate(itertools.cycle(index_words_list)):  # never terminates on non-empty input
        if DEBUG:
            print('================================= outer_index: {}'.format(article_no))
        for position, center in enumerate(article):
            radius = random.randint(1, context_window_size)
            if DEBUG and article_no == 0:  # trace only the first article
                print('random context:{}'.format(radius))

            # words to the left of the center, clipped at the article start
            left_start = max(0, position - radius)
            for target in article[left_start:position]:
                if DEBUG and article_no == 0:
                    print('before the center word === {} vs {} and {} vs {}'.format(center, target, index_dictionary[center], index_dictionary[target]))
                yield center, target

            # words to the right of the center (slicing clips at the article end)
            right_start = position + 1
            for target in article[right_start:right_start + radius]:
                if DEBUG and article_no == 0:
                    print('after the center word === {} vs {} and {} vs {}'.format(center, target, index_dictionary[center], index_dictionary[target]))
                yield center, target

def get_batch(iterator, batch_size):
    """ Group a stream of (center, target) pairs into batches of Numpy arrays.

    Yields `(center_batch, target_batch)` tuples where `center_batch` is an
    int32 array of shape `[batch_size]` and `target_batch` an int32 array of
    shape `[batch_size, 1]`. If `iterator` is exhausted the generator ends
    cleanly; a trailing incomplete batch is dropped so every yielded batch has
    the full `batch_size`.
    """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1], dtype=np.int32)
        for index in range(batch_size):
            try:
                center, target = next(iterator)
            except StopIteration:
                # letting StopIteration escape a generator becomes a
                # RuntimeError (PEP 479); finish the stream explicitly instead
                return
            center_batch[index] = center
            target_batch[index] = target
        yield center_batch, target_batch

def process_data(vocab_size, batch_size, skip_window_size):
    """ Read the corpus at INPUT_FILE and return a generator of training batches.

    The corpus is read, a vocabulary of `vocab_size` entries is built, the
    words are replaced by their ids, and the resulting skip-gram pair stream
    (window radius up to `skip_window_size`) is grouped into batches of
    `batch_size`.
    """
    corpus = read_data(file_path=INPUT_FILE)
    word_to_id, id_to_word = build_vocab(corpus, vocab_size)
    indexed_corpus = convert_words_to_index(corpus, word_to_id)
    del corpus  # the raw words are no longer needed; drop the reference to save memory
    pair_stream = generate_sample(indexed_corpus, skip_window_size, id_to_word)
    return get_batch(pair_stream, batch_size)

## Helper functions

In [None]:
def make_dir(path):
    """ Create an empty directory at `path`, replacing any existing one.

    If `path` already exists its entire directory tree is deleted first, so
    the caller always ends up with a fresh, empty directory. Missing parent
    directories are created as needed.
    """
    if os.path.exists(path):
        shutil.rmtree(path)  # delete an entire directory tree
    os.makedirs(path)  # unlike os.mkdir, also works for nested paths
    
def read_data(file_path):
    """ Read the input corpus.

    The file at `file_path` is decoded as UTF-8; each line is treated as one
    article and split on whitespace.

    Returns:
        A list of articles, each a list of word strings (an empty list for a
        blank line).
    """
    # the context manager closes the file handle, which the bare loop over
    # codecs.open(...) never did
    with codecs.open(filename=file_path, mode='r', encoding='utf-8') as corpus_file:
        return [line.split() for line in corpus_file]

## Model training

In [None]:
def train_model(model, batch_gen, num_train_steps):
    """ Train the model for `num_train_steps` steps.

    Restores from the latest checkpoint under 'checkpoints/' when one exists,
    then runs the loss, optimizer and summary ops on each batch drawn from
    `batch_gen`. Summaries are written every step to
    'improved_graph/lr<LEARNING_RATE>'; every SKIP_STEP steps the average loss
    is printed and a checkpoint is saved to 'checkpoints/skip_gram'.

    Args:
        model: a built model exposing `center_words`, `target_words`, `loss`,
            `optimizer`, `summary_op` and `global_step`.
        batch_gen: iterator yielding `(centers, targets)` batches.
        num_train_steps: number of training steps to run in this call.
    """
    saver = tf.train.Saver() # defaults to saving all variables
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/checkpoint'))
        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)

        total_loss = 0.0
        writer = tf.summary.FileWriter(logdir='improved_graph/lr' + str(LEARNING_RATE), graph=sess.graph)
        try:
            # resume step numbering from the (possibly restored) global step
            initial_step = model.global_step.eval()
            for idx in range(initial_step, initial_step+num_train_steps):
                centers, targets = next(batch_gen)
                #print('Centers shape:{}, targets shape:{}'.format(centers.shape, targets.shape))
                feed_dict = {model.center_words: centers, model.target_words: targets}
                batch_loss, _, batch_sum = sess.run([model.loss, model.optimizer, model.summary_op], feed_dict=feed_dict)
                writer.add_summary(summary=batch_sum, global_step=idx)
                total_loss += batch_loss
                print('Iteration {}'.format(idx))
                if (idx + 1) % SKIP_STEP == 0:
                    print('Iteration {} of {}, loss: {:5.3f}'.format(idx, initial_step+num_train_steps, total_loss / SKIP_STEP))
                    total_loss = 0.0
                    saver.save(sess=sess, save_path='checkpoints/skip_gram', global_step=idx)
        finally:
            # flush pending events and release the event file even if training
            # is interrupted
            writer.close()

## Main function

In [None]:
def main():
    """ Prepare the output directories, build the skip-gram graph and train it. """
    for directory in ('checkpoints', 'processed', 'improved_graph'):
        make_dir(directory)

    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    batches = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW_SIZE)
    train_model(model, batches, NUM_TRAIN_STEPS)

In [None]:
# Hyper-parameters and settings. They live at module level (not under the
# __main__ guard) because several functions in this file read them as
# globals, which would otherwise fail when the module is imported.
VOCAB_SIZE = 200000 # the considered size of words
BATCH_SIZE = 128
EMBED_SIZE = 128  # dimension of the word embedding vectors
SKIP_WINDOW_SIZE = 1  # the context window
NUM_SAMPLED = 64  # Number of negative examples to sample.
LEARNING_RATE = 0.5
NUM_TRAIN_STEPS = 1500000
SKIP_STEP = 2000  # print the average loss and save a checkpoint every SKIP_STEP steps

INPUT_FILE = 'E:/2017_Deep_learning/word2vec/word_vector_108000.cs'

DEBUG = True  # when True, generate_sample prints tracing output

if __name__ == '__main__':
    main()