## Word Embeddings Using Various Training Strategies

Training a skip-gram model on a small amount of data using the following approaches:
- standard, as explained in the paper: <i>Efficient Estimation of Word Representations in Vector Space</i>
- negative sampling, as explained in the paper: <i>Distributed Representations of Words and Phrases
and their Compositionality</i> and highlighted [here](https://aegis4048.github.io/optimize_computational_efficiency_of_skip-gram_with_negative_sampling)
- importance sampling, as explained in the paper: <i>On Using Very Large Target Vocabulary for Neural Machine Translation</i>

In [1]:
from tensorflow.keras import backend as K
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input,Activation
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import nltk
import latex
from collections import Counter,defaultdict
from nltk.corpus import brown
import random

import warnings
warnings.filterwarnings('ignore')
tf.compat.v1.enable_eager_execution()

### Cleaning news/text data

Using the Brown corpus collection of news documents. Only keeping words which have shown up more than two times.

In [2]:
# processing data on word-level, determining which words to keep
corpus_words = [word.lower() for word in brown.words(categories='news') if word.isalpha()]
word_count = Counter(corpus_words)
words_to_keep = set([])
for word in corpus_words:
    if word_count[word] > 2:
        words_to_keep.add(word)

print("size of corpus:",len(words_to_keep))

size of corpus: 3896


In [3]:
# updating the corpus sentences to only keep the valid words
corpus_sents = [[word.lower() for word in sent] for sent in brown.sents(categories='news')]
corpus_sents = [[word for word in sent if word in words_to_keep] for sent in corpus_sents]
corpus_sents = [sent for sent in corpus_sents if len(sent)>1] # keep sents. with atleast two words
print("Number of sentences in corpus:",len(corpus_sents))

Number of sentences in corpus: 4498


In [4]:
corpus_words = [] # getting the new counts for all words
for sent in corpus_sents:
    for word in sent:
        corpus_words.append(word)

word_indices = {} # storing unique index per word (for one-hot encoding)
word_indices_inv = {} # inverse key-value pairs
index = 0
for word in words_to_keep:
    word_indices_inv[index] = word
    word_indices[word] = index
    index += 1
        
word_freq = defaultdict(int) # stores the num. of occurences (frequency) for each word
for word in corpus_words:
    word_freq[word] += 1
    
corpus_sents_indices = [[word_indices[word] for word in sent] for sent in corpus_sents] # words->index representation

In [5]:
def create_training_pairs(corpus_sents_indices,window=2):
    """ returns x,y lists which contain context words and associate target word
    args:
        corpus_sents_indices: sentences in corpus with words represented by one-hot indices
        window: the number of words to look at to the left and right of context word
    """
    all_contexts = []
    all_targets = []
    
    for sent in corpus_sents_indices:
        sent_contexts = []
        sent_targets = []
        for i in range(len(sent)):
            left_indices = [l for l in range(max(0,i-2),i)]
            right_indices = [r for r in range(i+1,min(len(sent),i+2+1))]
            left_and_right_indices = left_indices + right_indices
            selected_index = random.choice(left_and_right_indices) # selecting one target(index) from the window
            target = sent[selected_index]
            context = sent[i]
            sent_contexts.append(context)
            sent_targets.append(target)
            
        all_contexts += sent_contexts
        all_targets += sent_targets
        
    return all_contexts,all_targets

In [6]:
all_contexts,all_targets = create_training_pairs(corpus_sents_indices,window=2)
print(all_contexts[0:10])
print(all_targets[0:10])

[508, 3388, 1941, 649, 1517, 558, 3268, 1054, 3141, 1649]
[1941, 508, 1517, 3388, 649, 649, 1517, 3268, 1054, 3141]


In [7]:
def create_one_hot_encodings(all_contexts,corpus_size=3896):
    """
    """
    vec = np.zeros((len(all_contexts),corpus_size))
    for i,context in enumerate(all_contexts):
        vec[i][context]=1
    return vec

In [8]:
all_contexts_oh = create_one_hot_encodings(all_contexts,corpus_size=3896)
all_contexts_oh.shape

(74390, 3896)

In [9]:
corpus_word_counts = Counter(corpus_words)
corpus_word_counts_adj = {} # hold counts used in negative sampling
for word in corpus_word_counts:
    corpus_word_counts_adj[word] = corpus_word_counts[word]**(3/4) # give greater weight to uncommon words
base_sum = sum(list(i[1] for i in corpus_word_counts_adj.items())) # denominator for producing probability

sample_vector = []
for word in corpus_word_counts_adj:
    corpus_word_counts_adj[word] /= base_sum # generating probability of this word's occurance
    word_index = word_indices[word]
    word_count = int(corpus_word_counts_adj[word]*100000) # number of times to add the index to the sample_vector
    sample_vector += [word_index for i in range(word_count)]

print(len(sample_vector))

97792


In [10]:
## generating data for use in the negative sampling
num_negative_samples = 5
neg_sampling_indices = []
for i in range(len(all_contexts)):
    target = all_targets[i]
    neg_indices_i = [target] # target is always the first index
    num_valid_neg_samples = 0
    while num_valid_neg_samples != num_negative_samples:
        sample = random.choice(sample_vector) # selecting a random word from the sample_vector
        if sample not in neg_indices_i: # preventing the same word to be selected twice or being equal to target
            neg_indices_i.append(sample)
            num_valid_neg_samples += 1
    neg_sampling_indices.append(neg_indices_i)

In [11]:
neg_sampling_indices = np.array(neg_sampling_indices)
neg_sampling_indices = neg_sampling_indices.astype("int32")
print(neg_sampling_indices.shape)

(74390, 6)


In [12]:
## generating data for use in importance sampling inspired method
T = 500 # the number of unique target words per partition (size of sub-vocabulary)
all_partition_targets = []
all_partition_contexts = []
i=0
while i < len(all_targets):
    these_target_words = set([]) # keeps track of the unique target words in this partition
    num_target_words = 0
    partition_contexts = []
    partition_targets = []
    while num_target_words<T and i<len(all_targets):
        context,target = all_contexts_oh[i],all_targets[i]
        partition_contexts.append(context)
        partition_targets.append(target)
        i += 1
        if target not in these_target_words:
            these_target_words.add(target)
            num_target_words += 1
            
    all_partition_targets.append(partition_targets)
    all_partition_contexts.append(partition_contexts)

In [13]:
print(len(all_partition_targets)) # number of partitions
print(len(all_partition_targets[0])) # number of examples in partition
print(len(set(all_partition_targets[0])))

40
1838
500


In [14]:
partition_contexts = []
partition_targets = []

for partition_i in range(len(all_partition_targets)):
    partition_contexts.append(np.array(all_partition_contexts[partition_i]))
    these_targets = all_partition_targets[partition_i]
    these_unique_targets = list(set(these_targets))
    new_unique_targets = [] # the indices information to use for this example
    for target in these_targets:
        new_unique_target_i = [target]+[t for t in these_unique_targets if t != target]
        new_unique_targets.append(new_unique_target_i)
        
    partition_targets.append(np.array(new_unique_targets).astype("int32"))

In [15]:
print(partition_contexts[0].shape)
print(partition_targets[0].shape)

(1838, 3896)
(1838, 500)


### Standard Word2Vec Model

In [9]:
def word2vec():
    """ Standard model
    """
    x = Input(shape=(3896)) # x represents one-hot encoding, shape:3896x1
    h = Dense(300,use_bias=False,activation=None)(x)
    o = Dense(3896,use_bias=False,activation=None)(h) # logits, output size of vocabulary
    
    model = Model(inputs=x,outputs=o)
    return model

In [10]:
def standard_loss(labels,logits):
    return tf.reduce_mean(sparse_categorical_crossentropy(y_true=labels,y_pred=logits,from_logits=True))

In [11]:
model = word2vec()
optimizer = Adam(lr=0.001)

In [12]:
for _ in range(5):
    losses = []
    for i in range(0,len(all_contexts_oh)-25,25): # batch size of 25
        x_subset = all_contexts_oh[i:i+25]
        y_subset = all_targets[i:i+25]
        with tf.GradientTape() as tape:
            predictions = model([x_subset])
            loss = standard_loss(y_subset,predictions)
        losses.append(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/max(len(losses),1))

7.064018602932201
6.398518884642785
6.1008258811566005
5.782794636798506
5.466340167101691


### Word2Vec with Negative Sampling

$$ loss: -log[p(w)] - \sum_{w:w_{neg}} log[1-p(w)] $$


In [12]:
def word2vec_neg():
    """ Model using negative sampling
    """
    x = Input(shape=(3896)) # x represents one-hot encoding, shape:(batch_size,3896)
    i = Input(shape=(6),dtype=tf.int32) # 5 indices to extract
    
    h = Dense(300,use_bias=False,activation=None)(x)
    o = Dense(3896,use_bias=False,activation=None)(h) # logits, output size of vocabulary
    out = tf.gather(o,i,batch_dims=1) # extracting the relevant dims 
    
    model = Model(inputs=[x,i],outputs=out)
    return model

In [13]:
def neg_loss(logits):
    """ First dimension of logits is the target word, the remaining [1:] are non-target words
    """
    scaled_output = Activation("sigmoid")(logits)
    pos_output = scaled_output[:,0]
    neg_output = scaled_output[:,1:]
    pos_loss_component = tf.reduce_mean(-tf.log(pos_output))
    neg_loss_component = tf.reduce_mean(tf.reduce_sum(-tf.log(1-neg_output),axis=-1))
    total_loss = pos_loss_component + neg_loss_component
    return total_loss

In [16]:
model = word2vec_neg()
optimizer = Adam(lr=0.001)

In [17]:
for _ in range(5):
    losses = []
    for i in range(0,len(all_contexts_oh)-25,25): # batch size of 25
        x_subset = all_contexts_oh[i:i+25]
        # y_subset = all_targets[i:i+25]
        i_subset = neg_sampling_indices[i:i+25] # indices
        with tf.GradientTape() as tape:
            predictions = model([x_subset,i_subset]) # logits
            loss = neg_loss(predictions)
        losses.append(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/max(len(losses),1))

3.020754607945931
2.4456017694152705
2.1599474278618307
1.8523982659107496
1.5894160255864889


### Word2Vec with Importance Sampling

In [16]:
def word2vec_imp():
    """ Model using importance sampling
        Uses the standard_loss function, with y-labels always being the 0-th index
    """
    x = Input(shape=(3896)) # x represents one-hot encoding, shape:(batch_size,3896)
    i = Input(shape=(500),dtype=tf.int32) # 500 indices to extract, for reduced softmax
    
    h = Dense(300,use_bias=False,activation=None)(x)
    o = Dense(3896,use_bias=False,activation=None)(h) # logits, output size of vocabulary
    out = tf.gather(o,i,batch_dims=1) # extracting the relevant dims 
    
    model = Model(inputs=[x,i],outputs=out)
    return model

In [17]:
def standard_loss(labels,logits):
    return tf.reduce_mean(sparse_categorical_crossentropy(y_true=labels,y_pred=logits,from_logits=True))

In [18]:
model = word2vec_imp()
optimizer = Adam(lr=0.001)

In [20]:
for _ in range(5):
    losses = []
    for partition_i in range(len(partition_targets)): # looping through each partition
        targets_i = partition_targets[partition_i] # the indices of vocabulary for this partition
        contexts_i = partition_contexts[partition_i]
        for i in range(0,len(targets_i)-25,25): # batch size of 25, looping through data per partition
            x_subset = contexts_i[i:i+25]
            y_subset = np.zeros((25))
            i_subset = targets_i[i:i+25] # indices
            with tf.GradientTape() as tape:
                predictions = model([x_subset,i_subset]) # logits
                loss = standard_loss(y_subset,predictions)    
            losses.append(float(loss))
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/max(len(losses),1))

5.746219926947593
5.325782788694509
4.997750015900375
4.644884868841706
4.317888594302396
