# Word Embeddings

## Vocabulary

In [None]:
import tensorflow as tf
# Build a vocabulary from a small corpus, then encode other sentences with it.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
text_corpus = ['bob ate apples, and pears', 'fred ate apples!']
# Fit the tokenizer on the corpus so it learns its word -> integer ID mapping
tokenizer.fit_on_texts(text_corpus)
# Every word in these sentences also occurs somewhere in text_corpus
new_texts = ['bob ate pears', 'fred ate pears']
# Print the sentences encoded as lists of integer IDs
print(tokenizer.texts_to_sequences(new_texts))
# Print the learned word -> ID mapping itself
print(tokenizer.word_index)

In [None]:
# Same corpus as before, but the tokenizer is given an out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
  oov_token='OOV')
text_corpus = ['bob ate apples, and pears', 'fred ate apples!']
tokenizer.fit_on_texts(text_corpus)
# 'bacon' never appears in text_corpus; with oov_token set it is expected to
# be encoded with the ID of the 'OOV' entry instead of being dropped
# (see the Keras Tokenizer docs for oov_token).
print(tokenizer.texts_to_sequences(['bob ate bacon']))
# Print the mapping so the ID assigned to 'OOV' can be inspected
print(tokenizer.word_index)

In [None]:
# Limit the vocabulary used for encoding via num_words.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=2)
text_corpus = ['bob ate apples, and pears', 'fred ate apples!']
tokenizer.fit_on_texts(text_corpus)

# 'ate' and 'apples' are the two most frequent words in the corpus
# (two occurrences each; every other word occurs once).
# NOTE(review): per the Keras Tokenizer docs only the num_words - 1 most
# common words are kept when encoding, so with num_words=2 just 'ate'
# (integer ID 1) should survive and 'apples' would be filtered too - confirm.
# For the sentence 'bob ate pears' only 'ate' is kept, so the only value
# in the token sequence will be 1.
print(tokenizer.texts_to_sequences(['bob ate pears']))

In [None]:
# Skip-gram embedding model
class EmbeddingModel(object):
    """Skip-gram embedding model: vocabulary / tokenization step.

    Holds the model dimensions and a Keras ``Tokenizer`` whose ``num_words``
    is set to ``vocab_size``.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Store the model dimensions and create the tokenizer.

        Args:
            vocab_size: Vocabulary size; passed to the tokenizer as
                ``num_words``.
            embedding_dim: Embedding dimension; stored on the instance and
                not used further by this version of the class.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=self.vocab_size)

    def tokenize_text_corpus(self, texts):
        """Convert a list of text strings into word-ID sequences.

        The tokenizer is fitted on ``texts`` first, so every call also
        updates the tokenizer held by this instance.

        Args:
            texts: List of text strings.

        Returns:
            The result of ``self.tokenizer.texts_to_sequences(texts)``.
        """
        self.tokenizer.fit_on_texts(texts)
        return self.tokenizer.texts_to_sequences(texts)

## Embeddings

In [None]:
def get_target_and_size(sequence, target_index, window_size):
    """Return the target word and the half-width of its context window.

    Args:
        sequence: Indexable sequence of words (or word IDs).
        target_index: Position of the target word in ``sequence``.
        window_size: Full window size; it is floor-divided by two.

    Returns:
        Tuple ``(target_word, half_window_size)``.
    """
    target_word = sequence[target_index]
    half_window_size = window_size // 2
    return target_word, half_window_size

# Skip-gram embedding model
class EmbeddingModel(object):
    """Skip-gram embedding model: first step of target/context extraction."""

    def __init__(self, vocab_size, embedding_dim):
        """Record the model dimensions and build the tokenizer.

        Args:
            vocab_size: Vocabulary size, also used as the tokenizer's
                ``num_words``.
            embedding_dim: Embedding dimension (only stored here).
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=self.vocab_size)

    def get_target_and_context(self, sequence, target_index, window_size):
        """Look up the target word and half window size for one position.

        This version of the method only computes the two values via
        ``get_target_and_size``; it has no ``return`` statement, so callers
        receive ``None``.

        Args:
            sequence: Tokenized sequence.
            target_index: Index of the target word in ``sequence``.
            window_size: Full context window size.
        """
        target_and_size = get_target_and_size(sequence, target_index, window_size)
        target_word, half_window_size = target_and_size

In [None]:
def get_window_indices(sequence, target_index, half_window_size):
    """Return the bounds of the context window around ``target_index``.

    The window spans ``half_window_size`` positions on each side of the
    target and is clipped so it never extends past either end of
    ``sequence``.

    Args:
        sequence: Sequence whose length limits the window.
        target_index: Index of the target word.
        half_window_size: Number of positions to include on each side.

    Returns:
        Tuple ``(left_incl, right_excl)``: inclusive start and exclusive end
        of the window.
    """
    left_incl = max(0, target_index - half_window_size)
    # +1 because the right bound is exclusive and must cover the last context word
    right_excl = min(len(sequence), target_index + half_window_size + 1)
    return (left_incl, right_excl)

# Skip-gram embedding model
class EmbeddingModel(object):
    """Skip-gram embedding model: target word plus context window bounds."""

    def __init__(self, vocab_size, embedding_dim):
        """Keep the model dimensions and create the tokenizer.

        Args:
            vocab_size: Vocabulary size; forwarded to the tokenizer as
                ``num_words``.
            embedding_dim: Embedding dimension; stored for later use.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=self.vocab_size)

    def get_target_and_context(self, sequence, target_index, window_size):
        """Return the target word and the bounds of its context window.

        Delegates to ``get_target_and_size`` for the target word and half
        window size, then to ``get_window_indices`` for the window bounds.

        Args:
            sequence: Tokenized sequence.
            target_index: Index of the target word in ``sequence``.
            window_size: Full context window size.

        Returns:
            Tuple ``(target_word, left_incl, right_excl)``.
        """
        target_word, half_window_size = get_target_and_size(
            sequence, target_index, window_size)
        window_bounds = get_window_indices(sequence, target_index, half_window_size)
        left_incl, right_excl = window_bounds
        return target_word, left_incl, right_excl

## Skip-gram

In [None]:
# Skip-gram embedding model
class EmbeddingModel(object):
    """Skip-gram embedding model: builds (target, context) training pairs."""

    def __init__(self, vocab_size, embedding_dim):
        """Store the model dimensions and create the tokenizer.

        Args:
            vocab_size: Vocabulary size; passed to the tokenizer as
                ``num_words``.
            embedding_dim: Embedding dimension; stored on the instance.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=self.vocab_size)

    def tokenize_text_corpus(self, texts):
        """Fit the tokenizer on ``texts`` and return them as ID sequences.

        Args:
            texts: List of text strings.

        Returns:
            The result of ``self.tokenizer.texts_to_sequences(texts)``.
        """
        self.tokenizer.fit_on_texts(texts)
        return self.tokenizer.texts_to_sequences(texts)

    def get_target_and_context(self, sequence, target_index, window_size):
        """Return the target word and the bounds of its context window.

        The window covers ``window_size // 2`` positions on each side of the
        target and is clipped to the ends of ``sequence``.

        Args:
            sequence: Tokenized sequence.
            target_index: Index of the target word in ``sequence``.
            window_size: Full context window size.

        Returns:
            Tuple ``(target_word, left_incl, right_excl)`` where the window
            is the half-open index range ``[left_incl, right_excl)``.
        """
        target_word = sequence[target_index]
        half_window_size = window_size // 2
        left_incl = max(0, target_index - half_window_size)
        right_excl = min(len(sequence), target_index + half_window_size + 1)
        return target_word, left_incl, right_excl

    def create_target_context_pairs(self, texts, window_size):
        """Create (target, context) pairs for a given window size.

        Args:
            texts: List of text strings; they are tokenized (and the
                tokenizer fitted on them) via ``tokenize_text_corpus``.
            window_size: Full context window size.

        Returns:
            List of ``(target_word, context_word)`` tuples, one for every
            position inside each target's window other than the target
            position itself.
        """
        pairs = []
        for sequence in self.tokenize_text_corpus(texts):
            for target_index in range(len(sequence)):
                target_word, left_incl, right_excl = self.get_target_and_context(
                    sequence, target_index, window_size)
                # Skip the target's own position; equal words elsewhere still count
                pairs.extend(
                    (target_word, sequence[context_index])
                    for context_index in range(left_incl, right_excl)
                    if context_index != target_index
                )
        return pairs

## Embedding Matrix

In [None]:
# Create two variables by name and shape and print them;
# the second one also specifies an explicit dtype (int64).
print(tf.get_variable('v1', shape=(1, 3)))
print(tf.get_variable('v2', shape=(2,), dtype=tf.int64))

In [None]:
# Tensor of shape (5, 10) with values drawn uniformly between minval=-1 and maxval=2
init = tf.random_uniform((5, 10),minval=-1,maxval=2)
# No shape is passed here, only the initializer tensor - presumably the
# variable takes its shape from `init` (verify against the tf.get_variable docs)
v = tf.get_variable('v1.1', initializer=init)
print(v)

In [25]:
# 5 x 10 variable used here as an embedding matrix (one row per word ID)
emb_mat = tf.get_variable('v3', shape=(5, 10))
# IDs of the rows to retrieve
word_ids = tf.constant([0, 3])
# Look up the rows for word_ids; the recorded output shows a (2, 10) float32 tensor
emb_vecs = tf.nn.embedding_lookup(emb_mat, word_ids)
print(emb_vecs)

Tensor("embedding_lookup/Identity:0", shape=(2, 10), dtype=float32)


In [26]:
def get_initializer(embedding_dim, vocab_size):
    """Build the random initial values for the embedding matrix.

    Args:
        embedding_dim: Number of columns (size of each embedding vector).
        vocab_size: Number of rows (one per vocabulary ID).

    Returns:
        A ``tf.random_uniform`` tensor of shape ``(vocab_size, embedding_dim)``
        drawn with ``minval=-0.5 / embedding_dim`` and
        ``maxval=0.5 / embedding_dim``.
    """
    # The range is symmetric around zero and shrinks as embedding_dim grows
    bound = 0.5 / embedding_dim
    return tf.random_uniform(
        (vocab_size, embedding_dim), minval=-bound, maxval=bound)

# Skip-gram embedding model
class EmbeddingModel(object):
    """Skip-gram embedding model: first step of the forward pass."""

    def __init__(self, vocab_size, embedding_dim):
        """Record the model dimensions and build the tokenizer.

        Args:
            vocab_size: Vocabulary size; also the tokenizer's ``num_words``.
            embedding_dim: Size of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=self.vocab_size)

    def forward(self, target_ids):
        """Begin the forward run of the embedding model.

        This version only builds the initializer through ``get_initializer``;
        ``target_ids`` is not used yet and nothing is returned.

        Args:
            target_ids: IDs whose embeddings are to be retrieved.
        """
        initializer = get_initializer(self.embedding_dim, self.vocab_size)

In [27]:
# Skip-gram embedding model
class EmbeddingModel(object):
    """Skip-gram embedding model: embedding matrix and lookup."""

    def __init__(self, vocab_size, embedding_dim):
        """Keep the model dimensions and create the tokenizer.

        Args:
            vocab_size: Vocabulary size; also the tokenizer's ``num_words``.
            embedding_dim: Size of each embedding vector.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=self.vocab_size)

    def forward(self, target_ids):
        """Run the embedding model forward to retrieve embeddings.

        Creates the ``'embedding_matrix'`` variable from the initializer
        returned by ``get_initializer``, stores it on
        ``self.embedding_matrix`` and looks up the rows for ``target_ids``.

        Args:
            target_ids: IDs passed to ``tf.nn.embedding_lookup``.

        Returns:
            The result of the embedding lookup for ``target_ids``.
        """
        init_values = get_initializer(self.embedding_dim, self.vocab_size)
        # NOTE(review): tf.get_variable runs on every call with the same name;
        # a second forward() in the same graph may fail unless variable reuse
        # is enabled - confirm how callers use this.
        self.embedding_matrix = tf.get_variable(
            'embedding_matrix', initializer=init_values)
        return tf.nn.embedding_lookup(self.embedding_matrix, target_ids)