# Homework 10 - Word Embeddings

In this homework you will train a skip-gram model on the text of the Bible.

In [1]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import RegexpTokenizer
from collections import Counter

In [129]:
# These functions are provided to generate two dictionaries to translate words into ids and back.

def tokenize_text(text):
    r"""Lower-case ``text`` and split it into word tokens.

    Tokens are the matches of the regular expression ``\w+``, so punctuation and
    whitespace never end up in the result.

    Args:
        text: the raw text as a single string.

    Returns:
        The tokens produced by the tokenizer, in their order of appearance.
    """
    lowered = text.lower()
    return RegexpTokenizer(r'\w+').tokenize(lowered)

def create_dicts_from_tokenized_text(tokenized_text, vocabulary_size):
    """Build lookup tables between the most frequent tokens and integer ids.

    Id 0 is reserved for the placeholder "_UNKNOWN_". The (at most)
    ``vocabulary_size - 1`` most common tokens receive the ids 1, 2, ... in order
    of decreasing frequency.

    Args:
        tokenized_text: iterable of hashable tokens (e.g. a list of strings).
        vocabulary_size: number of ids to create, including the "_UNKNOWN_" id.

    Returns:
        A tuple ``(word2id, id2word)``: ``word2id`` maps token -> id and
        ``id2word`` maps id -> token.
    """
    # Note: the full frequency table is intentionally not printed here; with a
    # realistic vocabulary it would flood the cell output with thousands of tuples.
    words_and_count = Counter(tokenized_text).most_common(vocabulary_size - 1)
    word2id = {word: word_id for word_id, (word, _) in enumerate(words_and_count, 1)}
    word2id["_UNKNOWN_"] = 0
    # Invert the mapping directly instead of zipping values() against keys().
    id2word = {word_id: word for word, word_id in word2id.items()}
    return word2id, id2word

In [130]:
# Read in and tokenize the bible text.
# The context manager closes the file handle as soon as the text has been read.
with open('bible.txt') as bible_file:
    bible_text = bible_file.read()
bible_text_tokenized = tokenize_text(bible_text)

# Create dictionaries for a vocabulary of 10000 ids: the 9999 most common words
# plus the "_UNKNOWN_" placeholder with id 0.
vocabulary_size = 10000
word2id, id2word = create_dicts_from_tokenized_text(bible_text_tokenized, vocabulary_size)

# Translate the tokenized text into ids. Words outside the vocabulary map to id 0 ("_UNKNOWN_").
bible_id = [word2id.get(word, 0) for word in bible_text_tokenized]



In [31]:
# Generate the dataset of all pairs of word and context with a context window of 2.
# Tip: Create two lists: 
#          - one long list with all valid word indices (for which you can apply the context windows)
#          - one short list with the shifts for getting the context word (remember to exclude 0)
#      Then generate two lists (input words + context words), by running through the two lists above.

### YOUR CODE HERE ###
# Define the context range.
context_range = 2
# Shifts relative to the center word: -2, -1, 1, 2 (0 would pair a word with itself).
context_window = [shift for shift in range(-context_range, context_range + 1) if shift != 0]

# Only use the words that have enough words before and after them for getting the contexts.
# The last usable center index is len(bible_id) - context_range - 1, so the *exclusive*
# upper bound of the range is len(bible_id) - context_range.
valid_indices = range(context_range, len(bible_id) - context_range)

# Generate one list of the words and one for the contexts. If you have e.g. 4 context words for each word
# remember that you need each word 4 times in the list for the words.
# Both comprehensions iterate in the same order, so word_ids[k] and context_ids[k] form one pair.
word_ids = [bible_id[word_index] for word_index in valid_indices for shift in context_window]
context_ids = [bible_id[word_index + shift] for word_index in valid_indices for shift in context_window]
######################

In [123]:
# Create a dataset from these lists. Batch size: 128. Shuffle.
### YOUR CODE HERE ###
# Each element is one (input word id, context word id) pair; the pairs are shuffled
# through a 10000-element buffer and then grouped into batches of 128.
dataset = (
    tf.data.Dataset.from_tensor_slices((word_ids, context_ids))
    .shuffle(buffer_size=10000)
    .batch(128)
)
######################

26689


In [124]:
class SkipGram(tf.keras.layers.Layer):
    """Skip-gram layer that returns the NCE training loss for (word, context) pairs.

    The layer owns three weights, created in ``build``:
        - ``embedding_matrix`` of shape (vocab_size, embedding_size): the word embeddings,
        - ``score_matrix`` of shape (vocab_size, embedding_size): the NCE class weights,
        - ``score_bias`` of shape (vocab_size,): the NCE class biases.

    Args:
        vocab_size: number of word ids (rows of the embedding and score matrices).
        embedding_size: dimensionality of each word embedding.
        num_sampled: number of negative classes sampled per batch by the NCE loss.
    """
    
    def __init__(self, vocab_size, embedding_size, num_sampled=15):
        super(SkipGram, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_sampled = num_sampled
        
        
    def build(self, input_shape):
        """Create the weights. The embedding matrix is created first on purpose, so it
        is also the first entry of ``trainable_variables``."""
        self.embedding_matrix = self.add_weight(
                                shape=(self.vocab_size, self.embedding_size),
                                initializer='GlorotNormal'
                                )
        self.score_matrix = self.add_weight(
                            shape=(self.vocab_size, self.embedding_size),
                            initializer='GlorotNormal'
                            )
        # The shape must be a tuple: ``(n)`` is just the int ``n``, ``(n,)`` is a 1-D shape.
        self.score_bias = self.add_weight(
                            shape=(self.vocab_size,),
                            initializer='zeros'
                            )
        
    def call(self,inputs,labels):
        """Return the mean NCE loss of one batch.

        Args:
            inputs: batch of input word ids.
            labels: batch of context word ids, one per input word.

        Returns:
            Scalar tensor: the NCE loss averaged over the batch.
        """
        ### YOUR CODE HERE ###
        # tf.nn.nce_loss expects labels of shape [batch_size, num_true]; here num_true is 1.
        labels = tf.expand_dims(labels, axis=-1)
        # Get the embeddings. Use tf.nn.embedding_lookup().
        embeddings = tf.nn.embedding_lookup(self.embedding_matrix,inputs)
        # Instead of calculating the scores, we will directly calculate and return the loss.
        # Use tf.nn.nce_loss(). Remember to average the loss over all batches.
        nce_loss = tf.nn.nce_loss(
                    weights = self.score_matrix,
                    biases = self.score_bias,
                    labels = labels,
                    inputs = embeddings,
                    num_sampled = self.num_sampled,
                    num_classes = self.vocab_size
        )
        loss = tf.reduce_mean(nce_loss)
        return loss
        #######################

In [131]:
# Provided function to read out the nearest neighbors inside the embedding.
# Feel free to add more words to the list.
# Query words for the nearest-neighbor readout.
target_words = [
    'israel',
    'god',
    'jesus',
    '1',
    'love',
    'day',
    'wine',
]
# How many nearest neighbors to list per query word.
number_of_nearest_neighbors = 8

def find_and_print_nearest_neighbors(target_words, number_of_nearest_neighbors,embeddings):
    """Print, for every target word, the words with the most similar embeddings.

    Similarity is the cosine similarity between embedding rows. The word <-> id
    translation uses the module-level ``word2id`` and ``id2word`` dictionaries.

    Args:
        target_words: words to look up; each must be a key of ``word2id``.
        number_of_nearest_neighbors: how many words to print per target word.
        embeddings: 2-D array with one embedding per row, indexed by word id.
    """
    # Scale every row to unit length so a dot product equals the cosine similarity.
    row_norms = np.sqrt(np.sum(embeddings**2, axis=1, keepdims=True))
    unit_embeddings = embeddings / row_norms
    for word in target_words:
        query_vector = unit_embeddings[word2id[word], :]
        similarities = np.matmul(unit_embeddings, query_vector)
        # Negate so that argsort orders from most to least similar.
        closest_ids = np.argsort(-similarities)[:number_of_nearest_neighbors]
        neighbor_words = ", ".join([id2word[neighbor_id] for neighbor_id in closest_ids])
        print("Nearest to " + word + ": " + neighbor_words)

In [134]:
tf.keras.backend.clear_session()

### YOUR CODE HERE ###
# Initialize model (vocab size:10000, embedding size:64) and optimizer (Adam, lr:1).
model = SkipGram(vocab_size=vocabulary_size, embedding_size=64)
optimizer = tf.keras.optimizers.Adam(learning_rate=1)

# Train model for at least 10 epochs. This might take a while. 
# But you should see first sensible results already after 2 or 3 epochs.
for epoch in range(10):
    for x,t in dataset:

        # Compute loss, gradients and apply gradients.
        # Only the forward pass runs inside the tape context; the gradient is requested
        # after leaving it so the tape does not also record the backward computation.
        with tf.GradientTape() as tape:
            loss = model(x,t)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
    # Once after the epoch use the above defined function to print the nearest neighbors.
    # Read the embedding matrix by name rather than by its position in
    # model.trainable_variables, so the readout does not depend on weight creation order.
    embeddings = model.embedding_matrix.numpy()
    print('Epoch: {}'.format(epoch))
    find_and_print_nearest_neighbors(target_words, number_of_nearest_neighbors, embeddings)
    print('------------------------')
#####################

KeyboardInterrupt: 