# Setup

In [1]:
# Import packages
import io
import string

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

tf.__version__ # Show the installed TensorFlow version; this notebook expects 2.x (the recorded run used 2.3.0)

'2.3.0'

# Skip-Gram Model

Word2Vec is not a single algorithm; rather, it is a family of model architectures and optimizations that can be used to learn word embeddings from large datasets.

The papers that introduced Word2Vec proposed two methods for learning representations of words:

- Continuous Bag-of-Words Model which predicts the middle word based on surrounding context words. The context consists of a few words before and after the current (middle) word. This architecture is called a bag-of-words model as the order of words in the context is not important.

- Continuous Skip-gram Model, which predicts words within a certain range before and after the current word in the same sentence.

A skip-gram model predicts the context (or neighbors) of a word, given the word itself.
The model is trained on skip-grams, which are n-grams that allow tokens to be skipped.
The context of a word can be represented as a set of (`target_word`, `context_word`) pairs, where `context_word` appears in the neighboring context of the `target_word`.

The `window_size` here determines the number of neighbors on either side of the `target_word` for context.

The objective of the skip-gram model is to maximize the probability of predicting context words given the `target_word`.

Computing this probability requires taking a softmax over the entire vocabulary, which is often very large, because any word in the vocabulary can be predicted for the given `target_word`. The `Noise Contrastive Estimation` loss function is an efficient approximation of the full softmax. Rather than computing the probability distribution over the whole vocabulary, negative sampling is used: only a small number of sampled words are considered.

The simplified negative sampling objective for a `target_word` is to distinguish the context word from `n` negative samples drawn from a noise distribution P(W) of words. That is, the softmax over the entire vocabulary is approximated by posing the loss for a `target_word` as a classification problem between the context word and `n` negative samples.

A negative sample is defined as a (`target_word`, `context_word`) pair such that the `context_word` does not appear within `window_size` words of the `target_word`.

## Setup input pipeline

In [2]:
PATH_TO_FILE = tf.keras.utils.get_file("shakespeare", "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")

# Load dataset
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(PATH_TO_FILE, encoding="utf-8") as f:
    lines = f.read().splitlines()

# Preview the first 20 lines; the file is already closed at this point.
for line in lines[:20]:
    print(line)

# Create dataset
# One element per line of the file; lines whose string length is 0 are dropped
# (a zero length casts to False in the filter predicate).
text_ds = tf.data.TextLineDataset(PATH_TO_FILE).filter(lambda x: tf.cast(tf.strings.length(x), bool))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [3]:
# Peek at the first element of the filtered dataset to see what it yields
for first_line in text_ds.take(1):
    print("Input text:", first_line)

Input text: tf.Tensor(b'First Citizen:', shape=(), dtype=string)


## Data preprocessing

In [4]:
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 32

# Text -> integer-id layer.
#   max_tokens              caps the learned vocabulary at VOCAB_SIZE entries
#   output_sequence_length  fixes every output row to MAX_SEQUENCE_LENGTH ids
#   output_mode="int"       emits one integer id per token
encoder = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    standardize="lower_and_strip_punctuation",
)

In [5]:
# Let the encoder learn its vocabulary from the text, fed in batches of 132 lines
adapt_batches = text_ds.batch(132)
encoder.adapt(adapt_batches)

# Run the adapted encoder over the text in batches of 1024 lines, then
# unbatch so the dataset yields one vectorized line per element again
text_vector_ds = (
    text_ds
    .batch(1024)
    .prefetch(tf.data.experimental.AUTOTUNE)
    .map(encoder)
    .unbatch()
)

In [6]:
# Materialize the vectorized dataset in memory as a list of NumPy arrays
sequences = [vectorized_line for vectorized_line in text_vector_ds.as_numpy_iterator()]

print("Number of sequences:", len(sequences))
print("First sequence:", sequences[0])

Number of sequences: 32777
First sequence: [ 89 270   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [7]:
SEED = 2021
NUM_NS = 4
WINDOW_SIZE = 2
targets, contexts, labels = [], [], []

# Sampling table handed to `skipgrams` below (one entry per vocabulary index)
subsampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=VOCAB_SIZE)

# Emit one training example per positive skip-gram pair of every sequence
for token_ids in sequences:
    # Positive (target, context) pairs only: negative_samples=0 turns off the
    # built-in negatives, which are drawn separately further down
    skipgram_pairs, _ = tf.keras.preprocessing.sequence.skipgrams(
        sequence=token_ids,
        vocabulary_size=VOCAB_SIZE,
        sampling_table=subsampling_table,
        window_size=WINDOW_SIZE,
        negative_samples=0
    )

    for target_w, context_w in skipgram_pairs:
        # True context id as a (1, 1) int64 tensor
        true_context = tf.expand_dims(tf.constant([context_w], dtype="int64"), axis=1)

        # Draw NUM_NS distinct candidate ids to serve as negative contexts
        sampled_ids, _, _ = tf.random.log_uniform_candidate_sampler(
            true_classes=true_context,
            num_true=1,
            num_sampled=NUM_NS,
            unique=True,
            range_max=VOCAB_SIZE,
            seed=SEED
        )
        sampled_contexts = tf.expand_dims(sampled_ids, axis=1)

        # Row 0 is the true context, the remaining rows are the negatives
        context = tf.concat([true_context, sampled_contexts], axis=0)

        # Matching label vector: 1 for the true context, 0 for each negative
        label = tf.constant([1] + [0] * NUM_NS, dtype="int64")

        # Collect the example
        targets.append(target_w)
        contexts.append(context)
        labels.append(label)

In [8]:
BUFFER = 10000
BUFFER_BS = 64

# Create optimized dataset for model
# Each element is ((target, context), label), sliced from the Python lists.
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))

# Cache BEFORE shuffling. A cache placed after `shuffle` replays the exact
# element order recorded during the first pass, so every epoch would see the
# same batches in the same order. With the cache upstream, `shuffle`
# reshuffles on each iteration (its default) and batches differ per epoch.
dataset = dataset.cache().shuffle(BUFFER).batch(BUFFER_BS, drop_remainder=True)

# Overlap input preparation with training.
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

## Modelling

The Word2Vec model can be implemented as a classifier that distinguishes true context words taken from skip-grams from false context words obtained through negative sampling.

You can perform a dot product between the embeddings of target and context words to obtain predictions for labels and compute loss against true labels in the dataset.

In [9]:
class Word2Vec(tf.keras.Model):
    """Skip-gram model that scores a target word against candidate context words.

    Two separate embedding tables are kept, one for target words and one for
    context words. The forward pass returns the dot product between the target
    embedding and each candidate context embedding, flattened per example.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Width of every embedding vector.
        num_ns: Number of negative samples per positive context; the context
            embedding is declared with input length ``num_ns + 1``.
    """

    def __init__(self, vocab_size, embedding_dim, num_ns):
        super().__init__()
        # Named so the trained weights can be fetched later via get_layer("w2v_embedding")
        self.target_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=1,
            name="w2v_embedding",
        )
        self.context_embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            input_length=num_ns + 1,
        )
        # Contracts axis 3 of the first input with axis 2 of the second
        self.dots = tf.keras.layers.Dot(axes=(3, 2))
        self.flatten = tf.keras.layers.Flatten()

    def call(self, pair):
        """Return one score per candidate context word.

        Args:
            pair: A ``(target, context)`` tuple of integer id tensors.
                NOTE(review): the Dot axes imply a rank-3 target embedding and
                a rank-4 context embedding - confirm against the input pipeline.

        Returns:
            The flattened dot products of context and target embeddings.
        """
        target_ids, context_ids = pair
        target_vectors = self.target_embedding(target_ids)
        context_vectors = self.context_embedding(context_ids)
        scores = self.dots([context_vectors, target_vectors])
        return self.flatten(scores)

In [10]:
EMBEDDING_DIM = 100

# Build the skip-gram model
word2vec = Word2Vec(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, num_ns=NUM_NS)

# The model outputs raw dot-product scores, hence from_logits=True
logits_loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
word2vec.compile(
    optimizer="adam",
    loss=logits_loss,
    metrics=["accuracy"],
)

# Write training logs under ./logs so the run can be inspected in TensorBoard
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# Train for 5 epochs on the prepared dataset
history = word2vec.fit(
    dataset,
    epochs=5,
    callbacks=[tensorboard_callback],
)

Epoch 1/5
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# View embeddings on Tensorboard
# ! tensorboard --logdir logs

## Embedding Lookup

In [12]:
# Get weights from layer
# The target embedding matrix is looked up by the layer name "w2v_embedding".
weights = word2vec.get_layer("w2v_embedding").get_weights()[0]
vocab = encoder.get_vocabulary()

# Create and save metadata files
# Each vocabulary entry (except index 0) produces one tab-separated vector row
# in vectors.tsv and the matching word in metadata.tsv, line for line.
# Both files are managed by `with`, so they are closed even if a write raises.
with io.open("/tmp/vectors.tsv", "w", encoding="utf-8") as out_v, \
        io.open("/tmp/metadata.tsv", "w", encoding="utf-8") as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write("\t".join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")