In [2]:
import os
import gc
import re
import string
import numpy as np
import pandas as pd
import kagglehub
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Assign home_dir as a local directory to use when saving data to disk.
home_dir = os.getenv("home_BA885")

if not home_dir:
    home_dir = os.getcwd()
# Colab home
# home_dir = "/content"

# Embedding

Author: Nima Doroud

The most publicized ML/AI models are Natural Language Processing (NLP) models including Neural Machine Translation (ex: Google Translate) and Next-word Prediction (ex: Auto-complete and Large Language Models). Furthermore, most real-world models deal with multiple types (or modes) of data including text inputs. For instance, when building a recommendation model, we include item description or user reviews as inputs.

For a neural network to process natural language, we first need a map that transforms text into tensors with numeric values. To that end we need to standardize and tokenize the text. Once we have a list of all the unique tokens in our dataset, we can represent each token with a unique one-hot vector (vectorization) or dense vector (embedding). For example, the input text "Hello World!" transforms as follows:

```
"Hello World!" -> ["hello", "world"] -> [v[0], v[1]]
```
For a large vocabulary (number of unique tokens), there are a lot of relations (synonyms, antonyms, tense, ...) among the tokens that are not captured by a simple vectorization. Moreover, these relations are -- for the most part -- inherent to the language and can be learned from any corpus of text in that language. This is the role of embedding models.

To see the inner workings of embedding models, we will build and train a Word2Vec embedding model.

## IMDB reviews

We will use the IMDB reviews dataset which can be found <a href="http://ai.stanford.edu/~amaas/data/sentiment/"> here </a> and is also available through TensorFlow Datasets.

In [4]:
# Download dataset
if not os.path.exists(home_dir+'/datasets/aclImdb'):
    ds_link='https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    tf.keras.utils.get_file('aclImdb_v1.tar.gz', 
                            ds_link, 
                            cache_dir='./',
                            cache_subdir='datasets',
                            extract=True)

In [5]:
# Create a folder for miscellaneous files (if you intend to load
# the entire dataset)
if not os.path.exists(home_dir+'/datasets/aclImdb/train_miscellaneous'):
    os.mkdir(home_dir+'/datasets/aclImdb/train_miscellaneous')

# Move miscellaneous files
for x in os.scandir(home_dir+'/datasets/aclImdb/train'):
    if x.is_file():
        os.rename(home_dir+'/datasets/aclImdb/train/'+x.name,
                  home_dir+'/datasets/aclImdb/train_miscellaneous/'+x.name)

In [7]:
# Load the 'unsupervised' reviews as a dataset
batch_size = 128
ds_dir = home_dir+'/datasets/aclImdb/train/unsup'

# Strip html elements
def strip_html(input_str):
    ''' Strips the all occurrences of the string "<br />" from
    the dataset. Use with text_ds.map.
    '''
    return tf.strings.regex_replace(input_str, "<br />", " ")

# Parallelization parameter of dataset.map
num_cores = 12

# Create the dataset from the reviews ignoring the categorization
ds = keras.utils.text_dataset_from_directory(
    directory = ds_dir,
    labels = None,
    batch_size = batch_size,
    validation_split = None,
    subset = None,
    seed = 1).map(strip_html, num_parallel_calls = num_cores)

Found 50000 files belonging to 1 classes.


Note: You can also import the dataset via tensorflow_datasets:
```
import tensorflow_datasets as tfds

#tfds.__version__ == '4.9.6'

ds_train, ds_test, ds_unsup = tfds.load(name="imdb_reviews",
          split=['train', 'test', 'unsupervised'],
          batch_size=-1,
          data_dir=home_dir,
          as_supervised=False)
```

## Pre-trained embedding

For most applications, you can simply choose a pre-trained model from a plethora of available text embedding models such as Google's Neural-Net Language Models (NNLM) which are trained on Google News datasets. Using a pre-trained embedding provides an accurate representation of (common) text data to your model and can reduce the time and resources required to train and fine-tune your model.

In [8]:
# Define a function to download a google NNLM model
def download_g_nnlm(nnlm_model: str):
    '''Downloads the NNLM model from Google's NNLM models on kaggle
    (https://www.kaggle.com/models/google/nnlm) where pretrained
    embedding models in multiple languages are available. You can
    choose the embedding dimension to be 50 or 128. Finally, you can
    choose models with or without text normalization to remove 
    punctuations.

    The downloaded model is moved to:
        home_dir/models/google_nnlm/lled(wn)
        ll: 2 letter abbreviation of the language (en, es, de,...)
        ed: embedding dimension (50 or 128)
        wn: with normalization
    '''
    # model specs
    model = nnlm_model.split('/')[-1]
    language = model.split('-')[0]
    embedding_dim = model.split('-')[1].split('dim')[-1]
    model_dir = f'{home_dir}/models/google_nnlm/{language}{embedding_dim}'
    if len(model.split('-'))>2:
        model_dir += '_wn'
    # Check if the model is on disk
    if os.path.exists(model_dir):
        print('Model is already on disk.')
        return None
    # download model
    path = kagglehub.model_download(nnlm_model, force_download=True)
    # Create model_dir
    os.mkdir(model_dir)
    # Move downloaded model to model_dir
    for dir, contents, files in os.walk(path):
        # Create sub-directory
        if not os.path.exists(model_dir+dir[len(path):]):
            os.mkdir(model_dir+dir[len(path):])
        # Move files
        for file in files:
            os.rename(dir+'/'+file, model_dir+dir[len(path):]+'/'+file)
    print('Model downloaded succesfully.')
    return None

In [9]:
# Create a directory for google nnlm models
if not os.path.exists(home_dir+'/models/google_nnlm'):
    os.mkdir(home_dir+'/models/google_nnlm')

# Download English text embedding model
download_g_nnlm("google/nnlm/tensorFlow2/en-dim50-with-normalization")

Model is already on disk.


In [10]:
# You can load the embedding model via:
#    hub.load(path) or tf.saved_model.load(path) as a tensorflow model
#    hub.KerasLayer(path, input_shape=[], dtype=tf.string) as a keras layer.
embedding_en50wn = hub.load(home_dir+'/models/google_nnlm/en50_wn')

num_embeddings_en50wn, embedding_dim_en50wn = embedding_en50wn.embeddings.shape

print(f'number of embedding vectors = {num_embeddings_en50wn}')
print(f'embedding dimension = {embedding_dim_en50wn}')

number of embedding vectors = 963812
embedding dimension = 50


In [11]:
# Here is the output of the embedding on a sample string
for x in ds.take(1):
    i = 4
    sample_embedding = embedding_en50wn(x.numpy()[:i])
    sample_string = x.numpy()[0].decode('utf-8').split(' ')[:i]
    sample_string = np.reshape(
        np.transpose(
            np.concatenate(
                [[sample_string],[[' ']*len(sample_string)]], axis=0
            )
        ), (-1))
    sample_string = ''.join(sample_string[:-1])
    print(f'Sample string "{sample_string} ..."\n is embedded as:')
    tf.print(sample_embedding)

Sample string "Fairly entertaining movie, but ..."
 is embedded as:
[[0.667886376 0.0443405434 -0.149116173 ... -0.410180092 -0.107391208 -0.155435026]
 [0.302538723 0.198385894 -0.0855945349 ... -0.781784713 0.252008289 0.321103722]
 [0.182023913 0.232008681 0.136273429 ... -0.215836763 0.132544205 -0.0584928058]
 [0.21110025 0.314057738 -0.119848102 ... -0.0213225503 0.447388232 -0.0899484828]]


### Embedding vocabulary

In order to translate embedding vectors back to tokens, we need to form the embedding vocabulary. This takes the form of a dictionary {index : token} where the index identifies the embedding vector from our set of 963812 embedding vectors that comprise the weights of the embedding model and the token is the token string corresponding to that vector.

In [12]:
# Define a function to generate embedding vocabulary
def generate_nnlm_vocab(model_path: str,
                        embedding,
                        unmatched_indices = [],
                        unmatched_tokens = [],
                        populate_unmatched = True) -> dict:
    '''Generates a dictionary for a pretrained Google NNLM embedding
    model to map embedding vector indices to token strings: 
        vocab[i] = token  <-->  embedding(token) = weights[i]
    Returns a dictionary.

    Inputs:
        model_path: A string specifying the path of the model
        Ex: .../models/google_nnlm/en50_wn
        
        embedding: The model pre-loaded as a keras model
        
        unmatched_indices: An empty list which will be updated
        with vector indices that could not be matched with
        token strings in the tokens file.

        unmatched_tokens: An empty list which will be updated
        to store token strings whose embedding is not an exact
        match with the embedding vectors.

        populate_unmatched: When False, the output dictionary
        will not have unmatched indices as keys.
        When True, dict.keys = range(num_embeddings).
    '''
    num_embeddings, embedding_dim = embedding.embeddings.shape
    # Load the list of token strings from the tokens file
    token_strings = []
    with open(model_path+'/assets/tokens.txt','r') as tokens:
        for t in tokens:
            token_strings.append(t.strip('\n'))
    # Convert to numpy array
    token_strings = np.array(token_strings).astype(str)
    # Evaluate the embedding vectors associated to the tokens
    token_embeddings = embedding(token_strings)
    # Pad token embeddings to match embedding.weights[0].shape
    token_embeddings = tf.pad(token_embeddings,
                              [[0,num_embeddings-len(token_strings)],[0,0]],
                              "CONSTANT")
    # Find the indices of tokens with matching embedding vectors.
    # i.e. (index, token) such that embedding(token) == weights[i]
    matched_indices = tf.squeeze(
        tf.where(
            tf.math.reduce_all(token_embeddings == embedding.embeddings,
                               axis=1)
        )).numpy()
    # Create a dictionary to store the index:token pairs
    vocabulary = pd.DataFrame(token_strings[matched_indices],
                              index = matched_indices).to_dict()[0]
    # Add the indices of unmatched embedding vectors to unmatched_indices
    unmatched_indices += np.delete(np.arange(num_embeddings), matched_indices).tolist()
    # Add unmatched token strings to unmatched_tokens
    unmatched_tokens += np.delete(token_strings, matched_indices).tolist()
    # Populate the unmatched indices in the vocabulary
    if populate_unmatched:
        vocabulary.update({i:f'#UNK_{i}' for i in unmatched_indices})
    #
    print(f'Number of embedding vectors = {num_embeddings}')
    print(f'Number of tokens = {len(token_strings)}')
    print(f'Number of matched (index,token) pairs = {len(matched_indices)}')
    print(f'Number of unmatched embedding vectors = {len(unmatched_indices)}')
    print(f'Number of unmatched token strings = {len(unmatched_tokens)}')
    #
    return vocabulary

In [13]:
# Create two numpy arrays to store unmatched indices and token strings
unmatched_indices_en50wn = []
unmatched_tokens_en50wn = []

# Generate the embedding model vocabulary
vocabulary_en50wn = generate_nnlm_vocab(home_dir+'/models/google_nnlm/en50_wn',
                                        embedding_en50wn,
                                        unmatched_indices_en50wn,
                                        unmatched_tokens_en50wn)

Number of embedding vectors = 963812
Number of tokens = 960752
Number of matched (index,token) pairs = 899290
Number of unmatched embedding vectors = 64522
Number of unmatched token strings = 61462


In [14]:
# The unmatched tokens will include punctuations (since we are
# using the model with normalization) as well as infrequent tokens
# which are grouped into hash buckets.
unmatched_tokens_en50wn[:10]

['</S>', ',', '.', '"', '-', "'s", '##', ')', '(', '####']

### Embedding distance

Generative NLP models, such as large language models (LLM) first generate a vector, V, in the embedding space. In order to match V to a token we need to find the token with the 'closest' embedding vector to V. Thus, we need a notion of distance or similarity in our embedding space. The two common choices are the Euclidean distance (keras.losses.MeanSquaredError) and cosine similarity (keras.losses.cosine_similarity).

In [15]:
# Define a function to find the k nearest neighboring embedding
# vectors of a point in the embedding space.
def token_knn_en50wn(v: tf.Tensor, k=1) -> pd.DataFrame:
    ''' For a point in the embedding space identified by the
    50-component tensor, v, finds the k nearest token indices
    identified by token embeddings of the google nnlm en50 --
    with normalization -- model.

    Inputs:
        v: a tf.Tensor of shape (50,) or (1,50).
        k: an integer > 0
    Returns a dataframe with the index column listing the nearby
    tokesn and the distance column their distance from the input
    vector.
    '''
    x = tf.math.reduce_sum(
        tf.square(tf.subtract(embedding_en50wn.embeddings, v)),
        axis=1).numpy()
    x = pd.DataFrame(x).sort_values(by=0).rename(columns={0: 'distance'}).iloc[:k]
    x.index = x.index.map(vocabulary_en50wn)
    x.distance = np.sqrt(x.distance)
    return x

In [16]:
# We can use this function to study very basic word relationships:
sample_token = 'bike'
k = 5
print(f'The {k} closest tokens to "{sample_token}":')
token_knn_en50wn(embedding_en50wn([sample_token]), k)

The 5 closest tokens to "bike":


Unnamed: 0,distance
bike,0.0
bicycle,0.585474
motorbike,0.671201
skateboard,0.678404
scooter,0.685947


## Custom embedding

A pre-trained embedding has a fixed tokenization which is most appropriate for its training data. Thus, if you want to choose your own tokens or if your dataset has uncommon tokens, you may need to build your own embedding model.

### Standardization

Our first task is standardize the data. To minimize the number of tokens we convert all the text into lower case and remove all symbols and punctuations.

In [17]:
# Create a string of all the charachters to strip from the dataset.
strip_chars = string.punctuation

print(f'The following charachters will be striped: {strip_chars}')

# Escape the special charachters via the regular expression package.
# The square brackets indicate that there are multiple patterns.
strip_chars = '['+re.escape(strip_chars)+']'

# Define a standardization function for the text dataset.
def custom_standardization(input_str):
    '''Standardizes an input string or tensor of strings by
    convering all the charachters to lower case and striping
    the charachters in strip_chars. Returns a string valued
    tf.tensor.
    '''
    return tf.strings.regex_replace(tf.strings.lower(input_str), strip_chars, "")

The following charachters will be striped: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


### Vectorization

Next we need to convert the standardized words/tokens into vectors. We can simply list all the unique words as our vocabulary and use the index or the one-hot vectors, but in a large corpus of text the number of unique words may be too large. Thus, to limit the size of the vocabulary we also need to consider the frequency of appearance of each word and only drop the least frequent words.

We can achieve this using the TextVectorization layer in Keras.

In [18]:
# Define a function to count the number of unique tokens in a dataset
def count_unique_tokens(ds: tf.data.Dataset) -> int:
    ''' Counts the number of unique words/tokens in a
    string-valued dataset.
    '''
    unique_tokens = np.array([], dtype=str)
    for batch in ds:
        batch_tokens = np.array([], dtype=str)
        for s in batch.numpy():
            batch_tokens = np.concatenate([batch_tokens, s.decode("utf-8").split(' ')])
            batch_tokens = np.unique(batch_tokens)
        unique_tokens = np.unique(np.concatenate([unique_tokens, batch_tokens]))
    return unique_tokens.shape[0]

# Find the number of unique tokens in the dataset
num_unique_tokens = count_unique_tokens(ds.map(custom_standardization))
print(f'Found {num_unique_tokens} unique tokens.')

Found 172830 unique tokens.


In [19]:
# Note that the vast majority of the unique tokens we counted
# appear only a handful of times.
# Set the size of the vocabulary
imdb_vocab_size = 120000

# Create a vectorization layer for the dataset.
imdb_vectorization = layers.TextVectorization(
    max_tokens=imdb_vocab_size,
    output_mode="int",
    output_sequence_length=None,
    ragged=True,
    standardize=custom_standardization)

# Load or adapt the vectorization layers vocabularies.
if os.path.exists(home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}.csv'):
    imdb_vectorization.set_vocabulary(list(pd.read_csv(
        home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}.csv',
        header=None,
        na_filter='')[0]))
else:
    imdb_vectorization.adapt(ds)

# Save vocabulary to disk
if not os.path.exists(home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}.csv'):
    pd.DataFrame(imdb_vectorization.get_vocabulary()).to_csv(
        home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}.csv',
        header=False,
        index=False)

In [20]:
# The vectorization layer takes a string (review from our dataset)
# as input and generates an integer-valued tensor as the output.
# The values can be interpreted as the index of each word in the corresponding vocabulary.
print(f'\nThe sample string "{sample_string} ..." vectorizes as: \n ')
tf.print(imdb_vectorization(np.array([sample_string])))


The sample string "Fairly entertaining movie, but ..." vectorizes as: 
 
[[930, 392, 17, 18]]


In [21]:
# The vectors can be reverted back into text via a dictionary that maps
# the indices to the vocabulary.
imdb_vocabulary = imdb_vectorization.get_vocabulary()
imdb_vocabulary = dict(zip(range(len(imdb_vocabulary)), imdb_vocabulary))

def decode_imdb_vector(v: np.array) -> str:
    '''Transforms a vectorized imdb review back to
    English text using the recorded vocabulary.
    Returns a string.
    '''
    tokens = list(map(imdb_vocabulary.get, v))
    tokens = np.reshape(np.transpose(np.concatenate([[tokens], [[' ']*v.shape[-1]]])), (-1))
    return ''.join(tokens).rstrip()

In [22]:
# Here is an example for our decoder:
print(f'Original text: {sample_string} ...\n')
print('Decoded text: ',decode_imdb_vector(np.array([930, 392, 17, 18])))

Original text: Fairly entertaining movie, but ...

Decoded text:  fairly entertaining movie but


In [23]:
'''
Note that the only information the vectorization layers encode is the 
frequency of appearance of words in our dataset. Word relationships 
such as synonymity are not encoded. To illustrate this, consider the 
synonymous words "bike" and "scooter".
'''
# Indices of synonymous words
print('The index corresponding to the word "bike" is {}'.format(
    imdb_vectorization(np.array(['bike'])).numpy()[0][0]))
print('The index corresponding to the word "scooter" is {}'.format(
    imdb_vectorization(np.array(['scooter'])).numpy()[0][0]))

The index corresponding to the word "bike" is 6866
The index corresponding to the word "scooter" is 24990


### Word2Vec embedding

<a href="https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf">Word2Vec</a> is a class of neural network models proposed to generate an effective embedding by analyzing the pairing of words in a corpus of text. To illustrate the method by which the embedding vectors are generated consider the following simple model.

We start with an embedding space and assign two random vectors, u and v, to each token in our vocabulary. The vectors u[i] will serve as the embedding vectors while the vectors v[i] are auxiliary vectors that will capture 'compatibility'. If two words appear together in our dataset (ex: "entertaining" and "movie") we consider them compatible. Correspondingly, the model determines compatibility of the two words based on the inner product of their embedding vectors v[i].u[j].

#### Positive skipgrams

To train our model, we need to generate a dataset of pairs of neighboring words. We can further improve the model by including next to nearest words and so on, but for this simple example we will only consider adjacent words or 'bigrams'. (You can also generate such a dataset using keras.preprocessing.sequence.skipgrams.)

In [24]:
# Define a function to break text into sentences.
def break_into_sentences(batch: tf.RaggedTensor, sep=['. ', '; ']) -> tf.RaggedTensor:
    ''' Breaks down a string valued dataset into sentences using the
    markers in 'sep'. Returns a ragged tensor of the same dimension with
    the sentences stacked along axis=0.
    Use with dataset.map().
    '''
    x = batch
    for s in sep:
        x = tf.strings.split(x, s)
        x = x.merge_dims(0,1)
    return x

# Define a function to add markers at the two ends of each sentence
def add_markers(batch: tf.RaggedTensor) -> tf.RaggedTensor:
    ''' Adds the markers "[start]" and "[end]" to the beginning and
    the end of each setence in the dataset.
    Use with dataset.map().
    '''
    m0 = tf.fill(batch.shape, '[start]')
    m1 = tf.fill(batch.shape, '[end]')
    return tf.strings.join([m0, batch, m1], separator=' ')

In [25]:
# Create a copy of the dataset and break it down into sentences.
ds_b = ds.map(break_into_sentences, num_parallel_calls = num_cores)

# Standardize the dataset
ds_b = ds_b.map(custom_standardization, num_parallel_calls = num_cores)

# Add markers to mark the beginning and the end of each sentence.
ds_b = ds_b.unbatch().map(
    add_markers, num_parallel_calls = num_cores).batch(batch_size)

In [26]:
# Re-initialize the vectorization layer without standardization
imdb_vectorization = layers.TextVectorization(
    max_tokens=imdb_vocab_size,
    output_mode="int",
    output_sequence_length=None,
    ragged=True,
    standardize=None)

# Reload or re-adapt the vectoriazation layer to include the markers.
if os.path.exists(home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}_wm.csv'):
    imdb_vectorization.set_vocabulary(list(pd.read_csv(
        home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}_wm.csv',
        header=None,
        na_filter='')[0]))
else:
    imdb_vectorization.adapt(ds_b)

# Save vocabulary to disk
if not os.path.exists(home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}_wm.csv'):
    pd.DataFrame(imdb_vectorization.get_vocabulary()).to_csv(
        home_dir+f'/datasets/aclImdb/imbd_vocab_{imdb_vocab_size}_wm.csv',
        header=False,
        index=False)

# Update the vocabulary
imdb_vocabulary = imdb_vectorization.get_vocabulary()
imdb_vocabulary = dict(zip(range(len(imdb_vocabulary)), imdb_vocabulary))

# Print first few tokens in vocabulary
print('Our new vocabulary includes the markers: ',
      list(zip(range(5), map(imdb_vocabulary.get, range(5)))))

Our new vocabulary includes the markers:  [(0, ''), (1, '[UNK]'), (2, 'the'), (3, '[start]'), (4, '[end]')]


In [27]:
# Define a function to generate bigrams
def gen_bigrams(batch: tf.RaggedTensor) -> tf.RaggedTensor:
    ''' Generates bigrams -- pairs of adjacent tokens in the
    dataset -- from a (ragged) tensor of sentences (strings).
    Returns a Ragged Tensor of bigrams (strings with 2 tokens).
    Use with dataset.map().
    '''
    x = tf.strings.split(batch, ' ')
    x = tf.strings.join([x[:, :-1], x[:, 1:]], separator=' ')
    x = x.merge_dims(0,1)
    return x

In [28]:
# Transform the sentences dataset into bigrams
ds_b = ds_b.map(gen_bigrams, num_parallel_calls = num_cores)

# Vectorize the bigrams using our vectorization layer
ds_b = ds_b.map(imdb_vectorization, num_parallel_calls = num_cores)

# Convert RaggedTensor to normal Tensor
ds_b = ds_b.map(
    lambda rt: tf.reshape(rt.flat_values, (-1,2)),
    num_parallel_calls = num_cores)

# Re-batch the dataset
ds_b = ds_b.unbatch().batch(batch_size)

In [29]:
# Define a generator that generates 1.
def one_gen():
    yield 1

# Create the positive labels dataset
ds_pos_labels = tf.data.Dataset.from_generator(
    one_gen,
    output_signature=(tf.TensorSpec(shape=(), dtype=tf.int32)))

# Repeat and batch the labels dataset
ds_pos_labels = ds_pos_labels.repeat().batch(batch_size)

# Combine the bigrams and labels dataset to return (bigram, label) pairs
ds_b = tf.data.Dataset.zip((ds_b, ds_pos_labels))

#### Negative skipgrams

So far we only have a dataset of positive (compatible) skipgrams. We can also include negative skipgrams to help better train our embedding model. We will generate these at random.

Note that of all possible bigrams (1.44e10) the vast majority are negative (incompatible) bigrams. Thus, while generating negative bigrams at random does produce 'false negatives', they do not hinder the training of our embedding model. Furthermore, the more compatible a pair of tokens are, the more frequently the appear in the positive bigrams dataset thus reducing the impact of such false negatives.

In [30]:
# Set the number of negative samples
num_neg = int(1e8)

# Define a function to produce random bigrams with label=0
def bigrams_gen(size = num_neg, vocab_size = imdb_vocab_size):
    gen = tf.random.Generator.from_seed(1)
    return (gen.uniform(shape=(size,2), minval=0, maxval=vocab_size, dtype=tf.int64),
           tf.zeros(shape=(size,), dtype=tf.int32))

# Create a dataset of negative/random bigrams
ds_neg = tf.data.Dataset.from_tensor_slices(bigrams_gen()).batch(batch_size)

In [32]:
# Resample the dataset from the combined positive and negative skipgrams datasets
ds_b = tf.data.Dataset.sample_from_datasets([ds_b, ds_neg], weights=[0.5,0.5])

# Increase batch size and re-batch the dataset
batch_size = 256
ds_b = ds_b.unbatch().batch(batch_size)

# Shuffle, prefetch and chache
ds_b = ds_b.shuffle(256).prefetch(128).cache()

#### Embedding model

We can now implement our simple word2vec model and train it using the imdb skipgrams dataset.

In [33]:
# Define a function to generate our simple word2vec model
def simple_word2vec_model(embedding_dim : int) -> keras.Model:
    ''' Creates a simple word2vec model which takes pairs of integers,
    which represent the index of tokens in imdb_vocabulary, as inputs
    and measures their compatibility using their embeddings.
    '''
    x = layers.Input(shape = (2,))
    x = layers.Reshape(target_shape = (2,1))(x)
    # branch_0: embedding of the bigrams[:, 0] tokens
    u = layers.Cropping1D(cropping = (0,1))(x)
    u = layers.Reshape(target_shape = ())(u)
    u = layers.Embedding(input_dim=imdb_vocab_size, output_dim=embedding_dim)(u)
    # branch_1: Auxiliary embedding of the bigrams[:,1] tokens
    v = layers.Cropping1D(cropping = (1,0))(x)
    v = layers.Reshape(target_shape = ())(v)
    v = layers.Embedding(input_dim=imdb_vocab_size, output_dim=embedding_dim)(v)
    # Compatibility of the output of the two branches
    y = layers.Dot(axes=(1,1))([u,v])
    y = layers.Activation('sigmoid')(y)
    return keras.Model(x, y, name = 'simple_word2vec')

In [34]:
# Initialize and compile the model using the appropriate optimizer,
# loss function and metrics.
embed_dim = 48
sw2v_model = simple_word2vec_model(embed_dim)

sw2v_optimizer = "adam"
sw2v_loss = "binary_crossentropy"
sw2v_metrics = ["binary_accuracy"]

sw2v_model.compile(optimizer=sw2v_optimizer, loss=sw2v_loss, metrics=sw2v_metrics)

# Generate a summary of the model
sw2v_model.summary()

Model: "simple_word2vec"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 2, 1)]       0           []                               
                                                                                                  
 cropping1d (Cropping1D)        (None, 1, 1)         0           ['input_2[0][0]']                
                                                                                                  
 cropping1d_1 (Cropping1D)      (None, 1, 1)         0           ['input_2[0][0]']                
                                                                                                  
 reshape_1 (Reshape)            (None,)              0           ['cropping1d[1][0]']             
                                                                                    

In [None]:
# Train the model/load weights from file
if not os.path.exists(home_dir+f'/models/06_embedding/simple_word2vec_{embed_dim}.h5'):
    sw2v_history = sw2v_model.fit(ds_b,
                                  batch_size=batch_size,
                                  steps_per_epoch=1024,
                                  epochs=4)
else:
    sw2v_model.load_weights(home_dir+f'/models/06_embedding/simple_word2vec_{embed_dim}.h5')

In [36]:
# Save weights to file
if not os.path.exists(home_dir+'/models/06_embedding'):
    os.mkdir(home_dir+'/models/06_embedding')

if not os.path.exists(home_dir+f'/models/06_embedding/simple_word2vec_{embed_dim}.h5'):
    sw2v_model.save_weights(home_dir+f'/models/06_embedding/simple_word2vec_{embed_dim}.h5')
else:
    print('File already exists!')

In [37]:
# Extract the weights of the embedding layer as the embedding map
embedding_map = sw2v_model.weights[0].numpy()

# Define a function to map words to their embedding
def word_embed(word: str) -> np.array:
    return embedding_map[imdb_vectorization(word).numpy()[0]]

# Define a function to find the k nearest neighboring embedding
# vectors of a point in the embedding space.
def token_knn_sw2v(v: tf.Tensor, k=1) -> pd.DataFrame:
    ''' For a point in the embedding space identified by the
    32-component tensor, v, finds the k nearest token indices
    identified by token embeddings of sw2v_model.

    Inputs:
        v: a tf.Tensor of shape (32,) or (1,32).
        k: an integer > 0
    Returns a dataframe with the index column listing the nearby
    tokesn and the distance column their distance from the input
    vector.
    '''
    x = tf.math.reduce_sum(
        tf.square(tf.subtract(embedding_map, v)),
        axis=1).numpy()
    x = pd.DataFrame(x).sort_values(by=0).rename(columns={0: 'distance'}).iloc[:k]
    x.index = x.index.map(imdb_vocabulary)
    x.distance = np.sqrt(x.distance)
    return x

Bear in mind that we have only trained the simplest word2vec model and with only a small specialized corpus of text. Thus we can only expect the model to capture word relationships in the context of sentiment analysis of movie reviews.

In [40]:
print('Examples of frequent words ("____ movie"):' , '\n')
for word in ['dramatic', 'action']:
    print( token_knn_sw2v(word_embed(word), 3) ,'\n' )

print('Examples of infrequent words:' , '\n')
for word in ['cat', 'norway']:
    print( token_knn_sw2v(word_embed(word), 3) ,'\n' )

Examples of frequent words ("____ movie"): 

          distance
dramatic   0.00000
strange    0.30421
cheap      0.34318 

             distance
action       0.000000
interesting  0.396115
excellent    0.409452 

Examples of infrequent words: 

            distance
cat         0.000000
lies        0.269113
flashbacks  0.289725 

        distance
norway  0.000000
caesar  0.275237
rosss   0.276205 



### Aside: Sparse tensors

Many of the operations in this notebook involve sparsely populated tensors and can be replicated or are implicitly done using tf.sparse.SparseTensor objects. It is therefore beneficial to familiarize yourself with sparse tensors and sparse operations. We will discuss two examples here: Sparse one-hot vector and Sparse bigrams dataset.

#### Sparse one-hot vectors

While the keras embedding layer maps token index to embedding vector, the weights of the embedding layer have the following shape: (vocabulary_size, embedding_dim). To achieve the same effect using a dense layer, there is an intermediate step which is to map token index to a one-hot vector. The efficient way to store the one-hot vectors with a large number of classes is to use sparse tensors.

In [38]:
# Define a function to generate one-hot encodings as sparse tensors.
def sparse_one_hot(indices: tf.Tensor, num_classes: tf.int64) -> tf.SparseTensor:
    ''' Generates the one-hot encodings from the 'indices' array in
    the form of a sparse tensor. It is the sparse equivalent of tf.one_hot.

    indices: A numpy array of integers valued in the range(0,num_classes)
    num_classes: Number of classes which determines the shape of the output
    tensor along axis=1.
    '''
    sample_size = indices.__len__()
    indices = tf.cast(indices, dtype=tf.int64)
    indices = tf.concat([tf.expand_dims(tf.range(sample_size, dtype=tf.int64), axis=1),
                         tf.expand_dims(indices, axis=1)], axis=1)
    return tf.SparseTensor(indices = indices,
                           values = tf.ones(shape=(sample_size,), dtype=tf.int64),
                           dense_shape = (sample_size, num_classes))

In [39]:
# Take a sample review from the imdb reviews dataset.
for x in ds.take(1).map(custom_standardization):
    sample_review = x.numpy()[0]

# Vectorize the review using the imdb_vectorization layer and
# Convert to sparse one-hot encoding
sample_review_1h = imdb_vectorization(sample_review)
sample_review_1h = sparse_one_hot(sample_review_1h, num_classes=imdb_vocab_size)

# Print one-hot encodings
print('Here is the sparse one-hot encoding of our sample review: \n')
tf.print(sample_review_1h)

Here is the sparse one-hot encoding of our sample review: 

'SparseTensor(indices=[[0 1027]
 [1 238]
 [2 2]
 ...
 [632 55]
 [633 57]
 [634 586]], values=[1 1 1 ... 1 1 1], shape=[635 120000])'


In [40]:
# When the inputs are sparse tensors we need to use the Input layer with
# "sparse = True".
embedding_model_s1h = keras.Sequential([
    layers.Input(shape=(imdb_vocab_size), sparse=True),
    layers.Dense(units=embed_dim, use_bias=False)],
    name='Embedding_sparse_1h')

# Load the trained weights from our simple w2v model
embedding_model_s1h.set_weights([embedding_map])

# Find the embedding of the sample review
embedding_model_s1h(sample_review_1h)

<tf.Tensor: shape=(635, 64), dtype=float32, numpy=
array([[ 0.23788725, -0.2656307 , -0.23392057, ...,  0.29518747,
         0.2782097 , -0.21971081],
       [ 0.35165983, -0.45316482, -0.30913842, ...,  0.44955376,
         0.48651364, -0.43001372],
       [-0.33992714, -1.0376357 , -1.0656841 , ...,  1.0289359 ,
         1.0317237 , -1.1334589 ],
       ...,
       [ 0.38555515, -0.49675035, -0.41708922, ...,  0.37819645,
         0.5308759 , -0.4329502 ],
       [ 0.43772763, -0.547649  , -0.5353722 , ...,  0.57603467,
         0.5433307 , -0.5437628 ],
       [ 0.09001058, -0.19188467, -0.17871836, ...,  0.23232064,
         0.21456173, -0.21309493]], dtype=float32)>

#### Bigrams dataset as a sparse tensor

Our bigrams dataset is too large to load as a single tensor. The main reason for this is the multiplicity of the bigrams, some of the bigrams appear thousands of times in the dataset!

If we count the number of unique ([a,b] != [b,a]) bigrams with vocabulary size = 120000 we get 1.44e10 bigrams. Combined with the fact that the vast majority of these bigrams are negative bigrams, we can easily load the positive bigrams dataset as a single sparse tensor.

In [41]:
# Reload the positive bigrams dataset
batch_size = 4096
ds_b = ds.map(break_into_sentences)
ds_b = ds_b.map(custom_standardization)
ds_b = ds_b.unbatch().map(add_markers).batch(batch_size)
ds_b = ds_b.map(gen_bigrams)
ds_b = ds_b.map(imdb_vectorization)
ds_b = ds_b.map(lambda rt: tf.reshape(rt.flat_values, (-1,2)))
ds_b = ds_b.unbatch().batch(batch_size)

In [42]:
# Create a sparse tensor to store the frequency of appearance of bigrams.
bigram_freq = tf.sparse.SparseTensor(tf.constant([[]], dtype=tf.int64, shape=(0,2)),
                                     tf.constant([], dtype=tf.int64),
                                     dense_shape=(imdb_vocab_size,imdb_vocab_size))

# Count the frequency of appearance of bigrams in the dataset
for batch in ds_b:
    x = tf.sparse.SparseTensor(batch,
                               tf.ones(shape=(batch.shape[0]), dtype=tf.int64),
                               dense_shape=(imdb_vocab_size,imdb_vocab_size))
    bigram_freq = tf.sparse.add(bigram_freq, x)

In [43]:
# Stats on frequency of appearance of bigrams
print('Number of sparse tensor indices / Number of all bigrams = ',
      f'{bigram_freq.values.shape[0]} / {imdb_vocab_size**2} \n')

print('Maximum value recorded in sparse tensor = ', 
      tf.reduce_max(bigram_freq.values).numpy())

Number of sparse tensor indices / Number of all bigrams =  12146027 / 14400000000 

Maximum value recorded in sparse tensor =  19


#### **Caution**: Repeated indices in sparse tensors

The above stats are obviously incorrect. The reason can be traced back to repeated indices. The way sparse tensors and sparse tensor operations are coded in tensorflow, it is possible to define or obtain sparse tensors with repeated indices:

```
x = tf.sparse.SparseTensor(tf.constant([[0,1], [1,1], [0,1]]),
                           tf.constant([2, 4, 8]),
                           dense_shape=(2,2))
```
However, if you attempt to convert this sparse tensor to a dense tensor with validate_indices=True you will get an error, and with validate_indices=False you will get the last value for each repeated index and **not the sum**.

```
tf.sparse.to_dense(x, validate_indices=False) = [[0, 8],[0, 4]]
```
You can do this operation in SciPy instead where by default the values for repeated indices are summed over, or explicitly sum over the values for repeated indices (not very efficient!).


In [None]:
# Define a function to sum over the values of repeated indices
# for a 2 dimensional sparse tensor.
def reduce_sum_rsi(x : tf.sparse.SparseTensor,
                   batch_size = 512) -> tf.sparse.SparseTensor:
    ''' Reduces a sparse 2 dimensional tensor with repeated 
    indices to a sparse tensor with valid (not repeated) indices 
    by summing over the (integer) values for the repeated indices.
    This operation is done batch-wise by slicing unique(x.indices).
    
    Inputs:
    x: A 2-dimensional sparse tensor with dtype = tf.int64
    batch_size: A positive integer. 
    (Assumes batch_size < len(x.indices) is True.)

    Returns an integer-valued tf.spare.SparseTensor with the
    same dense_shape as x.
    '''
    # Select all unique index pairs
    indices = tf.raw_ops.UniqueV2(x=x.indices, axis = [0])[0]
    # Initialize the values tensor
    values = tf.Variable(tf.zeros(shape = (indices.__len__(),),
                                  dtype=tf.float32))
    
    # Batch parameters
    num_indices = x.indices.__len__()
    num_batches = num_indices // batch_size
    if num_indices % batch_size:
        num_batches +=1
    
    # Loop over batches
    pbar = keras.utils.Progbar(target=num_batches-1)
    s = batch_size
    for k in range(num_batches):
        # Adjust batch_size for the last batch
        if k == num_batches-1 and num_indices % batch_size:
            s = num_indices % batch_size
        
        batch = tf.slice(x.indices, [k*batch_size,0], [s, 2])
        # Match x in batch with y in indices
        # mask.shape = (batch_size, num_indices)
        mask = tf.reduce_all(
            tf.equal(tf.expand_dims(batch, axis=1), indices),
            axis=2)
        # Convert to sparse tensor with dtyep=float32
        # (necessary for sparse_dense_matmul)
        mask = tf.sparse.SparseTensor(
            tf.where(mask),
            tf.ones(shape=(tf.where(mask).__len__(),),
                    dtype=tf.float32),
            dense_shape = mask.shape)
        # Update values
        values.assign_add(
            tf.reshape(
                tf.sparse.sparse_dense_matmul(
                    tf.expand_dims( tf.cast(
                        tf.slice(bigram_freq.values,
                                 [k*batch_size,], [s,]),
                        dtype = tf.float32), axis = 0),
                    mask), 
                shape = (-1,))
        )
        pbar.update(k)

    # Convert to dtype=int64
    values = tf.cast(values, tf.int64)
    
    return tf.sparse.SparseTensor(indices, values, x.dense_shape)

In [None]:
# Sum over the values of repeated indices in bigram_freq
bigram_freq_validated = reduce_sum_rsi(bigram_freq, batch_size=2048)

In [49]:
# Stats on frequency of appearance of bigrams
print('Number of unique positive bigrams / Number of all bigrams = ',
      '{} / {}'.format(
          tf.raw_ops.UniqueV2(
              x=bigram_freq.indices, axis = [0])[0].__len__(), 
          imdb_vocab_size**2)
     )

Number of unique positive bigrams / Number of all bigrams =  2253110 / 14400000000


For large num_unique_indices we also need to loop over unique indices, below is a modified version of reduced_sum_rsi for this case.

```
def reduce_sum_rsi_v2(x : tf.sparse.SparseTensor,
                   batch_size = 512) -> tf.sparse.SparseTensor:
    ''' Reduces a sparse 2 dimensional tensor with repeated 
    indices to a sparse tensor with valid (not repeated) indices 
    by summing over the (integer) values for the repeated indices.
    This operation is done batch-wise by slicing unique(x.indices).
    
    Inputs:
    x: A 2-dimensional sparse tensor with dtype = tf.int64
    batch_size: A positive integer. 
    (Assumes batch_size < len(x.indices) is True.)

    Returns an integer-valued tf.spare.SparseTensor with the
    same dense_shape as x.
    '''
    # Select all unique index pairs
    indices = tf.raw_ops.UniqueV2(x=x.indices, axis = [0])[0]
    # Initialize the values tensor
    values = tf.Variable(tf.zeros(shape = (indices.__len__(),), dtype=tf.int64))
    
    # Batch parameters
    s1 = batch_size
    num_indices = x.indices.__len__()
    num_batches = num_indices // batch_size
    if num_indices % batch_size:
        num_batches +=1

    num_unique_indices = indices.__len__()
    num_unique_batches = num_unique_indices // batch_size
    if num_unique_indices % batch_size:
        num_unique_batches +=1
        
    # Loop over batches
    pbar = keras.utils.Progbar(target=num_batches-1)
    for k in range(num_batches):
        # Adjust batch_size for the last batch
        if k == num_batches-1 and num_indices % batch_size:
            s1 = num_indices % batch_size
        
        batch = tf.slice(x.indices, [k*batch_size,0], [s1, 2])
        s2 = batch_size
        for j in range(num_unique_batches):
            if j == num_unique_batches-1 and num_unique_indices % batch_size:
                s2 = num_unique_indices % batch_size
            # Match x in batch with y in indices
            # mask.shape = (s1, s2)
            mask = tf.reduce_all(
                tf.equal(tf.expand_dims(batch, axis=1), 
                         tf.slice(indices, [j*batch_size,0], [s2, 2])),
                axis=2)
            # Convert to dtyep=int64
            mask = tf.cast(mask, dtype=tf.int64)
            # Update values
            values[j*batch_size:j*batch_size+s2].assign_add(
                tf.reshape(
                    tf.matmul(
                        tf.expand_dims(
                            tf.slice(bigram_freq.values,[k*batch_size,], [s1,]),
                            axis = 0),
                        mask),
                    shape = (-1,))
            )
        
        pbar.update(k)
    
    return tf.sparse.SparseTensor(indices, values, x.dense_shape)
```