# Chapter 16: Natural Language Processing with RNNs and Attention

### Setup

In [9]:
from datetime import date
import os
import collections
import json

# External libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tqdm import tqdm

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from keras import layers, optimizers, Sequential, callbacks, losses, metrics
from tensorflow.keras.optimizers.legacy import Adam, Nadam, SGD
import tensorflow_text as text
import tensorflow_hub as hub
import tensorflow_addons as tfa


# Hugging Face
from transformers import GPT2Tokenizer, TFGPT2Model, pipeline, set_seed

# Personal Libraries
from ml_functions import ml_functions, ml_learning_rate


TRAIN = False


In [4]:
BATCH_SIZE = 128
AUTOTUNE = tf.data.AUTOTUNE


### 8. Embedded Reber grammars were used by Hochreiter and Schmidhuber in [their paper](https://homl.info/93) about LSTMs. They are artificial grammars that produce strings such as “BPBTSXXVPSEPE”. Check out [Jenny Orr’s nice introduction](https://homl.info/108) to this topic, then choose a particular embedded Reber grammar (such as the one represented on Orr’s page), then train an RNN to identify whether a string respects that grammar or not. You will first need to write a function capable of generating a training batch containing about 50% strings that respect the grammar, and 50% that don’t.

In [None]:
reber_list = {
    "B": ["T", "P"],
    "B_T": ["S", "X"],
    "T_S": ["S", "X"],
    "S_S": ["S", "X"],
    "T_X": ["X", "S"],
    "S_X": ["X", "S"],
    "X_X": ["T"],
    "P_X": ["T"],
    "X_S": ["E"],
    "P_S": ["E"],
    "B_P": ["T", "V"],
    "T_T": ["T", "V"],
    "P_T": ["T", "V"],
    "P_V": ["P", "V"],
    "T_V": ["P", "V"],
    "X_V": ["P", "V"],
    "X_T": ["T", "V"],
    "V_P": ["X", "S"],
    "V_V": ["E"],
}

non_reber_list = {
    "B": ["T", "P"],
    "T": ["S", "X", "T", "V"],
    "S": ["S", "X", "E"],
    "X": ["X", "S", "T"],
    "P": ["T", "V", "X", "S"],
    "V": ["P", "V", "E"],
}


def next_letter_reber(letter: str , prev_letter: str):
    if letter == "B":
        options = reber_list[letter]
    else:
        options = reber_list[f"{prev_letter}_{letter}"]

    choice = np.random.choice(options)
    return letter, choice


def next_letter_non_reber(letter: str):
    options = non_reber_list[letter]
    choice = np.random.choice(options)
    return letter, choice


def generate_reber(reber: bool =True):
    letter = "B"
    prev_letter = None
    final_string = letter

    while letter != "E":
        if reber:
            prev_letter, letter = next_letter_reber(letter, prev_letter)
        elif reber == False:
            prev_letter, letter = next_letter_non_reber(letter)

        final_string += letter

    return final_string


def generate_samples(n_samples: int, reber: bool=True):
    def generate_sample():
        string = generate_reber(reber=reber)
        return string

    list_strings = []
    for _ in range(n_samples):
        string = generate_sample()
        list_strings.append(string)

    return list_strings


def generate_classes(list_reber, class_type: int =1):
    return [class_type] * len(list_reber)


def generate_dataset(n_samples: int =10000):
    list_reber = generate_samples(n_samples // 2)
    list_non_reber = generate_samples(n_samples // 2, reber=False)
    list_samples = list_reber + list_non_reber
    X = list_samples

    Y = np.array(
        [[1.0] for _ in range(len(list_reber))]
        + [[0.0] for _ in range(len(list_non_reber))]
    )

    return X, Y


- When creating the `text_vec_layer` object, it's important to use all the training data at the same time, and to limit the output lenght with the `output_sequence_length` parameter.

In [None]:
x_train, y_train = generate_dataset(50000)
x_valid, y_valid = generate_dataset(10000)


In [None]:
def train_model(x_train, y_train, x_valid, y_valid, epochs: int):
    text_vec_layer = (
        layers.TextVectorization(
            split="character", standardize="lower", output_sequence_length=60
        )
    )
    text_vec_layer.adapt(x_train)
    n_tokens = text_vec_layer.vocabulary_size()

    x_train_encoded = text_vec_layer(x_train)
    x_valid_encoded = text_vec_layer(x_valid)


    reber_model = Sequential(
        [
            layers.Embedding(input_dim=n_tokens, output_dim=8, mask_zero=True),
            layers.GRU(256),
            layers.Dense(1, activation="sigmoid"),
        ]
    )

    # Compile the model
    reber_model.compile(
        loss="binary_crossentropy",
        metrics=["accuracy"],
        optimizer=Adam(learning_rate=1e-4),
    )

    # Train the model
    reber_model.fit(x_train_encoded, y_train, validation_data=(x_valid_encoded, y_valid), epochs=epochs)

    model = Sequential([
        text_vec_layer,
        reber_model
    ])
    return model


def create_predictions(strings, model):
    y_proba = model.predict(strings)
    print(y_proba)


In [None]:
with tf.device("/cpu:0"):
    reber_model = train_model(x_train, y_train, x_valid, y_valid, epochs=35)


In [None]:
x_test, y_test = generate_dataset(16)
with tf.device("/cpu:0"):
    y_proba = reber_model.predict(x_test)
    for index, string in enumerate(x_test):
        prob = y_proba[index][0] * 100
        print(f"{string}: {prob:.2f}% ({y_test[index] * 100}%)")

### 9. Train an encoder–decoder model that can convert a date string from one format to another (e.g., from “April 22, 2019” to “2019-04-22”)

We will convert from format `22 April, 2019` to format `22/04/19`.

In [196]:
# cannot use strftime()'s %B format since it depends on the locale
MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

INPUT_CHARS = "".join(sorted(set("".join(MONTHS) + "0123456789, ")))
OUTPUT_CHARS = "1234567890/"
MODEL_FILEPATH = "../../models/chapter_16/date_conversor"


def random_dates(n_dates):
    min_date = date(1000, 1, 1).toordinal()
    max_date = date(9999, 12, 31).toordinal()

    ordinals = np.random.randint(max_date - min_date, size=n_dates) + min_date
    dates = [date.fromordinal(ordinal) for ordinal in ordinals]

    x_dates = [
        f"{dt.strftime('%d')} {MONTHS[dt.month - 1]}, {dt.strftime('%Y')}"
        for dt in dates
    ]
    y_dates = [dt.strftime("%d/%m/%Y") for dt in dates]
    return x_dates, y_dates


def date_to_ids(string, chars=INPUT_CHARS):
    return [chars.index(char) for char in string]


def dates_to_ids(list_dates, chars=INPUT_CHARS):
    list_ids = [date_to_ids(date, chars) for date in list_dates]
    list_ids = tf.ragged.constant(list_ids, ragged_rank=1)
    list_ids = (list_ids + 1).to_tensor()
    return list_ids


def shift_sequence(sequence):
    sos_value = len(OUTPUT_CHARS) + 1
    sos_token = tf.fill(dims=(len(sequence), 1), value=sos_value)
    return tf.concat([sos_token, sequence[:, :-1]], axis=1)


def create_dataset(n_samples):
    x, y = random_dates(n_samples)
    x = dates_to_ids(x)
    y = dates_to_ids(y, OUTPUT_CHARS)
    x_decoder = shift_sequence(y)

    return x, y, x_decoder


In [216]:
def train_model(train_data, valid_data, epochs=10):
    # Separate the data
    x_train, y_train, x_train_decoder = train_data
    x_valid, y_valid, x_valid_decoder = valid_data

    # Inputs of the model
    encoder_inputs = layers.Input(shape=[None])
    decoder_inputs = layers.Input(shape=[None])

    # Embeddings
    embed_size = 32

    encoder_embedding_layer = layers.Embedding(
        input_dim=len(INPUT_CHARS) + 1, output_dim=embed_size, mask_zero=True
    )
    encoder_embeddings = encoder_embedding_layer(encoder_inputs)

    decoder_embedding_layer = layers.Embedding(
        input_dim=len(OUTPUT_CHARS) + 2, output_dim=embed_size, mask_zero=True
    )
    decoder_embeddings = decoder_embedding_layer(decoder_inputs)

    # Encoder
    encoder = layers.GRU(128, return_state=True)
    encoder_outputs, *encoder_state = encoder(encoder_embeddings)

    # Decoder
    decoder = layers.GRU(128, return_sequences=True)
    decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)

    # Attention layer
    attention_layer = layers.Attention()
    attention_outputs = attention_layer([decoder_outputs, encoder_outputs])

    # Output layer
    output_layer = layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
    y_proba = output_layer(decoder_outputs)

    # Create the model
    date_conversor = tf.keras.Model(
        inputs=[encoder_inputs, decoder_inputs], outputs=y_proba
    )

    # Compile the model
    date_conversor.compile(
        metrics=["accuracy"],
        loss="sparse_categorical_crossentropy",
        optimizer=Nadam(),
    )

    # Callbacks
    logs = "../../reports/logs/chapter_16/date_conversor"
    logdir = ml_functions.get_logdir(date_type="datetime", path_folder=logs)
    tensorboard_cb = callbacks.TensorBoard(log_dir=logdir)

    # Train the model
    date_conversor.fit(
        [x_train, x_train_decoder],
        y_train,
        validation_data=([x_valid, x_valid_decoder], y_valid),
        epochs=epochs,
        callbacks=[tensorboard_cb],
    )
    date_conversor.save(MODEL_FILEPATH, save_format="tf")

    print(
        f"""
        The model has been successfully trained!

        The path of the model is:
            {MODEL_FILEPATH}
    """
    )


In [217]:
train_data = create_dataset(100000)
valid_data = create_dataset(20000)

In [218]:
with tf.device("/cpu:0"):
    train_model(train_data, valid_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20

InvalidArgumentError: Graph execution error:

Detected at node 'Nadam/Nadam/update_5/add_2' defined at (most recent call last):
    File "/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/opt/homebrew/Cellar/python@3.10/3.10.11/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/l8/6g7jw01d22z1cl19whfqrxhw0000gn/T/ipykernel_40726/2969415456.py", line 2, in <module>
      train_model(train_data, valid_data, epochs=20)
    File "/var/folders/l8/6g7jw01d22z1cl19whfqrxhw0000gn/T/ipykernel_40726/4196942204.py", line 57, in train_model
      date_conversor.fit(
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/engine/training.py", line 1054, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 588, in minimize
      return self.apply_gradients(grads_and_vars, name=name)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 747, in apply_gradients
      return tf.__internal__.distribute.interim.maybe_merge_call(
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 806, in _distributed_apply
      update_op = distribution.extended.update(
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/optimizers/legacy/optimizer_v2.py", line 785, in apply_grad_to_update_var
      update_op = self._resource_apply_dense(grad, var, **apply_kwargs)
    File "/Users/mmenendezg/Developer/Books/.venv/lib/python3.10/site-packages/keras/optimizers/legacy/nadam.py", line 186, in _resource_apply_dense
      coefficients["one_minus_m_t"] * g_prime
Node: 'Nadam/Nadam/update_5/add_2'
Incompatible shapes: [32,384] vs. [0]
	 [[{{node Nadam/Nadam/update_5/add_2}}]] [Op:__inference_train_function_2270066]

In [None]:
_, y_train, _ = train_data
max_ouput_length = y_train.shape[1]


def evaluate_model(dates: list, targets: list):
    x = dates_to_ids(dates)
    y = dates_to_ids(targets, chars=OUTPUT_CHARS)
    x_decoder = shift_sequence(y)
    date_conversor = tf.keras.models.load_model(MODEL_FILEPATH)
    evaluation = date_conversor.evaluate([x, x_decoder], y)
    print(evaluation)


def ids_to_dates(ids, chars=OUTPUT_CHARS):
    return ["".join([("?" + chars)[index] for index in sequence]) for sequence in ids]


def convert_dates(list_dates: list):
    date_conversor = tf.keras.models.load_model(MODEL_FILEPATH)

    x = dates_to_ids(list_dates)
    y_pred = tf.fill(dims=(len(x), 1), value=len(OUTPUT_CHARS) + 1)
    x_fill = tf.fill(dims=(len(x), max_ouput_length - 1), value=0)
    x_decoder = tf.concat([y_pred, x_fill], axis=1)
    for index in range(max_ouput_length):
        y_probas_next = date_conversor.predict([x, x_decoder], verbose=0)[
            :, index : index + 1
        ]
        y_pred_next = tf.argmax(y_probas_next, axis=-1, output_type=tf.int32)
        y_pred = tf.concat([y_pred, y_pred_next], axis=1)
    print(ids_to_dates(y_pred[:, 1:]))


In [None]:
list_dates = [
    "15 June, 1992",
    "21 March, 1995",
    "16 August, 1996",
    "22 December, 1993",
    "21 April, 2016",
    "21 August, 2022"
]

list_targets = [
    "15/06/1992",
    "21/03/1995",
    "16/08/1996",
    "22/12/1993",
    "21/04/2016",
    "21/08/2022"
]

convert_dates(list_dates)

['18/02/9196', '21/03/9166', '19/08/9196', '22/12/1119', '21/04/4592', '21/08/4522']


### 10. Go through the example on the Keras website for [“Natural language image search with a Dual Encoder”](https://homl.info/dualtuto). You will learn how to build a model capable of representing both images and text within the same embedding space. This makes it possible to search for images using a text prompt, like in the CLIP model by OpenAI.

In [3]:
tf.get_logger().setLevel("ERROR")

#### Data Preparation

In [4]:
MS_COCO_DIR = "../../datasets/chapter_16/ms_coco" 
annotations_dir = tf.io.gfile.join(MS_COCO_DIR, "annotations")
images_dir = tf.io.gfile.join(MS_COCO_DIR, "train_2014")
tfrecords_dir = tf.io.gfile.join(MS_COCO_DIR, "tfrecords")
annotation_file = tf.io.gfile.join(annotations_dir, "captions_train2014.json")

# Download caption annotation files
if not tf.io.gfile.exists(annotations_dir):
    annotation_zip = tf.keras.utils.get_file(
        "captions.zip",
        cache_dir=os.path.abspath("."),
        origin="http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
        extract=True,
    )
    os.remove(annotation_zip)

# Download the image files
if not tf.io.gfile.exists(images_dir):
    image_zip = tf.keras.utils.get_file(
        "train2014.zip",
        cache_dir=os.path.abspath("."),
        origin="http://images.cocodataset.org/zips/train2014.zip",
        extract=True,
    )
    os.remove(image_zip)

print("Dataset is downloaded and extracted successfully.")

Dataset is downloaded and extracted successfully.


In [5]:
with open(annotation_file, "r") as f:
    annotations = json.load(f)["annotations"]

image_path_to_caption = collections.defaultdict(list)
for element in annotations:
    caption = f"{element['caption'].lower().rstrip('.')}"
    image_path = images_dir + "/COCO_train2014_" + "%012d.jpg" % (element["image_id"])
    image_path_to_caption[image_path].append(caption)

image_paths = list(image_path_to_caption.keys())
print(f"Number of images: {len(image_paths)}")

Number of images: 82783


#### Process and save the data to TFRecord files

In [6]:
train_size = 30_000
valid_size = 5_000
captions_per_image = 2
images_per_file = 4_000

train_image_paths = image_paths[:train_size]
num_train_files = int(np.ceil(train_size / images_per_file))
train_files_prefix = os.path.join(tfrecords_dir, "train")

valid_image_paths = image_paths[-valid_size:]
num_valid_files = int(np.ceil(valid_size / images_per_file))
valid_files_prefix = os.path.join(tfrecords_dir, "valid")

tf.io.gfile.makedirs(tfrecords_dir)

In [7]:
def bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def create_example(image_path, caption):
    features = tf.train.Features(
        feature={
            "caption": bytes_feature(caption.encode()),
            "raw_image": bytes_feature(tf.io.read_file(image_path).numpy()),
        }
    )
    return tf.train.Example(features=features)


def write_tfrecords(file_name, image_paths):
    caption_list = []
    image_path_list = []
    for image_path in image_paths:
        captions = image_path_to_caption[image_path][:captions_per_image]
        caption_list.extend(captions)
        image_path_list.extend([image_path] * len(captions))

    with tf.io.TFRecordWriter(file_name) as writer:
        for example_idx in range(len(image_path_list)):
            example = create_example(
                image_path_list[example_idx], caption_list[example_idx]
            )
            writer.write(example.SerializeToString())
    return example_idx + 1


def write_data(
    image_paths=train_image_paths,
    num_files=num_train_files,
    files_prefix=train_files_prefix,
):
    example_counter = 0
    for file_idx in tqdm(range(num_files)):
        file_name = files_prefix + "-%02d.tfrecord" % (file_idx)
        start_idx = images_per_file * file_idx
        end_idx = start_idx + images_per_file
        example_counter += write_tfrecords(file_name, image_paths[start_idx:end_idx])
    return example_counter


def read_example(example):
    feature_description = {
        "caption": tf.io.FixedLenFeature([], tf.string),
        "raw_image": tf.io.FixedLenFeature([], tf.string),
    }
    features = tf.io.parse_single_example(example, feature_description)
    raw_image = features.pop("raw_image")
    features["image"] = tf.image.resize(
        tf.image.decode_jpeg(raw_image, channels=3), size=(299, 299)
    )
    return features


def get_dataset(file_pattern, batch_size):
    return (
        tf.data.TFRecordDataset(tf.data.Dataset.list_files(file_pattern))
        .map(read_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
        .shuffle(batch_size * 100)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
        .batch(batch_size)
    )


In [8]:
train_example_count = write_data()
print(f"{train_example_count} examples were saved as TFRecord files for training.")

  0%|          | 0/8 [00:00<?, ?it/s]

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



100%|██████████| 8/8 [00:32<00:00,  4.11s/it]

60000 examples were saved as TFRecord files for training.





In [9]:
valid_example_count = write_data(valid_image_paths, num_valid_files, valid_files_prefix)
print(f"{valid_example_count} examples were saved as TFRecord files for validation.")

100%|██████████| 2/2 [00:05<00:00,  2.59s/it]

10000 examples were saved as TFRecord files for validation.





#### Implement the projection head, the vision encoder and text encoder

In [10]:
def project_embeddings(
    embeddings, num_projection_layers, projection_dims, dropout_rate
):
    """
    The projected head is used to transform the image and the text embeddings
    to the same embedding space with the same dimensionality.
    """
    projected_embeddings = layers.Dense(units=projection_dims)(embeddings)
    for _ in range(num_projection_layers):
        x = tf.nn.gelu(projected_embeddings)
        x = layers.Dense(projection_dims)(x)
        x = layers.Dropout(dropout_rate)(x)
        x = layers.Add()([projected_embeddings, x])
        projected_embeddings = layers.LayerNormalization()(x)
    return projected_embeddings


def create_vision_encoder(
    num_projection_layers, projection_dims, dropout_rate, trainable=False
):
    """
    We use Xception as the base model for the vision encoder
    """
    xception = keras.applications.Xception(
        include_top=False, weights="imagenet", pooling="avg"
    )
    # Set the trainaiblity fo the base encoder
    for layer in xception.layers:
        layer.trainable = trainable
    # Receive the images as inputs

    inputs = layers.Input(shape=(299, 299, 3), name="image_input")
    # Preprocess the input image
    xception_input = keras.applications.xception.preprocess_input(inputs)
    # Generate the embeddings for the images using the xception model
    embeddings = xception(xception_input)
    # Project the embeddings produced by the model
    outputs = project_embeddings(
        embeddings, num_projection_layers, projection_dims, dropout_rate
    )
    return keras.Model(inputs, outputs, name="vision_encoder")


def create_text_encoder(
    num_projection_layers, projection_dims, dropout_rate, trainable=False
):
    """
    We use BERT as the base model for the text encoder
    """
    # Load the BERT preprocessing module
    preprocess = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2",
        name="text_preprocessing",
    )
    # Load the pre-trained BERT model to be used as the base encoder
    bert = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
        name="bert",
    )
    # Set the trainability of the base encoder
    bert.trainable = trainable
    # Receive the text as inputs
    inputs = layers.Input(shape=(), dtype=tf.string, name="text_input")
    # Preprocess the text
    bert_inputs = preprocess(inputs)
    # Generate embeddings for the preprocessed text using BERT
    embeddings = bert(bert_inputs)["pooled_output"]
    outputs = project_embeddings(
        embeddings,
        num_projection_layers,
        projection_dims,
        dropout_rate,
    )
    # Create the text encoder model
    return keras.Model(inputs, outputs, name="text_encoder")


#### Implement the dual encoder

In [11]:
class DualEncoder(keras.Model):
    def __init__(self, text_encoder, image_encoder, temperature=1.0, **kwargs):
        super(DualEncoder, self).__init__(**kwargs)
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.temperature = temperature
        self.loss_tracker = metrics.Mean(name="loss")

    @property
    def metrics(self):
        return [self.loss_tracker]

    def call(self, features, training=False):
        with tf.device("/gpu:0"):
            caption_embeddings = self.text_encoder(
                features["caption"], training=training
            )
        with tf.device("/gpu:1"):
            image_embeddings = self.image_encoder(features["image"], training=training)

        return caption_embeddings, image_embeddings

    def compute_loss(self, caption_embeddings, image_embeddings):
        # logits[i][j] is the dot_similarity(caption_i, image_j)
        logits = (
            tf.matmul(caption_embeddings, image_embeddings, transpose_b=True)
            / self.temperature
        )
        # image_similarity[i][j] is the dot_similarity(image_i, caption_j)
        image_similarity = tf.matmul(
            image_embeddings, caption_embeddings, transpose_b=True
        )
        # caption_similarity[i][j] is the dot_similarity(caption_i, cpation_j)
        caption_similarity = tf.matmul(
            caption_embeddings, caption_embeddings, transpose_b=True
        )
        # targets[i][j] = averate dot_similarity(caption_i, caption_j) and
        # dot_similarity(image_i, image_j)
        targets = tf.keras.activations.softmax(
            (caption_similarity + image_similarity) / (2 * self.temperature)
        )
        # compute the loss for the captions using crossentropy
        caption_loss = losses.categorical_crossentropy(
            y_true=targets, y_pred=logits, from_logits=True
        )
        # compute the loss for the images using crossentropy
        image_loss = losses.categorical_crossentropy(
            y_true=tf.transpose(targets), y_pred=tf.transpose(logits), from_logits=True
        )

        # Return the mean of the loss over the batch
        return (caption_loss + image_loss) / 2

    def train_step(self, features):
        with tf.GradientTape() as tape:
            # Forward Pass
            caption_embeddings, image_embeddings = self(features, training=True)
            loss = self.compute_loss(caption_embeddings, image_embeddings)
        
        # Backward pass
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        # Monitor loss
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}
    
    def test_step(self, features):
        caption_embeddings, image_embeddings = self(features, training=False)
        loss = self.compute_loss(caption_embeddings, image_embeddings)
        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}

#### Train the dual encoder model

In [12]:
num_epochs = 10
batch_size = 256

vision_encoder = create_vision_encoder(
    num_projection_layers=1, projection_dims=256, dropout_rate=0.1
)
text_encoder = create_text_encoder(
    num_projection_layers=1, projection_dims=256, dropout_rate=0.1
)

dual_encoder = DualEncoder(text_encoder, vision_encoder, temperature=0.05)
dual_encoder.compile(
    optimizer=tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-3)
)

In [13]:
print(f"Number of GPUs: {len(tf.config.list_physical_devices('GPU'))}")
print(f"Number of examples (caption-image pairs): {train_example_count}")
print(f"Batch Size: {batch_size}")
print(f"Steps per epoch: {int(np.ceil(train_example_count / batch_size))}")

train_ds = get_dataset(os.path.join(tfrecords_dir, "train-*.tfrecord"), batch_size)
valid_ds = get_dataset(os.path.join(tfrecords_dir, "valid-*.tfrecord"), batch_size)

# Create a learning rate scheduler callback
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=3
)
# Create an early stopping callback
early_stopping_cb = callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

history = dual_encoder.fit(
    train_ds,
    validation_data=valid_ds,
    epochs=num_epochs,
    callbacks=[reduce_lr, early_stopping_cb]
)

print("Training completed. Saving vision and text encoders...")
vision_encoder.save("vision_encoder")
text_encoder.save("text_encoder")
print("Models successfully saved")

Number of GPUs: 1
Number of examples (caption-image pairs): 60000
Batch Size: 256
Steps per epoch: 235
Epoch 1/10
    423/Unknown - 5573s 13s/step - loss: 7.7599

### 11. Use the Hugging Face Transformers library to download a pretrained language model capable of generating text (e.g., GPT), and try generating more convincing Shakespearean text. You will need to use the model’s generate() method—see Hugging Face’s documentation for more details.

In [12]:
generator = pipeline('text-generation', model='gpt2-xl')
set_seed(42)
generator("To be or not to be", max_length=50, num_return_sequences=5)


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2-xl.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'To be or not to be, as it were, or not to be, or not to be, so to speak, according to what we were in the Lord, whether we were there when he made it or on the morrow (2 Corinthians'},
 {'generated_text': "To be or not to be – that is the question, that is the choice – and that is something I struggle with a lot, that's the heart of it. At the age of 18 I had a choice to become a pilot or a pilot"},
 {'generated_text': 'To be or not to be" was the question, and if you thought all of this was being addressed adequately to the people\'s concerns, I would say you were sorely mistaken. All we had was a very vague and highly charged statement saying they were'},
 {'generated_text': 'To be or not to be, that is the question.'},
 {'generated_text': 'To be or not to be: That is the question\n\n"I think everyone, in a position of leadership in life, has to think about this," said the former CIA director.\n\nTrump\'s election has been hailed by some liberals and'}]

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
model = TFGPT2Model.from_pretrained("gpt2-xl")

Downloading tf_model.h5: 100%|██████████| 6.23G/6.23G [12:38<00:00, 8.21MB/s]


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2-xl.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [6]:
text = "To be or not be"
encoded_input = tokenizer(text, return_tensors="tf")


In [7]:
output = model(encoded_input)

In [8]:
print(output)

TFBaseModelOutputWithPastAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 5, 1600), dtype=float32, numpy=
array([[[ 0.2606948 ,  0.581097  ,  0.81493074, ..., -4.227391  ,
          0.28385282,  0.27711612],
        [-1.1135614 ,  0.37249213,  0.9493996 , ..., -1.3213112 ,
         -0.10956633,  0.05264648],
        [ 0.53276235, -1.129468  ,  0.08183268, ..., -1.0769141 ,
          0.08501442,  1.7663525 ],
        [ 0.06902504, -0.60247433,  0.736676  , ..., -1.3213501 ,
          1.2200413 ,  1.4579804 ],
        [-1.3845676 ,  0.3320303 ,  0.83540314, ..., -1.1858324 ,
         -0.01671576,  0.33990306]]], dtype=float32)>, past_key_values=(<tf.Tensor: shape=(2, 1, 25, 5, 64), dtype=float32, numpy=
array([[[[[ 6.79623008e-01,  6.16473556e-01, -1.19582736e+00, ...,
            5.10271668e-01,  3.62511784e-01,  2.81917393e-01],
          [-3.57440680e-01, -4.02378291e-01,  1.16328847e+00, ...,
           -1.69485033e-01, -3.56251150e-01, -1.76885203e-01],
          [ 1.01503