This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**

This notebook was generated for TensorFlow 2.6.

### Processing words as a sequence: The sequence model approach

#### A first practical example

## Downloading and pre-processing the data

**These data are the same as those in notebook 601, so if you keep the aclImdb directory structure exactly as used in that notebook, you will be able to use this code directly**

**Preparing integer sequence datasets**

In [1]:
import os, pathlib, shutil, random
from tensorflow import keras
batch_size = 32
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
#for category in ("neg", "pos"):
#    os.makedirs(val_dir / category)
#    files = os.listdir(train_dir / category)
#    random.Random(1337).shuffle(files)
#    num_val_samples = int(0.2 * len(files))
#    val_files = files[-num_val_samples:]
#    for fname in val_files:
#        shutil.move(train_dir / category / fname,
#                    val_dir / category / fname)

train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)
text_only_train_ds = train_ds.map(lambda x, y: x)

2024-11-14 19:42:33.409606: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-14 19:42:33.647851: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-14 19:42:33.859046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-14 19:42:34.166737: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-14 19:42:34.247347: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-14 19:42:34.647711: I tensorflow/core/platform/cpu_feature_gu

Found 25000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [8]:
from tensorflow.keras import layers

max_length = 300
max_tokens = 10000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

2024-11-14 19:46:03.194344: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [3]:
int_train_ds

<_ParallelMapDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [4]:
for inputs, targets in int_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[10]:", targets[0])
    break

inputs.shape: (32, 300)
inputs.dtype: <dtype: 'int64'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(
[ 154 6072 1256    5   49    4 3491   18  139   28  101    1  129   12
   41 1460    1  254 1256    1    3  546  170 1720   52   11    7   49
    4  146 3491   18  139  163   39    3  232   39   54 1234   11    7
    4    1    5 2510    2  690    6 1557  109    3  216    7 6272  146
  255   11    7    4 1700 4804  361    9 1391    6  896  196    1    1
 7262 2794   81 3491 2879   93  132    2  274    3  667  102  504    3
  163   39   35   24  146  119   83  397    4 1806   12  157  263 1566
    3  263   39    2  146  151 1313    2   97    9  139   28  201    2
 5683    1 3340  654 9043    6 2341    1    1   16  126 1493  152   12
   90 1594   10   41  624   12   47 6350 2923 5636    1  388    2  597
    6 2185    4  371 1948 1220    5   11 7057   10   90  451   45    9
  261    6 2195   42  790    6  294   10   59  110   76 1403    5  147
    9   41    2  1

**A sequence model built on one-hot encoded vector sequences**

### Do not .fit it! Takes too long for nothing

In [13]:
import tensorflow as tf
from tensorflow.keras.layers import CategoryEncoding, Input, Embedding
from tensorflow.keras.models import Model

inputs = keras.Input(shape=(None,), 
                     dtype="int64")

multi_hot_layer = CategoryEncoding(num_tokens=max_tokens, 
                                   output_mode="multi_hot")(inputs)

x = layers.Bidirectional(layers.LSTM(32))(multi_hot_layer)

x = layers.Dropout(0.5)(x)

outputs = layers.Dense(1, 
                       activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

ValueError: Input 0 of layer "bidirectional" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 10000)

**Training a first basic sequence model**

In [None]:
#model.fit(int_train_ds, 
#          validation_data=int_val_ds, 
#          epochs=10)


In [None]:
#model = keras.models.load_model("one_hot_bidir_lstm.keras")
#print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

#### Understanding word embeddings

#### Learning word embeddings with the Embedding layer

La idea de esta parte del tutorial es comprobar de primera mano las ventajas de todo tipo de la capa Embeddings, que realiza una codificación mucho más eficaz computacionalmente, pues pasamos de una dimensionalidad del input de 10000 a 256. 

Este modelo, idéntico al anterior, ahora sí permite entrenar en un tiempo razonable (tampoco para tirar cohetes). 

Lo que no funcionaba era el callback de ModelCheckpoint. Y es, perdonadme que lo diga así, una "chorrada". Parece ser que con la extensión .keras con la que estaba, algo ha cambiado de las versiones anteriores de tensorflow a la actual:

https://stackoverflow.com/questions/76701617/the-following-arguments-are-not-supported-with-the-native-keras-format-opti

Soluciones (de entre las varias que se ofrecen ahí -qué haríamos sin stackoverflow!!):
- Bajar la versión de TF (absurda)
- Cambiar la extensión del archivo que guarda los modelos a  cualquiera otra (.tf por ejemplo)
- Utilizar una función de ModelCheckpoint propia (sería cuestión de probarla).

Lo que he hecho ha sido utilizar la solución 2 que además permite especificar qué es lo que se guarda en esos "checkpoint".

**El código es el mismo que en el 603 original** pero he separado el código en varias casillas para su más fácil lectura.

Para consultar la entrada específica de ModelCheckPoint en la API keras:

https://keras.io/api/callbacks/model_checkpoint/

Por supuesto, este código que solo se incluye en el siguiente .fit, es aplicable al resto de .fit en este notebook, o en cualquier otro.


**Instantiating an `Embedding` layer**

In [14]:
embedding_layer = layers.Embedding(input_dim=max_tokens, 
                                   output_dim=256)

**Model that uses an `Embedding` layer trained from scratch**

In [15]:
inputs = keras.Input(shape=(None,), 
                     dtype="int64")

embedded = layers.Embedding(input_dim=max_tokens, 
                            output_dim=256)(inputs)

x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()


In [23]:
callbacks = [keras.callbacks.ModelCheckpoint(filepath="603_LSTM_bidir.keras",
                                             save_best_only=True,
                                             monitor="val_loss")]

In [None]:
model.fit(int_train_ds, 
          validation_data=int_val_ds, 
          epochs=10, 
          callbacks = callbacks)
model = keras.models.load_model("embeddings_bidir_gru.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 263ms/step - accuracy: 0.6448 - loss: 0.6085 - val_accuracy: 0.8424 - val_loss: 0.3882
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 241ms/step - accuracy: 0.8383 - loss: 0.3992 - val_accuracy: 0.8854 - val_loss: 0.3035
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m266s[0m 340ms/step - accuracy: 0.8786 - loss: 0.3173 - val_accuracy: 0.9172 - val_loss: 0.2254
Epoch 4/10
[1m760/782[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m5s[0m 266ms/step - accuracy: 0.8933 - loss: 0.2849

#### Understanding padding and masking

**Using an `Embedding` layer with masking enabled**

In [None]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(
    input_dim=max_tokens, 
    output_dim=256, 
    mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_gru_with_masking.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_gru_with_masking.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

#### Using pretrained word embeddings

In [None]:
#!wget https://nlp.stanford.edu/projects/glove/glove.6B.zip
!unzip -q glove.6B.zip

**Parsing the GloVe word-embeddings file**

In [None]:
import numpy as np
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

**Preparing the GloVe word-embeddings matrix**

In [None]:
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_tokens, embedding_dim))

for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

**Model that uses a pretrained Embedding layer**

In [None]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")