In [6]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout, Dense, Dropout, Input, GlobalAveragePooling1D
import tensorflow.keras as keras
from util import DynamicPadding
from encoder import Encoder
import numpy as np

%load_ext autoreload
%autoreload 2
%load_ext tensorboard

import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard



## Load Data

We take the IMDB data and sort the training data by sequence length. Sorting groups sequences of similar length into the same batch, which reduces training time considerably when combined with dynamic padding. Furthermore, we crop sequences beyond 200 tokens.


In [7]:
vocab_size = 20000  # keep only the 20k most frequent words
maxlen = 200  # keep at most the first 200 tokens of every review

# Load the IMDB reviews as integer token sequences, restricted to `vocab_size` words.
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

# Reorder the training set from shortest to longest review; labels are
# permuted with the same index array so they stay aligned with the reviews.
seq_length = list(map(len, x_train))
permuted_indicies = np.argsort(seq_length)
x_train = x_train[permuted_indicies]
y_train = y_train[permuted_indicies]

# Truncate each review to its first `maxlen` tokens (shorter reviews are unchanged).
x_train = [review[:maxlen] for review in x_train]
x_val = [review[:maxlen] for review in x_val]

25000 Training sequences
25000 Validation sequences


## Dynamic Padding

We subclass the Keras Sequence class to support dynamic padding, which pads each batch only up to the longest sequence in that batch and therefore reduces the sequence length. This speeds up training because a Transformer's training time grows quadratically with the sequence length. See also [Michaël Benesty](https://towardsdatascience.com/divide-hugging-face-transformers-training-time-by-2-or-more-21bf7129db9q-21bf7129db9e)'s article for further details.

In [8]:
# Wrap the training and validation splits in DynamicPadding batch loaders,
# both using 64 samples per batch. The loaders are created in the order
# (train, validation) and bound to `train` and `test` respectively.
train, test = [
    DynamicPadding(sequences, labels, batch_size=64)
    for sequences, labels in ((x_train, y_train), (x_val, y_val))
]

## Build the Model

Build a classifier using a single encoder layer. The architecture is adapted from the official [Keras example](https://keras.io/examples/nlp/text_classification_with_transformer/) by Apoorv Nandan.

In [9]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# The sequence axis is declared as None (variable length) rather than `maxlen`:
# the DynamicPadding loaders pad per batch, so the batches fed to the model are
# generally shorter than `maxlen`, and a fixed (maxlen,) input shape would not
# match them. `maxlen` is still handed to the Encoder unchanged.
# NOTE(review): assumes Encoder accepts an unspecified sequence axis -- confirm
# against encoder.py.
inputs = Input(shape=(None,))
encoder_embedding = Encoder(vocab_size + 1, maxlen, embed_dim, num_heads, ffn_units=ff_dim, encoders=1)
x = encoder_embedding(inputs)

# Average over the sequence axis to get one fixed-size vector per review,
# independent of the (variable) sequence length.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)

# Single sigmoid unit: binary classification output in [0, 1].
outputs = Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)

## Compile and train model

In [10]:
# Optimizer: Adam with explicitly chosen step size, betas and epsilon.
adam_opt = Adam(0.001, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Checkpoint callback: writes the model once per epoch; `{epoch}` in the path
# is filled in with the epoch number.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./imbd_model/prst_model_{epoch}',
    save_freq='epoch',
)

# TensorBoard callback: logs to ./imbd_logs; profile_batch=0 turns profiling off.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./imbd_logs',
    profile_batch=0,
)

callbacks = [checkpoint_callback, tensorboard_callback]

# Metrics tracked in addition to the loss.
metrics = [
    tf.keras.metrics.AUC(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

# Binary cross-entropy pairs with the single sigmoid output of the model.
model.compile(optimizer=adam_opt, loss="binary_crossentropy", metrics=metrics)

# Train for two epochs on the padded training batches, validating on `test`.
history = model.fit(
    train,
    validation_data=test,
    callbacks=callbacks,
    epochs=2,
    verbose=1,
)

Epoch 1/2
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./imbd_model/prst_model_1\assets
Epoch 2/2
