## Data preparation

In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

2023-02-17 10:22:18.081913: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-17 10:22:18.730886: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/include:/usr/local/cuda-11.2/lib64:
2023-02-17 10:22:18.730938: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.2/include:/usr/local/cuda-11.2/lib64:


In [None]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Download the archive into the current working directory (cache_dir='.',
# no cache sub-directory) and unpack it there.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The extracted corpus is expected in an 'aclImdb' folder next to the
# path returned by get_file; list it as the cell output.
download_root = os.path.dirname(dataset)
dataset_dir = os.path.join(download_root, 'aclImdb')
os.listdir(dataset_dir)

In [None]:
# Path of the 'train' sub-directory inside the extracted dataset; its
# listing is displayed as the cell output.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

In [None]:
# Delete the 'unsup' sub-directory of the training split.
# The existence check makes this cell idempotent: after the first run the
# directory is gone, and an unguarded shutil.rmtree would raise
# FileNotFoundError on every later "Restart & Run All".
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [2]:
batch_size = 32
seed = 12345

# Options shared by both subsets: the same batch size, the same 80/20
# validation split and the same seed are passed to each call.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='training', **split_options)
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train', subset='validation', **split_options)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2023-02-17 10:22:53.716556: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-17 10:22:53.721529: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-17 10:22:53.721786: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-17 10:22:53.722280: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [3]:
vocab_size = 20000    # max_tokens handed to the TextVectorization layer
sequence_len = 200    # output_sequence_length of every vectorized review


def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    The text is lower-cased, every literal "<br />" tag is replaced by a
    single space, and all characters in `string.punctuation` are removed.

    Args:
        input_data: string tensor holding the raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

# Layer that standardizes, tokenizes and maps text to integer token ids,
# yielding sequences of exactly `sequence_len` ids.
vectorization = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_len,
)


def _text_only(text, label):
    """Drop the label so that only the raw text is fed to `adapt`."""
    return text


# Build the vocabulary from the training text only.
# A named function is used instead of a lambda: mapping a lambda over the
# dataset appears to trigger the AutoGraph "Lambda fuctions will be no more
# assumed..." deprecation warning that this cell previously emitted.
vectorization.adapt(train_ds.map(_text_only))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [4]:
def vectorize_text(text, label):
    """Convert a batch of raw strings into token ids, keeping the label.

    Args:
        text: string tensor of reviews.
        label: label tensor, passed through unchanged.

    Returns:
        A `(token_ids, label)` tuple where `token_ids` is the output of the
        `vectorization` layer.
    """
    # Append a trailing axis before handing the text to the layer.
    column = tf.expand_dims(text, axis=-1)
    token_ids = vectorization(column)
    return token_ids, label


# Apply the same text -> ids mapping to both splits.
train_ds, val_ds = (split.map(vectorize_text) for split in (train_ds, val_ds))

In [5]:
# Peek at the first example of the first training batch: its label,
# followed by its sequence of token ids.
for text_batch, label_batch in train_ds.take(1):
    first_label = label_batch[0].numpy()
    first_tokens = text_batch.numpy()[0]
    print(first_label)
    print(first_tokens)

1
[    4   269    62    42     4   269  2639  2573     5  4080     4  2639
   421     4     1  8606     5 18213   148    12    13 11884     4 11586
   332    20    29     1    15  8087    33  3721     1     1     1     1
     1  2268     1  4383     1     2    84     3     2   226     5     2
    85 11586     1     5  2390     5 14033   148   232    29   432     8
     2  9169     5     1     8     1     4  2639    32     2  1243    15
     2    80     3    15     4 10521 19876     2     1  1667   186     8
    11  2639     7    12     9    64    57    80   546     8     9   557
   143    12  1083     8     2   949     5     1     2   931   555     5
     2  1143   302     4  1734 11135   421    33 16266  2295 19854     8
   992     5    29 13788     8     2   255     5     2  2639     8  2573
     5  4080  6278    43     9   263    20     2 12343    43    30 16447
     9    43    29  1724    28    93     5    13  1692     1     1     4
  3816  1551    35     2   332   617 16952   356 

In [6]:
# Cache the vectorized batches and prefetch up to `prefetch_depth` of them
# for each split.
prefetch_depth = 10
train_ds = train_ds.cache().prefetch(prefetch_depth)
val_ds = val_ds.cache().prefetch(prefetch_depth)

## Training

In [7]:
from tensorflow.keras import layers
from tensorflow import keras

class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual
    connection and layer normalization.

    Args:
        embed_dim: size of the last axis of the input; also used as the
            per-head `key_dim` of the attention layer and as the output
            width of the feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate applied to the output of each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        # `rate` used to be accepted but silently ignored; these layers put
        # it to use.
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        Args:
            inputs: tensor whose last axis has size `embed_dim`.
            training: forwarded to the dropout layers so that dropout is
                only active in training mode; `None` lets Keras decide.

        Returns:
            A tensor with the same shape as `inputs`.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Feed-forward network with its own residual connection.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [8]:
# Two separate embedding layers: one for the tokens, one for the token indices (positions)

class TokenAndPositionEmbedding(layers.Layer):
    """Embed token ids and their positions, and return the sum of both.

    Args:
        maxlen: number of rows of the position embedding table.
        vocab_size: number of rows of the token embedding table.
        embed_dim: width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # Lookup table indexed by token id.
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # Lookup table indexed by position within the sequence.
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of `x` plus the matching position embeddings."""
        # Position ids 0 .. L-1, where L is the size of the last axis of `x`.
        seq_len = tf.shape(x)[-1]
        pos_vectors = self.pos_emb(tf.range(seq_len))
        tok_vectors = self.token_emb(x)
        # The position vectors are broadcast across the leading axes of `x`.
        return tok_vectors + pos_vectors

In [9]:
embed_dim = 128  # width of every token / position embedding
num_heads = 6    # attention heads per transformer block
ff_dim = 128     # hidden width of the feed-forward net inside each block

embedding_layer = TokenAndPositionEmbedding(
    maxlen=sequence_len, vocab_size=vocab_size, embed_dim=embed_dim
)
transformer_block1 = TransformerBlock(
    embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
)
transformer_block2 = TransformerBlock(
    embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
)

# Functional model: ids -> embeddings -> two transformer blocks -> classifier.
inputs = layers.Input(shape=(sequence_len,))
x = embedding_layer(inputs)
for block in (transformer_block1, transformer_block2):
    x = block(x)

# Collapse the per-token features into a single vector per example.
x = layers.Flatten()(x)
x = layers.Dense(units=128, activation="relu")(x)
# Two-way softmax over the two classes.
outputs = layers.Dense(units=2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 token_and_position_embeddin  (None, 200, 128)         2585600   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 200, 128)         429184    
 merBlock)                                                       
                                                                 
 transformer_block_1 (Transf  (None, 200, 128)         429184    
 ormerBlock)                                                     
                                                                 
 flatten (Flatten)           (None, 25600)             0     

In [10]:
# Integer labels + softmax output -> sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# `train_ds` and `val_ds` are tf.data datasets that were already batched by
# text_dataset_from_directory, so `batch_size` is not passed to fit():
# Keras documents that it must not be specified for dataset inputs.
history = model.fit(train_ds, epochs=5, validation_data=val_ds)

Epoch 1/5


2023-02-17 10:23:22.985542: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-02-17 10:23:22.993603: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f750c0b3c20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-02-17 10:23:22.993617: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3070, Compute Capability 8.6
2023-02-17 10:23:22.996707: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-02-17 10:23:23.086650: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
