In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Carve the labelled IMDB "train" split into 80% training / 20% validation and
# keep the official "test" split separate. `as_supervised=True` makes every
# example a (text, label) pair.
split_spec = ["train[:80%]", "train[80%:]", "test"]
dataset, info = tfds.load(
    "imdb_reviews/plain_text",
    split=split_spec,
    as_supervised=True,
    with_info=True,
)

# One dataset comes back per entry of `split_spec`, in the same order.
(
    dataset_train_original,
    dataset_validate_original,
    dataset_test_original,
) = dataset

# Last expression of the cell: display the dataset metadata.
info

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJGAJY2/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJGAJY2/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJGAJY2/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [3]:
# Vocabulary cap and fixed number of token ids per review; both are reused
# later when the models are assembled.
vocabulary_size = 10000
sequence_length = 128

# Text -> integer-id vectoriser: lower-cases, strips punctuation, splits on
# whitespace and emits exactly `sequence_length` ids per review.
encoder = layers.TextVectorization(
    output_mode="int",
    split="whitespace",
    standardize="lower_and_strip_punctuation",
    output_sequence_length=sequence_length,
    max_tokens=vocabulary_size,
)

# Build the vocabulary from the training texts only (labels are dropped).
train_texts = dataset_train_original.map(lambda text, label: text)
encoder.adapt(train_texts.batch(512))

In [4]:
def _vectorize_example(text, label):
    """Replace the raw review text with its integer token ids; the label passes through."""
    return encoder(text), label


# Training pipeline: vectorise -> cache -> shuffle -> batch.
dataset_train = (
    dataset_train_original
    .map(_vectorize_example)
    .cache()
    .shuffle(25000)
    .batch(128)
)

# Validation pipeline: same steps without the shuffle.
dataset_validate = (
    dataset_validate_original
    .map(_vectorize_example)
    .cache()
    .batch(128)
)

In [18]:
# Defined here as a custom layer because this notebook does not use a built-in encoder block.
class TransformerEncoder(layers.Layer):
    """Transformer encoder block: self-attention followed by a small feed-forward net.

    Each of the two sub-layers is wrapped in a residual connection and then
    layer-normalised, so the output has the same shape as the input.

    Args:
        embed_dim: Size of the last axis of the inputs. It is also the output
            width of the feed-forward net (required by the residual add) and
            is passed to ``MultiHeadAttention`` as ``key_dim``.
        dense_dim: Width of the hidden (ReLU) layer of the feed-forward net.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_projection = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim, activation="linear"),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs):
        """Apply self-attention and the feed-forward net to ``inputs``."""
        # Self-attention: the sequence is both query and value.
        attention_output = self.attention(inputs, inputs)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_projection(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved, reloaded and cloned."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config



In [20]:
# Baseline classifier: token embedding -> two Transformer encoder blocks ->
# max-pool over the sequence -> dropout -> sigmoid output.
model = models.Sequential([
    # An explicit Input replaces Embedding's deprecated `input_length` argument
    # and still lets the model be built before `summary()` is called.
    layers.Input(shape=(sequence_length,), dtype="int64"),
    layers.Embedding(input_dim=vocabulary_size, output_dim=256),
    # num_heads is the number of attention heads inside each block.
    TransformerEncoder(embed_dim=256, dense_dim=32, num_heads=2),
    TransformerEncoder(embed_dim=256, dense_dim=32, num_heads=2),
    # Max over the sequence axis gives one 256-d vector per review. Flatten
    # would also give a 2-D tensor but would keep all 128 * 256 values.
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid"),
])

model.summary()

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# train
history = model.fit(
    dataset_train,
    epochs=10,
    validation_data=dataset_validate,
)

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 128, 256)          2560000   
                                                                 
 transformer_encoder_18 (Tra  (None, 128, 256)         543776    
 nsformerEncoder)                                                
                                                                 
 transformer_encoder_19 (Tra  (None, 128, 256)         543776    
 nsformerEncoder)                                                
                                                                 
 global_max_pooling1d_5 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                     

In [27]:
# positional encoding
class PositionalEmbedding(layers.Layer):

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
    
    def call(self, inputs):
        embedded_tokens = self.token_embeddings(inputs) # inputs = word indices

        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions


In [28]:
# Same classifier as before, but the input stage is now the PositionalEmbedding
# layer, which adds a position embedding to each token embedding.
model = models.Sequential([
    layers.Input(shape=(sequence_length,), dtype="int64"),
    PositionalEmbedding(
        sequence_length=sequence_length,
        input_dim=vocabulary_size,
        output_dim=256,
    ),
    # Two stacked encoder blocks with two attention heads each.
    TransformerEncoder(embed_dim=256, dense_dim=32, num_heads=2),
    TransformerEncoder(embed_dim=256, dense_dim=32, num_heads=2),
    # Reduce over the sequence axis to a single vector per review.
    layers.GlobalMaxPooling1D(),
    layers.Dropout(0.5),
    layers.Dense(1, activation="sigmoid"),
])

model.summary()

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Fit for 10 epochs, evaluating on the validation split after each one.
history = model.fit(
    dataset_train,
    validation_data=dataset_validate,
    epochs=10,
)

Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 positional_embedding_3 (Pos  (None, 128, 256)         2592768   
 itionalEmbedding)                                               
                                                                 
 transformer_encoder_20 (Tra  (None, 128, 256)         543776    
 nsformerEncoder)                                                
                                                                 
 transformer_encoder_21 (Tra  (None, 128, 256)         543776    
 nsformerEncoder)                                                
                                                                 
 global_max_pooling1d_6 (Glo  (None, 256)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_6 (Dropout)         (None, 256)             