In [7]:
# Algorithm
## Step 1:
## -> We need to compute the relevancy score of each and every word vector with
##    every other word in the sentence. This will be our attention scores. The
##    scores are obtained by the dot product of the word vectors. Then the scores
##    will go through a scaling function and a softmax function.

## Step 2:
## -> We compute the sum of all word vectors in the sentence, weighted by our
##    relevancy scores. The resulting vector is the new representation of the
##    word whose scores were used.

import numpy as np
from tensorflow.keras.activations import softmax
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def self_attention(input_sequence):
    """Plain (weight-free) scaled dot-product self-attention, in NumPy.

    Every row of ``input_sequence`` is treated as one token vector. For each
    token, the dot product with every token (itself included) is scaled by
    ``sqrt(embedding_dim)`` and passed through a softmax; the token's new
    representation is the sum of all token vectors weighted by those scores.

    Args:
        input_sequence: 2-D array-like of shape ``(sequence_length,
            embedding_dim)``.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``input_sequence``.

    Raises:
        ValueError: if ``input_sequence`` is not 2-D.
    """
    input_sequence = np.asarray(input_sequence, dtype=float)
    if input_sequence.ndim != 2:
        raise ValueError(
            'input_sequence must be 2-D (sequence_length, embedding_dim), '
            f'got shape {input_sequence.shape}'
        )

    sequence_length, embedding_dim = input_sequence.shape
    # Nothing to attend over (or nothing to scale by): mirror the zero-filled
    # output buffer the loop-based version started from.
    if sequence_length == 0 or embedding_dim == 0:
        return np.zeros(shape=input_sequence.shape)

    # scores[i, j] = <token_i, token_j> / sqrt(embedding_dim)
    scores = input_sequence @ input_sequence.T / np.sqrt(embedding_dim)

    # Row-wise softmax. Subtracting the row maximum leaves the result
    # unchanged mathematically but keeps exp() from overflowing.
    scores = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)

    # Row i of the output is sum_j weights[i, j] * token_j.
    return weights @ input_sequence

In [None]:
import pathlib

# Root directory that holds the 'train', 'val' and 'test' sub-directories.
base_dir = pathlib.Path('../../n-grams_with_tf-idf/resources/aclImdb')

batch_size = 32

# Build one batched text dataset per split, in train / val / test order.
train_ds, val_ds, test_ds = (
    tf.keras.utils.text_dataset_from_directory(base_dir / split_name,
                                               batch_size=batch_size)
    for split_name in ('train', 'val', 'test')
)


In [None]:
from keras.layers import TextVectorization

max_tokens = 20000
max_length = 600

# Integer-encoding layer: vocabulary capped at `max_tokens`, every sequence
# padded or truncated to `max_length` tokens.
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_length,
)

# The vocabulary is learned from the training texts only (labels dropped).
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


def _vectorize(text, label):
    """Turn a (text, label) pair into (token ids, label)."""
    return text_vectorization(text), label


int_train_ds = train_ds.map(_vectorize, num_parallel_calls=4)
int_val_ds = val_ds.map(_vectorize, num_parallel_calls=4)
int_test_ds = test_ds.map(_vectorize, num_parallel_calls=4)

In [6]:
# Implementing a transformer encoder

class TransformerEncoder(layers.Layer):
    """One Transformer encoder block.

    The block applies multi-head self-attention to its input, adds the result
    back to the input and layer-normalizes it, then runs a two-layer dense
    projection whose output is again added to its own input and
    layer-normalized.

    Args:
        embed_dim: size of the last dense layer's output; also passed to the
            attention layer as ``key_dim``.
        dense_dim: number of units in the inner (ReLU) dense layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Kept as plain attributes so get_config() can serialize them.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
        )
        # Expand to dense_dim with ReLU, then project back to embed_dim.
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the encoder block on ``inputs``.

        When ``mask`` is given, a new axis is inserted at position 1 before it
        is handed to the attention layer as ``attention_mask`` (assumes the
        incoming mask is 2-D, e.g. (batch, sequence) -- TODO confirm).
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]

        # Self-attention: the same tensor is used as query and value.
        attended = self.attention(inputs, inputs, attention_mask=attention_mask)
        normalized = self.layernorm_1(inputs + attended)

        # Dense projection with its own residual connection.
        projected = self.dense_proj(normalized)
        return self.layernorm_2(normalized + projected)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            'embed_dim': self.embed_dim,
            'dense_dim': self.dense_dim,
            'num_heads': self.num_heads,
        }


In [None]:
# Hyper-parameters of the classifier.
vocab_size = 20000
embed_dim = 256
num_heads = 4
dense_dim = 32

# Token ids -> embeddings -> one encoder block -> max over the sequence axis
# -> dropout -> single sigmoid unit.
inputs = keras.Input(shape=(None,), dtype='int64')
x = layers.Embedding(vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPool1D()(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs, output)
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Checkpoint to 'transformer_encoder.keras', overwriting only on improvement
# (save_best_only=True).
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='transformer_encoder.keras',
    save_best_only=True,
)
callback_list = [checkpoint]

model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=20,
    callbacks=callback_list,
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 256)         5120000   
                                                                 
 transformer_encoder_2 (Tran  (None, None, 256)        1069600   
 sformerEncoder)                                                 
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 1)                 257   

In [9]:
# Reload the checkpointed model; the custom layer class has to be supplied so
# that it can be deserialized.
model = keras.models.load_model(
    'transformer_encoder.keras',
    custom_objects={'TransformerEncoder': TransformerEncoder},
)

# Index 1 of evaluate()'s result is the metric printed as accuracy below.
test_metrics = model.evaluate(int_test_ds)
print(f'The accuracy is: {test_metrics[1]:.3%}')

The accuracy is: 88.064%
