In [1]:
# Different layers
from tensorflow.keras.layers import MultiHeadAttention, Input, Dense
from tensorflow.keras.layers import LayerNormalization, Layer
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D
# For miscellaneous functions
from tensorflow.data import Dataset
from tensorflow import convert_to_tensor, string, float32, shape, range, reshape
from tensorflow.keras import utils
# Keras models
from tensorflow.keras import Model, Sequential
# For datasets
from sklearn.datasets import fetch_20newsgroups
# For evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# For math/arrays
import numpy as np
# For plotting
import matplotlib.pyplot as plt

2023-03-08 00:28:52.216564: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the training dataset while removing headers, footers and quotes
train_dataset = fetch_20newsgroups(subset='train', random_state=0,
remove=("headers", "footers", "quotes"))
train_X, train_Y = (train_dataset.data, train_dataset.target)

# Test dataset
test_dataset = fetch_20newsgroups(subset='test', random_state=0,
remove=("headers", "footers", "quotes"))
test_X, test_Y = (test_dataset.data, test_dataset.target)
# Target classes
newsgroup_names = train_dataset.target_names
# Total classes
n_classes = len(train_dataset.target_names)
# Convert to binary vectors to represent categories
train_Y_categorical = utils.to_categorical(train_Y)
test_Y_categorical = utils.to_categorical(test_Y)

#Print statistics
print("Total training sequences: ", len(train_X))
print("Total test sequences: ", len(test_X))
print("Target categories are: ", newsgroup_names)

Total training sequences:  11314
Total test sequences:  7532
Target categories are:  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [3]:
toy_sentences = [["I am happy today"], ["today weather is awesome"]]

# Create the TextVectorization layer
toy_vectorize_layer = TextVectorization(
output_sequence_length=8,
max_tokens=15)

# Learn a dictionary
toy_vectorize_layer.adapt(Dataset.from_tensor_slices(toy_sentences))

# Use the trained TextVectorization to replace each word by its
# dictionary index
toy_vectorized_words = toy_vectorize_layer(convert_to_tensor(toy_sentences, dtype=string))
print("Dictionary: ", toy_vectorize_layer.get_vocabulary())
print("Vectorized words: ", toy_vectorized_words)

Dictionary:  ['', '[UNK]', 'today', 'weather', 'is', 'i', 'happy', 'awesome', 'am']
Vectorized words:  tf.Tensor(
[[5 8 6 2 0 0 0 0]
 [2 3 4 7 0 0 0 0]], shape=(2, 8), dtype=int64)


2023-03-08 00:28:58.358100: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# The total distinct words to use
vocab_size = 25000
# Specify the maximum characters to consider in each newsgroup
sequence_length = 300

train_X_tensor = Dataset.from_tensor_slices(train_X)

# TextVectorization layer
vectorize_layer = TextVectorization(
output_sequence_length=sequence_length,
max_tokens=vocab_size)

# Adapt method trains the TextVectorization layer and
# creates a dictionary
vectorize_layer.adapt(train_X_tensor)

# Convert all newsgroups in train_X to vectorized tensors
train_X_tensors = convert_to_tensor(train_X, dtype=string)
train_X_vectorized = vectorize_layer(train_X_tensors)

# Convert all newsgroups in test_X to vectorized tensors
test_X_tensors = convert_to_tensor(test_X, dtype=string)
test_X_vectorized = vectorize_layer(test_X_tensors)

In [5]:
# Embedding for words
toy_word_embedding_layer = Embedding(input_dim=15, output_dim=4)
toy_embedded_words = toy_word_embedding_layer(toy_vectorized_words)

# Embedding for positions
toy_position_embedding_layer = Embedding(input_dim=8, output_dim=4)
toy_positions = range(start=0, limit=8, delta=1)
toy_embedded_positions = toy_position_embedding_layer(toy_positions)

In [6]:
class EmbeddingLayer(Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim):
        super(EmbeddingLayer, self).__init__()
        self.word_embedding = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embedding = Embedding(input_dim=sequence_length, output_dim=embed_dim)

def call(self, tokens):
    sequence_length = shape(tokens)[-1]
    all_positions = range(start=0, limit=sequence_length, delta=1)
    positions_encoding = self.position_embedding(all_positions)
    words_encoding = self.word_embedding(tokens)
    return positions_encoding + words_encoding

In [7]:
toy_multihead = MultiHeadAttention(num_heads=1, key_dim=3)
toy_x = np.array([[[1, 2, 3]]])
toy_x_tensor = convert_to_tensor(toy_x, dtype=float32)
toy_attn_output, toy_attn_wts = toy_multihead(toy_x_tensor, toy_x_tensor, return_attention_scores=True)

print('Multihead layer output: \n', toy_attn_output)
print('\nMultihead attention wts: \n', toy_attn_wts)
print('\nTotal Layer weights: ', len(toy_multihead.get_weights()))

Multihead layer output: 
 tf.Tensor([[[ 1.8797683  1.506011  -2.363825 ]]], shape=(1, 1, 3), dtype=float32)

Multihead attention wts: 
 tf.Tensor([[[[1.]]]], shape=(1, 1, 1, 1), dtype=float32)

Total Layer weights:  8


In [8]:
class EncoderLayer(Layer):
    def __init__(self, total_heads, total_dense_units, embed_dim):
        super(EncoderLayer, self).__init__()# Multihead attention layer
        self.multihead = MultiHeadAttention(num_heads=total_heads, key_dim=embed_dim)# Feed forward network layer
        self.nnw = Sequential([Dense(total_dense_units, activation="relu"),
        Dense(embed_dim)])# Normalization
        self.normalize_layer = LayerNormalization()

def call(self, inputs):
    attn_output = self.multihead(inputs, inputs)
    normalize_attn = self.normalize_layer(inputs + attn_output)
    nnw_output = self.nnw(normalize_attn)
    final_output = self.normalize_layer(normalize_attn + nnw_output)
    return final_output

In [9]:
embed_dim = 64
num_heads = 2
total_dense_units = 60
# Our two custom layers
embedding_layer = EmbeddingLayer(sequence_length, vocab_size, embed_dim)
encoder_layer = EncoderLayer(num_heads, total_dense_units, embed_dim)

# Start connecting the layers together
inputs = Input(shape=(sequence_length, ))
emb = embedding_layer(inputs)
enc = encoder_layer(emb)
pool = GlobalAveragePooling1D()(enc)
d = Dense(total_dense_units, activation="relu")(pool)
outputs = Dense(n_classes, activation="softmax")(d)

# Construct the transformer model
transformer_model = Model(inputs=inputs, outputs=outputs)
transformer_model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy', 'Precision', 'Recall'])
transformer_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding_layer (EmbeddingL  (None, 300)              0         
 ayer)                                                           
                                                                 


ValueError: Weights for model sequential have not yet been created. Weights are created when the Model is first called on inputs or `build()` is called with an `input_shape`.