# Chapter 5 — State-of-the-Art in Deep Learning: Transformers

## 5.1 Text Representation and Tokenization

In [None]:
import tensorflow as tf
import numpy as np

# Tiny corpus used both to fit the vocabulary and as the demo input
sample_texts = [
    "The quick brown fox jumps over the lazy dog",
    "Transformers are amazing for NLP tasks",
    "Attention is all you need for sequence processing"
]

# Map each sentence to a fixed-length row of integer token ids
vectorizer_options = {
    'max_tokens': 1000,
    'output_mode': 'int',
    'output_sequence_length': 10,
}
vectorize_layer = tf.keras.layers.TextVectorization(**vectorizer_options)

# Learn the vocabulary from the corpus, then encode the same sentences
vectorize_layer.adapt(sample_texts)
vectorized_texts = vectorize_layer(sample_texts)

vocabulary = vectorize_layer.get_vocabulary()

print("Vectorized texts:")
print(vectorized_texts.numpy())
print(f"Vocabulary size: {len(vocabulary)}")

## 5.2 Self-Attention Mechanism

In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    The logits are ``query @ key^T`` divided by the square root of the size
    of ``key``'s last axis. A softmax over the last axis turns them into
    weights, which are then applied to ``value``.

    Args:
        query: Float tensor whose last two axes are (seq_len_q, depth).
        key: Float tensor whose last two axes are (seq_len_k, depth).
        value: Float tensor whose last two axes are (seq_len_k, depth_v).
        mask: Optional tensor broadcastable to the logits. Positions where
            the mask is 1 get -1e9 added to their logit and therefore end up
            with (near) zero weight after the softmax. Boolean and integer
            masks are accepted; they are cast to the logits' dtype.

    Returns:
        A tuple ``(output, attention_weights)``: the weighted sum of
        ``value`` and the softmax weights it was computed with.
    """
    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # Build the scale factor in the logits' own dtype; a hard-coded float32
    # here makes the division fail for float64 inputs.
    depth = tf.cast(tf.shape(key)[-1], matmul_qk.dtype)
    logits = matmul_qk / tf.math.sqrt(depth)

    if mask is not None:
        # Cast so bool/int masks can be multiplied by the float penalty.
        logits += tf.cast(mask, logits.dtype) * -1e9

    attention_weights = tf.nn.softmax(logits, axis=-1)
    output = tf.matmul(attention_weights, value)

    return output, attention_weights

# Smoke-test the attention function on random tensors
batch_size, seq_length, depth = 2, 5, 64

# Query, key and value share one shape; they are drawn in that order
qkv_shape = (batch_size, seq_length, depth)
query, key, value = (tf.random.normal(qkv_shape) for _ in range(3))

output, attention_weights = scaled_dot_product_attention(query, key, value)

print(f"Input shapes - Query: {query.shape}, Key: {key.shape}, Value: {value.shape}")
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {attention_weights.shape}")

## 5.3 Multi-Head Attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Query, key and value are each projected to ``d_model`` features, split
    into ``num_heads`` heads of ``d_model // num_heads`` features, attended
    per head, then merged back and passed through a final dense projection.

    Args:
        d_model: Total feature width of the projections and of the output.
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model`` evenly.
    """

    def __init__(self, d_model, num_heads):
        # Without this check the floor division below yields a depth for
        # which num_heads * depth != d_model, and the reshapes in call()
        # either fail or silently change the sequence axis.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, key, value, mask=None):
        """Apply multi-head attention.

        Args:
            query: Tensor whose last axis is projected to ``d_model``.
            key: Tensor whose last axis is projected to ``d_model``.
            value: Tensor whose last axis is projected to ``d_model``.
            mask: Optional mask forwarded unchanged to
                ``scaled_dot_product_attention``.

        Returns:
            A tuple ``(output, attention_weights)``; ``output`` has
            ``d_model`` features on its last axis and ``attention_weights``
            are the per-head softmax weights.
        """
        batch_size = tf.shape(query)[0]

        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)

        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            query, key, value, mask
        )

        # Undo split_heads: back to (batch, seq, num_heads, depth), then
        # merge the head axes into a single d_model axis.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)

        return output, attention_weights

# Test multi-head attention
# Self-attention smoke test: one random tensor is used as query, key and value.
# batch_size and seq_length are the module-level values set in the attention test.
mha = MultiHeadAttention(d_model=128, num_heads=8)
query = tf.random.normal((batch_size, seq_length, 128))  # last axis equals d_model
output, weights = mha(query, query, query)

print(f"Multi-head attention output shape: {output.shape}")
print(f"Attention weights shape: {weights.shape}")

## 5.4 Transformer Encoder Layer

In [None]:
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """One Transformer encoder block.

    Self-attention followed by a two-layer feed-forward network. Each
    sub-layer's output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalized.

    Args:
        d_model: Feature width of the input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout_rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=None, mask=None):
        """Run the encoder block.

        Args:
            x: Input tensor; its last axis must match ``d_model`` so the
                residual additions are valid.
            training: Forwarded to the dropout layers. Defaults to ``None``
                so the layer can be called as ``layer(x)`` and Keras can
                supply the value itself.
            mask: Optional attention mask forwarded to the attention layer.

        Returns:
            A tensor with the same shape as ``x``.
        """
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

# Test encoder layer
# batch_size and seq_length are the module-level values set in the attention test.
encoder_layer = TransformerEncoderLayer(d_model=128, num_heads=8, dff=512)
sample_input = tf.random.normal((batch_size, seq_length, 128))  # last axis equals d_model
# training=False is forwarded to the layer's dropout sub-layers
output = encoder_layer(sample_input, training=False)

# The residual connections require matching shapes, so both prints should agree
print(f"Encoder input shape: {sample_input.shape}")
print(f"Encoder output shape: {output.shape}")

## 5.5 Positional Encoding

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Number of encoding dimensions (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model). Even columns hold
        sines and odd columns hold cosines of the position-dependent angles.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]

    # Columns 2k and 2k+1 share one frequency: 1 / 10000^(2k / d_model)
    exponents = (2 * (dims // 2)) / np.float32(d_model)
    table = positions * (1 / np.power(10000, exponents))

    # Overwrite the raw angles in place: sine on even columns, cosine on odd
    for first_column, wave in ((0, np.sin), (1, np.cos)):
        table[:, first_column::2] = wave(table[:, first_column::2])

    # Add a leading axis of size 1 so the table broadcasts over a batch axis
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

# Test positional encoding
# 50 positions with 128 dimensions each; the result has a leading axis of size 1
pos_encoding = positional_encoding(50, 128)
print(f"Positional encoding shape: {pos_encoding.shape}")
# At position 0 every angle is 0, so the values alternate sin(0)=0 and cos(0)=1
print(f"Sample values: {pos_encoding[0, 0, :5]}")

## 5.6 Complete Transformer Model

In [None]:
class TextClassificationTransformer(tf.keras.Model):
    """Transformer-encoder text classifier.

    Token ids are embedded, scaled by sqrt(d_model) and combined with fixed
    sinusoidal positional encodings. They then pass through ``num_layers``
    encoder layers, are mean-pooled over the sequence axis and are
    classified by a softmax dense layer.

    Args:
        num_layers: Number of stacked encoder layers.
        d_model: Embedding width and encoder feature width.
        num_heads: Number of attention heads per encoder layer.
        dff: Hidden width of each encoder's feed-forward network.
        vocab_size: Size of the embedding table (number of token ids).
        max_position_encoding: Number of positions in the precomputed
            positional-encoding table.
        num_classes: Number of output classes.
        rate: Dropout rate for the embedding dropout and the encoder layers.
    """

    def __init__(self, num_layers, d_model, num_heads, dff,
                 vocab_size, max_position_encoding, num_classes, rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        self.pos_encoding = positional_encoding(max_position_encoding, d_model)

        self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        self.global_avg_pooling = tf.keras.layers.GlobalAveragePooling1D()
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, x, training=None):
        """Classify a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; axis 1 is the sequence axis.
                NOTE(review): the sequence length presumably must not exceed
                ``max_position_encoding``, because the positional table is
                sliced to the input length before being added.
            training: Forwarded to the dropout and encoder layers. Defaults
                to ``None`` so the model can be called as ``model(x)``.

        Returns:
            Softmax class probabilities with ``num_classes`` entries on the
            last axis.
        """
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        # Iterate the layers directly; pass training by keyword so the call
        # does not depend on the encoder layer's positional parameter order.
        for enc_layer in self.enc_layers:
            x = enc_layer(x, training=training)

        x = self.global_avg_pooling(x)
        return self.classifier(x)

# Build a small classifier and run one forward pass on random token ids
model_config = {
    'num_layers': 2,
    'd_model': 128,
    'num_heads': 8,
    'dff': 512,
    'vocab_size': 1000,
    'max_position_encoding': 100,
    'num_classes': 3,
}
transformer = TextClassificationTransformer(**model_config)

# Two sequences of 50 token ids, drawn uniformly from [0, 1000)
sample_input = tf.random.uniform(shape=(2, 50), maxval=1000, dtype=tf.int32)
output = transformer(sample_input, training=False)

print(f"Input shape: {sample_input.shape}")
print(f"Output shape: {output.shape}")
print(f"Sample predictions: {output.numpy()}")

## Exercises

### Exercise 1: Causal Masking

In [None]:
def create_look_ahead_mask(size):
    """Return a (size, size) mask with 1s strictly above the main diagonal.

    Row i has a 1 in every column j > i and a 0 elsewhere.
    """
    all_ones = tf.ones((size, size))
    # Keep the diagonal and everything below it; zero out the upper part
    lower_triangle = tf.linalg.band_part(all_ones, -1, 0)
    return 1 - lower_triangle

# Build a 5x5 mask; 1s appear strictly above the main diagonal
look_ahead_mask = create_look_ahead_mask(5)
print("Look-ahead mask:")
print(look_ahead_mask.numpy())

### Exercise 2: Learning Rate Schedule

In [None]:
class TransformerLearningRateSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warm-up and inverse-sqrt decay.

    The rate at a given step is::

        d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    so it grows linearly until ``step == warmup_steps`` and decays
    proportionally to ``1 / sqrt(step)`` afterwards.

    Args:
        d_model: Model width used to scale the learning rate.
        warmup_steps: Number of steps over which the rate increases.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        # Keep the raw argument so get_config() can return a plain value
        # rather than the tensor stored in self.d_model.
        self._d_model_arg = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step`` as a float32 tensor."""
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "d_model": self._d_model_arg,
            "warmup_steps": self.warmup_steps,
        }

# Print the scheduled rate at the first step, mid warm-up and end of warm-up
lr_schedule = TransformerLearningRateSchedule(128)
print("Learning rates:")
for step in (1, 1000, 4000):
    learning_rate = lr_schedule(step)
    print(f"Step {step}: {learning_rate:.6f}")

## Chapter 5 Summary

Key concepts covered:
- Self-attention mechanism
- Multi-head attention
- Positional encoding
- Transformer encoder architecture
- Practical text classification implementation

Transformers have revolutionized NLP with their ability to handle long-range dependencies and parallel computation.