In [17]:
import tensorflow as tf
from tensorflow.keras import layers, Model

# -----------------------------
# Positional Encoding Layer
# -----------------------------
class PositionalEncoding(layers.Layer):
    """Add fixed sinusoidal positional encodings to a sequence of embeddings.

    The encoding table is computed once at construction time with shape
    ``(1, max_len, d_model)``. ``call`` slices the table to the input's
    sequence length (axis 1) and adds it to the input.

    Args:
        max_len: Number of positions to precompute encodings for.
        d_model: Size of the feature axis of the encoding table. Even and odd
            values are both supported.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``,
            ...), forwarded to the base class so the layer can be rebuilt
            from its config.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can describe how to rebuild this layer.
        self.max_len = max_len
        self.d_model = d_model
        self.pos_encoding = self.positional_encoding(max_len, d_model)

    def get_angles(self, pos, i, d_model):
        """Return ``pos / 10000 ** (2 * (i // 2) / d_model)`` as float32.

        ``pos`` and ``i`` are cast to float32 and combined by broadcasting.
        Feature indices ``2k`` and ``2k + 1`` share the same rate because of
        the ``i // 2`` floor division.
        """
        pos = tf.cast(pos, tf.float32)
        i = tf.cast(i, tf.float32)
        angle_rates = 1 / tf.pow(
            10000.0,
            (2 * (i // 2)) / tf.cast(d_model, tf.float32)
        )
        return pos * angle_rates

    def positional_encoding(self, max_len, d_model):
        """Build the encoding table of shape ``(1, max_len, d_model)``.

        Even feature indices hold ``sin(angle)`` and odd feature indices hold
        ``cos(angle)``.
        """
        positions = tf.range(max_len)[:, tf.newaxis]
        dims = tf.range(d_model)[tf.newaxis, :]

        angle_rads = self.get_angles(positions, dims, d_model)

        # Choose sin/cos per feature index instead of splitting the table into
        # two halves and interleaving them: the values are the same for even
        # d_model, and this form also works for odd d_model, where the two
        # halves would have different widths and could not be concatenated.
        is_even = tf.equal(dims % 2, 0)
        pos_encoding = tf.where(is_even, tf.sin(angle_rads), tf.cos(angle_rads))

        return pos_encoding[tf.newaxis, ...]

    def call(self, x):
        """Return ``x`` plus the encodings for its first ``x.shape[1]`` positions.

        Raises:
            ValueError: If the statically known length of axis 1 exceeds
                ``max_len``. Dynamic lengths are not checked here.
        """
        static_len = x.shape[1]
        if static_len is not None and static_len > self.max_len:
            raise ValueError(
                f"Input sequence length {static_len} exceeds the max_len="
                f"{self.max_len} this PositionalEncoding layer was built for."
            )
        return x + tf.cast(
            self.pos_encoding[:, :tf.shape(x)[1], :],
            x.dtype
        )

    def get_config(self):
        """Return the layer config, including ``max_len`` and ``d_model``."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "d_model": self.d_model})
        return config

In [18]:
# -----------------------------
# Transformer Encoder Block
# -----------------------------
def transformer_encoder_block(embed_dim, num_heads, ff_dim, dropout=0.0):
    """Build one pre-LayerNorm Transformer encoder block as a Keras ``Model``.

    The block applies LayerNormalization -> self-attention -> residual add,
    then LayerNormalization -> two Dense layers (ReLU, then linear) ->
    residual add. Input and output both have shape
    ``(batch, None, embed_dim)``.

    Args:
        embed_dim: Size of the feature axis of the input and output.
        num_heads: Number of attention heads. The per-head size is
            ``embed_dim // num_heads``, so it is floored when ``embed_dim`` is
            not a multiple of ``num_heads``.
        ff_dim: Width of the hidden Dense layer in the feed-forward part.
        dropout: Dropout rate used inside the attention layer and on the
            output of each sub-layer before its residual add. The default
            ``0.0`` adds no Dropout layers.

    Returns:
        A ``Model`` mapping the block input to the block output.

    Raises:
        ValueError: If ``num_heads`` is not positive or is larger than
            ``embed_dim`` (the per-head size would be zero).
    """
    # Fail early with a readable message; without this check these values
    # surface as a ZeroDivisionError further down.
    if num_heads <= 0 or embed_dim < num_heads:
        raise ValueError(
            f"num_heads must be in [1, embed_dim]; got num_heads={num_heads}, "
            f"embed_dim={embed_dim}."
        )

    inputs = layers.Input(shape=(None, embed_dim))

    # Multi-Head Attention (normalize first, residual from the raw inputs)
    x = layers.LayerNormalization()(inputs)
    attention = layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_dim // num_heads,
        dropout=dropout
    )(x, x)
    if dropout > 0:
        attention = layers.Dropout(dropout)(attention)
    x = layers.Add()([inputs, attention])

    # Feed Forward Network (normalize first, residual from the attention output)
    ff = layers.LayerNormalization()(x)
    ff = layers.Dense(ff_dim, activation="relu")(ff)
    ff = layers.Dense(embed_dim)(ff)
    if dropout > 0:
        ff = layers.Dropout(dropout)(ff)
    outputs = layers.Add()([x, ff])

    return Model(inputs, outputs)

In [19]:
# -----------------------------
# Build Transformer Model
# -----------------------------
def build_transformer(
    max_len=100,
    vocab_size=10000,
    embed_dim=64,
    num_heads=4,
    ff_dim=128,
    num_layers=3
):
    """Build a Transformer-encoder binary classifier over token-id sequences.

    Pipeline: Embedding -> PositionalEncoding -> ``num_layers`` encoder
    blocks -> GlobalAveragePooling1D -> Dense(64, relu) -> Dense(1, sigmoid).

    Args:
        max_len: Number of positions passed to ``PositionalEncoding``.
        vocab_size: Input dimension of the Embedding layer.
        embed_dim: Embedding size, also the feature size of every encoder block.
        num_heads: Number of attention heads in each encoder block.
        ff_dim: Hidden width of the feed-forward part of each encoder block.
        num_layers: Number of encoder blocks stacked after the embedding.

    Returns:
        An uncompiled ``Model`` taking int32 inputs of shape ``(batch, None)``
        and producing one sigmoid output per example. The model is named
        ``"Transformer_<num_layers>Layer"``.
    """
    inputs = layers.Input(shape=(None,), dtype=tf.int32)

    # Embedding + Positional Encoding
    x = layers.Embedding(vocab_size, embed_dim)(inputs)
    x = PositionalEncoding(max_len, embed_dim)(x)

    # Encoder Stack: a fresh block (with its own weights) per iteration
    for _ in range(num_layers):
        x = transformer_encoder_block(embed_dim, num_heads, ff_dim)(x)

    # Classification Head: average over the sequence axis, then score
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    # Derive the name from num_layers so it matches the model actually built;
    # the default (3) still yields "Transformer_3Layer".
    return Model(inputs, outputs, name=f"Transformer_{num_layers}Layer")



In [20]:
# -----------------------------
# Instantiate + Compile Model
# -----------------------------
# Build the classifier using build_transformer's default hyperparameters.
model = build_transformer()

# Configure training: Adam with a 1e-3 learning rate, binary cross-entropy
# loss, and accuracy reported as the metric.
model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"],
)

# Print the layer-by-layer summary of the compiled model.
model.summary()