In [21]:
# author: Michael Hüppe
# date: 11.11.2024
# project: resources/transformer.py
import tensorflow as tf
from resources.training.transformer.encoder import Encoder
from resources.training.transformer.decoder import Decoder

def Transformer(
        context_vocab_size: int = 5000, target_vocab_size: int = 5000,
        model_max_length: int = 250,
        embedding_dim: int = 64,
        dropout: float = 0.1,
        num_layers_encoder: int = 1, num_layers_decoder: int = 1,
        num_heads: int = 1,
        positional_embedding: str = "rope", use_seperate_embedding: bool = True,
        return_attention_scores: bool = False, **kwargs):
    """
    Implementation of a Transformer model after "Attention is all you need"
    :param context_vocab_size: Vocab size of the context
    :param target_vocab_size: Vocab size of the target
    :param model_max_length: Maximum length of the
    :param embedding_dim: Dimension of the Embedding
    :param dropout: Dropout probability after two drop out layers
    :param num_layers_encoder: Number of Encoder Layers
    :param num_layers_decoder: Number of Encoder Layers
    :param num_heads: Number of heads per layer
    :param dropout: Dropout probability after two drop out layers
    :param positional_embedding: Type of positional embedding to use [absolute, relative, rope, (segment)]
    :param use_seperate_embedding: if True, use seperate Embeddings for encoding and decoding
    :param return_attention_scores: if True, the attention scores for the encoder and decoder are returned for each layer
    :return:
    """
    model_max_length = max(model_max_length, kwargs["context_max_length"], kwargs["target_max_length"])
    encoder_input = tf.keras.Input(shape=(None,), name="encoder_input")
    decoder_input = tf.keras.Input(shape=(None,), name="decoder_input")
    encoder_embedding_layer = tf.keras.layers.Embedding(
        input_dim=context_vocab_size,
        output_dim=embedding_dim,
        mask_zero=True
    )

    if num_layers_encoder != 0:
        encoder_embedding = encoder_embedding_layer(encoder_input)

        x, encoder_attention = Encoder(encoder_embedding, model_max_length, embedding_dim, dropout,
                                       num_layers_encoder, num_heads, positional_embedding)(
            encoder_embedding)
    else:
        x = encoder_input
        encoder_attention = {}

    if use_seperate_embedding or num_layers_encoder == 0:
        decoder_embedding = tf.keras.layers.Embedding(
            input_dim=context_vocab_size,
            output_dim=embedding_dim,
            mask_zero=True
        )(decoder_input)
    else:
        decoder_embedding = encoder_embedding_layer(decoder_input)

    x, decoder_attention_causal, decoder_attention_causal_cross = Decoder(decoder_embedding, model_max_length,
                                                                          embedding_dim, dropout, num_layers_decoder,
                                                                          num_heads,
                                                                          positional_embedding)([decoder_embedding, x])
    x = tf.keras.layers.Dense(target_vocab_size)(x)
    # Define outputs based on the return_attention_scores flag
    outputs = x
    if return_attention_scores:
        outputs = (x, [encoder_attention, decoder_attention_causal, decoder_attention_causal_cross])
    target = tf.keras.Input(shape=(target_vocab_size,), name="target")
    model = tf.keras.Model(inputs=[encoder_input, decoder_input, target], outputs=outputs, name="Transformer")
    return model

In [6]:
context = tf.random.uniform((1, 350), minval=0, maxval=100, dtype=tf.int32)
sample = tf.random.uniform((1, 20), minval=0, maxval=100, dtype=tf.int32)

In [25]:
def init_model(Model, params):
    """
    Initialize the model
    :param Model: Model class to init
    :param params: Parameters for the model (parameters are handled in the model functions)
    """
    tf.keras.backend.clear_session()  # Clearing Keras memory
    tf.random.set_seed(params.get("SEED", 69))  # For reproducibility

    # TODO: describe the parameters and softcode
    optimizer = tf.keras.optimizers.Adam(
        0.001,
        beta_1=0.9, # The exponential decay rate for the 1st moment estimates. Defaults to 0.9
        beta_2=0.98, # The exponential decay rate for the 2nd moment estimates. Defaults to 0.999
        epsilon=1e-9 # A small constant for numerical stability
    )

    if isinstance(Model, str):
        if Model == "Transformer":
            Model = Transformer
        else:
            raise KeyError

    model = Model(**params)

    model.compile(
        optimizer=optimizer
    )

    return model

import os
import json

model_path = r"C:\Users\mhuep\Master_Informatik\Semester_3\MachineLearning\trained_models\Transformer\01_15_2025__17_41_21"
train_params = json.load(open(os.path.join(model_path, "modelInfo.json")))
model_params = train_params["model_parameters"]
target_max_length, context_max_length = model_params["target_max_length"], model_params["context_max_length"]
model = init_model(Transformer, model_params)
def CustomLoss(y_true, y_pred, x, context):
    print(y_true, y_pred, x, context)
    return 0

model.add_loss( CustomLoss( model.input[2], model.output, model.input[0], model.input[1] ) )
model.compile(loss=None, optimizer='adam')

model.load_weights(os.path.join(model_path, "modelCheckpoint.weights.h5"))

<KerasTensor shape=(None, 7671), dtype=float32, sparse=False, name=target> <KerasTensor shape=(None, None, 7671), dtype=float32, sparse=False, name=keras_tensor_82> <KerasTensor shape=(None, None), dtype=float32, sparse=False, name=encoder_input> <KerasTensor shape=(None, None), dtype=float32, sparse=False, name=decoder_input>


NotImplementedError: 

<KerasTensor shape=(None, None, 7671), dtype=float32, sparse=False, name=keras_tensor_82>