<h1 style="text-align:center; font-size:40px;">Transformer from scratch using only NumPy</h1>

<h2 style="text-align:center; font-size:30px;">No deep learning frameworks (e.g., PyTorch, TensorFlow)</h2>

---

<h3 style="font-size:20px;">1. Imports</h3>

Imported are the `NumPy` library and a custom `TransformerHelper` class ( file: **transformer_helper.ipynb** ), written to keep the `Transformer` class clean and modular.

In [1]:
import numpy as np
from transformer_helper import TransformerHelper

---

<h3 style="font-size:20px;">2. Encoder and Decoder Classes</h3>

The `Encoder` and `Decoder` classes contain the trainable parameters used to compute output probabilities during inference. They also store intermediate results required for backpropagation during training. The trainable parameters are initialised with pseudorandom values.

In [2]:
class Encoder():
    def __init__(self, len_input_vocab, input_vocab, seed=7287):
        self.input_vocab = input_vocab
        rng = np.random.default_rng(seed)

        #######################################################################################
        ############ ENCODER LAYER TRAINABLE PARAMETERS #######################################
        #######################################################################################

        self.embedding_matrix = rng.standard_normal((len_input_vocab, 4))

        self.W_keys_self_att_h_1    = rng.standard_normal((4, 4))
        self.W_queries_self_att_h_1 = rng.standard_normal((4, 4))
        self.W_values_self_att_h_1  = rng.standard_normal((4, 4))

        self.W_keys_self_att_h_2    = rng.standard_normal((4, 4))
        self.W_queries_self_att_h_2 = rng.standard_normal((4, 4))
        self.W_values_self_att_h_2  = rng.standard_normal((4, 4))

        self.W_keys_self_att_h_3    = rng.standard_normal((4, 4))
        self.W_queries_self_att_h_3 = rng.standard_normal((4, 4))
        self.W_values_self_att_h_3  = rng.standard_normal((4, 4))

        self.W_output_self_att = rng.standard_normal((12, 4))

        self.gamma_norm_self_att = rng.standard_normal((1, 4))
        self.beta_norm_self_att  = rng.standard_normal((1, 4))

        self.W_ffn_layer_1 = rng.standard_normal((4, 4))
        self.W_ffn_layer_2 = rng.standard_normal((4, 4))

        self.gamma_norm_ffn = rng.standard_normal((1, 4))
        self.beta_norm_ffn  = rng.standard_normal((1, 4))
        
        
        #######################################################################################
        ############ ENCODER LAYER INTERMEDIATE CALCULATIONS ##################################
        #######################################################################################
        
        self.position_encoded = None
        
        self.queries_self_att_h_1 = None
        self.keys_self_att_h_1 = None
        self.values_self_att_h_1 = None
        self.scores_self_att_h_1 = None
        self.scaled_self_att_h_1 = None
        self.softmax_self_att_h_1 = None
        self.output_self_att_h_1 = None

        self.queries_self_att_h_2 = None
        self.keys_self_att_h_2 = None
        self.values_self_att_h_2 = None
        self.scores_self_att_h_2 = None
        self.scaled_self_att_h_2 = None
        self.softmax_self_att_h_2 = None
        self.output_self_att_h_2 = None

        self.queries_self_att_h_3 = None
        self.keys_self_att_h_3 = None
        self.values_self_att_h_3 = None
        self.scores_self_att_h_3 = None
        self.scaled_self_att_h_3 = None
        self.softmax_self_att_h_3 = None
        self.output_self_att_h_3 = None

        self.self_att_output = None
        
        self.added_self_att = None
        self.norm_self_att = None
        
        self.ffn_layer_1 = None
        self.relu_ffn = None
        self.ffn_layer_2 = None
        
        self.added_ffn = None
        self.norm_ffn = None

In [3]:
class Decoder():
    def __init__(self, len_output_vocab, output_vocab, seed=9373):
        self.output_vocab = output_vocab
        rng = np.random.default_rng(seed)  

        #######################################################################################
        ########################## DECODER LAYER TRAINABLE PARAMETERS #########################
        #######################################################################################

        self.embedding_matrix = rng.standard_normal((len_output_vocab, 4))

        self.W_keys_masked_att_h_1    = rng.standard_normal((4, 4))
        self.W_queries_masked_att_h_1 = rng.standard_normal((4, 4))
        self.W_values_masked_att_h_1  = rng.standard_normal((4, 4))

        self.W_keys_masked_att_h_2    = rng.standard_normal((4, 4))
        self.W_queries_masked_att_h_2 = rng.standard_normal((4, 4))
        self.W_values_masked_att_h_2  = rng.standard_normal((4, 4))

        self.W_keys_masked_att_h_3    = rng.standard_normal((4, 4))
        self.W_queries_masked_att_h_3 = rng.standard_normal((4, 4))
        self.W_values_masked_att_h_3  = rng.standard_normal((4, 4))

        self.W_output_masked_att = rng.standard_normal((12, 4))

        self.gamma_norm_masked_att = rng.standard_normal((1, 4))
        self.beta_norm_masked_att  = rng.standard_normal((1, 4))

        self.W_keys_cross_att_h_1    = rng.standard_normal((4, 4))
        self.W_queries_cross_att_h_1 = rng.standard_normal((4, 4))
        self.W_values_cross_att_h_1  = rng.standard_normal((4, 4))

        self.W_keys_cross_att_h_2    = rng.standard_normal((4, 4))
        self.W_queries_cross_att_h_2 = rng.standard_normal((4, 4))
        self.W_values_cross_att_h_2  = rng.standard_normal((4, 4))

        self.W_keys_cross_att_h_3    = rng.standard_normal((4, 4))
        self.W_queries_cross_att_h_3 = rng.standard_normal((4, 4))
        self.W_values_cross_att_h_3  = rng.standard_normal((4, 4))

        self.W_output_cross_att = rng.standard_normal((12, 4))

        self.gamma_norm_cross_att = rng.standard_normal((1, 4))
        self.beta_norm_cross_att  = rng.standard_normal((1, 4))

        self.W_ffn_layer_1 = rng.standard_normal((4, 4))
        self.W_ffn_layer_2 = rng.standard_normal((4, 4))

        self.gamma_norm_ffn = rng.standard_normal((1, 4))
        self.beta_norm_ffn  = rng.standard_normal((1, 4))

        
        #######################################################################################
        ########################## DECODER LAYER INTERMEDIATE CALCULATIONS ####################
        #######################################################################################

        self.position_encoded = None
        
        self.queries_masked_att_h_1 = None
        self.keys_masked_att_h_1 = None
        self.values_masked_att_h_1 = None
        self.scores_masked_att_h_1 = None
        self.masked_scores_masked_att_h_1 = None
        self.scaled_masked_att_h_1 = None
        self.softmax_masked_att_h_1 = None
        self.output_masked_att_h_1 = None

        self.queries_masked_att_h_2 = None
        self.keys_masked_att_h_2 = None
        self.values_masked_att_h_2 = None
        self.scores_masked_att_h_2 = None
        self.masked_scores_masked_att_h_2 = None
        self.scaled_masked_att_h_2 = None
        self.softmax_masked_att_h_2 = None
        self.output_masked_att_h_2 = None

        self.queries_masked_att_h_3 = None
        self.keys_masked_att_h_3 = None
        self.values_masked_att_h_3 = None
        self.scores_masked_att_h_3 = None
        self.masked_scores_masked_att_h_3 = None
        self.scaled_masked_att_h_3 = None
        self.softmax_masked_att_h_3 = None
        self.output_masked_att_h_3 = None

        self.masked_att_output = None
        
        self.added_masked_att = None
        self.norm_masked_att = None

        self.keys_cross_att_h_1 = None
        self.values_cross_att_h_1 = None
        self.queries_cross_att_h_1 = None
        self.scores_cross_att_h_1 = None
        self.scaled_cross_att_h_1 = None
        self.softmax_cross_att_h_1 = None
        self.output_cross_att_h_1 = None

        self.keys_cross_att_h_2 = None
        self.values_cross_att_h_2 = None
        self.queries_cross_att_h_2 = None
        self.scores_cross_att_h_2 = None
        self.scaled_cross_att_h_2 = None
        self.softmax_cross_att_h_2 = None
        self.output_cross_att_h_2 = None

        self.keys_cross_att_h_3 = None
        self.values_cross_att_h_3 = None
        self.queries_cross_att_h_3 = None
        self.scores_cross_att_h_3 = None
        self.scaled_cross_att_h_3 = None
        self.softmax_cross_att_h_3 = None
        self.output_cross_att_h_3 = None

        self.cross_att_output = None
        
        self.added_cross_att = None
        self.norm_cross_att = None
        
        self.ffn_layer_1 = None
        self.relu_ffn = None
        self.ffn_layer_2 = None
        
        self.added_ffn = None
        self.norm_ffn = None

---

<h3 style="font-size:20px;">3. Transformer Class</h3>

The `Transformer` class includes both training and inference routines.

In [4]:
class Transformer():
    
    def __init__(self):
        # Define input and output vocabularies (token-to-index mappings)
        self.input_vocab = { 'hello': 0, 'my': 1, 'name': 2, 'is': 3, 'messi': 4, '<eos>': 5, '<start>': 6 }
        self.output_vocab = { 'hola': 0, 'mi': 1, 'nombre': 2, 'es': 3, 'messi': 4, '<eos>': 5, '<start>': 6 }

        # Instantiate encoder and decoder with their respective vocabularies
        self.encoder_layer = Encoder(len(self.input_vocab), self.input_vocab)
        self.decoder_layer = Decoder(len(self.output_vocab), self.output_vocab)

    #######################################################################################
    ########################## TRAINING LOOP ##############################################
    #######################################################################################

    def train(self, train_set, num_of_epochs, learning_rate):
        train_loss = []

        for epoch_idx in range(num_of_epochs):
            gradients_sum = {}                     # Accumulate gradients over the batch
            losses_sum = 0                         # Total loss for this epoch
            num_of_target_tokens_in_batch = 0      # Total number of target tokens processed

            for (source_sequence, target_sequence) in train_set:
                num_of_target_tokens_in_batch += len(target_sequence)

                # Loop over each target token in the sequence
                for target_token_idx, target_token in enumerate(target_sequence.split()):
                    # Compute loss and gradients for the current target token
                    loss_for_token, gradients = TransformerHelper.compute_loss_and_gradient(
                        self,
                        source_sequence, 
                        target_sequence, 
                        target_token_idx,
                        self.output_vocab[target_token]
                    )

                    losses_sum += loss_for_token

                    # Accumulate dense gradients (for weights, biases, etc.)
                    for key, grad in gradients.items():
                        if key not in [
                            'decoder_layer.loss_wrt_input_embeddings',
                            'encoder_layer.loss_wrt_input_embeddings'
                        ]:
                            if key not in gradients_sum:
                                gradients_sum[key] = grad.copy()
                            else:
                                gradients_sum[key] += grad

                    # Handle sparse gradients for decoder embeddings
                    # Recreate the decoder input (all tokens before the current one)
                    target_tokens = target_sequence.split()
                    decoder_input_tokens = ['<start>'] + target_tokens[:target_token_idx]
                    decoder_input_grads = gradients['decoder_layer.loss_wrt_input_embeddings']

                    for i, token in enumerate(decoder_input_tokens):
                        token_id = self.output_vocab[token]
                        grad_i = decoder_input_grads[i]

                        if 'decoder_layer.embedding_matrix' not in gradients_sum:
                            gradients_sum['decoder_layer.embedding_matrix'] = np.zeros_like(
                                self.decoder_layer.embedding_matrix)

                        gradients_sum['decoder_layer.embedding_matrix'][token_id] += grad_i

                    # Handle sparse gradients for encoder embeddings
                    source_tokens = source_sequence.split()
                    encoder_input_grads = gradients['encoder_layer.loss_wrt_input_embeddings']

                    for i, token in enumerate(source_tokens):
                        token_id = self.input_vocab[token]
                        grad_i = encoder_input_grads[i]

                        if 'encoder_layer.embedding_matrix' not in gradients_sum:
                            gradients_sum['encoder_layer.embedding_matrix'] = np.zeros_like(
                                self.encoder_layer.embedding_matrix)

                        gradients_sum['encoder_layer.embedding_matrix'][token_id] += grad_i

            # Normalize accumulated gradients by total number of target tokens
            for key in gradients_sum:
                gradients_sum[key] /= num_of_target_tokens_in_batch

            # Apply gradients to update each parameter (basic SGD)
            for key, grad in gradients_sum.items():
                layer_name, param_name = key.split('.')
                layer = getattr(self, layer_name)                 # e.g., self.encoder_layer
                param = getattr(layer, param_name)                # e.g., encoder_layer.W_ffn
                setattr(layer, param_name, param - learning_rate * grad)

            # Store average loss for this epoch
            train_loss.append(losses_sum / num_of_target_tokens_in_batch)

        return train_loss

    #######################################################################################
    ########################## SEQUENCE TRANSFORMATION (INFERENCE) ########################
    #######################################################################################

    def transform(self, source_sequence, max_len=10):
        predicted_tokens = []

        for _ in range(max_len):
            # Construct decoder input (always starts with <start>, then predicted tokens)
            decoder_input = ' '.join(['<start>'] + predicted_tokens)

            # Run forward pass to get output token probabilities
            output_probabilities = TransformerHelper.full_forward_pass(
                self,
                source_sequence,
                decoder_input,
                store_intermediate_results=False
            )

            # Select the token with the highest probability at the final position
            probs = output_probabilities[-1]                # Shape: (vocab_size,)
            predicted_token_id = np.argmax(probs)

            # Convert token ID back to token string
            inv_vocab = {v: k for k, v in self.output_vocab.items()}
            predicted_token = inv_vocab[predicted_token_id]

            # Stop if <eos> is generated
            if predicted_token == '<eos>':
                break

            # Append predicted token and repeat
            predicted_tokens.append(predicted_token)

        return ' '.join(predicted_tokens)

---

<h3 style="font-size:20px;">4. Demonstrating Transformer training and inference</h3>


In [6]:
train_data = [
    ("hello my name", "hola mi nombre <eos>"),
    ("hello messi", "hola messi <eos>"),
    ("my name is messi", "mi nombre es messi <eos>")
]

model = Transformer()

print("\nINFERENCES USING THE UNTRAINED MODEL")
 
for source_seq, target_seq in train_data:
    print("------------------------------------------------------------------")
    print(f"Source sequence: \"{source_seq}\"")
    print(f"Target sequence: \"{target_seq}\"")
    model_output = model.transform(source_seq)
    print(f"MODEL OUTPUT \nmodel.transform(\"{source_seq}\"): \"{model_output}\"")

print("\n======================================================================")
print("TRAINING")

losses = model.train(train_data, num_of_epochs=8000, learning_rate=0.01)


print(f"\nEpoch {1:>5}: Loss: {losses[0]:.4f}")
for epoch,loss in enumerate(losses):
    if (epoch+1) % 500 == 0:
        print(f"Epoch {epoch + 1:>5}: Loss: {loss:.4f}")
print("======================================================================")

print("\nINFERENCES USING THE TRAINED MODEL")
 
for source_seq, target_seq in train_data:
    print("------------------------------------------------------------------")
    print(f"Source sequence: \"{source_seq}\"")
    print(f"Target sequence: \"{target_seq}\"")
    model_output = model.transform(source_seq)
    print(f"MODEL OUTPUT \nmodel.transform(\"{source_seq}\"): \"{model_output}\"")


INFERENCES USING THE UNTRAINED MODEL
------------------------------------------------------------------
Source sequence: "hello my name"
Target sequence: "hola mi nombre <eos>"
MODEL OUTPUT 
model.transform("hello my name"): "messi messi messi messi messi messi messi messi messi messi"
------------------------------------------------------------------
Source sequence: "hello messi"
Target sequence: "hola messi <eos>"
MODEL OUTPUT 
model.transform("hello messi"): "messi messi messi messi messi messi messi messi messi messi"
------------------------------------------------------------------
Source sequence: "my name is messi"
Target sequence: "mi nombre es messi <eos>"
MODEL OUTPUT 
model.transform("my name is messi"): "messi messi messi messi messi messi messi messi messi messi"

TRAINING

Epoch     1: Loss: 0.5661
Epoch   500: Loss: 0.3929
Epoch  1000: Loss: 0.3819
Epoch  1500: Loss: 0.3294
Epoch  2000: Loss: 0.3197
Epoch  2500: Loss: 0.2786
Epoch  3000: Loss: 0.2662
Epoch  3500: Loss