# Setup

In [4]:
# This is my first transformer project.
# The idea is to build a transformer that takes as input a list of residues
# (N mod p1, N mod p2, ..., N mod pn), where N is a number (integer, rational, etc.)
# and p1, p2, ..., pn are prime numbers, and returns the number N.

# %% 
# Let us start by importing the necessary libraries
import torch  # Main framework for defining and training the transformer
import torch.nn as nn  # Neural network module
import torch.optim as optim  # Optimization functions
import numpy as np  # For numerical operations
import random  # For generating random numbers
import itertools  # (Optional) For generating structured datasets
import math  # For mathematical operations

import matplotlib.pyplot as plt  # (Optional) For visualization
from torch.utils.data import Dataset, DataLoader  # To handle training data efficiently

import time # For timing the training process

import json # For saving and loading the model

In [2]:
# Read the experiment configuration from the JSON file next to the notebook.
with open("config1.json", "r") as f:
    config = json.load(f)

# Model hyper-parameters, unpacked into notebook-level names.
model_params = config["model_params"]
input_dim = model_params["input_dim"]
hidden_dim = model_params["hidden_dim"]
output_dim = model_params["output_dim"]

# Training hyper-parameters, unpacked the same way.
training_params = config["training_params"]
learning_rate = training_params["learning_rate"]
batch_size = training_params["batch_size"]
num_epochs = training_params["num_epochs"]

# Echo the whole configuration so the run is self-documenting.
print("Loaded configuration:", config, sep="\n")

Loaded configuration:
{'model_params': {'model': 'MLP', 'input_dim': 5, 'hidden_dim': 512, 'output_dim': 1}, 'training_params': {'learning_rate': 0.001, 'batch_size': 32, 'num_epochs': 100, 'optimizer': 'Adam'}, 'log_params': {'experiment_name': 'experiment_001', 'notes': 'Baseline experiment with MLP'}}


# Dataset

In [3]:
class TranslationDataset(Dataset):
    """Synthetic "residues -> digits" dataset.

    Every sample is built from an integer ``N`` drawn uniformly from ``[0, P)``,
    where ``P`` is the product of ``primes``:

    * source: ``[N % p for p in primes]`` as a 1-D ``torch.long`` tensor;
    * target: the decimal digits of ``N``, most significant first, as a 1-D
      ``torch.long`` tensor.

    Target tensors have a variable length (one entry per digit of ``N``), so
    batching them requires padding, e.g. through a custom ``collate_fn``.

    Args:
        num_samples: Number of (source, target) pairs to generate.
        primes: Moduli used to compute the residues. The sequence is copied,
            so neither the caller's list nor the default list is shared with
            the instance.
        generator: Optional ``torch.Generator`` used for sampling ``N``; pass
            a seeded generator to obtain a reproducible dataset. ``None``
            uses PyTorch's global random state.

    Attributes:
        primes: List of moduli.
        P: Product of the moduli (exclusive upper bound for ``N``).
        samples: List of ``(source, target)`` tensor pairs.
    """

    def __init__(self, num_samples=500, primes=[3, 5, 7, 11], generator=None):
        # Copy the moduli: storing the argument itself would alias the mutable
        # default list, so mutating one instance's `primes` would silently
        # change the default for every dataset created afterwards.
        self.primes = list(primes)
        self.P = math.prod(self.primes)
        self.samples = []
        for _ in range(num_samples):
            # Random integer N in [0, P)
            N = torch.randint(0, self.P, (1,), generator=generator).item()
            # Input tokens: remainders for each prime (as integers)
            input_tokens = [N % p for p in self.primes]
            # Output tokens: digits of N (each as an integer)
            output_tokens = [int(d) for d in str(N)]
            self.samples.append((torch.tensor(input_tokens, dtype=torch.long),
                                 torch.tensor(output_tokens, dtype=torch.long)))

    def __len__(self):
        """Return the number of generated samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` tensor pair stored at position ``idx``."""
        return self.samples[idx]

# Smoke test: build a tiny dataset and show its first (source, target) pair.
dataset = TranslationDataset(num_samples=10)
src_tokens, tgt_tokens = dataset[0]
print("Sample from dataset:", (src_tokens, tgt_tokens))


Sample from dataset: (tensor([1, 3, 5, 6]), tensor([5, 2, 3]))


# Positional encoding

In [5]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to embeddings, then apply dropout.

    The encoding table is precomputed for ``max_len`` positions and stored as
    the non-trainable buffer ``pe`` of shape ``(max_len, 1, d_model)``. Even
    feature indices hold sines and odd feature indices hold cosines.

    Args:
        d_model: Embedding dimensionality. Both even and odd values are
            supported.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)  # Create a (max_len, d_model) matrix.
        position = torch.arange(0, max_len).unsqueeze(1)  # Shape: (max_len, 1) with positions 0,1,2,...
        # Compute a scaling factor for each even dimension.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # Shape: (max_len, ceil(d_model / 2))
        # For even indices: use sine; for odd indices: use cosine.
        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(1)  # Shape becomes (max_len, 1, d_model) for easy broadcasting.
        self.register_buffer('pe', pe)  # Register as a buffer so it's part of the module but not a parameter.

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])``.

        Args:
            x: Tensor of shape ``(seq_len, batch_size, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was
                built for.
        """
        seq_len = x.size(0)
        max_len = self.pe.size(0)
        if seq_len > max_len:
            # Without this check the slice below is silently truncated: the
            # addition then fails with an opaque broadcasting error, or (when
            # max_len == 1) quietly reuses position 0 for every token.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_len}."
            )
        x = x + self.pe[:seq_len]  # Add positional encoding to each token embedding.
        return self.dropout(x)


In [13]:
# Demo parameters (kept tiny so the printed tensors stay readable)
d_model = 4          # Dimensionality of embeddings/positional encodings
seq_len = 1          # Sequence length
# Batch size for this demo only. A dedicated name is used on purpose:
# reassigning `batch_size` here would overwrite the value loaded from the
# configuration file earlier in the notebook.
demo_batch_size = 4

# Create a dummy token matrix (for example, all zeros)
dummy_tokens = torch.zeros(seq_len, demo_batch_size, d_model)
print("Original token matrix:")
print(dummy_tokens)

# Instantiate PositionalEncoding with no dropout for clarity
pos_enc = PositionalEncoding(d_model, dropout=0.0, max_len=10)

# The positional encoding matrix has shape (max_len, 1, d_model).
# Print only the first `seq_len` positions, i.e. the ones actually added below;
# the label is derived from `seq_len` so it cannot drift from the code.
print(f"\nPositional encodings (first {seq_len} position(s)):")
print(pos_enc.pe[:seq_len])

# Add positional encoding to the dummy tokens
tokens_with_pe = pos_enc(dummy_tokens)
print("\nToken matrix after adding positional encoding:")
print(tokens_with_pe)

# With all-zero tokens and dropout disabled, the result must equal the
# encodings broadcast over the batch dimension.
assert torch.equal(tokens_with_pe, pos_enc.pe[:seq_len].expand_as(dummy_tokens))

Original token matrix:
tensor([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]])

Positional encodings (first 4 positions):
tensor([[[0., 1., 0., 1.]]])

Token matrix after adding positional encoding:
tensor([[[0., 1., 0., 1.],
         [0., 1., 0., 1.],
         [0., 1., 0., 1.],
         [0., 1., 0., 1.]]])


# Transformer model

In [None]:
# Seq2Seq Transformer model
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder transformer mapping source token ids to target-vocabulary logits.

    Source and target ids are embedded, scaled by ``sqrt(d_model)``, combined
    with sinusoidal positional encodings, passed through ``nn.Transformer``,
    and finally projected to ``tgt_vocab_size`` logits per target position.

    Args:
        src_vocab_size: Number of distinct source token ids.
        tgt_vocab_size: Number of distinct target token ids.
        d_model: Embedding / model dimensionality.
        nhead: Number of attention heads.
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability used by the positional encodings and the
            transformer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=64, nhead=4,
                 num_encoder_layers=2, num_decoder_layers=2, dim_feedforward=128, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        # Embedding layers for source (moduli) and target (digits).
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        # Positional encodings for source and target.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.pos_decoder = PositionalEncoding(d_model, dropout)
        # Transformer module from PyTorch. Keyword arguments make the mapping
        # to nn.Transformer's parameters explicit.
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout)
        # Final linear layer maps transformer output to target vocabulary logits.
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                src_key_padding_mask=None, tgt_key_padding_mask=None,
                memory_key_padding_mask=None):
        """Compute target-vocabulary logits.

        Args:
            src: Source token ids, shape ``(batch_size, src_len)``.
            tgt: Target token ids, shape ``(batch_size, tgt_len)``.
            src_mask: Optional attention mask for the encoder, forwarded to
                ``nn.Transformer``.
            tgt_mask: Optional attention mask for the decoder, forwarded to
                ``nn.Transformer``. No mask is applied by default, so pass a
                causal mask (e.g. from
                ``nn.Transformer.generate_square_subsequent_mask(tgt_len)``)
                when target positions must not attend to later positions.
            src_key_padding_mask: Optional ``(batch_size, src_len)`` mask
                marking padded source positions.
            tgt_key_padding_mask: Optional ``(batch_size, tgt_len)`` mask
                marking padded target positions; needed when targets of
                different lengths are padded into one batch.
            memory_key_padding_mask: Optional ``(batch_size, src_len)`` mask
                marking padded encoder outputs for the decoder's
                cross-attention.

        Returns:
            Logits of shape ``(batch_size, tgt_len, tgt_vocab_size)``.
        """
        # Transpose to shape: (seq_len, batch_size) as required by the transformer.
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        # Obtain token embeddings and apply scaling.
        src_emb = self.src_embedding(src) * math.sqrt(self.d_model)
        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        # Add positional encodings.
        src_emb = self.pos_encoder(src_emb)
        tgt_emb = self.pos_decoder(tgt_emb)
        # Forward pass through the transformer. The key-padding masks stay
        # batch-first: they are not transposed with the token tensors.
        outs = self.transformer(src_emb, tgt_emb,
                                src_mask=src_mask, tgt_mask=tgt_mask,
                                src_key_padding_mask=src_key_padding_mask,
                                tgt_key_padding_mask=tgt_key_padding_mask,
                                memory_key_padding_mask=memory_key_padding_mask)
        # Project transformer outputs to logits and transpose back to (batch_size, seq_len, vocab_size).
        logits = self.fc_out(outs)
        return logits.transpose(0, 1)


In [None]:

# Example usage.
# Vocabulary sizes assumed for this example: residues 0..11 on the source side,
# and the ten digits plus one special token on the target side.
src_vocab_size = 12   # e.g., mod values 0,1,...,11
tgt_vocab_size = 11   # e.g., digits 0-9 plus a special token (like BOS)

# Build the model with every hyper-parameter spelled out by keyword.
model = Seq2SeqTransformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=64,
    nhead=4,
    num_encoder_layers=2,
    num_decoder_layers=2,
    dim_feedforward=128,
    dropout=0.1,
)
print(model)