In [1]:
# This is my first transformer project
# The idea is to create a transformer that can take as imput a list (N mod p1, N mod p2, ..., N mod pn)
# where N is a number (integer, rational, etc) and p1,p2,...,pn are prime numbers
# and returns the number N

# %% 
# Let us start by importing the necessary libraries
import torch  # Main framework for defining and training the transformer
import torch.nn as nn  # Neural network module
import torch.optim as optim  # Optimization functions
import numpy as np  # For numerical operations
import random  # For generating random numbers
import itertools  # (Optional) For generating structured datasets

import matplotlib.pyplot as plt  # (Optional) For visualization
from torch.utils.data import Dataset, DataLoader  # To handle training data efficiently

import time # For timing the training process

import json # For saving and loading the model


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/paolopichini/Desktop/Desktop - MacBook Pro di Paolo/Coding/ML/Solo/MLpractice/venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/paolopichini/Desktop/Desktop - MacBook Pro di Paolo/Coding/ML/Solo/MLpractice/venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users

In [3]:
# Load configuration from a JSON file
with open("config1.json", "r") as f:
    config = json.load(f)

# Access parameters like:
input_dim = config["model_params"]["input_dim"]
hidden_dim = config["model_params"]["hidden_dim"]
output_dim = config["model_params"]["output_dim"]

learning_rate = config["training_params"]["learning_rate"]
batch_size = config["training_params"]["batch_size"]
num_epochs = config["training_params"]["num_epochs"]

print("Loaded configuration:")
print(config)

Loaded configuration:
{'model_params': {'model': 'MLP', 'input_dim': 5, 'hidden_dim': 512, 'output_dim': 1}, 'training_params': {'learning_rate': 0.001, 'batch_size': 32, 'num_epochs': 100, 'optimizer': 'Adam'}, 'log_params': {'experiment_name': 'experiment_001', 'notes': 'Baseline experiment with MLP'}}


In [4]:

# Now we move onto data generation
# the idea is to first define a list of primes (p1,p2,...,pn)
# then we pick integers N and compute the remainders of N when divided by each prime
# then the pairs (N mod p1, N mod p2, ..., N mod pn) are stored as input and N is stored as output
# this will form our training dataset

# Define a list of small primes
#primes = [2, 3, 5, 7, 11]
primes = [3, 7, 13, 19, 31]

# Compute the product P = p1 * p2 * ... * pn
P = np.prod(primes)

# Check what we have done so far
print(f"Chosen primes: {primes}")
print(f"Product of primes (P): {P}")

Chosen primes: [3, 7, 13, 19, 31]
Product of primes (P): 160797


In [5]:

# Now we define a PyTorch Dataset to handle our data
# This creates a class that inherits from the `Dataset` class in PyTorch
# so that we can use PyTorch's `DataLoader` to load the data efficiently

# Define a PyTorch Dataset for our data
class ModuloDataset(Dataset):
    def __init__(self, num_samples=1000, myprimes=None):
        super(ModuloDataset, self).__init__() # Initialize the base class 'Dataset', not strictly required here but a good practice
        self.myprimes = myprimes if myprimes else primes
        self.P = np.prod(self.myprimes)
        self.samples = []

        for i in range(num_samples):
            N = random.randint(0, self.P - 1)  # Pick a random integer N
            normalized_N = 2 * (N / self.P) - 1  # Normalize & center N
            normalized_remainders = [2*(N % p)/p - 1 for p in self.myprimes]  # Compute remainders
            self.samples.append((torch.tensor(normalized_remainders, dtype=torch.float32), 
                                 torch.tensor(normalized_N, dtype=torch.float32)))  # Convert to tensors

    def __len__(self):
        return len(self.samples)  # Return the total number of samples

    def __getitem__(self, idx):
        return self.samples[idx]  # Return the (input, output) pair at index `idx`

# Create the dataset
dataset = ModuloDataset(num_samples=10,myprimes=primes)

# Check some rescaled samples
for i in range(min(3,len(dataset))):
    print(f"Sample {i}: Input (moduli) {dataset[i][0]}, Output (N) {dataset[i][1]}")

# Check some samples without rescaling
for i in range(min(3,len(dataset))):
    print(f"Sample {i}: Input (moduli) {(torch.tensor(primes)*dataset[i][0]+torch.tensor(primes))/2}, Output (N) {(torch.tensor(int(P))*dataset[i][1]+torch.tensor(int(P)))/2}")

# Check the samples above are consistent
checksample0 = (torch.tensor(int(P))*dataset[0][1]+torch.tensor(int(P))).item()/2
print(f"Check first sample: {[checksample0 % p for p in primes]} from {checksample0}")

Sample 0: Input (moduli) tensor([-0.3333, -0.4286, -0.3846, -0.4737,  0.8710]), Output (N) 0.28164082765579224
Sample 1: Input (moduli) tensor([-1.0000, -0.7143,  0.2308,  0.7895, -0.6774]), Output (N) -0.3841676115989685
Sample 2: Input (moduli) tensor([ 0.3333, -0.7143, -0.8462, -0.6842,  0.2258]), Output (N) 0.7091239094734192
Sample 0: Input (moduli) tensor([ 1.,  2.,  4.,  5., 29.]), Output (N) 103042.0
Sample 1: Input (moduli) tensor([ 0.,  1.,  8., 17.,  5.]), Output (N) 49512.0
Sample 2: Input (moduli) tensor([ 2.,  1.,  1.,  3., 19.]), Output (N) 137411.0
Check first sample: [1.0, 2.0, 4.0, 5.0, 29.0] from 103042.0


In [6]:

# Now we define the DataLoaders to load the data efficiently during training
# This will allow us to load the data in batches, shuffle it, etc.

# Create a DataLoader to load the dataset in batches
batch_size = min(32, len(dataset))  # Number of samples per batch
print(f"Batch size: {batch_size}")

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Check one batch of data
for batch in dataloader:
    inputs, targets = batch  # Unpack batch
    print(f"Batch Inputs (moduli): {inputs.shape}")  # Should be (batch_size, num_primes)
    print(f"Batch Inputs (moduli): {inputs}")  # Should be (batch_size, num_primes)
    print(f"Batch Targets (N values): {targets.shape}")  # Should be (batch_size,)
    print(f"Batch Targets (N values): {targets}")  # Should be (batch_size,)
    break  # Only print the first batch

Batch size: 10
Batch Inputs (moduli): torch.Size([10, 5])
Batch Inputs (moduli): tensor([[ 0.3333, -0.7143, -0.8462, -0.6842,  0.2258],
        [-0.3333, -0.7143,  0.5385,  0.3684, -0.6129],
        [-0.3333, -0.4286, -0.3846, -0.4737,  0.8710],
        [-0.3333, -1.0000,  0.3846, -0.4737, -0.0968],
        [-1.0000,  0.7143, -0.0769,  0.3684, -0.8065],
        [-1.0000, -0.7143, -0.2308,  0.2632,  0.6129],
        [-0.3333, -0.4286,  0.8462,  0.0526,  0.4839],
        [-1.0000, -0.7143,  0.2308,  0.7895, -0.6774],
        [ 0.3333, -0.1429, -0.0769, -0.3684,  0.5484],
        [-1.0000,  0.1429,  0.6923,  0.3684, -0.7419]])
Batch Targets (N values): torch.Size([10])
Batch Targets (N values): tensor([ 0.7091,  0.4209,  0.2816,  0.5035,  0.4228,  0.3522, -0.7028, -0.3842,
        -0.8647, -0.4520])


In [9]:

# Now we define the transformer model
# We use the nn.Module class in PyTorch to define our model
# The transformer model consists of an input embedding layer, followed by a transformer encoder
# and finally a linear layer to output the predicted value
# Note: here we need to use super() or else some necessary initializations from nn.Module will be missed
class ModuloTransformer(nn.Module):
    def __init__(self, num_primes, d_model=128, num_heads=4, num_layers=2, hidden_dim=256):
        super(ModuloTransformer, self).__init__()

        self.embedding = nn.Linear(num_primes, d_model)  # Input embedding
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads, dim_feedforward=hidden_dim),
            num_layers=num_layers
        )
        self.fc_out = nn.Linear(d_model, 1)  # Final output layer

    def forward(self, x):
        x = self.embedding(x)  # Project input to d_model dimension
        x = self.transformer_encoder(x)  # Pass through transformer layers
        x = self.fc_out(x).squeeze(-1)  # Final output (scalar prediction)
        return x
    
# Also define a simple MLP model for comparison
class MLP(nn.Module):
    def __init__(self, input_dim=5, hidden_dim=128, output_dim=1):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x).squeeze(-1)  # No activation on output (for regression)
        return x

# define model
#model = ModuloTransformer(num_primes=len(primes), d_model=256, num_heads=4, num_layers=2, hidden_dim=256)
model = MLP(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [10]:
# Now we check the model before training

# We begin by checking the output of the untrained model
# Get a batch from the DataLoader
test_inputs, test_targets = next(iter(dataloader))  # Get first batch
# Run a forward pass through the model
test_outputs = model(test_inputs)
# Print input values
print("Sample Inputs (moduli):")
print(test_inputs)
# Print raw model predictions
print("Model Output Before Training:")
print(test_outputs)
# Print expected target values
print("Expected Targets (N values):")
print(test_targets)
# Print shape information
print(f"Inputs shape: {test_inputs.shape}")  # Should be (batch_size, num_primes)
print(f"Outputs shape: {test_outputs.shape}")  # Should be (batch_size,)
print(f"Targets shape: {test_targets.shape}")  # Should match outputs (batch_size,)

Sample Inputs (moduli):
tensor([[-0.3333, -0.4286, -0.3846, -0.4737,  0.8710],
        [ 0.3333, -0.1429, -0.0769, -0.3684,  0.5484],
        [-1.0000, -0.7143,  0.2308,  0.7895, -0.6774],
        [-1.0000,  0.1429,  0.6923,  0.3684, -0.7419],
        [-1.0000,  0.7143, -0.0769,  0.3684, -0.8065],
        [-0.3333, -0.7143,  0.5385,  0.3684, -0.6129],
        [ 0.3333, -0.7143, -0.8462, -0.6842,  0.2258],
        [-0.3333, -1.0000,  0.3846, -0.4737, -0.0968],
        [-1.0000, -0.7143, -0.2308,  0.2632,  0.6129],
        [-0.3333, -0.4286,  0.8462,  0.0526,  0.4839]])
Model Output Before Training:
tensor([ 0.0693,  0.0373,  0.0098, -0.0219, -0.0298, -0.0051,  0.0520,  0.0057,
         0.0658,  0.0654], grad_fn=<SqueezeBackward1>)
Expected Targets (N values):
tensor([ 0.2816, -0.8647, -0.3842, -0.4520,  0.4228,  0.4209,  0.7091,  0.5035,
         0.3522, -0.7028])
Inputs shape: torch.Size([10, 5])
Outputs shape: torch.Size([10])
Targets shape: torch.Size([10])


In [None]:

# Now we define the training loop
# We define the loss function (Mean Squared Error) and the optimizer (Adam)
# Then we iterate over the data in batches, perform forward pass, compute loss, backpropagate and update weights

# Define Mean Squared Error loss (for regression)
#loss_fn = nn.MSELoss()
# Define Cross Entropy loss (for classification)
loss_fn = nn.CrossEntropyLoss()

# Define Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)  # Reduce LR by gamma after step_size epochs
#scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)  # Cosine annealing scheduler, gradually reduces learning rate, specify eta_min=X for minimum LR value (default 0)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=10, factor=0.5)  # Reduce LR on plateau, reduce LR by factor of 0.5 if validation loss does not improve for 10 epochs

# Number of epochs
num_epochs = 100

# Check output for a single training step
optimizer.zero_grad()  # Reset gradients
test_outputs = model(test_inputs)  # Forward pass (predict N)
test_loss = loss_fn(test_outputs, test_targets)  # Compute loss
test_loss.backward()  # Backpropagation
optimizer.step()  # Update weights
print("Model Output After One Training Step:")
print(model(test_inputs))


start_time = time.time()  # Track how long training takes

# Training loop
for epoch in range(num_epochs):
    total_loss = 0  # Track total loss for the epoch

    for inputs, targets in dataloader:  # Iterate over batches
        optimizer.zero_grad()  # Reset gradients
        outputs = model(inputs)  # Forward pass (predict N)
        loss = loss_fn(outputs, targets)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        total_loss += loss.item()  # Keep track of loss

    # Step the scheduler every epoch
    #scheduler.step()
    # Use for the ReduceLROnPlateau scheduler
    scheduler.step(total_loss)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}, LR: {scheduler.get_last_lr()[0]}")  # Print progress

print(f"Training completed in {time.time() - start_time:.2f} seconds")  # Print total time

Model Output After One Training Step:
tensor([ 0.1953,  0.0301, -0.0878, -0.1462,  0.0515, -0.0039,  0.3470,  0.1609,
         0.1694, -0.1073], grad_fn=<SqueezeBackward1>)
Epoch 1/100, Loss: 0.1945, LR: 0.001
Epoch 2/100, Loss: 0.1336, LR: 0.001
Epoch 3/100, Loss: 0.0891, LR: 0.001
Epoch 4/100, Loss: 0.0578, LR: 0.001
Epoch 5/100, Loss: 0.0380, LR: 0.001
Epoch 6/100, Loss: 0.0277, LR: 0.001
Epoch 7/100, Loss: 0.0235, LR: 0.001
Epoch 8/100, Loss: 0.0213, LR: 0.001
Epoch 9/100, Loss: 0.0181, LR: 0.001
Epoch 10/100, Loss: 0.0137, LR: 0.001
Epoch 11/100, Loss: 0.0090, LR: 0.001
Epoch 12/100, Loss: 0.0052, LR: 0.001
Epoch 13/100, Loss: 0.0027, LR: 0.001
Epoch 14/100, Loss: 0.0016, LR: 0.001
Epoch 15/100, Loss: 0.0015, LR: 0.001
Epoch 16/100, Loss: 0.0021, LR: 0.001
Epoch 17/100, Loss: 0.0027, LR: 0.001
Epoch 18/100, Loss: 0.0032, LR: 0.001
Epoch 19/100, Loss: 0.0031, LR: 0.001
Epoch 20/100, Loss: 0.0028, LR: 0.001
Epoch 21/100, Loss: 0.0023, LR: 0.001
Epoch 22/100, Loss: 0.0017, LR: 0.001


In [12]:

# Now we evaluate the model on a new test dataset
# This dataset was not seen by the model during training
# We will use the trained model to predict the number N from the moduli
# Generate new test dataset (never seen before)
num_test_samples = 10  # Number of test examples

test_samples = []
for _ in range(num_test_samples):
    N = random.randint(0, P - 1)  # New random integer N
    normalized_remainders = [2*(N % p) / p - 1 for p in primes]  # Normalized moduli
    target = (N - (P / 2)) / (P / 2)  # Normalized target
    test_samples.append((torch.tensor(normalized_remainders, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)))

# Convert test samples into a PyTorch Dataset and DataLoader
test_dataset = ModuloDataset(num_samples=num_test_samples, myprimes=primes)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Evaluate model on test data
model.eval()  # Set model to evaluation mode
predictions = []
true_values = []

with torch.no_grad():  # Disable gradient calculations for efficiency
    for inputs, targets in test_loader:
        outputs = model(inputs)  # Get model predictions
        predictions.extend(outputs.tolist())  # Store predicted values
        true_values.extend(targets.tolist())  # Store true values

# Convert back to original scale
true_values = [(t * (P / 2)) + (P / 2) for t in true_values]
predictions = [(p * (P / 2)) + (P / 2) for p in predictions]

# Print first 10 predictions vs actual values
for i in range(10):
    print(f"True N: {int(true_values[i])}, Predicted N: {int(predictions[i])}")

True N: 12056, Predicted N: 32298
True N: 30140, Predicted N: 69283
True N: 44814, Predicted N: 99066
True N: 137700, Predicted N: 56465
True N: 108445, Predicted N: 135402
True N: 142177, Predicted N: 150794
True N: 100557, Predicted N: 159804
True N: 19161, Predicted N: 160711
True N: 64601, Predicted N: 123332
True N: 30270, Predicted N: 82207
