In [5]:
# build transformer
from config import get_config
from model import build_transformer
from train import get_ds

In [3]:
# Load the project configuration from config.get_config(); later cells index
# it by string key (e.g. config["seq_len"], config["d_model"]).
config = get_config()

In [6]:
# Unpack the four values returned by get_ds(config). The model-sizing cells
# below only use the two tokenizers (for their vocabulary sizes).
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

Max length of the src sentence: 309
Max length of the tgt sentence: 274


In [23]:
import torch
def format_number(num):
    """Format a number compactly with a K/M/B suffix and two decimals.

    Args:
        num: An int or float to format.

    Returns:
        str: ``num`` divided by 1e9, 1e6 or 1e3 and suffixed with ``B``,
        ``M`` or ``K`` when its magnitude reaches that threshold (largest
        threshold wins); otherwise ``str(num)`` unchanged.
    """
    # Choose the suffix from the magnitude so negative values are abbreviated
    # as well (e.g. -2_500_000 -> "-2.50M"); dividing `num` itself keeps the sign.
    magnitude = abs(num)
    for threshold, suffix in ((1e9, "B"), (1e6, "M"), (1e3, "K")):
        if magnitude >= threshold:
            return f"{num / threshold:.2f}{suffix}"
    return str(num)


def count_model_parameters(model):
    """Summarize a model's parameter counts and estimated parameter memory.

    Only tensors yielded by ``model.parameters()`` are counted.

    Args:
        model: Object whose ``parameters()`` yields tensors exposing
            ``numel()``, ``element_size()`` and ``requires_grad``
            (e.g. a ``torch.nn.Module``).

    Returns:
        dict: Human-readable strings under the keys ``"total_params"``,
        ``"trainable_params"``, ``"non_trainable_params"`` (each formatted
        with ``format_number``) and ``"estimated_memory_mb"``.
    """
    total_params = 0
    trainable_params = 0
    memory_bytes = 0

    # Single pass over the parameters instead of iterating the model twice.
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count
        # Use each tensor's actual element size rather than assuming 4 bytes,
        # so fp16/bf16/float64 parameters are estimated correctly. For an
        # all-float32 model this equals total_params * 4.
        memory_bytes += count * param.element_size()

    memory_mb = memory_bytes / 1e6  # Decimal megabytes (10**6 bytes)

    return {
        "total_params": format_number(total_params),
        "trainable_params": format_number(trainable_params),
        "non_trainable_params": format_number(total_params - trainable_params),
        "estimated_memory_mb": f"{memory_mb:.2f} MB"
    }

In [31]:
# Display the active configuration; the model-size cells below read
# seq_len, d_model, num_heads, d_ff, dropout and num_layers from it.
config

{'batch_size': 8,
 'num_epochs': 10,
 'lr': 0.0001,
 'seq_len': 350,
 'd_model': 512,
 'lang_src': 'en',
 'lang_tgt': 'it',
 'model_folder': 'weights',
 'model_basename': 'tmodel_',
 'preload': None,
 'tokenizer_file': 'tokenizer_{0}.json',
 'experiment_name': 'runs/tmodel',
 'num_layers': 3,
 'num_heads': 4,
 'd_ff': 1024,
 'dropout': 0.1}

In [30]:
# Baseline build: every architecture hyperparameter is read from `config`.
transformer = build_transformer(
    tokenizer_src.get_vocab_size(),
    tokenizer_tgt.get_vocab_size(),
    src_seq_len=config["seq_len"],
    tgt_seq_len=config["seq_len"],
    d_model=config["d_model"],
    h=config["num_heads"],
    d_ff=config["d_ff"],
    dropout=config["dropout"],
    N=config["num_layers"],
)

# Last expression of the cell, so the summary dict is shown as its output.
count_model_parameters(transformer)

{'total_params': '46.82M',
 'trainable_params': '46.82M',
 'non_trainable_params': '0',
 'estimated_memory_mb': '187.27 MB'}

In [32]:
# Variant: h is hard-coded to 8 instead of config["num_heads"];
# all other hyperparameters still come from `config`.
transformer = build_transformer(
    tokenizer_src.get_vocab_size(),
    tokenizer_tgt.get_vocab_size(),
    src_seq_len=config["seq_len"],
    tgt_seq_len=config["seq_len"],
    d_model=config["d_model"],
    h=8,
    d_ff=config["d_ff"],
    dropout=config["dropout"],
    N=config["num_layers"],
)

# Last expression of the cell, so the summary dict is shown as its output.
count_model_parameters(transformer)


{'total_params': '46.82M',
 'trainable_params': '46.82M',
 'non_trainable_params': '0',
 'estimated_memory_mb': '187.27 MB'}

In [33]:
# Variant: h=8 and N=6 are hard-coded (overriding config["num_heads"] and
# config["num_layers"]); d_ff and the rest still come from `config`.
transformer = build_transformer(
    tokenizer_src.get_vocab_size(),
    tokenizer_tgt.get_vocab_size(),
    src_seq_len=config["seq_len"],
    tgt_seq_len=config["seq_len"],
    d_model=config["d_model"],
    h=8,
    d_ff=config["d_ff"],
    dropout=config["dropout"],
    N=6,
)

# Last expression of the cell, so the summary dict is shown as its output.
count_model_parameters(transformer)


{'total_params': '62.57M',
 'trainable_params': '62.57M',
 'non_trainable_params': '0',
 'estimated_memory_mb': '250.30 MB'}

In [35]:
# Variant: h=8, d_ff=2048 and N=6 are hard-coded; sequence length, d_model
# and dropout still come from `config`.
transformer = build_transformer(
    tokenizer_src.get_vocab_size(),
    tokenizer_tgt.get_vocab_size(),
    src_seq_len=config["seq_len"],
    tgt_seq_len=config["seq_len"],
    d_model=config["d_model"],
    h=8,
    d_ff=2048,
    dropout=config["dropout"],
    N=6,
)

# Last expression of the cell, so the summary dict is shown as its output.
count_model_parameters(transformer)


{'total_params': '75.17M',
 'trainable_params': '75.17M',
 'non_trainable_params': '0',
 'estimated_memory_mb': '300.68 MB'}