# Attention Is All You Need: Implementation
## Problem Statement:
    create a language translator:
    English to Italian:
    Eng: He could not fail to recognize Yashvin ' s voice , though he did not see him .
    It: Non poté non distinguere la voce di Jašvin , ma non lo scorse .
    
    Task:
        1. Create the dataset
            a. Tokenization
        2. Format the dataset as required by the Transformer
            a. Embedding vectors
        3. Build the Transformer architecture code.
            a. Encoder
            b. Decoder
        4. Calculate the loss
        5. Inference code
            a. Auto-regressive decoding
        6. Create the training loop
        

## Installations:

In [3]:
# !pip install transformers
# !pip install datasets

## Dataset load and create:

In [1]:
import torch
#from datasets import load_dataset
from tokenizers import tokenizers
from tokenizers.models import WordLevel
from torch.utils.data import Dataset, DataLoader, random_split

import random
random.seed(42)

from pathlib import Path

def get_config():
    """Return the notebook's hyper-parameters and file-naming settings.

    A new dictionary is assembled on every call, so the caller may freely
    mutate the returned object.
    """
    # Optimisation / model-size settings.
    training = dict(
        batch_size=8,
        num_epochs=20,
        lr=10**-4,
        seq_len=350,
        d_model=512,
    )
    # Which dataset and language pair to load.
    data = dict(
        datasource='opus_books',
        lang_src="en",
        lang_tgt="it",
    )
    # Where weights, tokenizers and experiment logs are stored.
    artifacts = dict(
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return {**training, **data, **artifacts}

config = get_config()


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using Device: {device}")

Using Device: cpu


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Turns raw translation pairs into fixed-length tensors for a seq2seq Transformer.

    Every record of ``ds`` is read as ``record['translation'][lang]``. Both
    sentences are tokenized, framed with special tokens and right-padded with
    ``[PAD]`` so that each returned sequence holds exactly ``seq_len`` ids.

    Args:
        ds: Indexable collection of ``{'translation': {lang: text}}`` records.
        tokenizer_src: Tokenizer used to encode the source sentence.
        tokenizer_tgt: Tokenizer used to encode the target sentence. The ids of
            ``[SOS]``, ``[EOS]`` and ``[PAD]`` are looked up here.
        src_lang: Key of the source language inside ``'translation'``.
        tgt_lang: Key of the target language inside ``'translation'``.
        seq_len: Length every encoder/decoder/label sequence is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # The special-token ids come from the *target* tokenizer and are also
        # used to frame/pad the encoder input, so this relies on both
        # tokenizers assigning the same ids to the special tokens.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding ``count`` copies of the ``[PAD]`` id."""
        # torch.full allocates the run directly instead of materialising a
        # Python list of `count` tensor objects and converting them one by one.
        return torch.full((count,), self.pad_token.item(), dtype=torch.int64)

    def __getitem__(self, idx):
        """Build the model inputs, masks and label for the pair at ``idx``.

        Returns:
            dict with ``encoder_input``, ``decoder_input`` and ``label`` of shape
            ``(seq_len,)``, ``encoder_mask`` of shape ``(1, 1, seq_len)``,
            ``decoder_mask`` of shape ``(1, seq_len, seq_len)``, plus the raw
            ``src_text`` / ``tgt_text`` strings.

        Raises:
            ValueError: if either sentence does not fit into ``seq_len`` once
                the special tokens are added.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into token ids
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS] (hence -2); the decoder
        # input carries only [SOS] and the label only [EOS] (hence -1).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # A negative padding count means the sentence is too long for seq_len.
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        enc_padding = self._padding(enc_num_padding_tokens)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] tokens [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, enc_ids, self.eos_token, enc_padding], dim=0)

        # [SOS] tokens [PAD]...
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding], dim=0)

        # tokens [EOS] [PAD]...  (the decoder input shifted left by one position)
        label = torch.cat([dec_ids, self.eos_token, dec_padding], dim=0)

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            # 1 where the position holds a real token, 0 where it is padding
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            # padding mask combined with the lower-triangular causal mask
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    """Return a boolean ``(1, size, size)`` mask that is True on and below the diagonal.

    Position ``i`` may therefore attend to positions ``0..i`` but not to any
    later position.
    """
    lower_triangle = torch.tril(torch.ones((1, size, size), dtype=torch.int))
    return lower_triangle.bool()

## tokenization and Dataloader:

In [3]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` side of every translation pair in ``ds``."""
    yield from (pair['translation'][lang] for pair in ds)
def get_or_build_tokenizer(config, ds, lang):
    """Load the word-level tokenizer for ``lang`` from disk, training it first if needed.

    The file location is ``config['tokenizer_file']`` formatted with ``lang``.
    When that file is missing, a tokenizer is trained on the ``lang`` sentences
    of ``ds`` and saved there before being returned.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fast path: a tokenizer was already trained and saved for this language.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(
        special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
        min_frequency=2,
    )
    sentences = get_all_sentences(ds, lang)
    tokenizer.train_from_iterator(sentences, trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')

# Build tokenizers
tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

# Keep 90% for training, 10% for validation.
# random_split draws from torch's RNG, which `random.seed(42)` does not seed,
# so an explicitly seeded generator is passed to make the split reproducible.
train_ds_size = int(0.9 * len(ds_raw))
val_ds_size = len(ds_raw) - train_ds_size
split_generator = torch.Generator().manual_seed(42)
train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size], generator=split_generator)

train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

# Find the maximum length of each sentence in the source and target sentence
max_len_src = 0
max_len_tgt = 0

for item in ds_raw:
    src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
    max_len_src = max(max_len_src, len(src_ids))
    max_len_tgt = max(max_len_tgt, len(tgt_ids))

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')

# Fail fast here rather than in the middle of an epoch: BilingualDataset adds
# [SOS]+[EOS] to the source and one special token to the target, and raises
# ValueError for any sentence that does not fit into seq_len.
assert max_len_src + 2 <= config['seq_len'] and max_len_tgt + 1 <= config['seq_len'], \
    "config['seq_len'] is too small for the longest sentence in the dataset"


train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)


  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since opus_books couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-it' at C:\Users\Deevia\.cache\huggingface\datasets\opus_books\en-it\0.0.0\1f9f6191d0e91a3c539c2595e2fe48fc1420de9b (last modified on Fri Oct  4 17:02:26 2024).


Max length of source sentence: 309
Max length of target sentence: 274


In [191]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x1e2a4e65a00>

In [313]:
for i in train_dataloader:
  print(i)
  break

{'encoder_input': tensor([[   2,   87,   27,  ...,    1,    1,    1],
        [   2,   88,   14,  ...,    1,    1,    1],
        [   2,   87,  236,  ...,    1,    1,    1],
        ...,
        [   2, 1246,   10,  ...,    1,    1,    1],
        [   2,  240, 1930,  ...,    1,    1,    1],
        [   2,   12,    9,  ...,    1,    1,    1]]), 'decoder_input': tensor([[    2,    52,    22,  ...,     1,     1,     1],
        [    2,   399,   125,  ...,     1,     1,     1],
        [    2,    52,  6555,  ...,     1,     1,     1],
        ...,
        [    2,  2585,  1526,  ...,     1,     1,     1],
        [    2,   950,   436,  ...,     1,     1,     1],
        [    2,     9, 10814,  ...,     1,     1,     1]]), 'encoder_mask': tensor([[[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        ...,


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]]], dtype

In [9]:
i.keys()

dict_keys(['encoder_input', 'decoder_input', 'encoder_mask', 'decoder_mask', 'label', 'src_text', 'tgt_text'])

In [10]:
# Print every field of the batch once, followed by its tensor shape, or by its
# length for the non-tensor fields (the lists of raw source/target sentences).
# Checking for `.shape` explicitly replaces a bare `except:` that hid unrelated
# errors and caused the key to be printed twice for the text fields.
for k in i.keys():
  print(k)
  field = i[k]
  print(field.shape if hasattr(field, "shape") else len(field))

encoder_input
torch.Size([8, 350])
decoder_input
torch.Size([8, 350])
encoder_mask
torch.Size([8, 1, 1, 350])
decoder_mask
torch.Size([8, 1, 350, 350])
label
torch.Size([8, 350])
src_text
src_text
8
tgt_text
tgt_text
8


In [11]:
i['src_text']

['We took up the hoops, and began to drop them into the sockets placed for them.',
 'There were days when she was quite silent; but there were others when I could not account for the sounds she made.',
 "'I have informed my husband,' she began, and was unable to write any more.",
 'I will show her to you.',
 'Submitting to her mood, he felt in his whole being an ever-increasing stress of joy.',
 'She was standing among that group, very erect as usual, and was talking to the master of the house with her head slightly turned toward him, when Kitty approached.',
 "'Do you think she doesn't understand?' said Nicholas. 'She understands it all better than any of us.",
 'I left them to mourn over my folly, and now I am left to mourn under the consequences of it.']

In [12]:
i['tgt_text']

['Prendemmo i ferri, e cominciammo per fissarli negl’incastri pronti a riceverli.',
 'Poi sentivo pure quel mormorio più strano del riso. In certi giorni taceva e in altri faceva udire suoni inesprimibili.',
 'L’altra lettera la doveva scrivere a Vronskij. «Ho confessato a mio marito» scrisse, ma poi non seppe andare avanti.',
 'Elle est très gentille.',
 'Immedesimandosi nello stato d’animo di lei, egli sentiva in tutto il suo essere una tensione di felicità che diventava sempre più intensa.',
 'Stava in piedi, tenendosi, come sempre, straordinariamente diritta e quando Kitty si avvicinò al gruppo, parlava col padrone di casa volgendo lieve il capo verso di lui.',
 '— Tu credi che lei non capisca nulla? — disse Nikolaj. — Capisce tutto meglio di noi.',
 'Lasciai gli autori de’ miei giorni nel cordoglio che costarono ad essi le mie follie; or son lasciato nel cordoglio che mi costano le conseguenze di esse.']

In [13]:
i['label']

tensor([[16486,    32,  5807,  ...,     1,     1,     1],
        [  669,  1115,   297,  ...,     1,     1,     1],
        [  215,    10,   145,  ...,     1,     1,     1],
        ...,
        [ 2206,    16,   329,  ...,     1,     1,     1],
        [    9,   514,  4184,  ...,     1,     1,     1],
        [ 7298,    38, 13462,  ...,     1,     1,     1]])

## Batch creation in the training loop

In [155]:
from tqdm import tqdm
# Skeleton of the training loop: the model calls are still commented out, so this
# cell only pulls the first batch from the dataloader and moves it to `device`.
initial_epoch = 0
global_step = 0  # not used yet in this cell
preload = config['preload']  # not used yet in this cell

for epoch in range(initial_epoch, config['num_epochs']):
    torch.cuda.empty_cache()
    #model.train()
    batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")
    for batch in batch_iterator:

        encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
        decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
        encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
        decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

        # # Run the tensors through the encoder, decoder and the projection layer
        # encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
        # decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
        # proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

        # Compare the output with the label
        label = batch['label'].to(device) # (B, seq_len)
        # Stop after the first batch: the tensors above are only inspected in later cells.
        break
    # Stop after the first epoch as well.
    break


Processing Epoch 00:   0%|          | 0/3638 [00:00<?, ?it/s]


In [194]:
# max seq length = 350
# batch size = 8
encoder_input.shape, decoder_input.shape, encoder_mask.shape, decoder_mask.shape

(torch.Size([8, 350]),
 torch.Size([8, 350]),
 torch.Size([8, 1, 1, 350]),
 torch.Size([8, 1, 350, 350]))

In [17]:
# already Build tokenizers
# tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
# tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])


# vocab size for
# src == english
# tgt == italian

tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()

(15698, 22463)

## Input Embeddings Generation:

In [18]:
# input embeddings:

from torch import nn
e = nn.Embedding(20, 8)   # (vocab_size, embedding_dim) - the second argument is the vector size, not a sequence length
inp = torch.tensor([[1, 2, 4],[2, 3, 5]])  # input (2,3)

# input:  (2,3) -> (batch, seq_len) of token ids
# embedding weight matrix -> shape (vocab_size, embedding_dim) = (20, 8)
# outputs:   (2,3,8) -> one 8-dim vector per input token id

e(inp).shape, inp.shape, e.weight.shape

(torch.Size([2, 3, 8]), torch.Size([2, 3]), torch.Size([20, 8]))

In [19]:
import torch
import torch.nn as nn
import math

class InputEmbeddings(nn.Module):
  """Maps token ids to ``d_model``-dimensional vectors scaled by ``sqrt(d_model)``."""

  def __init__(self, d_model, vocab_size):
    super().__init__()
    self.d_model, self.vocab_size = d_model, vocab_size
    # One learnable row of size d_model per vocabulary entry.
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

  def forward(self,x):
    # (batch, seq_len) ids -> (batch, seq_len, d_model) vectors
    scale = math.sqrt(self.d_model)
    vectors = self.embedding(x)
    return vectors * scale


xx = InputEmbeddings(5, 20)
inp = torch.tensor([[1, 2, 4],[2, 3, 5]])
inp_emb = xx(inp)   # (batch, seq_length, emmb_vector_dim)

print(f"input: \n{inp}")
print(f"input: {inp.shape}")
print(f"emmd matrix {xx.embedding.weight.shape}")
print(f"input in embedding: {inp_emb.shape}")


input: 
tensor([[1, 2, 4],
        [2, 3, 5]])
input: torch.Size([2, 3])
emmd matrix torch.Size([20, 5])
input in embedding: torch.Size([2, 3, 5])


## Positional Encoding Generation:

![image.png](attachment:image.png)

In [20]:
# postional encoding
# input  (seq_length, embd_vector_dim)-> (src_seq_len, d_model)

class PostionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    A ``(1, seq_len, d_model)`` table is pre-computed once and stored as the
    non-trainable buffer ``pe``: even feature indices hold
    ``sin(pos * div)`` and odd feature indices hold ``cos(pos * div)``, where
    ``div = exp(-2i * ln(10000) / d_model)``.

    Args:
        d_model: Size of each embedding vector. Odd values are supported.
        seq_len: Number of positions to pre-compute.
    """

    def __init__(self,d_model, seq_len):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Table of shape (seq_len, d_model), filled column-wise below.
        pe = torch.zeros(seq_len, d_model)
        pos = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)  # (seq_len, 1)

        # div = 1 / 10000^(2i / d_model), computed as exp(-2i * ln(10000) / d_model)
        mid = torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        div = torch.exp(mid)
        angles = pos * div  # (seq_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one odd-index column fewer than even-index
        # columns, so the cosine block is trimmed to d_model // 2 columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (seq_len, d_model) -> (1, seq_len, d_model) so it broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Registered as a buffer: stored on the module but not a trainable parameter.
        self.register_buffer('pe',pe)


    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        ``x`` has shape (batch, seq_len, d_model); the table is sliced to the
        actual sequence length of ``x`` before the addition.
        """
        x = x + self.pe[:, :x.shape[1], :]

        return x

        

# Demo sizes. They are passed to the modules below instead of repeating the
# literals, so changing one value here updates the whole demo consistently.
batch_size = 2
seq_len = 3

# embedding matrix
vocab_size = 26
emmeding_vec_dim = 10

xx = InputEmbeddings(emmeding_vec_dim, vocab_size)
inp = torch.tensor([[1, 2, 4],[2, 3, 5]])
assert inp.shape == (batch_size, seq_len)
inp_emb = xx(inp)   # (batch, seq_length, emmb_vector_dim)

pp = PostionalEncoding(emmeding_vec_dim, seq_len)
inp_emb_with_pos = pp(inp_emb)   # same shape as inp_emb


print(f"input: \n{inp}")
print(f"input: {inp.shape}")
print(f"emmd matrix {xx.embedding.weight.shape}")
print(f"input in embedding: {inp_emb.shape}")
print(f"------")
print(f"{inp_emb_with_pos.shape=}")


input: 
tensor([[1, 2, 4],
        [2, 3, 5]])
input: torch.Size([2, 3])
emmd matrix torch.Size([26, 10])
input in embedding: torch.Size([2, 3, 10])
------
inp_emb_with_pos.shape=torch.Size([2, 3, 10])


In [21]:
# Explanation of the slicing / unsqueeze / buffer operations used above.

# List slicing: l[start:stop:step]
l = [i for i in range(10)]
print(l)
print(l[2:6])
# [0::2] selects every second element starting from index 0
print(l[0::2])
print(l[0::3])
print(l[1::2])
print(l[1::3])
print(l[::-1])  # reverse the list

tn = torch.tensor([[1,2,3,4,5],[10,20,30,40,50]])
print(f"{tn=}")
print(f"{tn[:,2]=}")
print(f"{tn[:,:1]=}")
print(f"{tn[:,0::2]=}") # pick the even column indices
print(f"{tn[: , 1::2]=}") # pick the odd column indices

print("-------------------")
# unsqueeze operation:
print(torch.arange(0,4).shape)
# convert it to 2D by adding a trailing dimension of size 1
print(torch.arange(0,4).unsqueeze(1).shape)


# In PyTorch, self.register_buffer('pe', pe)
# is used within an nn.Module class to register a
# tensor (pe in this case) as a buffer of the module.
# Buffers are like parameters, but they are not trainable:
# they are not returned by parameters(), so the optimizer never
# updates them, yet they are saved in the state_dict and follow
# the module when it is moved with .to(device).


print(f"-------------------------")
# torch.squeeze()
#---------------------------
# Removes dimensions of size 1 from a tensor.
# If no dimension is specified, squeeze() removes all
# dimensions with size 1.
# If a specific dimension is provided (e.g., dim=1),
# it will remove the dimension only if it has size 1.
t = torch.tensor([[[1], [2], [3]]])
print(f"{t.shape} \n{t.squeeze().shape} \n{t.squeeze(0).shape}\n{t.squeeze(1).shape}")

# torch.unsqueeze()
#------------------------------
# Adds a new dimension of size 1 to a tensor at
# the specified dim.
# This is useful for expanding the dimensions of
# a tensor when you need to meet shape requirements
# for broadcasting or other operations.

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[2, 3, 4, 5]
[0, 2, 4, 6, 8]
[0, 3, 6, 9]
[1, 3, 5, 7, 9]
[1, 4, 7]
[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
tn=tensor([[ 1,  2,  3,  4,  5],
        [10, 20, 30, 40, 50]])
tn[:,2]=tensor([ 3, 30])
tn[:,:1]=tensor([[ 1],
        [10]])
tn[:,0::2]=tensor([[ 1,  3,  5],
        [10, 30, 50]])
tn[: , 1::2]=tensor([[ 2,  4],
        [20, 40]])
-------------------
torch.Size([4])
torch.Size([4, 1])
-------------------------
torch.Size([1, 3, 1]) 
torch.Size([3]) 
torch.Size([3, 1])
torch.Size([1, 3, 1])


### [ model.py] version 1

In [None]:
# my model.py

import torch
import math
from torch import nn

class InputEmbeddings(nn.Module):
  """Maps token ids to ``d_model``-dimensional vectors scaled by ``sqrt(d_model)``."""

  def __init__(self, d_model, vocab_size):
    super().__init__()
    self.d_model, self.vocab_size = d_model, vocab_size
    # One learnable row of size d_model per vocabulary entry.
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

  def forward(self,x):
    # (batch, seq_len) ids -> (batch, seq_len, d_model) vectors
    scale = math.sqrt(self.d_model)
    vectors = self.embedding(x)
    return vectors * scale
  

class PostionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    A ``(1, seq_len, d_model)`` table is pre-computed once and stored as the
    non-trainable buffer ``pe``: even feature indices hold
    ``sin(pos * div)`` and odd feature indices hold ``cos(pos * div)``, where
    ``div = exp(-2i * ln(10000) / d_model)``.

    Args:
        d_model: Size of each embedding vector. Odd values are supported.
        seq_len: Number of positions to pre-compute.
    """

    def __init__(self,d_model, seq_len):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Table of shape (seq_len, d_model), filled column-wise below.
        pe = torch.zeros(seq_len, d_model)
        pos = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)  # (seq_len, 1)

        # div = 1 / 10000^(2i / d_model), computed as exp(-2i * ln(10000) / d_model)
        mid = torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        div = torch.exp(mid)
        angles = pos * div  # (seq_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one odd-index column fewer than even-index
        # columns, so the cosine block is trimmed to d_model // 2 columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (seq_len, d_model) -> (1, seq_len, d_model) so it broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Registered as a buffer: stored on the module but not a trainable parameter.
        self.register_buffer('pe',pe)


    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        ``x`` has shape (batch, seq_len, d_model); the table is sliced to the
        actual sequence length of ``x`` before the addition.
        """
        x = x + self.pe[:, :x.shape[1], :]

        return x
    


class Transformer(nn.Module):
    """Version-1 model wrapper: only the source embedding + positional-encoding path exists so far."""

    def __init__(self,
                src_embed: InputEmbeddings,
                src_pos: PostionalEncoding
                ):
        super().__init__()
        self.src_embed, self.src_pos = src_embed, src_pos

    def encode(self, src, src_mask=None):
        """Embed the source token ids and add positional encodings.

        ``src_mask`` is accepted for interface compatibility but is not used
        by this version.
        """
        embedded = self.src_embed(src)
        return self.src_pos(embedded)
    


def build_transformer(src_vocab_size:int,
                      src_seq_len:int,
                      d_model:int=512,
                      ):
    """Assemble the version-1 ``Transformer`` from a source embedding and a positional encoding.

    Args:
        src_vocab_size: Number of entries in the source vocabulary.
        src_seq_len: Number of positions the positional encoding pre-computes.
        d_model: Size of each embedding vector.
    """
    # Positional arguments are evaluated left to right, so the embedding layer
    # is still created before the positional encoding.
    return Transformer(
        InputEmbeddings(d_model, src_vocab_size),
        PostionalEncoding(d_model, src_seq_len),
    )
   

In [153]:
# my train.py

import torch
from mymodel import build_transformer

# Demo sizes; they are passed to build_transformer below instead of repeating
# the literals, so the model always matches the example input.
seq_len = 3
batch_size = 2
vocab_size = 26
d_model = 10
encoder_input = torch.tensor([[1, 2, 4],[2, 3, 5]])
assert encoder_input.shape == (batch_size, seq_len)
encoder_mask = None  # no mask is needed by the version-1 encode()


model = build_transformer(src_vocab_size=vocab_size,
                          src_seq_len=seq_len,
                          d_model=d_model)

pos_out = model.encode(encoder_input, encoder_mask)

print(pos_out.shape) # batch, seq_len, d_model

torch.Size([2, 3, 10])


In [25]:
# use of nn.ModuleList
#--------------------------------

class MyModule(nn.Module):
    """Toy module showing that ``nn.ModuleList`` can be both iterated and indexed."""

    def __init__(self):
        super().__init__()
        layers = [nn.Linear(10, 10) for _ in range(10)]
        self.linears = nn.ModuleList(layers)

    def forward(self, x):
        # At step `idx` the input goes through layer `idx // 2` (indexed access)
        # and through layer `idx` (iteration); the two results are summed.
        for idx, layer in enumerate(self.linears):
            indexed_out = self.linears[idx // 2](x)
            x = indexed_out + layer(x)
        return x

from torch import nn
xx =nn.Linear(10, 10)
yy = InputEmbeddings(10, 26)
lr = nn.ModuleList([xx,xx,yy])
print('lr',lr)
for ii in lr:
    print(ii)

lr ModuleList(
  (0-1): 2 x Linear(in_features=10, out_features=10, bias=True)
  (2): InputEmbeddings(
    (embedding): Embedding(26, 10)
  )
)
Linear(in_features=10, out_features=10, bias=True)
Linear(in_features=10, out_features=10, bias=True)
InputEmbeddings(
  (embedding): Embedding(26, 10)
)


## Self Attention Block 

In [None]:
# self attention block
# input seq:

# input sentence -> input tokens -> token indices
# token indices -> [InputEmbeddings] -> (E) input embedding matrix
# (E) input embedding matrix -> [PostionalEncoding (P)] -> n_E = E + P = position-aware input embedding matrix


# (n_E) position-aware input embedding matrix
# For self-attention the same matrix is used for all three inputs: n_E = q = k = v
    # q = n_E
    # k = n_E
    # v = n_E



# q, k, v -> [MultHeadAttention]


![image-2.png](attachment:image-2.png)

In [None]:
# MultiheadAttention:
#-----------------------------
# Version 1: self attention 

class MultiHeadAttentionBlock(nn.Module):
    """Version 1: single scaled dot-product self-attention.

    The query/key/value projection layers are created here, but this version
    defines no ``forward``; only the ``attention`` helper is usable.

    Args:
        d_model: Size of each embedding vector.
        h: Number of heads; ``d_model`` must be divisible by it.
    """

    def __init__(self,
                d_model:int,
                h:int):
        super().__init__()
        self.d_model = d_model  # embedding vector size
        # number of heads: the embedding is split into h parts of size d_k = d_model / h
        self.h = h

        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model, bias = False)
        self.w_k = nn.Linear(d_model, d_model, bias = False)
        self.w_v = nn.Linear(d_model, d_model, bias = False)



    def attention(self, query, key, value, mask = None):
        """Compute ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

        Args:
            query, key, value: Tensors whose last two dimensions are
                ``(seq_len, d_k)``.
            mask: Optional tensor broadcastable to the score matrix; positions
                where it equals 0 receive (numerically) zero attention weight.

        Returns:
            Tuple ``(output, attention_scores)``; the scores are the
            post-softmax weights.
        """
        d_k = query.shape[-1]
        # Transposing the last two dims works for both (batch, seq, d) and
        # (batch, h, seq, d) inputs.
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)

        # For decoder self-attention a position must not see future positions:
        # masked entries get a very negative score so softmax maps them to ~0.
        if mask is not None:
            # masked_fill is out-of-place, so its result has to be kept.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        return (attention_scores @ value), attention_scores


    

In [86]:

# self attention

query = key = value = pos_out

d_k = query.shape[-1]  # embd vector dim

attention_scores = (query @ key.transpose(1,2)) / math.sqrt(d_k)
attention_scores = attention_scores.softmax(dim = -1)

#output
self_att_out = attention_scores @ value


print(f"input embed after POS: { pos_out.shape}")
print(f"----------------")
print(f"----------------")
print(f"batch size: {pos_out.shape[0]} ")
print(f"seq len: {pos_out.shape[1]} ")
print(f"embed vec dim d_model: {pos_out.shape[-1]}")
print(f"----------------")
print(f"{query.shape=}")
print(f"{key.shape=}")
print(f"{value.shape=}")
print(f"----------------")
print(f"key.T: {key.transpose(1,2).shape}")
print(f"----------------")
print(f"{attention_scores.shape=}")
print(f"attention score prob for 2 batches : \n{attention_scores.view(6,3)}")
print(f"----------------")
print(f"----------------")
print(f"final output from self attention: {self_att_out.shape}")

input embed after POS: torch.Size([2, 3, 10])
----------------
----------------
batch size: 2 
seq len: 3 
embed vec dim d_model: 10
----------------
query.shape=torch.Size([2, 3, 10])
key.shape=torch.Size([2, 3, 10])
value.shape=torch.Size([2, 3, 10])
----------------
key.T: torch.Size([2, 10, 3])
----------------
attention_scores.shape=torch.Size([2, 3, 3])
attention score prob for 2 batches : 
tensor([[1.0000e+00, 6.3445e-21, 1.9617e-17],
        [1.6062e-27, 1.0000e+00, 2.0310e-19],
        [5.8817e-07, 2.4053e-02, 9.7595e-01],
        [1.0000e+00, 7.1869e-20, 1.8708e-19],
        [2.2071e-14, 1.0000e+00, 1.0202e-22],
        [9.0434e-13, 1.6058e-21, 1.0000e+00]], grad_fn=<ViewBackward0>)
----------------
----------------
final output from self attention: torch.Size([2, 3, 10])


![image.png](attachment:image.png)

In [175]:
# MultiheadAttention:
#-----------------------------
# Version 1: self attention 
# Version 2: multihead self attention
# will break the emed dim in some parts 
# and calcualte attention for all part 
# than concat them


class MultiHeadAttentionBlock(nn.Module):
    """Version 2: multi-head scaled dot-product attention.

    The projected query/key/value tensors are split along the embedding
    dimension into ``h`` heads of size ``d_k = d_model // h``, attention is
    computed per head, and the heads are concatenated back to ``d_model``.

    NOTE(review): this version has no output projection after the heads are
    concatenated; confirm whether one is meant to be added later.

    Args:
        d_model: Size of each embedding vector.
        h: Number of heads; ``d_model`` must be divisible by it.
    """

    def __init__(self,
                d_model:int,
                h:int):
        super().__init__()
        self.d_model = d_model  # embedding vector size
        # number of heads: the embedding is split into h parts of size d_k = d_model / h
        self.h = h

        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model, bias = False)
        self.w_k = nn.Linear(d_model, d_model, bias = False)
        self.w_v = nn.Linear(d_model, d_model, bias = False)



    @staticmethod
    def attention(query, key, value, mask = None):
        """Compute ``softmax(Q @ K^T / sqrt(d_k)) @ V`` over the last two dimensions.

        Args:
            query, key, value: Tensors whose last two dimensions are
                ``(seq_len, d_k)``.
            mask: Optional tensor broadcastable to the score matrix; positions
                where it equals 0 receive (numerically) zero attention weight.

        Returns:
            Tuple ``(output, attention_scores)``; the scores are the
            post-softmax weights.
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)

        # For decoder self-attention a position must not see future positions:
        # masked entries get a very negative score so softmax maps them to ~0.
        if mask is not None:
            # masked_fill is out-of-place, so its result has to be kept.
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        return attention_scores @ value, attention_scores


    def forward(self, q, k, v, mask = None):
        """Project q/k/v, attend per head and merge the heads.

        Inputs and output have shape (batch, seq_len, d_model). The per-head
        attention weights of the last call are kept in ``self.attention_scores``.
        """
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask)
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return x


query = pos_out
key = pos_out
value = pos_out
m = MultiHeadAttentionBlock(d_model = 10, h = 2)
multi_hed_att_out = m(query, key, value, mask= None)
multi_hed_att_out.shape

torch.Size([2, 3, 10])

In [151]:
# self attention
query = pos_out
key = pos_out
value = pos_out
print(f"input embed after POS: { pos_out.shape}")
print(f"----------------")
print(f"----------------")
print(f"batch size: {pos_out.shape[0]} ")
print(f"seq len: {pos_out.shape[1]} ")
print(f"embed vec dim d_model: {pos_out.shape[-1]}")
print(f"----------------")
print(f"input without multihead:-->")
print(f"{query.shape=}")
print(f"{key.shape=}")
print(f"{value.shape=}")

h = 2
d_k = int(query.shape[-1] / h)
print(f"----------------")
print(f"no of heads h: {h}")
print(f"sun embed bvector dim d_k: {d_k}")

# note here 
# q = k = v = shape ( batch , seq_len , d_model)
# for multihead
# we break d_model in h parts---> d_k = d_model / h
# now 
# q = k = v = shape ( batch , seq_len , h, d_k)

# now before passing it to multihead attention '
# change dim(-3,-2)
# ( batch , h, seq_len , d_k) from ( batch , seq_len , h, d_k)

# multi had attention shape same ( seq_len , seq_len)


# now after passing it throught multihead attention 
# final out put shape 
# ( batch , h, seq_len , d_k)



query = query.view(query.shape[0], query.shape[1], h, d_k ).transpose(1,2)
key = key.view(key.shape[0], key.shape[1], h, d_k ).transpose(1,2)
value  = value.view(value.shape[0], value.shape[1], h, d_k ).transpose(1,2)

attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)
attention_scores = attention_scores.softmax(dim = -1)

#output
self_att_out = attention_scores @ value

self_att_out2 = self_att_out.transpose(1,2).contiguous().view(self_att_out.shape[0], -1, h*d_k)






print(f"----------------")
print(f"input with multihead:-->")
print(f"{query.shape=}")
print(f"{key.shape=}")
print(f"{value.shape=}")

print(f"key.T: {key.transpose(-2,-1).shape}")
print(f"----------------")
print(f"{attention_scores.shape=}")
print(f"----------------")
print(f"----------------")
print(f"output from self attention: {self_att_out.shape}")
print(f"final output from self attention: {self_att_out2.shape}")

input embed after POS: torch.Size([2, 3, 10])
----------------
----------------
batch size: 2 
seq len: 3 
embed vec dim d_model: 10
----------------
input without multihead:-->
query.shape=torch.Size([2, 3, 10])
key.shape=torch.Size([2, 3, 10])
value.shape=torch.Size([2, 3, 10])
----------------
no of heads h: 2
sun embed bvector dim d_k: 5
----------------
input with multihead:-->
query.shape=torch.Size([2, 2, 3, 5])
key.shape=torch.Size([2, 2, 3, 5])
value.shape=torch.Size([2, 2, 3, 5])
key.T: torch.Size([2, 2, 5, 3])
----------------
attention_scores.shape=torch.Size([2, 2, 3, 3])
----------------
----------------
output from self attention: torch.Size([2, 2, 3, 5])
final output from self attention: torch.Size([2, 3, 10])


## FeedForward Block

In [None]:
# next block 
# feedforward block
# input :
# (batch, seq_len, d_model)
# modification
# (bach, seq_len, d_ff)
# finally converted back to 
# (batch, seq_len, d_model)

# (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output vectors.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer (w1, b1).
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        # Projection back to the model width (w2, b2).
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map (batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)



In [176]:
# Sanity check: shape of the attention output that is fed to the feed-forward demo below.
multi_hed_att_out.shape

torch.Size([2, 3, 10])

In [184]:
# Step-by-step walkthrough of the feed-forward block on the attention output.
# input:
#(batch, seq_len, d_model)
inp_feed = multi_hed_att_out
d_ff = 60
# Layer sizes follow the input width and d_ff instead of repeating literals.
l1 = nn.Linear(inp_feed.shape[-1], d_ff)
dp = nn.Dropout(0.2)
l2 = nn.Linear(d_ff, inp_feed.shape[-1])

# Same order as FeedForwardBlock.forward: linear_1 -> ReLU -> dropout -> linear_2.
# ReLU does not change the shape, so the printed shapes are unaffected.
o1 = l1(inp_feed)
o2 = dp(torch.relu(o1))
o3 = l2(o2)

print(f"input to feedforward: {inp_feed.shape}")
print(f"----------------")
print(f"----------------")
print(f"batch size: {inp_feed.shape[0]} ")
print(f"seq len: {inp_feed.shape[1]} ")
print(f"embed vec dim d_model: {inp_feed.shape[-1]}")
print(f"----------------")
print(f"linear1: {o1.shape=}")
print(f"dropout: {o2.shape=}")
print(f"linera2: {o3.shape=}")

print(f"----------------")
print(f"----------------")
print(f"final output {o3.shape}")


input to feedforward: torch.Size([2, 3, 10])
----------------
----------------
batch size: 2 
seq len: 3 
embed vec dim d_model: 10
----------------
linear1: o1.shape=torch.Size([2, 3, 60])
dropout: o2.shape=torch.Size([2, 3, 60])
linera2: o3.shape=torch.Size([2, 3, 10])
----------------
----------------
final output torch.Size([2, 3, 10])


In [None]:
# till now 
#-----------------------------
# implemented
# code structure 

# methods:

# 1. build_transformer
        # args:
        # seq_len, d_model, d_ff, dropout, h, N
        # class objects instantiated ->
        # InputEmbeddings
        # PostionalEncoding

        # Encoder block
                # contains MultiHeadAttention, FeedForward

        # Encoder -> contains N Encoder blocks

        # Transformer -> main class
            # will call ->
            # InputEmbeddings, PostionalEncoding
            # Encoder ->
                    # encoder blocks ->
                                    # multi-head attention
                                    # feed-forward





### [ model.py] version 2

In [9]:
# my_model.py
#--------------------------------
import torch
import math
from torch import nn

class InputEmbeddings(nn.Module):
    """Token-id lookup table whose vectors are scaled by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab_size: Number of rows in the lookup table.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.vocab_size = vocab_size
        self.d_model = d_model

    def forward(self,x):
        """Return ``embedding(x) * sqrt(d_model)`` for a tensor of token ids."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale
  
class PostionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Size of each embedding vector (last dimension of the input).
            Odd values are supported.
        seq_len: Number of positions precomputed in the encoding table.
    """

    def __init__(self,d_model, seq_len):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        pos = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)

        # div = 1 / 10000**(2i / d_model), computed as exp(-2i * log(10000) / d_model).
        mid = torch.arange(0, d_model, 2, dtype=torch.float) *( -math.log(10000.0) / d_model)
        div = torch.exp(mid)

        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the frequency vector to match (no-op when d_model is even).
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])

        # (seq_len, d_model) -> (1, seq_len, d_model) so it broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Buffer: stored on the module but not a trainable parameter.
        self.register_buffer('pe',pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        ``x`` has shape (batch, seq_len, d_model); ``pe`` has shape (1, seq_len, d_model).
        """
        x = x + self.pe[:, :x.shape[1], :]

        return x
    
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention (query/key/value projections only).

    Args:
        d_model: Embedding width; must be divisible by ``h``.
        h: Number of attention heads; each head works on ``d_k = d_model // h`` features.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self,
                d_model:int,
                h:int):
        super().__init__()
        self.d_model = d_model  # embedding vector dim
        self.h = h  # number of heads

        # The message is a plain string (it used to be a set literal with a typo).
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model, bias = False)
        self.w_k = nn.Linear(d_model, d_model, bias = False)
        self.w_v = nn.Linear(d_model, d_model, bias = False)

    @staticmethod
    def attention(query, key, value, mask = None):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: Tensors whose last dimension is ``d_k``.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)

        # Masked positions get a very negative score so softmax gives them ~0 weight.
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        return attention_scores @ value, attention_scores

    def forward(self, q, k, v, mask = None):
        """Project q/k/v, attend per head, and merge the heads.

        Inputs are (batch, seq_len, d_model); the result is (batch, seq_len, d_model).
        The attention weights of the last call are kept in ``self.attention_scores``.
        """
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask)
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return x

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output vectors.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer (w1, b1).
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        # Projection back to the model width (w2, b2).
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map (batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block.

    Args:
        self_attention_block: Module called as ``(q, k, v, mask)``.
        feed_forward_block: Module applied to the attention output.
    """

    def __init__(self,
                self_attention_block,
                feed_forward_block,
                ):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, mask=None):
        """Run self-attention over ``x`` (query = key = value) and then the feed-forward block.

        ``mask`` is forwarded to the attention block so masked (e.g. padding)
        positions are not attended to; ``None`` disables masking.
        """
        x = self.self_attention_block(x, x, x, mask)
        x = self.feed_forward_block(x)
        return x
    
class Encoder(nn.Module):
    """Stack of encoder layers applied one after another.

    Args:
        layers: ``nn.ModuleList`` of layers, each callable as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers

    def forward(self, x , mask=None):
        """Feed ``x`` through every layer in order, handing each one the same ``mask``."""
        for layer in self.layers:
            # Pass the mask on; it used to be dropped here.
            x = layer(x, mask)

        return x
      
class Transformer(nn.Module):
    """Encoder-only model wrapper: embedding -> positional encoding -> encoder stack."""

    def __init__(self, 
                src_embed: InputEmbeddings, 
                src_pos: PostionalEncoding,
                encoder: Encoder,
                ):
        super().__init__()
        self.src_embed = src_embed
        self.src_pos = src_pos
        self.encoder = encoder

    def encode(self, src, src_mask=None):
        """Embed the source token ids, add position information, and run the encoder."""
        embedded = self.src_embed(src)
        positioned = self.src_pos(embedded)
        return self.encoder(positioned, src_mask)
    
def build_transformer(src_vocab_size:int,
                      src_seq_len:int,
                      d_model:int=512,    # input embedding vector dim
                      d_ff: int = 2048,   # feed forward dim
                      dropout: float = 0.1,
                      h: int = 2,   # no of heads
                      N: int = 1,   # no of encoder blocks

                      ):
    """Assemble the encoder-only Transformer.

    Args:
        src_vocab_size: Number of tokens in the source vocabulary.
        src_seq_len: Number of positions in the positional-encoding table.
        d_model: Embedding width.
        d_ff: Hidden width of each feed-forward block.
        dropout: Dropout probability used inside the feed-forward blocks.
        h: Number of attention heads.
        N: Number of encoder blocks.

    Returns:
        A ``Transformer`` wired with embedding, positional encoding and encoder.
    """
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    src_pos = PostionalEncoding(d_model, src_seq_len)

    encoder_blocks = []
    for _ in range(N):
        # Fresh modules per layer: reusing one instance would tie the weights
        # of all N layers together.
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block)
        encoder_blocks.append(encoder_block)

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    transformer = Transformer(src_embed, src_pos,encoder)

    return transformer


In [None]:
# my_train.py
#----------------------
import torch
from mymodel import build_transformer

# Toy configuration for a quick encoder smoke test.
seq_len = 3
batch_size = 2
vocab_size = 26
d_model = 10
encoder_input = torch.tensor([[1, 2, 4],[2, 3, 5]])
encoder_mask = None

# Guard against the constants drifting away from the hand-written input.
assert encoder_input.shape == (batch_size, seq_len)

# Pass the constants declared above instead of repeating the literals.
model = build_transformer(src_vocab_size= vocab_size,
                          src_seq_len= seq_len,
                          d_model= d_model,
                          d_ff = 60,
                          N=1,
                          dropout=0.1,
                          h = 2
                          )

encoder_output = model.encode(encoder_input, encoder_mask)
print(encoder_output.shape)

## Decoder Implementation:

In [373]:
# Shapes of one full batch before it is shrunk for the walkthrough.
# max seq length = 350
# batch size = 8
encoder_input.shape, decoder_input.shape, encoder_mask.shape, decoder_mask.shape

(torch.Size([8, 350]),
 torch.Size([8, 350]),
 torch.Size([8, 1, 1, 350]),
 torch.Size([8, 1, 350, 350]))

In [316]:
# Preview the shapes obtained by keeping 2 sentences and the first 25 positions.
encoder_input[:2,:25].shape, decoder_input[:2,:25].shape, encoder_mask[:2,:,:1,:25].shape, decoder_mask[:2,:,:25,:25].shape

(torch.Size([2, 25]),
 torch.Size([2, 25]),
 torch.Size([2, 1, 1, 25]),
 torch.Size([2, 1, 25, 25]))

In [385]:
# new inputs to encoder and decoder:
# keep only the first 2 sentences and the first 25 positions of each tensor

# batch size = 2
# seq_len = 25

# NOTE: these assignments overwrite the full-size tensors of the same name.
encoder_input = encoder_input[:2,:25] 
decoder_input = decoder_input[:2,:25]
encoder_mask = encoder_mask[:2,:,:1,:25]
decoder_mask = decoder_mask[:2,:,:25,:25]

# Confirm the reduced shapes.
print(f"{encoder_input.shape=}")
print(f"{encoder_mask.shape=}")
print(f"{decoder_input.shape=}")
print(f"{decoder_mask.shape=}")

encoder_input.shape=torch.Size([2, 25])
encoder_mask.shape=torch.Size([2, 1, 1, 25])
decoder_input.shape=torch.Size([2, 25])
decoder_mask.shape=torch.Size([2, 1, 25, 25])


In [386]:
# tokenizer
# Build tokenizers
# already initialized
# One tokenizer per language, selected by config['lang_src'] / config['lang_tgt'].
# NOTE(review): get_or_build_tokenizer and ds_raw are defined earlier in the
# notebook (outside this excerpt).
tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])



In [434]:
# Report the vocabulary size of each tokenizer.
# (The bare `tokenizer_src.get_vocab_size()` call that used to open this cell
# was removed: its result was discarded and never displayed.)
print(f"tokenizer has its vocab size: \nwe have two tokenizes one for eng and other for italian: ")

print(f"vocab size: \neng: {tokenizer_src.get_vocab_size()} \nit: {tokenizer_tgt.get_vocab_size()}")

tokenizer has its vocab size: 
we have two tokenizes one for eng and other for italian: 
vocab size: 
eng: 15698 
it: 22463


In [435]:
print(f"tokenizer has its vocab size: \nwe have two tokenizes one for eng and other for italian: ")
print(f"vocab size: \neng: {tokenizer_src.get_vocab_size()} \nit: {tokenizer_tgt.get_vocab_size()}")


print(f"------------------------------------------")
print(f"------------------------------------------")

# Show tokens, ids and masks for each of the two demo sentences.
for s in range(2):
    src_ids = encoder_input[s].tolist()
    tgt_ids = decoder_input[s].tolist()

    # Join outside the f-string: reusing the same quote character inside an
    # f-string expression only parses on Python 3.12+.
    lst = [tokenizer_src.id_to_token(i) for i in src_ids]
    src_tokens = " ".join(lst)

    print(f"encoder input  exmple: \n{src_tokens}"   )
    print(f"{src_ids}<---- token ids")
    print(f"{encoder_mask[s].tolist()}<----- encoder mask")


    lst = [tokenizer_tgt.id_to_token(i) for i in tgt_ids]
    tgt_tokens = " ".join(lst)

    print(f"decoder input  exmple: \n{tgt_tokens}"   ) 
    print(f"{tgt_ids}<---- token ids")

    print(f"------------------------------------------")
    # Decode sentence `s` (this used to decode sentence 0 on every iteration).
    print(f'Eng: {tokenizer_src.decode(src_ids)}')
    print(f'It: {tokenizer_tgt.decode(tgt_ids)}')

    print(f"------------------------------------------")
    print(f"------------------------------------------")

print(f"------------------------------------------")
print(f"------------------------------------------")
print(f"{encoder_input.shape=}")
print(f"{encoder_mask.shape=}")
print(f"{decoder_input.shape=}")
print(f"{decoder_mask.shape=}")
print(f"------------------------------------------")
# Top-left 10x10 corner of the first sentence's decoder mask.
print(f"similar to encoder mask there is decodr mask fro each sentence having shape of (25,25) looks like below\
       \n{decoder_mask[0].view(25,25)[:10,:10]}")



tokenizer has its vocab size: 
we have two tokenizes one for eng and other for italian: 
vocab size: 
eng: 15698 
it: 22463
------------------------------------------
------------------------------------------
encoder input  exmple: 
[SOS] He could not fail to recognize Yashvin ' s voice , though he did not see him . [EOS] [PAD] [PAD] [PAD] [PAD] [PAD]
[2, 60, 57, 21, 2451, 8, 2475, 1054, 12, 41, 279, 4, 199, 16, 73, 21, 99, 32, 7, 3, 1, 1, 1, 1, 1]<---- token ids
[[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]]<----- encoder mask
decoder input  exmple: 
[SOS] Non poté non distinguere la voce di Jašvin , ma non lo scorse . [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[2, 54, 1288, 12, 3804, 11, 195, 7, 1122, 4, 26, 12, 36, 3370, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]<---- token ids
------------------------------------------
Eng: He could not fail to recognize Yashvin ' s voice , though he did not see him .
It: Non poté non distinguere la voce di 

## save and load various tensors for experiment purpose:

In [148]:
# To reproduce the same results, the demo tensors are stored on disk.
# Load and store the tensors:
#--------------------------------------------

# One-off save snippet (kept commented out):
# import torch
# # Save all tensors in a dictionary
# tensor_dict = {
#     'encoder_input': encoder_input,
#     'decoder_input': decoder_input,
#     'encoder_mask': encoder_mask,
#     'decoder_mask': decoder_mask
# }

# torch.save(tensor_dict, 'input_model_tensors.pth')
# print("Tensors saved successfully!")


# Load the tensors from the file
# NOTE(review): this reads 'input_model_tensors1.pth' and expects a 'label'
# key, whereas the commented-out snippet above writes 'input_model_tensors.pth'
# without 'label' -- presumably a newer file was saved elsewhere; confirm.
loaded_tensors = torch.load('input_model_tensors1.pth')

# Access each tensor
encoder_input = loaded_tensors['encoder_input']
decoder_input = loaded_tensors['decoder_input']
encoder_mask = loaded_tensors['encoder_mask']
decoder_mask = loaded_tensors['decoder_mask']
label = loaded_tensors['label']

print("Loaded Tensors:")

# Shapes of everything that was loaded.
print(f"{encoder_input.shape=}")
print(f"{encoder_mask.shape=}")
print(f"{decoder_input.shape=}")
print(f"{decoder_mask.shape=}")
print(f"{label.shape=}")

Loaded Tensors:
encoder_input.shape=torch.Size([2, 25])
encoder_mask.shape=torch.Size([2, 1, 1, 25])
decoder_input.shape=torch.Size([2, 25])
decoder_mask.shape=torch.Size([2, 1, 25, 25])


In [5]:
# my_model.py
#--------------------------------
import torch
import math
from torch import nn

class InputEmbeddings(nn.Module):
    """Token-id lookup table whose vectors are scaled by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab_size: Number of rows in the lookup table.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.vocab_size = vocab_size
        self.d_model = d_model

    def forward(self,x):
        """Return ``embedding(x) * sqrt(d_model)`` for a tensor of token ids."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale
  
class PostionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Size of each embedding vector (last dimension of the input).
            Odd values are supported.
        seq_len: Number of positions precomputed in the encoding table.
    """

    def __init__(self,d_model, seq_len):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        pos = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)

        # div = 1 / 10000**(2i / d_model), computed as exp(-2i * log(10000) / d_model).
        mid = torch.arange(0, d_model, 2, dtype=torch.float) *( -math.log(10000.0) / d_model)
        div = torch.exp(mid)

        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the frequency vector to match (no-op when d_model is even).
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])

        # (seq_len, d_model) -> (1, seq_len, d_model) so it broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Buffer: stored on the module but not a trainable parameter.
        self.register_buffer('pe',pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        ``x`` has shape (batch, seq_len, d_model); ``pe`` has shape (1, seq_len, d_model).
        """
        x = x + self.pe[:, :x.shape[1], :]

        return x
    
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention (query/key/value projections only).

    Args:
        d_model: Embedding width; must be divisible by ``h``.
        h: Number of attention heads; each head works on ``d_k = d_model // h`` features.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self,
                d_model:int,
                h:int):
        super().__init__()
        self.d_model = d_model  # embedding vector dim
        self.h = h  # number of heads

        # The message is a plain string (it used to be a set literal with a typo).
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model, bias = False)
        self.w_k = nn.Linear(d_model, d_model, bias = False)
        self.w_v = nn.Linear(d_model, d_model, bias = False)

    @staticmethod
    def attention(query, key, value, mask = None):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: Tensors whose last dimension is ``d_k``.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)

        # Masked positions get a very negative score so softmax gives them ~0 weight.
        # (`mask_fill` is not a Tensor method; `masked_fill` returns a new tensor,
        # so the result has to be assigned back.)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        return attention_scores @ value, attention_scores

    def forward(self, q, k, v, mask = None):
        """Project q/k/v, attend per head, and merge the heads.

        Inputs are (batch, seq_len, d_model); the result is (batch, seq_len, d_model).
        The attention weights of the last call are kept in ``self.attention_scores``.
        """
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask)
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return x

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output vectors.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block.

    Args:
        self_attention_block: Module called as ``(q, k, v, mask)``.
        feed_forward_block: Module applied to the attention output.
    """

    def __init__(self,
                self_attention_block,
                feed_forward_block,
                ):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, mask=None):
        """Run self-attention over ``x`` (query = key = value) and then the feed-forward block.

        ``mask`` is forwarded to the attention block; ``None`` disables masking.
        """
        x = self.self_attention_block(x, x, x, mask)
        x = self.feed_forward_block(x)
        return x

class Encoder(nn.Module):
    """Stack of encoder layers applied one after another.

    Args:
        layers: ``nn.ModuleList`` of layers, each callable as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers

    def forward(self, x , mask=None):
        """Feed ``x`` through every layer in order, handing each one the same ``mask``."""
        for layer in self.layers:
            x = layer(x, mask)

        return x
      
class Transformer(nn.Module):
    """Encoder-only model wrapper: embedding -> positional encoding -> encoder stack."""

    def __init__(self, 
                src_embed: InputEmbeddings, 
                src_pos: PostionalEncoding,
                encoder: Encoder,
                ):
        super().__init__()
        self.src_embed = src_embed
        self.src_pos = src_pos
        self.encoder = encoder

    def encode(self, src, src_mask=None):
        """Embed the source token ids, add position information, and run the encoder."""
        embedded = self.src_embed(src)
        positioned = self.src_pos(embedded)
        return self.encoder(positioned, src_mask)
    
def build_transformer(src_vocab_size:int,
                      src_seq_len:int,
                      d_model:int=512,    # input embedding vector dim
                      d_ff: int = 2048,   # feed forward dim
                      dropout: float = 0.1,
                      h: int = 2,   # no of heads
                      N: int = 1,   # no of encoder blocks

                      ):
    """Assemble the encoder-only Transformer.

    Args:
        src_vocab_size: Number of tokens in the source vocabulary.
        src_seq_len: Number of positions in the positional-encoding table.
        d_model: Embedding width.
        d_ff: Hidden width of each feed-forward block.
        dropout: Dropout probability used inside the feed-forward blocks.
        h: Number of attention heads.
        N: Number of encoder blocks.

    Returns:
        A ``Transformer`` wired with embedding, positional encoding and encoder.
    """
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    src_pos = PostionalEncoding(d_model, src_seq_len)

    encoder_blocks = []
    for _ in range(N):
        # Fresh modules per layer: reusing one instance would tie the weights
        # of all N layers together.
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block)
        encoder_blocks.append(encoder_block)

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    transformer = Transformer(src_embed, src_pos,encoder)

    return transformer


In [149]:
# now pass input to the transformer:
#-------------------------------------
# Toy dimensions used for the walkthrough.
batch_size = 2
seq_len = 25
d_model = 10


# Shapes of the tensors that will be fed to the model.
print(f"{encoder_input.shape=}")
print(f"{encoder_mask.shape=}")
print(f"{decoder_input.shape=}")
print(f"{decoder_mask.shape=}")

encoder_input.shape=torch.Size([2, 25])
encoder_mask.shape=torch.Size([2, 1, 1, 25])
decoder_input.shape=torch.Size([2, 25])
decoder_mask.shape=torch.Size([2, 1, 25, 25])


In [None]:
# my_train.py
#----------------------
import torch
#from mymodel import build_transformer

# Toy dimensions used for the walkthrough.
batch_size = 2
seq_len = 25
d_model = 10



# Each language has its own tokenizer and therefore its own vocabulary size.
en_vocb_size = tokenizer_src.get_vocab_size()
# Read the Italian size from the target tokenizer (it used to read tokenizer_src).
it_vocab_size = tokenizer_tgt.get_vocab_size()

print(f"{encoder_input.shape=}")
print(f"{encoder_mask.shape=}")
print(f"{decoder_input.shape=}")
print(f"{decoder_mask.shape=}")
print(f"{en_vocb_size=}")
print(f"-------------------------------")

# Pass the constants declared above instead of repeating the literals.
model = build_transformer(src_vocab_size= en_vocb_size,
                          src_seq_len= seq_len,
                          d_model= d_model,
                          d_ff = 60,
                          N=1,
                          dropout=0.1,
                          h = 2
                          )

encoder_output = model.encode(encoder_input, encoder_mask)
print(encoder_output.shape)

In [60]:
#2nd final version

# my_model.py
#--------------------------------
import torch
import math
from torch import nn

class PostionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Size of each embedding vector (last dimension of the input).
            Odd values are supported.
        seq_len: Number of positions precomputed in the encoding table.
    """

    def __init__(self,d_model, seq_len):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Table of shape (seq_len, d_model).
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        pos = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)

        # div = 1 / 10000**(2i / d_model), computed as exp(-2i * log(10000) / d_model).
        mid = torch.arange(0, d_model, 2, dtype=torch.float) *( -math.log(10000.0) / d_model)
        div = torch.exp(mid)

        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the frequency vector to match (no-op when d_model is even).
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])

        # (seq_len, d_model) -> (1, seq_len, d_model) so it broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Buffer: stored on the module but not a trainable parameter.
        self.register_buffer('pe',pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        ``x`` has shape (batch, seq_len, d_model); ``pe`` has shape (1, seq_len, d_model).
        """
        x = x + self.pe[:, :x.shape[1], :]

        return x
    
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block.

    Args:
        self_attention_block: Module called as ``(q, k, v, mask)``.
        feed_forward_block: Module applied to the attention output.
    """

    def __init__(self,
                self_attention_block,
                feed_forward_block,
                ):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, mask=None):
        """Run self-attention over ``x`` (query = key = value) and then the feed-forward block.

        ``mask`` is forwarded to the attention block so masked (e.g. padding)
        positions are not attended to; ``None`` disables masking.
        """
        x = self.self_attention_block(x, x, x, mask)
        x = self.feed_forward_block(x)
        return x
    
class Encoder(nn.Module):
    """Stack of encoder layers applied one after another.

    Args:
        layers: ``nn.ModuleList`` of layers, each callable as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers

    def forward(self, x , mask=None):
        """Feed ``x`` through every layer in order, handing each one the same ``mask``."""
        for layer in self.layers:
            # Pass the mask on; it used to be dropped here.
            x = layer(x, mask)

        return x
      
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention (query/key/value projections only).

    Args:
        d_model: Embedding width; must be divisible by ``h``.
        h: Number of attention heads; each head works on ``d_k = d_model // h`` features.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self,
                d_model:int,
                h:int):
        super().__init__()
        self.d_model = d_model  # embedding vector dim
        self.h = h  # number of heads

        # The message is a plain string (it used to be a set literal with a typo).
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model, bias = False)
        self.w_k = nn.Linear(d_model, d_model, bias = False)
        self.w_v = nn.Linear(d_model, d_model, bias = False)

    @staticmethod
    def attention(query, key, value, mask = None):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Args:
            query, key, value: Tensors whose last dimension is ``d_k``.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)

        # Masked positions get a very negative score so softmax gives them ~0 weight.
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        return attention_scores @ value, attention_scores

    def forward(self, q, k, v, mask = None):
        """Project q/k/v, attend per head, and merge the heads.

        Inputs are (batch, seq_len, d_model); the result is (batch, seq_len, d_model).
        The attention weights of the last call are kept in ``self.attention_scores``.
        """
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask)
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return x
    
class InputEmbeddings(nn.Module):
    """Token-id lookup table whose vectors are scaled by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab_size: Number of rows in the lookup table.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.vocab_size = vocab_size
        self.d_model = d_model

    def forward(self,x):
        """Return ``embedding(x) * sqrt(d_model)`` for a tensor of token ids."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output vectors.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer (w1, b1).
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        # Projection back to the model width (w2, b2).
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map (batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
    
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Args:
        self_attention_block: Attention over the decoder's own sequence.
        cross_attention_block: Attention whose keys/values are the encoder output.
        feed_forward_block: Position-wise feed-forward network applied last.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sub-layers in order and return the result.

        Args:
            x: Decoder-side input.
            encoder_output: Encoder result, used as key and value in cross-attention.
            src_mask: Mask applied in cross-attention.
            tgt_mask: Mask applied in self-attention.
        """
        x = self.self_attention_block(x, x, x, tgt_mask)
        x = self.cross_attention_block(x, encoder_output, encoder_output, src_mask)
        # The feed-forward block was stored in __init__ but never applied.
        x = self.feed_forward_block(x)

        return x

class Decoder(nn.Module):
    """Stack of decoder layers applied one after another.

    Args:
        layers: ``nn.ModuleList`` of layers, each callable as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Thread ``x`` through every layer, passing the same encoder output and masks to each."""
        hidden = x
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, encoder_output, src_mask, tgt_mask)
        return hidden

class Transformer(nn.Module):
    """Encoder-decoder wrapper holding the embeddings, positional encodings and both stacks."""

    def __init__(self, 
                src_embed: InputEmbeddings, 
                src_pos: PostionalEncoding,
                tgt_embed: InputEmbeddings, 
                tgt_pos: PostionalEncoding,
                encoder: Encoder,
                decoder: Decoder
                ):
        super().__init__()

        self.src_embed = src_embed
        self.src_pos = src_pos
        self.tgt_embed = tgt_embed
        self.tgt_pos = tgt_pos
        self.encoder = encoder
        self.decoder = decoder

    def encode(self, src, src_mask=None):
        """Embed the source token ids, add position information, and run the encoder."""
        embedded = self.src_embed(src)
        positioned = self.src_pos(embedded)
        return self.encoder(positioned, src_mask)

    def decode(self, encoder_output, src_mask, 
               tgt, tgt_mask):
        """Embed the target token ids, add position information, and run the decoder.

        The embedded target has shape (batch, seq_len, d_model).
        """
        embedded = self.tgt_embed(tgt)
        positioned = self.tgt_pos(embedded)
        return self.decoder(positioned, encoder_output, src_mask, tgt_mask)
    
def build_transformer(src_vocab_size:int,
                      src_seq_len:int,

                      tgt_vocab_size:int,
                      tgt_seq_len:int,

                      d_model:int=512,    # input embedding vector dim
                      d_ff: int = 2048,   # feed forward dim
                      dropout: float = 0.1,
                      h: int = 2,   # no of heads
                      N: int = 1,   # no of encoder blocks and of decoder blocks

                      ):
    """Assemble the encoder-decoder Transformer.

    Args:
        src_vocab_size: Number of tokens in the source vocabulary.
        src_seq_len: Number of positions in the source positional-encoding table.
        tgt_vocab_size: Number of tokens in the target vocabulary.
        tgt_seq_len: Number of positions in the target positional-encoding table.
        d_model: Embedding width.
        d_ff: Hidden width of each feed-forward block.
        dropout: Dropout probability used inside the feed-forward blocks.
        h: Number of attention heads.
        N: Number of encoder blocks and of decoder blocks.

    Returns:
        A ``Transformer`` wired with embeddings, positional encodings, encoder and decoder.
    """
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    src_pos = PostionalEncoding(d_model, src_seq_len)

    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    tgt_pos = PostionalEncoding(d_model, tgt_seq_len)

    # Every layer gets its own attention / feed-forward modules; reusing one
    # instance across layers would tie their weights together.
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block)
        encoder_blocks.append(encoder_block)

    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block=feed_forward_block)
        decoder_blocks.append(decoder_block)

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    transformer = Transformer(src_embed, src_pos, tgt_embed, tgt_pos, encoder, decoder)

    return transformer



# Vocabulary sizes: English from the source tokenizer, Italian from the target tokenizer.
en_vocab_size = tokenizer_src.get_vocab_size()
it_vocab_size = tokenizer_tgt.get_vocab_size()

# Small model (d_model=10, one layer, two heads) for the shape walkthrough.
model = build_transformer(src_vocab_size= en_vocab_size,
                          src_seq_len= 25,
                          tgt_vocab_size= it_vocab_size,
                          tgt_seq_len= 25,
                          d_model= 10,
                          d_ff = 60,
                          N=1,
                          dropout=0.1,
                          h = 2
                          )

# Encode the source batch, then decode the target batch against that encoding.
encoder_output = model.encode(encoder_input, encoder_mask)
print(encoder_output.shape)

decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)


----------------------(10, 22463)
InputEmbeddings(
  (embedding): Embedding(22463, 10)
)
torch.Size([2, 3, 10])
--------------------InputEmbeddings(
  (embedding): Embedding(22463, 10)
)
-----------------mask-torch.Size([2, 1, 25, 25])
-----------------mask-torch.Size([2, 2, 25, 25])
torch.Size([2, 2, 25, 5]) torch.Size([2, 2, 25, 5]) torch.Size([2, 2, 25, 5])


In [62]:
# Shape of the decoder output produced by the previous cell.
decoder_output.shape

torch.Size([2, 25, 10])

## Final Transformer Architecture Code:

### [ model.py] version 3 Final Version

In [None]:
# final version

# my_model.py
#--------------------------------
import torch
import math
from torch import nn

class PostionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to token embeddings.

    For position ``pos`` and embedding index ``2i`` / ``2i + 1`` the table holds
    ``sin(pos * div_i)`` / ``cos(pos * div_i)`` with
    ``div_i = exp(-2i * log(10000) / d_model)``, i.e. ``1 / 10000 ** (2i / d_model)``.

    Args:
        d_model: size of each embedding vector (even or odd).
        seq_len: number of positions to pre-compute; inputs passed to
            ``forward`` must not be longer than this.
    """

    def __init__(self,d_model, seq_len):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len

        # Table of shape (seq_len, d_model), filled column-wise below.
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1).
        pos = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1)

        # One frequency per even embedding index: ceil(d_model / 2) values.
        mid = torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        div = torch.exp(mid)

        # (seq_len, ceil(d_model / 2)) angles shared by the sin and cos columns.
        angles = pos * div

        # Even columns take the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd column fewer than even columns, so the
        # angle table is trimmed to match instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (seq_len, d_model) -> (1, seq_len, d_model) so it broadcasts over the batch.
        pe = pe.unsqueeze(0)

        # Buffer: moves with .to(device) and is saved in the state dict, but is not trained.
        self.register_buffer('pe',pe)


    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        ``x`` is indexed as (batch, seq_len, d_model); ``pe`` is (1, seq_len, d_model).
        """
        x = x + self.pe[:, :x.shape[1], :]

        return x
    
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention followed by the feed-forward block.

    NOTE(review): no residual connection or layer normalisation is applied
    around either sub-block; that is left unchanged here.

    Args:
        self_attention_block: callable invoked as ``(q, k, v, mask)``.
        feed_forward_block: callable invoked as ``(x)``.
    """

    def __init__(self,
                self_attention_block,
                feed_forward_block,
                ):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, mask=None):
        """Run self-attention (query = key = value = ``x``) and the feed-forward block.

        ``mask`` is forwarded to the attention block so that masked positions
        (e.g. padding) are excluded; ``None`` means no masking.
        """
        # The mask was previously dropped here, so padding tokens were attended to.
        x = self.self_attention_block(x, x, x, mask)
        x = self.feed_forward_block(x)
        return x
    
class Encoder(nn.Module):
    """Stack of encoder layers applied one after another.

    Args:
        layers: the layers to apply in order; each is called as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers

    def forward(self, x , mask=None):
        """Feed ``x`` through every layer, handing the same ``mask`` to each one."""
        for layer in self.layers:
            # The mask used to be dropped here, which silently disabled source masking.
            x = layer(x, mask)

        return x
      
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected by a bias-free linear
    layer, split into ``h`` heads of width ``d_k = d_model // h``, attended
    per head, and the heads are concatenated back to width ``d_model``.

    NOTE(review): no output projection is applied after the heads are
    concatenated; adding one would change the parameter set, so it is left as is.

    Args:
        d_model: embedding vector dimension.
        h: number of heads; must divide ``d_model``.

    Raises:
        AssertionError: if ``d_model`` is not divisible by ``h``.
    """

    def __init__(self,
                d_model:int,
                h:int):
        super().__init__()
        self.d_model = d_model  # embedding vector dimension
        self.h = h              # number of heads

        # The message used to be a set literal with a typo, which printed as {'...'}.
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h  # width of each head
        self.w_q = nn.Linear(d_model, d_model, bias = False)
        self.w_k = nn.Linear(d_model, d_model, bias = False)
        self.w_v = nn.Linear(d_model, d_model, bias = False)

    @staticmethod
    def attention(query, key, value, mask = None):
        """Compute ``softmax(query @ key^T / sqrt(d_k)) @ value``.

        Positions where ``mask == 0`` get a score of -1e9 before the softmax, so
        they receive (numerically) zero weight. This is how the decoder is kept
        from looking at future positions.

        Returns:
            ``(attended_values, attention_weights)``.
        """
        d_k = query.shape[-1]
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k)

        if mask is not None:
            # Deliberately in-place: masked_fill_ refuses a mask that is larger than
            # the score matrix, which surfaces stale or mis-sized masks as an error.
            attention_scores.masked_fill_(mask == 0, -1e9)

        attention_scores = attention_scores.softmax(dim = -1)

        return attention_scores @ value, attention_scores

    def forward(self, q, k, v, mask = None):
        """Project, split into heads, attend, and merge the heads again.

        The attention weights of the most recent call are kept on
        ``self.attention_scores``.
        """
        query = self.w_q(q) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v) # (batch, seq_len, d_model) --> (batch, seq_len, d_model)

        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask)

        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        return x
    
class InputEmbeddings(nn.Module):
    """Look up token ids in a learned table and scale the vectors by sqrt(d_model).

    Args:
        d_model: width of each embedding vector.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self,x):
        """Return the embedding vectors for ids ``x``, multiplied by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output width.
        d_ff: width of the hidden layer.
        dropout: dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer (w1, b1).
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        # Contraction layer (w2, b2).
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map (batch, seq_len, d_model) -> (batch, seq_len, d_ff) -> (batch, seq_len, d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
    
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, then feed-forward.

    NOTE(review): no residual connection or layer normalisation is applied
    around the sub-blocks; that is left unchanged here.

    Args:
        self_attention_block: attends over the decoder's own sequence.
        cross_attention_block: attends over the encoder output.
        feed_forward_block: applied to the result of the cross-attention.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sub-blocks in order and return the result.

        ``tgt_mask`` restricts the self-attention; ``src_mask`` restricts the
        cross-attention, whose keys and values are both ``encoder_output``.
        """
        x = self.self_attention_block(x, x, x, tgt_mask)
        x = self.cross_attention_block(x, encoder_output, encoder_output, src_mask)
        # The feed-forward block was stored but never applied, so its parameters were
        # never trained and the decoder layer differed from the encoder layer.
        x = self.feed_forward_block(x)

        return x

class Decoder(nn.Module):
    """Sequential stack of decoder layers.

    Args:
        layers: layers applied in order; each is called as
            ``layer(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Thread ``x`` through every layer; the other three arguments are passed to each layer unchanged."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return hidden

class ProjectionLayer(nn.Module):
    """Linear map from the model dimension to one score per vocabulary entry.

    Args:
        d_model: width of the incoming vectors.
        vocab_size: number of output scores per position.
    """

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Return the raw linear output (no softmax is applied here).

        (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        """
        # The return annotation used to be ``None`` although a tensor is returned.
        return self.proj(x)
    
class Transformer(nn.Module):
    """Container that wires embeddings, positional encodings, encoder, decoder and projection.

    The three public steps are kept separate so that inference can encode once
    and call ``decode`` / ``project`` repeatedly.
    """

    def __init__(self,
                src_embed: InputEmbeddings,
                src_pos: PostionalEncoding,
                tgt_embed: InputEmbeddings,
                tgt_pos: PostionalEncoding,
                encoder: Encoder,
                decoder: Decoder,
                projection_layer:ProjectionLayer,
                ):
        super().__init__()

        # Source-side input pipeline.
        self.src_embed = src_embed
        self.src_pos = src_pos
        # Target-side input pipeline.
        self.tgt_embed = tgt_embed
        self.tgt_pos = tgt_pos
        # Core stacks and the output head.
        self.encoder = encoder
        self.decoder = decoder
        self.projection_layer = projection_layer

    def encode(self, src, src_mask=None):
        """Embed ``src``, add positional encodings, and run the encoder with ``src_mask``."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask,
               tgt, tgt_mask):
        """Embed ``tgt``, add positional encodings, and run the decoder against ``encoder_output``."""
        # (batch, seq_len, d_model)
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer: (batch, seq_len, vocab_size)."""
        return self.projection_layer(x)
    
def build_transformer(src_vocab_size:int,
                      src_seq_len:int,

                      tgt_vocab_size:int,
                      tgt_seq_len:int,

                      d_model:int=512,    # input embedding vector dim
                      d_ff: int = 2048,   # feed forward dim
                      dropout: float = 0.1,
                      h: int = 2,   # no of heads
                      N: int = 1,   # no of encoder / decoder blocks

                      ):
    """Assemble a ``Transformer`` from freshly constructed sub-modules.

    Args:
        src_vocab_size: number of rows in the source embedding table.
        src_seq_len: number of positions in the source positional encoding.
        tgt_vocab_size: number of rows in the target embedding table; also the
            output width of the projection layer.
        tgt_seq_len: number of positions in the target positional encoding.
        d_model: embedding / model width.
        d_ff: hidden width of every feed-forward block.
        dropout: dropout probability used inside the feed-forward blocks.
        h: number of attention heads (must divide ``d_model``).
        N: number of encoder blocks and of decoder blocks.

    Returns:
        The assembled ``Transformer``.
    """
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    src_pos = PostionalEncoding(d_model, src_seq_len)

    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    tgt_pos = PostionalEncoding(d_model, tgt_seq_len)

    # Every layer gets its OWN attention and feed-forward modules. Previously one
    # instance was created outside the loop and reused, which tied the weights of
    # all N layers together. For N == 1 the construction order is unchanged.
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_blocks.append(EncoderBlock(encoder_self_attention_block, feed_forward_block))

    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_blocks.append(
            DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block=feed_forward_block)
        )

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(src_embed, src_pos, tgt_embed, tgt_pos, encoder, decoder, projection_layer)

    return transformer



## train.py..
#------------------------------

# Smoke test for the final model: build a tiny instance and run encode -> decode -> project.
# tokenizer_src / tokenizer_tgt, encoder_input, encoder_mask, decoder_input and decoder_mask
# are not defined in this cell — presumably they come from the earlier dataset cells (verify
# before running on a fresh kernel).
en_vocab_size = tokenizer_src.get_vocab_size()
it_vocab_size = tokenizer_tgt.get_vocab_size()

# Toy hyper-parameters (d_model=10, one layer, two heads) keep the check fast;
# 25 is the maximum sequence length covered by the positional encodings.
model = build_transformer(src_vocab_size= en_vocab_size,
                          src_seq_len= 25,
                          tgt_vocab_size= it_vocab_size,
                          tgt_seq_len= 25,
                          d_model= 10,
                          d_ff = 60,
                          N=1,
                          dropout=0.1,
                          h = 2
                          )

# Encode the source batch once; the result is reused by the decoder below.
encoder_output = model.encode(encoder_input, encoder_mask)
print(encoder_output.shape)

decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)

# Map every decoder position to one score per target-vocabulary entry.
proj_output = model.project(decoder_output)

## Loss calculations:

In [98]:
# Loss calculation:
#--------------------------------
# proj_output and label are produced by earlier cells; both are flattened so that
# CrossEntropyLoss sees one row of vocabulary scores per target token.

print(f"decoder output: {proj_output.shape}") # batch, seq_len, vocab_size
print(f"true label shape: {label.shape}")   # batch, seq_len (true token ids)

print(f"changed proj out shape: {proj_output.view(-1, tokenizer_tgt.get_vocab_size()).shape}")
print(f"changed label shape: {label.reshape(-1).shape}")

# The labels are target-language token ids, so the padding id to ignore must be looked
# up in the TARGET tokenizer (the source tokenizer was used here before, which is only
# correct while both tokenizers happen to assign '[PAD]' the same id).
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.reshape(-1))

print(f"Loss: {loss}")       



decoder output: torch.Size([2, 25, 22463])
true label shape: torch.Size([2, 25])
changed proj out shape: torch.Size([50, 22463])
changed label shape: torch.Size([50])
Loss: 10.085467338562012


## Inference Code:

In [176]:
# How to validate:
# validation with a single example:
#---------------------------------------------
def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Row ``i`` is True for columns ``0..i``, i.e. a position may look at itself
    and at earlier positions only.
    """
    lower_triangle = torch.tril(torch.ones((1, size, size), dtype=torch.int))
    return lower_triangle == 1



# Take a single batch from the validation loader to walk through greedy decoding by hand.
batch = next(iter(val_dataloader))

# Keep only the first 25 positions, matching the sequence length the toy model was built with.
encoder_input = batch['encoder_input'][:, :25]
encoder_mask = batch['encoder_mask'][:,:,:, :25]
source_text = batch["src_text"][0]
target_text = batch["tgt_text"][0]

print(f"{encoder_input.shape}")
print(f"{encoder_mask.shape}")
print(f"{source_text}")
print(f"{target_text}")
print(f"variable decoder mask:------------>")
# Illustration only: show how the causal mask grows with the number of generated tokens.
for i in range(5):
    decoder_mask = causal_mask(i).type_as(encoder_input).to(device)
    print(decoder_mask)


sos_idx = tokenizer_tgt.token_to_id('[SOS]')
eos_idx = tokenizer_tgt.token_to_id('[EOS]')

# The encoder output is computed once and reused for every decoding step.
encoder_output = model.encode(encoder_input, encoder_mask)
# Start the target sequence with just [SOS]. This must happen BEFORE the mask is built:
# previously the mask was sized from whatever `decoder_input` an earlier cell had left behind.
decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(encoder_input)
decoder_mask = causal_mask(decoder_input.size(1)).type_as(encoder_input).to(device)
 




torch.Size([1, 25])
torch.Size([1, 1, 1, 25])
In such vault I had been told did Mr. Reed lie buried; and led by this thought to recall his idea, I dwelt on it with gathering dread.
Mi era stato detto che sotto quella vòlta riposava il signor Reed; questo pensiero mi ricondusse e m'ispirò riflessioni spaventose.
variable decoder mask:------------>
tensor([], size=(1, 0, 0), dtype=torch.int64)
tensor([[[1]]])
tensor([[[1, 0],
         [1, 1]]])
tensor([[[1, 0, 0],
         [1, 1, 0],
         [1, 1, 1]]])
tensor([[[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 1]]])


In [177]:
# One greedy decoding step: decode, project the last position, append the arg-max token.
# NOTE(review): this cell is not idempotent — each run appends another token to
# decoder_input, while decoder_mask is not rebuilt here; presumably it has to be
# regenerated for the new length before the next decode step.
out = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
# Only the last position is projected. Despite the name, `prob` holds the raw
# projection-layer output (no softmax is applied).
prob = model.project(out[:, -1])
# torch.max over the last dim yields (max value, index of the max); the index is the token id.
_, next_word = torch.max(prob, dim = -1)

# Extend the decoder input with the chosen token id.
decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(encoder_input).fill_(next_word.item()).to(device)], dim=1
        )



print(f"output shapes transform: {out.shape, out[:, -1].shape}")
print(f"{prob.shape= }")
print(f"max prob value: and its index: \n{_, next_word }") # tensor value and its index

print(f"extend decoder input by concat the generated nextword: \n{decoder_input}")


output shapes transform: (torch.Size([1, 1, 10]), torch.Size([1, 10]))
prob.shape= torch.Size([1, 22463])
max prob value: and its index: 
(tensor([0.6127], grad_fn=<MaxBackward0>), tensor([8992]))
extend decoder input by concat the generated nextword: 
tensor([[   2, 8992]])


## Model Parameters and Layers:

In [2]:
# Model architecture:
#----------------------
# Bare expression so the notebook displays the module tree of `model`.
model

Transformer(
  (src_embed): InputEmbeddings(
    (embedding): Embedding(15698, 100)
  )
  (src_pos): PostionalEncoding()
  (tgt_embed): InputEmbeddings(
    (embedding): Embedding(22463, 100)
  )
  (tgt_pos): PostionalEncoding()
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=100, out_features=100, bias=False)
          (w_k): Linear(in_features=100, out_features=100, bias=False)
          (w_v): Linear(in_features=100, out_features=100, bias=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=100, out_features=60, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=60, out_features=100, bias=True)
        )
      )
    )
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0): DecoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_

In [4]:
# Print the name of every parameter registered on the model, one per line.
for name,_ in model.named_parameters():
    print(name)

src_embed.embedding.weight
tgt_embed.embedding.weight
encoder.layers.0.self_attention_block.w_q.weight
encoder.layers.0.self_attention_block.w_k.weight
encoder.layers.0.self_attention_block.w_v.weight
encoder.layers.0.feed_forward_block.linear_1.weight
encoder.layers.0.feed_forward_block.linear_1.bias
encoder.layers.0.feed_forward_block.linear_2.weight
encoder.layers.0.feed_forward_block.linear_2.bias
decoder.layers.0.self_attention_block.w_q.weight
decoder.layers.0.self_attention_block.w_k.weight
decoder.layers.0.self_attention_block.w_v.weight
decoder.layers.0.cross_attention_block.w_q.weight
decoder.layers.0.cross_attention_block.w_k.weight
decoder.layers.0.cross_attention_block.w_v.weight
decoder.layers.0.feed_forward_block.linear_1.weight
decoder.layers.0.feed_forward_block.linear_1.bias
decoder.layers.0.feed_forward_block.linear_2.weight
decoder.layers.0.feed_forward_block.linear_2.bias
projection_layer.proj.weight
projection_layer.proj.bias


In [14]:
# Count parameter elements: all of them, then only those with requires_grad set.
param_sizes = [(weight.numel(), weight.requires_grad) for weight in model.parameters()]
pm = sum(size for size, _trainable in param_sizes)
t_pm = sum(size for size, trainable in param_sizes if trainable)
print(f"Total Params: {pm/1000000} million")
print(f"Total Trainable Params: {t_pm/1000000} million")

Total Params: 6.199183 million
Total Trainable Params: 6.199183 million


In [5]:
# to be continued......
# thanks!!!
