In [1]:
import torch
from datasets import load_dataset
from tokenizers import tokenizers
from tokenizers.models import WordLevel
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset


from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

import random
random.seed(42)

from pathlib import Path
from mymodel import build_transformer

def get_config():
    """Return the hyper-parameters and file locations used by this notebook.

    A new dictionary is assembled on every call, so a caller may mutate the
    result without affecting other callers.

    Returns:
        dict: training settings, dataset/language selection and artifact
        naming templates, keyed by setting name.
    """
    # Optimisation and model-size settings.
    training = dict(batch_size=8, num_epochs=20, lr=10**-4, seq_len=350, d_model=512)
    # Which corpus and language pair to use.
    data = dict(datasource='opus_books', lang_src="en", lang_tgt="it")
    # Where weights, tokenizers and experiment logs are stored.
    artifacts = dict(
        model_folder="weights",
        model_basename="tmodel_",
        preload="latest",
        tokenizer_file="tokenizer_{0}.json",
        experiment_name="runs/tmodel",
    )
    return {**training, **data, **artifacts}

# Build the shared configuration once for the statements that follow.
config = get_config()
# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using Device: {device}")

def get_or_build_tokenizer(config, lang):
    """Load the tokenizer previously saved for ``lang``.

    Despite its name, this two-argument variant never trains a tokenizer: it
    only reads the JSON file whose path is produced by formatting
    ``config['tokenizer_file']`` with ``lang``.

    Args:
        config: mapping that contains a ``'tokenizer_file'`` format string
            with one positional placeholder (e.g. ``"tokenizer_{0}.json"``).
        lang: language code substituted into the placeholder.

    Returns:
        The object returned by ``Tokenizer.from_file`` for that path.

    Raises:
        FileNotFoundError: if the tokenizer file does not exist, so the
            failure names the missing path instead of surfacing as an
            opaque error from the tokenizers library.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Fail early and explicitly: there is no dataset here to train from.
    if not tokenizer_path.is_file():
        raise FileNotFoundError(
            f"Tokenizer file not found: {tokenizer_path}. "
            "Train and save the tokenizer before calling this loader."
        )

    return Tokenizer.from_file(str(tokenizer_path))


# Build tokenizers
tokenizer_src = get_or_build_tokenizer(config,  config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config,  config['lang_tgt'])

# Load dataset:
# a small, pre-computed batch of model inputs is read back from disk.
#
# Sizes reported by the outputs recorded for this cell:
# batch size = 2
# seq_len = 25
# embed_dim = d_model = 100
# src vocab size = 15698
# tgt vocab size = 22463
#
# NOTE(review): torch.load unpickles the file, so only load files that come
# from a trusted source.

print(f"Load dataset info:--------------------------->")
loaded_tensors = torch.load('input_model_tensors2.pth')

# Access each tensor
encoder_input = loaded_tensors['encoder_input']
decoder_input = loaded_tensors['decoder_input']
encoder_mask = loaded_tensors['encoder_mask']
decoder_mask = loaded_tensors['decoder_mask']
label = loaded_tensors['label']

print("Loaded Tensors:")

print(f"{encoder_input.shape=}")
print(f"{encoder_mask.shape=}")
print(f"{decoder_input.shape=}")
print(f"{decoder_mask.shape=}")
print(f"{label.shape}")



en_vocab_size = tokenizer_src.get_vocab_size()
it_vocab_size = tokenizer_tgt.get_vocab_size()

# Deliberately tiny model (one layer, two heads) so the shapes are easy to follow.
model = build_transformer(src_vocab_size= en_vocab_size,
                          src_seq_len= 25,
                          tgt_vocab_size= it_vocab_size,
                          tgt_seq_len= 25,
                          d_model= 100,
                          d_ff = 60,
                          N=1,
                          dropout=0.1,
                          h = 2
                          )

encoder_output = model.encode(encoder_input, encoder_mask)
decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
proj_output = model.project(decoder_output)

print(f"Forward Pass:------------------------->")
print(f"{encoder_output.shape=}")
print(f"{decoder_output.shape=}")
print(f"{proj_output.shape=}")


# loss calculation :
#--------------------------------
print(f"Loss Calculation:----------------->")
print(f"decoder output: {proj_output.shape}") # batch, seq_len, vocab_size
print(f"true label shape: {label.shape}")   # batch, seq_len (true values)

# Cross-entropy expects (N, C) logits and (N,) class ids, so flatten batch and time.
print(f"changed proj out shape: {proj_output.view(-1, it_vocab_size).shape}")
print(f"changed label shape: {label.reshape(-1).shape}")

# The labels are target-language ids, so the padding id to ignore must be
# looked up in the target tokenizer (not the source one).
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)
loss = loss_fn(proj_output.view(-1, it_vocab_size), label.reshape(-1))

print(f"Loss: {loss}")  


#########################################



  from .autonotebook import tqdm as notebook_tqdm


Using Device: cpu
Load dataset info:--------------------------->
Loaded Tensors:
encoder_input.shape=torch.Size([2, 25])
encoder_mask.shape=torch.Size([2, 1, 1, 25])
decoder_input.shape=torch.Size([2, 25])
decoder_mask.shape=torch.Size([2, 1, 25, 25])
torch.Size([2, 25])
Forward Pass:------------------------->
encoder_output.shape=torch.Size([2, 25, 100])
decoder_output.shape=torch.Size([2, 25, 100])
proj_output.shape=torch.Size([2, 25, 22463])
Loss Calculation:----------------->
decoder output: torch.Size([2, 25, 22463])
true label shape: torch.Size([2, 25])
changed proj out shape: torch.Size([50, 22463])
changed label shape: torch.Size([50])
Loss: 10.099520683288574


In [2]:
# Model architecture:
#----------------------
# A bare expression on the last line of a cell makes the notebook display the module's repr.
model

Transformer(
  (src_embed): InputEmbeddings(
    (embedding): Embedding(15698, 100)
  )
  (src_pos): PostionalEncoding()
  (tgt_embed): InputEmbeddings(
    (embedding): Embedding(22463, 100)
  )
  (tgt_pos): PostionalEncoding()
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=100, out_features=100, bias=False)
          (w_k): Linear(in_features=100, out_features=100, bias=False)
          (w_v): Linear(in_features=100, out_features=100, bias=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=100, out_features=60, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=60, out_features=100, bias=True)
        )
      )
    )
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0): DecoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_

In [4]:
# Print the qualified name of every parameter registered on the model, one per line.
for name in (pair[0] for pair in model.named_parameters()):
    print(name)

src_embed.embedding.weight
tgt_embed.embedding.weight
encoder.layers.0.self_attention_block.w_q.weight
encoder.layers.0.self_attention_block.w_k.weight
encoder.layers.0.self_attention_block.w_v.weight
encoder.layers.0.feed_forward_block.linear_1.weight
encoder.layers.0.feed_forward_block.linear_1.bias
encoder.layers.0.feed_forward_block.linear_2.weight
encoder.layers.0.feed_forward_block.linear_2.bias
decoder.layers.0.self_attention_block.w_q.weight
decoder.layers.0.self_attention_block.w_k.weight
decoder.layers.0.self_attention_block.w_v.weight
decoder.layers.0.cross_attention_block.w_q.weight
decoder.layers.0.cross_attention_block.w_k.weight
decoder.layers.0.cross_attention_block.w_v.weight
decoder.layers.0.feed_forward_block.linear_1.weight
decoder.layers.0.feed_forward_block.linear_1.bias
decoder.layers.0.feed_forward_block.linear_2.weight
decoder.layers.0.feed_forward_block.linear_2.bias
projection_layer.proj.weight
projection_layer.proj.bias


In [14]:
# Total number of scalar elements across all parameter tensors.
pm = sum(map(lambda p: p.numel(), model.parameters()))
# Same count, restricted to tensors that will receive gradients.
t_pm = sum(map(lambda p: p.numel(), filter(lambda p: p.requires_grad, model.parameters())))
print(f"Total Params: {pm/1000000} million")
print(f"Total Trainable Params: {t_pm/1000000} million")

Total Params: 6.199183 million
Total Trainable Params: 6.199183 million


In [15]:
# inference

In [16]:
import torch
#from datasets import load_dataset
from tokenizers import tokenizers
from tokenizers.models import WordLevel
from torch.utils.data import Dataset, DataLoader, random_split

import random
random.seed(42)

from pathlib import Path

def get_config():
    """Return a fresh dictionary of the notebook's settings.

    The entries cover batch/epoch/learning-rate values, the sequence length
    and model width, the dataset and language pair, and the naming templates
    for weights, tokenizer files and experiment logs.
    """
    settings = (
        ("batch_size", 8),
        ("num_epochs", 20),
        ("lr", 10**-4),
        ("seq_len", 350),
        ("d_model", 512),
        ("datasource", 'opus_books'),
        ("lang_src", "en"),
        ("lang_tgt", "it"),
        ("model_folder", "weights"),
        ("model_basename", "tmodel_"),
        ("preload", "latest"),
        ("tokenizer_file", "tokenizer_{0}.json"),
        ("experiment_name", "runs/tmodel"),
    )
    return dict(settings)

# Settings shared by the dataset, tokenizer and dataloader code below.
config = get_config()


# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using Device: {device}")
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Map-style dataset that turns translation pairs into fixed-length tensors.

    Every item of ``ds`` is indexed as ``item['translation'][lang]`` for both
    the source and the target language. Each sentence is tokenized, wrapped
    with special tokens and padded to exactly ``seq_len`` ids.

    Args:
        ds: indexable, sized collection of translation records.
        tokenizer_src: tokenizer used for the source sentence; must provide
            ``encode(text).ids``.
        tokenizer_tgt: tokenizer used for the target sentence; must provide
            ``encode(text).ids`` and ``token_to_id``.
        src_lang: key of the source sentence inside ``item['translation']``.
        tgt_lang: key of the target sentence inside ``item['translation']``.
        seq_len: length every returned id tensor is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
        self.seq_len = seq_len

        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # The special-token ids are read from the target tokenizer and are
        # also used when assembling the encoder (source) input below.
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        """Return the number of translation pairs in the wrapped dataset."""
        return len(self.ds)

    def _padding(self, count):
        """Return a 1-D int64 tensor holding ``count`` copies of the padding id."""
        # Tensor.repeat avoids building a Python list of one-element tensors
        # and converting it element by element.
        return self.pad_token.repeat(count)

    def __getitem__(self, idx):
        """Build the model inputs, masks and label for the pair at ``idx``.

        Returns:
            dict with ``encoder_input``, ``decoder_input`` and ``label``
            (each of length ``seq_len``), ``encoder_mask``, ``decoder_mask``
            and the raw ``src_text`` / ``tgt_text`` strings.

        Raises:
            ValueError: if either sentence does not fit into ``seq_len``
                once its special tokens are added.
        """
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Transform the text into tokens
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # The encoder input carries both [SOS] and [EOS] (hence -2); the
        # decoder input carries only [SOS] and the label only [EOS] (hence -1).
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # Make sure the number of padding tokens is not negative. If it is, the sentence is too long
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        enc_tokens = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_tokens = torch.tensor(dec_input_tokens, dtype=torch.int64)
        enc_padding = self._padding(enc_num_padding_tokens)
        dec_padding = self._padding(dec_num_padding_tokens)

        # [SOS] tokens [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, enc_tokens, self.eos_token, enc_padding], dim=0)

        # [SOS] tokens [PAD]...
        decoder_input = torch.cat([self.sos_token, dec_tokens, dec_padding], dim=0)

        # tokens [EOS] [PAD]...  (the decoder input shifted left by one position)
        label = torch.cat([dec_tokens, self.eos_token, dec_padding], dim=0)

        # Double check the size of the tensors to make sure they are all seq_len long
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)), # (1, seq_len) & (1, seq_len, seq_len),
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def causal_mask(size):
    """Return a boolean mask of shape (1, size, size) for causal attention.

    Entry ``[0, i, j]`` is True exactly when ``j <= i``, i.e. on and below the
    main diagonal, so position ``i`` may only look at itself and earlier
    positions.
    """
    positions = torch.arange(size)
    query_idx = positions.unsqueeze(1)  # (size, 1): row index i
    key_idx = positions.unsqueeze(0)    # (1, size): column index j
    return (key_idx <= query_idx).unsqueeze(0)
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
def get_all_sentences(ds, lang):
    """Lazily yield the ``lang`` sentence of every translation record in ``ds``."""
    yield from (record['translation'][lang] for record in ds)
def get_or_build_tokenizer(config, ds, lang):
    """Return the word-level tokenizer for ``lang``, training it if necessary.

    The tokenizer file path comes from formatting ``config['tokenizer_file']``
    with ``lang``. If that file exists it is loaded; otherwise a new
    tokenizer is trained on the ``lang`` sentences of ``ds`` and saved to
    that path before being returned.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # Reuse the saved tokenizer when one is already on disk.
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    # Most code taken from: https://huggingface.co/docs/tokenizers/quicktour
    special_tokens = ["[UNK]", "[PAD]", "[SOS]", "[EOS]"]
    fresh_tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    fresh_tokenizer.pre_tokenizer = Whitespace()
    fresh_tokenizer.train_from_iterator(
        get_all_sentences(ds, lang),
        trainer=WordLevelTrainer(special_tokens=special_tokens, min_frequency=2),
    )
    fresh_tokenizer.save(str(tokenizer_path))
    return fresh_tokenizer

ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')

# Build tokenizers
tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])
tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config['lang_tgt'])

# Keep 90% for training, 10% for validation
train_ds_size = int(0.9 * len(ds_raw))
val_ds_size = len(ds_raw) - train_ds_size
# random_split draws from torch's RNG, which `random.seed(42)` does not
# affect; pass an explicitly seeded generator so the train/validation split
# is the same on every run.
train_ds_raw, val_ds_raw = random_split(
    ds_raw,
    [train_ds_size, val_ds_size],
    generator=torch.Generator().manual_seed(42),
)

train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

# Find the maximum length of each sentence in the source and target sentence
# (useful for checking that config['seq_len'] is large enough).
max_len_src = 0
max_len_tgt = 0

for item in ds_raw:
    src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
    tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
    max_len_src = max(max_len_src, len(src_ids))
    max_len_tgt = max(max_len_tgt, len(tgt_ids))

print(f'Max length of source sentence: {max_len_src}')
print(f'Max length of target sentence: {max_len_tgt}')


train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
# Validation uses batch_size=1 so that one sentence is handled at a time.
val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

train_dataloader

Using Device: cpu
Max length of source sentence: 309
Max length of target sentence: 274


<torch.utils.data.dataloader.DataLoader at 0x247e66e3ce0>

In [23]:
# How to validate:
# validation with a single example:
#---------------------------------------------
def causal_mask(size):
    """Return a (1, size, size) boolean mask that hides future positions.

    The strictly upper triangle (column index greater than row index) is
    False; the diagonal and everything below it are True.
    """
    future_positions = torch.ones((1, size, size)).triu(diagonal=1)
    return future_positions.logical_not()



# Take a single batch (one sentence, since the validation loader uses batch_size=1).
batch = next(iter(val_dataloader))

# Keep only the first 25 positions to match the sequence length the model was built with.
encoder_input = batch['encoder_input'][:, :25]
encoder_mask = batch['encoder_mask'][:,:,:, :25]
source_text = batch["src_text"][0]
target_text = batch["tgt_text"][0]

print(f"{encoder_input.shape}")
print(f"{encoder_mask.shape}")
print(f"{source_text}")
print(f"{target_text}")
print(f"variable decoder mask:------------>")
# Show how the causal mask grows with the number of tokens decoded so far.
for i in range(5):
    decoder_mask = causal_mask(i).type_as(encoder_input).to(device)
    print(decoder_mask)


sos_idx = tokenizer_tgt.token_to_id('[SOS]')
eos_idx = tokenizer_tgt.token_to_id('[EOS]')

encoder_output = model.encode(encoder_input, encoder_mask)
# Start decoding from a lone [SOS] token. This must be created BEFORE the
# mask: the mask size is derived from it, and computing the mask first would
# silently reuse a stale `decoder_input` left over from an earlier cell (or
# raise NameError on a fresh kernel).
decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(encoder_input)
decoder_mask = causal_mask(decoder_input.size(1)).type_as(encoder_input).to(device)
 




torch.Size([1, 25])
torch.Size([1, 1, 1, 25])
'What is it?' he asked drily. 'We are busy.'
— Che vuoi? — le disse asciutto. — Siamo occupati.
variable decoder mask:------------>
tensor([], size=(1, 0, 0), dtype=torch.int64)
tensor([[[1]]])
tensor([[[1, 0],
         [1, 1]]])
tensor([[[1, 0, 0],
         [1, 1, 0],
         [1, 1, 1]]])
tensor([[[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 1, 0],
         [1, 1, 1, 1]]])


In [24]:
# calculate output
# One greedy decoding step: score the next token, pick the arg-max and append
# it to the decoder input.

# Rebuild the mask from the current decoder input so the cell also works when
# it is re-run to decode further tokens (the input grows by one each time).
decoder_mask = causal_mask(decoder_input.size(1)).type_as(encoder_input).to(device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    out = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
    # Only the last position predicts the next token.
    prob = model.project(out[:, -1])

# A named variable is used instead of `_`, which IPython reserves for the last cell output.
max_value, next_word = torch.max(prob, dim = -1)

decoder_input = torch.cat(
            [decoder_input, torch.empty(1, 1).type_as(encoder_input).fill_(next_word.item()).to(device)], dim=1
        )



print(f"output shapes transform: {out.shape, out[:, -1].shape}")
print(f"{prob.shape= }")
print(f"max prob value: and its index: \n{max_value, next_word }") # tensor value and its index

print(f"extend decoder input by concat the generated nextword: \n{decoder_input}")




output shapes transform: (torch.Size([1, 1, 100]), torch.Size([1, 100]))
prob.shape= torch.Size([1, 22463])
max prob value: and its index: 
(tensor([2.0558], grad_fn=<MaxBackward0>), tensor([1495]))
extend decoder input by concat the generated nextword: 
tensor([[   2, 1495]])
