# *Training a self-made transformer*

## Piotr Szyszka
### ***Lublin University of Technology***
### *Engineering and Data Analysis, 2024*
### *Advanced Machine Learning Methods*


This notebook is used to train the model.

# Necessary imports

In [1]:
import torch
import torch.nn as nn
import math

from torch.utils.tensorboard import SummaryWriter
import torchmetrics

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

from datasets import Dataset as DS



from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

from tqdm import tqdm




  from .autonotebook import tqdm as notebook_tqdm


# Model definition

In [2]:

###############################################################################
# Utility layers

class LayerNormalization(nn.Module):
    """
    Layer normalization over the last dimension with a learnable scale and shift.

    # Parameters:

    - `features (int)` - size of the last (normalized) dimension,

    - `eps (float)` - constant added to the standard deviation so the division stays finite.

    """

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        # learnable multiplicative (alpha) and additive (bias) terms, one per feature
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        # statistics are taken over the last axis and kept as size-1 axes for broadcasting
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)

        centered = x - mean
        scaled = self.alpha * centered
        return scaled / (std + self.eps) + self.bias


class ResidualConnection(nn.Module):
    """
    Residual (skip) connection that normalizes the input before the sublayer:
    `x + dropout(sublayer(norm(x)))`.

    # Parameters:

    - `features (int)` - size of the last dimension, passed to `LayerNormalization`,

    - `dropout (float)` - dropout factor applied to the sublayer output.

    """

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        # `sublayer` is any callable taking the normalized tensor
        normalized = self.norm(x)
        transformed = sublayer(normalized)
        return x + self.dropout(transformed)

# Input Embedding Layer
class InputEmbeddings(nn.Module):
    """
    Input embedding layer - the very first layer, mapping token ids to dense vectors.

    # Parameters:

    - `d_model (int)` - dimension of the model (output size of the embedding layer),

    - `vocab_size (int)` - number of entries in the embedding table.

    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # embeddings are scaled by sqrt(d_model), as in the original paper
        vectors = self.embedding(x)
        scale = math.sqrt(self.d_model)
        return vectors * scale

# 0.2) Positional encoding

class PositionalEncoding(nn.Module):
    """
    Positional Encoding layer - adds fixed sinusoidal information about the position
    of every token in the sequence, followed by dropout.

    # Parameters

    - `d_model (int)` - dimension of the model (output size of the embedding layer);
      both even and odd values are supported,

    - `seq_len (int)` - number of positions for which encodings are precomputed;
      inputs passed to `forward` must not be longer than this,

    - `dropout (float)` - dropout factor applied after adding the encoding.

    """
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # column vector of positions 0 .. seq_len - 1 -> (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # 1 / 10000 ** (2i / d_model), computed in log space -> (ceil(d_model / 2),)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        angles = position * div_term # (seq_len, ceil(d_model / 2))

        pe = torch.zeros(seq_len, d_model) # positional encoding matrix

        # even feature indices: sin(position / 10000 ** (2i / d_model))
        pe[:, 0::2] = torch.sin(angles)

        # odd feature indices: cos(position / 10000 ** (2i / d_model)).
        # For odd d_model there is one odd column fewer than even columns,
        # so the surplus angle column is dropped instead of raising a shape error.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # add batch dimension - (1, seq_len, d_model); a buffer is saved with the
        # module and moved by .to(device), but is not a trainable parameter
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # use only as many positions as the input has; buffers carry no gradient
        x = x + self.pe[:, :x.shape[1], :] # (batch, seq_len, d_model)
        return self.dropout(x)


# Multi-Head Attention
class MultiHeadAttentionBlock(nn.Module):
    """
    Multi-Head Attention block - scaled dot-product attention computed in `h` parallel heads.

    # Parameters:

    - `d_model (int)` - dimension of the model (output size of the embedding layer),

    - `h (int)` - number of attention heads; must divide `d_model`,

    - `dropout (float)` - dropout factor applied to the attention weights.

    """
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()

        assert d_model % h == 0, "Model dimension (d_model) must be divisible by number of attention heads (h)"

        self.d_model = d_model
        self.h = h
        self.head_dim = d_model // h # width of the slice handled by a single head

        # bias-free projections for queries, keys, values and the merged output
        self.w_q = nn.Linear(d_model, d_model, bias=False)
        self.w_k = nn.Linear(d_model, d_model, bias=False)
        self.w_v = nn.Linear(d_model, d_model, bias=False)
        self.w_o = nn.Linear(d_model, d_model, bias=False)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """
        Scaled dot-product attention.

        Returns a pair `(weighted values, attention weights)`. Positions where
        `mask == 0` receive a score of -1e9 before the softmax; `mask` and
        `dropout` may each be `None` to skip that step.
        """
        scale = math.sqrt(query.shape[-1])

        # (batch, h, seq_len, d_k) @ (batch, h, d_k, seq_len) --> (batch, h, seq_len, seq_len)
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / scale
        if mask is not None:
            # a very negative score becomes (numerically) zero weight after softmax
            attention_scores.masked_fill_(mask == 0, -1e9)

        attention_scores = torch.softmax(attention_scores, dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)

        # (batch, h, seq_len, seq_len) @ (batch, h, seq_len, d_k) --> (batch, h, seq_len, d_k)
        return torch.matmul(attention_scores, value), attention_scores

    def _split_heads(self, projected):
        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        per_head = projected.view(projected.shape[0], projected.shape[1], self.h, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, q, k, v, mask):
        # project the inputs, then hand every head its own slice of the features
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        # the attention weights are kept on the module for later inspection
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # merge the heads back together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(merged.shape[0], -1, self.h * self.head_dim)

        return self.w_o(merged)


# Feed Forward Block
class FeedForwardBlock(nn.Module):
    """
    Position-wise feed forward block (Linear -> ReLU -> Dropout -> Linear).

    # Parameters:

    - `d_model (int)` - model dimension (input and output size),

    - `d_ff (int)` - hidden (feed forward) dimension,

    - `dropout (float)` - dropout factor applied after the activation.

    """
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # expansion: w1 and b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # contraction: w2 and b2

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, d_ff)
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        # (batch, seq_len, d_ff) --> (batch, seq_len, d_model)
        return self.linear_2(hidden)


# Encoder block and encoder

class EncoderBlock(nn.Module):
    """
    Single encoder layer: self-attention followed by a feed forward block,
    each wrapped in its own residual connection.

    # Parameters:

    - `features (int)` - size of the last dimension, used by the residual connections,

    - `self_attention_block (MultiHeadAttentionBlock)` - attention over the encoder input,

    - `feed_forward_block (FeedForwardBlock)` - position-wise feed forward network,

    - `dropout (float)` - dropout factor of the residual connections.

    """
    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        def self_attention(normed):
            # queries, keys and values all come from the same tensor
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, feed_forward_residual = self.residual_connections
        x = attention_residual(x, self_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Encoder(nn.Module):
    """
    Stack of encoder layers followed by a final layer normalization.

    # Parameters:

    - `features (int)` - size of the last dimension, passed to `LayerNormalization`,

    - `layers (nn.ModuleList)` - layers applied in order; each is called as `layer(x, mask)`.

    """
    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

# Decoder block and decoder

class DecoderBlock(nn.Module):
    """
    Single decoder layer: masked self-attention, cross-attention over the encoder
    output and a feed forward block, each wrapped in its own residual connection.

    # Parameters:

    - `features (int)` - size of the last dimension, used by the residual connections,

    - `self_attention_block (MultiHeadAttentionBlock)` - attention over the decoder input,

    - `cross_attention_block (MultiHeadAttentionBlock)` - attention whose keys and values are the encoder output,

    - `feed_forward_block (FeedForwardBlock)` - position-wise feed forward network,

    - `dropout (float)` - dropout factor of the residual connections.

    """
    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        def self_attention(normed):
            # queries, keys and values all come from the decoder stream
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attention(normed):
            # queries come from the decoder, keys and values from the encoder
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        self_residual, cross_residual, feed_forward_residual = self.residual_connections
        x = self_residual(x, self_attention)
        x = cross_residual(x, cross_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Decoder(nn.Module):
    """
    Stack of decoder layers followed by a final layer normalization.

    # Parameters:

    - `features (int)` - size of the last dimension, passed to `LayerNormalization`,

    - `layers (nn.ModuleList)` - layers applied in order; each is called as
      `layer(x, encoder_output, src_mask, tgt_mask)`.

    """
    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)



# Projection layer
class ProjectionLayer(nn.Module):
    """
    Projection layer - linear map from the model dimension to vocabulary logits.

    # Parameters:

    - `d_model (int)` - dimension of the model (input size),

    - `vocab_size (int)` - number of output logits per position.

    """
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        # raw logits are returned; no softmax is applied here
        return self.proj(x)



class Transformer(nn.Module):
    """
    Encoder-decoder transformer assembled from already constructed parts.

    The model is driven through three explicit steps instead of a single
    `forward`: `encode` -> `decode` -> `project`.

    # Parameters:

    - `encoder (Encoder)`, `decoder (Decoder)` - the two layer stacks,

    - `src_embed`, `tgt_embed (InputEmbeddings)` - token embeddings for source and target,

    - `src_pos`, `tgt_pos (PositionalEncoding)` - positional encodings for source and target,

    - `projection_layer (ProjectionLayer)` - maps decoder output to vocabulary logits.

    """
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        # token ids -> embeddings -> + positions -> encoder stack: (batch, seq_len, d_model)
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # token ids -> embeddings -> + positions -> decoder stack: (batch, seq_len, d_model)
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        # (batch, seq_len, d_model) --> (batch, seq_len, vocab_size)
        logits = self.projection_layer(x)
        return logits



def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    """
    Assemble a `Transformer` with `N` encoder and `N` decoder layers.

    # Parameters:

    - `src_vocab_size`, `tgt_vocab_size (int)` - sizes of the source / target embedding tables,

    - `src_seq_len`, `tgt_seq_len (int)` - number of positions precomputed by the positional encodings,

    - `d_model (int)` - model dimension,

    - `N (int)` - number of layers in the encoder and in the decoder,

    - `h (int)` - number of attention heads,

    - `dropout (float)` - dropout factor used by every sub-module,

    - `d_ff (int)` - hidden size of the feed forward blocks.

    All parameters with more than one dimension are re-initialized with
    Xavier uniform initialization before the model is returned.
    """
    # embeddings first, then positional encodings - modules are created in a fixed
    # order so that a seeded run always produces the same initial weights
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # every layer gets its own attention and feed forward modules (no weight sharing)
    encoder_blocks = [
        EncoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]
    decoder_blocks = [
        DecoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),  # self-attention
            MultiHeadAttentionBlock(d_model, h, dropout),  # cross-attention
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # weight matrices get Xavier initialization; 1-D parameters keep their defaults
    for weight in transformer.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight)

    return transformer

# Dataset and tokenizer

In [3]:
def yield_sentences(ds):
    """Yield the 'Context' and then the 'Response' text of every item in `ds`."""
    for record in ds:
        yield from (record['Context'], record['Response'])

In [4]:
def create_tokenizer(ds: DS) -> Tokenizer:
    """
    Train a word-level tokenizer on every 'Context' and 'Response' text of `ds`.

    Text is split on whitespace only. The trainer is configured with the special
    tokens [UNK], [PAD], [SOS] and [EOS], `min_frequency = 2` and `vocab_size = 50_000`.
    """
    special_tokens = ['[UNK]', '[PAD]', '[SOS]', '[EOS]']
    word_level_trainer = WordLevelTrainer(special_tokens = special_tokens, min_frequency = 2, vocab_size = 50_000)

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = WhitespaceSplit() # split words based on whitespaces between

    tokenizer.train_from_iterator(yield_sentences(ds), trainer = word_level_trainer)
    return tokenizer

In [5]:
class MHDataset(Dataset):
    """
    Dataset turning ('Context', 'Response') text pairs into fixed-length tensors.

    # Parameters:

    - `ds` - indexable collection of items with 'Context' and 'Response' keys,

    - `tokenizer` - tokenizer providing `encode(text).ids` and `token_to_id(token)`,

    - `src_seq_len (int)` - length of every encoder input ([SOS] + tokens + [EOS] + padding),

    - `tgt_seq_len (int)` - length of every decoder input ([SOS] + tokens + padding)
      and of every label (tokens + [EOS] + padding).

    Texts that do not fit are truncated.
    """
    def __init__(self, ds: DS, tokenizer, src_seq_len, tgt_seq_len) -> None:
        super().__init__()
        self.ds = ds

        self.src_seq_len = src_seq_len
        self.tgt_seq_len = tgt_seq_len

        self.tokenizer = tokenizer

        # special tokens are kept as 1-element tensors so they can be concatenated directly
        self.sos_token, self.eos_token, self.pad_token = (
            torch.tensor([tokenizer.token_to_id(name)], dtype = torch.int64)
            for name in ('[SOS]', '[EOS]', '[PAD]')
        )

    def __len__(self):
        return len(self.ds)

    def _padding(self, count):
        # `count` copies of [PAD]; an empty tensor when nothing has to be padded
        return torch.tensor([self.pad_token] * count, dtype = torch.int64)

    def __getitem__(self, index):
        pair = self.ds[index]
        src_text, tgt_text = pair['Context'], pair['Response']

        # cut over-long texts: the encoder needs room for [SOS] and [EOS],
        # the decoder input for [SOS] (the label for [EOS])
        src_ids = self.tokenizer.encode(src_text).ids[:self.src_seq_len - 2]
        tgt_ids = self.tokenizer.encode(tgt_text).ids[:self.tgt_seq_len - 1]

        src_tokens = torch.tensor(src_ids, dtype = torch.int64)
        tgt_tokens = torch.tensor(tgt_ids, dtype = torch.int64)

        src_padding = self._padding(self.src_seq_len - len(src_ids) - 2)
        tgt_padding = self._padding(self.tgt_seq_len - len(tgt_ids) - 1)

        # [SOS] tokens [EOS] [PAD]...
        encoder_input = torch.cat([self.sos_token, src_tokens, self.eos_token, src_padding], dim = 0)
        # [SOS] tokens [PAD]...
        decoder_input = torch.cat([self.sos_token, tgt_tokens, tgt_padding], dim = 0)
        # tokens [EOS] [PAD]...  (the decoder input shifted left by one position)
        label = torch.cat([tgt_tokens, self.eos_token, tgt_padding], dim = 0)

        assert encoder_input.size(0) == self.src_seq_len
        assert decoder_input.size(0) == self.tgt_seq_len
        assert label.size(0) == self.tgt_seq_len

        # padding masks: 1 where a real token is present -> (1, 1, seq_len)
        encoder_mask = (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        padding_mask = (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        # combined with the positional mask from decoder_mask() -> (1, seq_len, seq_len)
        positional_mask = decoder_mask(decoder_input.size(0))

        return {'encoder_input': encoder_input,
                'decoder_input': decoder_input,
                'encoder_mask': encoder_mask,
                'decoder_mask': padding_mask & positional_mask,
                'label': label,
                'src_text': src_text,
                'tgt_text': tgt_text
                }


def decoder_mask(size: int) -> torch.Tensor:
    """
    Generates a causal mask for the decoder, preventing attention to subsequent positions.

    The mask is a lower triangular matrix (main diagonal included) of ones and zeros:
    position `i` may attend to positions `0 .. i` and to nothing after it. The diagonal
    offset must be 0 - including the first super-diagonal would let position `i` see
    token `i + 1`, which is exactly the label it is trained to predict.

    Args:
        size (int): The size of the mask, typically the length of the target sequence.

    Returns:
        torch.Tensor: An int32 mask of shape (1, size, size), where 1 marks positions
        that can be attended to and 0 marks positions that must be hidden.
    """
    return torch.tril(torch.ones(1, size, size), diagonal=0).type(torch.int)



def _max_token_count(dataset, tokenizer, column: str) -> int:
    """Return the largest number of token ids produced for `column` over `dataset` (0 if empty)."""
    return max((len(tokenizer.encode(item[column]).ids) for item in dataset), default=0)


def prepare_dataset(train_size: float, src_seq_len: int = None, tgt_seq_len: int = None, train_batch_size: int = 16, val_batch_size: int = 16):
    """
    Load the parquet dataset, train the tokenizer and build train / validation loaders.

    Args:
        train_size (float): Fraction of samples used for training; must lie in (0, 1).
        src_seq_len (int, optional): Fixed encoder sequence length. When None it is
            derived from the longest 'Context' in the dataset (+2 for [SOS] and [EOS]).
        tgt_seq_len (int, optional): Fixed decoder sequence length. When None it is
            derived from the longest 'Response' in the dataset (+1 for the single
            special token added to the decoder input / label).
        train_batch_size (int): Batch size of the training loader.
        val_batch_size (int): Batch size of the validation loader.

    Returns:
        tuple: (train_dataloader, val_dataloader, tokenizer)
    """
    assert train_size > 0 and train_size < 1, "Train size can't be outside (0; 1) range "

    print('Fetching dataset')

    dataset = DS.from_parquet('./MH_train.parquet')

    print('Dataset loaded successfully!')

    # NOTE: the tokenizer is trained on the whole dataset, validation part included
    tokenizer = create_tokenizer(dataset)

    print('Creating train-vali split')
    train_ds_size = int(train_size * len(dataset))
    val_ds_size = len(dataset) - train_ds_size

    print('Training samples:', train_ds_size)
    print('Validation samples', val_ds_size)

    train_ds, val_ds = random_split(dataset, [train_ds_size, val_ds_size])

    if src_seq_len is None:
        src_len = _max_token_count(dataset, tokenizer, 'Context')
        print('Max length of source:', src_len)
        src_len += 2 # [SOS] and [EOS] tokens
    else:
        src_len = src_seq_len

    if tgt_seq_len is None:
        tgt_len = _max_token_count(dataset, tokenizer, 'Response')
        print('Max length of target:', tgt_len)
        tgt_len += 1 # one special token: [SOS] in the decoder input, [EOS] in the label
    else:
        tgt_len = tgt_seq_len

    # pass the RESOLVED lengths: the raw arguments may be None, which would make
    # every length computation inside MHDataset fail
    train_ds = MHDataset(train_ds, tokenizer, src_len, tgt_len)
    val_ds = MHDataset(val_ds, tokenizer, src_len, tgt_len)

    train_dataloader = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=val_batch_size, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer

# Validation loop

In [6]:
def run_validation(model:Transformer, loss_fn, validation_loader:DataLoader, tokenizer:Tokenizer, device:torch.device, writer:SummaryWriter, global_step:int) -> None:
    """
    Evaluate `model` on the whole validation loader and log the mean loss.

    Args:
        model: Transformer to evaluate; it is switched to eval mode and left there.
        loss_fn: Callable taking (logits of shape (N, vocab_size), labels of shape (N,)).
        validation_loader: Loader yielding the batch dictionaries produced by the dataset
            ('encoder_input', 'decoder_input', 'encoder_mask', 'decoder_mask', 'label').
        tokenizer: Tokenizer, used only for its vocabulary size.
        device: Device the batch tensors are moved to.
        writer: TensorBoard writer receiving the 'val loss' scalar.
        global_step: Step under which the scalar is logged.

    A single value - the mean of the per-batch losses - is written per call, so one
    validation pass produces one point instead of many points sharing the same step.
    Nothing is logged when the loader is empty.
    """
    model.eval()

    vocab_size = tokenizer.get_vocab_size()
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        batch_iterator = tqdm(validation_loader, desc="Validation step")

        for batch in batch_iterator:
            encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            src_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            tgt_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)
            label = batch['label'].to(device) # (B, seq_len)

            encoder_output = model.encode(encoder_input, src_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, src_mask, decoder_input, tgt_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            # flatten batch and sequence axes so every position is one classification example
            batch_loss = loss_fn(proj_output.view(-1, vocab_size), label.view(-1)).item()

            total_loss += batch_loss
            num_batches += 1
            batch_iterator.set_postfix({"loss": f"{batch_loss:6.3f}"})

    if num_batches:
        writer.add_scalar('val loss', total_loss / num_batches, global_step)
        writer.flush()


# Training

In [7]:

# model_preload - path to a checkpoint file written by save_model; the loaded checkpoint is a dict with keys: ('model_state_dict', 'global_step', 'optimizer_state_dict')

def _print_prediction_sample(batch, proj_output, tokenizer) -> None:
    """Print source, reference and greedy (argmax) prediction for the first sample of a batch."""
    source_text = batch['src_text'][0]
    truth_text = batch['tgt_text'][0]

    pred_tokens = torch.argmax(proj_output, dim=-1)[0]
    pred_text = tokenizer.decode(pred_tokens.tolist())
    print('\n')
    print('*' * 80)
    print('Source: ', source_text)
    print('Truth: ', truth_text)
    print('Pred: ', pred_text)


def train_model(model:Transformer, num_epochs:int, device:torch.device, train_loader:DataLoader, val_loader:DataLoader, tokenizer:Tokenizer, loss_fn, optimizer:torch.optim.Optimizer, writer:SummaryWriter, model_preload:str, save_every_epoch: bool, train_decode_step: int = None):
    """
    Train `model` for `num_epochs` epochs, validating (and optionally saving) after each one.

    Args:
        model: Transformer to train.
        num_epochs: Number of passes over `train_loader`.
        device: Device the batches are moved to (and checkpoints are mapped to).
        train_loader: Loader with the training batches.
        val_loader: Loader with the validation batches.
        tokenizer: Tokenizer used for the vocabulary size and for decoding samples.
        loss_fn: Callable taking (logits of shape (N, vocab_size), labels of shape (N,)).
        optimizer: Optimizer updating the model parameters.
        writer: TensorBoard writer receiving the 'train loss' scalar.
        model_preload: Path to a checkpoint written by `save_model`, or a falsy value
            (None / '') to start from scratch.
        save_every_epoch: When True a checkpoint is written after every epoch.
        train_decode_step: When set to a positive number, a sample prediction is printed
            every `train_decode_step` batches; None or 0 disables the printing.

    `global_step` counts finished epochs: it is stored in checkpoints, used in the
    checkpoint file name and as the step of the validation loss. The training loss is
    logged once per batch under `global_step * len(train_loader) + batch index`, so
    every batch gets its own step.
    """
    print('Using device:', device)

    global_step = 0

    if model_preload:
        print(f'Preloading model {model_preload}')
        # map_location lets a checkpoint saved on GPU be resumed on a CPU-only machine
        state = torch.load(model_preload, map_location=device)
        # the counter lives in the loaded checkpoint, not in the path string
        global_step = state['global_step']
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])

    vocab_size = tokenizer.get_vocab_size()
    batches_per_epoch = len(train_loader)

    for epoch in range(num_epochs):
        torch.cuda.empty_cache()
        model.train() # run_validation leaves the model in eval mode

        batch_iterator = tqdm(train_loader, desc=f"Processing Epoch {epoch:02d}")

        for batch_idx, batch in enumerate(batch_iterator):

            encoder_input = batch['encoder_input'].to(device) # (B, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            src_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            tgt_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)
            label = batch['label'].to(device) # (B, seq_len)

            encoder_output = model.encode(encoder_input, src_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, src_mask, decoder_input, tgt_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)

            loss = loss_fn(proj_output.view(-1, vocab_size), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            writer.add_scalar('train loss', loss.item(), global_step * batches_per_epoch + batch_idx)
            writer.flush()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            # `train_decode_step` of None or 0 disables sample printing
            if train_decode_step and batch_idx % train_decode_step == 0:
                _print_prediction_sample(batch, proj_output, tokenizer)

        global_step += 1
        run_validation(model, loss_fn, val_loader, tokenizer, device, writer, global_step)

        if save_every_epoch:
            save_model(f'model_gep_{global_step}.pt', model, optimizer, global_step)



def save_model(filename, model, optimizer, global_step):
    """
    Write a training checkpoint to `filename`.

    The checkpoint is a dict with the keys 'model_state_dict',
    'optimizer_state_dict' and 'global_step'. Saving is best-effort: any
    exception is printed and swallowed so that a failed save does not stop training.
    """
    try:
        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }
        torch.save(checkpoint, filename)
    except Exception as error:
        print(error)





# Training the model!

In [8]:
# fixed sequence lengths passed to the dataset and to the positional encodings
MAX_SRC_LEN = 150  # encoder input: [SOS] + tokens + [EOS] + padding
MAX_TGT_LEN = 451  # decoder input: [SOS] + tokens + padding

# 90% of the samples are used for training, the rest for validation
train_dataloader, val_dataloader, tokenizer = prepare_dataset(
    train_size=0.9,
    src_seq_len=MAX_SRC_LEN,
    tgt_seq_len=MAX_TGT_LEN,
    train_batch_size=8,
)

Fetching dataset
Dataset loaded successfully!
Creating train-vali split
Training samples: 4475
Validation samples 498


In [9]:
# source and target texts share one tokenizer, hence one vocabulary size
src_vocab_size = tgt_vocab_size = tokenizer.get_vocab_size()


In [10]:
# a smaller model than the build_transformer defaults: 4 layers, d_model = 256
transformer = build_transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    src_seq_len=MAX_SRC_LEN,
    tgt_seq_len=MAX_TGT_LEN,
    d_model=256,
    N=4,
    h=8,
)


In [11]:
# bare expression: the notebook displays the module tree of the model built above
transformer

Transformer(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-3): 4 x EncoderBlock(
        (self_attention_block): MultiHeadAttentionBlock(
          (w_q): Linear(in_features=256, out_features=256, bias=False)
          (w_k): Linear(in_features=256, out_features=256, bias=False)
          (w_v): Linear(in_features=256, out_features=256, bias=False)
          (w_o): Linear(in_features=256, out_features=256, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward_block): FeedForwardBlock(
          (linear_1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear_2): Linear(in_features=2048, out_features=256, bias=True)
        )
        (residual_connections): ModuleList(
          (0-1): 2 x ResidualConnection(
            (dropout): Dropout(p=0.1, inplace=False)
            (norm): LayerNormalization()
          )
        )
      )
    )
    (norm): LayerNormalization

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir runs

In [12]:
num_epochs = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# [PAD] positions are excluded from the loss; label smoothing of 0.1 is applied
loss_fn = nn.CrossEntropyLoss(
    ignore_index=tokenizer.token_to_id('[PAD]'),
    label_smoothing=0.1,
).to(device)
torch.cuda.empty_cache()
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-3)

# TensorBoard event files are written below runs/
writer = SummaryWriter('runs/mhmodel')

transformer = transformer.to(device)

# start from scratch (no checkpoint), save after every epoch,
# print a sample prediction every 100 batches
train_model(
    model=transformer,
    num_epochs=num_epochs,
    device=device,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    tokenizer=tokenizer,
    loss_fn=loss_fn,
    optimizer=optimizer,
    writer=writer,
    model_preload=None,
    save_every_epoch=True,
    train_decode_step=100,
)


Using device: cpu


Processing Epoch 00:   0%|          | 1/560 [00:05<51:08,  5.49s/it, loss=9.939]



********************************************************************************
Source:  I was the one who ended it, and I'm so glad I did. It was the best decision I made in my life. But how do I stop the nightmares and flashbacks? It is creating a wall in my current relationship.
Truth:  From what you describe about yourself, I agree with you that ending your former relationship was a very wise decision.The nightmares and flashbacks show that you were deeply affected emotionally and on the foundations of your basic nature.The way for these to stop is by the slow process of realizing how badly injured and frightened you were of your former partner.Once you've stabilized yourself by accepting the tremendous harshness that was part of the former relationship, then the nightmares and flashbacks will disappear gradually usually, maybe all at once.There is a possibility too that your former relationship connected with being emotionally ignored, abandoned, treated harshly during your tim

Processing Epoch 00:   2%|▎         | 14/560 [01:24<54:46,  6.02s/it, loss=7.631]


KeyboardInterrupt: 