In [1]:
# Install necessary packages
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from tqdm import tqdm
from collections import Counter

# 1. Transformer Architecture by Hand

### Embeddings of the words
1) Words need to be turned into numerical vectors so their meaning can be understood by the model.
2) We also need to include positional encoding to keep track of the order and context of the words.

In [2]:
class InputEmbeddings(nn.Module):

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.embedding(x) * math.sqrt(self.d_model)

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)

### Layer Normalization
Applying normalization over a single sample, across all embeddings. The purpose is to achieve a mean equal to 0 and unit variance so the model converges faster and the vanishing/exploding gradient problem becomes less likely.

In [4]:
class LayerNormalization(nn.Module):

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias

### FeedForward Block
Each of the layers in encoder and decoder contains a fully connected feed-forward network, which is applied to each input tensor.

In [5]:
class FeedForwardBlock(nn.Module):

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

### Multi-Head Attention Block
Full implementation of it. This class will be used for both self-attention (in encoder) and cross-attention (decoder).

In [6]:
class MultiHeadAttentionBlock(nn.Module):

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.h = h
        self.d_k = d_model // h

        # obtaining the keys, queries and values matrices from the embeddings
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)

    def attention(self, query, key, value, mask=None):
        # getting the size of the last dimension (d_k)
        d_k = query.size(-1)

        # formula of attention
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        # making the values for the words which are not seen yet very low so after applying softmax it becomes 0 (needed in decoder part)
        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, -1e9)
        
        attention_probs = torch.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)
        
        return torch.matmul(attention_probs, value)
    
    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)

        # split the matrix tensor into h heads and changing dimensions
        def transform(x):
            return x.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        query = transform(self.w_q(q))
        key = transform(self.w_k(k))
        value = transform(self.w_v(v))
        
        # calculation of the final matrix with attention values
        attention_output = self.attention(query, key, value, mask)
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        
        return self.w_o(attention_output)

### Implementing Residual Connection
Adding the input back to the output, with some regularization and normalization.

In [7]:
class ResidualConnection(nn.Module):
    
    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.norm = LayerNormalization(features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

###  Implementing Encoder
At first defining an encoder block consisting of self-attention, feed-forward, and residual connections and then definind the whole the overall encoder composed of several such blocks.

In [8]:
class EncoderBlock(nn.Module):
    # One block of the encoder with attention and feed-forward layers.

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block  # Attention part
        self.feed_forward_block = feed_forward_block  # Feed-forward part
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])  # Shortcut connections

    def forward(self, x, src_mask):
        # Apply attention layer with shortcut
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        # Apply feed-forward layer with shortcut
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x


class Encoder(nn.Module):
    # Full encoder made of multiple blocks.

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers  # All encoder blocks stacked together
        self.norm = LayerNormalization(features)  # Normalizing output

    def forward(self, x, mask):
        # Pass through all blocks one by one
        for layer in self.layers:
            x = layer(x, mask)
        # Normalize final output
        return self.norm(x)


### Implementing Decoder
At first defining a decoder block consisting of self-attention, cross-attention, feed-forward, and residual connections and then definind the whole  decoder composed of several such blocks.

In [9]:
class DecoderBlock(nn.Module):
    # One block of the decoder with self-attention, cross-attention, and feed-forward.

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block  # Decoder's self-attention
        self.cross_attention_block = cross_attention_block  # Attention over encoder output
        self.feed_forward_block = feed_forward_block  # Feed-forward part
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])  # Three shortcuts

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # Decoder looks at itself first
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        # Then it looks at encoder's output (cross-attention)
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        # Finally, applies feed-forward layer
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x


class Decoder(nn.Module):
    # Full decoder made of multiple decoder blocks.

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers  # All decoder blocks stacked together
        self.norm = LayerNormalization(features)  # Normalizing output

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # Pass through all blocks one by one
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        # Normalize final output
        return self.norm(x)


### Linear Layer

In [10]:
class LinearLayer(nn.Module):

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> None:
        return self.proj(x)

### Transformer
Implementation of the whole transformer architecture utilizing all the previosuly defined classes

In [11]:
class Transformer(nn.Module):

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, linear_layer: LinearLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.linear_layer = linear_layer

    def encode(self, src, src_mask):
        # (batch, seq_len, d_model)
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)
    
    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        # (batch, seq_len, d_model)
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    def project(self, x):
        # (batch, seq_len, vocab_size)
        return self.linear_layer(x)

In [12]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    # Create the embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # Create the positional encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)
    
    # Create the encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)

    # Create the decoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
        decoder_block = DecoderBlock(d_model, decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)
    
    # Create the encoder and decoder
    encoder = Encoder(d_model, nn.ModuleList(encoder_blocks))
    decoder = Decoder(d_model, nn.ModuleList(decoder_blocks))
    
    # Create the linear layer
    linear_layer = LinearLayer(d_model, tgt_vocab_size)
    
    # Create the transformer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos,linear_layer)
    
    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    
    return transformer

# 2. Training the Model
Using https://huggingface.co/docs/transformers/en/tasks/translation dataset for training the tranformer model for MT (en to fr) task. </br>
To prepare the data we need to take a dataset and a tokenizer, convert sentences into lists of numbers, make all sentences the same length by adding padding, and return the processed data as PyTorch tensors ready for training.

In [44]:
books = load_dataset("opus_books", "en-fr")
books = books["train"].train_test_split(test_size=0.2)

README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [66]:
# map words to their ID's
class SimpleTokenizer:
    def __init__(self, texts, max_vocab_size=20000):
        self.word2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2word = {0: "<pad>", 1: "<unk>"}

        word_counts = Counter(" ".join(texts).split())
        most_common = word_counts.most_common(max_vocab_size - len(self.word2idx))

        for idx, (word, _) in enumerate(most_common, len(self.word2idx)):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

        self.vocab_size = len(self.word2idx)

    def encode(self, text):
        return [self.word2idx.get(word, self.word2idx["<unk>"]) for word in text.split()]

    def decode(self, indices):
        return " ".join([self.idx2word.get(idx, "<unk>") for idx in indices])


def prepare_data(dataset, tokenizer, max_len):
    inputs = []
    targets = []

    for item in dataset:
        en_text = item['translation']['en']
        fr_text = item['translation']['fr']

        en_tokens = tokenizer.encode(en_text)[:max_len]
        fr_tokens = tokenizer.encode(fr_text)[:max_len]

        en_tokens = en_tokens + [0] * (max_len - len(en_tokens))
        fr_tokens = fr_tokens + [0] * (max_len - len(fr_tokens))

        inputs.append(en_tokens)
        targets.append(fr_tokens)

    return torch.tensor(inputs), torch.tensor(targets)

In [64]:
max_len = 64

train_subset = books['train'].select(range(5000))
test_subset = books['test'].select(range(1000))

train_texts = [item['translation']['en'] for item in train_subset] + [item['translation']['fr'] for item in train_subset]
test_texts = [item['translation']['en'] for item in test_subset] + [item['translation']['fr'] for item in test_subset]

simple_tokenizer = SimpleTokenizer(train_texts)

# building the vocabulary
train_inputs, train_targets = prepare_data(train_subset, simple_tokenizer, max_len)
test_inputs, test_targets = prepare_data(test_subset, simple_tokenizer, max_len)

# tokenizing and padding
train_dataset = torch.utils.data.TensorDataset(train_inputs, train_targets)
test_dataset = torch.utils.data.TensorDataset(test_inputs, test_targets)

# creating Datasets and DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=0, pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=0, pin_memory=True)

# setting model hyperparameters
src_vocab_size = simple_tokenizer.vocab_size
tgt_vocab_size = simple_tokenizer.vocab_size
src_seq_len = max_len
tgt_seq_len = max_len

# building the model
transformer = build_transformer(src_vocab_size, tgt_vocab_size, src_seq_len, tgt_seq_len)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer.to(device)

# setting loss and optim
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=1e-4)

In [55]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        src, tgt = batch
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        src_mask = torch.ones((src.shape[0], 1, 1, src.shape[1]), device=device)
        tgt_mask = torch.ones((tgt_input.shape[0], 1, 1, tgt_input.shape[1]), device=device)

        optimizer.zero_grad()

        encoder_output = transformer.encode(src, src_mask)
        decoder_output = transformer.decode(encoder_output, src_mask, tgt_input, tgt_mask)
        output = transformer.project(decoder_output)

        loss = criterion(output.reshape(-1, output.shape[-1]), tgt_output.reshape(-1))
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)


num_epochs = 5

for epoch in range(num_epochs):
    train_loss = train_epoch(transformer, train_dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}")

                                                                                

Epoch 1/5, Train Loss: 8.4263


                                                                                

Epoch 2/5, Train Loss: 6.9458


                                                                                

Epoch 3/5, Train Loss: 6.5374


                                                                                

Epoch 4/5, Train Loss: 6.4607


                                                                                

Epoch 5/5, Train Loss: 6.2382


