In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

#############################

import os
from os.path import exists

from torch.nn.functional import log_softmax, pad
# import math
# import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets


import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The query, key and value inputs are each projected with their own linear
    layer, split into ``num_heads`` heads of size ``d_k = d_model // num_heads``,
    attended independently, concatenated back together and passed through a
    final output projection.

    Args:
        d_model: Size of the model (feature) dimension. Must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Each head gets an equal slice of the model dimension.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model  # Model's dimension
        self.num_heads = num_heads  # Number of attention heads
        self.d_k = d_model // num_heads  # Per-head dimension of query/key/value

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model)  # Query transformation
        self.W_k = nn.Linear(d_model, d_model)  # Key transformation
        self.W_v = nn.Linear(d_model, d_model)  # Value transformation
        self.W_o = nn.Linear(d_model, d_model)  # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over ``V`` using softmax(Q K^T / sqrt(d_k)) as weights.

        Args:
            Q, K, V: Tensors whose last two dimensions are (length, d_k).
            mask: Optional tensor broadcastable to the score tensor. Positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            The attention-weighted combination of ``V``.
        """
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Use the most negative value representable in the score dtype
            # instead of a hard-coded -1e9: the literal overflows float16 and
            # makes masked_fill raise under half precision. In float32/float64
            # the softmax result is unchanged (masked weights are exactly 0,
            # and a fully masked row still becomes a uniform distribution).
            fill_value = torch.finfo(attn_scores.dtype).min
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        # Normalize over the key dimension to get attention probabilities.
        attn_probs = torch.softmax(attn_scores, dim=-1)

        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: (batch, num_heads, seq, d_k) -> (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() yields a non-contiguous tensor; view() needs contiguous memory.
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Run multi-head attention.

        Args:
            Q, K, V: Tensors of shape (batch, seq, d_model); K and V share a
                sequence length, which may differ from that of Q.
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch, query_seq, d_model).
        """
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [3]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Expands features from ``d_model`` to ``d_ff``, applies ReLU, then projects
    back down to ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expansion
        self.fc2 = nn.Linear(d_ff, d_model)  # projection back to model size
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [4]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature indices hold ``sin(pos * freq)`` and odd indices hold
    ``cos(pos * freq)``. The table is precomputed for ``max_seq_length``
    positions and stored as a buffer, so it follows the module across devices
    and is saved with it, but is not a trainable parameter.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        # (max_seq_length, ceil(d_model / 2)) matrix of position * frequency.
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the cosine term to fit; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension of 1 so the table broadcasts over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional encodings for the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Embedded tokens of shape (batch, seq, d_model).

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Add the positional embedding vectors to the word embedding vectors.
        return x + self.pe[:, :x.size(1)]

In [5]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is passed to the self-attention."""
        # Sub-layer 1: self-attention with residual connection and layer norm.
        attended = self.self_attn(x, x, x, mask)
        after_attn = self.norm1(x + self.dropout(attended))

        # Sub-layer 2: position-wise feed-forward with residual and layer norm.
        transformed = self.feed_forward(after_attn)
        return self.norm2(after_attn + self.dropout(transformed))

In [6]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped as
    ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Apply the block to decoder states ``x``.

        ``tgt_mask`` is used for the self-attention over ``x``; ``src_mask`` is
        used for the cross-attention, where queries come from ``x`` and keys
        and values come from ``enc_output``.
        """
        # Sub-layer 1: self-attention over the decoder states.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        state = self.norm1(x + self.dropout(self_attended))

        # Sub-layer 2: attend from the decoder states to the encoder output.
        cross_attended = self.cross_attn(state, enc_output, enc_output, src_mask)
        state = self.norm2(state + self.dropout(cross_attended))

        # Sub-layer 3: position-wise feed-forward.
        transformed = self.feed_forward(state)
        return self.norm3(state + self.dropout(transformed))

Yes, that's a great analogy! A stack of Transformer blocks is indeed analogous to a stack of convolutional layers in the way they both progressively transform the input data and extract more abstract features at each layer, though they operate on different types of data and use different mechanisms.

Here’s a breakdown of the analogy:

1. Input and Output:
Transformer Stack: Takes a sequence of tokens as input and outputs a sequence of contextually enhanced representations (feature vectors), where each token is represented by a vector that captures its contextual relationships with other tokens.
Convolutional Layers: Takes an image (2D grid of pixels) as input and outputs feature maps, where each map highlights certain features (like edges, textures, etc.) extracted from the image.
2. Transformation Process:
Transformer Stack: Each block in the stack applies self-attention (to model relationships between tokens) and feed-forward networks (to refine the features). As you go deeper in the stack, the model learns more abstract relationships between the tokens in the sequence.
Convolutional Stack: Each layer applies convolutions (to capture local spatial relationships between pixels) and non-linear activations. Deeper layers learn more abstract and complex features, such as detecting objects or shapes from lower-level features like edges.
3. Feature Representation:
Transformers: The stack produces a sequence of feature vectors, where each vector corresponds to a token in the input sequence and represents a combination of its semantic and contextual meaning.
Convolutional Layers: The stack produces a set of feature maps, where each map highlights different aspects of the image's structure or content (e.g., edges, textures, or complex patterns).
4. Hierarchical Learning:
In both architectures, earlier layers capture low-level information, while deeper layers capture higher-level, more abstract information:
Transformer: Earlier layers might focus on shorter-range dependencies, while deeper layers capture long-range dependencies and complex relationships between tokens.
Convolutions: Earlier layers might capture simple patterns like edges, while deeper layers identify more complex structures like objects.
Key Difference:
Spatial Relationships: Convolutions are designed to capture local spatial dependencies in images by focusing on nearby pixels, while Transformers capture global dependencies by allowing each token to attend to every other token in the sequence, regardless of their position.
Summary:
The analogy between a stack of Transformer blocks and a stack of convolutional layers works because both progressively transform the input to produce a more abstract and contextually enriched representation:

Transformers produce a sequence of feature vectors that encode the relationships between tokens.
Convolutional layers produce feature maps that highlight various aspects of an image.







In [7]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer producing per-position target-vocabulary logits.

    Token id 0 is treated as padding when the attention masks are built.

    Args:
        src_vocab_size: Number of entries in the source embedding table.
        tgt_vocab_size: Number of entries in the target embedding table; also
            the size of the output logits.
        d_model: Embedding / hidden dimension.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden size of the position-wise feed-forward networks.
        max_seq_length: Number of positions in the positional-encoding table.
        dropout: Dropout probability.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        """Build boolean attention masks from token ids (0 = padding).

        Args:
            src: Source token ids, shape (batch, src_len).
            tgt: Target token ids, shape (batch, tgt_len).

        Returns:
            ``(src_mask, tgt_mask)`` where ``src_mask`` has shape
            (batch, 1, 1, src_len) and is False at padded source positions, and
            ``tgt_mask`` has shape (batch, 1, tgt_len, tgt_len) and is True only
            where the query position is not padding and the key position is not
            in the future.
        """
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Lower-triangular "no peek" mask. It must be created on the same
        # device as the inputs, otherwise the `&` below fails as soon as the
        # model and data are moved off the CPU.
        nopeak_mask = (
            1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)
        ).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` against it, and return logits.

        Args:
            src: Source token ids, shape (batch, src_len).
            tgt: Target (decoder input) token ids, shape (batch, tgt_len).

        Returns:
            Logits of shape (batch, tgt_len, tgt_vocab_size).
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        # enc_output: (batch, src_len, d_model)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        # dec_output: (batch, tgt_len, d_model)

        # Project decoder states to target-vocabulary logits.
        output = self.fc(dec_output)
        return output  # (batch, tgt_len, tgt_vocab_size)

In [8]:
# Seed PyTorch's RNG so that weight initialisation and the random batches below
# are identical on every "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Model hyperparameters.
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data. Token ids are drawn from [1, vocab_size), so id 0
# (which the model treats as padding) never appears.
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

In [11]:
# Inspect the random source batch generated above (token ids, one row per sequence).
src_data


tensor([[1749, 2867, 4651,  ..., 4130,  514,  410],
        [ 236, 1746, 4941,  ..., 1872, 3643, 4555],
        [ 705, 3274, 3992,  ..., 2960, 2967, 4016],
        ...,
        [4503, 4600, 4048,  ..., 1497, 1449, 3994],
        [3688, 3933, 3771,  ..., 3511,  960, 4111],
        [2265,  688, 3635,  ..., 1579, 3889, 2557]])

In [9]:
# Targets equal to 0 (the id the model masks as padding) are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(
    transformer.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

transformer.train()

# Teacher forcing: the decoder is fed every target token except the last, and
# is scored against the same sequence shifted left by one position.
decoder_input = tgt_data[:, :-1]
expected_tokens = tgt_data[:, 1:].contiguous().view(-1)  # (batch * (seq_length - 1),)

# Each "epoch" here is a single optimisation step on the same fixed batch.
for epoch in range(100):
    optimizer.zero_grad()
    output = transformer(src_data, decoder_input)  # (batch, seq_length - 1, tgt_vocab_size)
    # Flatten batch and position so each row of logits pairs with one target id.
    logits = output.contiguous().view(-1, tgt_vocab_size)
    loss = criterion(logits, expected_tokens)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 8.689992904663086
Epoch: 2, Loss: 8.555461883544922
Epoch: 3, Loss: 8.484436988830566
Epoch: 4, Loss: 8.428950309753418
Epoch: 5, Loss: 8.371879577636719
Epoch: 6, Loss: 8.299415588378906
Epoch: 7, Loss: 8.21992301940918
Epoch: 8, Loss: 8.143165588378906
Epoch: 9, Loss: 8.055954933166504
Epoch: 10, Loss: 7.9785590171813965
Epoch: 11, Loss: 7.904333114624023
Epoch: 12, Loss: 7.811092376708984
Epoch: 13, Loss: 7.729271411895752
Epoch: 14, Loss: 7.6476054191589355
Epoch: 15, Loss: 7.56585693359375
Epoch: 16, Loss: 7.481439113616943
Epoch: 17, Loss: 7.398675441741943
Epoch: 18, Loss: 7.314747333526611
Epoch: 19, Loss: 7.2287092208862305
Epoch: 20, Loss: 7.145570755004883
Epoch: 21, Loss: 7.068298816680908
Epoch: 22, Loss: 6.998837471008301
Epoch: 23, Loss: 6.919703483581543
Epoch: 24, Loss: 6.847179889678955
Epoch: 25, Loss: 6.768188953399658
Epoch: 26, Loss: 6.692165374755859
Epoch: 27, Loss: 6.6215434074401855
Epoch: 28, Loss: 6.5354533195495605
Epoch: 29, Loss: 6.4737062

In [10]:
# Switch to evaluation mode (disables dropout).
transformer.eval()

# Generate random sample validation data
val_src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
val_tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

# No gradients are needed for evaluation.
with torch.no_grad():
    # Same input/target shift as in training: feed all but the last token and
    # score against all but the first.
    val_decoder_input = val_tgt_data[:, :-1]
    val_targets = val_tgt_data[:, 1:].contiguous().view(-1)

    val_output = transformer(val_src_data, val_decoder_input)
    val_logits = val_output.contiguous().view(-1, tgt_vocab_size)
    val_loss = criterion(val_logits, val_targets)
    print(f"Validation Loss: {val_loss.item()}")

Validation Loss: 8.823338508605957
