In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [2]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The model dimension ``d_model`` is split evenly over ``num_heads`` heads of
    width ``d_k = d_model // num_heads``. Queries, keys and values are linearly
    projected, attended per head, merged back and projected once more.

    Args:
        d_model: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Each head gets an equal slice of the model dimension.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model  # Model's dimension
        self.num_heads = num_heads  # Number of attention heads
        self.d_k = d_model // num_heads  # Width of each head's query/key/value

        # Linear layers for transforming inputs
        self.W_q = nn.Linear(d_model, d_model)  # Query transformation
        self.W_k = nn.Linear(d_model, d_model)  # Key transformation
        self.W_v = nn.Linear(d_model, d_model)  # Value transformation
        self.W_o = nn.Linear(d_model, d_model)  # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend ``Q`` over ``K`` and return the weighted sum of ``V``.

        Args:
            Q, K, V: Per-head tensors as produced by ``split_heads``, i.e.
                (batch, num_heads, seq_length, d_k).
            mask: Optional tensor broadcastable to the score matrix
                (batch, num_heads, query_length, key_length). Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (batch, num_heads, query_length, d_k).
        """
        # Similarity of every query with every key, scaled by sqrt(d_k).
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # A very negative score becomes a ~0 probability after softmax.
            # -1e9 is not representable in float16 (masked_fill would fail on
            # overflow), so clamp it to the dtype's most negative finite value.
            # For float32/float64 the fill value is still exactly -1e9.
            fill_value = max(-1e9, torch.finfo(attn_scores.dtype).min)
            attn_scores = attn_scores.masked_fill(mask == 0, fill_value)

        # The "attention matrix": (batch, num_heads, query_length, key_length),
        # each row summing to 1 over the key positions.
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Weighted sum of the values.
        return torch.matmul(attn_probs, V)

    def split_heads(self, x):
        """Reshape (batch, seq_length, d_model) to (batch, num_heads, seq_length, d_k)."""
        batch_size, seq_length, _ = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: back to (batch, seq_length, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        # transpose() yields a non-contiguous tensor, hence contiguous() before view().
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Run multi-head attention.

        Args:
            Q, K, V: Tensors of shape (batch, seq_length, d_model); the key and
                value sequence length may differ from the query length.
            mask: Optional mask forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape (batch, query_length, d_model).
        """
        # Project, then split into heads: (batch, num_heads, seq_length, d_k)
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Merge the heads and apply the output projection.
        return self.W_o(self.combine_heads(attn_output))

Learned Positional Embeddings: The Generative Pretrained Transformer (GPT) models, starting from GPT-1 and extending through GPT-2 and GPT-3, all use learned positional embeddings.
Description: These models learn the positional encodings as part of the training process and apply them to the input sequence to capture word-order dependencies dynamically.

In [3]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension of its input.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expand to the hidden width, then project back to the model width.
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [5]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    Even feature columns hold ``sin(position * frequency)`` and odd columns
    hold ``cos(position * frequency)``. The table is precomputed once for
    ``max_seq_length`` positions and stored as a (non-trainable) buffer.

    Args:
        d_model: Feature width of the embeddings (even or odd).
        max_seq_length: Number of positions to precompute.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_seq_length, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles to the number of cosine slots actually available.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Buffer: moves with .to()/.cuda() and is saved in the state_dict,
        # but is not a parameter. Shape (1, max_seq_length, d_model).
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``.

        Args:
            x: Tensor of shape (batch, seq_length, d_model) with
                ``seq_length <= max_seq_length``.
        """
        return x + self.pe[:, :x.size(1)]

In [6]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        d_model: Feature width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def _add_and_norm(self, norm, x, sublayer_output):
        """Residual connection around a sub-layer: ``norm(x + dropout(sublayer_output))``."""
        return norm(x + self.dropout(sublayer_output))

    def forward(self, x, mask):
        """Apply the block to ``x``; ``mask`` is handed to the self-attention."""
        # Queries, keys and values all come from the same sequence.
        attended = self.self_attn(x, x, x, mask)
        x = self._add_and_norm(self.norm1, x, attended)
        transformed = self.feed_forward(x)
        return self._add_and_norm(self.norm2, x, transformed)

In [6]:
import torch

def simulated_dropout(input, p=0.5, inplace=False):
    """Hand-written (inverted) dropout for illustration.

    Each element is zeroed independently with probability ``p``; the survivors
    are scaled by ``1 / (1 - p)``.

    Args:
        input: Tensor to apply dropout to.
        p: Probability of zeroing an element, in ``[0, 1]``.
        inplace: If True, modify ``input`` in place and return it.

    Returns:
        ``input`` itself when ``p == 0`` or ``input.requires_grad`` is False
        (used here as a stand-in for "not training"); otherwise the dropped-out
        tensor. With ``p == 1`` every element is zero.

    Raises:
        ValueError: If ``p`` is outside ``[0, 1]``.
    """
    if p < 0.0 or p > 1.0:
        raise ValueError("Dropout probability must be between 0 and 1, but got {}".format(p))

    # If not in training mode or p == 0, return the input unchanged
    if p == 0 or not input.requires_grad:
        return input

    if p == 1:
        # Everything is dropped. Handle this separately: the general path
        # would compute 0 / (1 - p) = 0 / 0 and fill the result with NaN.
        return input.mul_(0) if inplace else input * 0

    keep_prob = 1 - p

    # Bernoulli mask with the input's shape: 1 with probability keep_prob, else 0.
    mask = torch.bernoulli(torch.ones_like(input) * keep_prob)

    if inplace:
        input.mul_(mask)       # zero out the dropped elements
        input.div_(keep_prob)  # rescale the survivors
        return input

    # `*` is element-wise: a 0/1 mask selectively zeroes elements of the input.
    return input * mask / keep_prob

# Demo: run the hand-written dropout on a small random matrix.
# requires_grad must be True, otherwise simulated_dropout returns its input untouched.
input_tensor = torch.randn(5, 5).requires_grad_(True)
output_tensor = simulated_dropout(input_tensor, inplace=False, p=0.5)

print("Input Tensor:\n", input_tensor)
print("Output Tensor (with Dropout Applied):\n", output_tensor)


Input Tensor:
 tensor([[ 1.4356e+00, -5.3014e-01, -6.5391e-01, -5.8317e-03, -7.3512e-01],
        [ 1.4799e-01, -7.0493e-01, -2.0578e+00,  5.7831e-01,  2.6336e-01],
        [ 5.9107e-01,  2.0203e-01,  1.0797e+00,  7.6774e-01, -5.1349e-02],
        [ 1.0117e+00, -1.5447e+00,  3.2342e-01,  3.1134e+00,  3.4324e-01],
        [ 1.9893e-01,  1.2275e+00, -3.5685e-01, -7.8183e-04, -1.0179e-01]],
       requires_grad=True)
Output Tensor (with Dropout Applied):
 tensor([[ 2.8712, -0.0000, -1.3078, -0.0000, -0.0000],
        [ 0.0000, -0.0000, -4.1156,  0.0000,  0.5267],
        [ 0.0000,  0.0000,  2.1595,  1.5355, -0.0000],
        [ 2.0234, -0.0000,  0.0000,  6.2267,  0.0000],
        [ 0.0000,  2.4549, -0.7137, -0.0000, -0.0000]], grad_fn=<DivBackward0>)


In [7]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention over the
    encoder output, then feed-forward. Every sub-layer is followed by dropout,
    a residual connection and layer normalisation.

    Args:
        d_model: Feature width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def _add_and_norm(self, norm, x, sublayer_output):
        """Residual connection around a sub-layer: ``norm(x + dropout(sublayer_output))``."""
        return norm(x + self.dropout(sublayer_output))

    def forward(self, x, enc_output, src_mask, tgt_mask):
        """Apply the block.

        Args:
            x: Decoder-side input sequence.
            enc_output: Encoder output, used as keys and values in cross-attention.
            src_mask: Mask passed to the cross-attention.
            tgt_mask: Mask passed to the self-attention.
        """
        # 1) Self-attention over the decoder sequence.
        self_attended = self.self_attn(x, x, x, tgt_mask)
        x = self._add_and_norm(self.norm1, x, self_attended)

        # 2) Cross-attention: decoder queries, encoder keys/values.
        cross_attended = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self._add_and_norm(self.norm2, x, cross_attended)

        # 3) Position-wise feed-forward.
        transformed = self.feed_forward(x)
        return self._add_and_norm(self.norm3, x, transformed)

In [8]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer producing per-position vocabulary logits.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (width of the output logits).
        d_model: Embedding / feature width.
        num_heads: Attention heads per layer.
        num_layers: Number of encoder layers and of decoder layers.
        d_ff: Hidden width of the feed-forward sub-layers.
        max_seq_length: Longest sequence the positional encoding supports.
        dropout: Dropout probability.
        pad: Token id treated as padding when building masks.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout,pad=0):
        super(Transformer, self).__init__()
        # nn.Embedding is a lookup table from token id to a d_model-wide vector.
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.pad = pad

    # ------------------------------------------------------------------ masks
    def _make_src_mask(self, src):
        """Padding mask for the source: (batch, L) -> (batch, 1, 1, L), True at non-pad tokens."""
        return (src != self.pad).unsqueeze(1).unsqueeze(2)

    def _make_tgt_mask(self, tgt_x):
        """Combined padding + causal mask for the target: (batch, L) -> (batch, 1, L, L).

        Entry [b, 0, i, j] is True when target position i of sample b is not
        padding and j <= i, so a position never attends to later tokens.
        """
        # (batch, L) -> (batch, 1, L, 1): one flag per query position.
        pad_mask = (tgt_x != self.pad).unsqueeze(1).unsqueeze(3)

        seq_length = tgt_x.size(1)
        # Lower-triangular "no peek" mask. Built on the input's device so the
        # `&` below also works when the model and data live on a GPU.
        nopeak_mask = (
            1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt_x.device), diagonal=1)
        ).bool()
        return pad_mask & nopeak_mask

    def generate_mask(self, src, tgt_x):
        """Return ``(src_mask, tgt_mask)`` for a source / decoder-input pair.

        Shapes: ``src_mask`` is (batch, 1, 1, src_len) and ``tgt_mask`` is
        (batch, 1, tgt_len, tgt_len); both broadcast over the attention scores.
        """
        return self._make_src_mask(src), self._make_tgt_mask(tgt_x)

    # ---------------------------------------------------------------- helpers
    def _embed(self, embedding, tokens):
        """Token embedding + positional encoding + dropout."""
        return self.dropout(self.positional_encoding(embedding(tokens)))

    def _run_encoder_stack(self, src_embedded, src_mask):
        """Pass embedded source tokens through every encoder layer."""
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        return enc_output

    def _run_decoder_stack(self, tgt_embedded, enc_output, src_mask, tgt_mask):
        """Pass embedded target tokens through every decoder layer and project to logits."""
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        # Raw logits; no log_softmax here because nn.CrossEntropyLoss applies it.
        return self.fc(dec_output)

    # ------------------------------------------------------------- public API
    def forward(self, src, tgt_x):
        """Full teacher-forced pass.

        Args:
            src: Source token ids, (batch, src_len).
            tgt_x: Decoder input token ids, (batch, tgt_len).

        Returns:
            Logits of shape (batch, tgt_len, tgt_vocab_size).
        """
        src_mask, tgt_mask = self.generate_mask(src, tgt_x)

        src_embedded = self._embed(self.encoder_embedding, src)
        tgt_embedded = self._embed(self.decoder_embedding, tgt_x)

        enc_output = self._run_encoder_stack(src_embedded, src_mask)
        return self._run_decoder_stack(tgt_embedded, enc_output, src_mask, tgt_mask)

    def encoder(self, src):
        """Encode ``src`` (batch, src_len) on its own; used for step-by-step inference."""
        src_mask = self._make_src_mask(src)
        src_embedded = self._embed(self.encoder_embedding, src)
        return self._run_encoder_stack(src_embedded, src_mask)

    def decoder(self, enc_output, src_mask, tgt_x):
        """Decode ``tgt_x`` against a previously computed ``enc_output``.

        Args:
            enc_output: Result of ``encoder(src)``.
            src_mask: Source padding mask, (batch, 1, 1, src_len).
            tgt_x: Target token ids generated so far, (batch, tgt_len).

        Returns:
            Logits of shape (batch, tgt_len, tgt_vocab_size).
        """
        tgt_mask = self._make_tgt_mask(tgt_x)
        tgt_embedded = self._embed(self.decoder_embedding, tgt_x)
        return self._run_decoder_stack(tgt_embedded, enc_output, src_mask, tgt_mask)
              

In [9]:
class Batch:
    """Object for holding a batch of data during training.

    Every sequence is wrapped as ``[sos, tokens..., eos, pad]``.

    Attributes:
        src: Wrapped source ids, (batch, L + 3).
        tgt: Wrapped target ids, or None when no target was given.
        tgt_x: Decoder input, ``tgt`` without its last column (None without a target).
        tgt_y: Decoder target, ``tgt`` without its first column (None without a target).
        ntokens: Number of non-pad entries in ``tgt_y`` (0 without a target).
    """

    def __init__(self,  src, tgt, sos, eos, pad):
        """
        Args:
            src: Source token ids, (batch, L).
            tgt: Target token ids, (batch, L), or None for source-only batches.
            sos, eos, pad: Ids of the start, end and padding tokens.
        """
        # e.g. src = [1, 2, 3]  ->  self.src = [<sos>, 1, 2, 3, <eos>, <pad>]
        self.src = self._with_special_tokens(src, sos, eos, pad)

        if tgt is None:
            # The None check has to happen before tgt is touched; previously it
            # came after torch.cat and could never be reached with tgt=None.
            self.tgt = None
            self.tgt_x = None
            self.tgt_y = None
            self.ntokens = 0
            return

        # self.tgt = [<sos>, 1, 2, 3, <eos>, <pad>]
        self.tgt = self._with_special_tokens(tgt, sos, eos, pad)

        # Decoder input:  [<sos>, 1, 2, 3, <eos>]
        self.tgt_x = self.tgt[:, :-1]
        # Decoder target: [1, 2, 3, <eos>, <pad>]  (tgt_x shifted by one position)
        self.tgt_y = self.tgt[:, 1:]

        self.ntokens = (self.tgt_y != pad).sum()

    @staticmethod
    def _with_special_tokens(tokens, sos, eos, pad):
        """Return ``tokens`` with an sos column prepended and eos, pad columns appended."""
        batch_size = tokens.shape[0]

        def column(token_id):
            # Created on the same device as the data so torch.cat does not mix devices.
            return torch.tensor([token_id], device=tokens.device).repeat(batch_size, 1)

        return torch.cat([column(sos), tokens, column(eos), column(pad)], dim=1)
            
# Example Recap:
# Target sequence: [A, B, C, D]
# Step 1:
# Input: The decoder receives the <SOS> token.
# Prediction: The model predicts A (or some other token).
# Comparison: The model's prediction is compared with the actual token at time step 1, which is A in the target sequence.
# So, at step 1, the model's prediction is compared against the actual token A.


In [10]:
# src_vocab_size = 5000
# tgt_vocab_size = 5000
# d_model = 512
# num_heads = 8

# num_layers = 6

# d_ff = 2048  #MJ: 2048 = 512 * 4
# max_seq_length = 100
# dropout = 0.1

# transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)



In [11]:
# # Generate random sample data
# src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

In [12]:
# criterion = nn.CrossEntropyLoss(ignore_index=0)
# #MJ: The ignore_index argument is set to 0, meaning the loss will not consider targets
# # with an index of 0 (typically reserved for padding tokens).
# optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# transformer.train()

# for epoch in range(100):
#     optimizer.zero_grad()
#     #MJ: use a single batch 
#     tgt_data_x = tgt_data[:, :-1]
#     tgt_data_y = tgt_data[:, 1:]
#     output = transformer(src_data, tgt_data_x)
#     loss = criterion(output.contiguous().view(-1, tgt_vocab_size),tgt_data_y.contiguous().view(-1))
#     loss.backward()
#     optimizer.step()
#     print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

In [13]:
# transformer.eval()

# # Generate random sample validation data
# val_src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)
# val_tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))  # (batch_size, seq_length)

# with torch.no_grad():
#     val_tgt_data_x = val_tgt_data[:, :-1]
#     val_tgt_data_y = val_tgt_data[:, 1:]
#     val_output = transformer(val_src_data, val_tgt_data_x)
#     val_loss = criterion(val_output.contiguous().view(-1, tgt_vocab_size), val_tgt_data_y.contiguous().view(-1))
#     print(f"Validation Loss: {val_loss.item()}")

In [13]:

class DataIterator:
    """Re-iterable source of random batches for the src -> tgt copy task.

    Each pass over the object yields ``nbatches`` freshly sampled ``Batch``
    objects whose source and target hold the same random tokens.
    """

    def __init__(self, vocab_size, max_seq_length, batch_size, nbatches):
        self.vocab_size, self.max_seq_length = vocab_size, max_seq_length
        self.batch_size, self.nbatches = batch_size, nbatches

    def __len__(self):
        """Number of batches produced by one full iteration."""
        return self.nbatches

    def __iter__(self):
        """Generate the batches lazily, one per ``next()`` call."""
        pad, sos, eos = 0, 1, 2
        # max_seq_length counts <sos>, <eos> and <pad>; the payload is what is left.
        real_tokens_length = self.max_seq_length - 3
        payload_shape = (self.batch_size, real_tokens_length)

        for _ in range(self.nbatches):
            # Ids 0..2 are reserved for the special tokens, so sample from 3 upwards.
            data = torch.randint(3, self.vocab_size, size=payload_shape)
            # Independent copies so source and target never share storage.
            src, tgt = data.clone(), data.clone()
            yield Batch(src, tgt, sos, eos, pad)

# Example usage:
# iterator = DataIterator(vocab_size=11, max_seq_length=15, batch_size=80, nbatches=20)
# print(len(iterator))  # Will output 20
# for batch in iterator:
#     # Process batch
#     pass


In [11]:
# def data_iter_gen(vocab_size, max_seq_length, batch_size, nbatches): #bdata_iter_gen(V, batch_size, 20), nbatches = 20
#     "Generate random data for a src-tgt copy task."
#     pad = 0
#     sos = 1
#     eos = 2
   
        
#     for i in range(nbatches): #MJ: nbathces = 20; batch_size = 80
#         real_tokens_length = max_seq_length - 3 #MJ: max_seq_length includes <sos>, and <eos>, <pad>; real_tokens exlcude them
#         data = torch.randint(3, vocab_size, size=(batch_size, real_tokens_length)) #MJ: real tokens range from 2 to vocab_size-1, where toekn= 0,1,vocab_size
#         # corresponds to sos, eos, and padding token
#         #V = 11 and batch_size = 80: the function generates an 80xreal_tokens_length(=12)  filled with random integers between 1 and 11 (exclusive).
#         # data consits of numbers 3,4,...,vocab_size-1
#         src = data.requires_grad_(False).clone().detach()
#         tgt = data.requires_grad_(False).clone().detach()
#         #MJ: src == tgt at this point
       
#         yield Batch(src, tgt, sos, eos, pad) # MJ: return  (self.src,self.tgt_x, self.tgt_y), 
        

In [40]:
# --- Model hyper-parameters -------------------------------------------------
src_vocab_size = 11
tgt_vocab_size = 11
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 4 * d_model  # 2048
max_seq_length = 15
dropout = 0.1

pad = 0  # padding token id

# --- Model, loss, optimizer --------------------------------------------------
transformer = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    num_layers=num_layers,
    d_ff=d_ff,
    max_seq_length=max_seq_length,
    dropout=dropout,
    pad=pad,
)
# ignore_index=pad (= 0): target positions holding the padding token do not
# contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad)
optimizer = optim.Adam(transformer.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

# --- Data --------------------------------------------------------------------
batch_size = 80

nbatches = 100  # batches per training epoch
train_data_iter = DataIterator(
    vocab_size=src_vocab_size,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    nbatches=nbatches,
)

nbatches = 10  # batches per validation pass
val_data_iter = DataIterator(
    vocab_size=src_vocab_size,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    nbatches=nbatches,
)

# Note: train_data_iter / val_data_iter are iterable objects, not the data itself.
# The earlier generator function data_iter_gen(vocab_size, max_seq_length, batch_size, nbatches)
# returned a one-shot iterator. The DataIterator class used here is re-iterable instead:
# every `for batch in train_data_iter` calls __iter__, which starts a fresh generator.
# Batches can also be pulled one at a time with next() on iter(train_data_iter).

# Each next() call resumes the generator up to its yield statement, which produces
# a new Batch(src, tgt, sos, eos, pad) built from freshly sampled random tokens.
# Until iteration starts, no actual data is generated.

# yield vs. return: Unlike a return statement, which terminates a function and sends
# a value back, the yield statement pauses the function and allows it to resume 
# from where it left off when the next value is requested. 
# This enables the function to produce a sequence of values over time, 
# instead of computing them all at once.

# Generator Object: When you call a function that contains yield,
# the function does not execute immediately. Instead, it returns 
# a generator object (an iterator), which you can use to retrieve the values one at a time
# by iterating over it (e.g., using a for loop or the next() function).



best_valid_loss = float("inf")

# Special token ids. They do not change between epochs, so define them once
# here instead of re-binding them inside the loop.
pad = 0
sos = 1
eos = 2

for epoch in range(2):  #MJ: one epoch leaves the network under-trained; two is quite good, though not perfect.
    print(f"Epoch: {epoch+1}")

    # ------------------------------------------------------------ training
    transformer.train()
    train_epoch_loss = 0.0

    for batch in train_data_iter:
        src_data = batch.src
        tgt_data_x = batch.tgt_x  # decoder input:  [<sos>, tokens..., <eos>]
        tgt_data_y = batch.tgt_y  # decoder target: [tokens..., <eos>, <pad>]

        optimizer.zero_grad()

        # Teacher forcing: the decoder sees the target shifted right by one
        # (e.g. [<SOS>, A, B, C]) and must predict the next token at every
        # position (e.g. [A, B, C, D]).
        output = transformer(src_data, tgt_data_x)
        batch_train_loss = criterion(
            output.contiguous().view(-1, tgt_vocab_size),
            tgt_data_y.contiguous().view(-1),
        )
        batch_train_loss.backward()
        optimizer.step()

        # .item() extracts a plain float. Summing the loss tensors themselves
        # would keep every batch's autograd graph alive until the epoch ends.
        train_epoch_loss += batch_train_loss.item()

    epoch_avg_train_loss = train_epoch_loss / len(train_data_iter)
    print(f"Epoch: {epoch+1}, epoch_avg_train_loss: {epoch_avg_train_loss}")

    # ---------------------------------------------------------- validation
    transformer.eval()
    val_epoch_loss = 0.0

    with torch.no_grad():
        for batch in val_data_iter:
            src_data = batch.src
            tgt_data_x = batch.tgt_x
            tgt_data_y = batch.tgt_y

            output = transformer(src_data, tgt_data_x)
            batch_val_loss = criterion(
                output.contiguous().view(-1, tgt_vocab_size),
                tgt_data_y.contiguous().view(-1),
            )
            val_epoch_loss += batch_val_loss.item()

    epoch_avg_val_loss = val_epoch_loss / len(val_data_iter)
    print(f"Epoch: {epoch+1}, epoch_avg_val_loss: {epoch_avg_val_loss}")

    # Keep the checkpoint with the lowest validation loss seen so far.
    if epoch_avg_val_loss < best_valid_loss:
        best_valid_loss = epoch_avg_val_loss
        print(f"Epoch: {epoch+1}, best_valid_loss: { best_valid_loss}")
        torch.save(transformer.state_dict(), "tut-transformer.pt")
      
      
#for epoch in range(1000)
  

Epoch: 1
Epoch: 1, epoch_avg_train_loss: 1.2848767042160034
Epoch: 1, epoch_avg_val_loss: 0.18356886506080627
Epoch: 1, best_valid_loss: 0.18356886506080627
Epoch: 2
Epoch: 2, epoch_avg_train_loss: 0.0895015150308609
Epoch: 2, epoch_avg_val_loss: 0.0021430221386253834
Epoch: 2, best_valid_loss: 0.0021430221386253834


In [None]:
# # Original target sequence
# x = ['A', 'B', 'C', 'D']

# # Special start token
# sos_token = '<SOS>'

# # Right-shift the sequence
# x_shifted = [sos_token] + x[:-1]

# print(x_shifted)


In [21]:
def subsequent_mask(size):
    """Mask out subsequent positions.

    Returns a boolean tensor of shape (1, size, size) whose entry [0, i, j] is
    True when j <= i (position i may look at position j) and False for the
    strictly upper-triangular "future" positions.
    """
    # Ones strictly above the main diagonal mark the future positions ...
    future = torch.triu(torch.ones(1, size, size), diagonal=1)
    # ... and everything that is not "future" is allowed.
    return ~future.bool()

In [41]:
# Fresh random copy-task batches for the greedy-decoding check.
nbatches, batch_size = 10, 80
test_data_iter = DataIterator(
    vocab_size=src_vocab_size,
    max_seq_length=max_seq_length,
    batch_size=batch_size,
    nbatches=nbatches,
)

 def encoder(self, src):
        src_mask = (src != self.pad).unsqueeze(1).unsqueeze(2)  #MJ: crc: [1, L] => src_mask: [B, 1, 1, L]
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        enc_output = src_embedded
                    
        for enc_layer in self.encoder_layers:
                enc_output = enc_layer(enc_output, src_mask) 
        return enc_output 
         
    def decoder(self, enc_output, src_mask, tgt):

In [20]:
def greedy_decode(transformer_model, src, max_len, start_token, end_token):
    """Greedily decode one source sequence, token by token.

    The source is encoded once. The decoder is then re-run on the growing
    output prefix and the highest-scoring token at the last position is
    appended, until ``end_token`` is predicted or the output holds ``max_len``
    tokens.

    Args:
        transformer_model: Object exposing ``encoder(src)``,
            ``decoder(enc_output, src_mask, tgt_x)`` and a ``pad`` attribute.
        src: Source token ids of shape (1, src_len). Only a single sequence is
            supported: the stop test and ``.item()`` need exactly one
            predicted token per step.
        max_len: Maximum length of the returned sequence, start token included.
        start_token: Id the output sequence is seeded with.
        end_token: Id that stops decoding; it is not appended to the output.

    Returns:
        Tensor of shape (1, n) with ``1 <= n <= max_len``, starting with
        ``start_token``, with the dtype and device of ``src``.
    """
    # Inference only: never build autograd graphs, even when the caller did not
    # wrap the call in torch.no_grad() itself.
    with torch.no_grad():
        enc_output = transformer_model.encoder(src)
        # Source padding mask: (1, src_len) -> (1, 1, 1, src_len)
        src_mask = (src != transformer_model.pad).unsqueeze(1).unsqueeze(2)

        # Output so far: a (1, 1) tensor holding just the start token.
        ys = torch.full((1, 1), start_token, dtype=src.dtype, device=src.device)

        for _ in range(max_len - 1):
            # Logits for every position of the current prefix: (1, len(ys), vocab)
            out = transformer_model.decoder(enc_output, src_mask, ys)

            # Keep only the last position, (1, vocab), and pick its best token.
            last_logit = out[:, -1]
            _, next_word = torch.max(last_logit, dim=1)
            next_token = next_word.item()

            if next_token == end_token:
                return ys

            next_column = torch.full((1, 1), next_token, dtype=src.dtype, device=src.device)
            ys = torch.cat([ys, next_column], dim=1)

    return ys

In [43]:
#MJ: Inference
# Inference: reload the best checkpoint and greedy-decode every test sequence.
transformer.load_state_dict(torch.load("tut-transformer.pt"))
pad, sos, eos = 0, 1, 2

transformer.eval()
total_loss = 0
with torch.no_grad():
    for i, batch in enumerate(test_data_iter):
        for j in range(batch.src.size(0)):
            # Decode one sequence at a time: (L,) -> (1, L)
            src_data = batch.src[j].unsqueeze(0)
            tgt_data_y = batch.tgt_y[j].unsqueeze(0)

            decoded_seq = greedy_decode(transformer, src_data, max_seq_length, sos, eos)

            print(f"i,j={i,j}: source  seq={src_data}")
            print(f"i,j={i,j}: decoded seq={ decoded_seq}")

            # Copy task: compare the decoded ids with the source prefix of the
            # same length, using the mean squared difference of the token ids.
            src_content = src_data[0][: len(decoded_seq[0])]
            diff = decoded_seq[0].float() - src_content.float()
            loss = (diff * diff).mean()
            if loss > 0:
                print(f'***************************loss nonzero: i,j={i,j}:  loss={loss}')
            total_loss += loss

print(f'total loss={total_loss}')

i,j=(0, 0): source  seq=tensor([[ 1,  4,  3,  8,  3, 10,  6,  8,  9,  9,  9,  7,  4,  2,  0]])
i,j=(0, 0): decoded seq=tensor([[ 1,  4,  3,  8,  3, 10,  6,  8,  9,  9,  9,  7,  4]])
i,j=(0, 1): source  seq=tensor([[ 1,  8,  4,  3, 10,  3, 10,  5,  7,  5,  3,  9,  6,  2,  0]])
i,j=(0, 1): decoded seq=tensor([[ 1,  8,  4,  3, 10,  3, 10,  5,  7,  5,  3,  9,  6]])
i,j=(0, 2): source  seq=tensor([[ 1,  7,  9,  6,  9,  6,  6, 10,  4, 10, 10,  6, 10,  2,  0]])
i,j=(0, 2): decoded seq=tensor([[ 1,  7,  9,  6,  9,  6,  6, 10,  4, 10, 10,  6, 10]])
i,j=(0, 3): source  seq=tensor([[ 1, 10,  9,  6,  6,  7,  3,  9,  6,  4, 10,  6,  9,  2,  0]])
i,j=(0, 3): decoded seq=tensor([[ 1, 10,  9,  6,  6,  7,  3,  9,  6,  4, 10,  6,  9]])
i,j=(0, 4): source  seq=tensor([[ 1,  7,  5,  6,  4,  6,  5,  8,  5,  6,  8,  8, 10,  2,  0]])
i,j=(0, 4): decoded seq=tensor([[ 1,  7,  5,  6,  4,  6,  5,  8,  5,  6,  8,  8, 10]])
i,j=(0, 5): source  seq=tensor([[ 1, 10,  7,  3,  7,  4, 10,  3,  8,  6, 10,  7,  4,  2,  