In [857]:
import torch
import torch.nn as nn
import math
import numpy as np

## 1. input embedding


In [858]:
# first we;ll be building input embeddings
# allows to convert token into embedding of dim 1x52  : token -> input ID(position in vocab) ->embedding


class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        """

        Args:
            d_model (int): dim of vector
            vocab_size (int): # of words in vocab
        """

        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        return self.embedding(x)


# Example usage
input_embeddings = InputEmbeddings(d_model=512, vocab_size=1000)
# Create an example input tensor (batch size 1, sequence length 5, embedding dimension 20)
batch_of_sentences = torch.tensor([[5, 6, 7, 0, 0]])  # Shape: (batch_size, max_sentence_length)
print(batch_of_sentences.shape)


# Pass through the embedding layer
# The forward method is called automatically when you use the instance like a function.
embedded_sentences = input_embeddings(batch_of_sentences)
embedded_sentences.shape, embedded_sentences  # (batch, seq_len, embedding dim)

torch.Size([1, 5])


(torch.Size([1, 5, 512]),
 tensor([[[-0.7521,  0.6727,  0.7883,  ...,  0.4614,  0.3725,  0.7242],
          [ 0.2382,  0.7201, -0.0123,  ..., -2.2650,  1.5683, -1.5002],
          [ 0.1547,  1.3250, -1.6600,  ...,  0.8833,  1.1661,  0.8756],
          [-0.6573,  0.0062, -0.2856,  ...,  1.6304, -0.2121, -0.0408],
          [-0.6573,  0.0062, -0.2856,  ...,  1.6304, -0.2121, -0.0408]]],
        grad_fn=<EmbeddingBackward0>))

In [859]:
vocab_size = 5
d_model = 6
nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

Embedding(5, 6)

In [860]:
input_embeddings = InputEmbeddings(d_model=512, vocab_size=1000)
x = torch.tensor([1,2,3,4])
print(x.shape)
input_embeddings(x).shape

torch.Size([4])


torch.Size([4, 512])

In [861]:
# batch
x = torch.tensor([[1,2,3,4],[5,6,7,8]])
print(x.shape)
input_embeddings(x).shape  #(batch_size, seq_len, d_min)

torch.Size([2, 4])


torch.Size([2, 4, 512])

## 2. positional encoding


In [862]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        """
                Since our model contains no recurrence and no convolution, in order for the model to make use of the
        order of the sequence, we must inject some information about the relative or absolute position of the
        tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the
        bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel
        as the embeddings, so that the two can be summed. There are many choices of positional encodings,
        learned and fixed [9].
        In this work, we use sine and cosine functions of different frequencies:
            `PE(pos,2i) = sin(pos/(10000)**2i/dmodel)`
            `PE(pos,2i+1) = cos(pos/(10000)**2i/dmodel)`
        where pos is the position and i is the dimension. That is, each dimension of the positional encoding
        corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We
        chose this function because we hypothesized it would allow the model to easily learn to attend by
        relative positions, since for any fixed offset k, P E(pos+k) can be represented as a linear function of
        PE(pos).

        Keyword arguments:
        dropout -- to make model less overfit
        seq_len -- Specifies the maximum length of sequence that the model can handle. This helps determine the scale and range of the positional encodings.
        """
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # positional encodeing shape: seq_len X d_model i.e. each token will be represented (1*d_model) vector

        """
        
        formula :`PE(pos,2i) = cos(pos/(10000)**2i/dmodel) for i=1,3,5, ...and `PE(pos,2i) = sin(pos/(10000)**2i/dmodel) for i=2,4,6, ...and `
        
        """

        #  Create a model of shape (seq_len , d_model)

        pe = torch.zeros(seq_len, d_model)
        #  create a vector of shape(seq_len,1) to represent position of word in sequence

        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)  # (seq_len,1)  # pos in formula
        # create denominator of formula
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # apply sin to even positions
        pe[:, 0::2] = torch.sin(position * div_term)

        # apply cos to odd positions
        pe[:, 1::2] = torch.cos(position * div_term)

        # now we need to add batch dimension to these sentences so we can apply it to whole sentences, so to all the batch of sentence, because weill have batch of sentences.
        # adding batch dim
        pe = pe.unsqueeze(0)  # (1, seq_len, d_model)

        # register this tensor in buffer of module  .. it is done for the tensor that you want to keep inside the module, not as a lerarned parameter but you want it to be saved when you save the file of the model
        # you should register it as a buffer. this way the tensor would be saved in file along with state of model
        self.register_buffer("pe", pe)  # This is typically used to register a buffer that should not to be considered a model parameter.
        
        
        """
        Say you have a linear layer nn.Linear. You already have weight and bias parameters. But if you need a new parameter you use register_parameter() to register a new named parameter that is a tensor.
        When you register a new parameter it will appear inside the module.parameters() iterator, but when you register a buffer it will not.
        The difference:
        Buffers are named tensors that do not update gradients at every step, like parameters. For buffers, you create your custom logic (fully up to you).
        """

    def forward(self, x):
        """
        we need to add positional encoding to every token/word inside sequence/sentence
        """
        print("x.shape:",x.shape)
        print("x.shape[1]:",x.shape[1])
        print("self.pe: ", self.pe.shape)
        print("self.pe[:, : x.shape[1], :]: ", self.pe[:, :, :].shape)
        print("\n\n\n\n")
        
        
        x = x + (self.pe[:, : x.shape[1], :]).requires_grad_(False)  # x:token and pe is positional encoding  # because we dont want to learn pe because these are fixed
        return self.dropout(x)


# Example usage
positional = PositionalEncoding(d_model=512, seq_len=15, dropout=0.5)

# Create an example input tensor (batch size , sequence length , embedding dimension )

# Apply positional encoding
positional_encoded = positional(embedded_sentences)
print("input ", embedded_sentences)

print("input shape", embedded_sentences.shape)
print("positional_encoded shape", positional_encoded.shape)

print(positional_encoded)  # (1, seq_len,d_model)

x.shape: torch.Size([1, 5, 512])
x.shape[1]: 5
self.pe:  torch.Size([1, 15, 512])
self.pe[:, : x.shape[1], :]:  torch.Size([1, 15, 512])





input  tensor([[[-0.7521,  0.6727,  0.7883,  ...,  0.4614,  0.3725,  0.7242],
         [ 0.2382,  0.7201, -0.0123,  ..., -2.2650,  1.5683, -1.5002],
         [ 0.1547,  1.3250, -1.6600,  ...,  0.8833,  1.1661,  0.8756],
         [-0.6573,  0.0062, -0.2856,  ...,  1.6304, -0.2121, -0.0408],
         [-0.6573,  0.0062, -0.2856,  ...,  1.6304, -0.2121, -0.0408]]],
       grad_fn=<EmbeddingBackward0>)
input shape torch.Size([1, 5, 512])
positional_encoded shape torch.Size([1, 5, 512])
tensor([[[-1.5043,  0.0000,  1.5765,  ...,  0.0000,  0.0000,  0.0000],
         [ 2.1593,  2.5208,  1.6191,  ..., -0.0000,  0.0000, -0.0000],
         [ 2.1280,  0.0000, -1.4472,  ...,  3.7665,  2.3326,  0.0000],
         [-1.0323, -0.0000, -0.0811,  ...,  5.2608, -0.4236,  0.0000],
         [-0.0000, -0.0000, -1.8856,  ...,  5.2608, -0.4234,  1.9184]]],
       grad_fn=

p


In [863]:
# create tensor
tensorx = torch.randn(10,10)
tensory = torch.randn(1, 10)
tensorx.shape, tensory

(torch.Size([10, 10]),
 tensor([[-0.2874, -0.8507, -0.6007, -0.5157,  0.3350,  1.0351,  1.4875, -0.3185,
          -2.7246,  1.7447]]))

In [864]:
tensorx[::2,:].shape, tensorx[::2,:]

(torch.Size([5, 10]),
 tensor([[-0.5988,  0.7209,  0.9964, -0.4112, -0.1477, -1.2911,  0.9932, -0.1335,
          -0.6351,  0.3323],
         [-0.8539,  1.2648,  0.1975, -0.5337,  2.0465,  0.7591,  0.0898, -0.6929,
          -0.2612,  0.7542],
         [-0.8129,  2.2606,  0.2895, -0.9672,  0.2092,  1.4318,  0.4218, -0.0687,
           0.7508,  1.6987],
         [-0.3953, -0.8902, -1.4158,  0.7205,  1.3755, -0.6518, -0.0923,  0.2495,
          -0.3197,  0.0689],
         [-0.1325, -0.1278,  0.7780, -0.8491, -0.2670, -0.5789, -0.0672, -0.3224,
          -1.9802, -1.6636]]))

In [865]:
tensorx[::2,::5].shape, tensorx[::2,::5]

(torch.Size([5, 2]),
 tensor([[-0.5988, -1.2911],
         [-0.8539,  0.7591],
         [-0.8129,  1.4318],
         [-0.3953, -0.6518],
         [-0.1325, -0.5789]]))

### 2.1 register_buffer


In [866]:
class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()

        # Register a buffer tensor with zeros
        self.register_buffer("buffer_tensor", torch.zeros(3, 3))

        # Register another buffer tensor with specific values
        data = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.float32)
        self.register_buffer("another_buffer", data)

    def forward(self, x):
        # Use the buffer tensors in the forward pass
        output = x + self.buffer_tensor
        return output


# Create an instance of MyModule
model = MyModule()

# Print the module to see its structure
print(model)

# Accessing the buffer tensors
print("Buffer tensor:")
print(model.buffer_tensor)

print("\nAnother buffer tensor:")
print(model.another_buffer)

MyModule()
Buffer tensor:
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])

Another buffer tensor:
tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])


Buffers in PyTorch are a key feature that allows you to store tensors that are not parameters of a model but are still important for its operation. Here's an in-depth look at buffers, including their purpose, usage, and how they differ from parameters.

## What are Buffers?

Buffers are named tensors that are registered within a PyTorch `nn.Module`. Unlike model parameters, which are learned during training (e.g., weights and biases), buffers are typically used to store state information that should be preserved across model saves and loads, but does not require gradients.

### Key Characteristics of Buffers

1. **Non-learnable**: Buffers do not get updated during the training process through backpropagation. They are not optimized by any learning algorithm.
2. **State Preservation**: Buffers are saved in the model's `state_dict`, allowing them to be restored when the model is loaded. This is useful for maintaining certain statistics or configurations that are necessary for the model's operation.

3. **Device Management**: Buffers are automatically moved to the appropriate device (CPU or GPU) when the model is transferred between devices using `.to()`, ensuring that they are always in the correct context for computations.

### Common Use Cases

Buffers are often used in scenarios where you need to maintain certain statistics or configurations that evolve over time but are not directly learned parameters. A common example is in batch normalization, where the running mean and variance are stored as buffers.


In [867]:
class RunningMean(nn.Module):
    def __init__(self, num_features, momentum=0.9):
        super().__init__()
        self.momentum = momentum
        self.register_buffer('mean', torch.zeros(num_features))

    def forward(self, x):
        # Update the running mean
        self.mean = self.momentum * self.mean + (1 - self.momentum) * x.mean(dim=0)
        return x


In this example, `mean` is a buffer that keeps track of the running average of the input features.

### Registering Buffers

You can register a buffer in a PyTorch module using the `register_buffer()` method. This method takes the name of the buffer and the tensor you want to register.

```python
self.register_buffer('buffer_name', tensor)
```

### Differences Between Buffers and Parameters

1. **Gradient Tracking**: Parameters are part of the model's learnable parameters and have gradients computed for them during training. Buffers do not have gradients and are not updated through optimization algorithms.

2. **Inclusion in `parameters()`**: When you call the `parameters()` method on a module, it returns only the parameters. Buffers will not be included in this list.

3. **Use Cases**: Buffers are typically used for maintaining state (like running statistics), while parameters are used for weights and biases that need to be learned from data.

### Practical Considerations

- **Memory Management**: Buffers can help manage memory by storing intermediate computations or states that are needed later without the overhead of learning them.
- **Checkpointing**: Buffers can be used in techniques like checkpointing, where you save the state of certain variables without saving the entire model, allowing for more efficient training and inference.

- **Integration with Other Features**: Buffers can be used alongside other PyTorch features like hooks, which allow for custom operations during the forward and backward passes of the model.


## parameters automatically get moved to gpu with .to(device) but tensor dont that why we have to register them as buffer

Yes, you are correct. When you move a PyTorch model to a device (e.g., a GPU) using .to(device), the model’s parameters are automatically moved to the specified device. However, regular tensors defined within the model are not automatically moved. To ensure that these tensors are moved along with the parameters, you register them as buffers.

### Why Use Buffers?

1. Automatic Device Management: Buffers are automatically moved to the specified device when you call .to(device) on the model.
2. Consistency: Buffers maintain consistency in terms of device allocation, ensuring all parts of the model (parameters and non-parameters) are on the same device.
3. Persistence: Buffers are saved and loaded along with the model’s parameters, making it easier to manage the model state.


In [868]:
import torch
import torch.nn as nn

class MyModule(nn.Module):
    def __init__(self):
        super(MyModule, self).__init__()
        self.param = nn.Parameter(torch.randn(3, 3))  # A learnable parameter
        self.buffer = torch.randn(3, 3)  # A regular tensor
        self.register_buffer('registered_buffer', torch.randn(3, 3))  # A registered buffer

    def forward(self, x):
        return x + self.param + self.registered_buffer

# Create an instance of the module
model = MyModule()

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Check device of model components
print("Parameter device:", model.param.device)
print("Regular tensor device (not moved):", model.buffer.device)
print("Buffer device:", model.registered_buffer.device)


Parameter device: cpu
Regular tensor device (not moved): cpu
Buffer device: cpu


## 3. Add & Norm - layer normalization

for each item in the batch, calculte mean & var, and normalize each item so that each has mean=0, and var of 1(z-standardization), Beta and Gamma are also learnt to minimize the data flactuation as having values between - and 1 might be too restrictive.

new xj = (xj -meanj) / math.sqrt(var\*\*2 + epsilon)

simplified version: `x = α * (x - μ) / (σ + ε) + β`

gamma(multiplication) and beta(addition) will be learnt after this. epsilon is for numericalsatability as if denominator gets very small, overall number would be difficult to manage percision wise.

![alt text](<Screenshot from 2024-07-22 14-45-21.png>)


In [869]:
class LayerNormalization(nn.Module):
    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps  # epsilon
        self.alpha = nn.Parameter(torch.ones(1))  # gamma  # mulltiplied
        self.bias = nn.Parameter(torch.zeros(1))  # added

    def forward(self, x):
        # print(x.shape)
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        # print("mean shape", mean.shape, mean)

        return self.alpha * (x - mean) / (std + self.eps) + self.bias


ln = LayerNormalization()

# print("Before normalization:")
# print(positional_encoded)

normalized = ln(positional_encoded)
# print("After normalization:")
print(normalized.shape)
normalized  # (1, seq_len,d_model)

torch.Size([1, 5, 512])


tensor([[[-1.2170, -0.2799,  0.7022,  ..., -0.2799, -0.2799, -0.2799],
         [ 1.0176,  1.2587,  0.6572,  ..., -0.4228, -0.4228, -0.4228],
         [ 1.0056, -0.3246, -1.2293,  ...,  2.0299,  1.1335, -0.3246],
         [-0.9406, -0.3104, -0.3599,  ...,  2.9010, -0.5690, -0.3104],
         [-0.2604, -0.2604, -1.4239,  ...,  2.9857, -0.5217,  0.9233]]],
       grad_fn=<AddBackward0>)

## 4. feed forward block

In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully
connected feed-forward network, which is applied to each position separately and identically. This
consists of two linear transformations with a ReLU activation in between.

`FFN(x) = max(0, xW1 + b1)W2 + b2 (2)` # two lyers with ReLu in between

While the linear transformations are the same across different positions, they use different parameters
from layer to layer. Another way of describing this is as two convolutions with kernel size 1.
The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality
dff = 2048.


In [870]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff, bias=True)  # first layer: w1,b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model, bias=True)  # second layer: w2,b2

    def forward(self, x):
        # input:(batch, seq_len, d_model)

        # after first layer: (batch, seq_len, d_ff)

        # after second layer: (batch, seq_len, d_model)

        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))


feedforwardblock = FeedForwardBlock(d_model=512, d_ff=2048, dropout=0.5)

print("Before feedforwardblock:")
print(normalized.shape, normalized)

feedforwarded = feedforwardblock(normalized)
# print("After normalization:")
print(feedforwarded.shape)
feedforwarded  # (1, seq_len,d_model)

Before feedforwardblock:
torch.Size([1, 5, 512]) tensor([[[-1.2170, -0.2799,  0.7022,  ..., -0.2799, -0.2799, -0.2799],
         [ 1.0176,  1.2587,  0.6572,  ..., -0.4228, -0.4228, -0.4228],
         [ 1.0056, -0.3246, -1.2293,  ...,  2.0299,  1.1335, -0.3246],
         [-0.9406, -0.3104, -0.3599,  ...,  2.9010, -0.5690, -0.3104],
         [-0.2604, -0.2604, -1.4239,  ...,  2.9857, -0.5217,  0.9233]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 5, 512])


tensor([[[ 0.0674, -0.1074, -0.2726,  ...,  0.5297,  0.0338,  0.2055],
         [ 0.1020,  0.0711, -0.2530,  ...,  0.3117,  0.4551, -0.1430],
         [ 0.0120,  0.6650,  0.6040,  ...,  0.4192,  0.4496,  0.1132],
         [ 0.3258,  0.3699,  0.0034,  ...,  0.2887,  0.1953, -0.0397],
         [ 0.3807, -0.1868, -0.1668,  ...,  1.1326,  0.4202, -0.3650]]],
       grad_fn=<ViewBackward0>)

In [871]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int=512, d_ff: int=2048, dropout: float=0.5) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff, bias=True)  # first layer: w1,b1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model, bias=True)  # second layer: w2,b2

    def forward(self, x):
        # input:(batch, seq_len, d_model)

        # after first layer: (batch, seq_len, d_ff)

        # after second layer: (batch, seq_len, d_model)

        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))


feedforwardblock = FeedForwardBlock()

print("Before feedforwardblock:")
print(normalized.shape, normalized)

feedforwarded = feedforwardblock(normalized)
# print("After normalization:")
print(feedforwarded.shape)
feedforwarded  # (1, seq_len,d_model)

Before feedforwardblock:
torch.Size([1, 5, 512]) tensor([[[-1.2170, -0.2799,  0.7022,  ..., -0.2799, -0.2799, -0.2799],
         [ 1.0176,  1.2587,  0.6572,  ..., -0.4228, -0.4228, -0.4228],
         [ 1.0056, -0.3246, -1.2293,  ...,  2.0299,  1.1335, -0.3246],
         [-0.9406, -0.3104, -0.3599,  ...,  2.9010, -0.5690, -0.3104],
         [-0.2604, -0.2604, -1.4239,  ...,  2.9857, -0.5217,  0.9233]]],
       grad_fn=<AddBackward0>)
torch.Size([1, 5, 512])


tensor([[[-0.3161, -0.4402,  0.0345,  ...,  0.0255,  0.1650, -0.2611],
         [-0.3515,  0.5928,  0.0686,  ...,  0.0961, -0.1577,  0.7342],
         [ 0.1095,  0.0188, -0.1056,  ..., -0.2353, -0.2274,  0.1106],
         [-0.2529,  0.0896,  0.3624,  ..., -0.1020,  0.0230,  0.6208],
         [-0.3263,  0.2230,  0.6840,  ..., -0.2624,  0.2756, -0.1731]]],
       grad_fn=<ViewBackward0>)

# 5. Multi-head attention

takes input:(seq_len, d_model) of encoder and uses it three times k:key, q:query, v:values. then we multiply these matrices with Wk, Wq and Wv respectively. resulting in K',Q',V' of same(seq_len, d_model) dim. Now,split each of K', Q' and V' into h parts along d_model(embedding) dim where h is number of head. So that each head will have access to full sentence but different part of embedding of each token.

Now, apply following formulas to each head which will result into h matrices of `(seq_len, d_k)` dims where `d_k` = `d_model/h`

$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
$$

$$
\text{head}_i = \text{Attention}(QW^Q_i, K W^K_i, V W^V_i)
$$

Now concatenate all heads,

$$
\text{MultiHead(Q, K, V)} = \text{Concatenate}(\text{head}_1, \text{head}_2, \ldots, \text{head}_h) W^o
$$

![alt text](MHA.png)

W^o is of `(seq, h*d_v)` shape where `d_v = d_k`

and resultant MH-A is `(seq_len, d_model)` same as input

But we also have to consider batch_dim for dealing with multiple sentences; the above intition works for single sentence.

`SO WE WILL CONSIDER BATCH DIMENSION.`

---

### MASK

before applying multiplying with V meaning

$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)
$$

we get a scaled dot profuct of q and k, its (seq_len, seq_len) matrix. this shows interaction of each words with each other word.

If we dont want some words to interact with other words, we basically replace there attention score(before applying softmax) with very small value, which means after softmax these values will become zero,so basically we hide attention between those two words.


In [872]:
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"
        self.d_k = d_model // h

        # define weight matrices
        self.wq = nn.Linear(d_model, d_model)  # wq
        self.wk = nn.Linear(d_model, d_model)  # wk
        self.wv = nn.Linear(d_model, d_model)  # wv

        # output matrix Wo (h*dv, d_model) where dv = dk
        self.wo = nn.Linear(d_model, d_model)  # wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod  # so we could cal fn wothout specifying class instance
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]  # last dim of query/key/value

        # (batch, h, seq_len, d_k) -> # (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)  # transpose(-2, -1): transpose last two dims

        # apply mask: just replace values you want to mask with very small values

        if mask is not None:
            attention_scores.masked_fill(mask == 0, -1e9)  # replace all values where mask==0 (conidtion is true) with -1e9

        # applying softmax
        attention_scores = attention_scores.softmax(dim=-1)  # (batch_size, h, seq_len,seq_len)

        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        # (batch, seq_len, d_model) -> (batch, seq_len, d_model)
        query = self.wq(q)
        print("q::::::::::::::::::::::::::::::::::::::::::::::::::::", q.shape)
        print("k::::::::::::::::::::::::::::::::::::::::::::::::::::", k.shape)
        print("v::::::::::::::::::::::::::::::::::::::::::::::::::::", v.shape)
        # ouput: q:::::::::::::::::::::::::::::::::::::::::::::::::::: torch.Size([8, 350, 512])

        key = self.wk(k)
        value = self.wv(v)

        # splitting
        # (batch, seq_len, d_model) -> (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)
        # we moved h dimension because we want each head to consider (seq_len, d_k)
        # each head considers full sentence but smaller embedding
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        # (batch, h, seq_len, d_k)
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query=query, key=key, value=value, mask=mask, dropout=self.dropout)

        # (batch, h, seq_len, d_k) -> (batch, seq_len, h, d_k)
        x = x.transpose(1, 2)

        # (batch, seq_len, d_model)
        x = x.contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # (batch, seq_len, d_model)
        return self.wo(x)

## 4,1 einsum


In [873]:
import torch

# Define query and key tensors with the specified shapes
query = torch.randn(8,8, 350, 512)
key = torch.randn(8,8, 350, 512)
value = torch.randn(8,8, 350, 512)

# Calculate using Einstein summation notation
attention_scores = torch.einsum('bhij,bhkj->bhik', query, key)

# print("Result shape:", attention_scores.shape)  # Should print (8, 350, 350)
# Calculate using Einstein summation notation
# result = torch.einsum('bij,bjk->bik', query, key)


# applying softmax
attention_scores = attention_scores.softmax(dim=-1)  # (batch_size, h, seq_len,seq_len)

print("attention_scores: ",attention_scores.shape)
print("key: ",key.shape)

result = torch.einsum('bhsj,bhsk->bhsk', attention_scores, value)

print("result: ",result.shape)

print("=====================")
resultmat = attention_scores @ key
# print("resultmat: ",resultmat[0][0][0][:5])




attention_scores:  torch.Size([8, 8, 350, 350])
key:  torch.Size([8, 8, 350, 512])
result:  torch.Size([8, 8, 350, 512])


In [874]:
import torch
import math

# Example dimensions
batch_size = 2
num_heads = 3
seq_len = 4
d_k = 5

# Random tensors
attention_scores = torch.randn(batch_size, num_heads, seq_len, seq_len)
key = torch.randn(batch_size, num_heads, seq_len, d_k)

# Compute result using torch.einsum
result = torch.einsum('bhsj,bhsk->bhsk', attention_scores, key)

# Compute result using @ operator
resultmat = attention_scores @ key

# Print a sample of the results to compare
print("result (einsum): ", result[0][0][0][:5])
print("=====================")
print("resultmat (matmul): ", resultmat[0][0][0][:5])


result (einsum):  tensor([-0.1943, -0.0143, -0.6272, -0.5324, -0.9999])
resultmat (matmul):  tensor([-0.3911, -0.9771,  0.0572, -0.1239, -1.1396])


## 5. Residual/skip connection

between add & Norm and previous layer


In [875]:
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):  # sublayer:previous layer
        """take x and combine with output of next layer"""

        return x + self.dropout(sublayer(self.norm(x)))

## 6. Encoderblock

![alt text](<Screenshot from 2024-07-22 14-45-21.png>)

it will contain one multi-head attention, two Add&Norm, one Feed forward block and two residual connections


In [876]:
class EncoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()

        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connection = nn.ModuleList([ResidualConnection(dropout=dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """
        src_mask: mask we want to apply to input of encoder. we need this to hide interaction of padding word with other words.

        """

        # multihead attention within

        x = self.residual_connection[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))

        """
        The lambda is used because self_attention_block needs four arguments (query, key, value, mask) 
        but ResidualConnection expects a function that takes only one argument.
        The lambda allows us to create a function that takes one argument x and expands it to the required four arguments, including the src_mask.
        """
        x = self.residual_connection[1](x, self.feed_forward_block)
        return x

## Encoder

is made up of many encoder

Each encoder block is repeated Nx times

![alt text](<Screenshot from 2024-07-22 14-45-21.png>)


In [877]:
class Encoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers

        self.norm = LayerNormalization()  # at end

    def forward(self, x, mask):
        # apply on layer after another # order matters
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

# Decoder

![alt text](<Screenshot from 2024-07-22 15-50-17.png>)


## output embeddings

output embeddings are same as input embeddings, so weill just intialize it twice

masked attention is some what same as self attenntion because of 3 same inputs while Mulihead attention block is actually cross attension bea=cause key and value are cping from encoder.


In [878]:
class DecoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block

        # we have three residual connections
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout=dropout) for _ in range(3)])
        self.dropout = dropout

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """


        Args:
            x: input of decoder
            src_mask: mask applied to encoder
            tgt_mask: target mask applied to decoder



        src_mask and tgt_mask because we are dealing with language transalation. SO, source language is English and target language is italian

        """

        # i. masked multihead attention: first residual connection
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))  # tgt_mask:becasue its decoder

        # ii. cross attention: second residual connection
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))

        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [879]:
class Decoder(nn.Module):
    """build decoder which is n times DecoderBlock one after anotherjust we did for encoder

    Args:
        nn (_type_): whic
    """

    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            # each layer is a decoderblock
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

# Projection/linear layer

![alt text](<Screenshot from 2024-07-23 10-50-34.png>)

output of multihead attention is (seq_len, d_model)

However we want to these words back into vocabularly which convert embedding to position in vocab


In [880]:
class ProjectionLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        """
        this is a linear layer that is converting from d_model to vocab_size

        """

        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # (batch, seq_len, d_model) --> (batch seq_len, vocab_size)
        # The purpose of applying softmax is to convert the raw output of the linear layer into a probability distribution over the vocabulary.
        #  we will also apply softmax, specifically log_softmax for mathematiacal stability

        return torch.log_softmax(self.proj(x), dim=-1)

In [881]:
class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        """
        we need source embedding and target embedding because we are dealing with multiple languages

        """

        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    # three methods, one to encoder, one to decode and one to project
    # Not creating single forward method because we can reuse output of encoder and to also visualize the attention

    def encode(self, src, src_mask):
        """_summary_

        Args:
            src (_type_): src of language
            src_mask (_type_): source mask
        """

        src = self.src_embed(src)
        src = self.src_pos(src)
        src = self.encoder(src, src_mask)
        return src

    def decode(self, encoder_ouput, src_mask, tgt, tgt_mask):
        print("decodig.............................................................................")
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_ouput, src_mask, tgt_mask)

    def project(self, x):
        return self.projection_layer(x)

In [882]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int = 512, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    """sumary_line

    we need vocab size of src and tgt so get info about how many vectors to be created
    Keyword arguments:

    N: number of input layers i.e. number of enccoder blocks and number of decoder blocks
    h: # of heads
    """

    """strcuture will be same across all tasks"""

    # create embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # positional encoding layers
    # one encoding layer wold be enough
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, src_seq_len, dropout)

    # create encoder blocks
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)

        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block)
    # create encoder blocks
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout)
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)

        decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block)

    # now create encoder and decoder
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # create projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    # create transfromer
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # initilize parameter to make trainig faster so they dont just strat with random values
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return transformer

In [883]:
# import torch
# import torch.nn as nn
# import math
# import numpy as np

# # 1 input embeddings


# class InputEmbeddings(nn.Module):
#     def __init__(
#         self,
#         d_model: int,
#         vocab_size: int,
#     ):
#         super().__init__()
#         self.vocab_size = vocab_size
#         self.d_model = d_model
#         self.embedding = nn.Embedding(self.vocab_size, self.d_model)

#     def forward(self, x):
#         print("\tsampleInputShape: (batch, seq_len) : ", x.shape)
#         x = self.embedding(x)
#         print("\tinputEmbeddingsReturnShape: (batch, seq_len, d_model) : ",
#               x.shape)
#         return x


# # Example usage
# input_embeddings = InputEmbeddings(d_model=512, vocab_size=1000)
# # Create an example input tensor (batch size 1, sequence length 5, embedding dimension 20)
# batch_of_sentences = torch.tensor(
#     [[5, 6, 7, 0, 0]])  # Shape: (batch_size, max_sentence_length)
# # print(batch_of_sentences.shape)

# # Pass through the embedding layer
# # The forward method is called automatically when you use the instance like a function.
# embedded_sentences = input_embeddings(batch_of_sentences)
# # print(
# #     "embedded_sentences.shape: ",
# #     embedded_sentences.shape,
# #     " embedded_sentences: ",
# #     embedded_sentences,
# # )  # (batch, seq_len, embedding dim)

# ######################################################################################################################


# class PositionalEncoding(nn.Module):
#     def __init__(self, d_model: int, seq_len: int, dropout: float):
#         super().__init__()
#         self.d_model = d_model
#         self.seq_len = seq_len
#         self.dropout = nn.Dropout(dropout)
#         # build matrix of (seq_len, d_model)
#         pe = torch.zeros(seq_len, d_model)

#         #  create a vector of shape (seq_len,1) to represent position of word in sequence
#         pos = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(
#             1)  # (seq_len,1)  # pos in formula

#         # create denominator
#         div_term = torch.exp(
#             torch.arange(0, d_model, 2).float() *
#             (-math.log(10000.0) / d_model))

#         # apply sin to even positions
#         pe[:, 0::2] = torch.sin(pos * div_term)

#         # apply cos to odd positions
#         pe[:, 1::2] = torch.cos(pos * div_term)

#         # print("pe.shape", pe.shape)
#         pe = pe.unsqueeze(0)  # (batch, seq_len, d_model)

#         self.register_buffer("pe", pe)

#     def forward(self, x):
#         """
#         we need to add positional encoding to every token/word inside sequence/sentence
#         """
#         x = x + self.pe[:, :x.shape[1], :]  #
#         # x:token and pe is positional encoding  # because we dont want to learn pe because these are fixed
#         print("\tpositionalEncodingReturnShape: (batch, seq_len, d_model) : ",
#               x.shape)

#         #  :x.shape[1]:selecting just # of tokens because of input sequence length.
#         return self.dropout(x)  # (batch, seq_len, d_model)


# # Example usage
# positional = PositionalEncoding(d_model=512, seq_len=35, dropout=0.5)
# # print("positional:", positional)

# # Create an example input tensor (batch size , sequence length , embedding dimension )

# # print("\n\n\n input to PE", embedded_sentences.shape)

# positional_encoded = positional(embedded_sentences)
# # print("input ", embedded_sentences)

# # print("input shape", embedded_sentences.shape)
# # print("positional_encoded shape", positional_encoded.shape)

# # print(positional_encoded)  # (1, seq_len,d_model)

# ######################################################################################################################


# class LayerNormalization(nn.Module):
#     def __init__(self, eps: float = 1e-6):
#         super().__init__()
#         self.eps = eps
#         self.alpha = nn.Parameter(torch.ones(1))  # gamma  # mulltiplied
#         self.bias = nn.Parameter(torch.zeros(1))  # added

#     def forward(self, x):
#         # print(x.shape)
#         mean = x.mean(dim=-1, keepdim=True)
#         std = x.std(dim=-1, keepdim=True)
#         # print("mean shape", mean.shape, mean)
#         normalized = self.alpha * (x - mean) / (std + self.eps) + self.bias

#         print("\tnormalizedReturnShape: (batch, seq_len, d_model) : ",
#               normalized.shape)

#         return normalized


# ln = LayerNormalization()

# # print("Before normalization:")
# # print(positional_encoded)

# normalized = ln(positional_encoded)
# # print("After normalization:")
# # print(normalized.shape)
# # normalized  # (1, seq_len,d_model)

# ##########################################################################################

# # feed forward network


# class FeedForwardBlock(nn.Module):
#     def __init__(self,
#                  d_model: int = 512,
#                  d_ff: int = 2048,
#                  dropout: float = 0.5):
#         super().__init__()
#         self.linear_1 = nn.Linear(d_model, d_ff, bias=True)
#         self.dropout = nn.Dropout(dropout)
#         self.linear_2 = nn.Linear(d_ff, d_model, bias=True)

#     def forward(self, x):
#         # print("\tbeforeFeedForwardReturnShape: (batch, seq_len, d_model) : ",
#         #       x.shape)

#         x = self.linear_2(self.dropout(torch.relu(self.linear_1(x))))
#         print("\tfeedForwardReturnShape: (batch, seq_len, d_model) : ",
#               x.shape)

#         return x


# feedforwardblock = FeedForwardBlock()

# feedforwarded = feedforwardblock(normalized)

# #############################################################################################

# # Multi


# class MultiHeadAttentionBlock(nn.Module):
#     def __init__(self, d_model: int, h: int, dropout: float):
#         super().__init__()
#         self.d_model = d_model
#         self.h = h
#         self.dropout = nn.Dropout(dropout)
#         assert d_model % h == 0, "d_model is not divisible by h"
#         self.d_k = d_model // h
#         attention_scores = None

#         # weight matrices
#         self.wq = nn.Linear(d_model, d_model)
#         self.wk = nn.Linear(d_model, d_model)
#         self.wv = nn.Linear(d_model, d_model)

#         # output matrix :Wo (h*dv, d_model)
#         self.wo = nn.Linear(d_model, d_model)

#     @staticmethod
#     def attention(query, key, value, mask, dropout: nn.Dropout):
#         d_k = query.shape[-1]  # extract embedding length

#         # (b,h,seq_len,dk)->b,h,seq_len,seq_len)
#         attention_scores = torch.einsum("bhij,bhkj->bhik", query,
#                                         key) / math.sqrt(d_k)

#         if mask is not None:
#             # replace all values with very small number so softwax will assign them 0 in output.
#             attention_scores = attention_scores.masked_fill(mask == 0, -1e9)

#         attention_scores = attention_scores.softmax(dim=-1)

#         if dropout is not None:
#             attention_scores = dropout(attention_scores)

#         # (b,h,seq_len, seq_len) -> (b,h, seq_len, d_k)
#         attention_scoresV = torch.einsum("bhsj, bhsk->bhsk", attention_scores,
#                                          value)

#         print(
#             "\t\tmultiHeadAttentionReturnShape: (batch,heads, seq_len, d_k) : ",
#             attention_scoresV.shape,
#         )

#         return (attention_scoresV, attention_scores)

#     def forward(self, q, k, v, mask):
#         query = self.wq(q)
#         value = self.wv(v)
#         key = self.wk(k)

#         query = query.view(query.shape[0], query.shape[1], self.h, self.d_k)
#         value = value.view(value.shape[0], value.shape[1], self.h, self.d_k)
#         key = key.view(key.shape[0], key.shape[1], self.h, self.d_k)

#         # swap seq_len and h so that we can consider each head as (seq_len, d_k)
#         # (batch, seq_len, h, d_k) -> (batch, h, seq_len, d_k)

#         query = query.permute(0, 2, 1, 3)
#         value = value.permute(0, 2, 1, 3)
#         key = key.permute(0, 2, 1, 3)

#         # (batch, h,seq_len, d_k)
#         x, self.attention_scores = MultiHeadAttentionBlock.attention(
#             query, key, value, mask, self.dropout)

#         # reverting shape permutation: (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k)
#         x = x.permute(0, 2, 1, 3)

#         # (batch, seq_len, h, d_k) -->(batch, seq_len, d_model)
#         x = x.contiguous().view(x.shape[0], -1, self.h * self.d_k)
#         x = self.wo(x)

#         print("\tmhaReturnShape: (batch, seq_len, d_model) : ", x.shape)

#         # (batch, seq_len, d_model
#         return x


# # Create an instance of MultiHeadAttentionBlock
# MHA = MultiHeadAttentionBlock(d_model=512, h=8, dropout=0.5)

# # Create random input tensors
# batch_size = 8
# seq_len = 10
# d_model = 512

# # Create random tensors for query, key, and value
# # Shape: (batch_size, seq_len, d_model)
# q = torch.randn(batch_size, seq_len, d_model)
# k = torch.randn(batch_size, seq_len, d_model)
# v = torch.randn(batch_size, seq_len, d_model)

# # No mask for this example
# mask = None

# # Call the MultiHeadAttentionBlock
# mha = MHA(q, k, v, mask)

# ##################################################


# class ResidualConnection(nn.Module):
#     def __init__(self, dropout: float):
#         super().__init__()
#         self.dropout = nn.Dropout(dropout)
#         self.norm = LayerNormalization()

#     def forward(self, x, sublayer):
#         # x is input
#         return x + self.dropout(sublayer(self.norm(x)))


# ###################################################


# class EncoderBlock(nn.Module):
#     def __init__(
#         self,
#         self_attention_block: MultiHeadAttentionBlock,
#         feed_forward_block: FeedForwardBlock,
#         dropout: float,
#     ):
#         super().__init__()
#         self.self_attention_block = self_attention_block  # bve
#         self.feed_forward_block = feed_forward_block
#         self.residual_connection = nn.ModuleList(
#             [ResidualConnection(dropout) for _ in range(2)])

#     def forward(self, x, src_mask):
#         x = self.residual_connection[0](
#             x, lambda x: self.self_attention_block(x, x, x, src_mask))
#         x = self.residual_connection[1](x, self.feed_forward_block)
#         print("\tencoderBlockReturnShape: (batch, seq_len, d_model) : ",
#               x.shape)
#         return x


# ###################################################
# # for N encoder blocks


# class Encoder(nn.Module):
#     def __init__(self, layers: nn.ModuleList):
#         super().__init__()
#         self.layers = layers  # this would be encoderBlock
#         self.norm = LayerNormalization()  # layerNorm at end

#     def forward(self, x, mask):
#         for i, layer in enumerate(self.layers):
#             print(f"\n\nEncoder Block:{i+1} \n")
#             x = layer(x, mask)
#         x = self.norm(x)
#         print("\nencoderReturnShape: (batch, seq_len, d_model) : ", x.shape)
#         return x


# ######################################################3333
# # # Example usage
# # d_model = 512
# # vocab_size = 1000
# # seq_len = 35
# # dropout = 0.5
# # num_layers = 6
# # num_heads = 8
# # d_ff = 2048

# # # Create instances
# # input_embeddings = InputEmbeddings(d_model=d_model, vocab_size=vocab_size)
# # positional_encoding = PositionalEncoding(d_model=d_model,
# #                                          seq_len=seq_len,
# #                                          dropout=dropout)
# # feed_forward_block = FeedForwardBlock(d_model=d_model,
# #                                       d_ff=d_ff,
# #                                       dropout=dropout)
# # self_attention_block = MultiHeadAttentionBlock(d_model=d_model,
# #                                                h=num_heads,
# #                                                dropout=dropout)

# # encoder_blocks = nn.ModuleList([
# #     EncoderBlock(self_attention_block, feed_forward_block, dropout)
# #     for _ in range(num_layers)
# # ])
# # encoder = Encoder(layers=encoder_blocks)

# # # Create example input
# # batch_size = 1
# # batch_of_sentences = torch.tensor(
# #     [[5, 6, 7, 0, 0]])  # Shape: (batch_size, max_sentence_length)
# # embedded_sentences = input_embeddings(batch_of_sentences)
# # positional_encoded = positional_encoding(embedded_sentences)

# # # Run through the encoder
# # mask = None  # Assuming no mask for simplicity
# # encoder_output = encoder(positional_encoded, mask)

# # #################################################

# # # output embeddings are same as input embeddings


# class DecoderBlock(nn.Module):
#     def __init__(
#         self,
#         self_attention_block: MultiHeadAttentionBlock,
#         cross_attention_block: MultiHeadAttentionBlock,
#         feed_forward_block: FeedForwardBlock,
#         dropout: float,
#     ):
#         super().__init__()
#         self.self_attention_block = self_attention_block
#         self.cross_attention_block = cross_attention_block
#         self.feed_forward_block = feed_forward_block
#         self.residual_connections = nn.ModuleList(
#             [ResidualConnection(dropout) for _ in range(3)])

#     def forward(self, x, encoder_output, src_mask, tgt_mask):
#         x = self.residual_connections[0](
#             x, lambda t: self.self_attention_block(t, t, t, tgt_mask))
#         x = self.residual_connections[1](
#             x,
#             lambda t: self.cross_attention_block(t, encoder_output,
#                                                  encoder_output, src_mask),
#         )
#         x = self.residual_connections[2](x, self.feed_forward_block)
#         print("\tdecoderBlockReturnShape: (batch, seq_len, d_model) : ",
#               x.shape)
#         return x


# class Decoder(nn.Module):
#     def __init__(self, layers: nn.ModuleList):
#         super().__init__()
#         self.layers = layers
#         self.norm = LayerNormalization()

#     def forward(self, x, encoder_output, src_mask, tgt_mask):
#         for i, layer in enumerate(self.layers):
#             print(f"\n\nDecoder Block:{i+1} \n")
#             x = layer(x, encoder_output, src_mask, tgt_mask)
#         x = self.norm(x)
#         print("\ndecoderReturnShape: (batch, seq_len, d_model) : ", x.shape)

#         return x


# class ProjectionLayer(nn.Module):
#     def __init__(self, d_model: int, vocab_size: int):
#         super().__init__()
#         self.proj = nn.Linear(d_model, vocab_size)

#     def forward(self, x):
#         x = torch.log_softmax(self.proj(x), dim=-1)
#         print("\nprojectionLayerReturnShape: (batch, seq_len, d_model) : ",
#               x.shape)
#         return x


# ####################################################

# # d_model = 512
# # vocab_size = 1000
# # seq_len = 35
# # dropout = 0.5
# # num_layers = 6
# # num_heads = 8
# # d_ff = 2048

# # input_embeddings = InputEmbeddings(d_model=d_model, vocab_size=vocab_size)
# # positional_encoding = PositionalEncoding(d_model=d_model,
# #                                          seq_len=seq_len,
# #                                          dropout=dropout)
# # feed_forward_block = FeedForwardBlock(d_model=d_model,
# #                                       d_ff=d_ff,
# #                                       dropout=dropout)
# # self_attention_block = MultiHeadAttentionBlock(d_model=d_model,
# #                                                h=num_heads,
# #                                                dropout=dropout)

# # projection_layer = ProjectionLayer(d_model, vocab_size)

# # encoder_blocks = nn.ModuleList([
# #     EncoderBlock(self_attention_block, feed_forward_block, dropout)
# #     for _ in range(num_layers)
# # ])
# # encoder = Encoder(layers=encoder_blocks)

# # # Decoder specific components
# # cross_attention_block = MultiHeadAttentionBlock(d_model=d_model,
# #                                                 h=num_heads,
# #                                                 dropout=dropout)
# # decoder_blocks = nn.ModuleList([
# #     DecoderBlock(self_attention_block, cross_attention_block,
# #                  feed_forward_block, dropout) for _ in range(num_layers)
# # ])
# # decoder = Decoder(layers=decoder_blocks)

# # # Create example input
# # batch_size = 1
# # batch_of_sentences = torch.tensor(
# #     [[5, 6, 7, 0, 0]])  # Shape: (batch_size, max_sentence_length)
# # embedded_sentences = input_embeddings(batch_of_sentences)
# # positional_encoded = positional_encoding(embedded_sentences)

# # # Run through the encoder
# # mask = None  # Assuming no mask for simplicity
# # encoder_output = encoder(positional_encoded, mask)

# # # Create example decoder input
# # decoder_input = torch.tensor([[1, 2, 3, 4, 5]])  # Shape: (batch_size, seq_len)
# # decoder_embedded = input_embeddings(decoder_input)
# # decoder_positional_encoded = positional_encoding(decoder_embedded)

# # # Run through the decoder
# # tgt_mask = None  # Assuming no mask for simplicity
# # decoder_output = decoder(decoder_positional_encoded, encoder_output, mask,
# #                          tgt_mask)

# # projecteded = projection_layer(decoder_output)

# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# #############################o##############################
# # Transformer


# class Transformer(nn.Module):
#     def __init__(
#         self,
#         encoder: Encoder,
#         decoder: Decoder,
#         src_embed: InputEmbeddings,
#         tgt_embed: InputEmbeddings,
#         src_pos: PositionalEncoding,
#         tgt_pos: PositionalEncoding,
#         projection_layer: ProjectionLayer,
#     ):
#         super().__init__()
#         self.encoder = encoder
#         self.decoder = decoder
#         self.src_embed = src_embed
#         self.tgt_embed = tgt_embed
#         self.src_pos = src_pos
#         self.tgt_pos = tgt_pos
#         self.projection_layer = projection_layer

#     # three methods, one to encoder, one to decode and one to project
#     # Not creating single forward method because we can reuse output of encoder and to also visualize the attention

#     def encode(self, src, src_mask):
#         src = self.src_embed(src)
#         src = self.src_pos(src)
#         src = self.encoder(src, mask)
#         return src

#     def decode(self, encoder_output, src_mask, tgt, tgt_mask):
#         tgt = self.tgt_embed(tgt)
#         tgt = self.tgt_pos(tgt)
#         tgt = self.decoder(tgt, encoder_output, src_mask, tgt_mask)
#         return tgt

#     def project(self, x):
#         return self.projection_layer(x)


# #####################################################################


# def build_transformer(
#     src_vocab_size: int,
#     tgt_vocab_size: int,
#     src_seq_len: int,
#     tgt_seq_len: int,
#     d_model: int,
#     N:int=6,
#     h:int=8,
#     dropout: float = 0.5,
#     d_ff:int=2048,
# ):
    
#     """
#     we need vocab size of src and tgt so get info about how many vectors to be created
#     Keyword arguments:

#     N: number of input layers i.e. number of enccoder blocks and number of decoder blocks
#     h: # of heads
#     """
#     """strcuture will be same across all tasks"""
#     src_embed = InputEmbeddings(d_model, src_vocab_size)
#     tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

#     # positional encoding layers
#     # one encoding layer wold be enough
#     src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
#     tgt_pos = PositionalEncoding(d_model, src_seq_len, dropout)

#     # create encoder blocks
#     encoder_blocks = []
#     for _ in range(N):
#         encoder_self_attention_block = MultiHeadAttentionBlock(
#             d_model, h, dropout)
#         feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)
#         encoder_block = EncoderBlock(encoder_self_attention_block,
#                                      feed_forward_block, dropout)
#         encoder_blocks.append(encoder_block)

#     # create encoder blocks
#     decoder_blocks = []
#     for _ in range(N):
#         decoder_self_attention_block = MultiHeadAttentionBlock(
#             d_model, h, dropout)
#         decoder_cross_attention_block = MultiHeadAttentionBlock(
#             d_model, h, dropout)
#         feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout)

#         decoder_block = DecoderBlock(
#             decoder_self_attention_block,
#             decoder_cross_attention_block,
#             feed_forward_block,
#             dropout,
#         )
#         decoder_blocks.append(decoder_block)

#     encoder = Encoder(nn.ModuleList(encoder_blocks))
#     decoder = Decoder(nn.ModuleList(decoder_blocks))

#     # projection layer
#     projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

#     # create transfromer
#     transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos,
#                               tgt_pos, projection_layer)

#     # initilize parameter to make trainig faster so they dont just strat with random values
#     for p in transformer.parameters():
#         if p.dim() > 1:
#             nn.init.xavier_uniform_(p)
#     return transformer


# build_transformer(
#     src_vocab_size=1000,
#     tgt_vocab_size=1000,
#     src_seq_len=100,
#     tgt_seq_len=100,
#     d_model=512,
#     h=8,
#     dropout=0.5,
#     d_ff=2048,
# )


# engIt - task

1. Download dataset: https://huggingface.co/datasets/Helsinki-NLP/opus_books/viewer/en-it
2. build tokenizer


In [884]:
# Tokenizer: create sentence into tokens. h=there are manyh tokenizers like BPE, subword-level,, word level, etc
# we'll be creting word-level tokenizer i.e. split by space
# so tokenizer builds vocab and maps tokens to index
# there would be special tokens too for paddings, start of sentence, end of sentence

### train,py


In [885]:
# train.py
import torch
import torch.nn as nn
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

# class that will train tokenizer
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace  # to split words according to whitespace

from pathlib import Path  # to assist in creating absolutes path using relative paths

In [886]:
def get_all_sentences(ds, lang):
    # pasrsing each item which is a pair in dataset # (english, italian)
    # print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nitem:", item)
    
    for item in ds:
        sentence = item["translation"][lang]
        print(f"Sentence ({lang}): {sentence}")
        yield item["translation"][lang]

In [887]:
# buils the tokeizer
def get_or_build_tokenizer(config, ds, lang):
    """
    Building the tokenizer

    Keyword arguments:
    config -- config of our model
    ds -- dataset
    lang -- lang to build tokeizer for
    """

    # file to save this tokenizer
    tokenizer_path = Path(config["tokenizer_file"].format(lang))  # mean we can change
    #'../tokenizers/tokenizer_{0}.format(lang).json' same as f"'../tokenizers/tokenizer_{lang}'

    # so if tokenizer ddoesn't exists we create it
    if not Path.exists(tokenizer_path):
        tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()  # split by whitespces
        #  now training tokenizer
        trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"], min_frequency=2)  # for a word to appear in vocab it must have min frequency of 2
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))

    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))

    return tokenizer


# def get_or_build_tokenizer(config, ds, lang):
#     """lang: language to build tokenizer for"""
#     # config['tokenizer_file'] = '../tokenizers/tokenizer_{0}'
#     tokenizer_path = Path(config["tokenizer_file"].format(lang))
#     if not Path.exists(tokenizer_path):
#         tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
#         # split by wordspace
#         tokenizer.pre_tokenizers = Whitespace()
#         trainer = WordLevelTrainer(
#             special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
#             min_frequency=2)

#         print("tokenizer training started...")
#         tokenizer.train_from_iterator(get_all_sentences(ds, lang),
#                                       trainer=trainer)
#         tokenizer.save(str(tokenizer_path))
#     else:
#         tokenizer = Tokenizer.from_file(str(tokenizer_path))

#     print("tokenizer initiated!")
#     return tokenizer

In [888]:
# load dataset and get tokenizer

from torch.utils.data import Dataset, DataLoader, random_split

# from dataset import BilingualDataset, causal_mask


def get_ds(config):
    ds_raw = load_dataset("opus_books", f"{config['lang_src']}-{config['lang_tgt']}", split="train")
    print(f"\n\n\n\n\n\n\n\nds_raw: {ds_raw}")
    # build tokenizer
    tokenizer_src = get_or_build_tokenizer(config, ds_raw, config["lang_src"])
    tokenizer_tgt = get_or_build_tokenizer(config, ds_raw, config["lang_tgt"])
    


    # keep 10% for val, 90 for trainig ... hf dst has single split so we'll be splitting manually
    train_ds_size = int(0.9 * len(ds_raw))
    val_ds_size = len(ds_raw) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    #
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])

    # max sequence length of src and tgt in both splits
    max_len_src, max_len_tgt = 0, 0

    for item in ds_raw:
        # print("\n\n item:", item)

        # load each sentence , convert it to ids using tokenizer and i check length.
        src_ids = tokenizer_src.encode(item["translation"][config["lang_src"]]).ids
        tgt_ids = tokenizer_tgt.encode(item["translation"][config["lang_tgt"]]).ids
        # print("\n\nsrc_ids:", src_ids )
        # print("\n\tgt_ids:", tgt_ids )
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))
        
    print(f"Max len src:{max_len_src}")
    print(f"Max len tgt:{max_len_tgt}")

    print(f"Max length of src sentence{max_len_src}")
    print(f"Max length of tgt sentence{max_len_tgt}")

    # data loader
    train_dataloader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)  # batch_size=1 because we want to process each sentence one by one
    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt

In [889]:
# from model import build_transformer


def get_model(config, vocab_src_len, vocab_tgt_len):
    """_summary_

    Args:
        config (_type_): _description_
        vocab_src_len (_type_): source vocab size
        vocab_tgt_len (_type_): target vocab size
    """
    model = build_transformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config["seq_len"], config["d_model"])
    return model

In [890]:
# dataset.py

import torch
import torch.nn as nn
from torch.utils.data import Dataset


class BilingualDataset(Dataset):
    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.seq_len = seq_len

        # saving particular tokens to create the tensor for the model.so we need SOS, EOS, and PAD tokens so how are we going to assign ID to these tokens?
        self.sos_token = torch.tensor([tokenizer_src.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_src.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_src.token_to_id("[PAD]")], dtype=torch.int64)

    # length of this dataset
    def __len__(self):
        return len(self.ds)

    def __getitem__(self, index):
        # extracting original pair from hfDataset
        src_target_pair = self.ds[index]
        src_text = src_target_pair["translation"][self.src_lang]
        tgt_text = src_target_pair["translation"][self.tgt_lang]

        # converting text into IDs
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        #  adding padding to reach `seq_length` because model works with fixed length
        # adding padding tokens

        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # -2 for sos and eos token
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1  # -1 for sos token  # eos will be generated by model
        """
        while training,on decode side, we only add SOS and on label side we add EOS token.
        so for dec_num_padding_tokens we only need to add one of special tokens
        """

        # make sure our seq_len is enough to represent all of sentences in our dataset. i.e. padding must never become negative
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError("Sentence is too long")

        # lets build three tensors for encoder input and decoder input and also for lebel. So, one tensor would be send to encoder,
        # one to decoder input and one that we expect as decoder's output and that output will be called label/target

        # encoder input = sos + src_text + eos + pad_tokens
        encoder_input = torch.cat([self.sos_token, torch.tensor(enc_input_tokens, dtype=torch.int64), self.eos_token, torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64)])

        # decoder input = sos + decoder input + padtokens
        decoder_input = torch.cat([self.sos_token, torch.tensor(dec_input_tokens, dtype=torch.int64), torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)])

        # label: decoder output that we expect=  decoder input +eos+ padtokens (add EOS to label)
        label = torch.cat([torch.tensor(dec_input_tokens, dtype=torch.int64), self.eos_token, torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)])


        print(f"\t\t\t\tPadding added")
        print(f"\t\t\t\tsequenceTextLimit: {self.seq_len}")
        print(f"\t\t\t\tsrctextShape: {encoder_input.shape}")
        print(f"\t\t\t\ttgttextShape: {decoder_input.shape}")
        print(f"\t\t\t\tlabeltextShap: {label.shape}")

        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        """
        In encoder inout, we are adding padding to match length but we dont want these paddings to effect in attenation mechanism so we'll mask these paddings 
        that will indicate that these tokens shouldn't be considered: encoder_mask 
        it says all the tokens that are not padding are OK.        
        -------
        
        for decoder we need a specail mask called causal mask which means each word can only look at previous token and non-padding token. 
        Remember we only want real words to be considered in self attention.
        
        encoder_input: (seq,_len)  
        Decoder_input: (seq,_len)  
        encoder_mask: (1, batch, seq,_len)  
        decoder_mask: (1,batch, seq_len) & (1,seq_len, seq_len)
        label:(seq_len)
        """

        # dc_mask1singlesqueeze = (decoder_input != self.pad_token).unsqueeze(0).int()
        # print("dc_mask1singlesqueeze: ",dc_mask1singlesqueeze.shape)
        # dc_mask1douoblesqueeze = (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int()
        # print("dc_mask1douoblesqueeze: ",dc_mask1douoblesqueeze.shape)
        
        # causal_mask1 = causal_mask(decoder_input.size(0))
        # print("causal_mask1: ",causal_mask1.shape)
        # #############3
        
        
        # productsinglesqueeze = (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0))
        # print("productsinglesqueeze: ",dc_mask1singlesqueeze.shape)
        # productdoublesqueeze = (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0))
        # print("productdoublesqueeze: ",productdoublesqueeze.shape)
        
        # dc_mask2 = (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0))
        # print("dc_mask2: ",dc_mask2.shape)

        return {
            "encoder_input": encoder_input,
            "decoder_input": decoder_input,
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,
            "src_text": src_text,
            "tgt_text": tgt_text,
        }  # (seq_len)  # (seq_len)  # (1, batch, seq,_len)  # causal_mask will build matrix of seq_len * seq_len     # (1,batch, seq_len) & (1,seq_len, seq_len)


def causal_mask(size):
    """
    causal_mask will build matrix of seq_len * seq_len     # (1,batch, seq_len) & (1,seq_len, seq_len)
    we want each word in decider to only watch non padding words that come before it.
    Below matrix represent K\*Q in softmax attention, we want to hide all the values above the diagonal.
    so we want all values above diagonal to be masked out."""

    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    print(mask[:50,:50])
    return mask == 0

In [891]:
causal_mask(10)

tensor([[[0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]], dtype=torch.int32)


tensor([[[ True, False, False, False, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False, False, False, False],
         [ True,  True,  True,  True, False, False, False, False, False, False],
         [ True,  True,  True,  True,  True, False, False, False, False, False],
         [ True,  True,  True,  True,  True,  True, False, False, False, False],
         [ True,  True,  True,  True,  True,  True,  True, False, False, False],
         [ True,  True,  True,  True,  True,  True,  True,  True, False, False],
         [ True,  True,  True,  True,  True,  True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True]]])

You're right to focus on this part. It's a crucial detail in sequence-to-sequence models like those used for machine translation. Let's break this down further:

1. Encoder (Source Language) Side:

   - We add both SOS (Start of Sequence) and EOS (End of Sequence) tokens.
   - SOS tells the model "The sentence is starting here."
   - EOS tells the model "The sentence ends here."
   - This helps the encoder to understand the full context of the input sentence.

2. Decoder (Target Language) Side:
   - During training, we only add the SOS token at the beginning.
   - The EOS token is not added to the input, but is expected in the output.

Why this difference?

1. For the Encoder:

   - The full context is important. The model needs to know where the sentence starts and ends to encode all the information correctly.

2. For the Decoder:
   - During training, the decoder is typically fed the correct translation one word at a time (a technique called teacher forcing).
   - It starts with SOS to know when to begin generating the translation.
   - It should learn to generate EOS when it thinks the translation is complete.
   - By not providing EOS in the input but expecting it in the output, we're teaching the model to decide when to stop generating.

Practical example:

Let's say we're translating "Hello, how are you?" from English to French.

Encoder input might look like:
[SOS] Hello , how are you ? [EOS]

Decoder input during training might look like:
[SOS] Bonjour , comment allez - vous ?

And the expected output (label) would be:
Bonjour , comment allez - vous ? [EOS]

This way, the model learns to:

1. Understand complete sentences (encoder)
2. Start generating translations (decoder input)
3. Know when to stop generating (decoder output)

This subtle difference is key to training a model that can both understand full sentences and generate complete translations of varying lengths.

### causal Mask

we want each word in decider to only watch non padding words that come before it.
Below matrix represent K\*Q in softmax attention, we want to hide all the values above the diagonal.
so we want all values above diagonal to be masked out.
![alt text](<Screenshot from 2024-07-29 10-43-45.png>)


In [892]:
torch.tensor([1, 2, 3]) != 1


tensor([False,  True,  True])

In [893]:
torch.triu(torch.ones(1, 10, 10)).shape

torch.Size([1, 10, 10])

In [894]:
from pathlib import Path


def get_config():
    return {
        "batch_size": 8,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
        "lang_src": "en",
        "lang_tgt": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": None,  # preload to restart training if crashed
        "tokenizer_file": "tokenizer{0}.json",
        "experiment_name": "runs/tmodel",
    }

config = get_config()

def get_weights_file_path(config, epoch: str):
    model_folder = config["model_folder"]
    model_basename = config["model_basename"]
    model_filename = f"{model_basename}{epoch}.pt"
    return str(Path(".") / model_folder / model_filename)


### training loop


In [895]:
# from config import get_weights_file_path, get_config
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import warnings
def train_model(config):
    # define device
    device =  torch.device('cuda' if( torch.cuda.is_available) else 'cpu')
    device =torch.device('cpu')
    print(f'Using device: {device}')
    
    # making sure weight folder is created
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)
    print(f'config: {config}')
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    # Tensorboard: to visualize loss
    writer = SummaryWriter(config['experiment_name'])
    # optimize
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
    
    # resume training incase model crashes... restores stateof model and optimizer
    initial_epoch=0
    global_step=0
    if(config['preload']):
        model_filename = get_weights_file_path(config, config['preload'])
        print(f"Preloading model: {model_filename}")
        state = torch.load(model_filename)
        inital_epoch = state['epoch']+1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step=state['global_step']
        
    # loss fn 
    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1)
    # ignore_index: To ignore padding tokens so that they don't have any impact on calculating loss
    # Label smoothing is a technique used to smooth the target labels by assigning a small probability to the incorrect classes and reducing the confidence on the correct class. 
    # This helps prevent the model from becoming too confident and overfitting to the training data. 
    # label_smoothing=0.1 means that for each true label, 10% of the probability mass is redistributed to all other classes.

    # training loop
    for epoch in range(initial_epoch, config['num_epochs']):
        model.train() # model.train() tells your model that you are training the model. This helps inform layers such as Dropout and BatchNorm, which are designed to behave differently during 
        # training and evaluation. For instance, in training mode, BatchNorm updates a moving average on each new batch; whereas, for evaluation mode, these updates are frozen.
        batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch: {epoch:02d}')
        for batch in batch_iterator:
            print(f"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nbatch: {batch}")

            break
        break
        #     encoder_input = batch['encoder_input'].to(device) # (batch_size, seq_len)
        #     decoder_input = batch['decoder_input'].to(device) # (batch_size, seq_len)
        #     encoder_mask= batch['encoder_mask'].to(device) # (batch_size,1,1, seq_len)
        #     decoder_mask= batch['decoder_mask'].to(device) # (batch_size, 1 ,seq_len, seq_len)
            
        #     # run tensors through transformers
        #     encoder_output = model.encode(encoder_input, encoder_mask ) # (batch, seq_len, d_model)
        #     print("decoderoutput###################################")
        #     decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (batch, seq_len, d_model)
        #     proj_ouput = model.project(decoder_output) # (batch, seq_len, tgt_vocab_size)
            
        #     # compare output with our labels
        #     label = batch['label'].to(device)#(B, seq_len)
        #     # (Batch, seq_len, tgt_vocab_size)  -->   (batch * seq_len, tgt_vocab_size)
        #     loss = loss_fn(proj_ouput.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1)) 
        #     # update progress bar
        #     # batch_iterator.set_prefix(f"loss: {loss.item():6.3f}")
        #     batch_iterator = tqdm(train_dataloader, desc=f'Processing epoch: {epoch:02d}, Loss: {loss.item():.3f}')

        #     # log the loss
        #     writer.add_scalar('train_loss', loss.item(), global_step)
        #     writer.flush()
            
        #     # backpropagate
        #     loss.backward()
            
        #     # update weihts
            
        #     optimizer.step()
        #     optimizer.zero_grad()
            
        #     global_step+=1  # this is basically or tensorboard
            
        # # save the model at end of every epoch
        
        # model_filename = get_weights_file_path(config, f'{epoch:02d}')
        # #  it is good ide to save state of model + optimizer
        # torch.save({
        #     'epoch':epoch,
        #     'model_state_dict': model.state_dict(),
        #     'optimizer_state_dict':optimizer.state_dict(),
        #     'global_step':global_step
        #     }, model_filename)
            
if __name__ =='__main__':
    warnings.filterwarnings('ignore')
    config = get_config()
    train_model(config)

Using device: cpu
config: {'batch_size': 8, 'num_epochs': 20, 'lr': 0.0001, 'seq_len': 350, 'd_model': 512, 'lang_src': 'en', 'lang_tgt': 'it', 'model_folder': 'weights', 'model_basename': 'tmodel_', 'preload': None, 'tokenizer_file': 'tokenizer{0}.json', 'experiment_name': 'runs/tmodel'}








ds_raw: Dataset({
    features: ['id', 'translation'],
    num_rows: 32332
})
Sentence (en): Having considered me at leisure, he said--
Sentence (en): "What made you ill yesterday?"
Sentence (en): "She had a fall," said Bessie, again putting in her word.
Sentence (en): "Fall! why, that is like a baby again!
Sentence (en): Can't she manage to walk at her age?
Sentence (en): She must be eight or nine years old."
Sentence (en): "I was knocked down," was the blunt explanation, jerked out of me by another pang of mortified pride; "but that did not make me ill," I added; while Mr. Lloyd helped himself to a pinch of snuff.
Sentence (en): As he was returning the box to his waistcoat pocket, a loud bel

Processing epoch: 00:   0%|          | 0/3638 [00:00<?, ?it/s]

				Padding added
				sequenceTextLimit: 350
				srctextShape: torch.Size([350])
				tgttextShape: torch.Size([350])
				labeltextShap: torch.Size([350])
tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [0, 0, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]], dtype=torch.int32)
				Padding added
				sequenceTextLimit: 350
				srctextShape: torch.Size([350])
				tgttextShape: torch.Size([350])
				labeltextShap: torch.Size([350])
tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [0, 0, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]], dtype=torch.int32)
				Padding added
				sequenceTextLimit: 350
				srctextShape: torch.Size([350])
				tgttextShape: torch.Size([350])
				labeltextShap: torch.Size([350])
tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [0, 




In [896]:
num=23123234234
f"loss: {num:6.3f}"

'loss: 23123234234.000'

In [897]:
torch.ones(2,3,5).view(-1,10)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [898]:
torch.ones(2,3,5).view(10,-1)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [899]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

NameError: name 'tokenizer' is not defined

In [None]:
x = load_dataset("opus_books", f"{config['lang_src']}-{config['lang_tgt']}", split="train")
x

Dataset({
    features: ['id', 'translation'],
    num_rows: 32332
})

In [None]:
np.array(x['translation']).shape

(32332,)

In [None]:
x['translation'][10]

{'en': '"Jane, I don\'t like cavillers or questioners; besides, there is something truly forbidding in a child taking up her elders in that manner.',
 'it': '— Jane, non mi piace di essere interrogata. Sta male, del resto, che una bimba tratti così i suoi superiori.'}

In [None]:
x['translation'][10]['en']

'"Jane, I don\'t like cavillers or questioners; besides, there is something truly forbidding in a child taking up her elders in that manner.'

In [None]:
500 - len(x['translation'][10]['en'])

362

In [None]:
torch.tensor(x['translation'][10]['en'], torch.int64)

TypeError: tensor() takes 1 positional argument but 2 were given